2007-08-02 20:11:11 +02:00
|
|
|
//===- DeadStoreElimination.cpp - Fast Dead Store Elimination -------------===//
|
2007-07-11 02:46:18 +02:00
|
|
|
//
|
2019-01-19 09:50:56 +01:00
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
2007-07-11 02:46:18 +02:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
2015-12-11 19:39:41 +01:00
|
|
|
// This file implements a trivial dead store elimination that only considers
|
|
|
|
// basic-block local redundant stores.
|
|
|
|
//
|
|
|
|
// FIXME: This should eventually be extended to be a post-dominator tree
|
|
|
|
// traversal. Doing so would be pretty trivial.
|
2007-07-11 02:46:18 +02:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
#include "llvm/Transforms/Scalar/DeadStoreElimination.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/ADT/APInt.h"
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
#include "llvm/ADT/DenseMap.h"
|
2020-01-03 15:13:55 +01:00
|
|
|
#include "llvm/ADT/MapVector.h"
|
2020-03-20 08:51:29 +01:00
|
|
|
#include "llvm/ADT/PostOrderIterator.h"
|
2012-12-03 17:50:05 +01:00
|
|
|
#include "llvm/ADT/SetVector.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/ADT/SmallPtrSet.h"
|
|
|
|
#include "llvm/ADT/SmallVector.h"
|
2012-12-03 17:50:05 +01:00
|
|
|
#include "llvm/ADT/Statistic.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/ADT/StringRef.h"
|
2007-07-12 01:19:17 +02:00
|
|
|
#include "llvm/Analysis/AliasAnalysis.h"
|
2011-10-22 23:59:35 +02:00
|
|
|
#include "llvm/Analysis/CaptureTracking.h"
|
[PM/AA] Rebuild LLVM's alias analysis infrastructure in a way compatible
with the new pass manager, and no longer relying on analysis groups.
This builds essentially a ground-up new AA infrastructure stack for
LLVM. The core ideas are the same that are used throughout the new pass
manager: type erased polymorphism and direct composition. The design is
as follows:
- FunctionAAResults is a type-erasing alias analysis results aggregation
interface to walk a single query across a range of results from
different alias analyses. Currently this is function-specific as we
always assume that aliasing queries are *within* a function.
- AAResultBase is a CRTP utility providing stub implementations of
various parts of the alias analysis result concept, notably in several
cases in terms of other more general parts of the interface. This can
be used to implement only a narrow part of the interface rather than
the entire interface. This isn't really ideal, this logic should be
hoisted into FunctionAAResults as currently it will cause
a significant amount of redundant work, but it faithfully models the
behavior of the prior infrastructure.
- All the alias analysis passes are ported to be wrapper passes for the
legacy PM and new-style analysis passes for the new PM with a shared
result object. In some cases (most notably CFL), this is an extremely
naive approach that we should revisit when we can specialize for the
new pass manager.
- BasicAA has been restructured to reflect that it is much more
fundamentally a function analysis because it uses dominator trees and
loop info that need to be constructed for each function.
All of the references to getting alias analysis results have been
updated to use the new aggregation interface. All the preservation and
other pass management code has been updated accordingly.
The way the FunctionAAResultsWrapperPass works is to detect the
available alias analyses when run, and add them to the results object.
This means that we should be able to continue to respect when various
passes are added to the pipeline, for example adding CFL or adding TBAA
passes should just cause their results to be available and to get folded
into this. The exception to this rule is BasicAA which really needs to
be a function pass due to using dominator trees and loop info. As
a consequence, the FunctionAAResultsWrapperPass directly depends on
BasicAA and always includes it in the aggregation.
This has significant implications for preserving analyses. Generally,
most passes shouldn't bother preserving FunctionAAResultsWrapperPass
because rebuilding the results just updates the set of known AA passes.
The exceptions to this rule are LoopPass instances, which need to preserve
all the function analyses that the loop pass manager will end up
needing. This means preserving both BasicAAWrapperPass and the
aggregating FunctionAAResultsWrapperPass.
Now, when preserving an alias analysis, you do so by directly preserving
that analysis. This is only necessary for non-immutable-pass-provided
alias analyses though, and there are only three of interest: BasicAA,
GlobalsAA (formerly GlobalsModRef), and SCEVAA. Usually BasicAA is
preserved when needed because it (like DominatorTree and LoopInfo) is
marked as a CFG-only pass. I've expanded GlobalsAA into the preserved
set everywhere we previously were preserving all of AliasAnalysis, and
I've added SCEVAA in the intersection of that with where we preserve
SCEV itself.
One significant challenge to all of this is that the CGSCC passes were
actually using the alias analysis implementations by taking advantage of
a pretty amazing set of loopholes in the old pass manager's analysis
management code which allowed analysis groups to slide through in many
cases. Moving away from analysis groups makes this problem much more
obvious. To fix it, I've leveraged the flexibility the design of the new
PM components provides to just directly construct the relevant alias
analyses for the relevant functions in the IPO passes that need them.
This is a bit hacky, but should go away with the new pass manager, and
is already in many ways cleaner than the prior state.
Another significant challenge is that various facilities of the old
alias analysis infrastructure just don't fit any more. The most
significant of these is the alias analysis 'counter' pass. That pass
relied on the ability to snoop on AA queries at different points in the
analysis group chain. Instead, I'm planning to build printing
functionality directly into the aggregation layer. I've not included
that in this patch merely to keep it smaller.
Note that all of this needs a nearly complete rewrite of the AA
documentation. I'm planning to do that, but I'd like to make sure the
new design settles, and to flesh out a bit more of what it looks like in
the new pass manager first.
Differential Revision: http://reviews.llvm.org/D12080
llvm-svn: 247167
2015-09-09 19:55:00 +02:00
|
|
|
#include "llvm/Analysis/GlobalsModRef.h"
|
2009-10-27 21:05:49 +01:00
|
|
|
#include "llvm/Analysis/MemoryBuiltins.h"
|
2007-07-11 02:46:18 +02:00
|
|
|
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/Analysis/MemoryLocation.h"
|
2020-02-11 19:27:41 +01:00
|
|
|
#include "llvm/Analysis/MemorySSA.h"
|
|
|
|
#include "llvm/Analysis/MemorySSAUpdater.h"
|
|
|
|
#include "llvm/Analysis/PostDominators.h"
|
2015-03-23 20:32:43 +01:00
|
|
|
#include "llvm/Analysis/TargetLibraryInfo.h"
|
2010-12-01 00:05:20 +01:00
|
|
|
#include "llvm/Analysis/ValueTracking.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/IR/Argument.h"
|
|
|
|
#include "llvm/IR/BasicBlock.h"
|
|
|
|
#include "llvm/IR/Constant.h"
|
2013-01-02 12:36:10 +01:00
|
|
|
#include "llvm/IR/Constants.h"
|
|
|
|
#include "llvm/IR/DataLayout.h"
|
2014-01-13 10:26:24 +01:00
|
|
|
#include "llvm/IR/Dominators.h"
|
2013-01-02 12:36:10 +01:00
|
|
|
#include "llvm/IR/Function.h"
|
2020-04-25 16:02:02 +02:00
|
|
|
#include "llvm/IR/InstIterator.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/IR/InstrTypes.h"
|
|
|
|
#include "llvm/IR/Instruction.h"
|
2013-01-02 12:36:10 +01:00
|
|
|
#include "llvm/IR/Instructions.h"
|
|
|
|
#include "llvm/IR/IntrinsicInst.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/IR/Intrinsics.h"
|
2017-09-26 15:54:28 +02:00
|
|
|
#include "llvm/IR/LLVMContext.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/IR/Module.h"
|
|
|
|
#include "llvm/IR/PassManager.h"
|
|
|
|
#include "llvm/IR/Value.h"
|
Sink all InitializePasses.h includes
This file lists every pass in LLVM, and is included by Pass.h, which is
very popular. Every time we add, remove, or rename a pass in LLVM, it
caused lots of recompilation.
I found this fact by looking at this table, which is sorted by the
number of times a file was changed over the last 100,000 git commits
multiplied by the number of object files that depend on it in the
current checkout:
recompiles touches affected_files header
342380 95 3604 llvm/include/llvm/ADT/STLExtras.h
314730 234 1345 llvm/include/llvm/InitializePasses.h
307036 118 2602 llvm/include/llvm/ADT/APInt.h
213049 59 3611 llvm/include/llvm/Support/MathExtras.h
170422 47 3626 llvm/include/llvm/Support/Compiler.h
162225 45 3605 llvm/include/llvm/ADT/Optional.h
158319 63 2513 llvm/include/llvm/ADT/Triple.h
140322 39 3598 llvm/include/llvm/ADT/StringRef.h
137647 59 2333 llvm/include/llvm/Support/Error.h
131619 73 1803 llvm/include/llvm/Support/FileSystem.h
Before this change, touching InitializePasses.h would cause 1345 files
to recompile. After this change, touching it only causes 550 compiles in
an incremental rebuild.
Reviewers: bkramer, asbirlea, bollu, jdoerfert
Differential Revision: https://reviews.llvm.org/D70211
2019-11-13 22:15:01 +01:00
|
|
|
#include "llvm/InitializePasses.h"
|
2012-12-03 17:50:05 +01:00
|
|
|
#include "llvm/Pass.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/Support/Casting.h"
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
#include "llvm/Support/CommandLine.h"
|
2012-12-03 17:50:05 +01:00
|
|
|
#include "llvm/Support/Debug.h"
|
2020-02-21 17:55:18 +01:00
|
|
|
#include "llvm/Support/DebugCounter.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/Support/ErrorHandling.h"
|
|
|
|
#include "llvm/Support/MathExtras.h"
|
2015-03-23 20:32:43 +01:00
|
|
|
#include "llvm/Support/raw_ostream.h"
|
2016-05-17 23:38:13 +02:00
|
|
|
#include "llvm/Transforms/Scalar.h"
|
2020-04-14 11:56:56 +02:00
|
|
|
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
|
2020-04-25 16:02:02 +02:00
|
|
|
#include "llvm/Transforms/Utils/Local.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include <algorithm>
|
|
|
|
#include <cassert>
|
|
|
|
#include <cstddef>
|
2018-03-21 23:34:23 +01:00
|
|
|
#include <cstdint>
|
2017-10-13 23:17:07 +02:00
|
|
|
#include <iterator>
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
#include <map>
|
2017-10-13 23:17:07 +02:00
|
|
|
#include <utility>
|
|
|
|
|
2007-07-11 02:46:18 +02:00
|
|
|
using namespace llvm;
|
|
|
|
|
2014-04-22 04:55:47 +02:00
|
|
|
#define DEBUG_TYPE "dse"
|
|
|
|
|
2020-04-25 16:02:02 +02:00
|
|
|
STATISTIC(NumRemainingStores, "Number of stores remaining after DSE");
|
2015-08-13 17:36:11 +02:00
|
|
|
STATISTIC(NumRedundantStores, "Number of redundant stores deleted");
|
2007-07-11 02:46:18 +02:00
|
|
|
STATISTIC(NumFastStores, "Number of stores deleted");
|
2018-08-17 20:40:41 +02:00
|
|
|
STATISTIC(NumFastOther, "Number of other instrs removed");
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
STATISTIC(NumCompletePartials, "Number of stores dead by later partials");
|
2017-09-26 15:54:28 +02:00
|
|
|
STATISTIC(NumModifiedStores, "Number of stores modified");
|
2020-05-30 18:56:04 +02:00
|
|
|
STATISTIC(NumNoopStores, "Number of noop stores deleted");
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, the CFG check succeeds in 15738 of the 20322 times
we reach it. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
// The previous description duplicated the one used for NumModifiedStores, so
// the two rows could not be told apart in -stats output.
// NOTE(review): the increment sites are outside this chunk; the wording below
// is derived from the counter name -- confirm it matches what is counted.
STATISTIC(NumCFGChecks, "Number of CFG checks performed");
|
|
|
|
// The previous description duplicated the one used for NumModifiedStores, so
// the two rows could not be told apart in -stats output.
// NOTE(review): the increment sites are outside this chunk; the wording below
// is derived from the counter name -- confirm it matches what is counted.
STATISTIC(NumCFGTries, "Number of CFG check attempts");
|
|
|
|
// The previous description duplicated the one used for NumModifiedStores, so
// the two rows could not be told apart in -stats output.
// NOTE(review): the increment sites are outside this chunk; the wording below
// is derived from the counter name -- confirm it matches what is counted.
STATISTIC(NumCFGSuccess, "Number of successful CFG checks");
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
|
2020-02-21 17:55:18 +01:00
|
|
|
DEBUG_COUNTER(MemorySSACounter, "dse-memoryssa",
|
|
|
|
"Controls which MemoryDefs are eliminated.");
|
|
|
|
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
// Hidden command-line options controlling the pass.

// -enable-dse-partial-overwrite-tracking: on by default.
static cl::opt<bool> EnablePartialOverwriteTracking(
    "enable-dse-partial-overwrite-tracking", cl::init(true), cl::Hidden,
    cl::desc("Enable partial-overwrite tracking in DSE"));

// -enable-dse-partial-store-merging: on by default.
static cl::opt<bool> EnablePartialStoreMerging(
    "enable-dse-partial-store-merging", cl::init(true), cl::Hidden,
    cl::desc("Enable partial store merging in DSE"));

// -enable-dse-memoryssa: off by default.
static cl::opt<bool> EnableMemorySSA(
    "enable-dse-memoryssa", cl::init(false), cl::Hidden,
    cl::desc("Use the new MemorySSA-backed DSE."));

// -dse-memoryssa-scanlimit: defaults to 100.
static cl::opt<unsigned> MemorySSAScanLimit(
    "dse-memoryssa-scanlimit", cl::init(100), cl::Hidden,
    cl::desc("The number of memory instructions to scan for "
             "dead store elimination (default = 100)"));

// -dse-memoryssa-defs-per-block-limit: defaults to 5000.
static cl::opt<unsigned> MemorySSADefsPerBlockLimit(
    "dse-memoryssa-defs-per-block-limit", cl::init(5000), cl::Hidden,
    cl::desc("The number of MemoryDefs we consider as candidates to eliminated "
             "other stores per basic block (default = 5000)"));
|
|
|
|
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, the CFG check is successful in 15738 of the
20322 times we reach it. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
// -dse-memoryssa-path-check-limit: hidden option, defaults to 50. Its
// description says it bounds the number of blocks checked when trying to prove
// that all paths to an exit go through a killing block.
static cl::opt<unsigned> MemorySSAPathCheckLimit(
    "dse-memoryssa-path-check-limit", cl::init(50), cl::Hidden,
    cl::desc("The maximum number of blocks to check when trying to prove that "
             "all paths to an exit go through a killing block (default = 50)"));
|
|
|
|
|
2010-11-30 22:58:14 +01:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Helper functions
|
|
|
|
//===----------------------------------------------------------------------===//
|
2017-10-13 23:17:07 +02:00
|
|
|
using OverlapIntervalsTy = std::map<int64_t, int64_t>;
|
|
|
|
using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>;
|
2010-11-30 22:58:14 +01:00
|
|
|
|
2016-06-10 19:58:01 +02:00
|
|
|
/// Delete this instruction. Before we do, go through and zero out all the
|
2016-05-17 23:38:13 +02:00
|
|
|
/// operands of this instruction. If any of them become dead, delete them and
|
|
|
|
/// the computation tree that feeds them.
|
2015-08-19 04:15:13 +02:00
|
|
|
/// If ValueSet is non-null, remove any deleted instructions from it as well.
|
2016-05-17 23:38:13 +02:00
|
|
|
static void
|
2016-07-06 21:48:52 +02:00
|
|
|
deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,
|
|
|
|
MemoryDependenceResults &MD, const TargetLibraryInfo &TLI,
|
[IR] Lazily number instructions for local dominance queries
Essentially, fold OrderedBasicBlock into BasicBlock, and make it
auto-invalidate the instruction ordering when new instructions are
added. Notably, we don't need to invalidate it when removing
instructions, which is helpful when a pass mostly delete dead
instructions rather than transforming them.
The downside is that Instruction grows from 56 bytes to 64 bytes. The
resulting LLVM code is substantially simpler and automatically handles
invalidation, which makes me think that this is the right speed and size
tradeoff.
The important change is in SymbolTableTraitsImpl.h, where the numbering
is invalidated. Everything else should be straightforward.
We probably want to implement a fancier re-numbering scheme so that
local updates don't invalidate the ordering, but I plan for that to be
future work, maybe for someone else.
Reviewed By: lattner, vsk, fhahn, dexonsmith
Differential Revision: https://reviews.llvm.org/D51664
2020-02-18 23:33:54 +01:00
|
|
|
InstOverlapIntervalsTy &IOL,
|
2020-01-03 15:13:55 +01:00
|
|
|
MapVector<Instruction *, bool> &ThrowableInst,
|
Add "const" in GetUnderlyingObjects. NFC
Summary:
Both the input Value pointer and the returned Value
pointers in GetUnderlyingObjects are now declared as
const.
It turned out that all current (in-tree) uses of
GetUnderlyingObjects were trivial to update, being
satisfied with have those Value pointers declared
as const. Actually, in the past several of the users
had to use const_cast, just because of ValueTracking
not providing a version of GetUnderlyingObjects with
"const" Value pointers. With this patch we get rid
of those const casts.
Reviewers: hfinkel, materi, jkorous
Reviewed By: jkorous
Subscribers: dexonsmith, jkorous, jholewinski, sdardis, eraman, hiraditya, jrtc27, atanasyan, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61038
llvm-svn: 359072
2019-04-24 08:55:50 +02:00
|
|
|
SmallSetVector<const Value *, 16> *ValueSet = nullptr) {
|
2015-08-19 04:15:13 +02:00
|
|
|
SmallVector<Instruction*, 32> NowDeadInsts;
|
|
|
|
|
|
|
|
NowDeadInsts.push_back(I);
|
|
|
|
--NumFastOther;
|
|
|
|
|
2016-07-06 21:48:52 +02:00
|
|
|
// Keeping the iterator straight is a pain, so we let this routine tell the
|
|
|
|
// caller what the next instruction is after we're done mucking about.
|
|
|
|
BasicBlock::iterator NewIter = *BBI;
|
|
|
|
|
2015-08-19 04:15:13 +02:00
|
|
|
// Before we touch this instruction, remove it from memdep!
|
|
|
|
do {
|
|
|
|
Instruction *DeadInst = NowDeadInsts.pop_back_val();
|
2020-01-03 15:13:55 +01:00
|
|
|
// Mark the DeadInst as dead in the list of throwable instructions.
|
|
|
|
auto It = ThrowableInst.find(DeadInst);
|
|
|
|
if (It != ThrowableInst.end())
|
|
|
|
ThrowableInst[It->first] = false;
|
2015-08-19 04:15:13 +02:00
|
|
|
++NumFastOther;
|
|
|
|
|
2018-02-13 19:15:26 +01:00
|
|
|
// Try to preserve debug information attached to the dead instruction.
|
2020-06-08 19:44:11 +02:00
|
|
|
salvageDebugInfo(*DeadInst);
|
2020-04-14 11:56:56 +02:00
|
|
|
salvageKnowledge(DeadInst);
|
2018-02-13 19:15:26 +01:00
|
|
|
|
2015-08-19 04:15:13 +02:00
|
|
|
// This instruction is dead, zap it, in stages. Start by removing it from
|
|
|
|
// MemDep, which needs to know the operands and needs it to be in the
|
|
|
|
// function.
|
|
|
|
MD.removeInstruction(DeadInst);
|
|
|
|
|
|
|
|
for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) {
|
|
|
|
Value *Op = DeadInst->getOperand(op);
|
|
|
|
DeadInst->setOperand(op, nullptr);
|
|
|
|
|
|
|
|
// If this operand just became dead, add it to the NowDeadInsts list.
|
|
|
|
if (!Op->use_empty()) continue;
|
|
|
|
|
|
|
|
if (Instruction *OpI = dyn_cast<Instruction>(Op))
|
|
|
|
if (isInstructionTriviallyDead(OpI, &TLI))
|
|
|
|
NowDeadInsts.push_back(OpI);
|
|
|
|
}
|
|
|
|
|
2016-08-12 03:09:53 +02:00
|
|
|
if (ValueSet) ValueSet->remove(DeadInst);
|
|
|
|
IOL.erase(DeadInst);
|
2016-07-06 21:48:52 +02:00
|
|
|
|
|
|
|
if (NewIter == DeadInst->getIterator())
|
|
|
|
NewIter = DeadInst->eraseFromParent();
|
|
|
|
else
|
|
|
|
DeadInst->eraseFromParent();
|
2015-08-19 04:15:13 +02:00
|
|
|
} while (!NowDeadInsts.empty());
|
2016-07-06 21:48:52 +02:00
|
|
|
*BBI = NewIter;
|
2020-01-03 15:13:55 +01:00
|
|
|
// Pop dead entries from back of ThrowableInst till we find an alive entry.
|
|
|
|
while (!ThrowableInst.empty() && !ThrowableInst.back().second)
|
|
|
|
ThrowableInst.pop_back();
|
2015-08-19 04:15:13 +02:00
|
|
|
}
|
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
/// Does this instruction write some memory? This only returns true for things
|
|
|
|
/// that we can analyze with other helpers below.
|
2018-01-21 02:44:33 +01:00
|
|
|
static bool hasAnalyzableMemoryWrite(Instruction *I,
|
|
|
|
const TargetLibraryInfo &TLI) {
|
2009-11-10 07:46:40 +01:00
|
|
|
if (isa<StoreInst>(I))
|
|
|
|
return true;
|
|
|
|
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
|
|
|
|
switch (II->getIntrinsicID()) {
|
2009-12-02 07:35:55 +01:00
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
case Intrinsic::memset:
|
|
|
|
case Intrinsic::memmove:
|
|
|
|
case Intrinsic::memcpy:
|
2018-04-23 21:06:49 +02:00
|
|
|
case Intrinsic::memcpy_element_unordered_atomic:
|
|
|
|
case Intrinsic::memmove_element_unordered_atomic:
|
|
|
|
case Intrinsic::memset_element_unordered_atomic:
|
2009-12-02 07:35:55 +01:00
|
|
|
case Intrinsic::init_trampoline:
|
|
|
|
case Intrinsic::lifetime_end:
|
|
|
|
return true;
|
2009-11-10 07:46:40 +01:00
|
|
|
}
|
|
|
|
}
|
2020-04-23 18:15:04 +02:00
|
|
|
if (auto *CB = dyn_cast<CallBase>(I)) {
|
|
|
|
if (Function *F = CB->getCalledFunction()) {
|
2020-01-11 12:57:29 +01:00
|
|
|
LibFunc LF;
|
|
|
|
if (TLI.getLibFunc(*F, LF) && TLI.has(LF)) {
|
|
|
|
switch (LF) {
|
|
|
|
case LibFunc_strcpy:
|
|
|
|
case LibFunc_strncpy:
|
|
|
|
case LibFunc_strcat:
|
|
|
|
case LibFunc_strncat:
|
|
|
|
return true;
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
2012-09-25 00:09:10 +02:00
|
|
|
}
|
|
|
|
}
|
2009-11-10 07:46:40 +01:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
/// Return a Location stored to by the specified instruction. If isRemovable
|
|
|
|
/// returns true, this function and getLocForRead completely describe the memory
|
|
|
|
/// operations for this instruction.
|
2018-01-21 03:10:54 +01:00
|
|
|
static MemoryLocation getLocForWrite(Instruction *Inst) {
|
2018-07-30 21:41:25 +02:00
|
|
|
|
2010-11-30 08:23:21 +01:00
|
|
|
if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
|
2015-06-04 04:03:15 +02:00
|
|
|
return MemoryLocation::get(SI);
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2018-04-23 21:06:49 +02:00
|
|
|
if (auto *MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
|
2010-11-30 08:23:21 +01:00
|
|
|
// memcpy/memmove/memset.
|
2015-06-17 09:18:54 +02:00
|
|
|
MemoryLocation Loc = MemoryLocation::getForDest(MI);
|
2010-11-30 08:23:21 +01:00
|
|
|
return Loc;
|
|
|
|
}
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2018-01-21 03:10:54 +01:00
|
|
|
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
|
|
|
|
switch (II->getIntrinsicID()) {
|
|
|
|
default:
|
|
|
|
return MemoryLocation(); // Unhandled intrinsic.
|
|
|
|
case Intrinsic::init_trampoline:
|
|
|
|
return MemoryLocation(II->getArgOperand(0));
|
|
|
|
case Intrinsic::lifetime_end: {
|
|
|
|
uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
|
|
|
|
return MemoryLocation(II->getArgOperand(1), Len);
|
|
|
|
}
|
|
|
|
}
|
2010-11-30 08:23:21 +01:00
|
|
|
}
|
2020-04-23 18:15:04 +02:00
|
|
|
if (auto *CB = dyn_cast<CallBase>(Inst))
|
2018-01-21 03:10:54 +01:00
|
|
|
// All the supported TLI functions so far happen to have dest as their
|
|
|
|
// first argument.
|
2020-04-23 18:15:04 +02:00
|
|
|
return MemoryLocation(CB->getArgOperand(0));
|
2018-01-21 03:10:54 +01:00
|
|
|
return MemoryLocation();
|
2010-11-30 08:23:21 +01:00
|
|
|
}
|
|
|
|
|
2018-01-21 02:44:33 +01:00
|
|
|
/// Return the location read by the specified "hasAnalyzableMemoryWrite"
|
|
|
|
/// instruction if any.
|
2015-08-12 20:01:44 +02:00
|
|
|
static MemoryLocation getLocForRead(Instruction *Inst,
|
|
|
|
const TargetLibraryInfo &TLI) {
|
2018-01-21 02:44:33 +01:00
|
|
|
assert(hasAnalyzableMemoryWrite(Inst, TLI) && "Unknown instruction case");
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2010-12-06 02:48:06 +01:00
|
|
|
// The only instructions that both read and write are the mem transfer
|
|
|
|
// instructions (memcpy/memmove).
|
2018-04-23 21:06:49 +02:00
|
|
|
if (auto *MTI = dyn_cast<AnyMemTransferInst>(Inst))
|
2015-06-04 04:03:15 +02:00
|
|
|
return MemoryLocation::getForSource(MTI);
|
2015-06-17 09:18:54 +02:00
|
|
|
return MemoryLocation();
|
2010-12-06 02:48:06 +01:00
|
|
|
}
|
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
/// If the value of this instruction and the memory it writes to is unused, may
|
|
|
|
/// we delete this instruction?
|
2010-11-30 06:30:45 +01:00
|
|
|
static bool isRemovable(Instruction *I) {
|
2011-08-18 00:22:24 +02:00
|
|
|
// Don't remove volatile/atomic stores.
|
2009-11-10 07:46:40 +01:00
|
|
|
if (StoreInst *SI = dyn_cast<StoreInst>(I))
|
2011-08-18 00:22:24 +02:00
|
|
|
return SI->isUnordered();
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2012-09-25 00:09:10 +02:00
|
|
|
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
|
|
|
|
switch (II->getIntrinsicID()) {
|
2018-01-21 02:44:33 +01:00
|
|
|
default: llvm_unreachable("doesn't pass 'hasAnalyzableMemoryWrite' predicate");
|
2012-09-25 00:09:10 +02:00
|
|
|
case Intrinsic::lifetime_end:
|
|
|
|
// Never remove dead lifetime_end's, e.g. because it is followed by a
|
|
|
|
// free.
|
|
|
|
return false;
|
|
|
|
case Intrinsic::init_trampoline:
|
|
|
|
// Always safe to remove init_trampoline.
|
|
|
|
return true;
|
|
|
|
case Intrinsic::memset:
|
|
|
|
case Intrinsic::memmove:
|
|
|
|
case Intrinsic::memcpy:
|
|
|
|
// Don't remove volatile memory intrinsics.
|
|
|
|
return !cast<MemIntrinsic>(II)->isVolatile();
|
2018-04-23 21:06:49 +02:00
|
|
|
case Intrinsic::memcpy_element_unordered_atomic:
|
|
|
|
case Intrinsic::memmove_element_unordered_atomic:
|
|
|
|
case Intrinsic::memset_element_unordered_atomic:
|
|
|
|
return true;
|
2012-09-25 00:09:10 +02:00
|
|
|
}
|
2010-11-30 20:12:10 +01:00
|
|
|
}
|
2012-09-25 00:09:10 +02:00
|
|
|
|
2018-01-21 02:44:33 +01:00
|
|
|
// note: only get here for calls with analyzable writes - i.e. libcalls
|
2020-04-23 18:15:04 +02:00
|
|
|
if (auto *CB = dyn_cast<CallBase>(I))
|
|
|
|
return CB->use_empty();
|
2012-09-25 00:09:10 +02:00
|
|
|
|
|
|
|
return false;
|
2009-11-10 07:46:40 +01:00
|
|
|
}
|
|
|
|
|
2016-04-22 21:51:29 +02:00
|
|
|
/// Returns true if the end of this instruction can be safely shortened in
|
2011-11-10 00:07:35 +01:00
|
|
|
/// length.
|
2016-04-22 21:51:29 +02:00
|
|
|
static bool isShortenableAtTheEnd(Instruction *I) {
|
2011-11-10 00:07:35 +01:00
|
|
|
// Don't shorten stores for now
|
|
|
|
if (isa<StoreInst>(I))
|
|
|
|
return false;
|
2012-07-24 12:51:42 +02:00
|
|
|
|
2012-09-25 00:09:10 +02:00
|
|
|
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
|
|
|
|
switch (II->getIntrinsicID()) {
|
|
|
|
default: return false;
|
|
|
|
case Intrinsic::memset:
|
|
|
|
case Intrinsic::memcpy:
|
2018-05-10 17:12:49 +02:00
|
|
|
case Intrinsic::memcpy_element_unordered_atomic:
|
|
|
|
case Intrinsic::memset_element_unordered_atomic:
|
2012-09-25 00:09:10 +02:00
|
|
|
// Do shorten memory intrinsics.
|
2016-04-22 21:51:29 +02:00
|
|
|
// FIXME: Add memmove if it's also safe to transform.
|
2012-09-25 00:09:10 +02:00
|
|
|
return true;
|
|
|
|
}
|
2011-11-10 00:07:35 +01:00
|
|
|
}
|
2012-09-25 00:09:10 +02:00
|
|
|
|
|
|
|
// Don't shorten libcalls calls for now.
|
|
|
|
|
|
|
|
return false;
|
2011-11-10 00:07:35 +01:00
|
|
|
}
|
|
|
|
|
2016-04-22 21:51:29 +02:00
|
|
|
/// Returns true if the beginning of this instruction can be safely shortened
|
|
|
|
/// in length.
|
|
|
|
static bool isShortenableAtTheBeginning(Instruction *I) {
|
|
|
|
// FIXME: Handle only memset for now. Supporting memcpy/memmove should be
|
|
|
|
// easily done by offsetting the source address.
|
2018-05-10 17:12:49 +02:00
|
|
|
return isa<AnyMemSetInst>(I);
|
2016-04-22 21:51:29 +02:00
|
|
|
}
|
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
/// Return the pointer that is being written to.
|
2010-11-30 22:58:14 +01:00
|
|
|
static Value *getStoredPointerOperand(Instruction *I) {
|
2018-01-21 02:44:33 +01:00
|
|
|
//TODO: factor this to reuse getLocForWrite
|
2018-01-21 03:10:54 +01:00
|
|
|
MemoryLocation Loc = getLocForWrite(I);
|
|
|
|
assert(Loc.Ptr &&
|
2018-06-14 07:41:49 +02:00
|
|
|
"unable to find pointer written for analyzable instruction?");
|
2018-01-21 03:10:54 +01:00
|
|
|
// TODO: most APIs don't expect const Value *
|
|
|
|
return const_cast<Value*>(Loc.Ptr);
|
2009-11-10 07:46:40 +01:00
|
|
|
}
|
|
|
|
|
2015-03-10 03:37:25 +01:00
|
|
|
static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
|
llvm: Add support for "-fno-delete-null-pointer-checks"
Summary:
Support for this option is needed for building Linux kernel.
This is a very frequently requested feature by kernel developers.
More details : https://lkml.org/lkml/2018/4/4/601
GCC option description for -fdelete-null-pointer-checks:
This Assume that programs cannot safely dereference null pointers,
and that no code or data element resides at address zero.
-fno-delete-null-pointer-checks is the inverse of this implying that
null pointer dereferencing is not undefined.
This feature is implemented in LLVM IR in this CL as the function attribute
"null-pointer-is-valid"="true" in IR (Under review at D47894).
The CL updates several passes that assumed null pointer dereferencing is
undefined to not optimize when the "null-pointer-is-valid"="true"
attribute is present.
Reviewers: t.p.northover, efriedma, jyknight, chandlerc, rnk, srhines, void, george.burgess.iv
Reviewed By: efriedma, george.burgess.iv
Subscribers: eraman, haicheng, george.burgess.iv, drinkcat, theraven, reames, sanjoy, xbolva00, llvm-commits
Differential Revision: https://reviews.llvm.org/D47895
llvm-svn: 336613
2018-07-10 00:27:23 +02:00
|
|
|
const TargetLibraryInfo &TLI,
|
|
|
|
const Function *F) {
|
2012-06-21 17:45:28 +02:00
|
|
|
uint64_t Size;
|
llvm: Add support for "-fno-delete-null-pointer-checks"
Summary:
Support for this option is needed for building Linux kernel.
This is a very frequently requested feature by kernel developers.
More details : https://lkml.org/lkml/2018/4/4/601
GCC option description for -fdelete-null-pointer-checks:
This Assume that programs cannot safely dereference null pointers,
and that no code or data element resides at address zero.
-fno-delete-null-pointer-checks is the inverse of this implying that
null pointer dereferencing is not undefined.
This feature is implemented in LLVM IR in this CL as the function attribute
"null-pointer-is-valid"="true" in IR (Under review at D47894).
The CL updates several passes that assumed null pointer dereferencing is
undefined to not optimize when the "null-pointer-is-valid"="true"
attribute is present.
Reviewers: t.p.northover, efriedma, jyknight, chandlerc, rnk, srhines, void, george.burgess.iv
Reviewed By: efriedma, george.burgess.iv
Subscribers: eraman, haicheng, george.burgess.iv, drinkcat, theraven, reames, sanjoy, xbolva00, llvm-commits
Differential Revision: https://reviews.llvm.org/D47895
llvm-svn: 336613
2018-07-10 00:27:23 +02:00
|
|
|
ObjectSizeOpts Opts;
|
|
|
|
Opts.NullIsUnknownSize = NullPointerIsDefined(F);
|
|
|
|
|
|
|
|
if (getObjectSize(V, Size, DL, &TLI, Opts))
|
2012-06-21 17:45:28 +02:00
|
|
|
return Size;
|
2015-06-17 09:21:38 +02:00
|
|
|
return MemoryLocation::UnknownSize;
|
2010-12-01 00:43:23 +01:00
|
|
|
}
|
2010-11-30 20:34:42 +01:00
|
|
|
|
2011-11-10 00:07:35 +01:00
|
|
|
namespace {

/// Relationship between a later write and an earlier write.
enum OverwriteResult {
  /// The beginning of the earlier location is overwritten by the later one.
  OW_Begin,
  /// The later write completely overwrites the earlier one.
  OW_Complete,
  /// The end of the earlier location is completely overwritten by the later
  /// one.
  OW_End,
  /// An earlier (big) store is overwritten by a later (smaller) store that
  /// does not write outside the big store's memory locations.
  OW_PartialEarlierWithFullLater,
  /// Nothing can be determined.
  OW_Unknown
};

} // end anonymous namespace
|
2011-11-10 00:07:35 +01:00
|
|
|
|
2017-03-29 16:42:27 +02:00
|
|
|
/// Return 'OW_Complete' if a store to the 'Later' location completely
|
|
|
|
/// overwrites a store to the 'Earlier' location, 'OW_End' if the end of the
|
|
|
|
/// 'Earlier' location is completely overwritten by 'Later', 'OW_Begin' if the
|
2017-09-26 15:54:28 +02:00
|
|
|
/// beginning of the 'Earlier' location is overwritten by 'Later'.
|
|
|
|
/// 'OW_PartialEarlierWithFullLater' means that an earlier (big) store was
|
|
|
|
/// overwritten by a later (smaller) store which doesn't write outside the big
|
|
|
|
/// store's memory locations. Returns 'OW_Unknown' if nothing can be determined.
|
2015-06-17 09:18:54 +02:00
|
|
|
static OverwriteResult isOverwrite(const MemoryLocation &Later,
|
|
|
|
const MemoryLocation &Earlier,
|
2015-03-10 03:37:25 +01:00
|
|
|
const DataLayout &DL,
|
2015-08-12 20:01:44 +02:00
|
|
|
const TargetLibraryInfo &TLI,
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
int64_t &EarlierOff, int64_t &LaterOff,
|
|
|
|
Instruction *DepWrite,
|
2018-05-03 13:03:53 +02:00
|
|
|
InstOverlapIntervalsTy &IOL,
|
llvm: Add support for "-fno-delete-null-pointer-checks"
Summary:
Support for this option is needed for building Linux kernel.
This is a very frequently requested feature by kernel developers.
More details : https://lkml.org/lkml/2018/4/4/601
GCC option description for -fdelete-null-pointer-checks:
This Assume that programs cannot safely dereference null pointers,
and that no code or data element resides at address zero.
-fno-delete-null-pointer-checks is the inverse of this implying that
null pointer dereferencing is not undefined.
This feature is implemented in LLVM IR in this CL as the function attribute
"null-pointer-is-valid"="true" in IR (Under review at D47894).
The CL updates several passes that assumed null pointer dereferencing is
undefined to not optimize when the "null-pointer-is-valid"="true"
attribute is present.
Reviewers: t.p.northover, efriedma, jyknight, chandlerc, rnk, srhines, void, george.burgess.iv
Reviewed By: efriedma, george.burgess.iv
Subscribers: eraman, haicheng, george.burgess.iv, drinkcat, theraven, reames, sanjoy, xbolva00, llvm-commits
Differential Revision: https://reviews.llvm.org/D47895
llvm-svn: 336613
2018-07-10 00:27:23 +02:00
|
|
|
AliasAnalysis &AA,
|
|
|
|
const Function *F) {
|
2018-10-10 08:39:40 +02:00
|
|
|
// FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll
|
|
|
|
// get imprecise values here, though (except for unknown sizes).
|
|
|
|
if (!Later.Size.isPrecise() || !Earlier.Size.isPrecise())
|
2017-03-29 16:42:27 +02:00
|
|
|
return OW_Unknown;
|
2016-06-16 00:17:38 +02:00
|
|
|
|
2018-10-09 05:18:56 +02:00
|
|
|
const uint64_t LaterSize = Later.Size.getValue();
|
|
|
|
const uint64_t EarlierSize = Earlier.Size.getValue();
|
2018-10-09 04:14:33 +02:00
|
|
|
|
2010-12-01 00:05:20 +01:00
|
|
|
const Value *P1 = Earlier.Ptr->stripPointerCasts();
|
|
|
|
const Value *P2 = Later.Ptr->stripPointerCasts();
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2010-12-01 00:05:20 +01:00
|
|
|
// If the start pointers are the same, we just have to compare sizes to see if
|
|
|
|
// the later store was larger than the earlier store.
|
2018-05-03 13:03:53 +02:00
|
|
|
if (P1 == P2 || AA.isMustAlias(P1, P2)) {
|
2010-12-01 00:05:20 +01:00
|
|
|
// Make sure that the Later size is >= the Earlier size.
|
2018-10-09 04:14:33 +02:00
|
|
|
if (LaterSize >= EarlierSize)
|
2017-03-29 16:42:27 +02:00
|
|
|
return OW_Complete;
|
2010-12-01 00:05:20 +01:00
|
|
|
}
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2010-12-01 00:43:23 +01:00
|
|
|
// Check to see if the later store is to the entire object (either a global,
|
2014-01-28 03:38:36 +01:00
|
|
|
// an alloca, or a byval/inalloca argument). If so, then it clearly
|
|
|
|
// overwrites any other store to the same object.
|
2014-02-21 19:34:28 +01:00
|
|
|
const Value *UO1 = GetUnderlyingObject(P1, DL),
|
|
|
|
*UO2 = GetUnderlyingObject(P2, DL);
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2010-12-01 00:43:23 +01:00
|
|
|
// If we can't resolve the same pointers to the same object, then we can't
|
|
|
|
// analyze them at all.
|
|
|
|
if (UO1 != UO2)
|
2017-03-29 16:42:27 +02:00
|
|
|
return OW_Unknown;
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2010-12-01 00:43:23 +01:00
|
|
|
// If the "Later" store is to a recognizable object, get its size.
|
llvm: Add support for "-fno-delete-null-pointer-checks"
Summary:
Support for this option is needed for building Linux kernel.
This is a very frequently requested feature by kernel developers.
More details : https://lkml.org/lkml/2018/4/4/601
GCC option description for -fdelete-null-pointer-checks:
Assume that programs cannot safely dereference null pointers,
and that no code or data element resides at address zero.
-fno-delete-null-pointer-checks is the inverse of this implying that
null pointer dereferencing is not undefined.
This feature is implemented in LLVM IR in this CL as the function attribute
"null-pointer-is-valid"="true" in IR (Under review at D47894).
The CL updates several passes that assumed null pointer dereferencing is
undefined to not optimize when the "null-pointer-is-valid"="true"
attribute is present.
Reviewers: t.p.northover, efriedma, jyknight, chandlerc, rnk, srhines, void, george.burgess.iv
Reviewed By: efriedma, george.burgess.iv
Subscribers: eraman, haicheng, george.burgess.iv, drinkcat, theraven, reames, sanjoy, xbolva00, llvm-commits
Differential Revision: https://reviews.llvm.org/D47895
llvm-svn: 336613
2018-07-10 00:27:23 +02:00
|
|
|
uint64_t ObjectSize = getPointerSize(UO2, DL, TLI, F);
|
2015-06-17 09:21:38 +02:00
|
|
|
if (ObjectSize != MemoryLocation::UnknownSize)
|
2018-10-09 04:14:33 +02:00
|
|
|
if (ObjectSize == LaterSize && ObjectSize >= EarlierSize)
|
2017-03-29 16:42:27 +02:00
|
|
|
return OW_Complete;
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2010-12-01 00:05:20 +01:00
|
|
|
// Okay, we have stores to two completely different pointers. Try to
|
|
|
|
// decompose the pointer into a "base + constant_offset" form. If the base
|
|
|
|
// pointers are equal, then we can reason about the two stores.
|
2011-11-10 00:07:35 +01:00
|
|
|
EarlierOff = 0;
|
|
|
|
LaterOff = 0;
|
2014-02-21 19:34:28 +01:00
|
|
|
const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, DL);
|
|
|
|
const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, DL);
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2010-12-01 00:05:20 +01:00
|
|
|
// If the base pointers still differ, we have two completely different stores.
|
|
|
|
if (BP1 != BP2)
|
2017-03-29 16:42:27 +02:00
|
|
|
return OW_Unknown;
|
2011-03-26 02:20:37 +01:00
|
|
|
|
2011-03-26 09:02:59 +01:00
|
|
|
// The later store completely overlaps the earlier store if:
|
2011-09-06 20:14:09 +02:00
|
|
|
//
|
2011-03-26 09:02:59 +01:00
|
|
|
// 1. Both start at the same offset and the later one's size is greater than
|
|
|
|
// or equal to the earlier one's, or
|
|
|
|
//
|
|
|
|
// |--earlier--|
|
|
|
|
// |-- later --|
|
2011-09-06 20:14:09 +02:00
|
|
|
//
|
2011-03-26 09:02:59 +01:00
|
|
|
// 2. The earlier store has an offset greater than the later offset, but which
|
|
|
|
// still lies completely within the later store.
|
|
|
|
//
|
|
|
|
// |--earlier--|
|
|
|
|
// |----- later ------|
|
2011-03-30 23:37:19 +02:00
|
|
|
//
|
|
|
|
// We have to be careful here as *Off is signed while *.Size is unsigned.
|
2011-03-26 10:32:07 +01:00
|
|
|
if (EarlierOff >= LaterOff &&
|
2018-10-09 04:14:33 +02:00
|
|
|
LaterSize >= EarlierSize &&
|
|
|
|
uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize)
|
2017-03-29 16:42:27 +02:00
|
|
|
return OW_Complete;
|
2012-07-24 12:51:42 +02:00
|
|
|
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
// We may now overlap, although the overlap is not complete. There might also
|
|
|
|
// be other incomplete overlaps, and together, they might cover the complete
|
|
|
|
// earlier write.
|
|
|
|
// Note: The correctness of this logic depends on the fact that this function
|
|
|
|
// is not even called providing DepWrite when there are any intervening reads.
|
|
|
|
if (EnablePartialOverwriteTracking &&
|
2018-10-09 04:14:33 +02:00
|
|
|
LaterOff < int64_t(EarlierOff + EarlierSize) &&
|
|
|
|
int64_t(LaterOff + LaterSize) >= EarlierOff) {
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
|
|
|
|
// Insert our part of the overlap into the map.
|
|
|
|
auto &IM = IOL[DepWrite];
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff
|
2018-10-09 04:14:33 +02:00
|
|
|
<< ", " << int64_t(EarlierOff + EarlierSize)
|
2018-05-14 14:53:11 +02:00
|
|
|
<< ") Later [" << LaterOff << ", "
|
2018-10-09 04:14:33 +02:00
|
|
|
<< int64_t(LaterOff + LaterSize) << ")\n");
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
|
|
|
|
// Make sure that we only insert non-overlapping intervals and combine
|
|
|
|
// adjacent intervals. The intervals are stored in the map with the ending
|
|
|
|
// offset as the key (in the half-open sense) and the starting offset as
|
|
|
|
// the value.
|
2018-10-09 04:14:33 +02:00
|
|
|
int64_t LaterIntStart = LaterOff, LaterIntEnd = LaterOff + LaterSize;
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
|
|
|
|
// Find any intervals ending at, or after, LaterIntStart which start
|
|
|
|
// before LaterIntEnd.
|
|
|
|
auto ILI = IM.lower_bound(LaterIntStart);
|
2016-06-30 17:32:20 +02:00
|
|
|
if (ILI != IM.end() && ILI->second <= LaterIntEnd) {
|
|
|
|
// This existing interval is overlapped with the current store somewhere
|
|
|
|
// in [LaterIntStart, LaterIntEnd]. Merge them by erasing the existing
|
|
|
|
// intervals and adjusting our start and end.
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
LaterIntStart = std::min(LaterIntStart, ILI->second);
|
|
|
|
LaterIntEnd = std::max(LaterIntEnd, ILI->first);
|
|
|
|
ILI = IM.erase(ILI);
|
|
|
|
|
2016-06-30 17:32:20 +02:00
|
|
|
// Continue erasing and adjusting our end in case other previous
|
|
|
|
// intervals are also overlapped with the current store.
|
|
|
|
//
|
|
|
|
// |--- earlier 1 ---| |--- earlier 2 ---|
|
|
|
|
// |------- later---------|
|
|
|
|
//
|
|
|
|
while (ILI != IM.end() && ILI->second <= LaterIntEnd) {
|
|
|
|
assert(ILI->second > LaterIntStart && "Unexpected interval");
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
LaterIntEnd = std::max(LaterIntEnd, ILI->first);
|
2016-06-30 17:32:20 +02:00
|
|
|
ILI = IM.erase(ILI);
|
|
|
|
}
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
IM[LaterIntEnd] = LaterIntStart;
|
|
|
|
|
|
|
|
ILI = IM.begin();
|
|
|
|
if (ILI->second <= EarlierOff &&
|
2018-10-09 04:14:33 +02:00
|
|
|
ILI->first >= int64_t(EarlierOff + EarlierSize)) {
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier ["
|
|
|
|
<< EarlierOff << ", "
|
2018-10-09 04:14:33 +02:00
|
|
|
<< int64_t(EarlierOff + EarlierSize)
|
2018-05-14 14:53:11 +02:00
|
|
|
<< ") Composite Later [" << ILI->second << ", "
|
|
|
|
<< ILI->first << ")\n");
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
++NumCompletePartials;
|
2017-03-29 16:42:27 +02:00
|
|
|
return OW_Complete;
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-09-26 15:54:28 +02:00
|
|
|
// Check for an earlier store which writes to all the memory locations that
|
|
|
|
// the later store writes to.
|
|
|
|
if (EnablePartialStoreMerging && LaterOff >= EarlierOff &&
|
2018-10-09 04:14:33 +02:00
|
|
|
int64_t(EarlierOff + EarlierSize) > LaterOff &&
|
|
|
|
uint64_t(LaterOff - EarlierOff) + LaterSize <= EarlierSize) {
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG(dbgs() << "DSE: Partial overwrite an earlier load ["
|
|
|
|
<< EarlierOff << ", "
|
2018-10-09 04:14:33 +02:00
|
|
|
<< int64_t(EarlierOff + EarlierSize)
|
2018-05-14 14:53:11 +02:00
|
|
|
<< ") by a later store [" << LaterOff << ", "
|
2018-10-09 04:14:33 +02:00
|
|
|
<< int64_t(LaterOff + LaterSize) << ")\n");
|
2017-09-26 15:54:28 +02:00
|
|
|
// TODO: Maybe come up with a better name?
|
|
|
|
return OW_PartialEarlierWithFullLater;
|
|
|
|
}
|
|
|
|
|
2016-04-22 21:51:29 +02:00
|
|
|
// Another interesting case is if the later store overwrites the end of the
|
|
|
|
// earlier store.
|
2011-11-10 00:07:35 +01:00
|
|
|
//
|
|
|
|
// |--earlier--|
|
|
|
|
// |-- later --|
|
|
|
|
//
|
|
|
|
// In this case we may want to trim the size of earlier to avoid generating
|
|
|
|
// writes to addresses which will definitely be overwritten later
|
2016-07-22 20:27:24 +02:00
|
|
|
if (!EnablePartialOverwriteTracking &&
|
2018-10-09 04:14:33 +02:00
|
|
|
(LaterOff > EarlierOff && LaterOff < int64_t(EarlierOff + EarlierSize) &&
|
|
|
|
int64_t(LaterOff + LaterSize) >= int64_t(EarlierOff + EarlierSize)))
|
2017-03-29 16:42:27 +02:00
|
|
|
return OW_End;
|
2011-03-26 09:02:59 +01:00
|
|
|
|
2016-04-22 21:51:29 +02:00
|
|
|
// Finally, we also need to check if the later store overwrites the beginning
|
|
|
|
// of the earlier store.
|
|
|
|
//
|
|
|
|
// |--earlier--|
|
|
|
|
// |-- later --|
|
|
|
|
//
|
|
|
|
// In this case we may want to move the destination address and trim the size
|
|
|
|
// of earlier to avoid generating writes to addresses which will definitely
|
|
|
|
// be overwritten later.
|
2016-07-22 20:27:24 +02:00
|
|
|
if (!EnablePartialOverwriteTracking &&
|
2018-10-09 04:14:33 +02:00
|
|
|
(LaterOff <= EarlierOff && int64_t(LaterOff + LaterSize) > EarlierOff)) {
|
|
|
|
assert(int64_t(LaterOff + LaterSize) < int64_t(EarlierOff + EarlierSize) &&
|
2017-03-29 16:42:27 +02:00
|
|
|
"Expect to be handled as OW_Complete");
|
|
|
|
return OW_Begin;
|
2016-04-22 21:51:29 +02:00
|
|
|
}
|
2011-03-26 09:02:59 +01:00
|
|
|
// Otherwise, they don't completely overlap.
|
2017-03-29 16:42:27 +02:00
|
|
|
return OW_Unknown;
|
2009-11-05 00:20:12 +01:00
|
|
|
}
|
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
/// If 'Inst' might be a self read (i.e. a noop copy of a
|
2010-12-06 02:48:06 +01:00
|
|
|
/// memory region into an identical pointer) then it doesn't actually make its
|
2011-09-06 20:14:09 +02:00
|
|
|
/// input dead in the traditional sense. Consider this case:
|
2010-12-06 02:48:06 +01:00
|
|
|
///
|
2018-02-21 00:19:34 +01:00
|
|
|
/// memmove(A <- B)
|
|
|
|
/// memmove(A <- A)
|
2010-12-06 02:48:06 +01:00
|
|
|
///
|
|
|
|
/// In this case, the second store to A does not make the first store to A dead.
|
|
|
|
/// The usual situation isn't an explicit A<-A store like this (which can be
|
|
|
|
/// trivially removed) but a case where two pointers may alias.
|
|
|
|
///
|
|
|
|
/// This function detects when it is unsafe to remove a dependent instruction
|
|
|
|
/// because the DSE inducing instruction may be a self-read.
|
|
|
|
static bool isPossibleSelfRead(Instruction *Inst,
|
2015-06-17 09:18:54 +02:00
|
|
|
const MemoryLocation &InstStoreLoc,
|
2015-08-12 20:01:44 +02:00
|
|
|
Instruction *DepWrite,
|
|
|
|
const TargetLibraryInfo &TLI,
|
|
|
|
AliasAnalysis &AA) {
|
2010-12-06 02:48:06 +01:00
|
|
|
// Self reads can only happen for instructions that read memory. Get the
|
|
|
|
// location read.
|
2015-08-12 20:01:44 +02:00
|
|
|
MemoryLocation InstReadLoc = getLocForRead(Inst, TLI);
|
2018-02-21 00:19:34 +01:00
|
|
|
if (!InstReadLoc.Ptr)
|
|
|
|
return false; // Not a reading instruction.
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2010-12-06 02:48:06 +01:00
|
|
|
// If the read and written loc obviously don't alias, it isn't a read.
|
2018-02-21 00:19:34 +01:00
|
|
|
if (AA.isNoAlias(InstReadLoc, InstStoreLoc))
|
2010-12-06 02:48:06 +01:00
|
|
|
return false;
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2018-04-23 21:06:49 +02:00
|
|
|
if (isa<AnyMemCpyInst>(Inst)) {
|
2018-02-21 00:19:34 +01:00
|
|
|
// LLVM's memcpy overlap semantics are not fully fleshed out (see PR11763)
|
|
|
|
// but in practice memcpy(A <- B) either means that A and B are disjoint or
|
|
|
|
// are equal (i.e. there are not partial overlaps). Given that, if we have:
|
|
|
|
//
|
|
|
|
// memcpy/memmove(A <- B) // DepWrite
|
|
|
|
// memcpy(A <- B) // Inst
|
|
|
|
//
|
|
|
|
// with Inst reading/writing a >= size than DepWrite, we can reason as
|
|
|
|
// follows:
|
|
|
|
//
|
|
|
|
// - If A == B then both the copies are no-ops, so the DepWrite can be
|
|
|
|
// removed.
|
|
|
|
// - If A != B then A and B are disjoint locations in Inst. Since
|
|
|
|
// Inst.size >= DepWrite.size A and B are disjoint in DepWrite too.
|
|
|
|
// Therefore DepWrite can be removed.
|
|
|
|
MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI);
|
|
|
|
|
|
|
|
if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr))
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2010-12-06 02:48:06 +01:00
|
|
|
// If DepWrite doesn't read memory or if we can't prove it is a must alias,
|
|
|
|
// then it can't be considered dead.
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-09-23 13:38:44 +02:00
|
|
|
/// Returns true if the memory which is accessed by the second instruction is not
|
|
|
|
/// modified between the first and the second instruction.
|
|
|
|
/// Precondition: Second instruction must be dominated by the first
|
2015-08-13 17:36:11 +02:00
|
|
|
/// instruction.
|
2016-05-17 23:38:13 +02:00
|
|
|
static bool memoryIsNotModifiedBetween(Instruction *FirstI,
|
|
|
|
Instruction *SecondI,
|
2020-02-21 23:40:22 +01:00
|
|
|
AliasAnalysis *AA,
|
|
|
|
const DataLayout &DL,
|
|
|
|
DominatorTree *DT) {
|
|
|
|
// Do a backwards scan through the CFG from SecondI to FirstI. Look for
|
|
|
|
// instructions which can modify the memory location accessed by SecondI.
|
|
|
|
//
|
|
|
|
// While doing the walk keep track of the address to check. It might be
|
|
|
|
// different in different basic blocks due to PHI translation.
|
|
|
|
using BlockAddressPair = std::pair<BasicBlock *, PHITransAddr>;
|
|
|
|
SmallVector<BlockAddressPair, 16> WorkList;
|
|
|
|
// Keep track of the address we visited each block with. Bail out if we
|
|
|
|
// visit a block with different addresses.
|
|
|
|
DenseMap<BasicBlock *, Value *> Visited;
|
|
|
|
|
2015-09-23 13:38:44 +02:00
|
|
|
BasicBlock::iterator FirstBBI(FirstI);
|
|
|
|
++FirstBBI;
|
|
|
|
BasicBlock::iterator SecondBBI(SecondI);
|
|
|
|
BasicBlock *FirstBB = FirstI->getParent();
|
|
|
|
BasicBlock *SecondBB = SecondI->getParent();
|
|
|
|
MemoryLocation MemLoc = MemoryLocation::get(SecondI);
|
2020-02-21 23:40:22 +01:00
|
|
|
auto *MemLocPtr = const_cast<Value *>(MemLoc.Ptr);
|
2015-08-13 17:36:11 +02:00
|
|
|
|
2020-02-01 00:18:59 +01:00
|
|
|
// Start checking the SecondBB.
|
2020-02-21 23:40:22 +01:00
|
|
|
WorkList.push_back(
|
|
|
|
std::make_pair(SecondBB, PHITransAddr(MemLocPtr, DL, nullptr)));
|
2015-08-13 17:36:11 +02:00
|
|
|
bool isFirstBlock = true;
|
|
|
|
|
2020-02-01 00:18:59 +01:00
|
|
|
// Check all blocks going backward until we reach the FirstBB.
|
2015-08-13 17:36:11 +02:00
|
|
|
while (!WorkList.empty()) {
|
2020-02-21 23:40:22 +01:00
|
|
|
BlockAddressPair Current = WorkList.pop_back_val();
|
|
|
|
BasicBlock *B = Current.first;
|
|
|
|
PHITransAddr &Addr = Current.second;
|
|
|
|
Value *Ptr = Addr.getAddr();
|
2015-08-13 17:36:11 +02:00
|
|
|
|
2020-02-01 00:18:59 +01:00
|
|
|
// Ignore instructions before FirstI if this is the FirstBB.
|
2015-09-23 13:38:44 +02:00
|
|
|
BasicBlock::iterator BI = (B == FirstBB ? FirstBBI : B->begin());
|
2015-08-13 17:36:11 +02:00
|
|
|
|
|
|
|
BasicBlock::iterator EI;
|
|
|
|
if (isFirstBlock) {
|
2020-02-01 00:18:59 +01:00
|
|
|
// Ignore instructions after SecondI if this is the first visit of SecondBB.
|
2015-09-23 13:38:44 +02:00
|
|
|
assert(B == SecondBB && "first block is not the store block");
|
|
|
|
EI = SecondBBI;
|
2015-08-13 17:36:11 +02:00
|
|
|
isFirstBlock = false;
|
|
|
|
} else {
|
2015-09-23 13:38:44 +02:00
|
|
|
// It's not SecondBB or (in case of a loop) the second visit of SecondBB.
|
2020-02-01 00:18:59 +01:00
|
|
|
// In this case we also have to look at instructions after SecondI.
|
2015-08-13 17:36:11 +02:00
|
|
|
EI = B->end();
|
|
|
|
}
|
|
|
|
for (; BI != EI; ++BI) {
|
2015-10-13 20:26:00 +02:00
|
|
|
Instruction *I = &*BI;
|
2017-12-05 21:12:23 +01:00
|
|
|
if (I->mayWriteToMemory() && I != SecondI)
|
2020-02-21 23:40:22 +01:00
|
|
|
if (isModSet(AA->getModRefInfo(I, MemLoc.getWithNewPtr(Ptr))))
|
2015-08-13 17:36:11 +02:00
|
|
|
return false;
|
|
|
|
}
|
2015-09-23 13:38:44 +02:00
|
|
|
if (B != FirstBB) {
|
|
|
|
assert(B != &FirstBB->getParent()->getEntryBlock() &&
|
2015-08-13 17:36:11 +02:00
|
|
|
"Should not hit the entry block because SI must be dominated by LI");
|
2015-12-11 19:39:41 +01:00
|
|
|
for (auto PredI = pred_begin(B), PE = pred_end(B); PredI != PE; ++PredI) {
|
2020-02-21 23:40:22 +01:00
|
|
|
PHITransAddr PredAddr = Addr;
|
|
|
|
if (PredAddr.NeedsPHITranslationFromBlock(B)) {
|
|
|
|
if (!PredAddr.IsPotentiallyPHITranslatable())
|
|
|
|
return false;
|
|
|
|
if (PredAddr.PHITranslateValue(B, *PredI, DT, false))
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
Value *TranslatedPtr = PredAddr.getAddr();
|
|
|
|
auto Inserted = Visited.insert(std::make_pair(*PredI, TranslatedPtr));
|
|
|
|
if (!Inserted.second) {
|
|
|
|
// We already visited this block before. If it was with a different
|
|
|
|
// address - bail out!
|
|
|
|
if (TranslatedPtr != Inserted.first->second)
|
|
|
|
return false;
|
|
|
|
// ... otherwise just skip it.
|
2015-08-13 17:36:11 +02:00
|
|
|
continue;
|
2020-02-21 23:40:22 +01:00
|
|
|
}
|
|
|
|
WorkList.push_back(std::make_pair(*PredI, PredAddr));
|
2015-08-13 17:36:11 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2011-11-05 11:48:42 +01:00
|
|
|
/// Find all blocks that will unconditionally lead to the block BB and append
|
|
|
|
/// them to F.
|
2016-05-17 23:38:13 +02:00
|
|
|
static void findUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks,
|
2011-11-05 11:48:42 +01:00
|
|
|
BasicBlock *BB, DominatorTree *DT) {
|
2014-07-21 19:06:51 +02:00
|
|
|
for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
|
|
|
|
BasicBlock *Pred = *I;
|
2011-12-08 23:36:35 +01:00
|
|
|
if (Pred == BB) continue;
|
2018-10-15 12:04:59 +02:00
|
|
|
Instruction *PredTI = Pred->getTerminator();
|
2011-11-05 11:48:42 +01:00
|
|
|
if (PredTI->getNumSuccessors() != 1)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (DT->isReachableFromEntry(Pred))
|
|
|
|
Blocks.push_back(Pred);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
/// Handle frees of entire structures whose dependency is a store
|
2010-11-30 02:28:33 +01:00
|
|
|
/// to a field of that structure.
|
2016-05-17 23:38:13 +02:00
|
|
|
static bool handleFree(CallInst *F, AliasAnalysis *AA,
|
|
|
|
MemoryDependenceResults *MD, DominatorTree *DT,
|
2016-07-22 20:27:24 +02:00
|
|
|
const TargetLibraryInfo *TLI,
|
[IR] Lazily number instructions for local dominance queries
Essentially, fold OrderedBasicBlock into BasicBlock, and make it
auto-invalidate the instruction ordering when new instructions are
added. Notably, we don't need to invalidate it when removing
instructions, which is helpful when a pass mostly delete dead
instructions rather than transforming them.
The downside is that Instruction grows from 56 bytes to 64 bytes. The
resulting LLVM code is substantially simpler and automatically handles
invalidation, which makes me think that this is the right speed and size
tradeoff.
The important change is in SymbolTableTraitsImpl.h, where the numbering
is invalidated. Everything else should be straightforward.
We probably want to implement a fancier re-numbering scheme so that
local updates don't invalidate the ordering, but I plan for that to be
future work, maybe for someone else.
Reviewed By: lattner, vsk, fhahn, dexonsmith
Differential Revision: https://reviews.llvm.org/D51664
2020-02-18 23:33:54 +01:00
|
|
|
InstOverlapIntervalsTy &IOL,
|
2020-01-03 15:13:55 +01:00
|
|
|
MapVector<Instruction *, bool> &ThrowableInst) {
|
2011-06-15 02:47:34 +02:00
|
|
|
bool MadeChange = false;
|
|
|
|
|
2015-06-17 09:18:54 +02:00
|
|
|
MemoryLocation Loc = MemoryLocation(F->getOperand(0));
|
2011-11-05 11:48:42 +01:00
|
|
|
SmallVector<BasicBlock *, 16> Blocks;
|
|
|
|
Blocks.push_back(F->getParent());
|
2015-03-10 03:37:25 +01:00
|
|
|
const DataLayout &DL = F->getModule()->getDataLayout();
|
2011-06-15 02:47:34 +02:00
|
|
|
|
2011-11-05 11:48:42 +01:00
|
|
|
while (!Blocks.empty()) {
|
|
|
|
BasicBlock *BB = Blocks.pop_back_val();
|
|
|
|
Instruction *InstPt = BB->getTerminator();
|
|
|
|
if (BB == F->getParent()) InstPt = F;
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2015-10-13 20:26:00 +02:00
|
|
|
MemDepResult Dep =
|
|
|
|
MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB);
|
2011-11-05 11:48:42 +01:00
|
|
|
while (Dep.isDef() || Dep.isClobber()) {
|
|
|
|
Instruction *Dependency = Dep.getInst();
|
2018-01-21 02:44:33 +01:00
|
|
|
if (!hasAnalyzableMemoryWrite(Dependency, *TLI) ||
|
|
|
|
!isRemovable(Dependency))
|
2011-11-05 11:48:42 +01:00
|
|
|
break;
|
2008-01-20 11:49:23 +01:00
|
|
|
|
2011-11-05 11:48:42 +01:00
|
|
|
Value *DepPointer =
|
2015-03-10 03:37:25 +01:00
|
|
|
GetUnderlyingObject(getStoredPointerOperand(Dependency), DL);
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2011-11-05 11:48:42 +01:00
|
|
|
// Check for aliasing.
|
|
|
|
if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
|
|
|
|
break;
|
2010-11-12 03:19:17 +01:00
|
|
|
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG(
|
|
|
|
dbgs() << "DSE: Dead Store to soon to be freed memory:\n DEAD: "
|
|
|
|
<< *Dependency << '\n');
|
2016-07-19 18:50:57 +02:00
|
|
|
|
2016-06-10 19:59:22 +02:00
|
|
|
// DCE instructions only used to calculate that store.
|
2016-07-06 21:48:52 +02:00
|
|
|
BasicBlock::iterator BBI(Dependency);
|
[IR] Lazily number instructions for local dominance queries
Essentially, fold OrderedBasicBlock into BasicBlock, and make it
auto-invalidate the instruction ordering when new instructions are
added. Notably, we don't need to invalidate it when removing
instructions, which is helpful when a pass mostly delete dead
instructions rather than transforming them.
The downside is that Instruction grows from 56 bytes to 64 bytes. The
resulting LLVM code is substantially simpler and automatically handles
invalidation, which makes me think that this is the right speed and size
tradeoff.
The important change is in SymbolTableTraitsImpl.h, where the numbering
is invalidated. Everything else should be straightforward.
We probably want to implement a fancier re-numbering scheme so that
local updates don't invalidate the ordering, but I plan for that to be
future work, maybe for someone else.
Reviewed By: lattner, vsk, fhahn, dexonsmith
Differential Revision: https://reviews.llvm.org/D51664
2020-02-18 23:33:54 +01:00
|
|
|
deleteDeadInstruction(Dependency, &BBI, *MD, *TLI, IOL,
|
2020-01-03 15:13:55 +01:00
|
|
|
ThrowableInst);
|
2011-11-05 11:48:42 +01:00
|
|
|
++NumFastStores;
|
|
|
|
MadeChange = true;
|
|
|
|
|
|
|
|
// Inst's old Dependency is now deleted. Compute the next dependency,
|
|
|
|
// which may also be dead, as in
|
|
|
|
// s[0] = 0;
|
|
|
|
// s[1] = 0; // This has just been deleted.
|
|
|
|
// free(s);
|
2016-07-06 21:48:52 +02:00
|
|
|
Dep = MD->getPointerDependencyFrom(Loc, false, BBI, BB);
|
2011-11-05 11:48:42 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if (Dep.isNonLocal())
|
2016-05-17 23:38:13 +02:00
|
|
|
findUnconditionalPreds(Blocks, BB, DT);
|
2011-11-05 11:48:42 +01:00
|
|
|
}
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2011-06-15 02:47:34 +02:00
|
|
|
return MadeChange;
|
2007-07-12 01:19:17 +02:00
|
|
|
}
|
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
/// Check to see if the specified location may alias any of the stack objects in
|
|
|
|
/// the DeadStackObjects set. If so, they become live because the location is
|
|
|
|
/// being loaded.
|
|
|
|
static void removeAccessedObjects(const MemoryLocation &LoadedLoc,
|
Add "const" in GetUnderlyingObjects. NFC
Summary:
Both the input Value pointer and the returned Value
pointers in GetUnderlyingObjects are now declared as
const.
It turned out that all current (in-tree) uses of
GetUnderlyingObjects were trivial to update, being
satisfied with have those Value pointers declared
as const. Actually, in the past several of the users
had to use const_cast, just because of ValueTracking
not providing a version of GetUnderlyingObjects with
"const" Value pointers. With this patch we get rid
of those const casts.
Reviewers: hfinkel, materi, jkorous
Reviewed By: jkorous
Subscribers: dexonsmith, jkorous, jholewinski, sdardis, eraman, hiraditya, jrtc27, atanasyan, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61038
llvm-svn: 359072
2019-04-24 08:55:50 +02:00
|
|
|
SmallSetVector<const Value *, 16> &DeadStackObjects,
|
2016-05-17 23:38:13 +02:00
|
|
|
const DataLayout &DL, AliasAnalysis *AA,
|
llvm: Add support for "-fno-delete-null-pointer-checks"
Summary:
Support for this option is needed for building Linux kernel.
This is a very frequently requested feature by kernel developers.
More details : https://lkml.org/lkml/2018/4/4/601
GCC option description for -fdelete-null-pointer-checks:
This Assume that programs cannot safely dereference null pointers,
and that no code or data element resides at address zero.
-fno-delete-null-pointer-checks is the inverse of this implying that
null pointer dereferencing is not undefined.
This feature is implemented in LLVM IR in this CL as the function attribute
"null-pointer-is-valid"="true" in IR (Under review at D47894).
The CL updates several passes that assumed null pointer dereferencing is
undefined to not optimize when the "null-pointer-is-valid"="true"
attribute is present.
Reviewers: t.p.northover, efriedma, jyknight, chandlerc, rnk, srhines, void, george.burgess.iv
Reviewed By: efriedma, george.burgess.iv
Subscribers: eraman, haicheng, george.burgess.iv, drinkcat, theraven, reames, sanjoy, xbolva00, llvm-commits
Differential Revision: https://reviews.llvm.org/D47895
llvm-svn: 336613
2018-07-10 00:27:23 +02:00
|
|
|
const TargetLibraryInfo *TLI,
|
|
|
|
const Function *F) {
|
2016-05-17 23:38:13 +02:00
|
|
|
const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr, DL);
|
|
|
|
|
|
|
|
// A constant can't be in the dead pointer set.
|
|
|
|
if (isa<Constant>(UnderlyingPointer))
|
|
|
|
return;
|
|
|
|
|
|
|
|
// If the kill pointer can be easily reduced to an alloca, don't bother doing
|
|
|
|
// extraneous AA queries.
|
|
|
|
if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) {
|
Add "const" in GetUnderlyingObjects. NFC
Summary:
Both the input Value pointer and the returned Value
pointers in GetUnderlyingObjects are now declared as
const.
It turned out that all current (in-tree) uses of
GetUnderlyingObjects were trivial to update, being
satisfied with have those Value pointers declared
as const. Actually, in the past several of the users
had to use const_cast, just because of ValueTracking
not providing a version of GetUnderlyingObjects with
"const" Value pointers. With this patch we get rid
of those const casts.
Reviewers: hfinkel, materi, jkorous
Reviewed By: jkorous
Subscribers: dexonsmith, jkorous, jholewinski, sdardis, eraman, hiraditya, jrtc27, atanasyan, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61038
llvm-svn: 359072
2019-04-24 08:55:50 +02:00
|
|
|
DeadStackObjects.remove(UnderlyingPointer);
|
2016-05-17 23:38:13 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Remove objects that could alias LoadedLoc.
|
Add "const" in GetUnderlyingObjects. NFC
Summary:
Both the input Value pointer and the returned Value
pointers in GetUnderlyingObjects are now declared as
const.
It turned out that all current (in-tree) uses of
GetUnderlyingObjects were trivial to update, being
satisfied with have those Value pointers declared
as const. Actually, in the past several of the users
had to use const_cast, just because of ValueTracking
not providing a version of GetUnderlyingObjects with
"const" Value pointers. With this patch we get rid
of those const casts.
Reviewers: hfinkel, materi, jkorous
Reviewed By: jkorous
Subscribers: dexonsmith, jkorous, jholewinski, sdardis, eraman, hiraditya, jrtc27, atanasyan, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61038
llvm-svn: 359072
2019-04-24 08:55:50 +02:00
|
|
|
DeadStackObjects.remove_if([&](const Value *I) {
|
2016-05-17 23:38:13 +02:00
|
|
|
// See if the loaded location could alias the stack location.
|
llvm: Add support for "-fno-delete-null-pointer-checks"
Summary:
Support for this option is needed for building Linux kernel.
This is a very frequently requested feature by kernel developers.
More details : https://lkml.org/lkml/2018/4/4/601
GCC option description for -fdelete-null-pointer-checks:
This Assume that programs cannot safely dereference null pointers,
and that no code or data element resides at address zero.
-fno-delete-null-pointer-checks is the inverse of this implying that
null pointer dereferencing is not undefined.
This feature is implemented in LLVM IR in this CL as the function attribute
"null-pointer-is-valid"="true" in IR (Under review at D47894).
The CL updates several passes that assumed null pointer dereferencing is
undefined to not optimize when the "null-pointer-is-valid"="true"
attribute is present.
Reviewers: t.p.northover, efriedma, jyknight, chandlerc, rnk, srhines, void, george.burgess.iv
Reviewed By: efriedma, george.burgess.iv
Subscribers: eraman, haicheng, george.burgess.iv, drinkcat, theraven, reames, sanjoy, xbolva00, llvm-commits
Differential Revision: https://reviews.llvm.org/D47895
llvm-svn: 336613
2018-07-10 00:27:23 +02:00
|
|
|
MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI, F));
|
2016-05-17 23:38:13 +02:00
|
|
|
return !AA->isNoAlias(StackLoc, LoadedLoc);
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Remove dead stores to stack-allocated locations in the function end block.
|
|
|
|
/// Ex:
|
2007-08-08 19:50:09 +02:00
|
|
|
/// %A = alloca i32
|
|
|
|
/// ...
|
|
|
|
/// store i32 1, i32* %A
|
|
|
|
/// ret void
|
2016-05-17 23:38:13 +02:00
|
|
|
static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
|
2019-03-29 15:10:24 +01:00
|
|
|
MemoryDependenceResults *MD,
|
|
|
|
const TargetLibraryInfo *TLI,
|
[IR] Lazily number instructions for local dominance queries
Essentially, fold OrderedBasicBlock into BasicBlock, and make it
auto-invalidate the instruction ordering when new instructions are
added. Notably, we don't need to invalidate it when removing
instructions, which is helpful when a pass mostly delete dead
instructions rather than transforming them.
The downside is that Instruction grows from 56 bytes to 64 bytes. The
resulting LLVM code is substantially simpler and automatically handles
invalidation, which makes me think that this is the right speed and size
tradeoff.
The important change is in SymbolTableTraitsImpl.h, where the numbering
is invalidated. Everything else should be straightforward.
We probably want to implement a fancier re-numbering scheme so that
local updates don't invalidate the ordering, but I plan for that to be
future work, maybe for someone else.
Reviewed By: lattner, vsk, fhahn, dexonsmith
Differential Revision: https://reviews.llvm.org/D51664
2020-02-18 23:33:54 +01:00
|
|
|
InstOverlapIntervalsTy &IOL,
|
2020-01-03 15:13:55 +01:00
|
|
|
MapVector<Instruction *, bool> &ThrowableInst) {
|
2007-07-12 23:41:30 +02:00
|
|
|
bool MadeChange = false;
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2010-11-30 22:32:12 +01:00
|
|
|
// Keep track of all of the stack objects that are dead at the end of the
|
|
|
|
// function.
|
Add "const" in GetUnderlyingObjects. NFC
Summary:
Both the input Value pointer and the returned Value
pointers in GetUnderlyingObjects are now declared as
const.
It turned out that all current (in-tree) uses of
GetUnderlyingObjects were trivial to update, being
satisfied with have those Value pointers declared
as const. Actually, in the past several of the users
had to use const_cast, just because of ValueTracking
not providing a version of GetUnderlyingObjects with
"const" Value pointers. With this patch we get rid
of those const casts.
Reviewers: hfinkel, materi, jkorous
Reviewed By: jkorous
Subscribers: dexonsmith, jkorous, jholewinski, sdardis, eraman, hiraditya, jrtc27, atanasyan, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61038
llvm-svn: 359072
2019-04-24 08:55:50 +02:00
|
|
|
SmallSetVector<const Value*, 16> DeadStackObjects;
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2008-11-28 01:27:14 +01:00
|
|
|
// Find all of the alloca'd pointers in the entry block.
|
2015-10-13 20:26:00 +02:00
|
|
|
BasicBlock &Entry = BB.getParent()->front();
|
|
|
|
for (Instruction &I : Entry) {
|
|
|
|
if (isa<AllocaInst>(&I))
|
|
|
|
DeadStackObjects.insert(&I);
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2011-10-22 23:59:35 +02:00
|
|
|
// Okay, so these are dead heap objects, but if the pointer never escapes
|
|
|
|
// then it's leaked by this function anyways.
|
2015-10-13 20:26:00 +02:00
|
|
|
else if (isAllocLikeFn(&I, TLI) && !PointerMayBeCaptured(&I, true, true))
|
|
|
|
DeadStackObjects.insert(&I);
|
2011-10-22 23:59:35 +02:00
|
|
|
}
|
|
|
|
|
2014-01-28 03:38:36 +01:00
|
|
|
// Treat byval or inalloca arguments the same, stores to them are dead at the
|
|
|
|
// end of the function.
|
2015-10-13 20:26:00 +02:00
|
|
|
for (Argument &AI : BB.getParent()->args())
|
2020-04-30 03:50:17 +02:00
|
|
|
if (AI.hasPassPointeeByValueAttr())
|
2015-10-13 20:26:00 +02:00
|
|
|
DeadStackObjects.insert(&AI);
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2015-03-10 03:37:25 +01:00
|
|
|
const DataLayout &DL = BB.getModule()->getDataLayout();
|
|
|
|
|
2007-07-12 23:41:30 +02:00
|
|
|
// Scan the basic block backwards
|
|
|
|
for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){
|
|
|
|
--BBI;
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2015-08-20 10:58:47 +02:00
|
|
|
// If we find a store, check to see if it points into a dead stack value.
|
2018-01-21 02:44:33 +01:00
|
|
|
if (hasAnalyzableMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) {
|
2010-11-30 20:48:15 +01:00
|
|
|
// See through pointer-to-pointer bitcasts
|
Add "const" in GetUnderlyingObjects. NFC
Summary:
Both the input Value pointer and the returned Value
pointers in GetUnderlyingObjects are now declared as
const.
It turned out that all current (in-tree) uses of
GetUnderlyingObjects were trivial to update, being
satisfied with have those Value pointers declared
as const. Actually, in the past several of the users
had to use const_cast, just because of ValueTracking
not providing a version of GetUnderlyingObjects with
"const" Value pointers. With this patch we get rid
of those const casts.
Reviewers: hfinkel, materi, jkorous
Reviewed By: jkorous
Subscribers: dexonsmith, jkorous, jholewinski, sdardis, eraman, hiraditya, jrtc27, atanasyan, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61038
llvm-svn: 359072
2019-04-24 08:55:50 +02:00
|
|
|
SmallVector<const Value *, 4> Pointers;
|
2015-10-13 20:26:00 +02:00
|
|
|
GetUnderlyingObjects(getStoredPointerOperand(&*BBI), Pointers, DL);
|
2010-11-30 20:48:15 +01:00
|
|
|
|
2010-11-30 22:58:14 +01:00
|
|
|
// Stores to stack values are valid candidates for removal.
|
2012-05-10 20:57:38 +02:00
|
|
|
bool AllDead = true;
|
Add "const" in GetUnderlyingObjects. NFC
Summary:
Both the input Value pointer and the returned Value
pointers in GetUnderlyingObjects are now declared as
const.
It turned out that all current (in-tree) uses of
GetUnderlyingObjects were trivial to update, being
satisfied with have those Value pointers declared
as const. Actually, in the past several of the users
had to use const_cast, just because of ValueTracking
not providing a version of GetUnderlyingObjects with
"const" Value pointers. With this patch we get rid
of those const casts.
Reviewers: hfinkel, materi, jkorous
Reviewed By: jkorous
Subscribers: dexonsmith, jkorous, jholewinski, sdardis, eraman, hiraditya, jrtc27, atanasyan, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61038
llvm-svn: 359072
2019-04-24 08:55:50 +02:00
|
|
|
for (const Value *Pointer : Pointers)
|
2016-06-26 14:28:59 +02:00
|
|
|
if (!DeadStackObjects.count(Pointer)) {
|
2012-05-10 20:57:38 +02:00
|
|
|
AllDead = false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (AllDead) {
|
2016-07-06 21:48:52 +02:00
|
|
|
Instruction *Dead = &*BBI;
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: "
|
|
|
|
<< *Dead << "\n Objects: ";
|
Add "const" in GetUnderlyingObjects. NFC
Summary:
Both the input Value pointer and the returned Value
pointers in GetUnderlyingObjects are now declared as
const.
It turned out that all current (in-tree) uses of
GetUnderlyingObjects were trivial to update, being
satisfied with have those Value pointers declared
as const. Actually, in the past several of the users
had to use const_cast, just because of ValueTracking
not providing a version of GetUnderlyingObjects with
"const" Value pointers. With this patch we get rid
of those const casts.
Reviewers: hfinkel, materi, jkorous
Reviewed By: jkorous
Subscribers: dexonsmith, jkorous, jholewinski, sdardis, eraman, hiraditya, jrtc27, atanasyan, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61038
llvm-svn: 359072
2019-04-24 08:55:50 +02:00
|
|
|
for (SmallVectorImpl<const Value *>::iterator I =
|
|
|
|
Pointers.begin(),
|
2018-05-14 14:53:11 +02:00
|
|
|
E = Pointers.end();
|
|
|
|
I != E; ++I) {
|
|
|
|
dbgs() << **I;
|
|
|
|
if (std::next(I) != E)
|
|
|
|
dbgs() << ", ";
|
|
|
|
} dbgs()
|
|
|
|
<< '\n');
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2010-12-06 22:13:51 +01:00
|
|
|
// DCE instructions only used to calculate that store.
|
[IR] Lazily number instructions for local dominance queries
Essentially, fold OrderedBasicBlock into BasicBlock, and make it
auto-invalidate the instruction ordering when new instructions are
added. Notably, we don't need to invalidate it when removing
instructions, which is helpful when a pass mostly delete dead
instructions rather than transforming them.
The downside is that Instruction grows from 56 bytes to 64 bytes. The
resulting LLVM code is substantially simpler and automatically handles
invalidation, which makes me think that this is the right speed and size
tradeoff.
The important change is in SymbolTableTraitsImpl.h, where the numbering
is invalidated. Everything else should be straightforward.
We probably want to implement a fancier re-numbering scheme so that
local updates don't invalidate the ordering, but I plan for that to be
future work, maybe for someone else.
Reviewed By: lattner, vsk, fhahn, dexonsmith
Differential Revision: https://reviews.llvm.org/D51664
2020-02-18 23:33:54 +01:00
|
|
|
deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, ThrowableInst,
|
2019-03-29 15:10:24 +01:00
|
|
|
&DeadStackObjects);
|
2010-11-30 20:48:15 +01:00
|
|
|
++NumFastStores;
|
|
|
|
MadeChange = true;
|
2011-08-30 23:11:06 +02:00
|
|
|
continue;
|
2010-11-30 20:48:15 +01:00
|
|
|
}
|
|
|
|
}
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2010-11-30 20:48:15 +01:00
|
|
|
// Remove any dead non-memory-mutating instructions.
|
2015-10-13 20:26:00 +02:00
|
|
|
if (isInstructionTriviallyDead(&*BBI, TLI)) {
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: "
|
|
|
|
<< *&*BBI << '\n');
|
[IR] Lazily number instructions for local dominance queries
Essentially, fold OrderedBasicBlock into BasicBlock, and make it
auto-invalidate the instruction ordering when new instructions are
added. Notably, we don't need to invalidate it when removing
instructions, which is helpful when a pass mostly delete dead
instructions rather than transforming them.
The downside is that Instruction grows from 56 bytes to 64 bytes. The
resulting LLVM code is substantially simpler and automatically handles
invalidation, which makes me think that this is the right speed and size
tradeoff.
The important change is in SymbolTableTraitsImpl.h, where the numbering
is invalidated. Everything else should be straightforward.
We probably want to implement a fancier re-numbering scheme so that
local updates don't invalidate the ordering, but I plan for that to be
future work, maybe for someone else.
Reviewed By: lattner, vsk, fhahn, dexonsmith
Differential Revision: https://reviews.llvm.org/D51664
2020-02-18 23:33:54 +01:00
|
|
|
deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, ThrowableInst,
|
2019-03-29 15:10:24 +01:00
|
|
|
&DeadStackObjects);
|
2010-11-30 20:48:15 +01:00
|
|
|
++NumFastOther;
|
|
|
|
MadeChange = true;
|
|
|
|
continue;
|
|
|
|
}
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2012-08-08 04:17:32 +02:00
|
|
|
if (isa<AllocaInst>(BBI)) {
|
|
|
|
// Remove allocas from the list of dead stack objects; there can't be
|
|
|
|
// any references before the definition.
|
2015-10-13 20:26:00 +02:00
|
|
|
DeadStackObjects.remove(&*BBI);
|
2012-05-10 19:14:00 +02:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2019-01-07 06:42:51 +01:00
|
|
|
if (auto *Call = dyn_cast<CallBase>(&*BBI)) {
|
2016-05-17 23:38:13 +02:00
|
|
|
// Remove allocation function calls from the list of dead stack objects;
|
2012-08-08 04:17:32 +02:00
|
|
|
// there can't be any references before the definition.
|
2015-10-13 20:26:00 +02:00
|
|
|
if (isAllocLikeFn(&*BBI, TLI))
|
|
|
|
DeadStackObjects.remove(&*BBI);
|
2012-08-08 04:17:32 +02:00
|
|
|
|
2010-11-30 20:48:15 +01:00
|
|
|
// If this call does not access memory, it can't be loading any of our
|
|
|
|
// pointers.
|
2019-01-07 06:42:51 +01:00
|
|
|
if (AA->doesNotAccessMemory(Call))
|
2007-08-08 19:58:56 +02:00
|
|
|
continue;
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2010-11-30 22:18:46 +01:00
|
|
|
// If the call might load from any of our allocas, then any store above
|
|
|
|
// the call is live.
|
Add "const" in GetUnderlyingObjects. NFC
Summary:
Both the input Value pointer and the returned Value
pointers in GetUnderlyingObjects are now declared as
const.
It turned out that all current (in-tree) uses of
GetUnderlyingObjects were trivial to update, being
satisfied with have those Value pointers declared
as const. Actually, in the past several of the users
had to use const_cast, just because of ValueTracking
not providing a version of GetUnderlyingObjects with
"const" Value pointers. With this patch we get rid
of those const casts.
Reviewers: hfinkel, materi, jkorous
Reviewed By: jkorous
Subscribers: dexonsmith, jkorous, jholewinski, sdardis, eraman, hiraditya, jrtc27, atanasyan, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D61038
llvm-svn: 359072
2019-04-24 08:55:50 +02:00
|
|
|
DeadStackObjects.remove_if([&](const Value *I) {
|
2014-03-01 12:47:00 +01:00
|
|
|
// See if the call site touches the value.
|
2019-01-07 06:42:51 +01:00
|
|
|
return isRefSet(AA->getModRefInfo(
|
|
|
|
Call, I, getPointerSize(I, DL, *TLI, BB.getParent())));
|
2014-03-03 20:28:52 +01:00
|
|
|
});
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2010-11-30 22:18:46 +01:00
|
|
|
// If all of the allocas were clobbered by the call then we're not going
|
|
|
|
// to find anything else to process.
|
2012-10-14 12:21:31 +02:00
|
|
|
if (DeadStackObjects.empty())
|
2012-08-08 04:17:32 +02:00
|
|
|
break;
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2007-07-12 23:41:30 +02:00
|
|
|
continue;
|
2010-11-30 22:18:46 +01:00
|
|
|
}
|
2011-07-27 03:08:30 +02:00
|
|
|
|
2016-07-07 22:51:42 +02:00
|
|
|
// We can remove the dead stores, irrespective of the fence and its ordering
|
|
|
|
// (release/acquire/seq_cst). Fences only constrain the ordering of
|
|
|
|
// already visible stores, it does not make a store visible to other
|
|
|
|
// threads. So, skipping over a fence does not change a store from being
|
|
|
|
// dead.
|
|
|
|
if (isa<FenceInst>(*BBI))
|
|
|
|
continue;
|
|
|
|
|
2015-06-17 09:18:54 +02:00
|
|
|
MemoryLocation LoadedLoc;
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2010-11-30 22:18:46 +01:00
|
|
|
// If we encounter a use of the pointer, it is no longer considered dead
|
|
|
|
if (LoadInst *L = dyn_cast<LoadInst>(BBI)) {
|
2011-08-18 00:22:24 +02:00
|
|
|
if (!L->isUnordered()) // Be conservative with atomic/volatile load
|
|
|
|
break;
|
2015-06-04 04:03:15 +02:00
|
|
|
LoadedLoc = MemoryLocation::get(L);
|
2010-11-30 22:18:46 +01:00
|
|
|
} else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) {
|
2015-06-04 04:03:15 +02:00
|
|
|
LoadedLoc = MemoryLocation::get(V);
|
2011-09-06 20:14:09 +02:00
|
|
|
} else if (!BBI->mayReadFromMemory()) {
|
|
|
|
// Instruction doesn't read memory. Note that stores that weren't removed
|
|
|
|
// above will hit this case.
|
2008-11-28 01:27:14 +01:00
|
|
|
continue;
|
2011-07-27 03:08:30 +02:00
|
|
|
} else {
|
|
|
|
// Unknown inst; assume it clobbers everything.
|
|
|
|
break;
|
2007-07-12 23:41:30 +02:00
|
|
|
}
|
2008-10-01 17:25:41 +02:00
|
|
|
|
2010-11-30 22:32:12 +01:00
|
|
|
// Remove any allocas from the DeadPointer set that are loaded, as this
|
|
|
|
// makes any stores above the access live.
|
llvm: Add support for "-fno-delete-null-pointer-checks"
Summary:
Support for this option is needed for building Linux kernel.
This is a very frequently requested feature by kernel developers.
More details : https://lkml.org/lkml/2018/4/4/601
GCC option description for -fdelete-null-pointer-checks:
This Assume that programs cannot safely dereference null pointers,
and that no code or data element resides at address zero.
-fno-delete-null-pointer-checks is the inverse of this implying that
null pointer dereferencing is not undefined.
This feature is implemented in LLVM IR in this CL as the function attribute
"null-pointer-is-valid"="true" in IR (Under review at D47894).
The CL updates several passes that assumed null pointer dereferencing is
undefined to not optimize when the "null-pointer-is-valid"="true"
attribute is present.
Reviewers: t.p.northover, efriedma, jyknight, chandlerc, rnk, srhines, void, george.burgess.iv
Reviewed By: efriedma, george.burgess.iv
Subscribers: eraman, haicheng, george.burgess.iv, drinkcat, theraven, reames, sanjoy, xbolva00, llvm-commits
Differential Revision: https://reviews.llvm.org/D47895
llvm-svn: 336613
2018-07-10 00:27:23 +02:00
|
|
|
removeAccessedObjects(LoadedLoc, DeadStackObjects, DL, AA, TLI, BB.getParent());
|
2008-10-01 17:25:41 +02:00
|
|
|
|
2010-11-30 22:32:12 +01:00
|
|
|
// If all of the allocas were clobbered by the access then we're not going
|
|
|
|
// to find anything else to process.
|
|
|
|
if (DeadStackObjects.empty())
|
|
|
|
break;
|
2007-07-12 23:41:30 +02:00
|
|
|
}
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2007-07-12 23:41:30 +02:00
|
|
|
return MadeChange;
|
|
|
|
}
|
|
|
|
|
2016-07-22 20:27:24 +02:00
|
|
|
/// Try to trim the memory intrinsic \p EarlierWrite so that it stops writing
/// the bytes covered by a later write.
///
/// \p EarlierOffset / \p EarlierSize describe the earlier write and are updated
/// in place when the intrinsic is shortened. \p LaterOffset / \p LaterSize
/// describe the overwriting range. When \p IsOverwriteEnd is true the tail of
/// the earlier write is dropped (only its length changes); otherwise its head
/// is dropped (the destination is advanced through a new GEP and the length is
/// reduced).
///
/// \returns true if the intrinsic was modified, false if the split point fails
/// the alignment heuristic or would break an atomic intrinsic's element size.
static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
                         int64_t &EarlierSize, int64_t LaterOffset,
                         int64_t LaterSize, bool IsOverwriteEnd) {
  // TODO: base the heuristic on the target vector size: if the earlier store
  // was too small to be lowered to vector writes anyway, shortening it is
  // likely a good idea. Conversely, power-of-2 sized writes are probably a bad
  // candidate, since any store/memset/memcpy is likely to use vector
  // instructions and trimming it to a non-vector size is likely to be slower.
  auto *MemIntr = cast<AnyMemIntrinsic>(EarlierWrite);
  const unsigned DestAlign = MemIntr->getDestAlignment();

  // Offset at which the earlier write gets cut: the start of the later write
  // when trimming the tail, the end of the later write when trimming the head.
  const int64_t CutPoint =
      IsOverwriteEnd ? LaterOffset : int64_t(LaterOffset + LaterSize);

  // Only cut at a power of two that is at least the destination alignment, or
  // at a multiple of a known (non-zero) destination alignment.
  const bool CutIsLargePow2 = isPowerOf2_64(CutPoint) && DestAlign <= CutPoint;
  const bool CutIsAligned = (DestAlign != 0) && CutPoint % DestAlign == 0;
  if (!CutIsLargePow2 && !CutIsAligned)
    return false;

  // Distance from the start of the earlier write to the cut.
  const int64_t BytesBeforeCut = CutPoint - EarlierOffset;
  const int64_t NewLength =
      IsOverwriteEnd ? BytesBeforeCut : EarlierSize - BytesBeforeCut;

  // An atomic memory intrinsic may only be shortened to a length that is still
  // a whole number of elements.
  if (auto *AtomicIntr = dyn_cast<AtomicMemIntrinsic>(EarlierWrite)) {
    const uint32_t ElementSize = AtomicIntr->getElementSizeInBytes();
    if (NewLength % ElementSize != 0)
      return false;
  }

  LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
                    << (IsOverwriteEnd ? "END" : "BEGIN") << ": "
                    << *EarlierWrite << "\n KILLER (offset " << CutPoint
                    << ", " << EarlierSize << ")\n");

  // Install the reduced length, reusing the integer type of the old length.
  Type *LengthTy = MemIntr->getLength()->getType();
  MemIntr->setLength(ConstantInt::get(LengthTy, NewLength));
  EarlierSize = NewLength;

  if (IsOverwriteEnd)
    return true;

  // The head was dropped: advance the destination pointer past the removed
  // bytes with an inbounds GEP inserted right before the intrinsic.
  Value *GEPIndices[1] = {ConstantInt::get(LengthTy, BytesBeforeCut)};
  Value *OldDest = MemIntr->getRawDest();
  GetElementPtrInst *AdvancedDest = GetElementPtrInst::CreateInBounds(
      OldDest->getType()->getPointerElementType(), OldDest, GEPIndices, "",
      EarlierWrite);
  AdvancedDest->setDebugLoc(MemIntr->getDebugLoc());
  MemIntr->setDest(AdvancedDest);
  EarlierOffset = EarlierOffset + BytesBeforeCut;
  return true;
}
|
|
|
|
|
|
|
|
/// Try to trim the tail of \p EarlierWrite using the last interval recorded in
/// \p IntervalMap (entries map an interval's end offset to its start offset).
///
/// The attempt is made only when that interval starts strictly inside the
/// earlier write and reaches at least to its end. On success the interval is
/// erased from \p IntervalMap and \p EarlierStart / \p EarlierSize are updated
/// by tryToShorten.
///
/// \returns true if the earlier write was shortened.
static bool tryToShortenEnd(Instruction *EarlierWrite,
                            OverlapIntervalsTy &IntervalMap,
                            int64_t &EarlierStart, int64_t &EarlierSize) {
  if (IntervalMap.empty())
    return false;
  if (!isShortenableAtTheEnd(EarlierWrite))
    return false;

  // The entry with the largest end offset is the only tail candidate.
  OverlapIntervalsTy::iterator LastInterval = std::prev(IntervalMap.end());
  const int64_t KillerStart = LastInterval->second;
  const int64_t KillerSize = LastInterval->first - KillerStart;

  const bool StartsInsideEarlier = KillerStart > EarlierStart &&
                                   KillerStart < EarlierStart + EarlierSize;
  if (!StartsInsideEarlier)
    return false;

  const bool ReachesEarlierEnd =
      KillerStart + KillerSize >= EarlierStart + EarlierSize;
  if (!ReachesEarlierEnd)
    return false;

  if (!tryToShorten(EarlierWrite, EarlierStart, EarlierSize, KillerStart,
                    KillerSize, /*IsOverwriteEnd=*/true))
    return false;

  // The interval has been consumed by the shortening.
  IntervalMap.erase(LastInterval);
  return true;
}
|
|
|
|
|
|
|
|
/// Try to trim the head of \p EarlierWrite using the first interval recorded
/// in \p IntervalMap (entries map an interval's end offset to its start
/// offset).
///
/// The attempt is made only when that interval starts at or before the earlier
/// write and ends after its start. On success the interval is erased from
/// \p IntervalMap and \p EarlierStart / \p EarlierSize are updated by
/// tryToShorten.
///
/// \returns true if the earlier write was shortened.
static bool tryToShortenBegin(Instruction *EarlierWrite,
                              OverlapIntervalsTy &IntervalMap,
                              int64_t &EarlierStart, int64_t &EarlierSize) {
  if (IntervalMap.empty())
    return false;
  if (!isShortenableAtTheBeginning(EarlierWrite))
    return false;

  // The entry with the smallest end offset is the only head candidate.
  OverlapIntervalsTy::iterator FirstInterval = IntervalMap.begin();
  int64_t LaterStart = FirstInterval->second;
  int64_t LaterSize = FirstInterval->first - LaterStart;

  const bool CoversEarlierStart =
      LaterStart <= EarlierStart && LaterStart + LaterSize > EarlierStart;
  if (!CoversEarlierStart)
    return false;

  // An interval covering the whole earlier write is a complete overwrite and
  // is expected to have been dealt with before reaching this point.
  assert(LaterStart + LaterSize < EarlierStart + EarlierSize &&
         "Should have been handled as OW_Complete");

  if (!tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
                    LaterSize, /*IsOverwriteEnd=*/false))
    return false;

  // The interval has been consumed by the shortening.
  IntervalMap.erase(FirstInterval);
  return true;
}
|
|
|
|
|
|
|
|
/// Walk every earlier write recorded in \p IOL and try to shorten it, first at
/// its end and then at its beginning, using the partial-overwrite intervals
/// collected for it.
///
/// \p AA is not used by this function; it is kept so the signature stays the
/// same for callers. \p DL is used to compute the constant offset of each
/// write from its base pointer.
///
/// \returns true if at least one write was shortened.
static bool removePartiallyOverlappedStores(AliasAnalysis *AA,
                                            const DataLayout &DL,
                                            InstOverlapIntervalsTy &IOL) {
  bool Changed = false;
  // Iterate by reference: taking each entry by value would deep-copy its
  // interval map on every iteration, and the erasures performed by the
  // shortening helpers would only affect that temporary copy instead of the
  // entry stored in IOL. Nothing called below inserts into or removes entries
  // from IOL itself, so the references stay valid for the whole loop.
  for (auto &OI : IOL) {
    Instruction *EarlierWrite = OI.first;
    MemoryLocation Loc = getLocForWrite(EarlierWrite);
    assert(isRemovable(EarlierWrite) && "Expect only removable instruction");

    // Express the earlier write as [EarlierStart, EarlierStart + EarlierSize)
    // relative to its base pointer; EarlierStart is filled in by
    // GetPointerBaseWithConstantOffset (the returned base itself is unused).
    const Value *Ptr = Loc.Ptr->stripPointerCasts();
    int64_t EarlierStart = 0;
    int64_t EarlierSize = int64_t(Loc.Size.getValue());
    GetPointerBaseWithConstantOffset(Ptr, EarlierStart, DL);

    OverlapIntervalsTy &IntervalMap = OI.second;
    Changed |=
        tryToShortenEnd(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
    // Shortening the end may have consumed the only recorded interval.
    if (IntervalMap.empty())
      continue;
    Changed |=
        tryToShortenBegin(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
  }
  return Changed;
}
|
|
|
|
|
2016-07-08 18:48:40 +02:00
|
|
|
static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
|
|
|
|
AliasAnalysis *AA, MemoryDependenceResults *MD,
|
|
|
|
const DataLayout &DL,
|
2016-07-22 20:27:24 +02:00
|
|
|
const TargetLibraryInfo *TLI,
|
2016-08-12 03:09:53 +02:00
|
|
|
InstOverlapIntervalsTy &IOL,
|
2020-02-21 23:40:22 +01:00
|
|
|
MapVector<Instruction *, bool> &ThrowableInst,
|
|
|
|
DominatorTree *DT) {
|
2016-07-08 18:48:40 +02:00
|
|
|
// Must be a store instruction.
|
|
|
|
StoreInst *SI = dyn_cast<StoreInst>(Inst);
|
|
|
|
if (!SI)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// If we're storing the same value back to a pointer that we just loaded from,
|
|
|
|
// then the store can be removed.
|
|
|
|
if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) {
|
|
|
|
if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
|
2020-02-21 23:40:22 +01:00
|
|
|
isRemovable(SI) &&
|
|
|
|
memoryIsNotModifiedBetween(DepLoad, SI, AA, DL, DT)) {
|
2016-07-08 18:48:40 +02:00
|
|
|
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG(
|
|
|
|
dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: "
|
|
|
|
<< *DepLoad << "\n STORE: " << *SI << '\n');
|
2016-07-08 18:48:40 +02:00
|
|
|
|
[IR] Lazily number instructions for local dominance queries
Essentially, fold OrderedBasicBlock into BasicBlock, and make it
auto-invalidate the instruction ordering when new instructions are
added. Notably, we don't need to invalidate it when removing
instructions, which is helpful when a pass mostly delete dead
instructions rather than transforming them.
The downside is that Instruction grows from 56 bytes to 64 bytes. The
resulting LLVM code is substantially simpler and automatically handles
invalidation, which makes me think that this is the right speed and size
tradeoff.
The important change is in SymbolTableTraitsImpl.h, where the numbering
is invalidated. Everything else should be straightforward.
We probably want to implement a fancier re-numbering scheme so that
local updates don't invalidate the ordering, but I plan for that to be
future work, maybe for someone else.
Reviewed By: lattner, vsk, fhahn, dexonsmith
Differential Revision: https://reviews.llvm.org/D51664
2020-02-18 23:33:54 +01:00
|
|
|
deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, ThrowableInst);
|
2016-07-08 18:48:40 +02:00
|
|
|
++NumRedundantStores;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Remove null stores into the calloc'ed objects
|
|
|
|
Constant *StoredConstant = dyn_cast<Constant>(SI->getValueOperand());
|
|
|
|
if (StoredConstant && StoredConstant->isNullValue() && isRemovable(SI)) {
|
|
|
|
Instruction *UnderlyingPointer =
|
|
|
|
dyn_cast<Instruction>(GetUnderlyingObject(SI->getPointerOperand(), DL));
|
|
|
|
|
|
|
|
if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) &&
|
2020-02-21 23:40:22 +01:00
|
|
|
memoryIsNotModifiedBetween(UnderlyingPointer, SI, AA, DL, DT)) {
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG(
|
2016-07-08 18:48:40 +02:00
|
|
|
dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: "
|
|
|
|
<< *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n');
|
|
|
|
|
[IR] Lazily number instructions for local dominance queries
Essentially, fold OrderedBasicBlock into BasicBlock, and make it
auto-invalidate the instruction ordering when new instructions are
added. Notably, we don't need to invalidate it when removing
instructions, which is helpful when a pass mostly delete dead
instructions rather than transforming them.
The downside is that Instruction grows from 56 bytes to 64 bytes. The
resulting LLVM code is substantially simpler and automatically handles
invalidation, which makes me think that this is the right speed and size
tradeoff.
The important change is in SymbolTableTraitsImpl.h, where the numbering
is invalidated. Everything else should be straightforward.
We probably want to implement a fancier re-numbering scheme so that
local updates don't invalidate the ordering, but I plan for that to be
future work, maybe for someone else.
Reviewed By: lattner, vsk, fhahn, dexonsmith
Differential Revision: https://reviews.llvm.org/D51664
2020-02-18 23:33:54 +01:00
|
|
|
deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, ThrowableInst);
|
2016-07-08 18:48:40 +02:00
|
|
|
++NumRedundantStores;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2020-06-15 16:37:33 +02:00
|
|
|
/// Fold the constant written by \p Later into the constant written by
/// \p Earlier.
///
/// The merge is attempted only when both stores are non-null, both store a
/// ConstantInt whose type size equals its store size (no padding), and
/// memoryIsNotModifiedBetween holds for the two stores. The later store is
/// expected to be narrower than, and fully contained in, the earlier one.
/// \p InstWriteOffset and \p DepWriteOffset are the byte offsets of the later
/// and earlier store respectively; their difference locates the later value
/// inside the earlier one.
///
/// TODO: Deal with other constant types (vectors, etc), and probably some mem
/// intrinsics (if needed).
///
/// \returns the merged constant, typed like the earlier stored value, or
/// nullptr if the stores cannot be merged.
static Constant *
tryToMergePartialOverlappingStores(StoreInst *Earlier, StoreInst *Later,
                                   int64_t InstWriteOffset,
                                   int64_t DepWriteOffset, const DataLayout &DL,
                                   AliasAnalysis *AA, DominatorTree *DT) {
  if (!Earlier || !Later)
    return nullptr;

  // Both stores must write integer constants...
  auto *EarlierConst = dyn_cast<ConstantInt>(Earlier->getValueOperand());
  auto *LaterConst = dyn_cast<ConstantInt>(Later->getValueOperand());
  if (!EarlierConst || !LaterConst)
    return nullptr;

  // ...and neither of them may need padding.
  if (!DL.typeSizeEqualsStoreSize(Earlier->getValueOperand()->getType()) ||
      !DL.typeSizeEqualsStoreSize(Later->getValueOperand()->getType()))
    return nullptr;

  // Run the memory query last, after the cheap structural checks.
  if (!memoryIsNotModifiedBetween(Earlier, Later, AA, DL, DT))
    return nullptr;

  APInt EarlierValue = EarlierConst->getValue();
  APInt LaterValue = LaterConst->getValue();
  const unsigned WideBits = EarlierValue.getBitWidth();
  const unsigned LaterBits = LaterValue.getBitWidth();
  assert(EarlierValue.getBitWidth() > LaterValue.getBitWidth());
  LaterValue = LaterValue.zext(WideBits);

  // Bit position of the narrower store inside the wider one. On big-endian
  // targets the byte offset counts from the most significant end.
  unsigned BitOffsetDiff = (InstWriteOffset - DepWriteOffset) * 8;
  unsigned LShiftAmount;
  if (DL.isBigEndian())
    LShiftAmount = WideBits - BitOffsetDiff - LaterBits;
  else
    LShiftAmount = BitOffsetDiff;

  // Clear the bits being replaced, then OR in the later value shifted into
  // place.
  APInt Mask =
      APInt::getBitsSet(WideBits, LShiftAmount, LShiftAmount + LaterBits);
  APInt Cleared = EarlierValue & ~Mask;
  APInt Merged = Cleared | (LaterValue << LShiftAmount);

  LLVM_DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *Earlier
                    << "\n Later: " << *Later
                    << "\n Merged Value: " << Merged << '\n');
  return ConstantInt::get(Earlier->getValueOperand()->getType(), Merged);
}
|
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
|
|
|
|
MemoryDependenceResults *MD, DominatorTree *DT,
|
|
|
|
const TargetLibraryInfo *TLI) {
|
|
|
|
const DataLayout &DL = BB.getModule()->getDataLayout();
|
|
|
|
bool MadeChange = false;
|
2010-11-30 22:32:12 +01:00
|
|
|
|
2020-01-03 15:13:55 +01:00
|
|
|
MapVector<Instruction *, bool> ThrowableInst;
|
2016-08-12 03:09:53 +02:00
|
|
|
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
// A map of interval maps representing partially-overwritten value parts.
|
|
|
|
InstOverlapIntervalsTy IOL;
|
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
// Do a top-down walk on the BB.
|
|
|
|
for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
|
|
|
|
// Handle 'free' calls specially.
|
2016-07-06 21:48:52 +02:00
|
|
|
if (CallInst *F = isFreeCall(&*BBI, TLI)) {
|
[IR] Lazily number instructions for local dominance queries
Essentially, fold OrderedBasicBlock into BasicBlock, and make it
auto-invalidate the instruction ordering when new instructions are
added. Notably, we don't need to invalidate it when removing
instructions, which is helpful when a pass mostly delete dead
instructions rather than transforming them.
The downside is that Instruction grows from 56 bytes to 64 bytes. The
resulting LLVM code is substantially simpler and automatically handles
invalidation, which makes me think that this is the right speed and size
tradeoff.
The important change is in SymbolTableTraitsImpl.h, where the numbering
is invalidated. Everything else should be straightforward.
We probably want to implement a fancier re-numbering scheme so that
local updates don't invalidate the ordering, but I plan for that to be
future work, maybe for someone else.
Reviewed By: lattner, vsk, fhahn, dexonsmith
Differential Revision: https://reviews.llvm.org/D51664
2020-02-18 23:33:54 +01:00
|
|
|
MadeChange |= handleFree(F, AA, MD, DT, TLI, IOL, ThrowableInst);
|
2016-07-06 21:48:52 +02:00
|
|
|
// Increment BBI after handleFree has potentially deleted instructions.
|
|
|
|
// This ensures we maintain a valid iterator.
|
|
|
|
++BBI;
|
2016-05-17 23:38:13 +02:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2016-07-06 21:48:52 +02:00
|
|
|
Instruction *Inst = &*BBI++;
|
|
|
|
|
2016-08-12 03:09:53 +02:00
|
|
|
if (Inst->mayThrow()) {
|
2020-01-03 15:13:55 +01:00
|
|
|
ThrowableInst[Inst] = true;
|
2016-08-12 03:09:53 +02:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2016-07-08 18:48:40 +02:00
|
|
|
// Check to see if Inst writes to memory. If not, continue.
|
2018-01-21 02:44:33 +01:00
|
|
|
if (!hasAnalyzableMemoryWrite(Inst, *TLI))
|
2016-05-17 23:38:13 +02:00
|
|
|
continue;
|
|
|
|
|
2016-07-08 18:48:40 +02:00
|
|
|
// eliminateNoopStore will update the iterator, if necessary.
|
[IR] Lazily number instructions for local dominance queries
Essentially, fold OrderedBasicBlock into BasicBlock, and make it
auto-invalidate the instruction ordering when new instructions are
added. Notably, we don't need to invalidate it when removing
instructions, which is helpful when a pass mostly delete dead
instructions rather than transforming them.
The downside is that Instruction grows from 56 bytes to 64 bytes. The
resulting LLVM code is substantially simpler and automatically handles
invalidation, which makes me think that this is the right speed and size
tradeoff.
The important change is in SymbolTableTraitsImpl.h, where the numbering
is invalidated. Everything else should be straightforward.
We probably want to implement a fancier re-numbering scheme so that
local updates don't invalidate the ordering, but I plan for that to be
future work, maybe for someone else.
Reviewed By: lattner, vsk, fhahn, dexonsmith
Differential Revision: https://reviews.llvm.org/D51664
2020-02-18 23:33:54 +01:00
|
|
|
if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL,
|
2020-02-21 23:40:22 +01:00
|
|
|
ThrowableInst, DT)) {
|
2016-07-08 18:48:40 +02:00
|
|
|
MadeChange = true;
|
|
|
|
continue;
|
2016-05-17 23:38:13 +02:00
|
|
|
}
|
|
|
|
|
2016-07-08 18:48:40 +02:00
|
|
|
// If we find something that writes memory, get its memory dependence.
|
[IR] Lazily number instructions for local dominance queries
Essentially, fold OrderedBasicBlock into BasicBlock, and make it
auto-invalidate the instruction ordering when new instructions are
added. Notably, we don't need to invalidate it when removing
instructions, which is helpful when a pass mostly delete dead
instructions rather than transforming them.
The downside is that Instruction grows from 56 bytes to 64 bytes. The
resulting LLVM code is substantially simpler and automatically handles
invalidation, which makes me think that this is the right speed and size
tradeoff.
The important change is in SymbolTableTraitsImpl.h, where the numbering
is invalidated. Everything else should be straightforward.
We probably want to implement a fancier re-numbering scheme so that
local updates don't invalidate the ordering, but I plan for that to be
future work, maybe for someone else.
Reviewed By: lattner, vsk, fhahn, dexonsmith
Differential Revision: https://reviews.llvm.org/D51664
2020-02-18 23:33:54 +01:00
|
|
|
MemDepResult InstDep = MD->getDependency(Inst);
|
2016-05-17 23:38:13 +02:00
|
|
|
|
|
|
|
// Ignore any store where we can't find a local dependence.
|
|
|
|
// FIXME: cross-block DSE would be fun. :)
|
|
|
|
if (!InstDep.isDef() && !InstDep.isClobber())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// Figure out what location is being stored to.
|
2018-01-21 03:10:54 +01:00
|
|
|
MemoryLocation Loc = getLocForWrite(Inst);
|
2016-05-17 23:38:13 +02:00
|
|
|
|
|
|
|
// If we didn't get a useful location, fail.
|
|
|
|
if (!Loc.Ptr)
|
|
|
|
continue;
|
|
|
|
|
limit the number of instructions per block examined by dead store elimination
Summary: Dead store elimination gets very expensive when large numbers of instructions need to be analyzed. This patch limits the number of instructions analyzed per store to the value of the memdep-block-scan-limit parameter (which defaults to 100). This resulted in no observed difference in performance of the generated code, and no change in the statistics for the dead store elimination pass, but improved compilation time on some files by more than an order of magnitude.
Reviewers: dexonsmith, bruno, george.burgess.iv, dberlin, reames, davidxl
Subscribers: davide, chandlerc, dberlin, davidxl, eraman, tejohnson, mbodart, llvm-commits
Differential Revision: https://reviews.llvm.org/D15537
llvm-svn: 279833
2016-08-26 18:34:27 +02:00
|
|
|
// Loop until we find a store we can eliminate or a load that
|
|
|
|
// invalidates the analysis. Without an upper bound on the number of
|
|
|
|
// instructions examined, this analysis can become very time-consuming.
|
|
|
|
// However, the potential gain diminishes as we process more instructions
|
|
|
|
// without eliminating any of them. Therefore, we limit the number of
|
|
|
|
// instructions we look at.
|
|
|
|
auto Limit = MD->getDefaultBlockScanLimit();
|
2016-05-17 23:38:13 +02:00
|
|
|
while (InstDep.isDef() || InstDep.isClobber()) {
|
|
|
|
// Get the memory clobbered by the instruction we depend on. MemDep will
|
|
|
|
// skip any instructions that 'Loc' clearly doesn't interact with. If we
|
|
|
|
// end up depending on a may- or must-aliased load, then we can't optimize
|
2016-06-15 23:41:22 +02:00
|
|
|
// away the store and we bail out. However, if we depend on something
|
2016-05-17 23:38:13 +02:00
|
|
|
// that overwrites the memory location we *can* potentially optimize it.
|
|
|
|
//
|
|
|
|
// Find out what memory location the dependent instruction stores.
|
|
|
|
Instruction *DepWrite = InstDep.getInst();
|
2018-01-21 03:10:54 +01:00
|
|
|
if (!hasAnalyzableMemoryWrite(DepWrite, *TLI))
|
|
|
|
break;
|
|
|
|
MemoryLocation DepLoc = getLocForWrite(DepWrite);
|
2016-05-17 23:38:13 +02:00
|
|
|
// If we didn't get a useful location, or if it isn't a size, bail out.
|
|
|
|
if (!DepLoc.Ptr)
|
|
|
|
break;
|
|
|
|
|
2020-01-03 15:13:55 +01:00
|
|
|
// Find the last throwable instruction not removed by call to
|
|
|
|
// deleteDeadInstruction.
|
|
|
|
Instruction *LastThrowing = nullptr;
|
|
|
|
if (!ThrowableInst.empty())
|
|
|
|
LastThrowing = ThrowableInst.back().first;
|
|
|
|
|
2016-08-12 03:09:53 +02:00
|
|
|
// Make sure we don't look past a call which might throw. This is an
|
|
|
|
// issue because MemoryDependenceAnalysis works in the wrong direction:
|
|
|
|
// it finds instructions which dominate the current instruction, rather than
|
|
|
|
// instructions which are post-dominated by the current instruction.
|
|
|
|
//
|
|
|
|
// If the underlying object is a non-escaping memory allocation, any store
|
|
|
|
// to it is dead along the unwind edge. Otherwise, we need to preserve
|
|
|
|
// the store.
|
[IR] Lazily number instructions for local dominance queries
Essentially, fold OrderedBasicBlock into BasicBlock, and make it
auto-invalidate the instruction ordering when new instructions are
added. Notably, we don't need to invalidate it when removing
instructions, which is helpful when a pass mostly deletes dead
instructions rather than transforming them.
The downside is that Instruction grows from 56 bytes to 64 bytes. The
resulting LLVM code is substantially simpler and automatically handles
invalidation, which makes me think that this is the right speed and size
tradeoff.
The important change is in SymbolTableTraitsImpl.h, where the numbering
is invalidated. Everything else should be straightforward.
We probably want to implement a fancier re-numbering scheme so that
local updates don't invalidate the ordering, but I plan for that to be
future work, maybe for someone else.
Reviewed By: lattner, vsk, fhahn, dexonsmith
Differential Revision: https://reviews.llvm.org/D51664
2020-02-18 23:33:54 +01:00
|
|
|
if (LastThrowing && DepWrite->comesBefore(LastThrowing)) {
|
2016-08-12 03:09:53 +02:00
|
|
|
const Value* Underlying = GetUnderlyingObject(DepLoc.Ptr, DL);
|
|
|
|
bool IsStoreDeadOnUnwind = isa<AllocaInst>(Underlying);
|
|
|
|
if (!IsStoreDeadOnUnwind) {
|
|
|
|
// We're looking for a call to an allocation function
|
|
|
|
// where the allocation doesn't escape before the last
|
|
|
|
// throwing instruction; PointerMayBeCaptured
|
|
|
|
// reasonably fast approximation.
|
|
|
|
IsStoreDeadOnUnwind = isAllocLikeFn(Underlying, TLI) &&
|
|
|
|
!PointerMayBeCaptured(Underlying, false, true);
|
|
|
|
}
|
|
|
|
if (!IsStoreDeadOnUnwind)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
// If we find a write that is a) removable (i.e., non-volatile), b) is
|
|
|
|
// completely obliterated by the store to 'Loc', and c) which we know that
|
|
|
|
// 'Inst' doesn't load from, then we can remove it.
|
2017-09-26 15:54:28 +02:00
|
|
|
// Also try to merge two stores if a later one only touches memory written
|
|
|
|
// to by the earlier one.
|
2016-05-17 23:38:13 +02:00
|
|
|
if (isRemovable(DepWrite) &&
|
|
|
|
!isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
|
|
|
|
int64_t InstWriteOffset, DepWriteOffset;
|
2018-05-03 13:03:53 +02:00
|
|
|
OverwriteResult OR = isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset,
|
llvm: Add support for "-fno-delete-null-pointer-checks"
Summary:
Support for this option is needed for building Linux kernel.
This is a very frequently requested feature by kernel developers.
More details : https://lkml.org/lkml/2018/4/4/601
GCC option description for -fdelete-null-pointer-checks:
Assume that programs cannot safely dereference null pointers,
and that no code or data element resides at address zero.
-fno-delete-null-pointer-checks is the inverse of this implying that
null pointer dereferencing is not undefined.
This feature is implemented in LLVM IR in this CL as the function attribute
"null-pointer-is-valid"="true" in IR (Under review at D47894).
The CL updates several passes that assumed null pointer dereferencing is
undefined to not optimize when the "null-pointer-is-valid"="true"
attribute is present.
Reviewers: t.p.northover, efriedma, jyknight, chandlerc, rnk, srhines, void, george.burgess.iv
Reviewed By: efriedma, george.burgess.iv
Subscribers: eraman, haicheng, george.burgess.iv, drinkcat, theraven, reames, sanjoy, xbolva00, llvm-commits
Differential Revision: https://reviews.llvm.org/D47895
llvm-svn: 336613
2018-07-10 00:27:23 +02:00
|
|
|
InstWriteOffset, DepWrite, IOL, *AA,
|
|
|
|
BB.getParent());
|
2017-03-29 16:42:27 +02:00
|
|
|
if (OR == OW_Complete) {
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite
|
|
|
|
<< "\n KILLER: " << *Inst << '\n');
|
2016-07-18 17:51:31 +02:00
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
// Delete the store and now-dead instructions that feed it.
|
[IR] Lazily number instructions for local dominance queries
Essentially, fold OrderedBasicBlock into BasicBlock, and make it
auto-invalidate the instruction ordering when new instructions are
added. Notably, we don't need to invalidate it when removing
instructions, which is helpful when a pass mostly deletes dead
instructions rather than transforming them.
The downside is that Instruction grows from 56 bytes to 64 bytes. The
resulting LLVM code is substantially simpler and automatically handles
invalidation, which makes me think that this is the right speed and size
tradeoff.
The important change is in SymbolTableTraitsImpl.h, where the numbering
is invalidated. Everything else should be straightforward.
We probably want to implement a fancier re-numbering scheme so that
local updates don't invalidate the ordering, but I plan for that to be
future work, maybe for someone else.
Reviewed By: lattner, vsk, fhahn, dexonsmith
Differential Revision: https://reviews.llvm.org/D51664
2020-02-18 23:33:54 +01:00
|
|
|
deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL,
|
2020-01-03 15:13:55 +01:00
|
|
|
ThrowableInst);
|
2016-05-17 23:38:13 +02:00
|
|
|
++NumFastStores;
|
|
|
|
MadeChange = true;
|
|
|
|
|
2016-07-06 21:48:52 +02:00
|
|
|
// We erased DepWrite; start over.
|
[IR] Lazily number instructions for local dominance queries
Essentially, fold OrderedBasicBlock into BasicBlock, and make it
auto-invalidate the instruction ordering when new instructions are
added. Notably, we don't need to invalidate it when removing
instructions, which is helpful when a pass mostly deletes dead
instructions rather than transforming them.
The downside is that Instruction grows from 56 bytes to 64 bytes. The
resulting LLVM code is substantially simpler and automatically handles
invalidation, which makes me think that this is the right speed and size
tradeoff.
The important change is in SymbolTableTraitsImpl.h, where the numbering
is invalidated. Everything else should be straightforward.
We probably want to implement a fancier re-numbering scheme so that
local updates don't invalidate the ordering, but I plan for that to be
future work, maybe for someone else.
Reviewed By: lattner, vsk, fhahn, dexonsmith
Differential Revision: https://reviews.llvm.org/D51664
2020-02-18 23:33:54 +01:00
|
|
|
InstDep = MD->getDependency(Inst);
|
2016-07-06 21:48:52 +02:00
|
|
|
continue;
|
2017-03-29 16:42:27 +02:00
|
|
|
} else if ((OR == OW_End && isShortenableAtTheEnd(DepWrite)) ||
|
|
|
|
((OR == OW_Begin &&
|
2016-05-17 23:38:13 +02:00
|
|
|
isShortenableAtTheBeginning(DepWrite)))) {
|
2016-07-22 20:27:24 +02:00
|
|
|
assert(!EnablePartialOverwriteTracking && "Do not expect to perform "
|
|
|
|
"when partial-overwrite "
|
|
|
|
"tracking is enabled");
|
2018-10-09 05:18:56 +02:00
|
|
|
// The overwrite result is known, so these must be known, too.
|
|
|
|
int64_t EarlierSize = DepLoc.Size.getValue();
|
|
|
|
int64_t LaterSize = Loc.Size.getValue();
|
2017-03-29 16:42:27 +02:00
|
|
|
bool IsOverwriteEnd = (OR == OW_End);
|
2016-07-27 19:25:20 +02:00
|
|
|
MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize,
|
2016-07-22 20:27:24 +02:00
|
|
|
InstWriteOffset, LaterSize, IsOverwriteEnd);
|
2017-09-26 15:54:28 +02:00
|
|
|
} else if (EnablePartialStoreMerging &&
|
|
|
|
OR == OW_PartialEarlierWithFullLater) {
|
|
|
|
auto *Earlier = dyn_cast<StoreInst>(DepWrite);
|
|
|
|
auto *Later = dyn_cast<StoreInst>(Inst);
|
2020-06-15 16:37:33 +02:00
|
|
|
if (Constant *C = tryToMergePartialOverlappingStores(
|
|
|
|
Earlier, Later, InstWriteOffset, DepWriteOffset, DL, AA,
|
|
|
|
DT)) {
|
2017-09-26 15:54:28 +02:00
|
|
|
auto *SI = new StoreInst(
|
2020-06-15 16:37:33 +02:00
|
|
|
C, Earlier->getPointerOperand(), false, Earlier->getAlign(),
|
2020-05-14 23:48:10 +02:00
|
|
|
Earlier->getOrdering(), Earlier->getSyncScopeID(), DepWrite);
|
2017-09-26 15:54:28 +02:00
|
|
|
|
|
|
|
unsigned MDToKeep[] = {LLVMContext::MD_dbg, LLVMContext::MD_tbaa,
|
|
|
|
LLVMContext::MD_alias_scope,
|
|
|
|
LLVMContext::MD_noalias,
|
|
|
|
LLVMContext::MD_nontemporal};
|
|
|
|
SI->copyMetadata(*DepWrite, MDToKeep);
|
|
|
|
++NumModifiedStores;
|
|
|
|
|
|
|
|
// Delete the old stores and now-dead instructions that feed them.
|
[IR] Lazily number instructions for local dominance queries
Essentially, fold OrderedBasicBlock into BasicBlock, and make it
auto-invalidate the instruction ordering when new instructions are
added. Notably, we don't need to invalidate it when removing
instructions, which is helpful when a pass mostly deletes dead
instructions rather than transforming them.
The downside is that Instruction grows from 56 bytes to 64 bytes. The
resulting LLVM code is substantially simpler and automatically handles
invalidation, which makes me think that this is the right speed and size
tradeoff.
The important change is in SymbolTableTraitsImpl.h, where the numbering
is invalidated. Everything else should be straightforward.
We probably want to implement a fancier re-numbering scheme so that
local updates don't invalidate the ordering, but I plan for that to be
future work, maybe for someone else.
Reviewed By: lattner, vsk, fhahn, dexonsmith
Differential Revision: https://reviews.llvm.org/D51664
2020-02-18 23:33:54 +01:00
|
|
|
deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL,
|
2020-01-03 15:13:55 +01:00
|
|
|
ThrowableInst);
|
[IR] Lazily number instructions for local dominance queries
Essentially, fold OrderedBasicBlock into BasicBlock, and make it
auto-invalidate the instruction ordering when new instructions are
added. Notably, we don't need to invalidate it when removing
instructions, which is helpful when a pass mostly deletes dead
instructions rather than transforming them.
The downside is that Instruction grows from 56 bytes to 64 bytes. The
resulting LLVM code is substantially simpler and automatically handles
invalidation, which makes me think that this is the right speed and size
tradeoff.
The important change is in SymbolTableTraitsImpl.h, where the numbering
is invalidated. Everything else should be straightforward.
We probably want to implement a fancier re-numbering scheme so that
local updates don't invalidate the ordering, but I plan for that to be
future work, maybe for someone else.
Reviewed By: lattner, vsk, fhahn, dexonsmith
Differential Revision: https://reviews.llvm.org/D51664
2020-02-18 23:33:54 +01:00
|
|
|
deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL,
|
2020-01-03 15:13:55 +01:00
|
|
|
ThrowableInst);
|
2017-09-26 15:54:28 +02:00
|
|
|
MadeChange = true;
|
|
|
|
|
|
|
|
// We erased DepWrite and Inst (Loc); start over.
|
|
|
|
break;
|
|
|
|
}
|
2016-05-17 23:38:13 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// If this is a may-aliased store that is clobbering the store value, we
|
|
|
|
// can keep searching past it for another must-aliased pointer that stores
|
|
|
|
// to the same location. For example, in:
|
|
|
|
// store -> P
|
|
|
|
// store -> Q
|
|
|
|
// store -> P
|
|
|
|
// we can remove the first store to P even though we don't know if P and Q
|
|
|
|
// alias.
|
|
|
|
if (DepWrite == &BB.front()) break;
|
|
|
|
|
|
|
|
// Can't look past this instruction if it might read 'Loc'.
|
2017-12-05 21:12:23 +01:00
|
|
|
if (isRefSet(AA->getModRefInfo(DepWrite, Loc)))
|
2016-05-17 23:38:13 +02:00
|
|
|
break;
|
|
|
|
|
limit the number of instructions per block examined by dead store elimination
Summary: Dead store elimination gets very expensive when large numbers of instructions need to be analyzed. This patch limits the number of instructions analyzed per store to the value of the memdep-block-scan-limit parameter (which defaults to 100). This resulted in no observed difference in performance of the generated code, and no change in the statistics for the dead store elimination pass, but improved compilation time on some files by more than an order of magnitude.
Reviewers: dexonsmith, bruno, george.burgess.iv, dberlin, reames, davidxl
Subscribers: davide, chandlerc, dberlin, davidxl, eraman, tejohnson, mbodart, llvm-commits
Differential Revision: https://reviews.llvm.org/D15537
llvm-svn: 279833
2016-08-26 18:34:27 +02:00
|
|
|
InstDep = MD->getPointerDependencyFrom(Loc, /*isLoad=*/ false,
|
|
|
|
DepWrite->getIterator(), &BB,
|
|
|
|
/*QueryInst=*/ nullptr, &Limit);
|
2016-05-17 23:38:13 +02:00
|
|
|
}
|
2010-11-30 22:32:12 +01:00
|
|
|
}
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2016-07-22 20:27:24 +02:00
|
|
|
if (EnablePartialOverwriteTracking)
|
|
|
|
MadeChange |= removePartiallyOverlappedStores(AA, DL, IOL);
|
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
// If this block ends in a return, unwind, or unreachable, all allocas are
|
|
|
|
// dead at its end, which means stores to them are also dead.
|
|
|
|
if (BB.getTerminator()->getNumSuccessors() == 0)
|
[IR] Lazily number instructions for local dominance queries
Essentially, fold OrderedBasicBlock into BasicBlock, and make it
auto-invalidate the instruction ordering when new instructions are
added. Notably, we don't need to invalidate it when removing
instructions, which is helpful when a pass mostly deletes dead
instructions rather than transforming them.
The downside is that Instruction grows from 56 bytes to 64 bytes. The
resulting LLVM code is substantially simpler and automatically handles
invalidation, which makes me think that this is the right speed and size
tradeoff.
The important change is in SymbolTableTraitsImpl.h, where the numbering
is invalidated. Everything else should be straightforward.
We probably want to implement a fancier re-numbering scheme so that
local updates don't invalidate the ordering, but I plan for that to be
future work, maybe for someone else.
Reviewed By: lattner, vsk, fhahn, dexonsmith
Differential Revision: https://reviews.llvm.org/D51664
2020-02-18 23:33:54 +01:00
|
|
|
MadeChange |= handleEndBlock(BB, AA, MD, TLI, IOL, ThrowableInst);
|
2016-05-17 23:38:13 +02:00
|
|
|
|
|
|
|
return MadeChange;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Invoke the per-block eliminateDeadStores overload on every basic block of
/// \p F that is reachable from the entry block.
///
/// \returns true if any of the per-block runs reported a change.
static bool eliminateDeadStores(Function &F, AliasAnalysis *AA,
                                MemoryDependenceResults *MD, DominatorTree *DT,
                                const TargetLibraryInfo *TLI) {
  bool Changed = false;
  for (BasicBlock &Block : F) {
    // Skip dead blocks: they may contain strange pointer cycles that would
    // confuse alias analysis.
    if (!DT->isReachableFromEntry(&Block))
      continue;
    Changed |= eliminateDeadStores(Block, AA, MD, DT, TLI);
  }
  return Changed;
}
|
|
|
|
|
2020-02-11 19:27:41 +01:00
|
|
|
namespace {
|
|
|
|
//=============================================================================
|
|
|
|
// MemorySSA backed dead store elimination.
|
|
|
|
//
|
|
|
|
// The code below implements dead store elimination using MemorySSA. It uses
|
|
|
|
// the following general approach: given a MemoryDef, walk upwards to find
|
|
|
|
// clobbering MemoryDefs that may be killed by the starting def. Then check
|
|
|
|
// that there are no uses that may read the location of the original MemoryDef
|
|
|
|
// in between both MemoryDefs. A bit more concretely:
|
|
|
|
//
|
|
|
|
// For all MemoryDefs StartDef:
|
|
|
|
// 1. Get the next dominating clobbering MemoryDef (DomAccess) by walking
|
|
|
|
// upwards.
|
|
|
|
// 2. Check that there are no reads between DomAccess and the StartDef by
|
|
|
|
// checking all uses starting at DomAccess and walking until we see StartDef.
|
|
|
|
// 3. For each found DomDef, check that:
|
|
|
|
// 1. There are no barrier instructions between DomDef and StartDef (like
|
|
|
|
// throws or stores with ordering constraints).
|
|
|
|
// 2. StartDef is executed whenever DomDef is executed.
|
|
|
|
// 3. StartDef completely overwrites DomDef.
|
|
|
|
// 4. Erase DomDef from the function and MemorySSA.
|
|
|
|
|
|
|
|
// Returns true if \p M is an intrisnic that does not read or write memory.
|
|
|
|
bool isNoopIntrinsic(MemoryUseOrDef *M) {
|
|
|
|
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(M->getMemoryInst())) {
|
|
|
|
switch (II->getIntrinsicID()) {
|
|
|
|
case Intrinsic::lifetime_start:
|
|
|
|
case Intrinsic::lifetime_end:
|
|
|
|
case Intrinsic::invariant_end:
|
|
|
|
case Intrinsic::launder_invariant_group:
|
|
|
|
case Intrinsic::assume:
|
|
|
|
return true;
|
|
|
|
case Intrinsic::dbg_addr:
|
|
|
|
case Intrinsic::dbg_declare:
|
|
|
|
case Intrinsic::dbg_label:
|
|
|
|
case Intrinsic::dbg_value:
|
|
|
|
llvm_unreachable("Intrinsic should not be modeled in MemorySSA");
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check if we can ignore \p D for DSE.
|
|
|
|
bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) {
|
|
|
|
Instruction *DI = D->getMemoryInst();
|
|
|
|
// Calls that only access inaccessible memory cannot read or write any memory
|
|
|
|
// locations we consider for elimination.
|
2020-04-23 18:15:04 +02:00
|
|
|
if (auto *CB = dyn_cast<CallBase>(DI))
|
|
|
|
if (CB->onlyAccessesInaccessibleMemory())
|
2020-02-11 19:27:41 +01:00
|
|
|
return true;
|
|
|
|
|
|
|
|
// We can eliminate stores to locations not visible to the caller across
|
|
|
|
// throwing instructions.
|
|
|
|
if (DI->mayThrow() && !DefVisibleToCaller)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
// We can remove the dead stores, irrespective of the fence and its ordering
|
|
|
|
// (release/acquire/seq_cst). Fences only constraints the ordering of
|
|
|
|
// already visible stores, it does not make a store visible to other
|
|
|
|
// threads. So, skipping over a fence does not change a store from being
|
|
|
|
// dead.
|
|
|
|
if (isa<FenceInst>(DI))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
// Skip intrinsics that do not really read or modify memory.
|
|
|
|
if (isNoopIntrinsic(D))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct DSEState {
|
|
|
|
Function &F;
|
|
|
|
AliasAnalysis &AA;
|
|
|
|
MemorySSA &MSSA;
|
|
|
|
DominatorTree &DT;
|
|
|
|
PostDominatorTree &PDT;
|
|
|
|
const TargetLibraryInfo &TLI;
|
|
|
|
|
|
|
|
// All MemoryDefs that potentially could kill other MemDefs.
|
|
|
|
SmallVector<MemoryDef *, 64> MemDefs;
|
|
|
|
// Any that should be skipped as they are already deleted
|
|
|
|
SmallPtrSet<MemoryAccess *, 4> SkipStores;
|
2020-04-15 11:42:58 +02:00
|
|
|
// Keep track of all of the objects that are invisible to the caller before
|
|
|
|
// the function returns.
|
|
|
|
SmallPtrSet<const Value *, 16> InvisibleToCallerBeforeRet;
|
|
|
|
// Keep track of all of the objects that are invisible to the caller after
|
|
|
|
// the function returns.
|
|
|
|
SmallPtrSet<const Value *, 16> InvisibleToCallerAfterRet;
|
2020-02-11 19:27:41 +01:00
|
|
|
// Keep track of blocks with throwing instructions not modeled in MemorySSA.
|
|
|
|
SmallPtrSet<BasicBlock *, 16> ThrowingBlocks;
|
2020-03-20 08:51:29 +01:00
|
|
|
// Post-order numbers for each basic block. Used to figure out if memory
|
|
|
|
// accesses are executed before another access.
|
|
|
|
DenseMap<BasicBlock *, unsigned> PostOrderNumbers;
|
2020-02-11 19:27:41 +01:00
|
|
|
|
2020-02-23 16:39:15 +01:00
|
|
|
/// Keep track of instructions (partly) overlapping with killing MemoryDefs per
|
|
|
|
/// basic block.
|
|
|
|
DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs;
|
|
|
|
|
2020-02-11 19:27:41 +01:00
|
|
|
/// Bind the function and the analyses this state operates on. Only the
/// references are stored; the bookkeeping containers are default-constructed.
DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
         PostDominatorTree &PDT, const TargetLibraryInfo &TLI)
    : F(F), AA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI) {
}
|
|
|
|
|
|
|
|
/// Build a fully populated DSEState for \p F.
///
/// A single post-order walk over the function records, per block, its
/// post-order number and, per instruction:
///  - blocks containing throwing instructions that MemorySSA does not model,
///  - the removable MemoryDefs with an analyzable write (up to
///    MemorySSADefsPerBlockLimit entries in total),
///  - allocas and non-captured alloc-like calls, as objects that are
///    invisible to the caller before and/or after the function returns.
/// Afterwards, pass-pointee-by-value arguments are classified the same way.
static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
                    DominatorTree &DT, PostDominatorTree &PDT,
                    const TargetLibraryInfo &TLI) {
  DSEState State(F, AA, MSSA, DT, PDT, TLI);

  unsigned NextPONumber = 0;
  for (BasicBlock *BB : post_order(&F)) {
    State.PostOrderNumbers[BB] = NextPONumber;
    ++NextPONumber;

    for (Instruction &I : *BB) {
      Instruction *Inst = &I;
      MemoryAccess *Access = MSSA.getMemoryAccess(Inst);

      // Throwing instructions without a MemorySSA access are invisible to
      // the MemorySSA walk, so remember the block they live in.
      if (!Access && Inst->mayThrow())
        State.ThrowingBlocks.insert(BB);

      // Collect the defs that may later be used as killing/killed stores.
      if (auto *Def = dyn_cast_or_null<MemoryDef>(Access)) {
        const bool UnderLimit =
            State.MemDefs.size() < MemorySSADefsPerBlockLimit;
        if (UnderLimit && hasAnalyzableMemoryWrite(Inst, TLI) &&
            isRemovable(Inst))
          State.MemDefs.push_back(Def);
      }

      // Allocas are invalid in the caller, so they are visible neither
      // before nor after the function returns.
      if (isa<AllocaInst>(Inst)) {
        State.InvisibleToCallerBeforeRet.insert(Inst);
        State.InvisibleToCallerAfterRet.insert(Inst);
      }

      // For alloc-like calls, first check whether the object is captured
      // before the function returns, then whether the return itself might
      // capture it.
      if (!isAllocLikeFn(Inst, &TLI))
        continue;
      if (PointerMayBeCaptured(Inst, /*ReturnCaptures=*/false,
                               /*StoreCaptures=*/true))
        continue;
      State.InvisibleToCallerBeforeRet.insert(Inst);
      if (!PointerMayBeCaptured(Inst, /*ReturnCaptures=*/true,
                                /*StoreCaptures=*/false))
        State.InvisibleToCallerAfterRet.insert(Inst);
    }
  }

  // Treat byval or inalloca arguments the same as allocas: stores to them are
  // dead at the end of the function.
  for (Argument &Arg : F.args()) {
    if (!Arg.hasPassPointeeByValueAttr())
      continue;
    // For byval, the caller doesn't know the address of the allocation.
    if (Arg.hasByValAttr())
      State.InvisibleToCallerBeforeRet.insert(&Arg);
    State.InvisibleToCallerAfterRet.insert(&Arg);
  }

  return State;
}
|
|
|
|
|
|
|
|
/// Return the memory location written by \p I, if one can be determined.
///
/// \returns None when \p I does not write to memory, or when it is a call
/// that is neither a memory intrinsic nor one of the recognized string
/// library functions (strcpy, strncpy, strcat, strncat). For those library
/// functions the location is built from the first call argument. All other
/// instructions are delegated to MemoryLocation::getOrNone.
Optional<MemoryLocation> getLocForWriteEx(Instruction *I) const {
  if (!I->mayWriteToMemory())
    return None;

  if (auto *MemIntr = dyn_cast<AnyMemIntrinsic>(I))
    return {MemoryLocation::getForDest(MemIntr)};

  auto *Call = dyn_cast<CallBase>(I);
  if (!Call)
    return MemoryLocation::getOrNone(I);

  // Calls without a statically known callee cannot be analyzed.
  Function *Callee = Call->getCalledFunction();
  if (!Callee)
    return None;

  // NOTE(review): the callee is matched by name only; its prototype is not
  // validated here -- confirm that callers guarantee a first argument.
  static const LibFunc FirstArgWriters[] = {LibFunc_strcpy, LibFunc_strncpy,
                                            LibFunc_strcat, LibFunc_strncat};
  StringRef CalleeName = Callee->getName();
  for (LibFunc LF : FirstArgWriters)
    if (TLI.has(LF) && CalleeName == TLI.getName(LF))
      return {MemoryLocation(Call->getArgOperand(0))};

  return None;
}
|
|
|
|
|
|
|
|
/// Returns true if \p Use completely overwrites \p DefLoc.
|
|
|
|
bool isCompleteOverwrite(MemoryLocation DefLoc, Instruction *UseInst) const {
|
|
|
|
// UseInst has a MemoryDef associated in MemorySSA. It's possible for a
|
|
|
|
// MemoryDef to not write to memory, e.g. a volatile load is modeled as a
|
|
|
|
// MemoryDef.
|
|
|
|
if (!UseInst->mayWriteToMemory())
|
|
|
|
return false;
|
|
|
|
|
2020-04-23 18:15:04 +02:00
|
|
|
if (auto *CB = dyn_cast<CallBase>(UseInst))
|
|
|
|
if (CB->onlyAccessesInaccessibleMemory())
|
2020-02-11 19:27:41 +01:00
|
|
|
return false;
|
|
|
|
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, the CFG check is successful 15738 times out of
the 20322 times we reach it. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
int64_t InstWriteOffset, DepWriteOffset;
|
|
|
|
auto CC = getLocForWriteEx(UseInst);
|
|
|
|
InstOverlapIntervalsTy IOL;
|
|
|
|
|
|
|
|
const DataLayout &DL = F.getParent()->getDataLayout();
|
2020-02-11 19:27:41 +01:00
|
|
|
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, in 15738 out of the 20322 times we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
return CC &&
|
2020-06-13 21:24:57 +02:00
|
|
|
isOverwrite(*CC, DefLoc, DL, TLI, DepWriteOffset, InstWriteOffset,
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, in 15738 out of the 20322 times we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
UseInst, IOL, AA, &F) == OW_Complete;
|
2020-02-11 19:27:41 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns true if \p UseInst may read from \p DefLoc.
///
/// Returns false immediately when \p UseInst does not read memory, or when it
/// is a call that only accesses inaccessible memory. Otherwise the result is
/// whether the Ref bit is set in the mod/ref info for (\p UseInst, \p DefLoc):
/// the info comes from AA.getModRefInfo and, if that has Ref set, is replaced
/// by the result of AA.callCapturesBefore before the final check.
|
|
|
|
bool isReadClobber(MemoryLocation DefLoc, Instruction *UseInst) const {
|
|
|
|
// An instruction that never reads memory cannot read DefLoc.
if (!UseInst->mayReadFromMemory())
|
|
|
|
return false;
|
|
|
|
|
2020-04-23 18:15:04 +02:00
|
|
|
// Calls that only access inaccessible memory are treated as not reading
// DefLoc.
if (auto *CB = dyn_cast<CallBase>(UseInst))
|
|
|
|
if (CB->onlyAccessesInaccessibleMemory())
|
2020-02-11 19:27:41 +01:00
|
|
|
return false;
|
|
|
|
|
|
|
|
ModRefInfo MR = AA.getModRefInfo(UseInst, DefLoc);
|
|
|
|
// If necessary, perform additional analysis.
// Only when the first query reports a possible read (Ref set) is the second
// query issued; its result then replaces MR.
|
|
|
|
if (isRefSet(MR))
|
|
|
|
MR = AA.callCapturesBefore(UseInst, DefLoc, &DT);
|
|
|
|
return isRefSet(MR);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Find a MemoryDef writing to \p DefLoc and dominating \p Current, with no
|
2020-04-15 11:42:58 +02:00
|
|
|
// read access between them or on any other path to a function exit block if
|
|
|
|
// \p DefLoc is not accessible after the function returns. If there is no such
|
|
|
|
// MemoryDef, return None. The returned value may not (completely) overwrite
|
|
|
|
// \p DefLoc. Currently we bail out when we encounter an aliasing MemoryUse
|
|
|
|
// (read).
|
|
|
|
Optional<MemoryAccess *>
|
|
|
|
getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *Current,
|
|
|
|
MemoryLocation DefLoc, bool DefVisibleToCallerBeforeRet,
|
|
|
|
bool DefVisibleToCallerAfterRet, int &ScanLimit) const {
|
2020-03-20 08:51:29 +01:00
|
|
|
MemoryAccess *DomAccess;
|
2020-02-11 19:27:41 +01:00
|
|
|
bool StepAgain;
|
|
|
|
LLVM_DEBUG(dbgs() << " trying to get dominating access for " << *Current
|
|
|
|
<< "\n");
|
|
|
|
// Find the next clobbering Mod access for DefLoc, starting at Current.
|
|
|
|
do {
|
|
|
|
StepAgain = false;
|
|
|
|
// Reached TOP.
|
|
|
|
if (MSSA.isLiveOnEntryDef(Current))
|
|
|
|
return None;
|
|
|
|
|
2020-03-20 08:51:29 +01:00
|
|
|
if (isa<MemoryPhi>(Current)) {
|
|
|
|
DomAccess = Current;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
MemoryUseOrDef *CurrentUD = cast<MemoryUseOrDef>(Current);
|
2020-02-11 19:27:41 +01:00
|
|
|
// Look for accesses that clobber DefLoc.
|
2020-03-20 08:51:29 +01:00
|
|
|
DomAccess = MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(CurrentUD,
|
|
|
|
DefLoc);
|
|
|
|
if (MSSA.isLiveOnEntryDef(DomAccess))
|
2020-02-11 19:27:41 +01:00
|
|
|
return None;
|
|
|
|
|
2020-03-20 08:51:29 +01:00
|
|
|
if (isa<MemoryPhi>(DomAccess))
|
|
|
|
break;
|
|
|
|
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, in 15738 out of the 20322 times we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
// Check if we can skip DomDef for DSE.
|
2020-03-20 08:51:29 +01:00
|
|
|
MemoryDef *DomDef = dyn_cast<MemoryDef>(DomAccess);
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, in 15738 out of the 20322 times we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
if (DomDef && canSkipDef(DomDef, DefVisibleToCallerBeforeRet)) {
|
2020-02-11 19:27:41 +01:00
|
|
|
StepAgain = true;
|
2020-03-20 08:51:29 +01:00
|
|
|
Current = DomDef->getDefiningAccess();
|
2020-02-11 19:27:41 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
} while (StepAgain);
|
|
|
|
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, in 15738 out of the 20322 times we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
// Accesses to objects accessible after the function returns can only be
|
|
|
|
// eliminated if the access is killed along all paths to the exit. Collect
|
|
|
|
// the blocks with killing (=completely overwriting MemoryDefs) and check if
|
|
|
|
// they cover all paths from DomAccess to any function exit.
|
|
|
|
SmallPtrSet<BasicBlock *, 16> KillingBlocks = {KillingDef->getBlock()};
|
2020-03-20 08:51:29 +01:00
|
|
|
LLVM_DEBUG({
|
|
|
|
dbgs() << " Checking for reads of " << *DomAccess;
|
|
|
|
if (isa<MemoryDef>(DomAccess))
|
|
|
|
dbgs() << " (" << *cast<MemoryDef>(DomAccess)->getMemoryInst() << ")\n";
|
2020-04-24 18:48:03 +02:00
|
|
|
else
|
|
|
|
dbgs() << ")\n";
|
2020-03-20 08:51:29 +01:00
|
|
|
});
|
2020-02-11 19:27:41 +01:00
|
|
|
|
|
|
|
SmallSetVector<MemoryAccess *, 32> WorkList;
|
|
|
|
auto PushMemUses = [&WorkList](MemoryAccess *Acc) {
|
|
|
|
for (Use &U : Acc->uses())
|
|
|
|
WorkList.insert(cast<MemoryAccess>(U.getUser()));
|
|
|
|
};
|
2020-03-20 08:51:29 +01:00
|
|
|
PushMemUses(DomAccess);
|
2020-02-11 19:27:41 +01:00
|
|
|
|
|
|
|
// Check if DomDef may be read.
|
|
|
|
for (unsigned I = 0; I < WorkList.size(); I++) {
|
|
|
|
MemoryAccess *UseAccess = WorkList[I];
|
|
|
|
|
2020-04-24 18:48:03 +02:00
|
|
|
LLVM_DEBUG(dbgs() << " " << *UseAccess);
|
2020-02-11 19:27:41 +01:00
|
|
|
if (--ScanLimit == 0) {
|
2020-04-24 18:48:03 +02:00
|
|
|
LLVM_DEBUG(dbgs() << "\n ... hit scan limit\n");
|
2020-02-11 19:27:41 +01:00
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (isa<MemoryPhi>(UseAccess)) {
|
2020-04-24 18:48:03 +02:00
|
|
|
LLVM_DEBUG(dbgs() << "\n ... adding PHI uses\n");
|
2020-03-20 08:51:29 +01:00
|
|
|
PushMemUses(UseAccess);
|
|
|
|
continue;
|
2020-02-11 19:27:41 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst();
|
|
|
|
LLVM_DEBUG(dbgs() << " (" << *UseInst << ")\n");
|
|
|
|
|
|
|
|
if (isNoopIntrinsic(cast<MemoryUseOrDef>(UseAccess))) {
|
2020-04-24 18:48:03 +02:00
|
|
|
LLVM_DEBUG(dbgs() << " ... adding uses of intrinsic\n");
|
2020-02-11 19:27:41 +01:00
|
|
|
PushMemUses(UseAccess);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Uses which may read the original MemoryDef mean we cannot eliminate the
|
|
|
|
// original MD. Stop walk.
|
|
|
|
if (isReadClobber(DefLoc, UseInst)) {
|
2020-04-24 18:48:03 +02:00
|
|
|
LLVM_DEBUG(dbgs() << " ... found read clobber\n");
|
2020-02-11 19:27:41 +01:00
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
2020-04-23 21:37:26 +02:00
|
|
|
// For the KillingDef and DomAccess we only have to check if it reads the
|
|
|
|
// memory location.
|
2020-03-20 08:51:29 +01:00
|
|
|
// TODO: It would probably be better to check for self-reads before
|
|
|
|
// calling the function.
|
2020-04-24 18:48:03 +02:00
|
|
|
if (KillingDef == UseAccess || DomAccess == UseAccess) {
|
|
|
|
LLVM_DEBUG(dbgs() << " ... skipping killing def/dom access\n");
|
2020-02-11 19:27:41 +01:00
|
|
|
continue;
|
2020-04-24 18:48:03 +02:00
|
|
|
}
|
2020-02-11 19:27:41 +01:00
|
|
|
|
|
|
|
// Check all uses for MemoryDefs, except for defs completely overwriting
|
|
|
|
// the original location. Otherwise we have to check uses of *all*
|
|
|
|
// MemoryDefs we discover, including non-aliasing ones. Otherwise we might
|
|
|
|
// miss cases like the following
|
|
|
|
// 1 = Def(LoE) ; <----- DomDef stores [0,1]
|
|
|
|
// 2 = Def(1) ; (2, 1) = NoAlias, stores [2,3]
|
|
|
|
// Use(2) ; MayAlias 2 *and* 1, loads [0, 3].
|
|
|
|
// (The Use points to the *first* Def it may alias)
|
|
|
|
// 3 = Def(1) ; <---- Current (3, 2) = NoAlias, (3,1) = MayAlias,
|
|
|
|
// stores [0,1]
|
|
|
|
if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess)) {
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, in 15738 out of the 20322 times we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
if (isCompleteOverwrite(DefLoc, UseInst)) {
|
|
|
|
if (DefVisibleToCallerAfterRet && UseAccess != DomAccess) {
|
|
|
|
BasicBlock *MaybeKillingBlock = UseInst->getParent();
|
|
|
|
if (PostOrderNumbers.find(MaybeKillingBlock)->second <
|
|
|
|
PostOrderNumbers.find(DomAccess->getBlock())->second) {
|
|
|
|
|
|
|
|
LLVM_DEBUG(dbgs() << " ... found killing block "
|
|
|
|
<< MaybeKillingBlock->getName() << "\n");
|
|
|
|
KillingBlocks.insert(MaybeKillingBlock);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else
|
2020-02-11 19:27:41 +01:00
|
|
|
PushMemUses(UseDef);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, in 15738 out of the 20322 times we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
// For accesses to locations visible after the function returns, make sure
|
|
|
|
// that the location is killed (=overwritten) along all paths from DomAccess
|
|
|
|
// to the exit.
|
|
|
|
if (DefVisibleToCallerAfterRet) {
|
|
|
|
assert(!KillingBlocks.empty() &&
|
|
|
|
"Expected at least a single killing block");
|
|
|
|
// Find the common post-dominator of all killing blocks.
|
|
|
|
BasicBlock *CommonPred = *KillingBlocks.begin();
|
|
|
|
for (auto I = std::next(KillingBlocks.begin()), E = KillingBlocks.end();
|
|
|
|
I != E; I++) {
|
|
|
|
if (!CommonPred)
|
|
|
|
break;
|
|
|
|
CommonPred = PDT.findNearestCommonDominator(CommonPred, *I);
|
|
|
|
}
|
|
|
|
|
|
|
|
// If CommonPred is in the set of killing blocks, just check if it
|
|
|
|
// post-dominates DomAccess.
|
|
|
|
if (KillingBlocks.count(CommonPred)) {
|
|
|
|
if (PDT.dominates(CommonPred, DomAccess->getBlock()))
|
|
|
|
return {DomAccess};
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If the common post-dominator does not post-dominate DomAccess, there
|
|
|
|
// is a path from DomAccess to an exit not going through a killing block.
|
|
|
|
if (PDT.dominates(CommonPred, DomAccess->getBlock())) {
|
|
|
|
SetVector<BasicBlock *> WorkList;
|
|
|
|
|
|
|
|
// DomAccess's post-order number provides an upper bound of the blocks
|
|
|
|
// on a path starting at DomAccess.
|
|
|
|
unsigned UpperBound =
|
|
|
|
PostOrderNumbers.find(DomAccess->getBlock())->second;
|
|
|
|
|
|
|
|
// If CommonPred is null, there are multiple exits from the function.
|
|
|
|
// They all have to be added to the worklist.
|
|
|
|
if (CommonPred)
|
|
|
|
WorkList.insert(CommonPred);
|
|
|
|
else
|
2020-06-21 17:34:54 +02:00
|
|
|
for (BasicBlock *R : PDT.getRoots())
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be on a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, the CFG check is successful in 15738 of the
20322 cases in which we reach it. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
WorkList.insert(R);
|
|
|
|
|
|
|
|
NumCFGTries++;
|
|
|
|
// Check if all paths starting from an exit node go through one of the
|
|
|
|
// killing blocks before reaching DomAccess.
|
|
|
|
for (unsigned I = 0; I < WorkList.size(); I++) {
|
|
|
|
NumCFGChecks++;
|
|
|
|
BasicBlock *Current = WorkList[I];
|
|
|
|
if (KillingBlocks.count(Current))
|
|
|
|
continue;
|
|
|
|
if (Current == DomAccess->getBlock())
|
|
|
|
return None;
|
2020-06-21 17:34:54 +02:00
|
|
|
|
|
|
|
// DomAccess is reachable from the entry, so we don't have to explore
|
|
|
|
// unreachable blocks further.
|
|
|
|
if (!DT.isReachableFromEntry(Current))
|
|
|
|
continue;
|
|
|
|
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be on a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, the CFG check is successful in 15738 of the
20322 cases in which we reach it. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
unsigned CPO = PostOrderNumbers.find(Current)->second;
|
|
|
|
// Current block is not on a path starting at DomAccess.
|
|
|
|
if (CPO > UpperBound)
|
|
|
|
continue;
|
|
|
|
for (BasicBlock *Pred : predecessors(Current))
|
|
|
|
WorkList.insert(Pred);
|
|
|
|
|
|
|
|
if (WorkList.size() >= MemorySSAPathCheckLimit)
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
NumCFGSuccess++;
|
|
|
|
return {DomAccess};
|
|
|
|
}
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
2020-03-20 08:51:29 +01:00
|
|
|
// No aliasing MemoryUses of DomAccess found, DomAccess is potentially dead.
|
|
|
|
return {DomAccess};
|
2020-02-11 19:27:41 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// Delete dead memory defs
|
|
|
|
void deleteDeadInstruction(Instruction *SI) {
|
|
|
|
MemorySSAUpdater Updater(&MSSA);
|
|
|
|
SmallVector<Instruction *, 32> NowDeadInsts;
|
|
|
|
NowDeadInsts.push_back(SI);
|
|
|
|
--NumFastOther;
|
|
|
|
|
|
|
|
while (!NowDeadInsts.empty()) {
|
|
|
|
Instruction *DeadInst = NowDeadInsts.pop_back_val();
|
|
|
|
++NumFastOther;
|
|
|
|
|
|
|
|
// Try to preserve debug information attached to the dead instruction.
|
|
|
|
salvageDebugInfo(*DeadInst);
|
2020-04-14 11:56:56 +02:00
|
|
|
salvageKnowledge(DeadInst);
|
2020-02-11 19:27:41 +01:00
|
|
|
|
|
|
|
// Remove the Instruction from MSSA.
|
|
|
|
if (MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst)) {
|
|
|
|
if (MemoryDef *MD = dyn_cast<MemoryDef>(MA)) {
|
|
|
|
SkipStores.insert(MD);
|
|
|
|
}
|
|
|
|
Updater.removeMemoryAccess(MA);
|
|
|
|
}
|
|
|
|
|
2020-02-23 16:39:15 +01:00
|
|
|
auto I = IOLs.find(DeadInst->getParent());
|
|
|
|
if (I != IOLs.end())
|
|
|
|
I->second.erase(DeadInst);
|
2020-02-11 19:27:41 +01:00
|
|
|
// Remove its operands
|
|
|
|
for (Use &O : DeadInst->operands())
|
|
|
|
if (Instruction *OpI = dyn_cast<Instruction>(O)) {
|
|
|
|
O = nullptr;
|
|
|
|
if (isInstructionTriviallyDead(OpI, &TLI))
|
|
|
|
NowDeadInsts.push_back(OpI);
|
|
|
|
}
|
|
|
|
|
|
|
|
DeadInst->eraseFromParent();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check for any extra throws between SI and NI that block DSE. This only
|
|
|
|
// checks extra maythrows (those that aren't MemoryDef's). MemoryDef that may
|
|
|
|
// throw are handled during the walk from one def to the next.
|
|
|
|
bool mayThrowBetween(Instruction *SI, Instruction *NI,
|
|
|
|
const Value *SILocUnd) const {
|
|
|
|
// First see if we can ignore it by using the fact that SI is an
|
|
|
|
// alloca/alloca like object that is not visible to the caller during
|
|
|
|
// execution of the function.
|
2020-04-15 11:42:58 +02:00
|
|
|
if (SILocUnd && InvisibleToCallerBeforeRet.count(SILocUnd))
|
2020-02-11 19:27:41 +01:00
|
|
|
return false;
|
|
|
|
|
|
|
|
if (SI->getParent() == NI->getParent())
|
2020-06-07 22:36:10 +02:00
|
|
|
return ThrowingBlocks.count(SI->getParent());
|
2020-02-11 19:27:41 +01:00
|
|
|
return !ThrowingBlocks.empty();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check if \p NI acts as a DSE barrier for \p SI. The following instructions
|
|
|
|
// act as barriers:
|
|
|
|
// * A memory instruction that may throw and \p SI accesses a non-stack
|
|
|
|
// object.
|
|
|
|
// * Atomic stores stronger that monotonic.
|
2020-06-22 17:24:27 +02:00
|
|
|
bool isDSEBarrier(const Value *SILocUnd, Instruction *NI) const {
|
2020-02-11 19:27:41 +01:00
|
|
|
// If NI may throw it acts as a barrier, unless we are to an alloca/alloca
|
|
|
|
// like object that does not escape.
|
2020-04-15 11:42:58 +02:00
|
|
|
if (NI->mayThrow() && !InvisibleToCallerBeforeRet.count(SILocUnd))
|
2020-02-11 19:27:41 +01:00
|
|
|
return true;
|
|
|
|
|
2020-06-22 17:24:27 +02:00
|
|
|
// If NI is an atomic load/store stronger than monotonic, do not try to
|
|
|
|
// eliminate/reorder it.
|
2020-02-11 19:27:41 +01:00
|
|
|
if (NI->isAtomic()) {
|
2020-06-22 17:24:27 +02:00
|
|
|
if (auto *LI = dyn_cast<LoadInst>(NI))
|
|
|
|
return isStrongerThanMonotonic(LI->getOrdering());
|
|
|
|
if (auto *SI = dyn_cast<StoreInst>(NI))
|
|
|
|
return isStrongerThanMonotonic(SI->getOrdering());
|
|
|
|
llvm_unreachable("other instructions should be skipped in MemorySSA");
|
2020-02-11 19:27:41 +01:00
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2020-05-30 18:56:04 +02:00
|
|
|
/// \returns true if \p KillingDef stores the result of \p Load to the source of
|
|
|
|
/// \p Load.
|
|
|
|
static bool storeIsNoop(MemorySSA &MSSA, LoadInst *Load,
|
|
|
|
MemoryDef *KillingDef) {
|
|
|
|
Instruction *Store = KillingDef->getMemoryInst();
|
|
|
|
// If the load's operand isn't the destination of the store, bail.
|
|
|
|
if (Load->getPointerOperand() != Store->getOperand(1))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// Get the defining access for the load.
|
|
|
|
auto *LoadAccess = MSSA.getMemoryAccess(Load)->getDefiningAccess();
|
|
|
|
// The store is dead if the defining accesses are the same.
|
|
|
|
return LoadAccess == KillingDef->getDefiningAccess();
|
|
|
|
}
|
|
|
|
|
2020-02-11 19:27:41 +01:00
|
|
|
/// MemorySSA-based dead store elimination over \p F.
///
/// Every MemoryDef collected in the DSE state is treated as a potential
/// killing def. For each one, the function first checks whether it merely
/// stores back a value that was just loaded, and otherwise walks the
/// candidate accesses returned by DSEState::getDomMemoryDef, deleting those
/// that the killing def completely overwrites and merging partially
/// overlapping constant stores when EnablePartialStoreMerging is set. If
/// EnablePartialOverwriteTracking is set, the recorded overlap intervals are
/// finally used to remove partially overlapped stores.
///
/// \returns true if the function was modified.
bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
                                  MemorySSA &MSSA, DominatorTree &DT,
                                  PostDominatorTree &PDT,
                                  const TargetLibraryInfo &TLI) {
  const DataLayout &DL = F.getParent()->getDataLayout();
  bool MadeChange = false;

  DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI);
  // For each store:
  for (unsigned I = 0; I < State.MemDefs.size(); I++) {
    MemoryDef *KillingDef = State.MemDefs[I];
    if (State.SkipStores.count(KillingDef))
      continue;
    Instruction *SI = KillingDef->getMemoryInst();

    // Check if we're storing a value that we just loaded.
    if (auto *Load = dyn_cast<LoadInst>(SI->getOperand(0))) {
      if (storeIsNoop(MSSA, Load, KillingDef)) {
        LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *SI
                          << '\n');
        State.deleteDeadInstruction(SI);
        NumNoopStores++;
        MadeChange = true;
        continue;
      }
    }

    auto MaybeSILoc = State.getLocForWriteEx(SI);
    if (!MaybeSILoc) {
      LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for "
                        << *SI << "\n");
      continue;
    }
    MemoryLocation SILoc = *MaybeSILoc;
    assert(SILoc.Ptr && "SILoc should not be null");
    const Value *SILocUnd = GetUnderlyingObject(SILoc.Ptr, DL);
    Instruction *DefObj =
        const_cast<Instruction *>(dyn_cast<Instruction>(SILocUnd));
    bool DefVisibleToCallerBeforeRet =
        !State.InvisibleToCallerBeforeRet.count(SILocUnd);
    bool DefVisibleToCallerAfterRet =
        !State.InvisibleToCallerAfterRet.count(SILocUnd);
    // For allocation-like objects, refine the visibility with a capture query
    // relative to SI.
    if (DefObj && isAllocLikeFn(DefObj, &TLI)) {
      if (DefVisibleToCallerBeforeRet)
        DefVisibleToCallerBeforeRet =
            PointerMayBeCapturedBefore(DefObj, false, true, SI, &DT);
    }

    MemoryAccess *Current = KillingDef;
    LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by "
                      << *KillingDef << " (" << *SI << ")\n");

    int ScanLimit = MemorySSAScanLimit;
    // Worklist of MemoryAccesses that may be killed by KillingDef.
    SetVector<MemoryAccess *> ToCheck;
    ToCheck.insert(KillingDef->getDefiningAccess());

    // Check if MemoryAccesses in the worklist are killed by KillingDef. The
    // worklist may grow while it is being processed, hence the index loop.
    for (unsigned CheckIdx = 0; CheckIdx < ToCheck.size(); CheckIdx++) {
      Current = ToCheck[CheckIdx];
      if (State.SkipStores.count(Current))
        continue;

      Optional<MemoryAccess *> Next = State.getDomMemoryDef(
          KillingDef, Current, SILoc, DefVisibleToCallerBeforeRet,
          DefVisibleToCallerAfterRet, ScanLimit);

      if (!Next) {
        LLVM_DEBUG(dbgs() << " finished walk\n");
        continue;
      }

      MemoryAccess *DomAccess = *Next;
      LLVM_DEBUG(dbgs() << " Checking if we can kill " << *DomAccess);
      if (isa<MemoryPhi>(DomAccess)) {
        LLVM_DEBUG(dbgs() << "\n ... adding incoming values to worklist\n");
        for (Value *V : cast<MemoryPhi>(DomAccess)->incoming_values()) {
          MemoryAccess *IncomingAccess = cast<MemoryAccess>(V);
          BasicBlock *IncomingBlock = IncomingAccess->getBlock();
          BasicBlock *PhiBlock = DomAccess->getBlock();

          // We only consider incoming MemoryAccesses that come before the
          // MemoryPhi. Otherwise we could discover candidates that do not
          // strictly dominate our starting def.
          if (State.PostOrderNumbers[IncomingBlock] >
              State.PostOrderNumbers[PhiBlock])
            ToCheck.insert(IncomingAccess);
        }
        continue;
      }
      // MemoryPhis were handled above and NextDef is dereferenced
      // unconditionally below, so use the asserting cast.
      MemoryDef *NextDef = cast<MemoryDef>(DomAccess);
      Instruction *NI = NextDef->getMemoryInst();
      LLVM_DEBUG(dbgs() << " (" << *NI << ")\n");

      // Before we try to remove anything, check for any extra throwing
      // instructions that block us from DSEing
      if (State.mayThrowBetween(SI, NI, SILocUnd)) {
        LLVM_DEBUG(dbgs() << " ... skip, may throw!\n");
        break;
      }

      // Check for anything that looks like it will be a barrier to further
      // removal
      if (State.isDSEBarrier(SILocUnd, NI)) {
        LLVM_DEBUG(dbgs() << " ... skip, barrier\n");
        continue;
      }

      // Keep walking upwards from this def, whether or not it gets removed.
      ToCheck.insert(NextDef->getDefiningAccess());

      if (!hasAnalyzableMemoryWrite(NI, TLI)) {
        LLVM_DEBUG(dbgs() << " ... skip, cannot analyze def\n");
        continue;
      }

      if (!isRemovable(NI)) {
        LLVM_DEBUG(dbgs() << " ... skip, cannot remove def\n");
        continue;
      }

      if (!DebugCounter::shouldExecute(MemorySSACounter))
        continue;

      MemoryLocation NILoc = *State.getLocForWriteEx(NI);
      // Check if NI overwrites SI.
      int64_t InstWriteOffset, DepWriteOffset;
      auto Iter = State.IOLs.insert(
          std::make_pair<BasicBlock *, InstOverlapIntervalsTy>(
              NI->getParent(), InstOverlapIntervalsTy()));
      auto &IOL = Iter.first->second;
      OverwriteResult OR = isOverwrite(SILoc, NILoc, DL, TLI, DepWriteOffset,
                                       InstWriteOffset, NI, IOL, AA, &F);

      if (EnablePartialStoreMerging && OR == OW_PartialEarlierWithFullLater) {
        auto *Earlier = dyn_cast<StoreInst>(NI);
        auto *Later = dyn_cast<StoreInst>(SI);
        if (Constant *Merged = tryToMergePartialOverlappingStores(
                Earlier, Later, InstWriteOffset, DepWriteOffset, DL, &AA,
                &DT)) {

          // Update stored value of earlier store to merged constant.
          Earlier->setOperand(0, Merged);
          ++NumModifiedStores;
          MadeChange = true;

          // Remove later store and remove any outstanding overlap intervals for
          // the updated store.
          State.deleteDeadInstruction(Later);
          auto EarlierIOL = State.IOLs.find(Earlier->getParent());
          if (EarlierIOL != State.IOLs.end())
            EarlierIOL->second.erase(Earlier);
          // The killing store is gone; stop processing its worklist.
          break;
        }
      }

      if (OR == OW_Complete) {
        LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI
                          << "\n KILLER: " << *SI << '\n');
        State.deleteDeadInstruction(NI);
        ++NumFastStores;
        MadeChange = true;
      }
    }
  }

  if (EnablePartialOverwriteTracking)
    for (auto &KV : State.IOLs)
      MadeChange |= removePartiallyOverlappedStores(&AA, DL, KV.second);

  return MadeChange;
}
|
|
|
|
} // end anonymous namespace
|
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// DSE Pass
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// New pass manager entry point. Runs either the MemorySSA-based or the
/// MemoryDependence-based implementation, depending on EnableMemorySSA, and
/// reports which analyses are still valid afterwards.
PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);

  // Dispatch to the selected implementation; the analyses specific to each
  // implementation are only requested on the path that needs them.
  auto RunSelectedDSE = [&]() -> bool {
    if (EnableMemorySSA) {
      MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
      PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
      return eliminateDeadStoresMemorySSA(F, AA, MSSA, DT, PDT, TLI);
    }
    MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F);
    return eliminateDeadStores(F, &AA, &MD, &DT, &TLI);
  };
  const bool Changed = RunSelectedDSE();

#ifdef LLVM_ENABLE_STATS
  // Count the stores that are left in the function after DSE.
  if (AreStatisticsEnabled()) {
    for (auto &Inst : instructions(F))
      NumRemainingStores += isa<StoreInst>(&Inst);
  }
#endif

  if (Changed) {
    // The CFG is untouched, and the analysis driving the selected
    // implementation is declared preserved.
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    PA.preserve<GlobalsAA>();
    if (EnableMemorySSA)
      PA.preserve<MemorySSAAnalysis>();
    else
      PA.preserve<MemoryDependenceAnalysis>();
    return PA;
  }

  return PreservedAnalyses::all();
}
|
|
|
|
|
2016-07-10 13:28:51 +02:00
|
|
|
namespace {
|
2017-10-13 23:17:07 +02:00
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
/// A legacy pass for the legacy pass manager that wraps \c DSEPass.
|
|
|
|
class DSELegacyPass : public FunctionPass {
|
|
|
|
public:
|
2017-10-13 23:17:07 +02:00
|
|
|
static char ID; // Pass identification, replacement for typeid
|
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
DSELegacyPass() : FunctionPass(ID) {
|
|
|
|
initializeDSELegacyPassPass(*PassRegistry::getPassRegistry());
|
|
|
|
}
|
|
|
|
|
|
|
|
bool runOnFunction(Function &F) override {
|
|
|
|
if (skipFunction(F))
|
|
|
|
return false;
|
|
|
|
|
2020-02-11 19:27:41 +01:00
|
|
|
AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
|
|
|
|
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
|
|
|
const TargetLibraryInfo &TLI =
|
|
|
|
getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
|
|
|
|
|
2020-04-25 16:02:02 +02:00
|
|
|
bool Changed = false;
|
2020-02-11 19:27:41 +01:00
|
|
|
if (EnableMemorySSA) {
|
|
|
|
MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
|
|
|
|
PostDominatorTree &PDT =
|
|
|
|
getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
|
|
|
|
|
2020-06-04 22:23:32 +02:00
|
|
|
Changed = eliminateDeadStoresMemorySSA(F, AA, MSSA, DT, PDT, TLI);
|
2020-02-11 19:27:41 +01:00
|
|
|
} else {
|
|
|
|
MemoryDependenceResults &MD =
|
|
|
|
getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
|
2016-05-17 23:38:13 +02:00
|
|
|
|
2020-04-25 16:02:02 +02:00
|
|
|
Changed = eliminateDeadStores(F, &AA, &MD, &DT, &TLI);
|
2020-02-11 19:27:41 +01:00
|
|
|
}
|
2020-04-25 16:02:02 +02:00
|
|
|
|
|
|
|
#ifdef LLVM_ENABLE_STATS
|
|
|
|
if (AreStatisticsEnabled())
|
|
|
|
for (auto &I : instructions(F))
|
|
|
|
NumRemainingStores += isa<StoreInst>(&I);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
return Changed;
|
2016-05-17 23:38:13 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
|
|
AU.setPreservesCFG();
|
|
|
|
AU.addRequired<AAResultsWrapperPass>();
|
|
|
|
AU.addRequired<TargetLibraryInfoWrapperPass>();
|
2020-02-11 15:26:15 +01:00
|
|
|
AU.addPreserved<GlobalsAAWrapperPass>();
|
2020-02-11 19:27:41 +01:00
|
|
|
AU.addRequired<DominatorTreeWrapperPass>();
|
|
|
|
AU.addPreserved<DominatorTreeWrapperPass>();
|
|
|
|
|
|
|
|
if (EnableMemorySSA) {
|
|
|
|
AU.addRequired<PostDominatorTreeWrapperPass>();
|
|
|
|
AU.addRequired<MemorySSAWrapperPass>();
|
|
|
|
AU.addPreserved<PostDominatorTreeWrapperPass>();
|
|
|
|
AU.addPreserved<MemorySSAWrapperPass>();
|
|
|
|
} else {
|
|
|
|
AU.addRequired<MemoryDependenceWrapperPass>();
|
|
|
|
AU.addPreserved<MemoryDependenceWrapperPass>();
|
|
|
|
}
|
2016-05-17 23:38:13 +02:00
|
|
|
}
|
|
|
|
};
|
2017-10-13 23:17:07 +02:00
|
|
|
|
2016-07-10 13:28:51 +02:00
|
|
|
} // end anonymous namespace
|
2016-05-17 23:38:13 +02:00
|
|
|
|
|
|
|
char DSELegacyPass::ID = 0;
|
2017-10-13 23:17:07 +02:00
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false,
|
|
|
|
false)
|
|
|
|
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
|
2020-02-11 19:27:41 +01:00
|
|
|
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
|
2016-05-17 23:38:13 +02:00
|
|
|
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
|
|
|
|
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
|
2020-02-11 19:27:41 +01:00
|
|
|
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
|
2016-05-17 23:38:13 +02:00
|
|
|
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
|
|
|
|
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
|
|
|
|
INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
|
|
|
|
false)
|
|
|
|
|
|
|
|
/// Factory for the legacy pass manager: returns a newly allocated
/// DSELegacyPass instance.
FunctionPass *llvm::createDeadStoreEliminationPass() {
  FunctionPass *DSE = new DSELegacyPass();
  return DSE;
}
|