2021-03-03 20:38:50 +01:00
|
|
|
//===- DeadStoreElimination.cpp - MemorySSA Backed Dead Store Elimination -===//
|
2007-07-11 02:46:18 +02:00
|
|
|
//
|
2019-01-19 09:50:56 +01:00
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
2007-07-11 02:46:18 +02:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
2021-03-03 20:38:50 +01:00
|
|
|
// The code below implements dead store elimination using MemorySSA. It uses
|
|
|
|
// the following general approach: given a MemoryDef, walk upwards to find
|
|
|
|
// clobbering MemoryDefs that may be killed by the starting def. Then check
|
|
|
|
// that there are no uses that may read the location of the original MemoryDef
|
|
|
|
// in between both MemoryDefs. A bit more concretely:
|
2015-12-11 19:39:41 +01:00
|
|
|
//
|
2021-03-03 20:38:50 +01:00
|
|
|
// For all MemoryDefs StartDef:
|
|
|
|
// 1. Get the next dominating clobbering MemoryDef (EarlierAccess) by walking
|
|
|
|
// upwards.
|
|
|
|
// 2. Check that there are no reads between EarlierAccess and the StartDef by
|
|
|
|
// checking all uses starting at EarlierAccess and walking until we see
|
|
|
|
// StartDef.
|
|
|
|
// 3. For each found CurrentDef, check that:
|
|
|
|
// 1. There are no barrier instructions between CurrentDef and StartDef (like
|
|
|
|
// throws or stores with ordering constraints).
|
|
|
|
// 2. StartDef is executed whenever CurrentDef is executed.
|
|
|
|
// 3. StartDef completely overwrites CurrentDef.
|
|
|
|
// 4. Erase CurrentDef from the function and MemorySSA.
|
2007-07-11 02:46:18 +02:00
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
#include "llvm/Transforms/Scalar/DeadStoreElimination.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/ADT/APInt.h"
|
Allow DeadStoreElimination to track combinations of partial later wrties
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. In the map forms an interval covering the the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
#include "llvm/ADT/DenseMap.h"
|
2020-01-03 15:13:55 +01:00
|
|
|
#include "llvm/ADT/MapVector.h"
|
2020-03-20 08:51:29 +01:00
|
|
|
#include "llvm/ADT/PostOrderIterator.h"
|
2012-12-03 17:50:05 +01:00
|
|
|
#include "llvm/ADT/SetVector.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/ADT/SmallPtrSet.h"
|
|
|
|
#include "llvm/ADT/SmallVector.h"
|
2012-12-03 17:50:05 +01:00
|
|
|
#include "llvm/ADT/Statistic.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/ADT/StringRef.h"
|
2007-07-12 01:19:17 +02:00
|
|
|
#include "llvm/Analysis/AliasAnalysis.h"
|
2011-10-22 23:59:35 +02:00
|
|
|
#include "llvm/Analysis/CaptureTracking.h"
|
[PM/AA] Rebuild LLVM's alias analysis infrastructure in a way compatible
with the new pass manager, and no longer relying on analysis groups.
This builds essentially a ground-up new AA infrastructure stack for
LLVM. The core ideas are the same that are used throughout the new pass
manager: type erased polymorphism and direct composition. The design is
as follows:
- FunctionAAResults is a type-erasing alias analysis results aggregation
interface to walk a single query across a range of results from
different alias analyses. Currently this is function-specific as we
always assume that aliasing queries are *within* a function.
- AAResultBase is a CRTP utility providing stub implementations of
various parts of the alias analysis result concept, notably in several
cases in terms of other more general parts of the interface. This can
be used to implement only a narrow part of the interface rather than
the entire interface. This isn't really ideal, this logic should be
hoisted into FunctionAAResults as currently it will cause
a significant amount of redundant work, but it faithfully models the
behavior of the prior infrastructure.
- All the alias analysis passes are ported to be wrapper passes for the
legacy PM and new-style analysis passes for the new PM with a shared
result object. In some cases (most notably CFL), this is an extremely
naive approach that we should revisit when we can specialize for the
new pass manager.
- BasicAA has been restructured to reflect that it is much more
fundamentally a function analysis because it uses dominator trees and
loop info that need to be constructed for each function.
All of the references to getting alias analysis results have been
updated to use the new aggregation interface. All the preservation and
other pass management code has been updated accordingly.
The way the FunctionAAResultsWrapperPass works is to detect the
available alias analyses when run, and add them to the results object.
This means that we should be able to continue to respect when various
passes are added to the pipeline, for example adding CFL or adding TBAA
passes should just cause their results to be available and to get folded
into this. The exception to this rule is BasicAA which really needs to
be a function pass due to using dominator trees and loop info. As
a consequence, the FunctionAAResultsWrapperPass directly depends on
BasicAA and always includes it in the aggregation.
This has significant implications for preserving analyses. Generally,
most passes shouldn't bother preserving FunctionAAResultsWrapperPass
because rebuilding the results just updates the set of known AA passes.
The exception to this rule are LoopPass instances which need to preserve
all the function analyses that the loop pass manager will end up
needing. This means preserving both BasicAAWrapperPass and the
aggregating FunctionAAResultsWrapperPass.
Now, when preserving an alias analysis, you do so by directly preserving
that analysis. This is only necessary for non-immutable-pass-provided
alias analyses though, and there are only three of interest: BasicAA,
GlobalsAA (formerly GlobalsModRef), and SCEVAA. Usually BasicAA is
preserved when needed because it (like DominatorTree and LoopInfo) is
marked as a CFG-only pass. I've expanded GlobalsAA into the preserved
set everywhere we previously were preserving all of AliasAnalysis, and
I've added SCEVAA in the intersection of that with where we preserve
SCEV itself.
One significant challenge to all of this is that the CGSCC passes were
actually using the alias analysis implementations by taking advantage of
a pretty amazing set of loop holes in the old pass manager's analysis
management code which allowed analysis groups to slide through in many
cases. Moving away from analysis groups makes this problem much more
obvious. To fix it, I've leveraged the flexibility the design of the new
PM components provides to just directly construct the relevant alias
analyses for the relevant functions in the IPO passes that need them.
This is a bit hacky, but should go away with the new pass manager, and
is already in many ways cleaner than the prior state.
Another significant challenge is that various facilities of the old
alias analysis infrastructure just don't fit any more. The most
significant of these is the alias analysis 'counter' pass. That pass
relied on the ability to snoop on AA queries at different points in the
analysis group chain. Instead, I'm planning to build printing
functionality directly into the aggregation layer. I've not included
that in this patch merely to keep it smaller.
Note that all of this needs a nearly complete rewrite of the AA
documentation. I'm planning to do that, but I'd like to make sure the
new design settles, and to flesh out a bit more of what it looks like in
the new pass manager first.
Differential Revision: http://reviews.llvm.org/D12080
llvm-svn: 247167
2015-09-09 19:55:00 +02:00
|
|
|
#include "llvm/Analysis/GlobalsModRef.h"
|
2021-06-20 18:03:30 +02:00
|
|
|
#include "llvm/Analysis/LoopInfo.h"
|
2009-10-27 21:05:49 +01:00
|
|
|
#include "llvm/Analysis/MemoryBuiltins.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/Analysis/MemoryLocation.h"
|
2020-02-11 19:27:41 +01:00
|
|
|
#include "llvm/Analysis/MemorySSA.h"
|
|
|
|
#include "llvm/Analysis/MemorySSAUpdater.h"
|
2021-06-20 18:03:30 +02:00
|
|
|
#include "llvm/Analysis/MustExecute.h"
|
2020-02-11 19:27:41 +01:00
|
|
|
#include "llvm/Analysis/PostDominators.h"
|
2015-03-23 20:32:43 +01:00
|
|
|
#include "llvm/Analysis/TargetLibraryInfo.h"
|
2010-12-01 00:05:20 +01:00
|
|
|
#include "llvm/Analysis/ValueTracking.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/IR/Argument.h"
|
|
|
|
#include "llvm/IR/BasicBlock.h"
|
|
|
|
#include "llvm/IR/Constant.h"
|
2013-01-02 12:36:10 +01:00
|
|
|
#include "llvm/IR/Constants.h"
|
|
|
|
#include "llvm/IR/DataLayout.h"
|
2014-01-13 10:26:24 +01:00
|
|
|
#include "llvm/IR/Dominators.h"
|
2013-01-02 12:36:10 +01:00
|
|
|
#include "llvm/IR/Function.h"
|
2020-04-25 16:02:02 +02:00
|
|
|
#include "llvm/IR/InstIterator.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/IR/InstrTypes.h"
|
|
|
|
#include "llvm/IR/Instruction.h"
|
2013-01-02 12:36:10 +01:00
|
|
|
#include "llvm/IR/Instructions.h"
|
|
|
|
#include "llvm/IR/IntrinsicInst.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/IR/Intrinsics.h"
|
2017-09-26 15:54:28 +02:00
|
|
|
#include "llvm/IR/LLVMContext.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/IR/Module.h"
|
|
|
|
#include "llvm/IR/PassManager.h"
|
2020-07-08 09:42:55 +02:00
|
|
|
#include "llvm/IR/PatternMatch.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/IR/Value.h"
|
Sink all InitializePasses.h includes
This file lists every pass in LLVM, and is included by Pass.h, which is
very popular. Every time we add, remove, or rename a pass in LLVM, it
caused lots of recompilation.
I found this fact by looking at this table, which is sorted by the
number of times a file was changed over the last 100,000 git commits
multiplied by the number of object files that depend on it in the
current checkout:
recompiles touches affected_files header
342380 95 3604 llvm/include/llvm/ADT/STLExtras.h
314730 234 1345 llvm/include/llvm/InitializePasses.h
307036 118 2602 llvm/include/llvm/ADT/APInt.h
213049 59 3611 llvm/include/llvm/Support/MathExtras.h
170422 47 3626 llvm/include/llvm/Support/Compiler.h
162225 45 3605 llvm/include/llvm/ADT/Optional.h
158319 63 2513 llvm/include/llvm/ADT/Triple.h
140322 39 3598 llvm/include/llvm/ADT/StringRef.h
137647 59 2333 llvm/include/llvm/Support/Error.h
131619 73 1803 llvm/include/llvm/Support/FileSystem.h
Before this change, touching InitializePasses.h would cause 1345 files
to recompile. After this change, touching it only causes 550 compiles in
an incremental rebuild.
Reviewers: bkramer, asbirlea, bollu, jdoerfert
Differential Revision: https://reviews.llvm.org/D70211
2019-11-13 22:15:01 +01:00
|
|
|
#include "llvm/InitializePasses.h"
|
2012-12-03 17:50:05 +01:00
|
|
|
#include "llvm/Pass.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/Support/Casting.h"
|
Allow DeadStoreElimination to track combinations of partial later wrties
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. In the map forms an interval covering the the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
#include "llvm/Support/CommandLine.h"
|
2012-12-03 17:50:05 +01:00
|
|
|
#include "llvm/Support/Debug.h"
|
2020-02-21 17:55:18 +01:00
|
|
|
#include "llvm/Support/DebugCounter.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include "llvm/Support/ErrorHandling.h"
|
|
|
|
#include "llvm/Support/MathExtras.h"
|
2015-03-23 20:32:43 +01:00
|
|
|
#include "llvm/Support/raw_ostream.h"
|
2016-05-17 23:38:13 +02:00
|
|
|
#include "llvm/Transforms/Scalar.h"
|
2020-04-14 11:56:56 +02:00
|
|
|
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
|
2020-04-25 16:02:02 +02:00
|
|
|
#include "llvm/Transforms/Utils/Local.h"
|
2017-10-13 23:17:07 +02:00
|
|
|
#include <algorithm>
|
|
|
|
#include <cassert>
|
|
|
|
#include <cstddef>
|
2018-03-21 23:34:23 +01:00
|
|
|
#include <cstdint>
|
2017-10-13 23:17:07 +02:00
|
|
|
#include <iterator>
|
Allow DeadStoreElimination to track combinations of partial later wrties
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. In the map forms an interval covering the the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
#include <map>
|
2017-10-13 23:17:07 +02:00
|
|
|
#include <utility>
|
|
|
|
|
2007-07-11 02:46:18 +02:00
|
|
|
using namespace llvm;
|
2020-07-08 09:42:55 +02:00
|
|
|
using namespace PatternMatch;
|
2007-07-11 02:46:18 +02:00
|
|
|
|
2014-04-22 04:55:47 +02:00
|
|
|
#define DEBUG_TYPE "dse"
|
|
|
|
|
2020-04-25 16:02:02 +02:00
|
|
|
STATISTIC(NumRemainingStores, "Number of stores remaining after DSE");
|
2015-08-13 17:36:11 +02:00
|
|
|
STATISTIC(NumRedundantStores, "Number of redundant stores deleted");
|
2007-07-11 02:46:18 +02:00
|
|
|
STATISTIC(NumFastStores, "Number of stores deleted");
|
2018-08-17 20:40:41 +02:00
|
|
|
STATISTIC(NumFastOther, "Number of other instrs removed");
|
Allow DeadStoreElimination to track combinations of partial later wrties
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. In the map forms an interval covering the the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
STATISTIC(NumCompletePartials, "Number of stores dead by later partials");
|
2017-09-26 15:54:28 +02:00
|
|
|
STATISTIC(NumModifiedStores, "Number of stores modified");
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, 15738 times out of 20322 we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
STATISTIC(NumCFGChecks, "Number of stores modified");
|
|
|
|
STATISTIC(NumCFGTries, "Number of stores modified");
|
|
|
|
STATISTIC(NumCFGSuccess, "Number of stores modified");
|
2020-08-28 11:31:30 +02:00
|
|
|
STATISTIC(NumGetDomMemoryDefPassed,
|
|
|
|
"Number of times a valid candidate is returned from getDomMemoryDef");
|
2020-08-25 09:43:32 +02:00
|
|
|
STATISTIC(NumDomMemDefChecks,
|
|
|
|
"Number iterations check for reads in getDomMemoryDef");
|
Allow DeadStoreElimination to track combinations of partial later wrties
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. In the map forms an interval covering the the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
|
2020-02-21 17:55:18 +01:00
|
|
|
DEBUG_COUNTER(MemorySSACounter, "dse-memoryssa",
|
|
|
|
"Controls which MemoryDefs are eliminated.");
|
|
|
|
|
Allow DeadStoreElimination to track combinations of partial later wrties
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. In the map forms an interval covering the the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
// When enabled, DSE tracks combinations of partial later writes via an
// interval map, so a large store can be removed once a series of smaller
// later stores covers it completely.
static cl::opt<bool>
EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking",
                               cl::init(true), cl::Hidden,
                               cl::desc("Enable partial-overwrite tracking in DSE"));
|
2007-07-11 02:46:18 +02:00
|
|
|
|
2017-09-26 15:54:28 +02:00
|
|
|
// When enabled, DSE may merge an earlier store that is only partially
// overwritten into the later store instead of leaving both.
static cl::opt<bool>
EnablePartialStoreMerging("enable-dse-partial-store-merging",
                          cl::init(true), cl::Hidden,
                          cl::desc("Enable partial store merging in DSE"));
|
|
|
|
|
2020-02-11 19:27:41 +01:00
|
|
|
// Upper bound on memory instructions scanned per killing def.
// NOTE: the help text previously claimed "default = 100" while cl::init was
// 150; the text is updated here to match the actual default.
static cl::opt<unsigned>
    MemorySSAScanLimit("dse-memoryssa-scanlimit", cl::init(150), cl::Hidden,
                       cl::desc("The number of memory instructions to scan for "
                                "dead store elimination (default = 150)"));
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
// Upper bound on steps taken while walking up the defining-access chain when
// searching for MemoryDefs that the killing def may eliminate.
static cl::opt<unsigned> MemorySSAUpwardsStepLimit(
    "dse-memoryssa-walklimit", cl::init(90), cl::Hidden,
    cl::desc("The maximum number of steps while walking upwards to find "
             "MemoryDefs that may be killed (default = 90)"));
|
2020-02-11 19:27:41 +01:00
|
|
|
|
2020-08-28 11:31:30 +02:00
|
|
|
// Caps how many partially-overlapping candidate stores are considered for a
// single killing MemoryDef. ("number candidates" typo fixed: "number of".)
static cl::opt<unsigned> MemorySSAPartialStoreLimit(
    "dse-memoryssa-partial-store-limit", cl::init(5), cl::Hidden,
    cl::desc("The maximum number of candidates that only partially overwrite "
             "the killing MemoryDef to consider"
             " (default = 5)"));
|
|
|
|
|
2020-02-11 19:27:41 +01:00
|
|
|
// Caps the number of MemoryDefs per basic block considered as elimination
// candidates. (Grammar fix in help text: "to eliminated" -> "to eliminate".)
static cl::opt<unsigned> MemorySSADefsPerBlockLimit(
    "dse-memoryssa-defs-per-block-limit", cl::init(5000), cl::Hidden,
    cl::desc("The number of MemoryDefs we consider as candidates to eliminate "
             "other stores per basic block (default = 5000)"));
|
|
|
|
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
// Cost charged per walk step inside the killing def's own block. The adjacent
// string literals previously concatenated to "MemoryDef(default = 1)" with no
// separating space; fixed here.
static cl::opt<unsigned> MemorySSASameBBStepCost(
    "dse-memoryssa-samebb-cost", cl::init(1), cl::Hidden,
    cl::desc(
        "The cost of a step in the same basic block as the killing MemoryDef "
        "(default = 1)"));
|
|
|
|
|
|
|
|
// Cost charged per walk step outside the killing def's block; higher than the
// same-block cost to bound cross-block exploration. The adjacent string
// literals previously concatenated to "MemoryDef(default = 5)" without a
// space; fixed here.
static cl::opt<unsigned>
    MemorySSAOtherBBStepCost("dse-memoryssa-otherbb-cost", cl::init(5),
                             cl::Hidden,
                             cl::desc("The cost of a step in a different basic "
                                      "block than the killing MemoryDef "
                                      "(default = 5)"));
|
|
|
|
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, 15738 times out of 20322 we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
// Bounds the backwards-CFG traversal used to prove that every path from an
// access to a function exit passes through a killing block.
static cl::opt<unsigned> MemorySSAPathCheckLimit(
    "dse-memoryssa-path-check-limit", cl::init(50), cl::Hidden,
    cl::desc("The maximum number of blocks to check when trying to prove that "
             "all paths to an exit go through a killing block (default = 50)"));
|
|
|
|
|
2010-11-30 22:58:14 +01:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Helper functions
|
|
|
|
//===----------------------------------------------------------------------===//
|
2017-10-13 23:17:07 +02:00
|
|
|
using OverlapIntervalsTy = std::map<int64_t, int64_t>;
|
|
|
|
using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>;
|
2010-11-30 22:58:14 +01:00
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
/// Does this instruction write some memory? This only returns true for things
|
|
|
|
/// that we can analyze with other helpers below.
|
2018-01-21 02:44:33 +01:00
|
|
|
static bool hasAnalyzableMemoryWrite(Instruction *I,
|
|
|
|
const TargetLibraryInfo &TLI) {
|
2009-11-10 07:46:40 +01:00
|
|
|
if (isa<StoreInst>(I))
|
|
|
|
return true;
|
|
|
|
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
|
|
|
|
switch (II->getIntrinsicID()) {
|
2009-12-02 07:35:55 +01:00
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
case Intrinsic::memset:
|
|
|
|
case Intrinsic::memmove:
|
|
|
|
case Intrinsic::memcpy:
|
2020-09-10 14:09:25 +02:00
|
|
|
case Intrinsic::memcpy_inline:
|
2018-04-23 21:06:49 +02:00
|
|
|
case Intrinsic::memcpy_element_unordered_atomic:
|
|
|
|
case Intrinsic::memmove_element_unordered_atomic:
|
|
|
|
case Intrinsic::memset_element_unordered_atomic:
|
2009-12-02 07:35:55 +01:00
|
|
|
case Intrinsic::init_trampoline:
|
|
|
|
case Intrinsic::lifetime_end:
|
2020-09-02 21:06:58 +02:00
|
|
|
case Intrinsic::masked_store:
|
2009-12-02 07:35:55 +01:00
|
|
|
return true;
|
2009-11-10 07:46:40 +01:00
|
|
|
}
|
|
|
|
}
|
2020-04-23 18:15:04 +02:00
|
|
|
if (auto *CB = dyn_cast<CallBase>(I)) {
|
2020-07-02 11:27:09 +02:00
|
|
|
LibFunc LF;
|
|
|
|
if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) {
|
|
|
|
switch (LF) {
|
|
|
|
case LibFunc_strcpy:
|
|
|
|
case LibFunc_strncpy:
|
|
|
|
case LibFunc_strcat:
|
|
|
|
case LibFunc_strncat:
|
|
|
|
return true;
|
|
|
|
default:
|
|
|
|
return false;
|
2020-01-11 12:57:29 +01:00
|
|
|
}
|
2012-09-25 00:09:10 +02:00
|
|
|
}
|
|
|
|
}
|
2009-11-10 07:46:40 +01:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
/// Return a Location stored to by the specified instruction. If isRemovable
|
|
|
|
/// returns true, this function and getLocForRead completely describe the memory
|
|
|
|
/// operations for this instruction.
|
2020-09-02 21:06:58 +02:00
|
|
|
static MemoryLocation getLocForWrite(Instruction *Inst,
|
|
|
|
const TargetLibraryInfo &TLI) {
|
2010-11-30 08:23:21 +01:00
|
|
|
if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
|
2015-06-04 04:03:15 +02:00
|
|
|
return MemoryLocation::get(SI);
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2020-12-16 21:34:47 +01:00
|
|
|
// memcpy/memmove/memset.
|
|
|
|
if (auto *MI = dyn_cast<AnyMemIntrinsic>(Inst))
|
|
|
|
return MemoryLocation::getForDest(MI);
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2018-01-21 03:10:54 +01:00
|
|
|
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
|
|
|
|
switch (II->getIntrinsicID()) {
|
|
|
|
default:
|
|
|
|
return MemoryLocation(); // Unhandled intrinsic.
|
|
|
|
case Intrinsic::init_trampoline:
|
2020-11-17 20:11:09 +01:00
|
|
|
return MemoryLocation::getAfter(II->getArgOperand(0));
|
2020-09-02 21:06:58 +02:00
|
|
|
case Intrinsic::masked_store:
|
|
|
|
return MemoryLocation::getForArgument(II, 1, TLI);
|
2018-01-21 03:10:54 +01:00
|
|
|
case Intrinsic::lifetime_end: {
|
|
|
|
uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
|
|
|
|
return MemoryLocation(II->getArgOperand(1), Len);
|
|
|
|
}
|
|
|
|
}
|
2010-11-30 08:23:21 +01:00
|
|
|
}
|
2020-04-23 18:15:04 +02:00
|
|
|
if (auto *CB = dyn_cast<CallBase>(Inst))
|
2018-01-21 03:10:54 +01:00
|
|
|
// All the supported TLI functions so far happen to have dest as their
|
|
|
|
// first argument.
|
2020-11-17 20:11:09 +01:00
|
|
|
return MemoryLocation::getAfter(CB->getArgOperand(0));
|
2018-01-21 03:10:54 +01:00
|
|
|
return MemoryLocation();
|
2010-11-30 08:23:21 +01:00
|
|
|
}
|
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
/// If the value of this instruction and the memory it writes to is unused, may
|
|
|
|
/// we delete this instruction?
|
2010-11-30 06:30:45 +01:00
|
|
|
static bool isRemovable(Instruction *I) {
|
2011-08-18 00:22:24 +02:00
|
|
|
// Don't remove volatile/atomic stores.
|
2009-11-10 07:46:40 +01:00
|
|
|
if (StoreInst *SI = dyn_cast<StoreInst>(I))
|
2011-08-18 00:22:24 +02:00
|
|
|
return SI->isUnordered();
|
2011-09-06 20:14:09 +02:00
|
|
|
|
2012-09-25 00:09:10 +02:00
|
|
|
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
|
|
|
|
switch (II->getIntrinsicID()) {
|
2018-01-21 02:44:33 +01:00
|
|
|
default: llvm_unreachable("doesn't pass 'hasAnalyzableMemoryWrite' predicate");
|
2012-09-25 00:09:10 +02:00
|
|
|
case Intrinsic::lifetime_end:
|
|
|
|
// Never remove dead lifetime_end's, e.g. because it is followed by a
|
|
|
|
// free.
|
|
|
|
return false;
|
|
|
|
case Intrinsic::init_trampoline:
|
|
|
|
// Always safe to remove init_trampoline.
|
|
|
|
return true;
|
|
|
|
case Intrinsic::memset:
|
|
|
|
case Intrinsic::memmove:
|
|
|
|
case Intrinsic::memcpy:
|
2020-09-10 14:09:25 +02:00
|
|
|
case Intrinsic::memcpy_inline:
|
2012-09-25 00:09:10 +02:00
|
|
|
// Don't remove volatile memory intrinsics.
|
|
|
|
return !cast<MemIntrinsic>(II)->isVolatile();
|
2018-04-23 21:06:49 +02:00
|
|
|
case Intrinsic::memcpy_element_unordered_atomic:
|
|
|
|
case Intrinsic::memmove_element_unordered_atomic:
|
|
|
|
case Intrinsic::memset_element_unordered_atomic:
|
2020-09-02 21:06:58 +02:00
|
|
|
case Intrinsic::masked_store:
|
2018-04-23 21:06:49 +02:00
|
|
|
return true;
|
2012-09-25 00:09:10 +02:00
|
|
|
}
|
2010-11-30 20:12:10 +01:00
|
|
|
}
|
2012-09-25 00:09:10 +02:00
|
|
|
|
2018-01-21 02:44:33 +01:00
|
|
|
// note: only get here for calls with analyzable writes - i.e. libcalls
|
2020-04-23 18:15:04 +02:00
|
|
|
if (auto *CB = dyn_cast<CallBase>(I))
|
|
|
|
return CB->use_empty();
|
2012-09-25 00:09:10 +02:00
|
|
|
|
|
|
|
return false;
|
2009-11-10 07:46:40 +01:00
|
|
|
}
|
|
|
|
|
2016-04-22 21:51:29 +02:00
|
|
|
/// Returns true if the end of this instruction can be safely shortened in
|
2011-11-10 00:07:35 +01:00
|
|
|
/// length.
|
2016-04-22 21:51:29 +02:00
|
|
|
static bool isShortenableAtTheEnd(Instruction *I) {
|
2011-11-10 00:07:35 +01:00
|
|
|
// Don't shorten stores for now
|
|
|
|
if (isa<StoreInst>(I))
|
|
|
|
return false;
|
2012-07-24 12:51:42 +02:00
|
|
|
|
2012-09-25 00:09:10 +02:00
|
|
|
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
|
|
|
|
switch (II->getIntrinsicID()) {
|
|
|
|
default: return false;
|
|
|
|
case Intrinsic::memset:
|
|
|
|
case Intrinsic::memcpy:
|
2018-05-10 17:12:49 +02:00
|
|
|
case Intrinsic::memcpy_element_unordered_atomic:
|
|
|
|
case Intrinsic::memset_element_unordered_atomic:
|
2012-09-25 00:09:10 +02:00
|
|
|
// Do shorten memory intrinsics.
|
2016-04-22 21:51:29 +02:00
|
|
|
// FIXME: Add memmove if it's also safe to transform.
|
2012-09-25 00:09:10 +02:00
|
|
|
return true;
|
|
|
|
}
|
2011-11-10 00:07:35 +01:00
|
|
|
}
|
2012-09-25 00:09:10 +02:00
|
|
|
|
|
|
|
// Don't shorten libcalls calls for now.
|
|
|
|
|
|
|
|
return false;
|
2011-11-10 00:07:35 +01:00
|
|
|
}
|
|
|
|
|
2016-04-22 21:51:29 +02:00
|
|
|
/// Returns true if the beginning of this instruction can be safely shortened
|
|
|
|
/// in length.
|
|
|
|
static bool isShortenableAtTheBeginning(Instruction *I) {
|
|
|
|
// FIXME: Handle only memset for now. Supporting memcpy/memmove should be
|
|
|
|
// easily done by offsetting the source address.
|
2018-05-10 17:12:49 +02:00
|
|
|
return isa<AnyMemSetInst>(I);
|
2016-04-22 21:51:29 +02:00
|
|
|
}
|
|
|
|
|
2015-03-10 03:37:25 +01:00
|
|
|
/// Return the size in bytes of the object \p V points to, or
/// MemoryLocation::UnknownSize if it cannot be determined.
static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
                               const TargetLibraryInfo &TLI,
                               const Function *F) {
  uint64_t Size;
  ObjectSizeOpts Opts;
  // Under -fno-delete-null-pointer-checks (null-pointer-is-valid), address 0
  // may hold a real object, so a null pointer's size must be unknown.
  Opts.NullIsUnknownSize = NullPointerIsDefined(F);

  if (getObjectSize(V, Size, DL, &TLI, Opts))
    return Size;
  return MemoryLocation::UnknownSize;
}
|
2010-11-30 20:34:42 +01:00
|
|
|
|
2011-11-10 00:07:35 +01:00
|
|
|
namespace {

/// Result of classifying how a later store's location relates to an earlier
/// store's location.
enum OverwriteResult {
  OW_Begin,    ///< Later overwrites the beginning of the earlier location.
  OW_Complete, ///< Later completely overwrites the earlier location.
  OW_End,      ///< Later overwrites the end of the earlier location.
  OW_PartialEarlierWithFullLater, ///< Earlier (big) store partially
                                  ///< overwritten by a later store that
                                  ///< stays within its bounds.
  OW_MaybePartial, ///< May overlap partially; needs closer analysis.
  OW_Unknown       ///< Nothing could be determined.
};

} // end anonymous namespace
|
2011-11-10 00:07:35 +01:00
|
|
|
|
2020-09-09 22:23:34 +02:00
|
|
|
/// Check if two instruction are masked stores that completely
|
|
|
|
/// overwrite one another. More specifically, \p Later has to
|
|
|
|
/// overwrite \p Earlier.
|
|
|
|
static OverwriteResult isMaskedStoreOverwrite(const Instruction *Later,
|
|
|
|
const Instruction *Earlier,
|
2021-03-03 20:38:50 +01:00
|
|
|
BatchAAResults &AA) {
|
2020-09-09 22:23:34 +02:00
|
|
|
const auto *IIL = dyn_cast<IntrinsicInst>(Later);
|
|
|
|
const auto *IIE = dyn_cast<IntrinsicInst>(Earlier);
|
|
|
|
if (IIL == nullptr || IIE == nullptr)
|
|
|
|
return OW_Unknown;
|
|
|
|
if (IIL->getIntrinsicID() != Intrinsic::masked_store ||
|
|
|
|
IIE->getIntrinsicID() != Intrinsic::masked_store)
|
|
|
|
return OW_Unknown;
|
|
|
|
// Pointers.
|
|
|
|
Value *LP = IIL->getArgOperand(1)->stripPointerCasts();
|
|
|
|
Value *EP = IIE->getArgOperand(1)->stripPointerCasts();
|
|
|
|
if (LP != EP && !AA.isMustAlias(LP, EP))
|
|
|
|
return OW_Unknown;
|
|
|
|
// Masks.
|
|
|
|
// TODO: check that Later's mask is a superset of the Earlier's mask.
|
|
|
|
if (IIL->getArgOperand(3) != IIE->getArgOperand(3))
|
|
|
|
return OW_Unknown;
|
|
|
|
return OW_Complete;
|
|
|
|
}
|
|
|
|
|
2020-08-21 10:13:59 +02:00
|
|
|
/// Return 'OW_Complete' if a store to the 'Later' location completely
|
|
|
|
/// overwrites a store to the 'Earlier' location, 'OW_End' if the end of the
|
|
|
|
/// 'Earlier' location is completely overwritten by 'Later', 'OW_Begin' if the
|
|
|
|
/// beginning of the 'Earlier' location is overwritten by 'Later'.
|
|
|
|
/// 'OW_PartialEarlierWithFullLater' means that an earlier (big) store was
|
|
|
|
/// overwritten by a latter (smaller) store which doesn't write outside the big
|
|
|
|
/// store's memory locations. Returns 'OW_Unknown' if nothing can be determined.
|
|
|
|
/// NOTE: This function must only be called if both \p Later and \p Earlier
|
|
|
|
/// write to the same underlying object with valid \p EarlierOff and \p
|
|
|
|
/// LaterOff.
|
|
|
|
static OverwriteResult isPartialOverwrite(const MemoryLocation &Later,
|
|
|
|
const MemoryLocation &Earlier,
|
|
|
|
int64_t EarlierOff, int64_t LaterOff,
|
|
|
|
Instruction *DepWrite,
|
|
|
|
InstOverlapIntervalsTy &IOL) {
|
|
|
|
const uint64_t LaterSize = Later.Size.getValue();
|
|
|
|
const uint64_t EarlierSize = Earlier.Size.getValue();
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
// We may now overlap, although the overlap is not complete. There might also
|
|
|
|
// be other incomplete overlaps, and together, they might cover the complete
|
|
|
|
// earlier write.
|
|
|
|
// Note: The correctness of this logic depends on the fact that this function
|
|
|
|
// is not even called providing DepWrite when there are any intervening reads.
|
|
|
|
if (EnablePartialOverwriteTracking &&
|
2018-10-09 04:14:33 +02:00
|
|
|
LaterOff < int64_t(EarlierOff + EarlierSize) &&
|
|
|
|
int64_t(LaterOff + LaterSize) >= EarlierOff) {
|
Allow DeadStoreElimination to track combinations of partial later wrties
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. In the map forms an interval covering the the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
|
|
|
|
// Insert our part of the overlap into the map.
|
|
|
|
auto &IM = IOL[DepWrite];
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff
|
2018-10-09 04:14:33 +02:00
|
|
|
<< ", " << int64_t(EarlierOff + EarlierSize)
|
2018-05-14 14:53:11 +02:00
|
|
|
<< ") Later [" << LaterOff << ", "
|
2018-10-09 04:14:33 +02:00
|
|
|
<< int64_t(LaterOff + LaterSize) << ")\n");
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
|
|
|
|
// Make sure that we only insert non-overlapping intervals and combine
|
|
|
|
// adjacent intervals. The intervals are stored in the map with the ending
|
|
|
|
// offset as the key (in the half-open sense) and the starting offset as
|
|
|
|
// the value.
|
2018-10-09 04:14:33 +02:00
|
|
|
int64_t LaterIntStart = LaterOff, LaterIntEnd = LaterOff + LaterSize;
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
|
|
|
|
// Find any intervals ending at, or after, LaterIntStart which start
|
|
|
|
// before LaterIntEnd.
|
|
|
|
auto ILI = IM.lower_bound(LaterIntStart);
|
2016-06-30 17:32:20 +02:00
|
|
|
if (ILI != IM.end() && ILI->second <= LaterIntEnd) {
|
|
|
|
// This existing interval is overlapped with the current store somewhere
|
|
|
|
// in [LaterIntStart, LaterIntEnd]. Merge them by erasing the existing
|
|
|
|
// intervals and adjusting our start and end.
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
LaterIntStart = std::min(LaterIntStart, ILI->second);
|
|
|
|
LaterIntEnd = std::max(LaterIntEnd, ILI->first);
|
|
|
|
ILI = IM.erase(ILI);
|
|
|
|
|
2016-06-30 17:32:20 +02:00
|
|
|
// Continue erasing and adjusting our end in case other previous
|
|
|
|
// intervals are also overlapped with the current store.
|
|
|
|
//
|
|
|
|
// |--- ealier 1 ---| |--- ealier 2 ---|
|
|
|
|
// |------- later---------|
|
|
|
|
//
|
|
|
|
while (ILI != IM.end() && ILI->second <= LaterIntEnd) {
|
|
|
|
assert(ILI->second > LaterIntStart && "Unexpected interval");
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
LaterIntEnd = std::max(LaterIntEnd, ILI->first);
|
2016-06-30 17:32:20 +02:00
|
|
|
ILI = IM.erase(ILI);
|
|
|
|
}
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
IM[LaterIntEnd] = LaterIntStart;
|
|
|
|
|
|
|
|
ILI = IM.begin();
|
|
|
|
if (ILI->second <= EarlierOff &&
|
2018-10-09 04:14:33 +02:00
|
|
|
ILI->first >= int64_t(EarlierOff + EarlierSize)) {
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier ["
|
|
|
|
<< EarlierOff << ", "
|
2018-10-09 04:14:33 +02:00
|
|
|
<< int64_t(EarlierOff + EarlierSize)
|
2018-05-14 14:53:11 +02:00
|
|
|
<< ") Composite Later [" << ILI->second << ", "
|
|
|
|
<< ILI->first << ")\n");
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
++NumCompletePartials;
|
2017-03-29 16:42:27 +02:00
|
|
|
return OW_Complete;
|
Allow DeadStoreElimination to track combinations of partial later writes
DeadStoreElimination can currently remove a small store rendered unnecessary by
a later larger one, but could not remove a larger store rendered unnecessary by
a series of later smaller ones. This adds that capability.
It works by keeping a map, which is used as an effective interval map, for each
store later overwritten only partially, and filling in that interval map as
more such stores are discovered. No additional walking or aliasing queries are
used. If the map forms an interval covering the entire earlier store, then
it is dead and can be removed. The map is used as an interval map by storing a
mapping between the ending offset and the beginning offset of each interval.
I discovered this problem when investigating a performance issue with code like
this on PowerPC:
#include <complex>
using namespace std;
complex<float> bar(complex<float> C);
complex<float> foo(complex<float> C) {
return bar(C)*C;
}
which produces this:
define void @_Z4testSt7complexIfE(%"struct.std::complex"* noalias nocapture sret %agg.result, i64 %c.coerce) {
entry:
%ref.tmp = alloca i64, align 8
%tmpcast = bitcast i64* %ref.tmp to %"struct.std::complex"*
%c.sroa.0.0.extract.shift = lshr i64 %c.coerce, 32
%c.sroa.0.0.extract.trunc = trunc i64 %c.sroa.0.0.extract.shift to i32
%0 = bitcast i32 %c.sroa.0.0.extract.trunc to float
%c.sroa.2.0.extract.trunc = trunc i64 %c.coerce to i32
%1 = bitcast i32 %c.sroa.2.0.extract.trunc to float
call void @_Z3barSt7complexIfE(%"struct.std::complex"* nonnull sret %tmpcast, i64 %c.coerce)
%2 = bitcast %"struct.std::complex"* %agg.result to i64*
%3 = load i64, i64* %ref.tmp, align 8
store i64 %3, i64* %2, align 4 ; <--- ***** THIS SHOULD NOT BE HERE ****
%_M_value.realp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 0
%4 = lshr i64 %3, 32
%5 = trunc i64 %4 to i32
%6 = bitcast i32 %5 to float
%_M_value.imagp.i.i = getelementptr inbounds %"struct.std::complex", %"struct.std::complex"* %agg.result, i64 0, i32 0, i32 1
%7 = trunc i64 %3 to i32
%8 = bitcast i32 %7 to float
%mul_ad.i.i = fmul fast float %6, %1
%mul_bc.i.i = fmul fast float %8, %0
%mul_i.i.i = fadd fast float %mul_ad.i.i, %mul_bc.i.i
%mul_ac.i.i = fmul fast float %6, %0
%mul_bd.i.i = fmul fast float %8, %1
%mul_r.i.i = fsub fast float %mul_ac.i.i, %mul_bd.i.i
store float %mul_r.i.i, float* %_M_value.realp.i.i, align 4
store float %mul_i.i.i, float* %_M_value.imagp.i.i, align 4
ret void
}
the problem here is not just that the i64 store is unnecessary, but also that
it blocks further backend optimizations of the other uses of that i64 value in
the backend.
In the future, we might want to add a special case for handling smaller
accesses (e.g. using a bit vector) if the map mechanism turns out to be
noticeably inefficient. A sorted vector is also a possible replacement for the
map for small numbers of tracked intervals.
Differential Revision: http://reviews.llvm.org/D18586
llvm-svn: 273559
2016-06-23 15:46:39 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-09-26 15:54:28 +02:00
|
|
|
// Check for an earlier store which writes to all the memory locations that
|
|
|
|
// the later store writes to.
|
|
|
|
if (EnablePartialStoreMerging && LaterOff >= EarlierOff &&
|
2018-10-09 04:14:33 +02:00
|
|
|
int64_t(EarlierOff + EarlierSize) > LaterOff &&
|
|
|
|
uint64_t(LaterOff - EarlierOff) + LaterSize <= EarlierSize) {
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG(dbgs() << "DSE: Partial overwrite an earlier load ["
|
|
|
|
<< EarlierOff << ", "
|
2018-10-09 04:14:33 +02:00
|
|
|
<< int64_t(EarlierOff + EarlierSize)
|
2018-05-14 14:53:11 +02:00
|
|
|
<< ") by a later store [" << LaterOff << ", "
|
2018-10-09 04:14:33 +02:00
|
|
|
<< int64_t(LaterOff + LaterSize) << ")\n");
|
2017-09-26 15:54:28 +02:00
|
|
|
// TODO: Maybe come up with a better name?
|
|
|
|
return OW_PartialEarlierWithFullLater;
|
|
|
|
}
|
|
|
|
|
2016-04-22 21:51:29 +02:00
|
|
|
// Another interesting case is if the later store overwrites the end of the
|
|
|
|
// earlier store.
|
2011-11-10 00:07:35 +01:00
|
|
|
//
|
|
|
|
// |--earlier--|
|
|
|
|
// |-- later --|
|
|
|
|
//
|
|
|
|
// In this case we may want to trim the size of earlier to avoid generating
|
|
|
|
// writes to addresses which will definitely be overwritten later
|
2016-07-22 20:27:24 +02:00
|
|
|
if (!EnablePartialOverwriteTracking &&
|
2018-10-09 04:14:33 +02:00
|
|
|
(LaterOff > EarlierOff && LaterOff < int64_t(EarlierOff + EarlierSize) &&
|
|
|
|
int64_t(LaterOff + LaterSize) >= int64_t(EarlierOff + EarlierSize)))
|
2017-03-29 16:42:27 +02:00
|
|
|
return OW_End;
|
2011-03-26 09:02:59 +01:00
|
|
|
|
2016-04-22 21:51:29 +02:00
|
|
|
// Finally, we also need to check if the later store overwrites the beginning
|
|
|
|
// of the earlier store.
|
|
|
|
//
|
|
|
|
// |--earlier--|
|
|
|
|
// |-- later --|
|
|
|
|
//
|
|
|
|
// In this case we may want to move the destination address and trim the size
|
|
|
|
// of earlier to avoid generating writes to addresses which will definitely
|
|
|
|
// be overwritten later.
|
2016-07-22 20:27:24 +02:00
|
|
|
if (!EnablePartialOverwriteTracking &&
|
2018-10-09 04:14:33 +02:00
|
|
|
(LaterOff <= EarlierOff && int64_t(LaterOff + LaterSize) > EarlierOff)) {
|
|
|
|
assert(int64_t(LaterOff + LaterSize) < int64_t(EarlierOff + EarlierSize) &&
|
2017-03-29 16:42:27 +02:00
|
|
|
"Expect to be handled as OW_Complete");
|
|
|
|
return OW_Begin;
|
2016-04-22 21:51:29 +02:00
|
|
|
}
|
2011-03-26 09:02:59 +01:00
|
|
|
// Otherwise, they don't completely overlap.
|
2017-03-29 16:42:27 +02:00
|
|
|
return OW_Unknown;
|
2009-11-05 00:20:12 +01:00
|
|
|
}
|
|
|
|
|
2015-09-23 13:38:44 +02:00
|
|
|
/// Returns true if the memory which is accessed by the second instruction is
/// not modified between the first and the second instruction.
///
/// \param FirstI  The earlier instruction; must dominate \p SecondI.
/// \param SecondI The later instruction whose accessed location is checked.
/// \param AA      Batched alias-analysis results used for mod/ref queries.
/// \param DL      DataLayout, needed for PHI translation of the address.
/// \param DT      Dominator tree, passed through to PHI translation.
///
/// Precondition: Second instruction must be dominated by the first
/// instruction.
static bool
memoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI,
                           BatchAAResults &AA, const DataLayout &DL,
                           DominatorTree *DT) {
  // Do a backwards scan through the CFG from SecondI to FirstI. Look for
  // instructions which can modify the memory location accessed by SecondI.
  //
  // While doing the walk keep track of the address to check. It might be
  // different in different basic blocks due to PHI translation.
  using BlockAddressPair = std::pair<BasicBlock *, PHITransAddr>;
  SmallVector<BlockAddressPair, 16> WorkList;
  // Keep track of the address we visited each block with. Bail out if we
  // visit a block with different addresses.
  DenseMap<BasicBlock *, Value *> Visited;

  // Start the scan of FirstI's block just *after* FirstI itself; only the
  // instructions strictly between the two matter.
  BasicBlock::iterator FirstBBI(FirstI);
  ++FirstBBI;
  BasicBlock::iterator SecondBBI(SecondI);
  BasicBlock *FirstBB = FirstI->getParent();
  BasicBlock *SecondB = SecondI->getParent();
  MemoryLocation MemLoc = MemoryLocation::get(SecondI);
  auto *MemLocPtr = const_cast<Value *>(MemLoc.Ptr);

  // Start checking the SecondBB.
  WorkList.push_back(
      std::make_pair(SecondBB, PHITransAddr(MemLocPtr, DL, nullptr)));
  bool isFirstBlock = true;

  // Check all blocks going backward until we reach the FirstBB.
  while (!WorkList.empty()) {
    BlockAddressPair Current = WorkList.pop_back_val();
    BasicBlock *B = Current.first;
    PHITransAddr &Addr = Current.second;
    Value *Ptr = Addr.getAddr();

    // Ignore instructions before FirstI if this is the FirstBB.
    BasicBlock::iterator BI = (B == FirstBB ? FirstBBI : B->begin());

    BasicBlock::iterator EI;
    if (isFirstBlock) {
      // Ignore instructions after SecondI if this is the first visit of SecondBB.
      assert(B == SecondBB && "first block is not the store block");
      EI = SecondBBI;
      isFirstBlock = false;
    } else {
      // It's not SecondBB or (in case of a loop) the second visit of SecondBB.
      // In this case we also have to look at instructions after SecondI.
      EI = B->end();
    }
    // Any instruction in [BI, EI) that may write memory and whose effect on
    // the (PHI-translated) location is a mod means the location may change.
    for (; BI != EI; ++BI) {
      Instruction *I = &*BI;
      if (I->mayWriteToMemory() && I != SecondI)
        if (isModSet(AA.getModRefInfo(I, MemLoc.getWithNewPtr(Ptr))))
          return false;
    }
    if (B != FirstBB) {
      assert(B != &FirstBB->getParent()->getEntryBlock() &&
          "Should not hit the entry block because SI must be dominated by LI");
      for (BasicBlock *Pred : predecessors(B)) {
        PHITransAddr PredAddr = Addr;
        if (PredAddr.NeedsPHITranslationFromBlock(B)) {
          // Give up if the address cannot be translated into the predecessor.
          if (!PredAddr.IsPotentiallyPHITranslatable())
            return false;
          if (PredAddr.PHITranslateValue(B, Pred, DT, false))
            return false;
        }
        Value *TranslatedPtr = PredAddr.getAddr();
        auto Inserted = Visited.insert(std::make_pair(Pred, TranslatedPtr));
        if (!Inserted.second) {
          // We already visited this block before. If it was with a different
          // address - bail out!
          if (TranslatedPtr != Inserted.first->second)
            return false;
          // ... otherwise just skip it.
          continue;
        }
        WorkList.push_back(std::make_pair(Pred, PredAddr));
      }
    }
  }
  // No potentially-clobbering write was found on any path between the two
  // instructions.
  return true;
}
|
|
|
|
|
2020-12-04 10:57:35 +01:00
|
|
|
// Try to shorten the overwritten memory intrinsic \p EarlierWrite (a
// memset/memcpy-like AnyMemIntrinsic). The region overwritten by a later
// store starts at \p LaterStart and spans \p LaterSize bytes. If
// \p IsOverwriteEnd is true, the tail of the earlier write is trimmed;
// otherwise its head is trimmed (which also requires adjusting the
// destination pointer). On success the intrinsic is rewritten in place and
// \p EarlierStart / \p EarlierSize are updated to describe the remaining
// region; returns true iff the intrinsic was shortened.
static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierStart,
                         uint64_t &EarlierSize, int64_t LaterStart,
                         uint64_t LaterSize, bool IsOverwriteEnd) {
  auto *EarlierIntrinsic = cast<AnyMemIntrinsic>(EarlierWrite);
  Align PrefAlign = EarlierIntrinsic->getDestAlign().valueOrOne();

  // We assume that memset/memcpy operates in chunks of the "largest" native
  // type size and aligned on the same value. That means optimal start and size
  // of memset/memcpy should be modulo of preferred alignment of that type. That
  // is it there is no any sense in trying to reduce store size any further
  // since any "extra" stores comes for free anyway.
  // On the other hand, maximum alignment we can achieve is limited by alignment
  // of initial store.

  // TODO: Limit maximum alignment by preferred (or abi?) alignment of the
  // "largest" native type.
  // Note: What is the proper way to get that value?
  // Should TargetTransformInfo::getRegisterBitWidth be used or anything else?
  // PrefAlign = std::min(DL.getPrefTypeAlign(LargestType), PrefAlign);

  int64_t ToRemoveStart = 0;
  uint64_t ToRemoveSize = 0;
  // Compute start and size of the region to remove. Make sure 'PrefAlign' is
  // maintained on the remaining store.
  if (IsOverwriteEnd) {
    // Calculate required adjustment for 'LaterStart' in order to keep
    // remaining store size aligned on 'PrefAlign'.
    uint64_t Off =
        offsetToAlignment(uint64_t(LaterStart - EarlierStart), PrefAlign);
    ToRemoveStart = LaterStart + Off;
    // Bail out if the aligned trim point would remove the whole store.
    if (EarlierSize <= uint64_t(ToRemoveStart - EarlierStart))
      return false;
    ToRemoveSize = EarlierSize - uint64_t(ToRemoveStart - EarlierStart);
  } else {
    ToRemoveStart = EarlierStart;
    assert(LaterSize >= uint64_t(EarlierStart - LaterStart) &&
           "Not overlapping accesses?");
    ToRemoveSize = LaterSize - uint64_t(EarlierStart - LaterStart);
    // Calculate required adjustment for 'ToRemoveSize' in order to keep
    // start of the remaining store aligned on 'PrefAlign'.
    uint64_t Off = offsetToAlignment(ToRemoveSize, PrefAlign);
    if (Off != 0) {
      // Bail out if rounding the removed region down to the alignment
      // boundary leaves nothing to remove.
      if (ToRemoveSize <= (PrefAlign.value() - Off))
        return false;
      ToRemoveSize -= PrefAlign.value() - Off;
    }
    assert(isAligned(PrefAlign, ToRemoveSize) &&
           "Should preserve selected alignment");
  }

  assert(ToRemoveSize > 0 && "Shouldn't reach here if nothing to remove");
  assert(EarlierSize > ToRemoveSize && "Can't remove more than original size");

  uint64_t NewSize = EarlierSize - ToRemoveSize;
  if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(EarlierWrite)) {
    // When shortening an atomic memory intrinsic, the newly shortened
    // length must remain an integer multiple of the element size.
    const uint32_t ElementSize = AMI->getElementSizeInBytes();
    if (0 != NewSize % ElementSize)
      return false;
  }

  LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
                    << (IsOverwriteEnd ? "END" : "BEGIN") << ": "
                    << *EarlierWrite << "\n KILLER [" << ToRemoveStart << ", "
                    << int64_t(ToRemoveStart + ToRemoveSize) << ")\n");

  // Rewrite the intrinsic's length to the shortened size.
  Value *EarlierWriteLength = EarlierIntrinsic->getLength();
  Value *TrimmedLength =
      ConstantInt::get(EarlierWriteLength->getType(), NewSize);
  EarlierIntrinsic->setLength(TrimmedLength);
  EarlierIntrinsic->setDestAlignment(PrefAlign);

  if (!IsOverwriteEnd) {
    // When trimming the head, the destination pointer must be advanced by
    // ToRemoveSize bytes (via an i8 GEP, with pointer casts as needed).
    Value *OrigDest = EarlierIntrinsic->getRawDest();
    Type *Int8PtrTy =
        Type::getInt8PtrTy(EarlierIntrinsic->getContext(),
                           OrigDest->getType()->getPointerAddressSpace());
    Value *Dest = OrigDest;
    if (OrigDest->getType() != Int8PtrTy)
      Dest = CastInst::CreatePointerCast(OrigDest, Int8PtrTy, "", EarlierWrite);
    Value *Indices[1] = {
        ConstantInt::get(EarlierWriteLength->getType(), ToRemoveSize)};
    Instruction *NewDestGEP = GetElementPtrInst::CreateInBounds(
        Type::getInt8Ty(EarlierIntrinsic->getContext()),
        Dest, Indices, "", EarlierWrite);
    NewDestGEP->setDebugLoc(EarlierIntrinsic->getDebugLoc());
    if (NewDestGEP->getType() != OrigDest->getType())
      NewDestGEP = CastInst::CreatePointerCast(NewDestGEP, OrigDest->getType(),
                                               "", EarlierWrite);
    EarlierIntrinsic->setDest(NewDestGEP);
  }

  // Finally update start and size of earlier access.
  if (!IsOverwriteEnd)
    EarlierStart += ToRemoveSize;
  EarlierSize = NewSize;

  return true;
}
|
|
|
|
|
|
|
|
static bool tryToShortenEnd(Instruction *EarlierWrite,
|
|
|
|
OverlapIntervalsTy &IntervalMap,
|
2020-12-04 10:53:17 +01:00
|
|
|
int64_t &EarlierStart, uint64_t &EarlierSize) {
|
2016-07-22 20:27:24 +02:00
|
|
|
if (IntervalMap.empty() || !isShortenableAtTheEnd(EarlierWrite))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
OverlapIntervalsTy::iterator OII = --IntervalMap.end();
|
|
|
|
int64_t LaterStart = OII->second;
|
2020-12-04 10:53:17 +01:00
|
|
|
uint64_t LaterSize = OII->first - LaterStart;
|
2016-07-22 20:27:24 +02:00
|
|
|
|
2020-12-04 10:53:17 +01:00
|
|
|
assert(OII->first - LaterStart >= 0 && "Size expected to be positive");
|
|
|
|
|
|
|
|
if (LaterStart > EarlierStart &&
|
|
|
|
// Note: "LaterStart - EarlierStart" is known to be positive due to
|
|
|
|
// preceding check.
|
|
|
|
(uint64_t)(LaterStart - EarlierStart) < EarlierSize &&
|
|
|
|
// Note: "EarlierSize - (uint64_t)(LaterStart - EarlierStart)" is known to
|
|
|
|
// be non negative due to preceding checks.
|
|
|
|
LaterSize >= EarlierSize - (uint64_t)(LaterStart - EarlierStart)) {
|
2016-07-22 20:27:24 +02:00
|
|
|
if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
|
|
|
|
LaterSize, true)) {
|
|
|
|
IntervalMap.erase(OII);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool tryToShortenBegin(Instruction *EarlierWrite,
|
|
|
|
OverlapIntervalsTy &IntervalMap,
|
2020-12-04 10:53:17 +01:00
|
|
|
int64_t &EarlierStart, uint64_t &EarlierSize) {
|
2016-07-22 20:27:24 +02:00
|
|
|
if (IntervalMap.empty() || !isShortenableAtTheBeginning(EarlierWrite))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
OverlapIntervalsTy::iterator OII = IntervalMap.begin();
|
|
|
|
int64_t LaterStart = OII->second;
|
2020-12-04 10:53:17 +01:00
|
|
|
uint64_t LaterSize = OII->first - LaterStart;
|
|
|
|
|
|
|
|
assert(OII->first - LaterStart >= 0 && "Size expected to be positive");
|
2016-07-22 20:27:24 +02:00
|
|
|
|
2020-12-04 10:53:17 +01:00
|
|
|
if (LaterStart <= EarlierStart &&
|
|
|
|
// Note: "EarlierStart - LaterStart" is known to be non negative due to
|
|
|
|
// preceding check.
|
|
|
|
LaterSize > (uint64_t)(EarlierStart - LaterStart)) {
|
|
|
|
// Note: "LaterSize - (uint64_t)(EarlierStart - LaterStart)" is known to be
|
|
|
|
// positive due to preceding checks.
|
|
|
|
assert(LaterSize - (uint64_t)(EarlierStart - LaterStart) < EarlierSize &&
|
2017-03-29 16:42:27 +02:00
|
|
|
"Should have been handled as OW_Complete");
|
2016-07-22 20:27:24 +02:00
|
|
|
if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
|
|
|
|
LaterSize, false)) {
|
|
|
|
IntervalMap.erase(OII);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2020-08-19 20:32:49 +02:00
|
|
|
static bool removePartiallyOverlappedStores(const DataLayout &DL,
|
2020-09-02 21:06:58 +02:00
|
|
|
InstOverlapIntervalsTy &IOL,
|
|
|
|
const TargetLibraryInfo &TLI) {
|
2016-07-22 20:27:24 +02:00
|
|
|
bool Changed = false;
|
|
|
|
for (auto OI : IOL) {
|
|
|
|
Instruction *EarlierWrite = OI.first;
|
2020-09-02 21:06:58 +02:00
|
|
|
MemoryLocation Loc = getLocForWrite(EarlierWrite, TLI);
|
2016-07-22 20:27:24 +02:00
|
|
|
assert(isRemovable(EarlierWrite) && "Expect only removable instruction");
|
|
|
|
|
|
|
|
const Value *Ptr = Loc.Ptr->stripPointerCasts();
|
|
|
|
int64_t EarlierStart = 0;
|
2020-12-04 10:53:17 +01:00
|
|
|
uint64_t EarlierSize = Loc.Size.getValue();
|
2016-07-22 20:27:24 +02:00
|
|
|
GetPointerBaseWithConstantOffset(Ptr, EarlierStart, DL);
|
|
|
|
OverlapIntervalsTy &IntervalMap = OI.second;
|
2016-07-27 19:25:20 +02:00
|
|
|
Changed |=
|
2016-07-22 20:27:24 +02:00
|
|
|
tryToShortenEnd(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
|
|
|
|
if (IntervalMap.empty())
|
|
|
|
continue;
|
|
|
|
Changed |=
|
|
|
|
tryToShortenBegin(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
|
|
|
|
}
|
|
|
|
return Changed;
|
|
|
|
}
|
|
|
|
|
2020-08-22 09:36:35 +02:00
|
|
|
// Try to merge two partially overlapping constant stores: if \p Later is
// fully contained in \p Earlier, both store ConstantInt values of full store
// size, and memory is not modified between them, return a new constant for
// the earlier store that has the later store's bits spliced in at the right
// bit offset. Returns nullptr if the stores cannot be merged.
static Constant *tryToMergePartialOverlappingStores(
    StoreInst *Earlier, StoreInst *Later, int64_t InstWriteOffset,
    int64_t DepWriteOffset, const DataLayout &DL, BatchAAResults &AA,
    DominatorTree *DT) {

  if (Earlier && isa<ConstantInt>(Earlier->getValueOperand()) &&
      DL.typeSizeEqualsStoreSize(Earlier->getValueOperand()->getType()) &&
      Later && isa<ConstantInt>(Later->getValueOperand()) &&
      DL.typeSizeEqualsStoreSize(Later->getValueOperand()->getType()) &&
      memoryIsNotModifiedBetween(Earlier, Later, AA, DL, DT)) {
    // If the store we find is:
    //   a) partially overwritten by the store to 'Loc'
    //   b) the later store is fully contained in the earlier one and
    //   c) they both have a constant value
    //   d) none of the two stores need padding
    // Merge the two stores, replacing the earlier store's value with a
    // merge of both values.
    // TODO: Deal with other constant types (vectors, etc), and probably
    // some mem intrinsics (if needed)

    APInt EarlierValue =
        cast<ConstantInt>(Earlier->getValueOperand())->getValue();
    APInt LaterValue = cast<ConstantInt>(Later->getValueOperand())->getValue();
    unsigned LaterBits = LaterValue.getBitWidth();
    // Containment (b) guarantees the earlier value is strictly wider.
    assert(EarlierValue.getBitWidth() > LaterValue.getBitWidth());
    LaterValue = LaterValue.zext(EarlierValue.getBitWidth());

    // Offset of the smaller store inside the larger store
    unsigned BitOffsetDiff = (InstWriteOffset - DepWriteOffset) * 8;
    // On big-endian targets the byte offset counts from the most significant
    // end, so the shift amount is mirrored.
    unsigned LShiftAmount = DL.isBigEndian() ? EarlierValue.getBitWidth() -
                                                   BitOffsetDiff - LaterBits
                                             : BitOffsetDiff;
    APInt Mask = APInt::getBitsSet(EarlierValue.getBitWidth(), LShiftAmount,
                                   LShiftAmount + LaterBits);
    // Clear the bits we'll be replacing, then OR with the smaller
    // store, shifted appropriately.
    APInt Merged = (EarlierValue & ~Mask) | (LaterValue << LShiftAmount);
    LLVM_DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *Earlier
                      << "\n Later: " << *Later
                      << "\n Merged Value: " << Merged << '\n');
    return ConstantInt::get(Earlier->getValueOperand()->getType(), Merged);
  }
  return nullptr;
}
|
|
|
|
|
2020-02-11 19:27:41 +01:00
|
|
|
namespace {
|
2020-10-18 16:19:05 +02:00
|
|
|
// Returns true if \p I is an intrisnic that does not read or write memory.
|
|
|
|
bool isNoopIntrinsic(Instruction *I) {
|
|
|
|
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
|
2020-02-11 19:27:41 +01:00
|
|
|
switch (II->getIntrinsicID()) {
|
|
|
|
case Intrinsic::lifetime_start:
|
|
|
|
case Intrinsic::lifetime_end:
|
|
|
|
case Intrinsic::invariant_end:
|
|
|
|
case Intrinsic::launder_invariant_group:
|
|
|
|
case Intrinsic::assume:
|
|
|
|
return true;
|
|
|
|
case Intrinsic::dbg_addr:
|
|
|
|
case Intrinsic::dbg_declare:
|
|
|
|
case Intrinsic::dbg_label:
|
|
|
|
case Intrinsic::dbg_value:
|
|
|
|
llvm_unreachable("Intrinsic should not be modeled in MemorySSA");
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check if we can ignore \p D for DSE.
|
2021-07-23 11:51:59 +02:00
|
|
|
bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) {
|
2020-02-11 19:27:41 +01:00
|
|
|
Instruction *DI = D->getMemoryInst();
|
|
|
|
// Calls that only access inaccessible memory cannot read or write any memory
|
|
|
|
// locations we consider for elimination.
|
2020-04-23 18:15:04 +02:00
|
|
|
if (auto *CB = dyn_cast<CallBase>(DI))
|
2021-07-23 11:51:59 +02:00
|
|
|
if (CB->onlyAccessesInaccessibleMemory())
|
2020-02-11 19:27:41 +01:00
|
|
|
return true;
|
2021-07-23 11:51:59 +02:00
|
|
|
|
2020-02-11 19:27:41 +01:00
|
|
|
// We can eliminate stores to locations not visible to the caller across
|
|
|
|
// throwing instructions.
|
|
|
|
if (DI->mayThrow() && !DefVisibleToCaller)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
// We can remove the dead stores, irrespective of the fence and its ordering
|
|
|
|
// (release/acquire/seq_cst). Fences only constraints the ordering of
|
|
|
|
// already visible stores, it does not make a store visible to other
|
|
|
|
// threads. So, skipping over a fence does not change a store from being
|
|
|
|
// dead.
|
|
|
|
if (isa<FenceInst>(DI))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
// Skip intrinsics that do not really read or modify memory.
|
2021-07-23 11:51:59 +02:00
|
|
|
if (isNoopIntrinsic(D->getMemoryInst()))
|
2020-02-11 19:27:41 +01:00
|
|
|
return true;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
struct DSEState {
|
|
|
|
Function &F;
|
|
|
|
AliasAnalysis &AA;
|
2020-08-22 09:36:35 +02:00
|
|
|
|
|
|
|
/// The single BatchAA instance that is used to cache AA queries. It will
|
|
|
|
/// not be invalidated over the whole run. This is safe, because:
|
|
|
|
/// 1. Only memory writes are removed, so the alias cache for memory
|
|
|
|
/// locations remains valid.
|
|
|
|
/// 2. No new instructions are added (only instructions removed), so cached
|
|
|
|
/// information for a deleted value cannot be accessed by a re-used new
|
|
|
|
/// value pointer.
|
|
|
|
BatchAAResults BatchAA;
|
|
|
|
|
2020-02-11 19:27:41 +01:00
|
|
|
MemorySSA &MSSA;
|
|
|
|
DominatorTree &DT;
|
|
|
|
PostDominatorTree &PDT;
|
|
|
|
const TargetLibraryInfo &TLI;
|
2020-08-23 16:55:48 +02:00
|
|
|
const DataLayout &DL;
|
2021-06-20 18:03:30 +02:00
|
|
|
const LoopInfo &LI;
|
|
|
|
|
|
|
|
// Whether the function contains any irreducible control flow, useful for
|
|
|
|
// being accurately able to detect loops.
|
|
|
|
bool ContainsIrreducibleLoops;
|
2020-02-11 19:27:41 +01:00
|
|
|
|
|
|
|
// All MemoryDefs that potentially could kill other MemDefs.
|
|
|
|
SmallVector<MemoryDef *, 64> MemDefs;
|
|
|
|
// Any that should be skipped as they are already deleted
|
|
|
|
SmallPtrSet<MemoryAccess *, 4> SkipStores;
|
2020-04-15 11:42:58 +02:00
|
|
|
// Keep track of all of the objects that are invisible to the caller before
|
|
|
|
// the function returns.
|
2020-08-21 19:17:00 +02:00
|
|
|
// SmallPtrSet<const Value *, 16> InvisibleToCallerBeforeRet;
|
|
|
|
DenseMap<const Value *, bool> InvisibleToCallerBeforeRet;
|
2020-04-15 11:42:58 +02:00
|
|
|
// Keep track of all of the objects that are invisible to the caller after
|
|
|
|
// the function returns.
|
2020-08-21 19:17:00 +02:00
|
|
|
DenseMap<const Value *, bool> InvisibleToCallerAfterRet;
|
2020-02-11 19:27:41 +01:00
|
|
|
// Keep track of blocks with throwing instructions not modeled in MemorySSA.
|
|
|
|
SmallPtrSet<BasicBlock *, 16> ThrowingBlocks;
|
2020-03-20 08:51:29 +01:00
|
|
|
// Post-order numbers for each basic block. Used to figure out if memory
|
|
|
|
// accesses are executed before another access.
|
|
|
|
DenseMap<BasicBlock *, unsigned> PostOrderNumbers;
|
2020-02-11 19:27:41 +01:00
|
|
|
|
2020-02-23 16:39:15 +01:00
|
|
|
/// Keep track of instructions (partly) overlapping with killing MemoryDefs per
|
|
|
|
/// basic block.
|
|
|
|
DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs;
|
|
|
|
|
2020-02-11 19:27:41 +01:00
|
|
|
// Construct the per-run DSE state. BatchAA wraps the passed AliasAnalysis to
// cache alias queries for the whole run (see the BatchAA member comment).
DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
         PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
         const LoopInfo &LI)
    : F(F), AA(AA), BatchAA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI),
      DL(F.getParent()->getDataLayout()), LI(LI) {}
|
2020-02-11 19:27:41 +01:00
|
|
|
|
|
|
|
// Factory: build a fully initialized DSEState for \p F, collecting the
// candidate MemoryDefs, throwing blocks, invisible-object seeds for
// byval/inalloca arguments, and the irreducible-control-flow flag.
static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
                    DominatorTree &DT, PostDominatorTree &PDT,
                    const TargetLibraryInfo &TLI, const LoopInfo &LI) {
  DSEState State(F, AA, MSSA, DT, PDT, TLI, LI);
  // Collect blocks with throwing instructions not modeled in MemorySSA and
  // alloc-like objects.
  unsigned PO = 0;
  for (BasicBlock *BB : post_order(&F)) {
    // Number blocks in post-order; used later to decide whether one access
    // is executed before another.
    State.PostOrderNumbers[BB] = PO++;
    for (Instruction &I : *BB) {
      MemoryAccess *MA = MSSA.getMemoryAccess(&I);
      // Throwing instructions without a MemorySSA access are tracked
      // separately, as MemorySSA walks would not see them.
      if (I.mayThrow() && !MA)
        State.ThrowingBlocks.insert(I.getParent());

      // Only MemoryDefs with a known write location (or memory-terminating
      // instructions) are candidates, capped by MemorySSADefsPerBlockLimit.
      auto *MD = dyn_cast_or_null<MemoryDef>(MA);
      if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit &&
          (State.getLocForWriteEx(&I) || State.isMemTerminatorInst(&I)))
        State.MemDefs.push_back(MD);
    }
  }

  // Treat byval or inalloca arguments the same as Allocas, stores to them are
  // dead at the end of the function.
  for (Argument &AI : F.args())
    if (AI.hasPassPointeeByValueCopyAttr()) {
      // For byval, the caller doesn't know the address of the allocation.
      if (AI.hasByValAttr())
        State.InvisibleToCallerBeforeRet.insert({&AI, true});
      State.InvisibleToCallerAfterRet.insert({&AI, true});
    }

  // Collect whether there is any irreducible control flow in the function.
  State.ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI);

  return State;
}
|
|
|
|
|
2021-05-14 10:16:51 +02:00
|
|
|
/// Return 'OW_Complete' if a store to the 'Later' location (by \p LaterI
/// instruction) completely overwrites a store to the 'Earlier' location.
/// (by \p EarlierI instruction).
/// Return OW_MaybePartial if \p Later does not completely overwrite
/// \p Earlier, but they both write to the same underlying object. In that
/// case, use isPartialOverwrite to check if \p Later partially overwrites
/// \p Earlier. Returns 'OW_Unknown' if nothing can be determined.
/// On return, \p EarlierOff and \p LaterOff hold the constant offsets of the
/// two accesses from their common base pointer (when one was found).
OverwriteResult
isOverwrite(const Instruction *LaterI, const Instruction *EarlierI,
            const MemoryLocation &Later, const MemoryLocation &Earlier,
            int64_t &EarlierOff, int64_t &LaterOff) {
  // AliasAnalysis does not always account for loops. Limit overwrite checks
  // to dependencies for which we can guarantee they are independent of any
  // loops they are in.
  if (!isGuaranteedLoopIndependent(EarlierI, LaterI, Earlier))
    return OW_Unknown;

  // FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll
  // get imprecise values here, though (except for unknown sizes).
  if (!Later.Size.isPrecise() || !Earlier.Size.isPrecise()) {
    // In case no constant size is known, try to use the IR values for the
    // number of bytes written and check if they match.
    const auto *LaterMemI = dyn_cast<MemIntrinsic>(LaterI);
    const auto *EarlierMemI = dyn_cast<MemIntrinsic>(EarlierI);
    if (LaterMemI && EarlierMemI) {
      const Value *LaterV = LaterMemI->getLength();
      const Value *EarlierV = EarlierMemI->getLength();
      if (LaterV == EarlierV && BatchAA.isMustAlias(Earlier, Later))
        return OW_Complete;
    }

    // Masked stores have imprecise locations, but we can reason about them
    // to some extent.
    return isMaskedStoreOverwrite(LaterI, EarlierI, BatchAA);
  }

  const uint64_t LaterSize = Later.Size.getValue();
  const uint64_t EarlierSize = Earlier.Size.getValue();

  // Query the alias information
  AliasResult AAR = BatchAA.alias(Later, Earlier);

  // If the start pointers are the same, we just have to compare sizes to see if
  // the later store was larger than the earlier store.
  if (AAR == AliasResult::MustAlias) {
    // Make sure that the Later size is >= the Earlier size.
    if (LaterSize >= EarlierSize)
      return OW_Complete;
  }

  // If we hit a partial alias we may have a full overwrite
  if (AAR == AliasResult::PartialAlias && AAR.hasOffset()) {
    int32_t Off = AAR.getOffset();
    if (Off >= 0 && (uint64_t)Off + EarlierSize <= LaterSize)
      return OW_Complete;
  }

  // Check to see if the later store is to the entire object (either a global,
  // an alloca, or a byval/inalloca argument). If so, then it clearly
  // overwrites any other store to the same object.
  const Value *P1 = Earlier.Ptr->stripPointerCasts();
  const Value *P2 = Later.Ptr->stripPointerCasts();
  const Value *UO1 = getUnderlyingObject(P1), *UO2 = getUnderlyingObject(P2);

  // If we can't resolve the same pointers to the same object, then we can't
  // analyze them at all.
  if (UO1 != UO2)
    return OW_Unknown;

  // If the "Later" store is to a recognizable object, get its size.
  uint64_t ObjectSize = getPointerSize(UO2, DL, TLI, &F);
  if (ObjectSize != MemoryLocation::UnknownSize)
    if (ObjectSize == LaterSize && ObjectSize >= EarlierSize)
      return OW_Complete;

  // Okay, we have stores to two completely different pointers.  Try to
  // decompose the pointer into a "base + constant_offset" form.  If the base
  // pointers are equal, then we can reason about the two stores.
  EarlierOff = 0;
  LaterOff = 0;
  const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, DL);
  const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, DL);

  // If the base pointers still differ, we have two completely different stores.
  if (BP1 != BP2)
    return OW_Unknown;

  // The later access completely overlaps the earlier store if and only if
  // both start and end of the earlier one is "inside" the later one:
  //    |<->|--earlier--|<->|
  //    |-------later-------|
  // Accesses may overlap if and only if start of one of them is "inside"
  // another one:
  //    |<->|--earlier--|<----->|
  //    |-------later-------|
  //           OR
  //    |----- earlier -----|
  //    |<->|---later---|<----->|
  //
  // We have to be careful here as *Off is signed while *.Size is unsigned.

  // Check if the earlier access starts "not before" the later one.
  if (EarlierOff >= LaterOff) {
    // If the earlier access ends "not after" the later access then the earlier
    // one is completely overwritten by the later one.
    if (uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize)
      return OW_Complete;
    // If start of the earlier access is "before" end of the later access then
    // accesses overlap.
    else if ((uint64_t)(EarlierOff - LaterOff) < LaterSize)
      return OW_MaybePartial;
  }
  // If start of the later access is "before" end of the earlier access then
  // accesses overlap.
  else if ((uint64_t)(LaterOff - EarlierOff) < EarlierSize) {
    return OW_MaybePartial;
  }

  // Can reach here only if accesses are known not to overlap. There is no
  // dedicated code to indicate no overlap so signal "unknown".
  return OW_Unknown;
}
|
|
|
|
|
2020-08-21 19:17:00 +02:00
|
|
|
bool isInvisibleToCallerAfterRet(const Value *V) {
|
|
|
|
if (isa<AllocaInst>(V))
|
|
|
|
return true;
|
|
|
|
auto I = InvisibleToCallerAfterRet.insert({V, false});
|
|
|
|
if (I.second) {
|
|
|
|
if (!isInvisibleToCallerBeforeRet(V)) {
|
|
|
|
I.first->second = false;
|
|
|
|
} else {
|
|
|
|
auto *Inst = dyn_cast<Instruction>(V);
|
|
|
|
if (Inst && isAllocLikeFn(Inst, &TLI))
|
|
|
|
I.first->second = !PointerMayBeCaptured(V, true, false);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return I.first->second;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns true if no caller-visible access to \p V can happen before the
// function returns (e.g. an unescaped alloca or heap allocation). Results
// are memoized in InvisibleToCallerBeforeRet; new entries default to
// "visible" (false) and are only upgraded for non-escaping allocations.
bool isInvisibleToCallerBeforeRet(const Value *V) {
  if (isa<AllocaInst>(V))
    return true;
  auto I = InvisibleToCallerBeforeRet.insert({V, false});
  // Only compute the capture query on a cache miss (I.second == true).
  if (I.second) {
    auto *Inst = dyn_cast<Instruction>(V);
    if (Inst && isAllocLikeFn(Inst, &TLI))
      // NOTE: This could be made more precise by PointerMayBeCapturedBefore
      // with the killing MemoryDef. But we refrain from doing so for now to
      // limit compile-time and this does not cause any changes to the number
      // of stores removed on a large test set in practice.
      I.first->second = !PointerMayBeCaptured(V, false, true);
  }
  return I.first->second;
}
|
|
|
|
|
2020-02-11 19:27:41 +01:00
|
|
|
// Returns the memory location written by \p I, if it can be determined.
// Handles memory intrinsics, a small set of known library calls (string
// functions), and a few intrinsics with special argument conventions;
// returns None when the written location cannot be pinned down.
Optional<MemoryLocation> getLocForWriteEx(Instruction *I) const {
  if (!I->mayWriteToMemory())
    return None;

  // memset/memcpy/memmove (incl. atomic variants): destination operand.
  if (auto *MTI = dyn_cast<AnyMemIntrinsic>(I))
    return {MemoryLocation::getForDest(MTI)};

  if (auto *CB = dyn_cast<CallBase>(I)) {
    // If the functions may write to memory we do not know about, bail out.
    if (!CB->onlyAccessesArgMemory() &&
        !CB->onlyAccessesInaccessibleMemOrArgMem())
      return None;

    // Recognized string library functions write through their first
    // argument; the written extent is unknown, so use an "after" location.
    LibFunc LF;
    if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) {
      switch (LF) {
      case LibFunc_strcpy:
      case LibFunc_strncpy:
      case LibFunc_strcat:
      case LibFunc_strncat:
        return {MemoryLocation::getAfter(CB->getArgOperand(0))};
      default:
        break;
      }
    }
    switch (CB->getIntrinsicID()) {
    case Intrinsic::init_trampoline:
      return {MemoryLocation::getAfter(CB->getArgOperand(0))};
    case Intrinsic::masked_store:
      // The pointer operand of a masked store is argument 1.
      return {MemoryLocation::getForArgument(CB, 1, TLI)};
    default:
      break;
    }
    return None;
  }

  // Plain stores and other instructions with a single well-defined location.
  return MemoryLocation::getOrNone(I);
}
|
|
|
|
|
2020-09-09 22:23:34 +02:00
|
|
|
/// Returns true if \p UseInst completely overwrites \p DefLoc
/// (stored by \p DefInst).
bool isCompleteOverwrite(const MemoryLocation &DefLoc, Instruction *DefInst,
                         Instruction *UseInst) {
  // UseInst has a MemoryDef associated in MemorySSA. It's possible for a
  // MemoryDef to not write to memory, e.g. a volatile load is modeled as a
  // MemoryDef.
  if (!UseInst->mayWriteToMemory())
    return false;

  // Calls that only touch inaccessible memory cannot overwrite DefLoc.
  if (auto *CB = dyn_cast<CallBase>(UseInst))
    if (CB->onlyAccessesInaccessibleMemory())
      return false;

  // Offsets are outputs of isOverwrite and unused by this caller.
  // NOTE(review): the names suggest DepWriteOffset belongs to the dependee
  // and InstWriteOffset to UseInst, but here they are passed in the order
  // (EarlierOff, LaterOff) = (DepWriteOffset, InstWriteOffset) while
  // UseInst/DefInst are (LaterI, EarlierI) — confirm the pairing is
  // intentional; harmless here since the values are discarded.
  int64_t InstWriteOffset, DepWriteOffset;
  if (auto CC = getLocForWriteEx(UseInst))
    return isOverwrite(UseInst, DefInst, *CC, DefLoc, DepWriteOffset,
                       InstWriteOffset) == OW_Complete;
  return false;
}
|
|
|
|
|
2020-06-24 10:56:35 +02:00
|
|
|
/// Returns true if \p Def is not read before returning from the function.
|
|
|
|
bool isWriteAtEndOfFunction(MemoryDef *Def) {
|
|
|
|
LLVM_DEBUG(dbgs() << " Check if def " << *Def << " ("
|
|
|
|
<< *Def->getMemoryInst()
|
|
|
|
<< ") is at the end the function \n");
|
|
|
|
|
|
|
|
auto MaybeLoc = getLocForWriteEx(Def->getMemoryInst());
|
|
|
|
if (!MaybeLoc) {
|
|
|
|
LLVM_DEBUG(dbgs() << " ... could not get location for write.\n");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
SmallVector<MemoryAccess *, 4> WorkList;
|
|
|
|
SmallPtrSet<MemoryAccess *, 8> Visited;
|
|
|
|
auto PushMemUses = [&WorkList, &Visited](MemoryAccess *Acc) {
|
|
|
|
if (!Visited.insert(Acc).second)
|
|
|
|
return;
|
|
|
|
for (Use &U : Acc->uses())
|
|
|
|
WorkList.push_back(cast<MemoryAccess>(U.getUser()));
|
|
|
|
};
|
|
|
|
PushMemUses(Def);
|
|
|
|
for (unsigned I = 0; I < WorkList.size(); I++) {
|
|
|
|
if (WorkList.size() >= MemorySSAScanLimit) {
|
|
|
|
LLVM_DEBUG(dbgs() << " ... hit exploration limit.\n");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
MemoryAccess *UseAccess = WorkList[I];
|
2020-09-12 19:57:26 +02:00
|
|
|
// Simply adding the users of MemoryPhi to the worklist is not enough,
|
|
|
|
// because we might miss read clobbers in different iterations of a loop,
|
|
|
|
// for example.
|
|
|
|
// TODO: Add support for phi translation to handle the loop case.
|
|
|
|
if (isa<MemoryPhi>(UseAccess))
|
|
|
|
return false;
|
2020-06-24 10:56:35 +02:00
|
|
|
|
|
|
|
// TODO: Checking for aliasing is expensive. Consider reducing the amount
|
|
|
|
// of times this is called and/or caching it.
|
|
|
|
Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst();
|
|
|
|
if (isReadClobber(*MaybeLoc, UseInst)) {
|
|
|
|
LLVM_DEBUG(dbgs() << " ... hit read clobber " << *UseInst << ".\n");
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess))
|
|
|
|
PushMemUses(UseDef);
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2020-07-08 09:42:55 +02:00
|
|
|
/// If \p I is a memory terminator like llvm.lifetime.end or free, return a
|
|
|
|
/// pair with the MemoryLocation terminated by \p I and a boolean flag
|
|
|
|
/// indicating whether \p I is a free-like call.
|
|
|
|
Optional<std::pair<MemoryLocation, bool>>
|
|
|
|
getLocForTerminator(Instruction *I) const {
|
|
|
|
uint64_t Len;
|
|
|
|
Value *Ptr;
|
|
|
|
if (match(I, m_Intrinsic<Intrinsic::lifetime_end>(m_ConstantInt(Len),
|
|
|
|
m_Value(Ptr))))
|
|
|
|
return {std::make_pair(MemoryLocation(Ptr, Len), false)};
|
|
|
|
|
|
|
|
if (auto *CB = dyn_cast<CallBase>(I)) {
|
|
|
|
if (isFreeCall(I, &TLI))
|
2020-11-17 20:11:09 +01:00
|
|
|
return {std::make_pair(MemoryLocation::getAfter(CB->getArgOperand(0)),
|
2020-11-19 21:41:51 +01:00
|
|
|
true)};
|
2020-07-08 09:42:55 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Returns true if \p I is a memory terminator instruction like
|
|
|
|
/// llvm.lifetime.end or free.
|
|
|
|
bool isMemTerminatorInst(Instruction *I) const {
|
|
|
|
IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
|
|
|
|
return (II && II->getIntrinsicID() == Intrinsic::lifetime_end) ||
|
|
|
|
isFreeCall(I, &TLI);
|
|
|
|
}
|
|
|
|
|
2020-09-26 12:03:25 +02:00
|
|
|
/// Returns true if \p MaybeTerm is a memory terminator for \p Loc from
|
|
|
|
/// instruction \p AccessI.
|
2020-12-16 21:34:47 +01:00
|
|
|
bool isMemTerminator(const MemoryLocation &Loc, Instruction *AccessI,
|
2020-09-26 12:03:25 +02:00
|
|
|
Instruction *MaybeTerm) {
|
2020-07-08 09:42:55 +02:00
|
|
|
Optional<std::pair<MemoryLocation, bool>> MaybeTermLoc =
|
|
|
|
getLocForTerminator(MaybeTerm);
|
|
|
|
|
|
|
|
if (!MaybeTermLoc)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
// If the terminator is a free-like call, all accesses to the underlying
|
|
|
|
// object can be considered terminated.
|
2020-09-26 12:03:25 +02:00
|
|
|
if (getUnderlyingObject(Loc.Ptr) !=
|
|
|
|
getUnderlyingObject(MaybeTermLoc->first.Ptr))
|
|
|
|
return false;
|
|
|
|
|
2020-10-31 20:30:19 +01:00
|
|
|
auto TermLoc = MaybeTermLoc->first;
|
|
|
|
if (MaybeTermLoc->second) {
|
|
|
|
const Value *LocUO = getUnderlyingObject(Loc.Ptr);
|
|
|
|
return BatchAA.isMustAlias(TermLoc.Ptr, LocUO);
|
|
|
|
}
|
2020-09-26 12:03:25 +02:00
|
|
|
int64_t InstWriteOffset, DepWriteOffset;
|
2021-05-14 10:16:51 +02:00
|
|
|
return isOverwrite(MaybeTerm, AccessI, TermLoc, Loc, DepWriteOffset,
|
|
|
|
InstWriteOffset) == OW_Complete;
|
2020-07-08 09:42:55 +02:00
|
|
|
}
|
|
|
|
|
2020-06-24 10:56:35 +02:00
|
|
|
// Returns true if \p Use may read from \p DefLoc.
|
2020-12-16 21:34:47 +01:00
|
|
|
bool isReadClobber(const MemoryLocation &DefLoc, Instruction *UseInst) {
|
2020-10-18 16:19:05 +02:00
|
|
|
if (isNoopIntrinsic(UseInst))
|
|
|
|
return false;
|
|
|
|
|
2020-09-09 20:36:41 +02:00
|
|
|
// Monotonic or weaker atomic stores can be re-ordered and do not need to be
|
|
|
|
// treated as read clobber.
|
|
|
|
if (auto SI = dyn_cast<StoreInst>(UseInst))
|
|
|
|
return isStrongerThan(SI->getOrdering(), AtomicOrdering::Monotonic);
|
|
|
|
|
2020-02-11 19:27:41 +01:00
|
|
|
if (!UseInst->mayReadFromMemory())
|
|
|
|
return false;
|
|
|
|
|
2020-04-23 18:15:04 +02:00
|
|
|
if (auto *CB = dyn_cast<CallBase>(UseInst))
|
|
|
|
if (CB->onlyAccessesInaccessibleMemory())
|
2020-02-11 19:27:41 +01:00
|
|
|
return false;
|
|
|
|
|
2020-08-18 12:12:31 +02:00
|
|
|
// NOTE: For calls, the number of stores removed could be slightly improved
|
|
|
|
// by using AA.callCapturesBefore(UseInst, DefLoc, &DT), but that showed to
|
|
|
|
// be expensive compared to the benefits in practice. For now, avoid more
|
|
|
|
// expensive analysis to limit compile-time.
|
|
|
|
return isRefSet(BatchAA.getModRefInfo(UseInst, DefLoc));
|
2020-02-11 19:27:41 +01:00
|
|
|
}
|
|
|
|
|
2021-06-20 18:03:30 +02:00
|
|
|
/// Returns true if a dependency between \p Current and \p KillingDef is
|
|
|
|
/// guaranteed to be loop invariant for the loops that they are in. Either
|
|
|
|
/// because they are known to be in the same block, in the same loop level or
|
|
|
|
/// by guaranteeing that \p CurrentLoc only references a single MemoryLocation
|
|
|
|
/// during execution of the containing function.
|
|
|
|
bool isGuaranteedLoopIndependent(const Instruction *Current,
|
|
|
|
const Instruction *KillingDef,
|
|
|
|
const MemoryLocation &CurrentLoc) {
|
|
|
|
// If the dependency is within the same block or loop level (being careful
|
|
|
|
// of irreducible loops), we know that AA will return a valid result for the
|
|
|
|
// memory dependency. (Both at the function level, outside of any loop,
|
|
|
|
// would also be valid but we currently disable that to limit compile time).
|
|
|
|
if (Current->getParent() == KillingDef->getParent())
|
|
|
|
return true;
|
|
|
|
const Loop *CurrentLI = LI.getLoopFor(Current->getParent());
|
|
|
|
if (!ContainsIrreducibleLoops && CurrentLI &&
|
|
|
|
CurrentLI == LI.getLoopFor(KillingDef->getParent()))
|
|
|
|
return true;
|
|
|
|
// Otherwise check the memory location is invariant to any loops.
|
|
|
|
return isGuaranteedLoopInvariant(CurrentLoc.Ptr);
|
|
|
|
}
|
|
|
|
|
2021-06-08 22:23:08 +02:00
|
|
|
/// Returns true if \p Ptr is guaranteed to be loop invariant for any possible
|
|
|
|
/// loop. In particular, this guarantees that it only references a single
|
|
|
|
/// MemoryLocation during execution of the containing function.
|
2021-06-20 18:03:30 +02:00
|
|
|
bool isGuaranteedLoopInvariant(const Value *Ptr) {
|
|
|
|
auto IsGuaranteedLoopInvariantBase = [this](const Value *Ptr) {
|
2020-09-14 12:49:27 +02:00
|
|
|
Ptr = Ptr->stripPointerCasts();
|
|
|
|
if (auto *I = dyn_cast<Instruction>(Ptr)) {
|
|
|
|
if (isa<AllocaInst>(Ptr))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
if (isAllocLikeFn(I, &TLI))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
};
|
|
|
|
|
2021-06-08 22:23:08 +02:00
|
|
|
Ptr = Ptr->stripPointerCasts();
|
2021-02-23 11:22:53 +01:00
|
|
|
if (auto *I = dyn_cast<Instruction>(Ptr)) {
|
2021-05-15 12:38:27 +02:00
|
|
|
if (I->getParent()->isEntryBlock())
|
2021-02-23 11:22:53 +01:00
|
|
|
return true;
|
|
|
|
}
|
2020-09-14 12:49:27 +02:00
|
|
|
if (auto *GEP = dyn_cast<GEPOperator>(Ptr)) {
|
|
|
|
return IsGuaranteedLoopInvariantBase(GEP->getPointerOperand()) &&
|
|
|
|
GEP->hasAllConstantIndices();
|
|
|
|
}
|
|
|
|
return IsGuaranteedLoopInvariantBase(Ptr);
|
|
|
|
}
|
|
|
|
|
2020-08-28 11:31:30 +02:00
|
|
|
// Find a MemoryDef writing to \p DefLoc and dominating \p StartAccess, with
|
|
|
|
// no read access between them or on any other path to a function exit block
|
|
|
|
// if \p DefLoc is not accessible after the function returns. If there is no
|
|
|
|
// such MemoryDef, return None. The returned value may not (completely)
|
|
|
|
// overwrite \p DefLoc. Currently we bail out when we encounter an aliasing
|
|
|
|
// MemoryUse (read).
|
2020-08-25 09:43:32 +02:00
|
|
|
Optional<MemoryAccess *>
|
2020-08-28 11:31:30 +02:00
|
|
|
getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *StartAccess,
|
2020-12-16 21:34:47 +01:00
|
|
|
const MemoryLocation &DefLoc, const Value *DefUO,
|
[DSE] Use correct memory location for read clobber check
MSSA DSE starts at a killing store, finds an earlier store and
then checks that the earlier store is not read along any paths
(without being killed first). However, it uses the memory location
of the killing store for that, not the earlier store that we're
attempting to eliminate.
This has a number of problems:
* Mismatches between what BasicAA considers aliasing and what DSE
considers an overwrite (even though both are correct in isolation)
can result in miscompiles. This is PR48279, which D92045 tries to
fix in a different way. The problem is that we're using a location
from a store that is potentially not executed and thus may be UB,
in which case analysis results can be arbitrary.
* Metadata on the killing store may be used to determine aliasing,
but there is no guarantee that the metadata is valid, as the specific
killing store may not be executed. Using the metadata on the earlier
store is valid (it is the store we're removing, so on any execution
where its removal may be observed, it must be executed).
* The location is imprecise. For full overwrites the killing store
will always have a location that is larger or equal than the earlier
access location, so it's beneficial to use the earlier access
location. This is not the case for partial overwrites, in which
case either location might be smaller. There is some room for
improvement here.
Using the earlier access location means that we can no longer cache
which accesses are read for a given killing store, as we may be
querying different locations. However, it turns out that simply
dropping the cache has no notable impact on compile-time.
Differential Revision: https://reviews.llvm.org/D93523
2020-11-30 23:51:54 +01:00
|
|
|
unsigned &ScanLimit, unsigned &WalkerStepLimit,
|
|
|
|
bool IsMemTerm, unsigned &PartialLimit) {
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
if (ScanLimit == 0 || WalkerStepLimit == 0) {
|
2020-08-14 22:08:16 +02:00
|
|
|
LLVM_DEBUG(dbgs() << "\n ... hit scan limit\n");
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
2020-08-28 11:31:30 +02:00
|
|
|
MemoryAccess *Current = StartAccess;
|
|
|
|
Instruction *KillingI = KillingDef->getMemoryInst();
|
2020-09-06 21:14:58 +02:00
|
|
|
LLVM_DEBUG(dbgs() << " trying to get dominating access\n");
|
2020-08-28 11:31:30 +02:00
|
|
|
|
|
|
|
// Find the next clobbering Mod access for DefLoc, starting at StartAccess.
|
[DSE] Use correct memory location for read clobber check
MSSA DSE starts at a killing store, finds an earlier store and
then checks that the earlier store is not read along any paths
(without being killed first). However, it uses the memory location
of the killing store for that, not the earlier store that we're
attempting to eliminate.
This has a number of problems:
* Mismatches between what BasicAA considers aliasing and what DSE
considers an overwrite (even though both are correct in isolation)
can result in miscompiles. This is PR48279, which D92045 tries to
fix in a different way. The problem is that we're using a location
from a store that is potentially not executed and thus may be UB,
in which case analysis results can be arbitrary.
* Metadata on the killing store may be used to determine aliasing,
but there is no guarantee that the metadata is valid, as the specific
killing store may not be executed. Using the metadata on the earlier
store is valid (it is the store we're removing, so on any execution
where its removal may be observed, it must be executed).
* The location is imprecise. For full overwrites the killing store
will always have a location that is larger or equal than the earlier
access location, so it's beneficial to use the earlier access
location. This is not the case for partial overwrites, in which
case either location might be smaller. There is some room for
improvement here.
Using the earlier access location means that we can no longer cache
which accesses are read for a given killing store, as we may be
querying different locations. However, it turns out that simply
dropping the cache has no notable impact on compile-time.
Differential Revision: https://reviews.llvm.org/D93523
2020-11-30 23:51:54 +01:00
|
|
|
Optional<MemoryLocation> CurrentLoc;
|
2021-07-02 04:59:06 +02:00
|
|
|
for (;; Current = cast<MemoryDef>(Current)->getDefiningAccess()) {
|
2020-09-06 21:14:58 +02:00
|
|
|
LLVM_DEBUG({
|
|
|
|
dbgs() << " visiting " << *Current;
|
|
|
|
if (!MSSA.isLiveOnEntryDef(Current) && isa<MemoryUseOrDef>(Current))
|
|
|
|
dbgs() << " (" << *cast<MemoryUseOrDef>(Current)->getMemoryInst()
|
|
|
|
<< ")";
|
|
|
|
dbgs() << "\n";
|
|
|
|
});
|
|
|
|
|
2020-02-11 19:27:41 +01:00
|
|
|
// Reached TOP.
|
2020-09-06 21:14:58 +02:00
|
|
|
if (MSSA.isLiveOnEntryDef(Current)) {
|
|
|
|
LLVM_DEBUG(dbgs() << " ... found LiveOnEntryDef\n");
|
2020-02-11 19:27:41 +01:00
|
|
|
return None;
|
2020-09-06 21:14:58 +02:00
|
|
|
}
|
2020-02-11 19:27:41 +01:00
|
|
|
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
// Cost of a step. Accesses in the same block are more likely to be valid
|
|
|
|
// candidates for elimination, hence consider them cheaper.
|
|
|
|
unsigned StepCost = KillingDef->getBlock() == Current->getBlock()
|
|
|
|
? MemorySSASameBBStepCost
|
|
|
|
: MemorySSAOtherBBStepCost;
|
2020-09-06 21:14:58 +02:00
|
|
|
if (WalkerStepLimit <= StepCost) {
|
|
|
|
LLVM_DEBUG(dbgs() << " ... hit walker step limit\n");
|
2020-02-11 19:27:41 +01:00
|
|
|
return None;
|
2020-09-06 21:14:58 +02:00
|
|
|
}
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
WalkerStepLimit -= StepCost;
|
2020-02-11 19:27:41 +01:00
|
|
|
|
2020-08-24 13:12:15 +02:00
|
|
|
// Return for MemoryPhis. They cannot be eliminated directly and the
|
|
|
|
// caller is responsible for traversing them.
|
2020-09-06 21:14:58 +02:00
|
|
|
if (isa<MemoryPhi>(Current)) {
|
|
|
|
LLVM_DEBUG(dbgs() << " ... found MemoryPhi\n");
|
2020-08-24 13:12:15 +02:00
|
|
|
return Current;
|
2020-09-06 21:14:58 +02:00
|
|
|
}
|
2020-03-20 08:51:29 +01:00
|
|
|
|
2020-08-28 11:31:30 +02:00
|
|
|
// Below, check if CurrentDef is a valid candidate to be eliminated by
|
|
|
|
// KillingDef. If it is not, check the next candidate.
|
|
|
|
MemoryDef *CurrentDef = cast<MemoryDef>(Current);
|
|
|
|
Instruction *CurrentI = CurrentDef->getMemoryInst();
|
|
|
|
|
2021-07-23 11:51:59 +02:00
|
|
|
if (canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(DefUO)))
|
2020-08-28 11:31:30 +02:00
|
|
|
continue;
|
|
|
|
|
|
|
|
// Before we try to remove anything, check for any extra throwing
|
|
|
|
// instructions that block us from DSEing
|
|
|
|
if (mayThrowBetween(KillingI, CurrentI, DefUO)) {
|
|
|
|
LLVM_DEBUG(dbgs() << " ... skip, may throw!\n");
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check for anything that looks like it will be a barrier to further
|
|
|
|
// removal
|
|
|
|
if (isDSEBarrier(DefUO, CurrentI)) {
|
|
|
|
LLVM_DEBUG(dbgs() << " ... skip, barrier\n");
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If Current is known to be on path that reads DefLoc or is a read
|
|
|
|
// clobber, bail out, as the path is not profitable. We skip this check
|
|
|
|
// for intrinsic calls, because the code knows how to handle memcpy
|
|
|
|
// intrinsics.
|
[DSE] Use correct memory location for read clobber check
MSSA DSE starts at a killing store, finds an earlier store and
then checks that the earlier store is not read along any paths
(without being killed first). However, it uses the memory location
of the killing store for that, not the earlier store that we're
attempting to eliminate.
This has a number of problems:
* Mismatches between what BasicAA considers aliasing and what DSE
considers an overwrite (even though both are correct in isolation)
can result in miscompiles. This is PR48279, which D92045 tries to
fix in a different way. The problem is that we're using a location
from a store that is potentially not executed and thus may be UB,
in which case analysis results can be arbitrary.
* Metadata on the killing store may be used to determine aliasing,
but there is no guarantee that the metadata is valid, as the specific
killing store may not be executed. Using the metadata on the earlier
store is valid (it is the store we're removing, so on any execution
where its removal may be observed, it must be executed).
* The location is imprecise. For full overwrites the killing store
will always have a location that is larger or equal than the earlier
access location, so it's beneficial to use the earlier access
location. This is not the case for partial overwrites, in which
case either location might be smaller. There is some room for
improvement here.
Using the earlier access location means that we can no longer cache
which accesses are read for a given killing store, as we may be
querying different locations. However, it turns out that simply
dropping the cache has no notable impact on compile-time.
Differential Revision: https://reviews.llvm.org/D93523
2020-11-30 23:51:54 +01:00
|
|
|
if (!isa<IntrinsicInst>(CurrentI) && isReadClobber(DefLoc, CurrentI))
|
2020-08-28 11:31:30 +02:00
|
|
|
return None;
|
|
|
|
|
2020-09-07 23:52:10 +02:00
|
|
|
// Quick check if there are direct uses that are read-clobbers.
|
|
|
|
if (any_of(Current->uses(), [this, &DefLoc, StartAccess](Use &U) {
|
|
|
|
if (auto *UseOrDef = dyn_cast<MemoryUseOrDef>(U.getUser()))
|
|
|
|
return !MSSA.dominates(StartAccess, UseOrDef) &&
|
|
|
|
isReadClobber(DefLoc, UseOrDef->getMemoryInst());
|
|
|
|
return false;
|
|
|
|
})) {
|
|
|
|
LLVM_DEBUG(dbgs() << " ... found a read clobber\n");
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
2020-08-28 11:31:30 +02:00
|
|
|
// If Current cannot be analyzed or is not removable, check the next
|
|
|
|
// candidate.
|
2021-07-02 04:59:06 +02:00
|
|
|
if (!hasAnalyzableMemoryWrite(CurrentI, TLI) || !isRemovable(CurrentI))
|
2020-08-28 11:31:30 +02:00
|
|
|
continue;
|
|
|
|
|
2020-08-30 22:53:31 +02:00
|
|
|
// If Current does not have an analyzable write location, skip it
|
[DSE] Use correct memory location for read clobber check
MSSA DSE starts at a killing store, finds an earlier store and
then checks that the earlier store is not read along any paths
(without being killed first). However, it uses the memory location
of the killing store for that, not the earlier store that we're
attempting to eliminate.
This has a number of problems:
* Mismatches between what BasicAA considers aliasing and what DSE
considers an overwrite (even though both are correct in isolation)
can result in miscompiles. This is PR48279, which D92045 tries to
fix in a different way. The problem is that we're using a location
from a store that is potentially not executed and thus may be UB,
in which case analysis results can be arbitrary.
* Metadata on the killing store may be used to determine aliasing,
but there is no guarantee that the metadata is valid, as the specific
killing store may not be executed. Using the metadata on the earlier
store is valid (it is the store we're removing, so on any execution
where its removal may be observed, it must be executed).
* The location is imprecise. For full overwrites the killing store
will always have a location that is larger or equal than the earlier
access location, so it's beneficial to use the earlier access
location. This is not the case for partial overwrites, in which
case either location might be smaller. There is some room for
improvement here.
Using the earlier access location means that we can no longer cache
which accesses are read for a given killing store, as we may be
querying different locations. However, it turns out that simply
dropping the cache has no notable impact on compile-time.
Differential Revision: https://reviews.llvm.org/D93523
2020-11-30 23:51:54 +01:00
|
|
|
CurrentLoc = getLocForWriteEx(CurrentI);
|
2021-07-02 04:59:06 +02:00
|
|
|
if (!CurrentLoc)
|
2020-08-30 22:53:31 +02:00
|
|
|
continue;
|
2020-08-28 11:31:30 +02:00
|
|
|
|
[DSE] Use correct memory location for read clobber check
MSSA DSE starts at a killing store, finds an earlier store and
then checks that the earlier store is not read along any paths
(without being killed first). However, it uses the memory location
of the killing store for that, not the earlier store that we're
attempting to eliminate.
This has a number of problems:
* Mismatches between what BasicAA considers aliasing and what DSE
considers an overwrite (even though both are correct in isolation)
can result in miscompiles. This is PR48279, which D92045 tries to
fix in a different way. The problem is that we're using a location
from a store that is potentially not executed and thus may be UB,
in which case analysis results can be arbitrary.
* Metadata on the killing store may be used to determine aliasing,
but there is no guarantee that the metadata is valid, as the specific
killing store may not be executed. Using the metadata on the earlier
store is valid (it is the store we're removing, so on any execution
where its removal may be observed, it must be executed).
* The location is imprecise. For full overwrites the killing store
will always have a location that is larger or equal than the earlier
access location, so it's beneficial to use the earlier access
location. This is not the case for partial overwrites, in which
case either location might be smaller. There is some room for
improvement here.
Using the earlier access location means that we can no longer cache
which accesses are read for a given killing store, as we may be
querying different locations. However, it turns out that simply
dropping the cache has no notable impact on compile-time.
Differential Revision: https://reviews.llvm.org/D93523
2020-11-30 23:51:54 +01:00
|
|
|
// AliasAnalysis does not account for loops. Limit elimination to
|
|
|
|
// candidates for which we can guarantee they always store to the same
|
2021-06-20 18:03:30 +02:00
|
|
|
// memory location and not located in different loops.
|
|
|
|
if (!isGuaranteedLoopIndependent(CurrentI, KillingI, *CurrentLoc)) {
|
|
|
|
LLVM_DEBUG(dbgs() << " ... not guaranteed loop independent\n");
|
[DSE] Use correct memory location for read clobber check
MSSA DSE starts at a killing store, finds an earlier store and
then checks that the earlier store is not read along any paths
(without being killed first). However, it uses the memory location
of the killing store for that, not the earlier store that we're
attempting to eliminate.
This has a number of problems:
* Mismatches between what BasicAA considers aliasing and what DSE
considers an overwrite (even though both are correct in isolation)
can result in miscompiles. This is PR48279, which D92045 tries to
fix in a different way. The problem is that we're using a location
from a store that is potentially not executed and thus may be UB,
in which case analysis results can be arbitrary.
* Metadata on the killing store may be used to determine aliasing,
but there is no guarantee that the metadata is valid, as the specific
killing store may not be executed. Using the metadata on the earlier
store is valid (it is the store we're removing, so on any execution
where its removal may be observed, it must be executed).
* The location is imprecise. For full overwrites the killing store
will always have a location that is larger or equal than the earlier
access location, so it's beneficial to use the earlier access
location. This is not the case for partial overwrites, in which
case either location might be smaller. There is some room for
improvement here.
Using the earlier access location means that we can no longer cache
which accesses are read for a given killing store, as we may be
querying different locations. However, it turns out that simply
dropping the cache has no notable impact on compile-time.
Differential Revision: https://reviews.llvm.org/D93523
2020-11-30 23:51:54 +01:00
|
|
|
WalkerStepLimit -= 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2020-08-28 11:31:30 +02:00
|
|
|
if (IsMemTerm) {
|
|
|
|
// If the killing def is a memory terminator (e.g. lifetime.end), check
|
|
|
|
// the next candidate if the current Current does not write the same
|
|
|
|
// underlying object as the terminator.
|
2021-07-02 04:59:06 +02:00
|
|
|
if (!isMemTerminator(*CurrentLoc, CurrentI, KillingI))
|
|
|
|
continue;
|
2020-08-28 11:31:30 +02:00
|
|
|
} else {
|
|
|
|
int64_t InstWriteOffset, DepWriteOffset;
|
2021-05-14 10:16:51 +02:00
|
|
|
auto OR = isOverwrite(KillingI, CurrentI, DefLoc, *CurrentLoc,
|
|
|
|
DepWriteOffset, InstWriteOffset);
|
2020-08-28 11:31:30 +02:00
|
|
|
// If Current does not write to the same object as KillingDef, check
|
|
|
|
// the next candidate.
|
2021-07-02 04:59:06 +02:00
|
|
|
if (OR == OW_Unknown)
|
|
|
|
continue;
|
|
|
|
else if (OR == OW_MaybePartial) {
|
2020-08-28 11:31:30 +02:00
|
|
|
// If KillingDef only partially overwrites Current, check the next
|
|
|
|
// candidate if the partial step limit is exceeded. This aggressively
|
|
|
|
// limits the number of candidates for partial store elimination,
|
|
|
|
// which are less likely to be removable in the end.
|
|
|
|
if (PartialLimit <= 1) {
|
|
|
|
WalkerStepLimit -= 1;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
PartialLimit -= 1;
|
|
|
|
}
|
2020-02-11 19:27:41 +01:00
|
|
|
}
|
2021-07-02 04:59:06 +02:00
|
|
|
break;
|
|
|
|
};
|
2020-02-11 19:27:41 +01:00
|
|
|
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, 15738 times out of 20322 we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
// Accesses to objects accessible after the function returns can only be
|
|
|
|
// eliminated if the access is killed along all paths to the exit. Collect
|
|
|
|
// the blocks with killing (=completely overwriting MemoryDefs) and check if
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
// they cover all paths from EarlierAccess to any function exit.
|
2020-08-17 19:52:57 +02:00
|
|
|
SmallPtrSet<Instruction *, 16> KillingDefs;
|
|
|
|
KillingDefs.insert(KillingDef->getMemoryInst());
|
2020-08-30 22:27:32 +02:00
|
|
|
MemoryAccess *EarlierAccess = Current;
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
Instruction *EarlierMemInst =
|
2020-08-30 22:27:32 +02:00
|
|
|
cast<MemoryDef>(EarlierAccess)->getMemoryInst();
|
|
|
|
LLVM_DEBUG(dbgs() << " Checking for reads of " << *EarlierAccess << " ("
|
|
|
|
<< *EarlierMemInst << ")\n");
|
2020-02-11 19:27:41 +01:00
|
|
|
|
|
|
|
SmallSetVector<MemoryAccess *, 32> WorkList;
|
|
|
|
auto PushMemUses = [&WorkList](MemoryAccess *Acc) {
|
|
|
|
for (Use &U : Acc->uses())
|
|
|
|
WorkList.insert(cast<MemoryAccess>(U.getUser()));
|
|
|
|
};
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
PushMemUses(EarlierAccess);
|
2020-02-11 19:27:41 +01:00
|
|
|
|
2020-08-25 09:43:32 +02:00
|
|
|
// Optimistically collect all accesses for reads. If we do not find any
|
|
|
|
// read clobbers, add them to the cache.
|
|
|
|
SmallPtrSet<MemoryAccess *, 16> KnownNoReads;
|
2020-08-30 22:27:32 +02:00
|
|
|
if (!EarlierMemInst->mayReadFromMemory())
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
KnownNoReads.insert(EarlierAccess);
|
|
|
|
// Check if EarlierDef may be read.
|
2020-02-11 19:27:41 +01:00
|
|
|
for (unsigned I = 0; I < WorkList.size(); I++) {
|
|
|
|
MemoryAccess *UseAccess = WorkList[I];
|
|
|
|
|
2020-04-24 18:48:03 +02:00
|
|
|
LLVM_DEBUG(dbgs() << " " << *UseAccess);
|
2020-08-14 22:08:16 +02:00
|
|
|
// Bail out if the number of accesses to check exceeds the scan limit.
|
|
|
|
if (ScanLimit < (WorkList.size() - I)) {
|
2020-04-24 18:48:03 +02:00
|
|
|
LLVM_DEBUG(dbgs() << "\n ... hit scan limit\n");
|
2020-02-11 19:27:41 +01:00
|
|
|
return None;
|
|
|
|
}
|
2020-08-14 22:08:16 +02:00
|
|
|
--ScanLimit;
|
2020-08-25 09:43:32 +02:00
|
|
|
NumDomMemDefChecks++;
|
|
|
|
KnownNoReads.insert(UseAccess);
|
2020-02-11 19:27:41 +01:00
|
|
|
|
|
|
|
if (isa<MemoryPhi>(UseAccess)) {
|
2020-08-17 19:52:57 +02:00
|
|
|
if (any_of(KillingDefs, [this, UseAccess](Instruction *KI) {
|
|
|
|
return DT.properlyDominates(KI->getParent(),
|
|
|
|
UseAccess->getBlock());
|
|
|
|
})) {
|
|
|
|
LLVM_DEBUG(dbgs() << " ... skipping, dominated by killing block\n");
|
|
|
|
continue;
|
|
|
|
}
|
2020-04-24 18:48:03 +02:00
|
|
|
LLVM_DEBUG(dbgs() << "\n ... adding PHI uses\n");
|
2020-03-20 08:51:29 +01:00
|
|
|
PushMemUses(UseAccess);
|
|
|
|
continue;
|
2020-02-11 19:27:41 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst();
|
|
|
|
LLVM_DEBUG(dbgs() << " (" << *UseInst << ")\n");
|
|
|
|
|
2020-08-17 19:52:57 +02:00
|
|
|
if (any_of(KillingDefs, [this, UseInst](Instruction *KI) {
|
|
|
|
return DT.dominates(KI, UseInst);
|
|
|
|
})) {
|
|
|
|
LLVM_DEBUG(dbgs() << " ... skipping, dominated by killing def\n");
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2020-10-20 17:41:56 +02:00
|
|
|
// A memory terminator kills all preceding MemoryDefs and all succeeding
|
|
|
|
// MemoryAccesses. We do not have to check its users.
|
[DSE] Use correct memory location for read clobber check
MSSA DSE starts at a killing store, finds an earlier store and
then checks that the earlier store is not read along any paths
(without being killed first). However, it uses the memory location
of the killing store for that, not the earlier store that we're
attempting to eliminate.
This has a number of problems:
* Mismatches between what BasicAA considers aliasing and what DSE
considers an overwrite (even though both are correct in isolation)
can result in miscompiles. This is PR48279, which D92045 tries to
fix in a different way. The problem is that we're using a location
from a store that is potentially not executed and thus may be UB,
in which case analysis results can be arbitrary.
* Metadata on the killing store may be used to determine aliasing,
but there is no guarantee that the metadata is valid, as the specific
killing store may not be executed. Using the metadata on the earlier
store is valid (it is the store we're removing, so on any execution
where its removal may be observed, it must be executed).
* The location is imprecise. For full overwrites the killing store
will always have a location that is larger or equal than the earlier
access location, so it's beneficial to use the earlier access
location. This is not the case for partial overwrites, in which
case either location might be smaller. There is some room for
improvement here.
Using the earlier access location means that we can no longer cache
which accesses are read for a given killing store, as we may be
querying different locations. However, it turns out that simply
dropping the cache has no notable impact on compile-time.
Differential Revision: https://reviews.llvm.org/D93523
2020-11-30 23:51:54 +01:00
|
|
|
if (isMemTerminator(*CurrentLoc, EarlierMemInst, UseInst)) {
|
2020-10-20 17:41:56 +02:00
|
|
|
LLVM_DEBUG(
|
|
|
|
dbgs()
|
|
|
|
<< " ... skipping, memterminator invalidates following accesses\n");
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2020-10-18 16:19:05 +02:00
|
|
|
if (isNoopIntrinsic(cast<MemoryUseOrDef>(UseAccess)->getMemoryInst())) {
|
2020-04-24 18:48:03 +02:00
|
|
|
LLVM_DEBUG(dbgs() << " ... adding uses of intrinsic\n");
|
2020-02-11 19:27:41 +01:00
|
|
|
PushMemUses(UseAccess);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2020-09-04 18:19:56 +02:00
|
|
|
if (UseInst->mayThrow() && !isInvisibleToCallerBeforeRet(DefUO)) {
|
|
|
|
LLVM_DEBUG(dbgs() << " ... found throwing instruction\n");
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
2020-02-11 19:27:41 +01:00
|
|
|
// Uses which may read the original MemoryDef mean we cannot eliminate the
|
|
|
|
// original MD. Stop walk.
|
[DSE] Use correct memory location for read clobber check
MSSA DSE starts at a killing store, finds an earlier store and
then checks that the earlier store is not read along any paths
(without being killed first). However, it uses the memory location
of the killing store for that, not the earlier store that we're
attempting to eliminate.
This has a number of problems:
* Mismatches between what BasicAA considers aliasing and what DSE
considers an overwrite (even though both are correct in isolation)
can result in miscompiles. This is PR48279, which D92045 tries to
fix in a different way. The problem is that we're using a location
from a store that is potentially not executed and thus may be UB,
in which case analysis results can be arbitrary.
* Metadata on the killing store may be used to determine aliasing,
but there is no guarantee that the metadata is valid, as the specific
killing store may not be executed. Using the metadata on the earlier
store is valid (it is the store we're removing, so on any execution
where its removal may be observed, it must be executed).
* The location is imprecise. For full overwrites the killing store
will always have a location that is larger or equal than the earlier
access location, so it's beneficial to use the earlier access
location. This is not the case for partial overwrites, in which
case either location might be smaller. There is some room for
improvement here.
Using the earlier access location means that we can no longer cache
which accesses are read for a given killing store, as we may be
querying different locations. However, it turns out that simply
dropping the cache has no notable impact on compile-time.
Differential Revision: https://reviews.llvm.org/D93523
2020-11-30 23:51:54 +01:00
|
|
|
if (isReadClobber(*CurrentLoc, UseInst)) {
|
2020-04-24 18:48:03 +02:00
|
|
|
LLVM_DEBUG(dbgs() << " ... found read clobber\n");
|
2020-02-11 19:27:41 +01:00
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
2021-06-20 18:03:30 +02:00
|
|
|
// If this worklist walks back to the original memory access (and the
|
|
|
|
// pointer is not guaranteed loop invariant) then we cannot assume that a
|
|
|
|
// store kills itself.
|
|
|
|
if (EarlierAccess == UseAccess &&
|
|
|
|
!isGuaranteedLoopInvariant(CurrentLoc->Ptr)) {
|
|
|
|
LLVM_DEBUG(dbgs() << " ... found not loop invariant self access\n");
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
// Otherwise, for the KillingDef and EarlierAccess we only have to check
|
|
|
|
// if it reads the memory location.
|
2020-03-20 08:51:29 +01:00
|
|
|
// TODO: It would probably be better to check for self-reads before
|
|
|
|
// calling the function.
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
if (KillingDef == UseAccess || EarlierAccess == UseAccess) {
|
2020-04-24 18:48:03 +02:00
|
|
|
LLVM_DEBUG(dbgs() << " ... skipping killing def/dom access\n");
|
2020-02-11 19:27:41 +01:00
|
|
|
continue;
|
2020-04-24 18:48:03 +02:00
|
|
|
}
|
2020-02-11 19:27:41 +01:00
|
|
|
|
|
|
|
// Check all uses for MemoryDefs, except for defs completely overwriting
|
|
|
|
// the original location. Otherwise we have to check uses of *all*
|
|
|
|
// MemoryDefs we discover, including non-aliasing ones. Otherwise we might
|
|
|
|
// miss cases like the following
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
// 1 = Def(LoE) ; <----- EarlierDef stores [0,1]
|
2020-02-11 19:27:41 +01:00
|
|
|
// 2 = Def(1) ; (2, 1) = NoAlias, stores [2,3]
|
|
|
|
// Use(2) ; MayAlias 2 *and* 1, loads [0, 3].
|
|
|
|
// (The Use points to the *first* Def it may alias)
|
|
|
|
// 3 = Def(1) ; <---- Current (3, 2) = NoAlias, (3,1) = MayAlias,
|
|
|
|
// stores [0,1]
|
|
|
|
if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess)) {
|
[DSE] Use correct memory location for read clobber check
MSSA DSE starts at a killing store, finds an earlier store and
then checks that the earlier store is not read along any paths
(without being killed first). However, it uses the memory location
of the killing store for that, not the earlier store that we're
attempting to eliminate.
This has a number of problems:
* Mismatches between what BasicAA considers aliasing and what DSE
considers an overwrite (even though both are correct in isolation)
can result in miscompiles. This is PR48279, which D92045 tries to
fix in a different way. The problem is that we're using a location
from a store that is potentially not executed and thus may be UB,
in which case analysis results can be arbitrary.
* Metadata on the killing store may be used to determine aliasing,
but there is no guarantee that the metadata is valid, as the specific
killing store may not be executed. Using the metadata on the earlier
store is valid (it is the store we're removing, so on any execution
where its removal may be observed, it must be executed).
* The location is imprecise. For full overwrites the killing store
will always have a location that is larger or equal than the earlier
access location, so it's beneficial to use the earlier access
location. This is not the case for partial overwrites, in which
case either location might be smaller. There is some room for
improvement here.
Using the earlier access location means that we can no longer cache
which accesses are read for a given killing store, as we may be
querying different locations. However, it turns out that simply
dropping the cache has no notable impact on compile-time.
Differential Revision: https://reviews.llvm.org/D93523
2020-11-30 23:51:54 +01:00
|
|
|
if (isCompleteOverwrite(*CurrentLoc, EarlierMemInst, UseInst)) {
|
2021-06-20 18:03:30 +02:00
|
|
|
BasicBlock *MaybeKillingBlock = UseInst->getParent();
|
|
|
|
if (PostOrderNumbers.find(MaybeKillingBlock)->second <
|
|
|
|
PostOrderNumbers.find(EarlierAccess->getBlock())->second) {
|
|
|
|
if (!isInvisibleToCallerAfterRet(DefUO)) {
|
2020-08-17 19:52:57 +02:00
|
|
|
LLVM_DEBUG(dbgs()
|
|
|
|
<< " ... found killing def " << *UseInst << "\n");
|
|
|
|
KillingDefs.insert(UseInst);
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, in 15738 out of 20322 cases where we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
}
|
2021-06-20 18:03:30 +02:00
|
|
|
} else {
|
|
|
|
LLVM_DEBUG(dbgs()
|
|
|
|
<< " ... found preceeding def " << *UseInst << "\n");
|
|
|
|
return None;
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, in 15738 out of 20322 cases where we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
}
|
|
|
|
} else
|
2020-02-11 19:27:41 +01:00
|
|
|
PushMemUses(UseDef);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, in 15738 out of 20322 cases where we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
// For accesses to locations visible after the function returns, make sure
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
// that the location is killed (=overwritten) along all paths from
|
|
|
|
// EarlierAccess to the exit.
|
2020-08-21 19:17:00 +02:00
|
|
|
if (!isInvisibleToCallerAfterRet(DefUO)) {
|
2020-08-17 19:52:57 +02:00
|
|
|
SmallPtrSet<BasicBlock *, 16> KillingBlocks;
|
|
|
|
for (Instruction *KD : KillingDefs)
|
|
|
|
KillingBlocks.insert(KD->getParent());
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, 15738 times out of 20322 we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
assert(!KillingBlocks.empty() &&
|
|
|
|
"Expected at least a single killing block");
|
2020-08-17 19:52:57 +02:00
|
|
|
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, 15738 times out of 20322 we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
// Find the common post-dominator of all killing blocks.
|
|
|
|
BasicBlock *CommonPred = *KillingBlocks.begin();
|
|
|
|
for (auto I = std::next(KillingBlocks.begin()), E = KillingBlocks.end();
|
|
|
|
I != E; I++) {
|
|
|
|
if (!CommonPred)
|
|
|
|
break;
|
|
|
|
CommonPred = PDT.findNearestCommonDominator(CommonPred, *I);
|
|
|
|
}
|
|
|
|
|
|
|
|
// If CommonPred is in the set of killing blocks, just check if it
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
// post-dominates EarlierAccess.
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, 15738 times out of 20322 we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
if (KillingBlocks.count(CommonPred)) {
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
if (PDT.dominates(CommonPred, EarlierAccess->getBlock()))
|
|
|
|
return {EarlierAccess};
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, 15738 times out of 20322 we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
// If the common post-dominator does not post-dominate EarlierAccess,
|
|
|
|
// there is a path from EarlierAccess to an exit not going through a
|
|
|
|
// killing block.
|
|
|
|
if (PDT.dominates(CommonPred, EarlierAccess->getBlock())) {
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, 15738 times out of 20322 we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
SetVector<BasicBlock *> WorkList;
|
|
|
|
|
|
|
|
// If CommonPred is null, there are multiple exits from the function.
|
|
|
|
// They all have to be added to the worklist.
|
|
|
|
if (CommonPred)
|
|
|
|
WorkList.insert(CommonPred);
|
|
|
|
else
|
2020-05-18 16:28:28 +02:00
|
|
|
for (BasicBlock *R : PDT.roots())
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, 15738 times out of 20322 we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
WorkList.insert(R);
|
|
|
|
|
|
|
|
NumCFGTries++;
|
|
|
|
// Check if all paths starting from an exit node go through one of the
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
// killing blocks before reaching EarlierAccess.
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, 15738 times out of 20322 we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
for (unsigned I = 0; I < WorkList.size(); I++) {
|
|
|
|
NumCFGChecks++;
|
|
|
|
BasicBlock *Current = WorkList[I];
|
|
|
|
if (KillingBlocks.count(Current))
|
|
|
|
continue;
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
if (Current == EarlierAccess->getBlock())
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, 15738 times out of 20322 we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
return None;
|
2020-06-21 17:34:54 +02:00
|
|
|
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
// EarlierAccess is reachable from the entry, so we don't have to
|
|
|
|
// explore unreachable blocks further.
|
2020-06-21 17:34:54 +02:00
|
|
|
if (!DT.isReachableFromEntry(Current))
|
|
|
|
continue;
|
|
|
|
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, 15738 times out of 20322 we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
for (BasicBlock *Pred : predecessors(Current))
|
|
|
|
WorkList.insert(Pred);
|
|
|
|
|
|
|
|
if (WorkList.size() >= MemorySSAPathCheckLimit)
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
NumCFGSuccess++;
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
return {EarlierAccess};
|
[DSE,MSSA] Relax post-dom restriction for objs visible after return.
This patch relaxes the post-dominance requirement for accesses to
objects visible after the function returns.
Instead of requiring the killing def to post-dominate the access to
eliminate, the set of 'killing blocks' (= blocks that completely
overwrite the original access) is collected.
If all paths from the access to eliminate and an exit block go through a
killing block, the access can be removed.
To check this property, we first get the common post-dominator block for
the killing blocks. If this block does not post-dominate the access
block, there may be a path from DomAccess to an exit block not involving
any killing block.
Otherwise we have to check if there is a path from the DomAccess to the
common post-dominator, that does not contain a killing block. If there
is no such path, we can remove DomAccess. For this check, we start at
the common post-dominator and then traverse the CFG backwards. Paths are
terminated when we hit a killing block or a block that is not executed
between DomAccess and a killing block according to the post-order
numbering (if the post order number of a block is greater than the one
of DomAccess, the block cannot be in in a path starting at DomAccess).
This gives the following improvements on the total number of stores
after DSE for MultiSource, SPEC2K, SPEC2006:
Tests: 237
Same hash: 206 (filtered out)
Remaining: 31
Metric: dse.NumRemainingStores
Program base new100 diff
test-suite...CFP2000/188.ammp/188.ammp.test 3624.00 3544.00 -2.2%
test-suite...ch/g721/g721encode/encode.test 128.00 126.00 -1.6%
test-suite.../Benchmarks/Olden/mst/mst.test 73.00 72.00 -1.4%
test-suite...CFP2006/433.milc/433.milc.test 3202.00 3163.00 -1.2%
test-suite...000/186.crafty/186.crafty.test 5062.00 5010.00 -1.0%
test-suite...-typeset/consumer-typeset.test 40460.00 40248.00 -0.5%
test-suite...Source/Benchmarks/sim/sim.test 642.00 639.00 -0.5%
test-suite...nchmarks/McCat/09-vor/vor.test 642.00 644.00 0.3%
test-suite...lications/sqlite3/sqlite3.test 35664.00 35563.00 -0.3%
test-suite...T2000/300.twolf/300.twolf.test 7202.00 7184.00 -0.2%
test-suite...lications/ClamAV/clamscan.test 19475.00 19444.00 -0.2%
test-suite...INT2000/164.gzip/164.gzip.test 2199.00 2196.00 -0.1%
test-suite...peg2/mpeg2dec/mpeg2decode.test 2380.00 2378.00 -0.1%
test-suite.../Benchmarks/Bullet/bullet.test 39335.00 39309.00 -0.1%
test-suite...:: External/Povray/povray.test 36951.00 36927.00 -0.1%
test-suite...marks/7zip/7zip-benchmark.test 67396.00 67356.00 -0.1%
test-suite...6/464.h264ref/464.h264ref.test 31497.00 31481.00 -0.1%
test-suite...006/453.povray/453.povray.test 51441.00 51416.00 -0.0%
test-suite...T2006/401.bzip2/401.bzip2.test 4450.00 4448.00 -0.0%
test-suite...Applications/kimwitu++/kc.test 23481.00 23471.00 -0.0%
test-suite...chmarks/MallocBench/gs/gs.test 6286.00 6284.00 -0.0%
test-suite.../CINT2000/254.gap/254.gap.test 13719.00 13715.00 -0.0%
test-suite.../Applications/SPASS/SPASS.test 30345.00 30338.00 -0.0%
test-suite...006/450.soplex/450.soplex.test 15018.00 15016.00 -0.0%
test-suite...ications/JM/lencod/lencod.test 27780.00 27777.00 -0.0%
test-suite.../CINT2006/403.gcc/403.gcc.test 105285.00 105276.00 -0.0%
There might be potential to pre-compute some of the information of which
blocks are on the path to an exit for each block, but the overall
benefit might be comparatively small.
On the set of benchmarks, 15738 times out of 20322 we reach the
CFG check, the CFG check is successful. The total number of iterations
in the CFG check is 187810, so on average we need less than 10 steps in
the check loop. Bumping the threshold in the loop from 50 to 150 gives a
few small improvements, but I don't think they warrant such a big bump
at the moment. This is all pending further tuning in the future.
Reviewers: dmgreen, bryant, asbirlea, Tyker, efriedma, george.burgess.iv
Reviewed By: george.burgess.iv
Differential Revision: https://reviews.llvm.org/D78932
2020-06-10 11:23:37 +02:00
|
|
|
}
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
// No aliasing MemoryUses of EarlierAccess found, EarlierAccess is
|
|
|
|
// potentially dead.
|
|
|
|
return {EarlierAccess};
|
2020-02-11 19:27:41 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// Delete dead memory defs
|
|
|
|
void deleteDeadInstruction(Instruction *SI) {
|
|
|
|
MemorySSAUpdater Updater(&MSSA);
|
|
|
|
SmallVector<Instruction *, 32> NowDeadInsts;
|
|
|
|
NowDeadInsts.push_back(SI);
|
|
|
|
--NumFastOther;
|
|
|
|
|
|
|
|
while (!NowDeadInsts.empty()) {
|
|
|
|
Instruction *DeadInst = NowDeadInsts.pop_back_val();
|
|
|
|
++NumFastOther;
|
|
|
|
|
|
|
|
// Try to preserve debug information attached to the dead instruction.
|
|
|
|
salvageDebugInfo(*DeadInst);
|
2020-04-14 11:56:56 +02:00
|
|
|
salvageKnowledge(DeadInst);
|
2020-02-11 19:27:41 +01:00
|
|
|
|
|
|
|
// Remove the Instruction from MSSA.
|
|
|
|
if (MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst)) {
|
|
|
|
if (MemoryDef *MD = dyn_cast<MemoryDef>(MA)) {
|
|
|
|
SkipStores.insert(MD);
|
|
|
|
}
|
|
|
|
Updater.removeMemoryAccess(MA);
|
|
|
|
}
|
|
|
|
|
2020-02-23 16:39:15 +01:00
|
|
|
auto I = IOLs.find(DeadInst->getParent());
|
|
|
|
if (I != IOLs.end())
|
|
|
|
I->second.erase(DeadInst);
|
2020-02-11 19:27:41 +01:00
|
|
|
// Remove its operands
|
|
|
|
for (Use &O : DeadInst->operands())
|
|
|
|
if (Instruction *OpI = dyn_cast<Instruction>(O)) {
|
|
|
|
O = nullptr;
|
|
|
|
if (isInstructionTriviallyDead(OpI, &TLI))
|
|
|
|
NowDeadInsts.push_back(OpI);
|
|
|
|
}
|
|
|
|
|
|
|
|
DeadInst->eraseFromParent();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check for any extra throws between SI and NI that block DSE. This only
|
|
|
|
// checks extra maythrows (those that aren't MemoryDef's). MemoryDef that may
|
|
|
|
// throw are handled during the walk from one def to the next.
|
|
|
|
bool mayThrowBetween(Instruction *SI, Instruction *NI,
|
2020-08-21 19:17:00 +02:00
|
|
|
const Value *SILocUnd) {
|
2020-02-11 19:27:41 +01:00
|
|
|
// First see if we can ignore it by using the fact that SI is an
|
|
|
|
// alloca/alloca like object that is not visible to the caller during
|
|
|
|
// execution of the function.
|
2020-08-21 19:17:00 +02:00
|
|
|
if (SILocUnd && isInvisibleToCallerBeforeRet(SILocUnd))
|
2020-02-11 19:27:41 +01:00
|
|
|
return false;
|
|
|
|
|
|
|
|
if (SI->getParent() == NI->getParent())
|
2020-06-07 22:36:10 +02:00
|
|
|
return ThrowingBlocks.count(SI->getParent());
|
2020-02-11 19:27:41 +01:00
|
|
|
return !ThrowingBlocks.empty();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check if \p NI acts as a DSE barrier for \p SI. The following instructions
|
|
|
|
// act as barriers:
|
|
|
|
// * A memory instruction that may throw and \p SI accesses a non-stack
|
|
|
|
// object.
|
|
|
|
// * Atomic stores stronger that monotonic.
|
2020-08-21 19:17:00 +02:00
|
|
|
bool isDSEBarrier(const Value *SILocUnd, Instruction *NI) {
|
2020-02-11 19:27:41 +01:00
|
|
|
// If NI may throw it acts as a barrier, unless we are to an alloca/alloca
|
|
|
|
// like object that does not escape.
|
2020-08-21 19:17:00 +02:00
|
|
|
if (NI->mayThrow() && !isInvisibleToCallerBeforeRet(SILocUnd))
|
2020-02-11 19:27:41 +01:00
|
|
|
return true;
|
|
|
|
|
2020-06-22 17:24:27 +02:00
|
|
|
// If NI is an atomic load/store stronger than monotonic, do not try to
|
|
|
|
// eliminate/reorder it.
|
2020-02-11 19:27:41 +01:00
|
|
|
if (NI->isAtomic()) {
|
2020-06-22 17:24:27 +02:00
|
|
|
if (auto *LI = dyn_cast<LoadInst>(NI))
|
|
|
|
return isStrongerThanMonotonic(LI->getOrdering());
|
|
|
|
if (auto *SI = dyn_cast<StoreInst>(NI))
|
|
|
|
return isStrongerThanMonotonic(SI->getOrdering());
|
2020-08-21 11:18:32 +02:00
|
|
|
if (auto *ARMW = dyn_cast<AtomicRMWInst>(NI))
|
|
|
|
return isStrongerThanMonotonic(ARMW->getOrdering());
|
|
|
|
if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(NI))
|
|
|
|
return isStrongerThanMonotonic(CmpXchg->getSuccessOrdering()) ||
|
|
|
|
isStrongerThanMonotonic(CmpXchg->getFailureOrdering());
|
2020-06-22 17:24:27 +02:00
|
|
|
llvm_unreachable("other instructions should be skipped in MemorySSA");
|
2020-02-11 19:27:41 +01:00
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2020-06-24 10:56:35 +02:00
|
|
|
/// Eliminate writes to objects that are not visible in the caller and are not
|
|
|
|
/// accessed before returning from the function.
|
|
|
|
bool eliminateDeadWritesAtEndOfFunction() {
|
|
|
|
bool MadeChange = false;
|
|
|
|
LLVM_DEBUG(
|
|
|
|
dbgs()
|
|
|
|
<< "Trying to eliminate MemoryDefs at the end of the function\n");
|
|
|
|
for (int I = MemDefs.size() - 1; I >= 0; I--) {
|
|
|
|
MemoryDef *Def = MemDefs[I];
|
2020-12-19 19:43:18 +01:00
|
|
|
if (SkipStores.contains(Def) || !isRemovable(Def->getMemoryInst()))
|
2020-06-24 10:56:35 +02:00
|
|
|
continue;
|
|
|
|
|
2020-08-14 16:25:01 +02:00
|
|
|
Instruction *DefI = Def->getMemoryInst();
|
|
|
|
SmallVector<const Value *, 4> Pointers;
|
|
|
|
auto DefLoc = getLocForWriteEx(DefI);
|
|
|
|
if (!DefLoc)
|
|
|
|
continue;
|
2020-06-24 10:56:35 +02:00
|
|
|
|
2020-08-22 11:08:59 +02:00
|
|
|
// NOTE: Currently eliminating writes at the end of a function is limited
|
|
|
|
// to MemoryDefs with a single underlying object, to save compile-time. In
|
|
|
|
// practice it appears the case with multiple underlying objects is very
|
|
|
|
// uncommon. If it turns out to be important, we can use
|
|
|
|
// getUnderlyingObjects here instead.
|
|
|
|
const Value *UO = getUnderlyingObject(DefLoc->Ptr);
|
2020-08-21 19:17:00 +02:00
|
|
|
if (!UO || !isInvisibleToCallerAfterRet(UO))
|
2020-08-22 11:08:59 +02:00
|
|
|
continue;
|
2020-06-24 10:56:35 +02:00
|
|
|
|
2020-08-22 11:08:59 +02:00
|
|
|
if (isWriteAtEndOfFunction(Def)) {
|
2020-08-14 16:25:01 +02:00
|
|
|
// See through pointer-to-pointer bitcasts
|
|
|
|
LLVM_DEBUG(dbgs() << " ... MemoryDef is not accessed until the end "
|
|
|
|
"of the function\n");
|
|
|
|
deleteDeadInstruction(DefI);
|
|
|
|
++NumFastStores;
|
|
|
|
MadeChange = true;
|
2020-06-24 10:56:35 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return MadeChange;
|
|
|
|
}
|
|
|
|
|
2020-06-23 20:26:31 +02:00
|
|
|
/// \returns true if \p Def is a no-op store, either because it
|
|
|
|
/// directly stores back a loaded value or stores zero to a calloced object.
|
2020-12-16 21:34:47 +01:00
|
|
|
bool storeIsNoop(MemoryDef *Def, const MemoryLocation &DefLoc,
|
|
|
|
const Value *DefUO) {
|
2020-06-23 20:26:31 +02:00
|
|
|
StoreInst *Store = dyn_cast<StoreInst>(Def->getMemoryInst());
|
2021-04-27 23:19:44 +02:00
|
|
|
MemSetInst *MemSet = dyn_cast<MemSetInst>(Def->getMemoryInst());
|
|
|
|
Constant *StoredConstant = nullptr;
|
|
|
|
if (Store)
|
|
|
|
StoredConstant = dyn_cast<Constant>(Store->getOperand(0));
|
|
|
|
if (MemSet)
|
|
|
|
StoredConstant = dyn_cast<Constant>(MemSet->getValue());
|
|
|
|
|
|
|
|
if (StoredConstant && StoredConstant->isNullValue()) {
|
|
|
|
auto *DefUOInst = dyn_cast<Instruction>(DefUO);
|
2021-07-23 11:51:59 +02:00
|
|
|
if (DefUOInst && isCallocLikeFn(DefUOInst, &TLI)) {
|
|
|
|
auto *UnderlyingDef = cast<MemoryDef>(MSSA.getMemoryAccess(DefUOInst));
|
|
|
|
// If UnderlyingDef is the clobbering access of Def, no instructions
|
|
|
|
// between them can modify the memory location.
|
|
|
|
auto *ClobberDef =
|
|
|
|
MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def);
|
|
|
|
return UnderlyingDef == ClobberDef;
|
2021-04-27 23:19:44 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2020-06-23 20:26:31 +02:00
|
|
|
if (!Store)
|
|
|
|
return false;
|
2020-05-30 18:56:04 +02:00
|
|
|
|
2020-06-23 20:26:31 +02:00
|
|
|
if (auto *LoadI = dyn_cast<LoadInst>(Store->getOperand(0))) {
|
|
|
|
if (LoadI->getPointerOperand() == Store->getOperand(1)) {
|
2020-10-01 19:40:03 +02:00
|
|
|
// Get the defining access for the load.
|
2020-06-23 20:26:31 +02:00
|
|
|
auto *LoadAccess = MSSA.getMemoryAccess(LoadI)->getDefiningAccess();
|
2020-10-01 19:40:03 +02:00
|
|
|
// Fast path: the defining accesses are the same.
|
|
|
|
if (LoadAccess == Def->getDefiningAccess())
|
|
|
|
return true;
|
|
|
|
|
|
|
|
// Look through phi accesses. Recursively scan all phi accesses by
|
|
|
|
// adding them to a worklist. Bail when we run into a memory def that
|
|
|
|
// does not match LoadAccess.
|
|
|
|
SetVector<MemoryAccess *> ToCheck;
|
2020-10-28 12:01:25 +01:00
|
|
|
MemoryAccess *Current =
|
|
|
|
MSSA.getWalker()->getClobberingMemoryAccess(Def);
|
2020-10-01 19:40:03 +02:00
|
|
|
// We don't want to bail when we run into the store memory def. But,
|
|
|
|
// the phi access may point to it. So, pretend like we've already
|
|
|
|
// checked it.
|
|
|
|
ToCheck.insert(Def);
|
|
|
|
ToCheck.insert(Current);
|
|
|
|
// Start at current (1) to simulate already having checked Def.
|
|
|
|
for (unsigned I = 1; I < ToCheck.size(); ++I) {
|
|
|
|
Current = ToCheck[I];
|
|
|
|
if (auto PhiAccess = dyn_cast<MemoryPhi>(Current)) {
|
|
|
|
// Check all the operands.
|
|
|
|
for (auto &Use : PhiAccess->incoming_values())
|
|
|
|
ToCheck.insert(cast<MemoryAccess>(&Use));
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If we found a memory def, bail. This happens when we have an
|
|
|
|
// unrelated write in between an otherwise noop store.
|
|
|
|
assert(isa<MemoryDef>(Current) &&
|
|
|
|
"Only MemoryDefs should reach here.");
|
|
|
|
// TODO: Skip no alias MemoryDefs that have no aliasing reads.
|
|
|
|
// We are searching for the definition of the store's destination.
|
|
|
|
// So, if that is the same definition as the load, then this is a
|
|
|
|
// noop. Otherwise, fail.
|
|
|
|
if (LoadAccess != Current)
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return true;
|
2020-06-23 20:26:31 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
};
|
2020-05-30 18:56:04 +02:00
|
|
|
|
2021-06-20 18:03:30 +02:00
|
|
|
static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
|
|
|
|
DominatorTree &DT, PostDominatorTree &PDT,
|
|
|
|
const TargetLibraryInfo &TLI,
|
|
|
|
const LoopInfo &LI) {
|
2020-02-11 19:27:41 +01:00
|
|
|
bool MadeChange = false;
|
|
|
|
|
2021-06-20 18:03:30 +02:00
|
|
|
DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI, LI);
|
2020-02-11 19:27:41 +01:00
|
|
|
// For each store:
|
|
|
|
for (unsigned I = 0; I < State.MemDefs.size(); I++) {
|
2020-03-20 08:51:29 +01:00
|
|
|
MemoryDef *KillingDef = State.MemDefs[I];
|
|
|
|
if (State.SkipStores.count(KillingDef))
|
2020-02-11 19:27:41 +01:00
|
|
|
continue;
|
2020-03-20 08:51:29 +01:00
|
|
|
Instruction *SI = KillingDef->getMemoryInst();
|
2020-05-30 18:56:04 +02:00
|
|
|
|
2020-11-29 02:54:29 +01:00
|
|
|
Optional<MemoryLocation> MaybeSILoc;
|
2020-07-08 09:42:55 +02:00
|
|
|
if (State.isMemTerminatorInst(SI))
|
|
|
|
MaybeSILoc = State.getLocForTerminator(SI).map(
|
|
|
|
[](const std::pair<MemoryLocation, bool> &P) { return P.first; });
|
|
|
|
else
|
|
|
|
MaybeSILoc = State.getLocForWriteEx(SI);
|
|
|
|
|
2020-02-11 19:27:41 +01:00
|
|
|
if (!MaybeSILoc) {
|
|
|
|
LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for "
|
|
|
|
<< *SI << "\n");
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
MemoryLocation SILoc = *MaybeSILoc;
|
|
|
|
assert(SILoc.Ptr && "SILoc should not be null");
|
2020-07-31 11:09:54 +02:00
|
|
|
const Value *SILocUnd = getUnderlyingObject(SILoc.Ptr);
|
2020-06-23 20:26:31 +02:00
|
|
|
|
2020-03-20 08:51:29 +01:00
|
|
|
MemoryAccess *Current = KillingDef;
|
|
|
|
LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by "
|
2021-03-02 14:42:03 +01:00
|
|
|
<< *Current << " (" << *SI << ")\n");
|
2020-02-11 19:27:41 +01:00
|
|
|
|
2020-08-14 22:08:16 +02:00
|
|
|
unsigned ScanLimit = MemorySSAScanLimit;
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
unsigned WalkerStepLimit = MemorySSAUpwardsStepLimit;
|
2020-08-28 11:31:30 +02:00
|
|
|
unsigned PartialLimit = MemorySSAPartialStoreLimit;
|
2020-03-20 08:51:29 +01:00
|
|
|
// Worklist of MemoryAccesses that may be killed by KillingDef.
|
|
|
|
SetVector<MemoryAccess *> ToCheck;
|
|
|
|
|
2020-10-30 10:32:43 +01:00
|
|
|
if (SILocUnd)
|
|
|
|
ToCheck.insert(KillingDef->getDefiningAccess());
|
|
|
|
|
|
|
|
bool Shortend = false;
|
2020-08-28 11:31:30 +02:00
|
|
|
bool IsMemTerm = State.isMemTerminatorInst(SI);
|
2020-03-20 08:51:29 +01:00
|
|
|
// Check if MemoryAccesses in the worklist are killed by KillingDef.
|
|
|
|
for (unsigned I = 0; I < ToCheck.size(); I++) {
|
|
|
|
Current = ToCheck[I];
|
|
|
|
if (State.SkipStores.count(Current))
|
|
|
|
continue;
|
|
|
|
|
2020-08-28 11:31:30 +02:00
|
|
|
Optional<MemoryAccess *> Next = State.getDomMemoryDef(
|
[DSE] Use correct memory location for read clobber check
MSSA DSE starts at a killing store, finds an earlier store and
then checks that the earlier store is not read along any paths
(without being killed first). However, it uses the memory location
of the killing store for that, not the earlier store that we're
attempting to eliminate.
This has a number of problems:
* Mismatches between what BasicAA considers aliasing and what DSE
considers an overwrite (even though both are correct in isolation)
can result in miscompiles. This is PR48279, which D92045 tries to
fix in a different way. The problem is that we're using a location
from a store that is potentially not executed and thus may be UB,
in which case analysis results can be arbitrary.
* Metadata on the killing store may be used to determine aliasing,
but there is no guarantee that the metadata is valid, as the specific
killing store may not be executed. Using the metadata on the earlier
store is valid (it is the store we're removing, so on any execution
where its removal may be observed, it must be executed).
* The location is imprecise. For full overwrites the killing store
will always have a location that is larger or equal than the earlier
access location, so it's beneficial to use the earlier access
location. This is not the case for partial overwrites, in which
case either location might be smaller. There is some room for
improvement here.
Using the earlier access location means that we can no longer cache
which accesses are read for a given killing store, as we may be
querying different locations. However, it turns out that simply
dropping the cache has no notable impact on compile-time.
Differential Revision: https://reviews.llvm.org/D93523
2020-11-30 23:51:54 +01:00
|
|
|
KillingDef, Current, SILoc, SILocUnd, ScanLimit, WalkerStepLimit,
|
|
|
|
IsMemTerm, PartialLimit);
|
2020-03-20 08:51:29 +01:00
|
|
|
|
|
|
|
if (!Next) {
|
|
|
|
LLVM_DEBUG(dbgs() << " finished walk\n");
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
MemoryAccess *EarlierAccess = *Next;
|
|
|
|
LLVM_DEBUG(dbgs() << " Checking if we can kill " << *EarlierAccess);
|
|
|
|
if (isa<MemoryPhi>(EarlierAccess)) {
|
2020-04-24 18:48:03 +02:00
|
|
|
LLVM_DEBUG(dbgs() << "\n ... adding incoming values to worklist\n");
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
for (Value *V : cast<MemoryPhi>(EarlierAccess)->incoming_values()) {
|
2020-03-20 08:51:29 +01:00
|
|
|
MemoryAccess *IncomingAccess = cast<MemoryAccess>(V);
|
|
|
|
BasicBlock *IncomingBlock = IncomingAccess->getBlock();
|
[DSE,MemorySSA] Traverse use-def chain without MemSSA Walker.
For DSE with MemorySSA it is beneficial to manually traverse the
defining access, instead of using a MemorySSA walker, so we can
better control the number of steps together with other limits and
also weed out invalid/unprofitable paths early on.
This patch requires a follow-up patch to be most effective, which I will
share soon after putting this patch up.
This temporarily XFAIL's the limit tests, because we now explore more
MemoryDefs that may not alias/clobber the killing def. This will be
improved/fixed by the follow-up patch.
This patch also renames some `Dom*` variables to `Earlier*`, because the
dominance relation is not really used/important here and potentially
confusing.
This patch allows us to aggressively cut down compile time, geomean
-O3 -0.64%, ReleaseThinLTO -1.65%, at the expense of fewer stores
removed. Subsequent patches will increase the number of removed stores
again, while keeping compile-time in check.
http://llvm-compile-time-tracker.com/compare.php?from=d8e3294118a8c5f3f97688a704d5a05b67646012&to=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&stat=instructions
Reviewed By: asbirlea
Differential Revision: https://reviews.llvm.org/D86486
2020-08-27 10:52:19 +02:00
|
|
|
BasicBlock *PhiBlock = EarlierAccess->getBlock();
|
2020-03-20 08:51:29 +01:00
|
|
|
|
|
|
|
// We only consider incoming MemoryAccesses that come before the
|
|
|
|
// MemoryPhi. Otherwise we could discover candidates that do not
|
|
|
|
// strictly dominate our starting def.
|
|
|
|
if (State.PostOrderNumbers[IncomingBlock] >
|
|
|
|
State.PostOrderNumbers[PhiBlock])
|
|
|
|
ToCheck.insert(IncomingAccess);
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
2020-11-08 14:07:45 +01:00
|
|
|
auto *NextDef = cast<MemoryDef>(EarlierAccess);
|
2020-02-11 19:27:41 +01:00
|
|
|
Instruction *NI = NextDef->getMemoryInst();
|
2020-04-24 18:48:03 +02:00
|
|
|
LLVM_DEBUG(dbgs() << " (" << *NI << ")\n");
|
2020-06-22 17:24:27 +02:00
|
|
|
ToCheck.insert(NextDef->getDefiningAccess());
|
2020-08-28 11:31:30 +02:00
|
|
|
NumGetDomMemoryDefPassed++;
|
2020-02-11 19:27:41 +01:00
|
|
|
|
2020-02-21 17:55:18 +01:00
|
|
|
if (!DebugCounter::shouldExecute(MemorySSACounter))
|
2020-04-23 19:58:33 +02:00
|
|
|
continue;
|
2020-02-21 17:55:18 +01:00
|
|
|
|
2020-06-22 11:57:44 +02:00
|
|
|
MemoryLocation NILoc = *State.getLocForWriteEx(NI);
|
2020-06-15 16:40:07 +02:00
|
|
|
|
2020-08-28 11:31:30 +02:00
|
|
|
if (IsMemTerm) {
|
2020-07-31 11:09:54 +02:00
|
|
|
const Value *NIUnd = getUnderlyingObject(NILoc.Ptr);
|
2020-08-28 11:31:30 +02:00
|
|
|
if (SILocUnd != NIUnd)
|
2020-07-08 09:42:55 +02:00
|
|
|
continue;
|
2020-02-11 19:27:41 +01:00
|
|
|
LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI
|
|
|
|
<< "\n KILLER: " << *SI << '\n');
|
|
|
|
State.deleteDeadInstruction(NI);
|
|
|
|
++NumFastStores;
|
|
|
|
MadeChange = true;
|
2020-07-08 09:42:55 +02:00
|
|
|
} else {
|
|
|
|
// Check if NI overwrites SI.
|
|
|
|
int64_t InstWriteOffset, DepWriteOffset;
|
2021-05-14 10:16:51 +02:00
|
|
|
OverwriteResult OR = State.isOverwrite(SI, NI, SILoc, NILoc,
|
|
|
|
DepWriteOffset, InstWriteOffset);
|
2020-08-21 10:13:59 +02:00
|
|
|
if (OR == OW_MaybePartial) {
|
|
|
|
auto Iter = State.IOLs.insert(
|
|
|
|
std::make_pair<BasicBlock *, InstOverlapIntervalsTy>(
|
|
|
|
NI->getParent(), InstOverlapIntervalsTy()));
|
|
|
|
auto &IOL = Iter.first->second;
|
|
|
|
OR = isPartialOverwrite(SILoc, NILoc, DepWriteOffset, InstWriteOffset,
|
|
|
|
NI, IOL);
|
|
|
|
}
|
2020-07-08 09:42:55 +02:00
|
|
|
|
|
|
|
if (EnablePartialStoreMerging && OR == OW_PartialEarlierWithFullLater) {
|
|
|
|
auto *Earlier = dyn_cast<StoreInst>(NI);
|
|
|
|
auto *Later = dyn_cast<StoreInst>(SI);
|
2020-08-13 12:56:40 +02:00
|
|
|
// We are re-using tryToMergePartialOverlappingStores, which requires
|
|
|
|
// Earlier to domiante Later.
|
|
|
|
// TODO: implement tryToMergeParialOverlappingStores using MemorySSA.
|
|
|
|
if (Earlier && Later && DT.dominates(Earlier, Later)) {
|
|
|
|
if (Constant *Merged = tryToMergePartialOverlappingStores(
|
2020-08-23 16:55:48 +02:00
|
|
|
Earlier, Later, InstWriteOffset, DepWriteOffset, State.DL,
|
2020-08-22 09:36:35 +02:00
|
|
|
State.BatchAA, &DT)) {
|
2020-08-13 12:56:40 +02:00
|
|
|
|
|
|
|
// Update stored value of earlier store to merged constant.
|
|
|
|
Earlier->setOperand(0, Merged);
|
|
|
|
++NumModifiedStores;
|
|
|
|
MadeChange = true;
|
|
|
|
|
2020-10-30 10:32:43 +01:00
|
|
|
Shortend = true;
|
2020-08-13 12:56:40 +02:00
|
|
|
// Remove later store and remove any outstanding overlap intervals
|
|
|
|
// for the updated store.
|
|
|
|
State.deleteDeadInstruction(Later);
|
|
|
|
auto I = State.IOLs.find(Earlier->getParent());
|
|
|
|
if (I != State.IOLs.end())
|
|
|
|
I->second.erase(Earlier);
|
|
|
|
break;
|
|
|
|
}
|
2020-07-08 09:42:55 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (OR == OW_Complete) {
|
|
|
|
LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI
|
|
|
|
<< "\n KILLER: " << *SI << '\n');
|
|
|
|
State.deleteDeadInstruction(NI);
|
|
|
|
++NumFastStores;
|
|
|
|
MadeChange = true;
|
|
|
|
}
|
2020-03-20 08:51:29 +01:00
|
|
|
}
|
2020-02-11 19:27:41 +01:00
|
|
|
}
|
2020-10-30 10:32:43 +01:00
|
|
|
|
|
|
|
// Check if the store is a no-op.
|
|
|
|
if (!Shortend && isRemovable(SI) &&
|
|
|
|
State.storeIsNoop(KillingDef, SILoc, SILocUnd)) {
|
|
|
|
LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *SI << '\n');
|
|
|
|
State.deleteDeadInstruction(SI);
|
|
|
|
NumRedundantStores++;
|
|
|
|
MadeChange = true;
|
|
|
|
continue;
|
|
|
|
}
|
2020-02-11 19:27:41 +01:00
|
|
|
}
|
|
|
|
|
2020-02-23 16:39:15 +01:00
|
|
|
if (EnablePartialOverwriteTracking)
|
|
|
|
for (auto &KV : State.IOLs)
|
2020-09-02 21:06:58 +02:00
|
|
|
MadeChange |= removePartiallyOverlappedStores(State.DL, KV.second, TLI);
|
2020-02-23 16:39:15 +01:00
|
|
|
|
2020-06-24 10:56:35 +02:00
|
|
|
MadeChange |= State.eliminateDeadWritesAtEndOfFunction();
|
2020-02-11 19:27:41 +01:00
|
|
|
return MadeChange;
|
|
|
|
}
|
|
|
|
} // end anonymous namespace
|
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// DSE Pass
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// New pass manager entry point: fetch the required analyses, run MemorySSA
/// backed DSE on \p F, and report which analyses survive the transformation.
PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
  MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
  PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
  LoopInfo &LI = AM.getResult<LoopAnalysis>(F);

  bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);

#ifdef LLVM_ENABLE_STATS
  // Count the stores left behind, for statistics output only.
  if (AreStatisticsEnabled())
    for (auto &I : instructions(F))
      NumRemainingStores += isa<StoreInst>(&I);
#endif

  if (!Changed)
    return PreservedAnalyses::all();

  // DSE only deletes/modifies instructions; the CFG, MemorySSA and LoopInfo
  // are kept up to date and remain valid.
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  PA.preserve<MemorySSAAnalysis>();
  PA.preserve<LoopAnalysis>();
  return PA;
}
|
|
|
|
|
2016-07-10 13:28:51 +02:00
|
|
|
namespace {
|
2017-10-13 23:17:07 +02:00
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
/// A legacy pass for the legacy pass manager that wraps \c DSEPass.
|
|
|
|
class DSELegacyPass : public FunctionPass {
|
|
|
|
public:
|
2017-10-13 23:17:07 +02:00
|
|
|
static char ID; // Pass identification, replacement for typeid
|
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
DSELegacyPass() : FunctionPass(ID) {
|
|
|
|
initializeDSELegacyPassPass(*PassRegistry::getPassRegistry());
|
|
|
|
}
|
|
|
|
|
|
|
|
bool runOnFunction(Function &F) override {
|
|
|
|
if (skipFunction(F))
|
|
|
|
return false;
|
|
|
|
|
2020-02-11 19:27:41 +01:00
|
|
|
AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
|
|
|
|
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
|
|
|
const TargetLibraryInfo &TLI =
|
|
|
|
getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
|
2021-03-03 20:38:50 +01:00
|
|
|
MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
|
|
|
|
PostDominatorTree &PDT =
|
|
|
|
getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
|
2021-06-20 18:03:30 +02:00
|
|
|
LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
|
2020-02-11 19:27:41 +01:00
|
|
|
|
2021-06-20 18:03:30 +02:00
|
|
|
bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);
|
2020-04-25 16:02:02 +02:00
|
|
|
|
|
|
|
#ifdef LLVM_ENABLE_STATS
|
|
|
|
if (AreStatisticsEnabled())
|
|
|
|
for (auto &I : instructions(F))
|
|
|
|
NumRemainingStores += isa<StoreInst>(&I);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
return Changed;
|
2016-05-17 23:38:13 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
|
|
AU.setPreservesCFG();
|
|
|
|
AU.addRequired<AAResultsWrapperPass>();
|
|
|
|
AU.addRequired<TargetLibraryInfoWrapperPass>();
|
2020-02-11 15:26:15 +01:00
|
|
|
AU.addPreserved<GlobalsAAWrapperPass>();
|
2020-02-11 19:27:41 +01:00
|
|
|
AU.addRequired<DominatorTreeWrapperPass>();
|
|
|
|
AU.addPreserved<DominatorTreeWrapperPass>();
|
2021-03-03 20:38:50 +01:00
|
|
|
AU.addRequired<PostDominatorTreeWrapperPass>();
|
|
|
|
AU.addRequired<MemorySSAWrapperPass>();
|
|
|
|
AU.addPreserved<PostDominatorTreeWrapperPass>();
|
|
|
|
AU.addPreserved<MemorySSAWrapperPass>();
|
2021-06-20 18:03:30 +02:00
|
|
|
AU.addRequired<LoopInfoWrapperPass>();
|
|
|
|
AU.addPreserved<LoopInfoWrapperPass>();
|
2016-05-17 23:38:13 +02:00
|
|
|
}
|
|
|
|
};
|
2017-10-13 23:17:07 +02:00
|
|
|
|
2016-07-10 13:28:51 +02:00
|
|
|
} // end anonymous namespace
|
2016-05-17 23:38:13 +02:00
|
|
|
|
|
|
|
char DSELegacyPass::ID = 0;
|
2017-10-13 23:17:07 +02:00
|
|
|
|
2016-05-17 23:38:13 +02:00
|
|
|
INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false,
|
|
|
|
false)
|
|
|
|
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
|
2020-02-11 19:27:41 +01:00
|
|
|
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
|
2016-05-17 23:38:13 +02:00
|
|
|
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
|
|
|
|
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
|
2020-02-11 19:27:41 +01:00
|
|
|
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
|
2016-05-17 23:38:13 +02:00
|
|
|
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
|
|
|
|
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
|
2021-06-20 18:03:30 +02:00
|
|
|
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
|
2016-05-17 23:38:13 +02:00
|
|
|
INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
|
|
|
|
false)
|
|
|
|
|
|
|
|
FunctionPass *llvm::createDeadStoreEliminationPass() {
|
|
|
|
return new DSELegacyPass();
|
2007-07-12 23:41:30 +02:00
|
|
|
}
|