1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-02-01 05:01:59 +01:00
llvm-mirror/test/Transforms/InstCombine/phi-aware-aggregate-reconstruction.ll
Roman Lebedev e44f2bc0d5 [InstCombine] Aggregate reconstruction simplification (PR47060)
This pattern happens in clang C++ exception lowering code, on unwind branch.
We end up having a `landingpad` block after each `invoke`, where RAII
cleanup is performed, and the elements of an aggregate `{i8*, i32}`
holding exception info are `extractvalue`'d, and we then branch to common block
that takes extracted `i8*` and `i32` elements (via `phi` nodes),
form a new aggregate, and finally `resume`'s the exception.

The problem is that, if the cleanup block is effectively empty,
it shouldn't be there, there shouldn't be that `landingpad` and `resume`,
said `invoke` should be a  `call`.

Indeed, we do that simplification in e.g. SimplifyCFG `SimplifyCFGOpt::simplifyResume()`.
But the thing is, all this extra `extractvalue` + `phi` + `insertvalue` cruft,
while it is pointless, does not look like "empty cleanup block".
So the `SimplifyCFGOpt::simplifyResume()` fails, and the exception is has
higher cost than it could have on unwind branch :S

This doesn't happen *that* often, but it will basically happen once per C++
function with complex CFG that called more than one other function
that isn't known to be `nounwind`.

I think, this is a missing fold in InstCombine, so i've implemented it.

I think, the algorithm/implementation is rather self-explanatory:
1. Find a chain of `insertvalue`'s that fully tell us the initializer of the aggregate.
2. For each element, try to find from which aggregate it was extracted.
   If it was extracted from the aggregate with identical type,
   from identical element index, great.
3. If all elements were found to have been extracted from the same aggregate,
   then we can just use said original source aggregate directly,
   instead of re-creating it.
4. If we fail to find said aggregate when looking only in the current block,
   we need be PHI-aware - we might have different source aggregate when coming
   from each predecessor.

I'm not sure if this already handles everything, and there are some FIXME's,
i'll deal with all that later in followups.

I'd be fine with going with post-commit review here code-wise,
but just in case there are thoughts, i'm posting this.

On RawSpeed, for example, this has the following effect:
```
| statistic name                                    | baseline | proposed |     Δ |       % | abs(%) |
|---------------------------------------------------|---------:|---------:|------:|--------:|-------:|
| instcombine.NumAggregateReconstructionsSimplified |        0 |     1253 |  1253 |   0.00% |  0.00% |
| simplifycfg.NumInvokes                            |      948 |     1355 |   407 |  42.93% | 42.93% |
| instcount.NumInsertValueInst                      |     4382 |     3210 | -1172 | -26.75% | 26.75% |
| simplifycfg.NumSinkCommonCode                     |      574 |      458 |  -116 | -20.21% | 20.21% |
| simplifycfg.NumSinkCommonInstrs                   |     1154 |      921 |  -233 | -20.19% | 20.19% |
| instcount.NumExtractValueInst                     |    29017 |    26397 | -2620 |  -9.03% |  9.03% |
| instcombine.NumDeadInst                           |   166618 |   174705 |  8087 |   4.85% |  4.85% |
| instcount.NumPHIInst                              |    51526 |    50678 |  -848 |  -1.65% |  1.65% |
| instcount.NumLandingPadInst                       |    20865 |    20609 |  -256 |  -1.23% |  1.23% |
| instcount.NumInvokeInst                           |    34023 |    33675 |  -348 |  -1.02% |  1.02% |
| simplifycfg.NumSimpl                              |   113634 |   114708 |  1074 |   0.95% |  0.95% |
| instcombine.NumSunkInst                           |    15030 |    14930 |  -100 |  -0.67% |  0.67% |
| instcount.TotalBlocks                             |   219544 |   219024 |  -520 |  -0.24% |  0.24% |
| instcombine.NumCombined                           |   644562 |   645805 |  1243 |   0.19% |  0.19% |
| instcount.TotalInsts                              |  2139506 |  2135377 | -4129 |  -0.19% |  0.19% |
| instcount.NumBrInst                               |   156988 |   156821 |  -167 |  -0.11% |  0.11% |
| instcount.NumCallInst                             |  1206144 |  1207076 |   932 |   0.08% |  0.08% |
| instcount.NumResumeInst                           |     5193 |     5190 |    -3 |  -0.06% |  0.06% |
| asm-printer.EmittedInsts                          |   948580 |   948299 |  -281 |  -0.03% |  0.03% |
| instcount.TotalFuncs                              |    11509 |    11507 |    -2 |  -0.02% |  0.02% |
| inline.NumDeleted                                 |    97595 |    97597 |     2 |   0.00% |  0.00% |
| inline.NumInlined                                 |   210514 |   210522 |     8 |   0.00% |  0.00% |
```
So we manage to increase the amount of `invoke` -> `call` conversions in SimplifyCFG by almost a half,
and there is a very apparent decrease in instruction and basic block count.

On vanilla llvm-test-suite:
```
| statistic name                                    | baseline | proposed |     Δ |       % | abs(%) |
|---------------------------------------------------|---------:|---------:|------:|--------:|-------:|
| instcombine.NumAggregateReconstructionsSimplified |        0 |      744 |   744 |   0.00% |  0.00% |
| instcount.NumInsertValueInst                      |     2705 |     2053 |  -652 | -24.10% | 24.10% |
| simplifycfg.NumInvokes                            |     1212 |     1424 |   212 |  17.49% | 17.49% |
| instcount.NumExtractValueInst                     |    21681 |    20139 | -1542 |  -7.11% |  7.11% |
| simplifycfg.NumSinkCommonInstrs                   |    14575 |    14361 |  -214 |  -1.47% |  1.47% |
| simplifycfg.NumSinkCommonCode                     |     6815 |     6743 |   -72 |  -1.06% |  1.06% |
| instcount.NumLandingPadInst                       |    14851 |    14712 |  -139 |  -0.94% |  0.94% |
| instcount.NumInvokeInst                           |    27510 |    27332 |  -178 |  -0.65% |  0.65% |
| instcombine.NumDeadInst                           |  1438173 |  1443371 |  5198 |   0.36% |  0.36% |
| instcount.NumResumeInst                           |     2880 |     2872 |    -8 |  -0.28% |  0.28% |
| instcombine.NumSunkInst                           |    55187 |    55076 |  -111 |  -0.20% |  0.20% |
| instcount.NumPHIInst                              |   321366 |   320916 |  -450 |  -0.14% |  0.14% |
| instcount.TotalBlocks                             |   886816 |   886493 |  -323 |  -0.04% |  0.04% |
| instcount.TotalInsts                              |  7663845 |  7661108 | -2737 |  -0.04% |  0.04% |
| simplifycfg.NumSimpl                              |   886791 |   887171 |   380 |   0.04% |  0.04% |
| instcount.NumCallInst                             |   553552 |   553733 |   181 |   0.03% |  0.03% |
| instcombine.NumCombined                           |  3200512 |  3201202 |   690 |   0.02% |  0.02% |
| instcount.NumBrInst                               |   741794 |   741656 |  -138 |  -0.02% |  0.02% |
| simplifycfg.NumHoistCommonInstrs                  |    14443 |    14445 |     2 |   0.01% |  0.01% |
| asm-printer.EmittedInsts                          |  7978085 |  7977916 |  -169 |   0.00% |  0.00% |
| inline.NumDeleted                                 |    73188 |    73189 |     1 |   0.00% |  0.00% |
| inline.NumInlined                                 |   291959 |   291968 |     9 |   0.00% |  0.00% |
```
Roughly similar effect, less instructions and blocks total.

See also: rGe492f0e03b01a5e4ec4b6333abb02d303c3e479e.

Compile-time wise, this appears to be roughly geomean-neutral:
http://llvm-compile-time-tracker.com/compare.php?from=39617aaed95ac00957979bc1525598c1be80e85e&to=b59866cf30420da8f8e3ca239ed3bec577b23387&stat=instructions

And this is a win size-wize in general:
http://llvm-compile-time-tracker.com/compare.php?from=39617aaed95ac00957979bc1525598c1be80e85e&to=b59866cf30420da8f8e3ca239ed3bec577b23387&stat=size-text

See https://bugs.llvm.org/show_bug.cgi?id=47060

Reviewed By: spatel

Differential Revision: https://reviews.llvm.org/D85787
2020-08-16 23:27:56 +03:00

316 lines
11 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -S -instcombine < %s | FileCheck %s
declare void @foo()
declare void @bar()
declare void @baz()
declare void @qux()
declare i1 @geni1()
declare void @usei32(i32)
declare void @usei32i32agg({ i32, i32 })
; Most basic test - diamond structure
define { i32, i32 } @test0({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i1 %c) {
; CHECK-LABEL: @test0(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]]
; CHECK: left:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[END:%.*]]
; CHECK: right:
; CHECK-NEXT: call void @bar()
; CHECK-NEXT: br label [[END]]
; CHECK: end:
; CHECK-NEXT: [[I8:%.*]] = phi { i32, i32 } [ [[AGG_RIGHT:%.*]], [[RIGHT]] ], [ [[AGG_LEFT:%.*]], [[LEFT]] ]
; CHECK-NEXT: call void @baz()
; CHECK-NEXT: ret { i32, i32 } [[I8]]
;
entry:
br i1 %c, label %left, label %right
left:
%i0 = extractvalue { i32, i32 } %agg_left, 0
%i2 = extractvalue { i32, i32 } %agg_left, 1
call void @foo()
br label %end
right:
%i3 = extractvalue { i32, i32 } %agg_right, 0
%i4 = extractvalue { i32, i32 } %agg_right, 1
call void @bar()
br label %end
end:
%i5 = phi i32 [ %i0, %left ], [ %i3, %right ]
%i6 = phi i32 [ %i2, %left ], [ %i4, %right ]
call void @baz()
%i7 = insertvalue { i32, i32 } undef, i32 %i5, 0
%i8 = insertvalue { i32, i32 } %i7, i32 %i6, 1
ret { i32, i32 } %i8
}
; Second element is coming from wrong aggregate
define { i32, i32 } @negative_test1({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i1 %c) {
; CHECK-LABEL: @negative_test1(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]]
; CHECK: left:
; CHECK-NEXT: [[I4:%.*]] = extractvalue { i32, i32 } [[AGG_RIGHT:%.*]], 1
; CHECK-NEXT: [[I0:%.*]] = extractvalue { i32, i32 } [[AGG_LEFT:%.*]], 0
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[END:%.*]]
; CHECK: right:
; CHECK-NEXT: [[I3:%.*]] = extractvalue { i32, i32 } [[AGG_RIGHT]], 0
; CHECK-NEXT: [[I2:%.*]] = extractvalue { i32, i32 } [[AGG_LEFT]], 1
; CHECK-NEXT: call void @bar()
; CHECK-NEXT: br label [[END]]
; CHECK: end:
; CHECK-NEXT: [[I5:%.*]] = phi i32 [ [[I0]], [[LEFT]] ], [ [[I3]], [[RIGHT]] ]
; CHECK-NEXT: [[I6:%.*]] = phi i32 [ [[I4]], [[LEFT]] ], [ [[I2]], [[RIGHT]] ]
; CHECK-NEXT: call void @baz()
; CHECK-NEXT: [[I7:%.*]] = insertvalue { i32, i32 } undef, i32 [[I5]], 0
; CHECK-NEXT: [[I8:%.*]] = insertvalue { i32, i32 } [[I7]], i32 [[I6]], 1
; CHECK-NEXT: ret { i32, i32 } [[I8]]
;
entry:
%i0 = extractvalue { i32, i32 } %agg_left, 0
%i2 = extractvalue { i32, i32 } %agg_left, 1
%i3 = extractvalue { i32, i32 } %agg_right, 0
%i4 = extractvalue { i32, i32 } %agg_right, 1
br i1 %c, label %left, label %right
left:
call void @foo()
br label %end
right:
call void @bar()
br label %end
end:
%i5 = phi i32 [ %i0, %left ], [ %i3, %right ]
%i6 = phi i32 [ %i4, %left ], [ %i2, %right ]
call void @baz()
%i7 = insertvalue { i32, i32 } undef, i32 %i5, 0
%i8 = insertvalue { i32, i32 } %i7, i32 %i6, 1
ret { i32, i32 } %i8
}
; When coming from %left, elements are swapped
define { i32, i32 } @negative_test2({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i1 %c) {
; CHECK-LABEL: @negative_test2(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]]
; CHECK: left:
; CHECK-NEXT: [[I2:%.*]] = extractvalue { i32, i32 } [[AGG_LEFT:%.*]], 1
; CHECK-NEXT: [[I0:%.*]] = extractvalue { i32, i32 } [[AGG_LEFT]], 0
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[END:%.*]]
; CHECK: right:
; CHECK-NEXT: [[I4:%.*]] = extractvalue { i32, i32 } [[AGG_RIGHT:%.*]], 1
; CHECK-NEXT: [[I3:%.*]] = extractvalue { i32, i32 } [[AGG_RIGHT]], 0
; CHECK-NEXT: call void @bar()
; CHECK-NEXT: br label [[END]]
; CHECK: end:
; CHECK-NEXT: [[I5:%.*]] = phi i32 [ [[I2]], [[LEFT]] ], [ [[I3]], [[RIGHT]] ]
; CHECK-NEXT: [[I6:%.*]] = phi i32 [ [[I0]], [[LEFT]] ], [ [[I4]], [[RIGHT]] ]
; CHECK-NEXT: call void @baz()
; CHECK-NEXT: [[I7:%.*]] = insertvalue { i32, i32 } undef, i32 [[I5]], 0
; CHECK-NEXT: [[I8:%.*]] = insertvalue { i32, i32 } [[I7]], i32 [[I6]], 1
; CHECK-NEXT: ret { i32, i32 } [[I8]]
;
entry:
%i0 = extractvalue { i32, i32 } %agg_left, 0
%i2 = extractvalue { i32, i32 } %agg_left, 1
%i3 = extractvalue { i32, i32 } %agg_right, 0
%i4 = extractvalue { i32, i32 } %agg_right, 1
br i1 %c, label %left, label %right
left:
call void @foo()
br label %end
right:
call void @bar()
br label %end
end:
%i5 = phi i32 [ %i2, %left ], [ %i3, %right ]
%i6 = phi i32 [ %i0, %left ], [ %i4, %right ]
call void @baz()
%i7 = insertvalue { i32, i32 } undef, i32 %i5, 0
%i8 = insertvalue { i32, i32 } %i7, i32 %i6, 1
ret { i32, i32 } %i8
}
; FIXME: we should probably be able to handle multiple levels of PHI indirection
define { i32, i32 } @test3({ i32, i32 } %agg_00, { i32, i32 } %agg_01, { i32, i32 } %agg_10, i1 %c0, i1 %c1) {
; CHECK-LABEL: @test3(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[C0:%.*]], label [[BB0_DISPATCH:%.*]], label [[BB10:%.*]]
; CHECK: bb0.dispatch:
; CHECK-NEXT: br i1 [[C1:%.*]], label [[BB00:%.*]], label [[BB01:%.*]]
; CHECK: bb00:
; CHECK-NEXT: [[I0:%.*]] = extractvalue { i32, i32 } [[AGG_00:%.*]], 0
; CHECK-NEXT: [[I1:%.*]] = extractvalue { i32, i32 } [[AGG_00]], 1
; CHECK-NEXT: br label [[BB0_MERGE:%.*]]
; CHECK: bb01:
; CHECK-NEXT: [[I2:%.*]] = extractvalue { i32, i32 } [[AGG_01:%.*]], 0
; CHECK-NEXT: [[I3:%.*]] = extractvalue { i32, i32 } [[AGG_01]], 1
; CHECK-NEXT: br label [[BB0_MERGE]]
; CHECK: bb0.merge:
; CHECK-NEXT: [[I4:%.*]] = phi i32 [ [[I0]], [[BB00]] ], [ [[I2]], [[BB01]] ]
; CHECK-NEXT: [[I5:%.*]] = phi i32 [ [[I1]], [[BB00]] ], [ [[I3]], [[BB01]] ]
; CHECK-NEXT: br label [[END:%.*]]
; CHECK: bb10:
; CHECK-NEXT: [[I6:%.*]] = extractvalue { i32, i32 } [[AGG_10:%.*]], 0
; CHECK-NEXT: [[I7:%.*]] = extractvalue { i32, i32 } [[AGG_10]], 1
; CHECK-NEXT: br label [[END]]
; CHECK: end:
; CHECK-NEXT: [[I8:%.*]] = phi i32 [ [[I4]], [[BB0_MERGE]] ], [ [[I6]], [[BB10]] ]
; CHECK-NEXT: [[I9:%.*]] = phi i32 [ [[I5]], [[BB0_MERGE]] ], [ [[I7]], [[BB10]] ]
; CHECK-NEXT: call void @baz()
; CHECK-NEXT: [[I10:%.*]] = insertvalue { i32, i32 } undef, i32 [[I8]], 0
; CHECK-NEXT: [[I11:%.*]] = insertvalue { i32, i32 } [[I10]], i32 [[I9]], 1
; CHECK-NEXT: ret { i32, i32 } [[I11]]
;
entry:
br i1 %c0, label %bb0.dispatch, label %bb10
bb0.dispatch:
br i1 %c1, label %bb00, label %bb01
bb00:
%i0 = extractvalue { i32, i32 } %agg_00, 0
%i1 = extractvalue { i32, i32 } %agg_00, 1
br label %bb0.merge
bb01:
%i2 = extractvalue { i32, i32 } %agg_01, 0
%i3 = extractvalue { i32, i32 } %agg_01, 1
br label %bb0.merge
bb0.merge:
%i4 = phi i32 [ %i0, %bb00 ], [ %i2, %bb01 ]
%i5 = phi i32 [ %i1, %bb00 ], [ %i3, %bb01 ]
br label %end
bb10:
%i6 = extractvalue { i32, i32 } %agg_10, 0
%i7 = extractvalue { i32, i32 } %agg_10, 1
br label %end
end:
%i8 = phi i32 [ %i4, %bb0.merge ], [ %i6, %bb10 ]
%i9 = phi i32 [ %i5, %bb0.merge ], [ %i7, %bb10 ]
call void @baz()
%i10 = insertvalue { i32, i32 } undef, i32 %i8, 0
%i11 = insertvalue { i32, i32 } %i10, i32 %i9, 1
ret { i32, i32 } %i11
}
; Not sure what should happen for cycles.
define { i32, i32 } @test4({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i1 %c0) {
; CHECK-LABEL: @test4(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[C0:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]]
; CHECK: left:
; CHECK-NEXT: [[I0:%.*]] = extractvalue { i32, i32 } [[AGG_LEFT:%.*]], 0
; CHECK-NEXT: [[I2:%.*]] = extractvalue { i32, i32 } [[AGG_LEFT]], 1
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[MIDDLE:%.*]]
; CHECK: right:
; CHECK-NEXT: [[I3:%.*]] = extractvalue { i32, i32 } [[AGG_RIGHT:%.*]], 0
; CHECK-NEXT: [[I4:%.*]] = extractvalue { i32, i32 } [[AGG_RIGHT]], 1
; CHECK-NEXT: call void @bar()
; CHECK-NEXT: br label [[MIDDLE]]
; CHECK: middle:
; CHECK-NEXT: [[I5:%.*]] = phi i32 [ [[I0]], [[LEFT]] ], [ [[I3]], [[RIGHT]] ], [ [[I5]], [[MIDDLE]] ]
; CHECK-NEXT: [[I6:%.*]] = phi i32 [ [[I2]], [[LEFT]] ], [ [[I4]], [[RIGHT]] ], [ [[I6]], [[MIDDLE]] ]
; CHECK-NEXT: call void @baz()
; CHECK-NEXT: [[C1:%.*]] = call i1 @geni1()
; CHECK-NEXT: br i1 [[C1]], label [[END:%.*]], label [[MIDDLE]]
; CHECK: end:
; CHECK-NEXT: [[I7:%.*]] = insertvalue { i32, i32 } undef, i32 [[I5]], 0
; CHECK-NEXT: [[I8:%.*]] = insertvalue { i32, i32 } [[I7]], i32 [[I6]], 1
; CHECK-NEXT: ret { i32, i32 } [[I8]]
;
entry:
br i1 %c0, label %left, label %right
left:
%i0 = extractvalue { i32, i32 } %agg_left, 0
%i2 = extractvalue { i32, i32 } %agg_left, 1
call void @foo()
br label %middle
right:
%i3 = extractvalue { i32, i32 } %agg_right, 0
%i4 = extractvalue { i32, i32 } %agg_right, 1
call void @bar()
br label %middle
middle:
%i5 = phi i32 [ %i0, %left ], [ %i3, %right ], [ %i5, %middle ]
%i6 = phi i32 [ %i2, %left ], [ %i4, %right ], [ %i6, %middle ]
call void @baz()
%i7 = insertvalue { i32, i32 } undef, i32 %i5, 0
%i8 = insertvalue { i32, i32 } %i7, i32 %i6, 1
%c1 = call i1 @geni1()
br i1 %c1, label %end, label %middle
end:
ret { i32, i32 } %i8
}
; But here since we start without an explicit self-cycle, we already manage to fold it.
define { i32, i32 } @test5({ i32, i32 } %agg_left, { i32, i32 } %agg_right, i1 %c0) {
; CHECK-LABEL: @test5(
; CHECK-NEXT: entry:
; CHECK-NEXT: br i1 [[C0:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]]
; CHECK: left:
; CHECK-NEXT: call void @foo()
; CHECK-NEXT: br label [[MIDDLE:%.*]]
; CHECK: right:
; CHECK-NEXT: call void @bar()
; CHECK-NEXT: br label [[MIDDLE]]
; CHECK: middle:
; CHECK-NEXT: [[I8:%.*]] = phi { i32, i32 } [ [[I8]], [[MIDDLE]] ], [ [[AGG_RIGHT:%.*]], [[RIGHT]] ], [ [[AGG_LEFT:%.*]], [[LEFT]] ]
; CHECK-NEXT: call void @baz()
; CHECK-NEXT: [[C1:%.*]] = call i1 @geni1()
; CHECK-NEXT: br i1 [[C1]], label [[END:%.*]], label [[MIDDLE]]
; CHECK: end:
; CHECK-NEXT: ret { i32, i32 } [[I8]]
;
entry:
br i1 %c0, label %left, label %right
left:
%i0 = extractvalue { i32, i32 } %agg_left, 0
%i2 = extractvalue { i32, i32 } %agg_left, 1
call void @foo()
br label %middle
right:
%i3 = extractvalue { i32, i32 } %agg_right, 0
%i4 = extractvalue { i32, i32 } %agg_right, 1
call void @bar()
br label %middle
middle:
%i5 = phi i32 [ %i0, %left ], [ %i3, %right ], [ %i9, %middle ]
%i6 = phi i32 [ %i2, %left ], [ %i4, %right ], [ %i10, %middle ]
call void @baz()
%i7 = insertvalue { i32, i32 } undef, i32 %i5, 0
%i8 = insertvalue { i32, i32 } %i7, i32 %i6, 1
%i9 = extractvalue { i32, i32 } %i8, 0
%i10 = extractvalue { i32, i32 } %i8, 1
%c1 = call i1 @geni1()
br i1 %c1, label %end, label %middle
end:
ret { i32, i32 } %i8
}