1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-26 04:32:44 +01:00
llvm-mirror/test/Transforms/SampleProfile/ctxsplit.ll
Hongtao Yu fd7e7fce85 [CSSPGO] Top-down processing order based on full profile.
Use profiled call edges to augment the top-down order. There are cases that the top-down order computed based on the static call graph doesn't reflect real execution order. For example:

1. Incomplete static call graph due to unknown indirect call targets. Adjusting the order by considering indirect call edges from the profile can enable the inlining of indirect call targets by allowing the caller processed before them.

2. Mutual call edges in an SCC. The static processing order computed for an SCC may not reflect the call contexts in the context-sensitive profile, thus may cause potential inlining to be overlooked. The function order in one SCC is being adjusted to a top-down order based on the profile to favor more inlining.

3. Transitive indirect call edges due to inlining. When a callee function is inlined into into a caller function in LTO prelink, every call edge originated from the callee will be transferred to the caller. If any of the transferred edges is indirect, the original profiled indirect edge, even if considered, would not enforce a top-down order from the caller to the potential indirect call target in LTO postlink since the inlined callee is gone from the static call graph.

4. #3 can happen even for direct call targets, due to functions defined in header files. Header functions, when included into source files, are defined multiple times but only one definition survives due to ODR. Therefore, the LTO prelink inlining done on those dropped definitions can be useless based on a local file scope. More importantly, the inlinee, once fully inlined to a to-be-dropped inliner, will have no profile to consume when its outlined version is compiled. This can lead to a profile-less prelink compilation for the outlined version of the inlinee function which may be called from external modules. while this isn't easy to fix, we rely on the postlink AutoFDO pipeline to optimize the inlinee. Since the survived copy of the inliner (defined in headers) can be inlined in its local scope in prelink, it may not exist in the merged IR in postlink, and we'll need the profiled call edges to enforce a top-down order for the rest of the functions.

Considering those cases, a profiled call graph completely independent of the static call graph is constructed based on profile data, where function objects are not even needed to handle case #3 and case 4.

I'm seeing an average 0.4% perf win out of SPEC2017. For certain benchmark such as Xalanbmk and GCC, the win is bigger, above 2%.

The change is an enhancement to https://reviews.llvm.org/D95988.

Reviewed By: wmi, wenlei

Differential Revision: https://reviews.llvm.org/D99351
2021-03-30 10:42:22 -07:00

60 lines
3.1 KiB
LLVM

; Check the nonflattened part of the ctxsplit profile will be read in thinlto
; postlink phase while flattened part of the ctxsplit profile will not be read.
; RUN: opt < %s -passes='thinlto<O2>' -pgo-kind=pgo-sample-use-pipeline -use-profiled-call-graph=0 -profile-file=%S/Inputs/ctxsplit.extbinary.afdo -S | FileCheck %s --check-prefix=POSTLINK
;
; Check both the flattened and nonflattened parts of the ctxsplit profile will
; be read in thinlto prelink phase.
; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -use-profiled-call-graph=0 -profile-file=%S/Inputs/ctxsplit.extbinary.afdo -S | FileCheck %s --check-prefix=PRELINK
;
; Check both the flattened and nonflattened parts of the ctxsplit profile will
; be read in non-thinlto mode.
; RUN: opt < %s -passes='default<O2>' -pgo-kind=pgo-sample-use-pipeline -use-profiled-call-graph=0 -profile-file=%S/Inputs/ctxsplit.extbinary.afdo -S | FileCheck %s --check-prefix=NOTHINLTO
; POSTLINK: define dso_local i32 @goo() {{.*}} !prof ![[ENTRY1:[0-9]+]] {
; POSTLINK: define dso_local i32 @foo() {{.*}} !prof ![[ENTRY2:[0-9]+]] {
; POSTLINK: ![[ENTRY1]] = !{!"function_entry_count", i64 1001}
; POSTLINK: ![[ENTRY2]] = !{!"function_entry_count", i64 -1}
; PRELINK: define dso_local i32 @goo() {{.*}} !prof ![[ENTRY1:[0-9]+]] {
; PRELINK: define dso_local i32 @foo() {{.*}} !prof ![[ENTRY2:[0-9]+]] {
; PRELINK: ![[ENTRY1]] = !{!"function_entry_count", i64 1001}
; PRELINK: ![[ENTRY2]] = !{!"function_entry_count", i64 3001}
; NOTHINLTO: define dso_local i32 @goo() {{.*}} !prof ![[ENTRY1:[0-9]+]] {
; NOTHINLTO: define dso_local i32 @foo() {{.*}} !prof ![[ENTRY2:[0-9]+]] {
; NOTHINLTO: ![[ENTRY1]] = !{!"function_entry_count", i64 1001}
; NOTHINLTO: ![[ENTRY2]] = !{!"function_entry_count", i64 3001}
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
; Function Attrs: norecurse nounwind readnone uwtable
define dso_local i32 @goo() #0 !dbg !10 {
entry:
ret i32 -1, !dbg !11
}
; Function Attrs: norecurse nounwind readnone uwtable
define dso_local i32 @foo() #0 !dbg !7 {
entry:
ret i32 -1, !dbg !9
}
attributes #0 = { "use-sample-profile" }
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4, !5}
!llvm.ident = !{!6}
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 8.0.0 (trunk 345241)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, nameTableKind: None)
!1 = !DIFile(filename: "a.c", directory: "")
!2 = !{}
!3 = !{i32 2, !"Dwarf Version", i32 4}
!4 = !{i32 2, !"Debug Info Version", i32 3}
!5 = !{i32 1, !"wchar_size", i32 4}
!6 = !{!"clang version 8.0.0 (trunk 345241)"}
!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, retainedNodes: !2)
!8 = !DISubroutineType(types: !2)
!9 = !DILocation(line: 2, column: 3, scope: !7)
!10 = distinct !DISubprogram(name: "goo", scope: !1, file: !1, line: 8, type: !8, isLocal: false, isDefinition: true, scopeLine: 8, isOptimized: true, unit: !0, retainedNodes: !2)
!11 = !DILocation(line: 10, column: 3, scope: !10)