[CSSPGO] Load context profile for external functions in PreLink and populate ThinLTO import list
For ThinLTO's prelink compilation, we need to put external inline candidates into an import list attached to function's entry count metadata. This enables ThinLink to treat such cross module callee as hot in summary index, and later helps postlink to import them for profile guided cross module inlining.
For AutoFDO, the import list is retrieved by traversing the nested inlinee functions. For CSSPGO, since the profile is flattened, a few things need to happen for it to work:
- When loading input profile in extended binary format, we need to load all child context profile whose parent is in current module, so context trie for current module includes potential cross module inlinee.
- In order to make the above happen, we need to know whether the input profile is a CSSPGO profile before we start reading function profiles, hence a flag for the profile summary section is added.
- When searching for cross module inline candidate, we need to walk through the context trie instead of nested inlinee profile (callsite sample of AutoFDO profile).
- Now that we have more accurate counts with CSSPGO, we switched to using entry count instead of total count to decide if an external callee is potentially beneficial to inline. This makes it consistent with how we determine whether a call target is a potential inline candidate.
Differential Revision: https://reviews.llvm.org/D98590
2021-03-13 22:55:28 +01:00
; Make sure Import GUID list for ThinLTO properly set for CSSPGO
; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/csspgo-import-list.prof -S | FileCheck %s
; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/csspgo-import-list.prof.extbin -S | FileCheck %s
; External callees of @main. Only their declarations are present in this
; module; both take a single i32 argument and return an i32.
declare i32 @_Z5funcBi(i32 %x)
declare i32 @_Z5funcAi(i32 %x)
; @main: a single counted loop.
;   %x.011 starts at 300000 and is decremented by 1 on every iteration.
;   %r.010 starts at 0 and accumulates the results of both calls.
; Each iteration calls @_Z5funcBi(%x.011) and @_Z5funcAi(%x.011 + 1), adds
; both results to the accumulator, and leaves the loop once the value of
; %x.011 tested in that iteration is 0. The final accumulator is returned.
define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 {
entry:
  br label %for.body, !dbg !25

for.cond.cleanup:                                 ; preds = %for.body
  ; Loop exit: return the running sum computed in the last iteration.
  ret i32 %add3, !dbg !27

for.body:                                         ; preds = %for.body, %entry
  %x.011 = phi i32 [ 300000, %entry ], [ %dec, %for.body ]
  %r.010 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
  ; Two calls to functions that are only declared in this module.
  %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !32
  %add = add nuw nsw i32 %x.011, 1, !dbg !31
  %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !28
  ; Fold both call results into the accumulator.
  %add2 = add i32 %call, %r.010, !dbg !34
  %add3 = add i32 %add2, %call1, !dbg !35
  ; The exit test uses the pre-decrement value of the counter.
  %dec = add nsw i32 %x.011, -1, !dbg !36
  %cmp = icmp eq i32 %x.011, 0, !dbg !38
  br i1 %cmp, label %for.cond.cleanup, label %for.body, !dbg !25
}
; Make sure the ImportGUID stays with entry count metadata for ThinLTO-PreLink
; CHECK: distinct !DISubprogram(name: "main"
; CHECK: !{!"function_entry_count", i64 3, i64 446061515086924981, i64 3815895320998406042, i64 7102633082150537521, i64 -2862076748587597320}
2021-05-24 19:43:40 +02:00
; Attribute group #0 is referenced by @main. Besides the usual codegen
; attributes it carries the "use-sample-profile" string attribute.
attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
[CSSPGO] Load context profile for external functions in PreLink and populate ThinLTO import list
For ThinLTO's prelink compilation, we need to put external inline candidates into an import list attached to function's entry count metadata. This enables ThinLink to treat such cross module callee as hot in summary index, and later helps postlink to import them for profile guided cross module inlining.
For AutoFDO, the import list is retrieved by traversing the nested inlinee functions. For CSSPGO, since the profile is flattened, a few things need to happen for it to work:
- When loading input profile in extended binary format, we need to load all child context profile whose parent is in current module, so context trie for current module includes potential cross module inlinee.
- In order to make the above happen, we need to know whether the input profile is a CSSPGO profile before we start reading function profiles, hence a flag for the profile summary section is added.
- When searching for cross module inline candidate, we need to walk through the context trie instead of nested inlinee profile (callsite sample of AutoFDO profile).
- Now that we have more accurate counts with CSSPGO, we switched to using entry count instead of total count to decide if an external callee is potentially beneficial to inline. This makes it consistent with how we determine whether a call target is a potential inline candidate.
Differential Revision: https://reviews.llvm.org/D98590
2021-03-13 22:55:28 +01:00
; Module-level named metadata: the compile-unit list, the module flags and
; the producer identification string.
!llvm.dbg.cu = !{!2}
!llvm.module.flags = !{!14, !15, !16}
!llvm.ident = !{!17}
; Global variable "factor" and the compile unit that owns it.
!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
!1 = distinct !DIGlobalVariable(name: "factor", scope: !2, file: !3, line: 21, type: !13, isLocal: false, isDefinition: true)
!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !12, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
!3 = !DIFile(filename: "merged.cpp", directory: "/local/autofdo")
!4 = !{}
; Retained subprogram declarations: funcA, funcB and funcLeaf.
!5 = !{!6, !10, !11}
!6 = !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 6, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
; Shared subroutine type !7 (type list !8 = { int, int }).
!7 = !DISubroutineType(types: !8)
!8 = !{!9, !9}
!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!10 = !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 7, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
!11 = !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 22, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
!12 = !{!0}
; Type of "factor": volatile-qualified int.
!13 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !9)
; Module flags and the ident string.
!14 = !{i32 7, !"Dwarf Version", i32 4}
!15 = !{i32 2, !"Debug Info Version", i32 3}
!16 = !{i32 1, !"wchar_size", i32 4}
!17 = !{!"clang version 11.0.0"}
; Subprogram for @main (attached via !dbg !18) and its local variables.
!18 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 11, type: !19, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21)
!19 = !DISubroutineType(types: !20)
!20 = !{!9}
!21 = !{!22, !23}
!22 = !DILocalVariable(name: "r", scope: !18, file: !3, line: 12, type: !9)
!23 = !DILocalVariable(name: "x", scope: !24, file: !3, line: 13, type: !9)
; Lexical scopes and source locations referenced by the instructions in @main.
; Several locations sit in DILexicalBlockFile scopes carrying discriminators.
!24 = distinct !DILexicalBlock(scope: !18, file: !3, line: 13, column: 3)
!25 = !DILocation(line: 13, column: 3, scope: !26)
!26 = !DILexicalBlockFile(scope: !24, file: !3, discriminator: 2)
!27 = !DILocation(line: 17, column: 3, scope: !18)
!28 = !DILocation(line: 13, column: 10, scope: !29)
!29 = distinct !DILexicalBlock(scope: !30, file: !3, line: 13, column: 37)
!30 = distinct !DILexicalBlock(scope: !24, file: !3, line: 13, column: 3)
!31 = !DILocation(line: 14, column: 29, scope: !29)
!32 = !DILocation(line: 14, column: 21, scope: !33)
!33 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 2)
!34 = !DILocation(line: 14, column: 19, scope: !29)
!35 = !DILocation(line: 14, column: 7, scope: !29)
!36 = !DILocation(line: 13, column: 33, scope: !37)
!37 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 6)
!38 = !DILocation(line: 13, column: 26, scope: !39)
!39 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 2)