1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-18 10:32:48 +02:00

[OpenMP] Simplify GPU memory globalization

Summary:
Memory globalization is required to maintain OpenMP standard semantics for data sharing between
worker and master threads. The GPU cannot share data between its threads, so it must allocate global or
shared memory to store the data in. Currently this is implemented fully in the frontend using the
`__kmpc_data_sharing_push_stack` and `__kmpc_data_sharing_pop_stack` functions to emulate standard
CPU stack sharing. The front-end scans the target region for variables that escape the region and
must be shared between the threads. Each variable then has a field created for it in a global record
type.

This patch replaces this functionality with a single allocation command, effectively mimicking an
alloca instruction for the variables that must be shared between the threads. This will be much
slower than the current solution, but makes it much easier to optimize as we can analyze each
variable independently and determine if it is not captured. In the future, we can replace these
calls with an `alloca` and small allocations can be pushed to shared memory.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D97680
This commit is contained in:
Joseph Huber 2021-03-22 16:34:11 -04:00 committed by Huber, Joseph
parent 96972f2a2c
commit 3aea5cddbb
4 changed files with 31 additions and 141 deletions

View File

@ -429,20 +429,14 @@ __OMP_RTL(__kmpc_nvptx_teams_reduce_nowait_v2, false, Int32, IdentPtr, Int32,
GlobalListPtr, GlobalListPtr, GlobalListPtr, GlobalListPtr)
__OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16)
__OMP_RTL(__kmpc_data_sharing_init_stack, false, Void, )
__OMP_RTL(__kmpc_data_sharing_init_stack_spmd, false, Void, )
__OMP_RTL(__kmpc_data_sharing_coalesced_push_stack, false, VoidPtr, SizeTy, Int16)
__OMP_RTL(__kmpc_data_sharing_push_stack, false, VoidPtr, SizeTy, Int16)
__OMP_RTL(__kmpc_data_sharing_pop_stack, false, Void, VoidPtr)
__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr)
__OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy)
__OMP_RTL(__kmpc_end_sharing_variables, false, Void, )
__OMP_RTL(__kmpc_get_shared_variables, false, Void, VoidPtrPtrPtr)
__OMP_RTL(__kmpc_parallel_level, false, Int16, IdentPtr, Int32)
__OMP_RTL(__kmpc_is_spmd_exec_mode, false, Int8, )
__OMP_RTL(__kmpc_get_team_static_memory, false, Void, Int16, VoidPtr, SizeTy,
Int16, VoidPtrPtr)
__OMP_RTL(__kmpc_restore_team_static_memory, false, Void, Int16, Int16)
__OMP_RTL(__kmpc_barrier_simple_spmd, false, Void, IdentPtr, Int32)
__OMP_RTL(__kmpc_warp_active_thread_mask, false, LanemaskTy,)

View File

@ -1122,9 +1122,8 @@ private:
}
void analysisGlobalization() {
RuntimeFunction GlobalizationRuntimeIDs[] = {
OMPRTL___kmpc_data_sharing_coalesced_push_stack,
OMPRTL___kmpc_data_sharing_push_stack};
RuntimeFunction GlobalizationRuntimeIDs[] = {OMPRTL___kmpc_alloc_shared,
OMPRTL___kmpc_free_shared};
for (const auto GlobalizationCallID : GlobalizationRuntimeIDs) {
auto &RFI = OMPInfoCache.RFIs[GlobalizationCallID];

View File

@ -2,144 +2,41 @@
; ModuleID = 'declare_target_codegen_globalization.cpp'
source_filename = "declare_target_codegen_globalization.cpp"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
target triple = "nvptx64"
%struct.ident_t = type { i32, i32, i32, i32, i8* }
%struct._globalized_locals_ty = type { [32 x i32] }
; CHECK: remark: globalization_remarks.c:5:7: Found thread data sharing on the GPU. Expect degraded performance due to data globalization.
@0 = private unnamed_addr constant [56 x i8] c";declare_target_codegen_globalization.cpp;maini1;17;1;;\00", align 1
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 1, i32 0, i8* getelementptr inbounds ([56 x i8], [56 x i8]* @0, i32 0, i32 0) }, align 8
@__omp_offloading_801_3022563__Z6maini1v_l17_exec_mode = weak constant i8 0
@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_801_3022563__Z6maini1v_l17_exec_mode], section "llvm.metadata"
@S = external local_unnamed_addr global i8*
; CHECK: remark: declare_target_codegen_globalization.cpp:17:1: Found thread data sharing on the GPU. Expect degraded performance due to data globalization.
; CHECK: remark: declare_target_codegen_globalization.cpp:10:1: Found thread data sharing on the GPU. Expect degraded performance due to data globalization.
; Function Attrs: norecurse nounwind
define weak void @__omp_offloading_801_3022563__Z6maini1v_l17(i32* nonnull align 4 dereferenceable(4) %a) local_unnamed_addr #0 !dbg !10 {
define void @foo() {
entry:
%nvptx_num_threads = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg !12, !range !13
tail call void @__kmpc_spmd_kernel_init(i32 %nvptx_num_threads, i16 1, i16 0) #4, !dbg !12
tail call void @__kmpc_data_sharing_init_stack_spmd() #4, !dbg !12
%0 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
%1 = tail call i8 @__kmpc_is_spmd_exec_mode() #4
%.not.i.i = icmp eq i8 %1, 0
br i1 %.not.i.i, label %.non-spmd2.i.i, label %__omp_outlined__.exit
.non-spmd2.i.i: ; preds = %entry
%2 = tail call i8* @__kmpc_data_sharing_coalesced_push_stack(i64 128, i16 0) #4, !dbg !12
tail call void @__kmpc_data_sharing_pop_stack(i8* %2) #4, !dbg !14
br label %__omp_outlined__.exit, !dbg !14
__omp_outlined__.exit: ; preds = %entry, %.non-spmd2.i.i
tail call void @__kmpc_spmd_kernel_deinit_v2(i16 1) #4, !dbg !19
ret void, !dbg !20
%0 = call i8* @__kmpc_alloc_shared(i64 4), !dbg !8
%x_on_stack = bitcast i8* %0 to i32*
%1 = bitcast i32* %x_on_stack to i8*
call void @share(i8* %1)
call void @__kmpc_free_shared(i8* %0)
ret void
}
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1
declare void @__kmpc_spmd_kernel_init(i32, i16, i16) local_unnamed_addr
declare void @__kmpc_data_sharing_init_stack_spmd() local_unnamed_addr
; Function Attrs: norecurse nounwind readonly
define hidden i32 @_Z3fooRi(i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) local_unnamed_addr #2 !dbg !21 {
define void @share(i8* %x) {
entry:
%0 = load i32, i32* %a, align 4, !dbg !22, !tbaa !23
ret i32 %0, !dbg !27
store i8* %x, i8** @S
ret void
}
; Function Attrs: nounwind
define hidden i32 @_Z3barv() local_unnamed_addr #3 !dbg !15 {
entry:
%a1 = alloca i32, align 4
%0 = tail call i8 @__kmpc_is_spmd_exec_mode() #4
%.not = icmp eq i8 %0, 0
br i1 %.not, label %.non-spmd, label %.exit
declare i8* @__kmpc_alloc_shared(i64)
.non-spmd: ; preds = %entry
%1 = tail call i8* @__kmpc_data_sharing_push_stack(i64 128, i16 0) #4, !dbg !31
%2 = bitcast i8* %1 to %struct._globalized_locals_ty*
br label %.exit
declare void @__kmpc_free_shared(i8*)
.exit: ; preds = %entry, %.non-spmd
%_select_stack = phi %struct._globalized_locals_ty* [ %2, %.non-spmd ], [ null, %entry ]
%nvptx_tid = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !28
%nvptx_lane_id = and i32 %nvptx_tid, 31
%3 = zext i32 %nvptx_lane_id to i64
%4 = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* %_select_stack, i64 0, i32 0, i64 %3
%5 = select i1 %.not, i32* %4, i32* %a1
%6 = load i32, i32* %5, align 4, !dbg !29, !tbaa !23
br i1 %.not, label %.non-spmd2, label %.exit3, !dbg !31
.non-spmd2: ; preds = %.exit
%7 = bitcast %struct._globalized_locals_ty* %_select_stack to i8*, !dbg !31
tail call void @__kmpc_data_sharing_pop_stack(i8* %7) #4, !dbg !31
br label %.exit3, !dbg !31
.exit3: ; preds = %.non-spmd2, %.exit
ret i32 %6, !dbg !31
}
declare i8 @__kmpc_is_spmd_exec_mode() local_unnamed_addr
declare i8* @__kmpc_data_sharing_coalesced_push_stack(i64, i16) local_unnamed_addr
declare i8* @__kmpc_data_sharing_push_stack(i64, i16) local_unnamed_addr
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
declare void @__kmpc_data_sharing_pop_stack(i8*) local_unnamed_addr
; Function Attrs: nounwind
declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr #4
declare void @__kmpc_spmd_kernel_deinit_v2(i16) local_unnamed_addr
attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { nounwind }
!llvm.dbg.cu = !{!0}
!omp_offload.info = !{!3}
!nvvm.annotations = !{!4}
!llvm.module.flags = !{!5, !6, !7, !8}
!llvm.ident = !{!9}
!llvm.module.flags = !{!3, !4}
!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 12.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None)
!1 = !DIFile(filename: "declare_target_codegen_globalization.cpp", directory: "/home/jhuber/Documents/llvm-project/clang/test/OpenMP")
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
!1 = !DIFile(filename: "globalization_remarks.c", directory: "/tmp/globalization_remarks.c")
!2 = !{}
!3 = !{i32 0, i32 2049, i32 50472291, !"_Z6maini1v", i32 17, i32 0}
!4 = !{void (i32*)* @__omp_offloading_801_3022563__Z6maini1v_l17, !"kernel", i32 1}
!5 = !{i32 7, !"Dwarf Version", i32 2}
!6 = !{i32 2, !"Debug Info Version", i32 3}
!7 = !{i32 1, !"wchar_size", i32 4}
!8 = !{i32 7, !"PIC Level", i32 2}
!9 = !{!"clang version 12.0.0"}
!10 = distinct !DISubprogram(name: "__omp_offloading_801_3022563__Z6maini1v_l17", scope: !1, file: !1, line: 17, type: !11, scopeLine: 17, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!11 = !DISubroutineType(types: !2)
!12 = !DILocation(line: 17, column: 1, scope: !10)
!13 = !{i32 1, i32 1025}
!14 = !DILocation(line: 10, column: 1, scope: !15, inlinedAt: !16)
!15 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !11, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!16 = distinct !DILocation(line: 20, column: 18, scope: !17, inlinedAt: !18)
!17 = distinct !DISubprogram(name: "__omp_outlined__", scope: !1, file: !1, line: 17, type: !11, scopeLine: 17, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!18 = distinct !DILocation(line: 17, column: 1, scope: !10)
!19 = !DILocation(line: 17, column: 40, scope: !10)
!20 = !DILocation(line: 21, column: 3, scope: !10)
!21 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 5, type: !11, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!22 = !DILocation(line: 5, column: 26, scope: !21)
!23 = !{!24, !24, i64 0}
!24 = !{!"int", !25, i64 0}
!25 = !{!"omnipotent char", !26, i64 0}
!26 = !{!"Simple C++ TBAA"}
!27 = !DILocation(line: 5, column: 19, scope: !21)
!28 = !{i32 0, i32 1024}
!29 = !DILocation(line: 5, column: 26, scope: !21, inlinedAt: !30)
!30 = distinct !DILocation(line: 9, column: 10, scope: !15)
!31 = !DILocation(line: 10, column: 1, scope: !15)
!3 = !{i32 2, !"Debug Info Version", i32 3}
!4 = !{i32 1, !"wchar_size", i32 4}
!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !7, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!7 = !DISubroutineType(types: !2)
!8 = !DILocation(line: 5, column: 7, scope: !6)

View File

@ -7,11 +7,11 @@ target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
define void @foo() {
entry:
%x = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 0), !dbg !7
%x = call i8* @__kmpc_alloc_shared(i64 4), !dbg !7
%x_on_stack = bitcast i8* %x to i32*
%0 = bitcast i32* %x_on_stack to i8*
call void @use(i8* %0)
call void @__kmpc_data_sharing_pop_stack(i8* %x)
call void @__kmpc_free_shared(i8* %x)
ret void
}
@ -22,7 +22,7 @@ entry:
ret void
}
define internal i8* @__kmpc_data_sharing_push_stack(i64 %DataSize, i16 %shared) {
define internal i8* @__kmpc_alloc_shared(i64 %DataSize) {
entry:
%call = call i8* @_Z10SafeMallocmPKc(i64 %DataSize, i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i64 0, i64 0)) #11
ret i8* %call
@ -31,7 +31,7 @@ entry:
; Function Attrs: convergent nounwind mustprogress
declare i8* @_Z10SafeMallocmPKc(i64 %size, i8* nocapture readnone %msg)
declare void @__kmpc_data_sharing_pop_stack(i8*)
declare void @__kmpc_free_shared(i8*)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}