From 3aea5cddbb818d07c94460085e96770dad215b45 Mon Sep 17 00:00:00 2001
From: Joseph Huber <jhuber6@vols.utk.edu>
Date: Mon, 22 Mar 2021 16:34:11 -0400
Subject: [PATCH] [OpenMP] Simplify GPU memory globalization

Summary:
Memory globalization is required to maintain OpenMP standard semantics for data sharing between
worker and master threads. The GPU cannot share data between its threads so must allocate global or
shared memory to store the data in. Currently this is implemented fully in the frontend using the
`__kmpc_data_sharing_push_stack` and __kmpc_data_sharing_pop_stack` functions to emulate standard
CPU stack sharing. The front-end scans the target region for variables that escape the region and
must be shared between the threads. Each variable then has a field created for it in a global record
type.

This patch replaces this functinality with a single allocation command, effectively mimicing an
alloca instruction for the variables that must be shared between the threads. This will be much
slower than the current solution, but makes it much easier to optimize as we can analyze each
variable independently and determine if it is not captured. In the future, we can replace these
calls with an `alloca` and small allocations can be pushed to shared memory.

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D97680
---
 include/llvm/Frontend/OpenMP/OMPKinds.def     |  10 +-
 lib/Transforms/IPO/OpenMPOpt.cpp              |   5 +-
 .../OpenMP/globalization_remarks.ll           | 149 +++---------------
 .../PhaseOrdering/openmp-opt-module.ll        |   8 +-
 4 files changed, 31 insertions(+), 141 deletions(-)

diff --git a/include/llvm/Frontend/OpenMP/OMPKinds.def b/include/llvm/Frontend/OpenMP/OMPKinds.def
index ef5e526efba..bf435c45293 100644
--- a/include/llvm/Frontend/OpenMP/OMPKinds.def
+++ b/include/llvm/Frontend/OpenMP/OMPKinds.def
@@ -429,20 +429,14 @@ __OMP_RTL(__kmpc_nvptx_teams_reduce_nowait_v2, false, Int32, IdentPtr, Int32,
           GlobalListPtr, GlobalListPtr, GlobalListPtr, GlobalListPtr)
 
 __OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16)
-__OMP_RTL(__kmpc_data_sharing_init_stack, false, Void, )
-__OMP_RTL(__kmpc_data_sharing_init_stack_spmd, false, Void, )
 
-__OMP_RTL(__kmpc_data_sharing_coalesced_push_stack, false, VoidPtr, SizeTy, Int16)
-__OMP_RTL(__kmpc_data_sharing_push_stack, false, VoidPtr, SizeTy, Int16)
-__OMP_RTL(__kmpc_data_sharing_pop_stack, false, Void, VoidPtr)
+__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy)
+__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr)
 __OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy)
 __OMP_RTL(__kmpc_end_sharing_variables, false, Void, )
 __OMP_RTL(__kmpc_get_shared_variables, false, Void, VoidPtrPtrPtr)
 __OMP_RTL(__kmpc_parallel_level, false, Int16, IdentPtr, Int32)
 __OMP_RTL(__kmpc_is_spmd_exec_mode, false, Int8, )
-__OMP_RTL(__kmpc_get_team_static_memory, false, Void, Int16, VoidPtr, SizeTy,
-          Int16, VoidPtrPtr)
-__OMP_RTL(__kmpc_restore_team_static_memory, false, Void, Int16, Int16)
 __OMP_RTL(__kmpc_barrier_simple_spmd, false, Void, IdentPtr, Int32)
 
 __OMP_RTL(__kmpc_warp_active_thread_mask, false, LanemaskTy,)
diff --git a/lib/Transforms/IPO/OpenMPOpt.cpp b/lib/Transforms/IPO/OpenMPOpt.cpp
index d548f9afb7e..4450a2cfcbf 100644
--- a/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -1122,9 +1122,8 @@ private:
   }
 
   void analysisGlobalization() {
-    RuntimeFunction GlobalizationRuntimeIDs[] = {
-        OMPRTL___kmpc_data_sharing_coalesced_push_stack,
-        OMPRTL___kmpc_data_sharing_push_stack};
+    RuntimeFunction GlobalizationRuntimeIDs[] = {OMPRTL___kmpc_alloc_shared,
+                                                 OMPRTL___kmpc_free_shared};
 
     for (const auto GlobalizationCallID : GlobalizationRuntimeIDs) {
       auto &RFI = OMPInfoCache.RFIs[GlobalizationCallID];
diff --git a/test/Transforms/OpenMP/globalization_remarks.ll b/test/Transforms/OpenMP/globalization_remarks.ll
index 9c913d381c5..dfd9173de17 100644
--- a/test/Transforms/OpenMP/globalization_remarks.ll
+++ b/test/Transforms/OpenMP/globalization_remarks.ll
@@ -2,144 +2,41 @@
 ; ModuleID = 'declare_target_codegen_globalization.cpp'
 source_filename = "declare_target_codegen_globalization.cpp"
 target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
-target triple = "nvptx64-nvidia-cuda"
+target triple = "nvptx64"
 
-%struct.ident_t = type { i32, i32, i32, i32, i8* }
-%struct._globalized_locals_ty = type { [32 x i32] }
+; CHECK: remark: globalization_remarks.c:5:7: Found thread data sharing on the GPU. Expect degraded performance due to data globalization.
 
-@0 = private unnamed_addr constant [56 x i8] c";declare_target_codegen_globalization.cpp;maini1;17;1;;\00", align 1
-@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 1, i32 0, i8* getelementptr inbounds ([56 x i8], [56 x i8]* @0, i32 0, i32 0) }, align 8
-@__omp_offloading_801_3022563__Z6maini1v_l17_exec_mode = weak constant i8 0
-@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_801_3022563__Z6maini1v_l17_exec_mode], section "llvm.metadata"
+@S = external local_unnamed_addr global i8*
 
-; CHECK: remark: declare_target_codegen_globalization.cpp:17:1: Found thread data sharing on the GPU. Expect degraded performance due to data globalization.
-; CHECK: remark: declare_target_codegen_globalization.cpp:10:1: Found thread data sharing on the GPU. Expect degraded performance due to data globalization.
-
-; Function Attrs: norecurse nounwind
-define weak void @__omp_offloading_801_3022563__Z6maini1v_l17(i32* nonnull align 4 dereferenceable(4) %a) local_unnamed_addr #0 !dbg !10 {
+define void @foo() {
 entry:
-  %nvptx_num_threads = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg !12, !range !13
-  tail call void @__kmpc_spmd_kernel_init(i32 %nvptx_num_threads, i16 1, i16 0) #4, !dbg !12
-  tail call void @__kmpc_data_sharing_init_stack_spmd() #4, !dbg !12
-  %0 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
-  %1 = tail call i8 @__kmpc_is_spmd_exec_mode() #4
-  %.not.i.i = icmp eq i8 %1, 0
-  br i1 %.not.i.i, label %.non-spmd2.i.i, label %__omp_outlined__.exit
-
-.non-spmd2.i.i:                                   ; preds = %entry
-  %2 = tail call i8* @__kmpc_data_sharing_coalesced_push_stack(i64 128, i16 0) #4, !dbg !12
-  tail call void @__kmpc_data_sharing_pop_stack(i8* %2) #4, !dbg !14
-  br label %__omp_outlined__.exit, !dbg !14
-
-__omp_outlined__.exit:                            ; preds = %entry, %.non-spmd2.i.i
-  tail call void @__kmpc_spmd_kernel_deinit_v2(i16 1) #4, !dbg !19
-  ret void, !dbg !20
+  %0 = call i8* @__kmpc_alloc_shared(i64 4), !dbg !8
+  %x_on_stack = bitcast i8* %0 to i32*
+  %1 = bitcast i32* %x_on_stack to i8*
+  call void @share(i8* %1)
+  call void @__kmpc_free_shared(i8* %0)
+  ret void
 }
 
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1
-
-declare void @__kmpc_spmd_kernel_init(i32, i16, i16) local_unnamed_addr
-
-declare void @__kmpc_data_sharing_init_stack_spmd() local_unnamed_addr
-
-; Function Attrs: norecurse nounwind readonly
-define hidden i32 @_Z3fooRi(i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) local_unnamed_addr #2 !dbg !21 {
+define void @share(i8* %x) {
 entry:
-  %0 = load i32, i32* %a, align 4, !dbg !22, !tbaa !23
-  ret i32 %0, !dbg !27
+  store i8* %x, i8** @S
+  ret void
 }
 
-; Function Attrs: nounwind
-define hidden i32 @_Z3barv() local_unnamed_addr #3 !dbg !15 {
-entry:
-  %a1 = alloca i32, align 4
-  %0 = tail call i8 @__kmpc_is_spmd_exec_mode() #4
-  %.not = icmp eq i8 %0, 0
-  br i1 %.not, label %.non-spmd, label %.exit
+declare i8* @__kmpc_alloc_shared(i64)
 
-.non-spmd:                                        ; preds = %entry
-  %1 = tail call i8* @__kmpc_data_sharing_push_stack(i64 128, i16 0) #4, !dbg !31
-  %2 = bitcast i8* %1 to %struct._globalized_locals_ty*
-  br label %.exit
+declare void @__kmpc_free_shared(i8*)
 
-.exit:                                            ; preds = %entry, %.non-spmd
-  %_select_stack = phi %struct._globalized_locals_ty* [ %2, %.non-spmd ], [ null, %entry ]
-  %nvptx_tid = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !28
-  %nvptx_lane_id = and i32 %nvptx_tid, 31
-  %3 = zext i32 %nvptx_lane_id to i64
-  %4 = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* %_select_stack, i64 0, i32 0, i64 %3
-  %5 = select i1 %.not, i32* %4, i32* %a1
-  %6 = load i32, i32* %5, align 4, !dbg !29, !tbaa !23
-  br i1 %.not, label %.non-spmd2, label %.exit3, !dbg !31
-
-.non-spmd2:                                       ; preds = %.exit
-  %7 = bitcast %struct._globalized_locals_ty* %_select_stack to i8*, !dbg !31
-  tail call void @__kmpc_data_sharing_pop_stack(i8* %7) #4, !dbg !31
-  br label %.exit3, !dbg !31
-
-.exit3:                                           ; preds = %.non-spmd2, %.exit
-  ret i32 %6, !dbg !31
-}
-
-declare i8 @__kmpc_is_spmd_exec_mode() local_unnamed_addr
-
-declare i8* @__kmpc_data_sharing_coalesced_push_stack(i64, i16) local_unnamed_addr
-
-declare i8* @__kmpc_data_sharing_push_stack(i64, i16) local_unnamed_addr
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
-
-declare void @__kmpc_data_sharing_pop_stack(i8*) local_unnamed_addr
-
-; Function Attrs: nounwind
-declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr #4
-
-declare void @__kmpc_spmd_kernel_deinit_v2(i16) local_unnamed_addr
-
-attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind readnone }
-attributes #2 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #4 = { nounwind }
 
 !llvm.dbg.cu = !{!0}
-!omp_offload.info = !{!3}
-!nvvm.annotations = !{!4}
-!llvm.module.flags = !{!5, !6, !7, !8}
-!llvm.ident = !{!9}
+!llvm.module.flags = !{!3, !4}
 
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 12.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None)
-!1 = !DIFile(filename: "declare_target_codegen_globalization.cpp", directory: "/home/jhuber/Documents/llvm-project/clang/test/OpenMP")
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "globalization_remarks.c", directory: "/tmp/globalization_remarks.c")
 !2 = !{}
-!3 = !{i32 0, i32 2049, i32 50472291, !"_Z6maini1v", i32 17, i32 0}
-!4 = !{void (i32*)* @__omp_offloading_801_3022563__Z6maini1v_l17, !"kernel", i32 1}
-!5 = !{i32 7, !"Dwarf Version", i32 2}
-!6 = !{i32 2, !"Debug Info Version", i32 3}
-!7 = !{i32 1, !"wchar_size", i32 4}
-!8 = !{i32 7, !"PIC Level", i32 2}
-!9 = !{!"clang version 12.0.0"}
-!10 = distinct !DISubprogram(name: "__omp_offloading_801_3022563__Z6maini1v_l17", scope: !1, file: !1, line: 17, type: !11, scopeLine: 17, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
-!11 = !DISubroutineType(types: !2)
-!12 = !DILocation(line: 17, column: 1, scope: !10)
-!13 = !{i32 1, i32 1025}
-!14 = !DILocation(line: 10, column: 1, scope: !15, inlinedAt: !16)
-!15 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !11, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
-!16 = distinct !DILocation(line: 20, column: 18, scope: !17, inlinedAt: !18)
-!17 = distinct !DISubprogram(name: "__omp_outlined__", scope: !1, file: !1, line: 17, type: !11, scopeLine: 17, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
-!18 = distinct !DILocation(line: 17, column: 1, scope: !10)
-!19 = !DILocation(line: 17, column: 40, scope: !10)
-!20 = !DILocation(line: 21, column: 3, scope: !10)
-!21 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 5, type: !11, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
-!22 = !DILocation(line: 5, column: 26, scope: !21)
-!23 = !{!24, !24, i64 0}
-!24 = !{!"int", !25, i64 0}
-!25 = !{!"omnipotent char", !26, i64 0}
-!26 = !{!"Simple C++ TBAA"}
-!27 = !DILocation(line: 5, column: 19, scope: !21)
-!28 = !{i32 0, i32 1024}
-!29 = !DILocation(line: 5, column: 26, scope: !21, inlinedAt: !30)
-!30 = distinct !DILocation(line: 9, column: 10, scope: !15)
-!31 = !DILocation(line: 10, column: 1, scope: !15)
-
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !7, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 5, column: 7, scope: !6)
diff --git a/test/Transforms/PhaseOrdering/openmp-opt-module.ll b/test/Transforms/PhaseOrdering/openmp-opt-module.ll
index 81c31e91770..aea51fffd2f 100644
--- a/test/Transforms/PhaseOrdering/openmp-opt-module.ll
+++ b/test/Transforms/PhaseOrdering/openmp-opt-module.ll
@@ -7,11 +7,11 @@ target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
 
 define void @foo() {
 entry:
-  %x = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 0), !dbg !7
+  %x = call i8* @__kmpc_alloc_shared(i64 4), !dbg !7
   %x_on_stack = bitcast i8* %x to i32*
   %0 = bitcast i32* %x_on_stack to i8*
   call void @use(i8* %0)
-  call void @__kmpc_data_sharing_pop_stack(i8* %x)
+  call void @__kmpc_free_shared(i8* %x)
   ret void
 }
 
@@ -22,7 +22,7 @@ entry:
   ret void
 }
 
-define internal i8* @__kmpc_data_sharing_push_stack(i64 %DataSize, i16 %shared) {
+define internal i8* @__kmpc_alloc_shared(i64 %DataSize) {
 entry:
   %call = call i8* @_Z10SafeMallocmPKc(i64 %DataSize, i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i64 0, i64 0)) #11
   ret i8* %call
@@ -31,7 +31,7 @@ entry:
 ; Function Attrs: convergent nounwind mustprogress
 declare i8* @_Z10SafeMallocmPKc(i64 %size, i8* nocapture readnone %msg)
 
-declare void @__kmpc_data_sharing_pop_stack(i8*)
+declare void @__kmpc_free_shared(i8*)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}