[OpenMP] Run rewriteDeviceCodeStateMachine in the Module not CGSCC pass

While rewriteDeviceCodeStateMachine should probably be folded into buildCustomStateMachine, we at least need the optimization to happen. This was not reliably the case in the CGSCC pass but in the Module pass it seems to work reliably. This also ports a test to the new kernel encoding (target_init/deinit), and makes sure we cannot run the kernel in SPMD mode. Differential Revision: https://reviews.llvm.org/D106345
2024-11-21 18:22:53 +01:00 · 2021-07-20 01:58:44 -05:00 · 2021-07-20 01:58:44 -05:00 · 6a849a320d
commit 6a849a320d
parent d40841959b
3 changed files with 63 additions and 162 deletions
--- a/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/lib/Transforms/IPO/OpenMPOpt.cpp
@ -719,6 +719,9 @@ struct OpenMPOpt {
      // Recollect uses, in case Attributor deleted any.
      OMPInfoCache.recollectUses();

+      // TODO: This should be folded into buildCustomStateMachine.
+      Changed |= rewriteDeviceCodeStateMachine();
+
      if (remarksEnabled())
        analysisGlobalization();
    } else {
@ -733,7 +736,6 @@ struct OpenMPOpt {
      OMPInfoCache.recollectUses();

      Changed |= deleteParallelRegions();
-      Changed |= rewriteDeviceCodeStateMachine();

      if (HideMemoryTransferLatency)
        Changed |= hideMemTransfersLatency();
--- a/test/Transforms/OpenMP/custom_state_machines.ll
+++ b/test/Transforms/OpenMP/custom_state_machines.ll
@ -1003,7 +1003,7 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK:       worker_state_machine.is_active.check:
 ; CHECK-NEXT:    br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.check:
-; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__2_wrapper
+; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__2_wrapper.ID to void (i16, i32)*)
 ; CHECK-NEXT:    br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.execute:
 ; CHECK-NEXT:    call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP0]])
@ -1046,10 +1046,10 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
+; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef @__omp_outlined__2_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
 ; CHECK-NEXT:    call void @no_parallel_region_in_here.internalized() #[[ATTR7]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
+; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
 ; CHECK-NEXT:    ret void
 ;
 ;
@ -1138,7 +1138,7 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    call void @__omp_outlined__17_wrapper(i16 0, i32 [[TMP0]])
 ; CHECK-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.check1:
-; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__5_wrapper
+; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__5_wrapper.ID to void (i16, i32)*)
 ; CHECK-NEXT:    br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.execute2:
 ; CHECK-NEXT:    call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]])
@ -1182,7 +1182,7 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    call void @no_parallel_region_in_here.internalized() #[[ATTR7]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
+; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
 ; CHECK-NEXT:    call void @simple_state_machine_interprocedural_after.internalized() #[[ATTR7]]
 ; CHECK-NEXT:    ret void
 ;
@ -1282,13 +1282,13 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK:       worker_state_machine.is_active.check:
 ; CHECK-NEXT:    br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.check:
-; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__7_wrapper
+; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__7_wrapper.ID to void (i16, i32)*)
 ; CHECK-NEXT:    br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.execute:
 ; CHECK-NEXT:    call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]])
 ; CHECK-NEXT:    br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.check1:
-; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__8_wrapper
+; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__8_wrapper.ID to void (i16, i32)*)
 ; CHECK-NEXT:    br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.execute2:
 ; CHECK-NEXT:    call void @__omp_outlined__8_wrapper(i16 0, i32 [[TMP0]])
@ -1327,10 +1327,10 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
+; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @unknown() #[[ATTR8]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__8_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0)
+; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef @__omp_outlined__8_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 0)
 ; CHECK-NEXT:    ret void
 ;
 ;
@ -1413,7 +1413,7 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK:       worker_state_machine.is_active.check:
 ; CHECK-NEXT:    br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.check:
-; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__10_wrapper
+; CHECK-NEXT:    [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__10_wrapper.ID to void (i16, i32)*)
 ; CHECK-NEXT:    br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
 ; CHECK:       worker_state_machine.parallel_region.execute:
 ; CHECK-NEXT:    call void @__omp_outlined__10_wrapper(i16 0, i32 [[TMP0]])
@ -1456,10 +1456,10 @@ attributes #10 = { convergent nounwind readonly willreturn }
 ; CHECK-NEXT:    store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
 ; CHECK-NEXT:    [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
-; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__10_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0)
+; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef @__omp_outlined__10_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
 ; CHECK-NEXT:    call void @unknown_no_openmp() #[[ATTR9:[0-9]+]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
-; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__11_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0)
+; CHECK-NEXT:    call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef @__omp_outlined__11_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
 ; CHECK-NEXT:    ret void
 ;
 ;
--- a/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
+++ b/test/Transforms/OpenMP/gpu_state_machine_function_ptr_replacement.ll
@ -1,6 +1,5 @@
-; RUN: opt -S -passes=openmp-opt-cgscc -openmp-ir-builder-optimistic-attributes -pass-remarks=openmp-opt -openmp-print-gpu-kernels < %s | FileCheck %s
-; RUN: opt -S -passes=openmp-opt-cgscc -pass-remarks=openmp-opt -openmp-print-gpu-kernels < %s | FileCheck %s
-; RUN: opt -S        -openmp-opt-cgscc -pass-remarks=openmp-opt -openmp-print-gpu-kernels < %s | FileCheck %s
+; RUN: opt -S -passes=openmp-opt -openmp-ir-builder-optimistic-attributes -pass-remarks=openmp-opt -openmp-print-gpu-kernels < %s | FileCheck %s
+; RUN: opt -S -passes=openmp-opt -pass-remarks=openmp-opt -openmp-print-gpu-kernels < %s | FileCheck %s

 ; C input used for this test:

@ -14,6 +13,7 @@
 ;     #pragma omp parallel
 ;     {}
 ;     bar();
+;     unknown();
 ;     #pragma omp parallel
 ;     {}
 ;   }
@ -24,146 +24,49 @@
 ; another kernel.

 ; CHECK-DAG: @__omp_outlined__1_wrapper.ID = private constant i8 undef
-; CHECK-DAG: @__omp_outlined__3_wrapper.ID = private constant i8 undef
+; CHECK-DAG: @__omp_outlined__2_wrapper.ID = private constant i8 undef

-; CHECK-DAG:   icmp eq i8* %5, @__omp_outlined__1_wrapper.ID
-; CHECK-DAG:   icmp eq i8* %7, @__omp_outlined__3_wrapper.ID
+; CHECK-DAG:   icmp eq void (i16, i32)* %worker.work_fn.addr_cast, bitcast (i8* @__omp_outlined__1_wrapper.ID to void (i16, i32)*)
+; CHECK-DAG:   icmp eq void (i16, i32)* %worker.work_fn.addr_cast, bitcast (i8* @__omp_outlined__2_wrapper.ID to void (i16, i32)*)

-; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* noundef @1, i32 %1, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef @__omp_outlined__1_wrapper.ID, i8** noundef %2, i64 noundef 0)
-; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** %1, i64 0)
-; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* noundef @1, i32 %1, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef %3, i64 noundef 0)
+
+; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* noundef @1, i32 %{{.*}}, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef @__omp_outlined__1_wrapper.ID, i8** noundef %{{.*}}, i64 noundef 0)
+; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* noundef @1, i32 %{{.*}}, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef @__omp_outlined__2_wrapper.ID, i8** noundef %{{.*}}, i64 noundef 0)
+; CHECK-DAG:   call void @__kmpc_parallel_51(%struct.ident_t* noundef @2, i32 %{{.*}}, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef %{{.*}}, i64 noundef 0)


 %struct.ident_t = type { i32, i32, i32, i32, i8* }

@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
+@__omp_offloading_10301_87b2c_foo_l7_exec_mode = weak constant i8 1
+@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
+@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_10301_87b2c_foo_l7_exec_mode], section "llvm.metadata"

-define internal void @__omp_offloading_50_6dfa0f01_foo_l6_worker()  {
-entry:
-  %work_fn = alloca i8*, align 8
-  %exec_status = alloca i8, align 1
-  store i8* null, i8** %work_fn, align 8
-  store i8 0, i8* %exec_status, align 1
-  br label %.await.work
-
-.await.work:                                      ; preds = %.barrier.parallel, %entry
-  call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-  %0 = call i1 @__kmpc_kernel_parallel(i8** %work_fn)
-  %1 = zext i1 %0 to i8
-  store i8 %1, i8* %exec_status, align 1
-  %2 = load i8*, i8** %work_fn, align 8
-  %should_terminate = icmp eq i8* %2, null
-  br i1 %should_terminate, label %.exit, label %.select.workers
-
-.select.workers:                                  ; preds = %.await.work
-  %3 = load i8, i8* %exec_status, align 1
-  %is_active = icmp ne i8 %3, 0
-  br i1 %is_active, label %.execute.parallel, label %.barrier.parallel
-
-.execute.parallel:                                ; preds = %.select.workers
-  %4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
-  %5 = load i8*, i8** %work_fn, align 8
-  %work_match = icmp eq i8* %5, bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*)
-  br i1 %work_match, label %.execute.fn, label %.check.next
-
-.execute.fn:                                      ; preds = %.execute.parallel
-  call void @__omp_outlined__1_wrapper(i16 zeroext 0, i32 %4) 
-  br label %.terminate.parallel
-
-.check.next:                                      ; preds = %.execute.parallel
-  %6 = load i8*, i8** %work_fn, align 8
-  %work_match1 = icmp eq i8* %6, bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*)
-  br i1 %work_match1, label %.execute.fn2, label %.check.next3
-
-.execute.fn2:                                     ; preds = %.check.next
-  call void @__omp_outlined__2_wrapper(i16 zeroext 0, i32 %4) 
-  br label %.terminate.parallel
-
-.check.next3:                                     ; preds = %.check.next
-  %7 = load i8*, i8** %work_fn, align 8
-  %work_match4 = icmp eq i8* %7, bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*)
-  br i1 %work_match4, label %.execute.fn5, label %.check.next6
-
-.execute.fn5:                                     ; preds = %.check.next3
-  call void @__omp_outlined__3_wrapper(i16 zeroext 0, i32 %4) 
-  br label %.terminate.parallel
-
-.check.next6:                                     ; preds = %.check.next3
-  %8 = bitcast i8* %2 to void (i16, i32)*
-  call void %8(i16 0, i32 %4)
-  br label %.terminate.parallel
-
-.terminate.parallel:                              ; preds = %.check.next6, %.execute.fn5, %.execute.fn2, %.execute.fn
-  call void @__kmpc_kernel_end_parallel()
-  br label %.barrier.parallel
-
-.barrier.parallel:                                ; preds = %.terminate.parallel, %.select.workers
-  call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-  br label %.await.work
-
-.exit:                                            ; preds = %.await.work
-  ret void
-}
-
-define weak void @__omp_offloading_50_6dfa0f01_foo_l6()  {
+define weak void @__omp_offloading_10301_87b2c_foo_l7() {
 entry:
  %.zero.addr = alloca i32, align 4
  %.threadid_temp. = alloca i32, align 4
  store i32 0, i32* %.zero.addr, align 4
-  %nvptx_tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  %nvptx_num_threads = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  %nvptx_warp_size = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  %thread_limit = sub nuw i32 %nvptx_num_threads, %nvptx_warp_size
-  %0 = icmp ult i32 %nvptx_tid, %thread_limit
-  br i1 %0, label %.worker, label %.mastercheck
+  %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true)
+  %exec_user_code = icmp eq i32 %0, -1
+  br i1 %exec_user_code, label %user_code.entry, label %worker.exit

-.worker:                                          ; preds = %entry
-  call void @__omp_offloading_50_6dfa0f01_foo_l6_worker() 
-  br label %.exit
+user_code.entry:                                  ; preds = %entry
+  %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  store i32 %1, i32* %.threadid_temp., align 4
+  call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr)
+  call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true)
+  ret void

-.mastercheck:                                     ; preds = %entry
-  %nvptx_tid1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  %nvptx_num_threads2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  %nvptx_warp_size3 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  %1 = sub nuw i32 %nvptx_warp_size3, 1
-  %2 = sub nuw i32 %nvptx_num_threads2, 1
-  %3 = xor i32 %1, -1
-  %master_tid = and i32 %2, %3
-  %4 = icmp eq i32 %nvptx_tid1, %master_tid
-  br i1 %4, label %.master, label %.exit
-
-.master:                                          ; preds = %.mastercheck
-  %nvptx_num_threads4 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  %nvptx_warp_size5 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  %thread_limit6 = sub nuw i32 %nvptx_num_threads4, %nvptx_warp_size5
-  call void @__kmpc_kernel_init(i32 %thread_limit6, i16 1)
-  call void @__kmpc_data_sharing_init_stack()
-  %5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
-  store i32 %5, i32* %.threadid_temp., align 4
-  call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr) 
-  br label %.termination.notifier
-
-.termination.notifier:                            ; preds = %.master
-  call void @__kmpc_kernel_deinit(i16 1)
-  call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
-  br label %.exit
-
-.exit:                                            ; preds = %.termination.notifier, %.mastercheck, %.worker
+worker.exit:                                      ; preds = %entry
  ret void
 }

-declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() 
+declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1)
+declare void @unknown()

-declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() 
-
-declare i32 @llvm.nvvm.read.ptx.sreg.warpsize() 
-
-declare void @__kmpc_kernel_init(i32, i16)
-
-declare void @__kmpc_data_sharing_init_stack()
-
-define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.)  {
+define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
 entry:
  %.global_tid..addr = alloca i32*, align 8
  %.bound_tid..addr = alloca i32*, align 8
@ -175,13 +78,14 @@ entry:
  %1 = load i32, i32* %0, align 4
  %2 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
  call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** %2, i64 0)
-  call void @bar() 
+  call void @bar()
+  call void @unknown()
  %3 = bitcast [0 x i8*]* %captured_vars_addrs1 to i8**
-  call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** %3, i64 0)
+  call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** %3, i64 0)
  ret void
 }

-define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid.)  {
+define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
 entry:
  %.global_tid..addr = alloca i32*, align 8
  %.bound_tid..addr = alloca i32*, align 8
@ -190,7 +94,7 @@ entry:
  ret void
 }

-define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1)  {
+define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) {
 entry:
  %.addr = alloca i16, align 2
  %.addr1 = alloca i32, align 4
@ -200,7 +104,7 @@ entry:
  store i16 %0, i16* %.addr, align 2
  store i32 %1, i32* %.addr1, align 4
  call void @__kmpc_get_shared_variables(i8*** %global_args)
-  call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr) 
+  call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr)
  ret void
 }

@ -208,16 +112,16 @@ declare void @__kmpc_get_shared_variables(i8***)

 declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64)

-define hidden void @bar()  {
+define hidden void @bar() {
 entry:
  %captured_vars_addrs = alloca [0 x i8*], align 8
-  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
+  %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2)
  %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
-  call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** %1, i64 0)
+  call void @__kmpc_parallel_51(%struct.ident_t* @2, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** %1, i64 0)
  ret void
 }

-define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid.)  {
+define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
 entry:
  %.global_tid..addr = alloca i32*, align 8
  %.bound_tid..addr = alloca i32*, align 8
@ -226,7 +130,7 @@ entry:
  ret void
 }

-define internal void @__omp_outlined__2_wrapper(i16 zeroext %0, i32 %1)  {
+define internal void @__omp_outlined__2_wrapper(i16 zeroext %0, i32 %1) {
 entry:
  %.addr = alloca i16, align 2
  %.addr1 = alloca i32, align 4
@ -236,13 +140,15 @@ entry:
  store i16 %0, i16* %.addr, align 2
  store i32 %1, i32* %.addr1, align 4
  call void @__kmpc_get_shared_variables(i8*** %global_args)
-  call void @__omp_outlined__2(i32* %.addr1, i32* %.zero.addr) 
+  call void @__omp_outlined__2(i32* %.addr1, i32* %.zero.addr)
  ret void
 }

-declare i32 @__kmpc_global_thread_num(%struct.ident_t*) 
+declare i32 @__kmpc_global_thread_num(%struct.ident_t*)

-define internal void @__omp_outlined__3(i32* noalias %.global_tid., i32* noalias %.bound_tid.)  {
+declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1)
+
+define internal void @__omp_outlined__3(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
 entry:
  %.global_tid..addr = alloca i32*, align 8
  %.bound_tid..addr = alloca i32*, align 8
@ -251,7 +157,7 @@ entry:
  ret void
 }

-define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1)  {
+define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) {
 entry:
  %.addr = alloca i16, align 2
  %.addr1 = alloca i32, align 4
@ -261,22 +167,15 @@ entry:
  store i16 %0, i16* %.addr, align 2
  store i32 %1, i32* %.addr1, align 4
  call void @__kmpc_get_shared_variables(i8*** %global_args)
-  call void @__omp_outlined__3(i32* %.addr1, i32* %.zero.addr) 
+  call void @__omp_outlined__3(i32* %.addr1, i32* %.zero.addr)
  ret void
 }

-declare void @__kmpc_kernel_deinit(i16)
-
-declare void @__kmpc_barrier_simple_spmd(%struct.ident_t*, i32) 
-
-declare i1 @__kmpc_kernel_parallel(i8**)
-
-declare void @__kmpc_kernel_end_parallel()
-
-
+!omp_offload.info = !{!0}
 !nvvm.annotations = !{!1}
 !llvm.module.flags = !{!2, !3}

-!1 = !{void ()* @__omp_offloading_50_6dfa0f01_foo_l6, !"kernel", i32 1}
+!0 = !{i32 0, i32 66305, i32 555956, !"foo", i32 7, i32 0}
+!1 = !{void ()* @__omp_offloading_10301_87b2c_foo_l7, !"kernel", i32 1}
 !2 = !{i32 7, !"openmp", i32 50}
 !3 = !{i32 7, !"openmp-device", i32 50}