1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-25 04:02:41 +01:00

[OpenMP] Run rewriteDeviceCodeStateMachine in the Module not CGSCC pass

While rewriteDeviceCodeStateMachine should probably be folded into
buildCustomStateMachine, we at least need the optimization to happen.
This was not reliably the case in the CGSCC pass but in the Module pass
it seems to work reliably.

This also ports a test to the new kernel encoding (target_init/deinit),
and makes sure we cannot run the kernel in SPMD mode.

Differential Revision: https://reviews.llvm.org/D106345
This commit is contained in:
Johannes Doerfert 2021-07-20 01:58:44 -05:00
parent d40841959b
commit 6a849a320d
3 changed files with 63 additions and 162 deletions

View File

@ -719,6 +719,9 @@ struct OpenMPOpt {
// Recollect uses, in case Attributor deleted any. // Recollect uses, in case Attributor deleted any.
OMPInfoCache.recollectUses(); OMPInfoCache.recollectUses();
// TODO: This should be folded into buildCustomStateMachine.
Changed |= rewriteDeviceCodeStateMachine();
if (remarksEnabled()) if (remarksEnabled())
analysisGlobalization(); analysisGlobalization();
} else { } else {
@ -733,7 +736,6 @@ struct OpenMPOpt {
OMPInfoCache.recollectUses(); OMPInfoCache.recollectUses();
Changed |= deleteParallelRegions(); Changed |= deleteParallelRegions();
Changed |= rewriteDeviceCodeStateMachine();
if (HideMemoryTransferLatency) if (HideMemoryTransferLatency)
Changed |= hideMemTransfersLatency(); Changed |= hideMemTransfersLatency();

View File

@ -1003,7 +1003,7 @@ attributes #10 = { convergent nounwind readonly willreturn }
; CHECK: worker_state_machine.is_active.check: ; CHECK: worker_state_machine.is_active.check:
; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] ; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; CHECK: worker_state_machine.parallel_region.check: ; CHECK: worker_state_machine.parallel_region.check:
; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__2_wrapper ; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__2_wrapper.ID to void (i16, i32)*)
; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] ; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
; CHECK: worker_state_machine.parallel_region.execute: ; CHECK: worker_state_machine.parallel_region.execute:
; CHECK-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP0]]) ; CHECK-NEXT: call void @__omp_outlined__2_wrapper(i16 0, i32 [[TMP0]])
@ -1046,10 +1046,10 @@ attributes #10 = { convergent nounwind readonly willreturn }
; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 ; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** ; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) ; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef @__omp_outlined__2_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
; CHECK-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR7]] ; CHECK-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR7]]
; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** ; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) ; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
; CHECK-NEXT: ret void ; CHECK-NEXT: ret void
; ;
; ;
@ -1138,7 +1138,7 @@ attributes #10 = { convergent nounwind readonly willreturn }
; CHECK-NEXT: call void @__omp_outlined__17_wrapper(i16 0, i32 [[TMP0]]) ; CHECK-NEXT: call void @__omp_outlined__17_wrapper(i16 0, i32 [[TMP0]])
; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] ; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; CHECK: worker_state_machine.parallel_region.check1: ; CHECK: worker_state_machine.parallel_region.check1:
; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__5_wrapper ; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__5_wrapper.ID to void (i16, i32)*)
; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]] ; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK3:%.*]]
; CHECK: worker_state_machine.parallel_region.execute2: ; CHECK: worker_state_machine.parallel_region.execute2:
; CHECK-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]]) ; CHECK-NEXT: call void @__omp_outlined__5_wrapper(i16 0, i32 [[TMP0]])
@ -1182,7 +1182,7 @@ attributes #10 = { convergent nounwind readonly willreturn }
; CHECK-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR7]] ; CHECK-NEXT: call void @no_parallel_region_in_here.internalized() #[[ATTR7]]
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** ; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__5_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) ; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__5 to i8*), i8* noundef @__omp_outlined__5_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
; CHECK-NEXT: call void @simple_state_machine_interprocedural_after.internalized() #[[ATTR7]] ; CHECK-NEXT: call void @simple_state_machine_interprocedural_after.internalized() #[[ATTR7]]
; CHECK-NEXT: ret void ; CHECK-NEXT: ret void
; ;
@ -1282,13 +1282,13 @@ attributes #10 = { convergent nounwind readonly willreturn }
; CHECK: worker_state_machine.is_active.check: ; CHECK: worker_state_machine.is_active.check:
; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] ; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; CHECK: worker_state_machine.parallel_region.check: ; CHECK: worker_state_machine.parallel_region.check:
; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__7_wrapper ; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__7_wrapper.ID to void (i16, i32)*)
; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] ; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
; CHECK: worker_state_machine.parallel_region.execute: ; CHECK: worker_state_machine.parallel_region.execute:
; CHECK-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]]) ; CHECK-NEXT: call void @__omp_outlined__7_wrapper(i16 0, i32 [[TMP0]])
; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]] ; CHECK-NEXT: br label [[WORKER_STATE_MACHINE_PARALLEL_REGION_END:%.*]]
; CHECK: worker_state_machine.parallel_region.check1: ; CHECK: worker_state_machine.parallel_region.check1:
; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__8_wrapper ; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION4:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__8_wrapper.ID to void (i16, i32)*)
; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]] ; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION4]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE2:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_FALLBACK_EXECUTE:%.*]]
; CHECK: worker_state_machine.parallel_region.execute2: ; CHECK: worker_state_machine.parallel_region.execute2:
; CHECK-NEXT: call void @__omp_outlined__8_wrapper(i16 0, i32 [[TMP0]]) ; CHECK-NEXT: call void @__omp_outlined__8_wrapper(i16 0, i32 [[TMP0]])
@ -1327,10 +1327,10 @@ attributes #10 = { convergent nounwind readonly willreturn }
; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 ; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** ; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__7_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) ; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__7 to i8*), i8* noundef @__omp_outlined__7_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
; CHECK-NEXT: [[TMP2:%.*]] = call i32 @unknown() #[[ATTR8]] ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @unknown() #[[ATTR8]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** ; CHECK-NEXT: [[TMP3:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__8_wrapper to i8*), i8** noundef [[TMP3]], i64 noundef 0) ; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__8 to i8*), i8* noundef @__omp_outlined__8_wrapper.ID, i8** noundef [[TMP3]], i64 noundef 0)
; CHECK-NEXT: ret void ; CHECK-NEXT: ret void
; ;
; ;
@ -1413,7 +1413,7 @@ attributes #10 = { convergent nounwind readonly willreturn }
; CHECK: worker_state_machine.is_active.check: ; CHECK: worker_state_machine.is_active.check:
; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]] ; CHECK-NEXT: br i1 [[WORKER_IS_ACTIVE]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK:%.*]], label [[WORKER_STATE_MACHINE_DONE_BARRIER:%.*]]
; CHECK: worker_state_machine.parallel_region.check: ; CHECK: worker_state_machine.parallel_region.check:
; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], @__omp_outlined__10_wrapper ; CHECK-NEXT: [[WORKER_CHECK_PARALLEL_REGION:%.*]] = icmp eq void (i16, i32)* [[WORKER_WORK_FN_ADDR_CAST]], bitcast (i8* @__omp_outlined__10_wrapper.ID to void (i16, i32)*)
; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]] ; CHECK-NEXT: br i1 [[WORKER_CHECK_PARALLEL_REGION]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_EXECUTE:%.*]], label [[WORKER_STATE_MACHINE_PARALLEL_REGION_CHECK1:%.*]]
; CHECK: worker_state_machine.parallel_region.execute: ; CHECK: worker_state_machine.parallel_region.execute:
; CHECK-NEXT: call void @__omp_outlined__10_wrapper(i16 0, i32 [[TMP0]]) ; CHECK-NEXT: call void @__omp_outlined__10_wrapper(i16 0, i32 [[TMP0]])
@ -1456,10 +1456,10 @@ attributes #10 = { convergent nounwind readonly willreturn }
; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8 ; CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[DOTGLOBAL_TID_]], align 4
; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8** ; CHECK-NEXT: [[TMP1:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__10_wrapper to i8*), i8** noundef [[TMP1]], i64 noundef 0) ; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__10 to i8*), i8* noundef @__omp_outlined__10_wrapper.ID, i8** noundef [[TMP1]], i64 noundef 0)
; CHECK-NEXT: call void @unknown_no_openmp() #[[ATTR9:[0-9]+]] ; CHECK-NEXT: call void @unknown_no_openmp() #[[ATTR9:[0-9]+]]
; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8** ; CHECK-NEXT: [[TMP2:%.*]] = bitcast [0 x i8*]* [[CAPTURED_VARS_ADDRS1]] to i8**
; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__11_wrapper to i8*), i8** noundef [[TMP2]], i64 noundef 0) ; CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* noundef @[[GLOB1]], i32 [[TMP0]], i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__11 to i8*), i8* noundef @__omp_outlined__11_wrapper.ID, i8** noundef [[TMP2]], i64 noundef 0)
; CHECK-NEXT: ret void ; CHECK-NEXT: ret void
; ;
; ;

View File

@ -1,6 +1,5 @@
; RUN: opt -S -passes=openmp-opt-cgscc -openmp-ir-builder-optimistic-attributes -pass-remarks=openmp-opt -openmp-print-gpu-kernels < %s | FileCheck %s ; RUN: opt -S -passes=openmp-opt -openmp-ir-builder-optimistic-attributes -pass-remarks=openmp-opt -openmp-print-gpu-kernels < %s | FileCheck %s
; RUN: opt -S -passes=openmp-opt-cgscc -pass-remarks=openmp-opt -openmp-print-gpu-kernels < %s | FileCheck %s ; RUN: opt -S -passes=openmp-opt -pass-remarks=openmp-opt -openmp-print-gpu-kernels < %s | FileCheck %s
; RUN: opt -S -openmp-opt-cgscc -pass-remarks=openmp-opt -openmp-print-gpu-kernels < %s | FileCheck %s
; C input used for this test: ; C input used for this test:
@ -14,6 +13,7 @@
; #pragma omp parallel ; #pragma omp parallel
; {} ; {}
; bar(); ; bar();
; unknown();
; #pragma omp parallel ; #pragma omp parallel
; {} ; {}
; } ; }
@ -24,146 +24,49 @@
; another kernel. ; another kernel.
; CHECK-DAG: @__omp_outlined__1_wrapper.ID = private constant i8 undef ; CHECK-DAG: @__omp_outlined__1_wrapper.ID = private constant i8 undef
; CHECK-DAG: @__omp_outlined__3_wrapper.ID = private constant i8 undef ; CHECK-DAG: @__omp_outlined__2_wrapper.ID = private constant i8 undef
; CHECK-DAG: icmp eq i8* %5, @__omp_outlined__1_wrapper.ID ; CHECK-DAG: icmp eq void (i16, i32)* %worker.work_fn.addr_cast, bitcast (i8* @__omp_outlined__1_wrapper.ID to void (i16, i32)*)
; CHECK-DAG: icmp eq i8* %7, @__omp_outlined__3_wrapper.ID ; CHECK-DAG: icmp eq void (i16, i32)* %worker.work_fn.addr_cast, bitcast (i8* @__omp_outlined__2_wrapper.ID to void (i16, i32)*)
; CHECK-DAG: call void @__kmpc_parallel_51(%struct.ident_t* noundef @1, i32 %1, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef @__omp_outlined__1_wrapper.ID, i8** noundef %2, i64 noundef 0)
; CHECK-DAG: call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** %1, i64 0) ; CHECK-DAG: call void @__kmpc_parallel_51(%struct.ident_t* noundef @1, i32 %{{.*}}, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* noundef @__omp_outlined__1_wrapper.ID, i8** noundef %{{.*}}, i64 noundef 0)
; CHECK-DAG: call void @__kmpc_parallel_51(%struct.ident_t* noundef @1, i32 %1, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef @__omp_outlined__3_wrapper.ID, i8** noundef %3, i64 noundef 0) ; CHECK-DAG: call void @__kmpc_parallel_51(%struct.ident_t* noundef @1, i32 %{{.*}}, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* noundef @__omp_outlined__2_wrapper.ID, i8** noundef %{{.*}}, i64 noundef 0)
; CHECK-DAG: call void @__kmpc_parallel_51(%struct.ident_t* noundef @2, i32 %{{.*}}, i32 noundef 1, i32 noundef -1, i32 noundef -1, i8* noundef bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* noundef bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** noundef %{{.*}}, i64 noundef 0)
%struct.ident_t = type { i32, i32, i32, i32, i8* } %struct.ident_t = type { i32, i32, i32, i32, i8* }
@0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1 @0 = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00", align 1
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8 @1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 0, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
@__omp_offloading_10301_87b2c_foo_l7_exec_mode = weak constant i8 1
@2 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 2, i32 0, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @0, i32 0, i32 0) }, align 8
@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_10301_87b2c_foo_l7_exec_mode], section "llvm.metadata"
define internal void @__omp_offloading_50_6dfa0f01_foo_l6_worker() { define weak void @__omp_offloading_10301_87b2c_foo_l7() {
entry:
%work_fn = alloca i8*, align 8
%exec_status = alloca i8, align 1
store i8* null, i8** %work_fn, align 8
store i8 0, i8* %exec_status, align 1
br label %.await.work
.await.work: ; preds = %.barrier.parallel, %entry
call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
%0 = call i1 @__kmpc_kernel_parallel(i8** %work_fn)
%1 = zext i1 %0 to i8
store i8 %1, i8* %exec_status, align 1
%2 = load i8*, i8** %work_fn, align 8
%should_terminate = icmp eq i8* %2, null
br i1 %should_terminate, label %.exit, label %.select.workers
.select.workers: ; preds = %.await.work
%3 = load i8, i8* %exec_status, align 1
%is_active = icmp ne i8 %3, 0
br i1 %is_active, label %.execute.parallel, label %.barrier.parallel
.execute.parallel: ; preds = %.select.workers
%4 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
%5 = load i8*, i8** %work_fn, align 8
%work_match = icmp eq i8* %5, bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*)
br i1 %work_match, label %.execute.fn, label %.check.next
.execute.fn: ; preds = %.execute.parallel
call void @__omp_outlined__1_wrapper(i16 zeroext 0, i32 %4)
br label %.terminate.parallel
.check.next: ; preds = %.execute.parallel
%6 = load i8*, i8** %work_fn, align 8
%work_match1 = icmp eq i8* %6, bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*)
br i1 %work_match1, label %.execute.fn2, label %.check.next3
.execute.fn2: ; preds = %.check.next
call void @__omp_outlined__2_wrapper(i16 zeroext 0, i32 %4)
br label %.terminate.parallel
.check.next3: ; preds = %.check.next
%7 = load i8*, i8** %work_fn, align 8
%work_match4 = icmp eq i8* %7, bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*)
br i1 %work_match4, label %.execute.fn5, label %.check.next6
.execute.fn5: ; preds = %.check.next3
call void @__omp_outlined__3_wrapper(i16 zeroext 0, i32 %4)
br label %.terminate.parallel
.check.next6: ; preds = %.check.next3
%8 = bitcast i8* %2 to void (i16, i32)*
call void %8(i16 0, i32 %4)
br label %.terminate.parallel
.terminate.parallel: ; preds = %.check.next6, %.execute.fn5, %.execute.fn2, %.execute.fn
call void @__kmpc_kernel_end_parallel()
br label %.barrier.parallel
.barrier.parallel: ; preds = %.terminate.parallel, %.select.workers
call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
br label %.await.work
.exit: ; preds = %.await.work
ret void
}
define weak void @__omp_offloading_50_6dfa0f01_foo_l6() {
entry: entry:
%.zero.addr = alloca i32, align 4 %.zero.addr = alloca i32, align 4
%.threadid_temp. = alloca i32, align 4 %.threadid_temp. = alloca i32, align 4
store i32 0, i32* %.zero.addr, align 4 store i32 0, i32* %.zero.addr, align 4
%nvptx_tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() %0 = call i32 @__kmpc_target_init(%struct.ident_t* @1, i1 false, i1 true, i1 true)
%nvptx_num_threads = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() %exec_user_code = icmp eq i32 %0, -1
%nvptx_warp_size = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() br i1 %exec_user_code, label %user_code.entry, label %worker.exit
%thread_limit = sub nuw i32 %nvptx_num_threads, %nvptx_warp_size
%0 = icmp ult i32 %nvptx_tid, %thread_limit
br i1 %0, label %.worker, label %.mastercheck
.worker: ; preds = %entry user_code.entry: ; preds = %entry
call void @__omp_offloading_50_6dfa0f01_foo_l6_worker() %1 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
br label %.exit store i32 %1, i32* %.threadid_temp., align 4
call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr)
call void @__kmpc_target_deinit(%struct.ident_t* @1, i1 false, i1 true)
ret void
.mastercheck: ; preds = %entry worker.exit: ; preds = %entry
%nvptx_tid1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%nvptx_num_threads2 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
%nvptx_warp_size3 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
%1 = sub nuw i32 %nvptx_warp_size3, 1
%2 = sub nuw i32 %nvptx_num_threads2, 1
%3 = xor i32 %1, -1
%master_tid = and i32 %2, %3
%4 = icmp eq i32 %nvptx_tid1, %master_tid
br i1 %4, label %.master, label %.exit
.master: ; preds = %.mastercheck
%nvptx_num_threads4 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
%nvptx_warp_size5 = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
%thread_limit6 = sub nuw i32 %nvptx_num_threads4, %nvptx_warp_size5
call void @__kmpc_kernel_init(i32 %thread_limit6, i16 1)
call void @__kmpc_data_sharing_init_stack()
%5 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1)
store i32 %5, i32* %.threadid_temp., align 4
call void @__omp_outlined__(i32* %.threadid_temp., i32* %.zero.addr)
br label %.termination.notifier
.termination.notifier: ; preds = %.master
call void @__kmpc_kernel_deinit(i16 1)
call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
br label %.exit
.exit: ; preds = %.termination.notifier, %.mastercheck, %.worker
ret void ret void
} }
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() declare i32 @__kmpc_target_init(%struct.ident_t*, i1, i1, i1)
declare void @unknown()
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
declare i32 @llvm.nvvm.read.ptx.sreg.warpsize()
declare void @__kmpc_kernel_init(i32, i16)
declare void @__kmpc_data_sharing_init_stack()
define internal void @__omp_outlined__(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
entry: entry:
%.global_tid..addr = alloca i32*, align 8 %.global_tid..addr = alloca i32*, align 8
%.bound_tid..addr = alloca i32*, align 8 %.bound_tid..addr = alloca i32*, align 8
@ -175,13 +78,14 @@ entry:
%1 = load i32, i32* %0, align 4 %1 = load i32, i32* %0, align 4
%2 = bitcast [0 x i8*]* %captured_vars_addrs to i8** %2 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** %2, i64 0) call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** %2, i64 0)
call void @bar() call void @bar()
call void @unknown()
%3 = bitcast [0 x i8*]* %captured_vars_addrs1 to i8** %3 = bitcast [0 x i8*]* %captured_vars_addrs1 to i8**
call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** %3, i64 0) call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %1, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** %3, i64 0)
ret void ret void
} }
define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) { define internal void @__omp_outlined__1(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
entry: entry:
%.global_tid..addr = alloca i32*, align 8 %.global_tid..addr = alloca i32*, align 8
%.bound_tid..addr = alloca i32*, align 8 %.bound_tid..addr = alloca i32*, align 8
@ -190,7 +94,7 @@ entry:
ret void ret void
} }
define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) { define internal void @__omp_outlined__1_wrapper(i16 zeroext %0, i32 %1) {
entry: entry:
%.addr = alloca i16, align 2 %.addr = alloca i16, align 2
%.addr1 = alloca i32, align 4 %.addr1 = alloca i32, align 4
@ -200,7 +104,7 @@ entry:
store i16 %0, i16* %.addr, align 2 store i16 %0, i16* %.addr, align 2
store i32 %1, i32* %.addr1, align 4 store i32 %1, i32* %.addr1, align 4
call void @__kmpc_get_shared_variables(i8*** %global_args) call void @__kmpc_get_shared_variables(i8*** %global_args)
call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr) call void @__omp_outlined__1(i32* %.addr1, i32* %.zero.addr)
ret void ret void
} }
@ -208,16 +112,16 @@ declare void @__kmpc_get_shared_variables(i8***)
declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64) declare void @__kmpc_parallel_51(%struct.ident_t*, i32, i32, i32, i32, i8*, i8*, i8**, i64)
define hidden void @bar() { define hidden void @bar() {
entry: entry:
%captured_vars_addrs = alloca [0 x i8*], align 8 %captured_vars_addrs = alloca [0 x i8*], align 8
%0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @1) %0 = call i32 @__kmpc_global_thread_num(%struct.ident_t* @2)
%1 = bitcast [0 x i8*]* %captured_vars_addrs to i8** %1 = bitcast [0 x i8*]* %captured_vars_addrs to i8**
call void @__kmpc_parallel_51(%struct.ident_t* @1, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__2 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__2_wrapper to i8*), i8** %1, i64 0) call void @__kmpc_parallel_51(%struct.ident_t* @2, i32 %0, i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*)* @__omp_outlined__3 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__3_wrapper to i8*), i8** %1, i64 0)
ret void ret void
} }
define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid.) { define internal void @__omp_outlined__2(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
entry: entry:
%.global_tid..addr = alloca i32*, align 8 %.global_tid..addr = alloca i32*, align 8
%.bound_tid..addr = alloca i32*, align 8 %.bound_tid..addr = alloca i32*, align 8
@ -226,7 +130,7 @@ entry:
ret void ret void
} }
define internal void @__omp_outlined__2_wrapper(i16 zeroext %0, i32 %1) { define internal void @__omp_outlined__2_wrapper(i16 zeroext %0, i32 %1) {
entry: entry:
%.addr = alloca i16, align 2 %.addr = alloca i16, align 2
%.addr1 = alloca i32, align 4 %.addr1 = alloca i32, align 4
@ -236,13 +140,15 @@ entry:
store i16 %0, i16* %.addr, align 2 store i16 %0, i16* %.addr, align 2
store i32 %1, i32* %.addr1, align 4 store i32 %1, i32* %.addr1, align 4
call void @__kmpc_get_shared_variables(i8*** %global_args) call void @__kmpc_get_shared_variables(i8*** %global_args)
call void @__omp_outlined__2(i32* %.addr1, i32* %.zero.addr) call void @__omp_outlined__2(i32* %.addr1, i32* %.zero.addr)
ret void ret void
} }
declare i32 @__kmpc_global_thread_num(%struct.ident_t*) declare i32 @__kmpc_global_thread_num(%struct.ident_t*)
define internal void @__omp_outlined__3(i32* noalias %.global_tid., i32* noalias %.bound_tid.) { declare void @__kmpc_target_deinit(%struct.ident_t*, i1, i1)
define internal void @__omp_outlined__3(i32* noalias %.global_tid., i32* noalias %.bound_tid.) {
entry: entry:
%.global_tid..addr = alloca i32*, align 8 %.global_tid..addr = alloca i32*, align 8
%.bound_tid..addr = alloca i32*, align 8 %.bound_tid..addr = alloca i32*, align 8
@ -251,7 +157,7 @@ entry:
ret void ret void
} }
define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) { define internal void @__omp_outlined__3_wrapper(i16 zeroext %0, i32 %1) {
entry: entry:
%.addr = alloca i16, align 2 %.addr = alloca i16, align 2
%.addr1 = alloca i32, align 4 %.addr1 = alloca i32, align 4
@ -261,22 +167,15 @@ entry:
store i16 %0, i16* %.addr, align 2 store i16 %0, i16* %.addr, align 2
store i32 %1, i32* %.addr1, align 4 store i32 %1, i32* %.addr1, align 4
call void @__kmpc_get_shared_variables(i8*** %global_args) call void @__kmpc_get_shared_variables(i8*** %global_args)
call void @__omp_outlined__3(i32* %.addr1, i32* %.zero.addr) call void @__omp_outlined__3(i32* %.addr1, i32* %.zero.addr)
ret void ret void
} }
declare void @__kmpc_kernel_deinit(i16) !omp_offload.info = !{!0}
declare void @__kmpc_barrier_simple_spmd(%struct.ident_t*, i32)
declare i1 @__kmpc_kernel_parallel(i8**)
declare void @__kmpc_kernel_end_parallel()
!nvvm.annotations = !{!1} !nvvm.annotations = !{!1}
!llvm.module.flags = !{!2, !3} !llvm.module.flags = !{!2, !3}
!1 = !{void ()* @__omp_offloading_50_6dfa0f01_foo_l6, !"kernel", i32 1} !0 = !{i32 0, i32 66305, i32 555956, !"foo", i32 7, i32 0}
!1 = !{void ()* @__omp_offloading_10301_87b2c_foo_l7, !"kernel", i32 1}
!2 = !{i32 7, !"openmp", i32 50} !2 = !{i32 7, !"openmp", i32 50}
!3 = !{i32 7, !"openmp-device", i32 50} !3 = !{i32 7, !"openmp-device", i32 50}