1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-26 04:32:44 +01:00

[AMDGPU] Emit metadata for hidden arguments for kernel enqueue

Identify kernels which perform device-side kernel enqueues and emit
metadata for the associated hidden kernel arguments. Such kernels are
marked with the calls-enqueue-kernel function attribute by the
AMDGPUOpenCLEnqueueKernelLowering pass, and later on the hidden kernel
argument metadata HiddenDefaultQueue and HiddenCompletionAction
is emitted for them.

Differential Revision: https://reviews.llvm.org/D39255

llvm-svn: 316907
This commit is contained in:
Yaxun Liu 2017-10-30 14:30:28 +00:00
parent 32210b316a
commit 3938c6fc0d
6 changed files with 214 additions and 9 deletions

View File

@ -1039,10 +1039,10 @@ non-AMD key names should be prefixed by "*vendor-name*.".
passed in the kernarg.
"HiddenCompletionAction"
*TBD*
.. TODO
Add description.
A global address space pointer
to help link enqueued kernels into
the ancestor tree for determining
when the parent kernel has finished.
"ValueType" string Required Kernel argument value type. Only
present if "ValueKind" is

View File

@ -25,12 +25,20 @@
// linkage does not work since optimization passes will try to replace loads
// of the global variable with its initialization value.
//
// It also identifies the kernels that directly or indirectly enqueue kernels
// and adds the "calls-enqueue-kernel" function attribute to them, which is
// used to determine whether to emit runtime metadata for the hidden kernel
// arguments related to kernel enqueue.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/User.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@ -66,7 +74,22 @@ ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() {
return new AMDGPUOpenCLEnqueuedBlockLowering();
}
/// Collect the direct and indirect callers of \p F and save them
/// to \p Callers.
///
/// Every CallInst found among the users of \p F contributes the function that
/// contains it. A function that is newly added to \p Callers is then processed
/// recursively, so the set accumulates the transitive callers. Functions that
/// are already in \p Callers are not revisited, which also terminates the walk
/// when the call graph is recursive.
///
/// NOTE(review): any CallInst that uses \p F is counted, so a call that merely
/// passes \p F as an operand appears to be treated as a caller too -- confirm
/// that this is intended.
///
/// \param F       Function whose callers are collected.
/// \param Callers Set that receives the callers; existing entries are kept.
static void collectCallers(Function *F, DenseSet<Function *> &Callers) {
  for (auto *U : F->users()) {
    auto *CI = dyn_cast<CallInst>(U);
    if (!CI)
      continue;
    auto *Caller = CI->getParent()->getParent();
    // insert() reports whether Caller was newly added, so a single lookup
    // serves as both the visited check and the insertion.
    if (Callers.insert(Caller).second)
      collectCallers(Caller, Callers);
  }
}
bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
DenseSet<Function *> Callers;
auto &C = M.getContext();
auto AS = AMDGPU::getAMDGPUAS(M);
bool Changed = false;
@ -91,8 +114,23 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
AddrCast->replaceAllUsesWith(NewPtr);
F.addFnAttr("runtime-handle", RuntimeHandle);
F.setLinkage(GlobalValue::ExternalLinkage);
// Collect direct or indirect callers of enqueue_kernel.
for (auto U : NewPtr->users()) {
if (auto *I = dyn_cast<Instruction>(&*U)) {
auto *F = I->getParent()->getParent();
Callers.insert(F);
collectCallers(F, Callers);
}
}
Changed = true;
}
}
for (auto F : Callers) {
if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL)
continue;
F->addFnAttr("calls-enqueue-kernel");
}
return Changed;
}

View File

@ -266,12 +266,21 @@ void MetadataStreamer::emitKernelArgs(const Function &Func) {
emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY);
emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ);
if (!Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
return;
auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
AMDGPUASI.GLOBAL_ADDRESS);
auto CallsPrintf = Func.getParent()->getNamedMetadata("llvm.printf.fmts");
if (CallsPrintf)
emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
if (Func.hasFnAttribute("calls-enqueue-kernel")) {
if (!CallsPrintf) {
// Emit a dummy argument so that the remaining hidden arguments
// have a fixed position relative to the first hidden argument.
// This is to facilitate library code to access hidden arguments.
emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
}
emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue);
emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction);
}
}
void MetadataStreamer::emitKernelArg(const Argument &Arg) {

View File

@ -9,7 +9,21 @@ target triple = "amdgcn-amdhsa-amd-opencl"
%struct.ndrange_t = type { i32 }
%opencl.queue_t = type opaque
define amdgpu_kernel void @test(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
; CHECK: define amdgpu_kernel void @non_caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr !kernel_arg_addr_space
define amdgpu_kernel void @non_caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
!kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
ret void
}
; CHECK: define amdgpu_kernel void @caller_indirect(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr #[[AT_CALLER:[0-9]+]]
define amdgpu_kernel void @caller_indirect(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
!kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
call void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d)
ret void
}
; CHECK: define amdgpu_kernel void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr #[[AT_CALLER]]
define amdgpu_kernel void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
!kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
entry:
%block = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, align 8
@ -77,6 +91,7 @@ entry:
ret void
}
; CHECK: attributes #[[AT_CALLER]] = { "calls-enqueue-kernel" }
; CHECK: attributes #[[AT1]] = {{.*}}"runtime-handle"="__test_block_invoke_kernel_runtime_handle"
; CHECK: attributes #[[AT2]] = {{.*}}"runtime-handle"="__test_block_invoke_2_kernel_runtime_handle"

View File

@ -0,0 +1,96 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
; CHECK: ---
; CHECK: Version: [ 1, 0 ]
; CHECK-NOT: Printf:
; CHECK: Kernels:
; CHECK: - Name: test_non_enqueue_kernel_caller
; CHECK-NEXT: SymbolName: 'test_non_enqueue_kernel_caller@kd'
; CHECK-NEXT: Language: OpenCL C
; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
; CHECK-NEXT: Args:
; CHECK-NEXT: - TypeName: char
; CHECK-NEXT: Size: 1
; CHECK-NEXT: Align: 1
; CHECK-NEXT: ValueKind: ByValue
; CHECK-NEXT: ValueType: I8
; CHECK-NEXT: AccQual: Default
; CHECK-NEXT: - Size: 8
; CHECK-NEXT: Align: 8
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
; CHECK-NEXT: ValueType: I64
; CHECK-NEXT: - Size: 8
; CHECK-NEXT: Align: 8
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
; CHECK-NEXT: ValueType: I64
; CHECK-NEXT: - Size: 8
; CHECK-NEXT: Align: 8
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
; CHECK-NEXT: ValueType: I64
; CHECK-NOT: ValueKind: HiddenNone
; CHECK-NOT: ValueKind: HiddenDefaultQueue
; CHECK-NOT: ValueKind: HiddenCompletionAction
define amdgpu_kernel void @test_non_enqueue_kernel_caller(i8 %a)
!kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
!kernel_arg_base_type !3 !kernel_arg_type_qual !4 {
ret void
}
; CHECK: - Name: test_enqueue_kernel_caller
; CHECK-NEXT: SymbolName: 'test_enqueue_kernel_caller@kd'
; CHECK-NEXT: Language: OpenCL C
; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
; CHECK-NEXT: Args:
; CHECK-NEXT: - TypeName: char
; CHECK-NEXT: Size: 1
; CHECK-NEXT: Align: 1
; CHECK-NEXT: ValueKind: ByValue
; CHECK-NEXT: ValueType: I8
; CHECK-NEXT: AccQual: Default
; CHECK-NEXT: - Size: 8
; CHECK-NEXT: Align: 8
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
; CHECK-NEXT: ValueType: I64
; CHECK-NEXT: - Size: 8
; CHECK-NEXT: Align: 8
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
; CHECK-NEXT: ValueType: I64
; CHECK-NEXT: - Size: 8
; CHECK-NEXT: Align: 8
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
; CHECK-NEXT: ValueType: I64
; CHECK-NEXT: - Size: 8
; CHECK-NEXT: Align: 8
; CHECK-NEXT: ValueKind: HiddenNone
; CHECK-NEXT: ValueType: I8
; CHECK-NEXT: AddrSpaceQual: Global
; CHECK-NEXT: - Size: 8
; CHECK-NEXT: Align: 8
; CHECK-NEXT: ValueKind: HiddenDefaultQueue
; CHECK-NEXT: ValueType: I8
; CHECK-NEXT: AddrSpaceQual: Global
; CHECK-NEXT: - Size: 8
; CHECK-NEXT: Align: 8
; CHECK-NEXT: ValueKind: HiddenCompletionAction
; CHECK-NEXT: ValueType: I8
; CHECK-NEXT: AddrSpaceQual: Global
define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #0
!kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
!kernel_arg_base_type !3 !kernel_arg_type_qual !4 {
ret void
}
attributes #0 = { "calls-enqueue-kernel" }
!1 = !{i32 0}
!2 = !{!"none"}
!3 = !{!"char"}
!4 = !{!""}
!opencl.ocl.version = !{!90}
!90 = !{i32 2, i32 0}
; PARSER: AMDGPU HSA Metadata Parser Test: PASS

View File

@ -51,6 +51,8 @@
; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
; CHECK-NEXT: ValueType: I8
; CHECK-NEXT: AddrSpaceQual: Global
; CHECK-NOT: ValueKind: HiddenDefaultQueue
; CHECK-NOT: ValueKind: HiddenCompletionAction
define amdgpu_kernel void @test_char(i8 %a)
!kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9
!kernel_arg_base_type !9 !kernel_arg_type_qual !4 {
@ -1267,7 +1269,52 @@ define amdgpu_kernel void @__test_block_invoke_kernel(
ret void
}
; CHECK: - Name: test_enqueue_kernel_caller
; CHECK-NEXT: SymbolName: 'test_enqueue_kernel_caller@kd'
; CHECK-NEXT: Language: OpenCL C
; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
; CHECK-NEXT: Args:
; CHECK-NEXT: - TypeName: char
; CHECK-NEXT: Size: 1
; CHECK-NEXT: Align: 1
; CHECK-NEXT: ValueKind: ByValue
; CHECK-NEXT: ValueType: I8
; CHECK-NEXT: AccQual: Default
; CHECK-NEXT: - Size: 8
; CHECK-NEXT: Align: 8
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
; CHECK-NEXT: ValueType: I64
; CHECK-NEXT: - Size: 8
; CHECK-NEXT: Align: 8
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
; CHECK-NEXT: ValueType: I64
; CHECK-NEXT: - Size: 8
; CHECK-NEXT: Align: 8
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
; CHECK-NEXT: ValueType: I64
; CHECK-NEXT: - Size: 8
; CHECK-NEXT: Align: 8
; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
; CHECK-NEXT: ValueType: I8
; CHECK-NEXT: AddrSpaceQual: Global
; CHECK-NEXT: - Size: 8
; CHECK-NEXT: Align: 8
; CHECK-NEXT: ValueKind: HiddenDefaultQueue
; CHECK-NEXT: ValueType: I8
; CHECK-NEXT: AddrSpaceQual: Global
; CHECK-NEXT: - Size: 8
; CHECK-NEXT: Align: 8
; CHECK-NEXT: ValueKind: HiddenCompletionAction
; CHECK-NEXT: ValueType: I8
; CHECK-NEXT: AddrSpaceQual: Global
define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #1
!kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9
!kernel_arg_base_type !9 !kernel_arg_type_qual !4 {
ret void
}
attributes #0 = { "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
attributes #1 = { "calls-enqueue-kernel" }
!llvm.printf.fmts = !{!100, !101}