mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-25 20:23:11 +01:00
[AMDGPU] Emit metadata for hidden arguments for kernel enqueue
Identify kernels which perform device-side kernel enqueues and emit metadata for the associated hidden kernel arguments. Such kernels are marked with the calls-enqueue-kernel function attribute by the AMDGPUOpenCLEnqueuedBlockLowering pass, and the hidden kernel argument metadata HiddenDefaultQueue and HiddenCompletionAction is later emitted for them. Differential Revision: https://reviews.llvm.org/D39255 llvm-svn: 316907
This commit is contained in:
parent
32210b316a
commit
3938c6fc0d
@ -1039,10 +1039,10 @@ non-AMD key names should be prefixed by "*vendor-name*.".
|
||||
passed in the kernarg.
|
||||
|
||||
"HiddenCompletionAction"
|
||||
*TBD*
|
||||
|
||||
.. TODO
|
||||
Add description.
|
||||
A global address space pointer
|
||||
to help link enqueued kernels into
|
||||
the ancestor tree for determining
|
||||
when the parent kernel has finished.
|
||||
|
||||
"ValueType" string Required Kernel argument value type. Only
|
||||
present if "ValueKind" is
|
||||
|
@ -25,12 +25,20 @@
|
||||
// linkage does not work since optimization passes will try to replace loads
|
||||
// of the global variable with its initialization value.
|
||||
//
|
||||
// It also identifies the kernels which directly or indirectly enqueue kernels
|
||||
// and adds "calls-enqueue-kernel" function attribute to them, which will
|
||||
// be used to determine whether to emit runtime metadata for the kernel
|
||||
// enqueue related hidden kernel arguments.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AMDGPU.h"
|
||||
#include "llvm/ADT/DenseSet.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/IR/Constants.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/User.h"
|
||||
#include "llvm/Pass.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
@ -66,7 +74,22 @@ ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() {
|
||||
return new AMDGPUOpenCLEnqueuedBlockLowering();
|
||||
}
|
||||
|
||||
/// Collect direct or indrect callers of \p F and save them
|
||||
/// to \p Callers.
|
||||
static void collectCallers(Function *F, DenseSet<Function *> &Callers) {
|
||||
for (auto U : F->users()) {
|
||||
if (auto *CI = dyn_cast<CallInst>(&*U)) {
|
||||
auto *Caller = CI->getParent()->getParent();
|
||||
if (Callers.count(Caller))
|
||||
continue;
|
||||
Callers.insert(Caller);
|
||||
collectCallers(Caller, Callers);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
|
||||
DenseSet<Function *> Callers;
|
||||
auto &C = M.getContext();
|
||||
auto AS = AMDGPU::getAMDGPUAS(M);
|
||||
bool Changed = false;
|
||||
@ -91,8 +114,23 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
|
||||
AddrCast->replaceAllUsesWith(NewPtr);
|
||||
F.addFnAttr("runtime-handle", RuntimeHandle);
|
||||
F.setLinkage(GlobalValue::ExternalLinkage);
|
||||
|
||||
// Collect direct or indirect callers of enqueue_kernel.
|
||||
for (auto U : NewPtr->users()) {
|
||||
if (auto *I = dyn_cast<Instruction>(&*U)) {
|
||||
auto *F = I->getParent()->getParent();
|
||||
Callers.insert(F);
|
||||
collectCallers(F, Callers);
|
||||
}
|
||||
}
|
||||
Changed = true;
|
||||
}
|
||||
}
|
||||
|
||||
for (auto F : Callers) {
|
||||
if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL)
|
||||
continue;
|
||||
F->addFnAttr("calls-enqueue-kernel");
|
||||
}
|
||||
return Changed;
|
||||
}
|
||||
|
@ -266,12 +266,21 @@ void MetadataStreamer::emitKernelArgs(const Function &Func) {
|
||||
emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY);
|
||||
emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ);
|
||||
|
||||
if (!Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
|
||||
return;
|
||||
|
||||
auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
|
||||
AMDGPUASI.GLOBAL_ADDRESS);
|
||||
emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
|
||||
auto CallsPrintf = Func.getParent()->getNamedMetadata("llvm.printf.fmts");
|
||||
if (CallsPrintf)
|
||||
emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
|
||||
if (Func.hasFnAttribute("calls-enqueue-kernel")) {
|
||||
if (!CallsPrintf) {
|
||||
// Emit a dummy argument so that the remaining hidden arguments
|
||||
// have a fixed position relative to the first hidden argument.
|
||||
// This is to facilitate library code to access hidden arguments.
|
||||
emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenNone);
|
||||
}
|
||||
emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenDefaultQueue);
|
||||
emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenCompletionAction);
|
||||
}
|
||||
}
|
||||
|
||||
void MetadataStreamer::emitKernelArg(const Argument &Arg) {
|
||||
|
@ -9,7 +9,21 @@ target triple = "amdgcn-amdhsa-amd-opencl"
|
||||
%struct.ndrange_t = type { i32 }
|
||||
%opencl.queue_t = type opaque
|
||||
|
||||
define amdgpu_kernel void @test(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
|
||||
; CHECK: define amdgpu_kernel void @non_caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr !kernel_arg_addr_space
|
||||
define amdgpu_kernel void @non_caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
|
||||
!kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: define amdgpu_kernel void @caller_indirect(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr #[[AT_CALLER:[0-9]+]]
|
||||
define amdgpu_kernel void @caller_indirect(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
|
||||
!kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
|
||||
call void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: define amdgpu_kernel void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr #[[AT_CALLER]]
|
||||
define amdgpu_kernel void @caller(i8 addrspace(1)* %a, i8 %b, i64 addrspace(1)* %c, i64 %d) local_unnamed_addr
|
||||
!kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
|
||||
entry:
|
||||
%block = alloca <{ i32, i32, i8 addrspace(4)*, i8 addrspace(1)*, i8 }>, align 8
|
||||
@ -77,6 +91,7 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: attributes #[[AT_CALLER]] = { "calls-enqueue-kernel" }
|
||||
; CHECK: attributes #[[AT1]] = {{.*}}"runtime-handle"="__test_block_invoke_kernel_runtime_handle"
|
||||
; CHECK: attributes #[[AT2]] = {{.*}}"runtime-handle"="__test_block_invoke_2_kernel_runtime_handle"
|
||||
|
||||
|
96
test/CodeGen/AMDGPU/hsa-metadata-enqueu-kernel.ll
Normal file
96
test/CodeGen/AMDGPU/hsa-metadata-enqueu-kernel.ll
Normal file
@ -0,0 +1,96 @@
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
|
||||
|
||||
; CHECK: ---
|
||||
; CHECK: Version: [ 1, 0 ]
|
||||
; CHECK-NOT: Printf:
|
||||
; CHECK: Kernels:
|
||||
|
||||
; CHECK: - Name: test_non_enqueue_kernel_caller
|
||||
; CHECK-NEXT: SymbolName: 'test_non_enqueue_kernel_caller@kd'
|
||||
; CHECK-NEXT: Language: OpenCL C
|
||||
; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
|
||||
; CHECK-NEXT: Args:
|
||||
; CHECK-NEXT: - TypeName: char
|
||||
; CHECK-NEXT: Size: 1
|
||||
; CHECK-NEXT: Align: 1
|
||||
; CHECK-NEXT: ValueKind: ByValue
|
||||
; CHECK-NEXT: ValueType: I8
|
||||
; CHECK-NEXT: AccQual: Default
|
||||
; CHECK-NEXT: - Size: 8
|
||||
; CHECK-NEXT: Align: 8
|
||||
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
|
||||
; CHECK-NEXT: ValueType: I64
|
||||
; CHECK-NEXT: - Size: 8
|
||||
; CHECK-NEXT: Align: 8
|
||||
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
|
||||
; CHECK-NEXT: ValueType: I64
|
||||
; CHECK-NEXT: - Size: 8
|
||||
; CHECK-NEXT: Align: 8
|
||||
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
|
||||
; CHECK-NEXT: ValueType: I64
|
||||
; CHECK-NOT: ValueKind: HiddenNone
|
||||
; CHECK-NOT: ValueKind: HiddenDefaultQueue
|
||||
; CHECK-NOT: ValueKind: HiddenCompletionAction
|
||||
define amdgpu_kernel void @test_non_enqueue_kernel_caller(i8 %a)
|
||||
!kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
|
||||
!kernel_arg_base_type !3 !kernel_arg_type_qual !4 {
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: - Name: test_enqueue_kernel_caller
|
||||
; CHECK-NEXT: SymbolName: 'test_enqueue_kernel_caller@kd'
|
||||
; CHECK-NEXT: Language: OpenCL C
|
||||
; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
|
||||
; CHECK-NEXT: Args:
|
||||
; CHECK-NEXT: - TypeName: char
|
||||
; CHECK-NEXT: Size: 1
|
||||
; CHECK-NEXT: Align: 1
|
||||
; CHECK-NEXT: ValueKind: ByValue
|
||||
; CHECK-NEXT: ValueType: I8
|
||||
; CHECK-NEXT: AccQual: Default
|
||||
; CHECK-NEXT: - Size: 8
|
||||
; CHECK-NEXT: Align: 8
|
||||
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
|
||||
; CHECK-NEXT: ValueType: I64
|
||||
; CHECK-NEXT: - Size: 8
|
||||
; CHECK-NEXT: Align: 8
|
||||
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
|
||||
; CHECK-NEXT: ValueType: I64
|
||||
; CHECK-NEXT: - Size: 8
|
||||
; CHECK-NEXT: Align: 8
|
||||
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
|
||||
; CHECK-NEXT: ValueType: I64
|
||||
; CHECK-NEXT: - Size: 8
|
||||
; CHECK-NEXT: Align: 8
|
||||
; CHECK-NEXT: ValueKind: HiddenNone
|
||||
; CHECK-NEXT: ValueType: I8
|
||||
; CHECK-NEXT: AddrSpaceQual: Global
|
||||
; CHECK-NEXT: - Size: 8
|
||||
; CHECK-NEXT: Align: 8
|
||||
; CHECK-NEXT: ValueKind: HiddenDefaultQueue
|
||||
; CHECK-NEXT: ValueType: I8
|
||||
; CHECK-NEXT: AddrSpaceQual: Global
|
||||
; CHECK-NEXT: - Size: 8
|
||||
; CHECK-NEXT: Align: 8
|
||||
; CHECK-NEXT: ValueKind: HiddenCompletionAction
|
||||
; CHECK-NEXT: ValueType: I8
|
||||
; CHECK-NEXT: AddrSpaceQual: Global
|
||||
define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #0
|
||||
!kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
|
||||
!kernel_arg_base_type !3 !kernel_arg_type_qual !4 {
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "calls-enqueue-kernel" }
|
||||
|
||||
!1 = !{i32 0}
|
||||
!2 = !{!"none"}
|
||||
!3 = !{!"char"}
|
||||
!4 = !{!""}
|
||||
|
||||
!opencl.ocl.version = !{!90}
|
||||
!90 = !{i32 2, i32 0}
|
||||
|
||||
|
||||
; PARSER: AMDGPU HSA Metadata Parser Test: PASS
|
@ -51,6 +51,8 @@
|
||||
; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
|
||||
; CHECK-NEXT: ValueType: I8
|
||||
; CHECK-NEXT: AddrSpaceQual: Global
|
||||
; CHECK-NOT: ValueKind: HiddenDefaultQueue
|
||||
; CHECK-NOT: ValueKind: HiddenCompletionAction
|
||||
define amdgpu_kernel void @test_char(i8 %a)
|
||||
!kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9
|
||||
!kernel_arg_base_type !9 !kernel_arg_type_qual !4 {
|
||||
@ -1267,7 +1269,52 @@ define amdgpu_kernel void @__test_block_invoke_kernel(
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: - Name: test_enqueue_kernel_caller
|
||||
; CHECK-NEXT: SymbolName: 'test_enqueue_kernel_caller@kd'
|
||||
; CHECK-NEXT: Language: OpenCL C
|
||||
; CHECK-NEXT: LanguageVersion: [ 2, 0 ]
|
||||
; CHECK-NEXT: Args:
|
||||
; CHECK-NEXT: - TypeName: char
|
||||
; CHECK-NEXT: Size: 1
|
||||
; CHECK-NEXT: Align: 1
|
||||
; CHECK-NEXT: ValueKind: ByValue
|
||||
; CHECK-NEXT: ValueType: I8
|
||||
; CHECK-NEXT: AccQual: Default
|
||||
; CHECK-NEXT: - Size: 8
|
||||
; CHECK-NEXT: Align: 8
|
||||
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetX
|
||||
; CHECK-NEXT: ValueType: I64
|
||||
; CHECK-NEXT: - Size: 8
|
||||
; CHECK-NEXT: Align: 8
|
||||
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetY
|
||||
; CHECK-NEXT: ValueType: I64
|
||||
; CHECK-NEXT: - Size: 8
|
||||
; CHECK-NEXT: Align: 8
|
||||
; CHECK-NEXT: ValueKind: HiddenGlobalOffsetZ
|
||||
; CHECK-NEXT: ValueType: I64
|
||||
; CHECK-NEXT: - Size: 8
|
||||
; CHECK-NEXT: Align: 8
|
||||
; CHECK-NEXT: ValueKind: HiddenPrintfBuffer
|
||||
; CHECK-NEXT: ValueType: I8
|
||||
; CHECK-NEXT: AddrSpaceQual: Global
|
||||
; CHECK-NEXT: - Size: 8
|
||||
; CHECK-NEXT: Align: 8
|
||||
; CHECK-NEXT: ValueKind: HiddenDefaultQueue
|
||||
; CHECK-NEXT: ValueType: I8
|
||||
; CHECK-NEXT: AddrSpaceQual: Global
|
||||
; CHECK-NEXT: - Size: 8
|
||||
; CHECK-NEXT: Align: 8
|
||||
; CHECK-NEXT: ValueKind: HiddenCompletionAction
|
||||
; CHECK-NEXT: ValueType: I8
|
||||
; CHECK-NEXT: AddrSpaceQual: Global
|
||||
define amdgpu_kernel void @test_enqueue_kernel_caller(i8 %a) #1
|
||||
!kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9
|
||||
!kernel_arg_base_type !9 !kernel_arg_type_qual !4 {
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { "runtime-handle"="__test_block_invoke_kernel_runtime_handle" }
|
||||
attributes #1 = { "calls-enqueue-kernel" }
|
||||
|
||||
!llvm.printf.fmts = !{!100, !101}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user