diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 2964fac4624..70df56112de 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -328,6 +328,13 @@ def FeatureDebuggerInsertNops : SubtargetFeature< "Insert two nop instructions for each high level source statement" >; +def FeatureDebuggerReserveTrapRegs : SubtargetFeature< + "amdgpu-debugger-reserve-trap-regs", + "DebuggerReserveTrapVGPRs", + "true", + "Reserve VGPRs for trap handler usage" +>; + //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 843513d23ca..c06e4b57044 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -235,6 +235,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) + " bytes/workgroup (compile time only)", false); + OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst), + false); + OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount), + false); + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " + Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)), false); @@ -472,6 +477,14 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, MaxSGPR += ExtraSGPRs; + // Update necessary Reserved* fields and max VGPRs used if + // "amdgpu-debugger-reserve-trap-regs" was specified. + if (STM.debuggerReserveTrapVGPRs()) { + ProgInfo.ReservedVGPRFirst = MaxVGPR + 1; + ProgInfo.ReservedVGPRCount = STM.debuggerReserveTrapVGPRCount(); + MaxVGPR += STM.debuggerReserveTrapVGPRCount(); + } + // We found the maximum register index. They start at 0, so add one to get the // number of registers. 
ProgInfo.NumVGPR = MaxVGPR + 1; @@ -694,6 +707,8 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF, header.workitem_vgpr_count = KernelInfo.NumVGPR; header.workitem_private_segment_byte_size = KernelInfo.ScratchSize; header.workgroup_group_segment_byte_size = KernelInfo.LDSSize; + header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst; + header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount; AMDGPUTargetStreamer *TS = static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer()); diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 2c49ff4316b..acadcc0ebf0 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -40,6 +40,8 @@ private: NumVGPR(0), NumSGPR(0), FlatUsed(false), + ReservedVGPRFirst(0), + ReservedVGPRCount(0), VCCUsed(false), CodeLen(0) {} @@ -67,6 +69,9 @@ private: uint32_t LDSSize; bool FlatUsed; + uint16_t ReservedVGPRFirst; + uint16_t ReservedVGPRCount; + // Bonus information for debugging. 
bool VCCUsed; uint64_t CodeLen; diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 2861d68104e..2d62abd2b88 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -98,7 +98,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, LDSBankCount(0), IsaVersion(ISAVersion0_0_0), EnableSIScheduler(false), - DebuggerInsertNops(false), + DebuggerInsertNops(false), DebuggerReserveTrapVGPRs(false), FrameLowering(nullptr), GISel(), InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) { diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 645559e2c83..12e6fee7d26 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -96,6 +96,7 @@ private: unsigned IsaVersion; bool EnableSIScheduler; bool DebuggerInsertNops; + bool DebuggerReserveTrapVGPRs; std::unique_ptr FrameLowering; std::unique_ptr TLInfo; @@ -309,6 +310,14 @@ public: return DebuggerInsertNops; } + bool debuggerReserveTrapVGPRs() const { + return DebuggerReserveTrapVGPRs; + } + + unsigned debuggerReserveTrapVGPRCount() const { + return debuggerReserveTrapVGPRs() ? 4 : 0; + } + bool dumpCode() const { return DumpCode; } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 8f562b66cfb..1f09500ebf1 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -193,6 +193,17 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); } + // Reserve VGPRs for trap handler usage if "amdgpu-debugger-reserve-trap-regs" + // attribute was specified. 
+ const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>(); + if (ST.debuggerReserveTrapVGPRs()) { + for (unsigned i = MaxWorkGroupVGPRCount - ST.debuggerReserveTrapVGPRCount(); + i < MaxWorkGroupVGPRCount; ++i) { + unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i); + reserveRegisterTuples(Reserved, Reg); + } + } + return Reserved; } diff --git a/test/CodeGen/AMDGPU/debugger_reserve_trap_regs.ll b/test/CodeGen/AMDGPU/debugger_reserve_trap_regs.ll new file mode 100644 index 00000000000..2c857f688af --- /dev/null +++ b/test/CodeGen/AMDGPU/debugger_reserve_trap_regs.ll @@ -0,0 +1,37 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-reserve-trap-regs -verify-machineinstrs < %s | FileCheck %s + +; CHECK: reserved_vgpr_count = 4 +; CHECK: ReservedVGPRCount: 4 + +; Function Attrs: nounwind +define void @debugger_reserve_trap_regs(i32 addrspace(1)* %A) #0 { +entry: + %A.addr = alloca i32 addrspace(1)*, align 4 + store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 + %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 0 + store i32 1, i32 addrspace(1)* %arrayidx, align 4 + %1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 1 + store i32 2, i32 addrspace(1)* %arrayidx1, align 4 + %2 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %2, i32 2 + store i32 3, i32 addrspace(1)* %arrayidx2, align 4 + %3 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %3, i32 4 + store i32 4, i32 addrspace(1)* %arrayidx3, align 4 + ret void +} + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" 
"no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="fiji" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!opencl.kernels = !{!0} +!llvm.ident = !{!6} + +!0 = !{void (i32 addrspace(1)*)* @debugger_reserve_trap_regs, !1, !2, !3, !4, !5} +!1 = !{!"kernel_arg_addr_space", i32 1} +!2 = !{!"kernel_arg_access_qual", !"none"} +!3 = !{!"kernel_arg_type", !"int*"} +!4 = !{!"kernel_arg_base_type", !"int*"} +!5 = !{!"kernel_arg_type_qual", !""} +!6 = !{!"clang version 3.9.0 (trunk 266639)"}