1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-26 04:32:44 +01:00

[AMDGPU] Add support for architected flat scratch

Add support for the readonly flat Scratch register initialized
by the SPI.

Differential Revision: https://reviews.llvm.org/D102432
This commit is contained in:
Stanislav Mekhanoshin 2021-04-12 14:40:17 -07:00
parent 8a2b8ef8d1
commit af64ca04f5
14 changed files with 196 additions and 71 deletions

View File

@ -3657,14 +3657,22 @@ The fields used by CP for code objects before V3 also match those specified in
``compute_pgm_rsrc2.user_sgpr.user_sgpr_count``. ``compute_pgm_rsrc2.user_sgpr.user_sgpr_count``.
Any requests beyond 16 Any requests beyond 16
will be ignored. will be ignored.
>448 1 bit ENABLE_SGPR_PRIVATE_SEGMENT >448 1 bit ENABLE_SGPR_PRIVATE_SEGMENT If the *Target Properties*
_BUFFER _BUFFER column of
:ref:`amdgpu-processor-table`
specifies *Architected flat
scratch* then not supported
and must be 0.
>449 1 bit ENABLE_SGPR_DISPATCH_PTR >449 1 bit ENABLE_SGPR_DISPATCH_PTR
>450 1 bit ENABLE_SGPR_QUEUE_PTR >450 1 bit ENABLE_SGPR_QUEUE_PTR
>451 1 bit ENABLE_SGPR_KERNARG_SEGMENT_PTR >451 1 bit ENABLE_SGPR_KERNARG_SEGMENT_PTR
>452 1 bit ENABLE_SGPR_DISPATCH_ID >452 1 bit ENABLE_SGPR_DISPATCH_ID
>453 1 bit ENABLE_SGPR_FLAT_SCRATCH_INIT >453 1 bit ENABLE_SGPR_FLAT_SCRATCH_INIT If the *Target Properties*
column of
:ref:`amdgpu-processor-table`
specifies *Architected flat
scratch* then not supported
and must be 0.
>454 1 bit ENABLE_SGPR_PRIVATE_SEGMENT >454 1 bit ENABLE_SGPR_PRIVATE_SEGMENT
_SIZE _SIZE
457:455 3 bits Reserved, must be 0. 457:455 3 bits Reserved, must be 0.
@ -3984,14 +3992,27 @@ The fields used by CP for code objects before V3 also match those specified in
======= ======= =============================== =========================================================================== ======= ======= =============================== ===========================================================================
Bits Size Field Name Description Bits Size Field Name Description
======= ======= =============================== =========================================================================== ======= ======= =============================== ===========================================================================
0 1 bit ENABLE_PRIVATE_SEGMENT Enable the setup of the 0 1 bit ENABLE_PRIVATE_SEGMENT * Enable the setup of the
private segment. private segment.
* If the *Target Properties*
In addition, enable the column of
:ref:`amdgpu-processor-table`
does not specify
*Architected flat
scratch* then enable the
setup of the SGPR setup of the SGPR
wavefront scratch offset wavefront scratch offset
system register (see system register (see
:ref:`amdgpu-amdhsa-initial-kernel-execution-state`). :ref:`amdgpu-amdhsa-initial-kernel-execution-state`).
* If the *Target Properties*
column of
:ref:`amdgpu-processor-table`
specifies *Architected
flat scratch* then enable
the setup of the
FLAT_SCRATCH register
pair (see
:ref:`amdgpu-amdhsa-initial-kernel-execution-state`).
Used by CP to set up Used by CP to set up
``COMPUTE_PGM_RSRC2.SCRATCH_EN``. ``COMPUTE_PGM_RSRC2.SCRATCH_EN``.
@ -4550,12 +4571,26 @@ There are different methods used for initializing flat scratch:
segment address when using the Scratch Segment Buffer (see segment address when using the Scratch Segment Buffer (see
:ref:`amdgpu-amdhsa-kernel-prolog-private-segment-buffer`). :ref:`amdgpu-amdhsa-kernel-prolog-private-segment-buffer`).
* If the *Target Properties* column of :ref:`amdgpu-processor-table`
specifies *Architected flat scratch*:
If ENABLE_PRIVATE_SEGMENT is enabled in
:ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx10-table` then the FLAT_SCRATCH
register pair will be initialized to the 64-bit address of the base of scratch
backing memory being managed by SPI for the queue executing the kernel
dispatch plus the value of the wave's Scratch Wavefront Offset for use as the
flat scratch base in flat memory instructions.
.. _amdgpu-amdhsa-kernel-prolog-private-segment-buffer: .. _amdgpu-amdhsa-kernel-prolog-private-segment-buffer:
Private Segment Buffer Private Segment Buffer
++++++++++++++++++++++ ++++++++++++++++++++++
Private Segment Buffer SGPR register is used to initialize 4 SGPRs If the *Target Properties* column of :ref:`amdgpu-processor-table` specifies
*Architected flat scratch* then a Private Segment Buffer is not supported.
Instead the flat SCRATCH instructions are used.
Otherwise, Private Segment Buffer SGPR register is used to initialize 4 SGPRs
that are used as a V# to access scratch. CP uses the value provided by the that are used as a V# to access scratch. CP uses the value provided by the
runtime. It is used, together with Scratch Wavefront Offset as an offset, to runtime. It is used, together with Scratch Wavefront Offset as an offset, to
access the private memory space using a segment address. See access the private memory space using a segment address. See

View File

@ -731,6 +731,12 @@ def FeaturePackedTID : SubtargetFeature<"packed-tid",
"Workitem IDs are packed into v0 at kernel launch" "Workitem IDs are packed into v0 at kernel launch"
>; >;
// Subtarget feature "architected-flat-scratch": sets HasArchitectedFlatScratch
// to "true". Per its description string, the flat scratch register is then a
// readonly architected register initialized by the SPI.
def FeatureArchitectedFlatScratch : SubtargetFeature<"architected-flat-scratch",
"HasArchitectedFlatScratch",
"true",
"Flat Scratch register is a readonly SPI initialized architected register"
>;
// Dummy feature used to disable assembler instructions. // Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"", def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true", "FeatureDisable","true",

View File

@ -723,7 +723,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
const SIRegisterInfo &TRI = TII->getRegisterInfo(); const SIRegisterInfo &TRI = TII->getRegisterInfo();
Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) || Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI); MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
MRI.isLiveIn(MFI->getPreloadedReg(
AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
// Even if FLAT_SCRATCH is implicitly used, it has no effect if flat // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
// instructions aren't used to access the scratch buffer. Inline assembly may // instructions aren't used to access the scratch buffer. Inline assembly may

View File

@ -289,6 +289,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FlatGlobalInsts(false), FlatGlobalInsts(false),
FlatScratchInsts(false), FlatScratchInsts(false),
ScalarFlatScratchInsts(false), ScalarFlatScratchInsts(false),
HasArchitectedFlatScratch(false),
AddNoCarryInsts(false), AddNoCarryInsts(false),
HasUnpackedD16VMem(false), HasUnpackedD16VMem(false),
LDSMisalignedBug(false), LDSMisalignedBug(false),
@ -327,7 +328,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
} }
bool GCNSubtarget::enableFlatScratch() const { bool GCNSubtarget::enableFlatScratch() const {
return EnableFlatScratch && hasFlatScratchInsts(); return flatScratchIsArchitected() ||
(EnableFlatScratch && hasFlatScratchInsts());
} }
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const { unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {

View File

@ -1373,6 +1373,10 @@ public:
return getFeatureBits()[AMDGPU::FeatureFlatInstOffsets]; return getFeatureBits()[AMDGPU::FeatureFlatInstOffsets];
} }
/// \returns true if the FeatureArchitectedFlatScratch bit is set in the
/// feature bits reported by getFeatureBits().
bool hasArchitectedFlatScratch() const {
  const bool IsArchitected =
      getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
  return IsArchitected;
}
bool hasSGPR102_SGPR103() const { bool hasSGPR102_SGPR103() const {
return !isVI() && !isGFX9(); return !isVI() && !isGFX9();
} }
@ -4549,6 +4553,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
return OutOfRangeError(ValRange); return OutOfRangeError(ValRange);
KD.kernarg_size = Val; KD.kernarg_size = Val;
} else if (ID == ".amdhsa_user_sgpr_private_segment_buffer") { } else if (ID == ".amdhsa_user_sgpr_private_segment_buffer") {
if (hasArchitectedFlatScratch())
return Error(IDRange.Start,
"directive is not supported with architected flat scratch",
IDRange);
PARSE_BITS_ENTRY(KD.kernel_code_properties, PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER, KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
Val, ValRange); Val, ValRange);
@ -4579,6 +4587,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
if (Val) if (Val)
UserSGPRCount += 2; UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") { } else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") {
if (hasArchitectedFlatScratch())
return Error(IDRange.Start,
"directive is not supported with architected flat scratch",
IDRange);
PARSE_BITS_ENTRY(KD.kernel_code_properties, PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val, KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val,
ValRange); ValRange);
@ -4598,10 +4610,20 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32, KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
Val, ValRange); Val, ValRange);
} else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") { } else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") {
PARSE_BITS_ENTRY( if (hasArchitectedFlatScratch())
KD.compute_pgm_rsrc2, return Error(IDRange.Start,
COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, "directive is not supported with architected flat scratch",
ValRange); IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, ValRange);
} else if (ID == ".amdhsa_enable_private_segment") {
if (!hasArchitectedFlatScratch())
return Error(
IDRange.Start,
"directive is not supported without architected flat scratch",
IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, ValRange);
} else if (ID == ".amdhsa_system_sgpr_workgroup_id_x") { } else if (ID == ".amdhsa_system_sgpr_workgroup_id_x") {
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2, PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, Val, COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, Val,
@ -4639,6 +4661,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
} else if (ID == ".amdhsa_reserve_flat_scratch") { } else if (ID == ".amdhsa_reserve_flat_scratch") {
if (IVersion.Major < 7) if (IVersion.Major < 7)
return Error(IDRange.Start, "directive requires gfx7+", IDRange); return Error(IDRange.Start, "directive requires gfx7+", IDRange);
if (hasArchitectedFlatScratch())
return Error(IDRange.Start,
"directive is not supported with architected flat scratch",
IDRange);
if (!isUInt<1>(Val)) if (!isUInt<1>(Val))
return OutOfRangeError(ValRange); return OutOfRangeError(ValRange);
ReserveFlatScr = Val; ReserveFlatScr = Val;

View File

@ -1457,6 +1457,10 @@ bool AMDGPUDisassembler::isGFX10Plus() const {
return AMDGPU::isGFX10Plus(STI); return AMDGPU::isGFX10Plus(STI);
} }
/// \returns true if the FeatureArchitectedFlatScratch bit is set in the
/// feature bits of this disassembler's STI.
bool AMDGPUDisassembler::hasArchitectedFlatScratch() const {
  const bool IsArchitected =
      STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
  return IsArchitected;
}
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
// AMDGPU specific symbol handling // AMDGPU specific symbol handling
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
@ -1516,6 +1520,7 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
AMDGPU::IsaInfo::getSGPREncodingGranule(&STI); AMDGPU::IsaInfo::getSGPREncodingGranule(&STI);
KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n'; KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n';
if (!hasArchitectedFlatScratch())
KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n'; KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n';
KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n'; KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n';
KdStream << Indent << ".amdhsa_next_free_sgpr " << NextFreeSGPR << "\n"; KdStream << Indent << ".amdhsa_next_free_sgpr " << NextFreeSGPR << "\n";
@ -1567,8 +1572,11 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC2(
uint32_t FourByteBuffer, raw_string_ostream &KdStream) const { uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
using namespace amdhsa; using namespace amdhsa;
StringRef Indent = "\t"; StringRef Indent = "\t";
PRINT_DIRECTIVE( if (hasArchitectedFlatScratch())
".amdhsa_system_sgpr_private_segment_wavefront_offset", PRINT_DIRECTIVE(".amdhsa_enable_private_segment",
COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
else
PRINT_DIRECTIVE(".amdhsa_system_sgpr_private_segment_wavefront_offset",
COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT); COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_x", PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_x",
COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X); COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
@ -1710,6 +1718,7 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective(
using namespace amdhsa; using namespace amdhsa;
TwoByteBuffer = DE.getU16(Cursor); TwoByteBuffer = DE.getU16(Cursor);
if (!hasArchitectedFlatScratch())
PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer", PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer",
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER); KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_ptr", PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_ptr",
@ -1720,6 +1729,7 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective(
KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR); KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_id", PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_id",
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID); KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
if (!hasArchitectedFlatScratch())
PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init", PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init",
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size", PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",

View File

@ -174,6 +174,8 @@ public:
bool isGFX9Plus() const; bool isGFX9Plus() const;
bool isGFX10() const; bool isGFX10() const;
bool isGFX10Plus() const; bool isGFX10Plus() const;
bool hasArchitectedFlatScratch() const;
}; };
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//

View File

@ -167,6 +167,7 @@ protected:
bool FlatGlobalInsts; bool FlatGlobalInsts;
bool FlatScratchInsts; bool FlatScratchInsts;
bool ScalarFlatScratchInsts; bool ScalarFlatScratchInsts;
bool HasArchitectedFlatScratch;
bool AddNoCarryInsts; bool AddNoCarryInsts;
bool HasUnpackedD16VMem; bool HasUnpackedD16VMem;
bool R600ALUInst; bool R600ALUInst;
@ -985,6 +986,10 @@ public:
return getGeneration() >= AMDGPUSubtarget::GFX9; return getGeneration() >= AMDGPUSubtarget::GFX9;
} }
/// Reports whether flat_scratch is an architected register.
/// \returns true if the flat_scratch register is initialized by the HW, in
/// which case it is readonly.
bool flatScratchIsArchitected() const {
  const bool IsArchitected = HasArchitectedFlatScratch;
  return IsArchitected;
}
/// \returns true if the machine has merged shaders in which s0-s7 are /// \returns true if the machine has merged shaders in which s0-s7 are
/// reserved by the hardware and user SGPRs start at s8 /// reserved by the hardware and user SGPRs start at s8
bool hasMergedShaders() const { bool hasMergedShaders() const {

View File

@ -315,7 +315,9 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
<< KD.private_segment_fixed_size << '\n'; << KD.private_segment_fixed_size << '\n';
OS << "\t\t.amdhsa_kernarg_size " << KD.kernarg_size << '\n'; OS << "\t\t.amdhsa_kernarg_size " << KD.kernarg_size << '\n';
PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_buffer", KD, if (!hasArchitectedFlatScratch(STI))
PRINT_FIELD(
OS, ".amdhsa_user_sgpr_private_segment_buffer", KD,
kernel_code_properties, kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER); amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD, PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD,
@ -330,6 +332,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_id", KD, PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_id", KD,
kernel_code_properties, kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID); amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
if (!hasArchitectedFlatScratch(STI))
PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD, PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD,
kernel_code_properties, kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT); amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
@ -340,9 +343,11 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PRINT_FIELD(OS, ".amdhsa_wavefront_size32", KD, PRINT_FIELD(OS, ".amdhsa_wavefront_size32", KD,
kernel_code_properties, kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
PRINT_FIELD( PRINT_FIELD(OS,
OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD, (hasArchitectedFlatScratch(STI)
compute_pgm_rsrc2, ? ".amdhsa_enable_private_segment"
: ".amdhsa_system_sgpr_private_segment_wavefront_offset"),
KD, compute_pgm_rsrc2,
amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT); amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD, PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD,
compute_pgm_rsrc2, compute_pgm_rsrc2,
@ -372,7 +377,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
if (!ReserveVCC) if (!ReserveVCC)
OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n'; OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n';
if (IVersion.Major >= 7 && !ReserveFlatScr) if (IVersion.Major >= 7 && !ReserveFlatScr && !hasArchitectedFlatScratch(STI))
OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n'; OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n';
if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(&STI)) { if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(&STI)) {

View File

@ -493,7 +493,8 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0); BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
} }
if (MFI->hasFlatScratchInit() || ScratchRsrcReg) { if ((MFI->hasFlatScratchInit() || ScratchRsrcReg) &&
!ST.flatScratchIsArchitected()) {
MRI.addLiveIn(PreloadedScratchWaveOffsetReg); MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
MBB.addLiveIn(PreloadedScratchWaveOffsetReg); MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
} }

View File

@ -124,6 +124,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (WorkItemIDZ) if (WorkItemIDZ)
WorkItemIDY = true; WorkItemIDY = true;
if (!ST.flatScratchIsArchitected()) {
PrivateSegmentWaveByteOffset = true; PrivateSegmentWaveByteOffset = true;
// HS and GS always have the scratch wave offset in SGPR5 on GFX9. // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
@ -132,6 +133,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
ArgInfo.PrivateSegmentWaveByteOffset = ArgInfo.PrivateSegmentWaveByteOffset =
ArgDescriptor::createRegister(AMDGPU::SGPR5); ArgDescriptor::createRegister(AMDGPU::SGPR5);
} }
}
bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
if (isAmdHsaOrMesa) { if (isAmdHsaOrMesa) {
@ -162,7 +164,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
KernargSegmentPtr = true; KernargSegmentPtr = true;
if (ST.hasFlatAddressSpace() && isEntryFunction() && if (ST.hasFlatAddressSpace() && isEntryFunction() &&
(isAmdHsaOrMesa || ST.enableFlatScratch())) { (isAmdHsaOrMesa || ST.enableFlatScratch()) &&
!ST.flatScratchIsArchitected()) {
// TODO: This could be refined a lot. The attribute is a poor way of // TODO: This could be refined a lot. The attribute is a poor way of
// detecting calls or stack objects that may require it before argument // detecting calls or stack objects that may require it before argument
// lowering. // lowering.

View File

@ -1459,6 +1459,10 @@ bool isGFX90A(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]; return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts];
} }
/// \returns true if the FeatureArchitectedFlatScratch bit is set in the
/// feature bits of \p STI.
bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI) {
  const bool IsArchitected =
      STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
  return IsArchitected;
}
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0); const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0);

View File

@ -740,6 +740,7 @@ bool isGCN3Encoding(const MCSubtargetInfo &STI);
bool isGFX10_BEncoding(const MCSubtargetInfo &STI); bool isGFX10_BEncoding(const MCSubtargetInfo &STI);
bool hasGFX10_3Insts(const MCSubtargetInfo &STI); bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
bool isGFX90A(const MCSubtargetInfo &STI); bool isGFX90A(const MCSubtargetInfo &STI);
bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI);
/// Is Reg - scalar register /// Is Reg - scalar register
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);

View File

@ -1,16 +1,24 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,RW-FLAT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch < %s | FileCheck -check-prefixes=GCN,RO-FLAT %s
; Make sure flat_scratch_init is set ; Make sure flat_scratch_init is set
; GCN-LABEL: {{^}}stack_object_addrspacecast_in_kernel_no_calls: ; GCN-LABEL: {{^}}stack_object_addrspacecast_in_kernel_no_calls:
; GCN: s_add_u32 flat_scratch_lo, s4, s7 ; RW-FLAT: s_add_u32 flat_scratch_lo, s4, s7
; GCN: s_addc_u32 flat_scratch_hi, s5, 0 ; RW-FLAT: s_addc_u32 flat_scratch_hi, s5, 0
; RO-FLAT-NOT: flat_scratch
; GCN: flat_store_dword ; GCN: flat_store_dword
; GCN: .amdhsa_user_sgpr_flat_scratch_init 1 ; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset ; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 1
; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset
; RW-FLAT-NOT: .amdhsa_enable_private_segment
; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
; RO-FLAT: .amdhsa_enable_private_segment 1
; GCN-NOT: .amdhsa_reserve_flat_scratch ; GCN-NOT: .amdhsa_reserve_flat_scratch
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1 ; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 6
; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() { define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
%alloca = alloca i32, addrspace(5) %alloca = alloca i32, addrspace(5)
%cast = addrspacecast i32 addrspace(5)* %alloca to i32* %cast = addrspacecast i32 addrspace(5)* %alloca to i32*
@ -20,15 +28,23 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
; TODO: Could optimize out in this case ; TODO: Could optimize out in this case
; GCN-LABEL: {{^}}stack_object_in_kernel_no_calls: ; GCN-LABEL: {{^}}stack_object_in_kernel_no_calls:
; GCN: s_add_u32 flat_scratch_lo, s4, s7 ; RW-FLAT: s_add_u32 flat_scratch_lo, s4, s7
; GCN: s_addc_u32 flat_scratch_hi, s5, 0 ; RW-FLAT: s_addc_u32 flat_scratch_hi, s5, 0
; GCN: buffer_store_dword ; RO-FLAT-NOT: flat_scratch
; GCN: .amdhsa_user_sgpr_private_segment_buffer 1 ; RW-FLAT: buffer_store_dword
; GCN: .amdhsa_user_sgpr_flat_scratch_init 1 ; RO-FLAT: scratch_store_dword
; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1 ; RW-FLAT: .amdhsa_user_sgpr_private_segment_buffer 1
; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 1
; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; RW-FLAT-NOT: .amdhsa_enable_private_segment
; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
; RO-FLAT: .amdhsa_enable_private_segment 1
; GCN-NOT: .amdhsa_reserve_flat_scratch ; GCN-NOT: .amdhsa_reserve_flat_scratch
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1 ; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 6 ; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 6
; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
define amdgpu_kernel void @stack_object_in_kernel_no_calls() { define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
%alloca = alloca i32, addrspace(5) %alloca = alloca i32, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca store volatile i32 0, i32 addrspace(5)* %alloca
@ -37,12 +53,19 @@ define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
; GCN-LABEL: {{^}}kernel_no_calls_no_stack: ; GCN-LABEL: {{^}}kernel_no_calls_no_stack:
; GCN-NOT: flat_scratch ; GCN-NOT: flat_scratch
; GCN: .amdhsa_user_sgpr_private_segment_buffer 1 ; RW-FLAT: .amdhsa_user_sgpr_private_segment_buffer 1
; GCN: .amdhsa_user_sgpr_flat_scratch_init 0 ; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 0 ; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 0
; GCN: .amdhsa_reserve_flat_scratch 0 ; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
; RW-FLAT-NOT: .amdhsa_enable_private_segment
; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
; RO-FLAT: .amdhsa_enable_private_segment 0
; RW-FLAT: .amdhsa_reserve_flat_scratch 0
; RO-FLAT-NOT: .amdhsa_reserve_flat_scratch 0
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 ; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 4 ; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 4
; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
define amdgpu_kernel void @kernel_no_calls_no_stack() { define amdgpu_kernel void @kernel_no_calls_no_stack() {
ret void ret void
} }