mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-26 04:32:44 +01:00
[AMDGPU] Add support for architected flat scratch
Add support for the readonly flat Scratch register initialized by the SPI. Differential Revision: https://reviews.llvm.org/D102432
This commit is contained in:
parent
8a2b8ef8d1
commit
af64ca04f5
@ -3657,14 +3657,22 @@ The fields used by CP for code objects before V3 also match those specified in
|
|||||||
``compute_pgm_rsrc2.user_sgpr.user_sgpr_count``.
|
``compute_pgm_rsrc2.user_sgpr.user_sgpr_count``.
|
||||||
Any requests beyond 16
|
Any requests beyond 16
|
||||||
will be ignored.
|
will be ignored.
|
||||||
>448 1 bit ENABLE_SGPR_PRIVATE_SEGMENT
|
>448 1 bit ENABLE_SGPR_PRIVATE_SEGMENT If the *Target Properties*
|
||||||
_BUFFER
|
_BUFFER column of
|
||||||
|
:ref:`amdgpu-processor-table`
|
||||||
|
specifies *Architected flat
|
||||||
|
scratch* then not supported
|
||||||
|
and must be 0.
|
||||||
>449 1 bit ENABLE_SGPR_DISPATCH_PTR
|
>449 1 bit ENABLE_SGPR_DISPATCH_PTR
|
||||||
>450 1 bit ENABLE_SGPR_QUEUE_PTR
|
>450 1 bit ENABLE_SGPR_QUEUE_PTR
|
||||||
>451 1 bit ENABLE_SGPR_KERNARG_SEGMENT_PTR
|
>451 1 bit ENABLE_SGPR_KERNARG_SEGMENT_PTR
|
||||||
>452 1 bit ENABLE_SGPR_DISPATCH_ID
|
>452 1 bit ENABLE_SGPR_DISPATCH_ID
|
||||||
>453 1 bit ENABLE_SGPR_FLAT_SCRATCH_INIT
|
>453 1 bit ENABLE_SGPR_FLAT_SCRATCH_INIT If the *Target Properties*
|
||||||
|
column of
|
||||||
|
:ref:`amdgpu-processor-table`
|
||||||
|
specifies *Architected flat
|
||||||
|
scratch* then not supported
|
||||||
|
and must be 0.
|
||||||
>454 1 bit ENABLE_SGPR_PRIVATE_SEGMENT
|
>454 1 bit ENABLE_SGPR_PRIVATE_SEGMENT
|
||||||
_SIZE
|
_SIZE
|
||||||
457:455 3 bits Reserved, must be 0.
|
457:455 3 bits Reserved, must be 0.
|
||||||
@ -3984,14 +3992,27 @@ The fields used by CP for code objects before V3 also match those specified in
|
|||||||
======= ======= =============================== ===========================================================================
|
======= ======= =============================== ===========================================================================
|
||||||
Bits Size Field Name Description
|
Bits Size Field Name Description
|
||||||
======= ======= =============================== ===========================================================================
|
======= ======= =============================== ===========================================================================
|
||||||
0 1 bit ENABLE_PRIVATE_SEGMENT Enable the setup of the
|
0 1 bit ENABLE_PRIVATE_SEGMENT * Enable the setup of the
|
||||||
private segment.
|
private segment.
|
||||||
|
* If the *Target Properties*
|
||||||
In addition, enable the
|
column of
|
||||||
|
:ref:`amdgpu-processor-table`
|
||||||
|
does not specify
|
||||||
|
*Architected flat
|
||||||
|
scratch* then enable the
|
||||||
setup of the SGPR
|
setup of the SGPR
|
||||||
wavefront scratch offset
|
wavefront scratch offset
|
||||||
system register (see
|
system register (see
|
||||||
:ref:`amdgpu-amdhsa-initial-kernel-execution-state`).
|
:ref:`amdgpu-amdhsa-initial-kernel-execution-state`).
|
||||||
|
* If the *Target Properties*
|
||||||
|
column of
|
||||||
|
:ref:`amdgpu-processor-table`
|
||||||
|
specifies *Architected
|
||||||
|
flat scratch* then enable
|
||||||
|
the setup of the
|
||||||
|
FLAT_SCRATCH register
|
||||||
|
pair (see
|
||||||
|
:ref:`amdgpu-amdhsa-initial-kernel-execution-state`).
|
||||||
|
|
||||||
Used by CP to set up
|
Used by CP to set up
|
||||||
``COMPUTE_PGM_RSRC2.SCRATCH_EN``.
|
``COMPUTE_PGM_RSRC2.SCRATCH_EN``.
|
||||||
@ -4550,12 +4571,26 @@ There are different methods used for initializing flat scratch:
|
|||||||
segment address when using the Scratch Segment Buffer (see
|
segment address when using the Scratch Segment Buffer (see
|
||||||
:ref:`amdgpu-amdhsa-kernel-prolog-private-segment-buffer`).
|
:ref:`amdgpu-amdhsa-kernel-prolog-private-segment-buffer`).
|
||||||
|
|
||||||
|
* If the *Target Properties* column of :ref:`amdgpu-processor-table`
|
||||||
|
specifies *Architected flat scratch*:
|
||||||
|
|
||||||
|
If ENABLE_PRIVATE_SEGMENT is enabled in
|
||||||
|
:ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx10-table` then the FLAT_SCRATCH
|
||||||
|
register pair will be initialized to the 64-bit address of the base of scratch
|
||||||
|
backing memory being managed by SPI for the queue executing the kernel
|
||||||
|
dispatch plus the value of the wave's Scratch Wavefront Offset for use as the
|
||||||
|
flat scratch base in flat memory instructions.
|
||||||
|
|
||||||
.. _amdgpu-amdhsa-kernel-prolog-private-segment-buffer:
|
.. _amdgpu-amdhsa-kernel-prolog-private-segment-buffer:
|
||||||
|
|
||||||
Private Segment Buffer
|
Private Segment Buffer
|
||||||
++++++++++++++++++++++
|
++++++++++++++++++++++
|
||||||
|
|
||||||
Private Segment Buffer SGPR register is used to initialize 4 SGPRs
|
If the *Target Properties* column of :ref:`amdgpu-processor-table` specifies
|
||||||
|
*Architected flat scratch* then a Private Segment Buffer is not supported.
|
||||||
|
Instead the flat SCRATCH instructions are used.
|
||||||
|
|
||||||
|
Otherwise, Private Segment Buffer SGPR register is used to initialize 4 SGPRs
|
||||||
that are used as a V# to access scratch. CP uses the value provided by the
|
that are used as a V# to access scratch. CP uses the value provided by the
|
||||||
runtime. It is used, together with Scratch Wavefront Offset as an offset, to
|
runtime. It is used, together with Scratch Wavefront Offset as an offset, to
|
||||||
access the private memory space using a segment address. See
|
access the private memory space using a segment address. See
|
||||||
|
@ -731,6 +731,12 @@ def FeaturePackedTID : SubtargetFeature<"packed-tid",
|
|||||||
"Workitem IDs are packed into v0 at kernel launch"
|
"Workitem IDs are packed into v0 at kernel launch"
|
||||||
>;
|
>;
|
||||||
|
|
||||||
|
def FeatureArchitectedFlatScratch : SubtargetFeature<"architected-flat-scratch",
|
||||||
|
"HasArchitectedFlatScratch",
|
||||||
|
"true",
|
||||||
|
"Flat Scratch register is a readonly SPI initialized architected register"
|
||||||
|
>;
|
||||||
|
|
||||||
// Dummy feature used to disable assembler instructions.
|
// Dummy feature used to disable assembler instructions.
|
||||||
def FeatureDisable : SubtargetFeature<"",
|
def FeatureDisable : SubtargetFeature<"",
|
||||||
"FeatureDisable","true",
|
"FeatureDisable","true",
|
||||||
|
@ -723,7 +723,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
|
|||||||
const SIRegisterInfo &TRI = TII->getRegisterInfo();
|
const SIRegisterInfo &TRI = TII->getRegisterInfo();
|
||||||
|
|
||||||
Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
|
Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
|
||||||
MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);
|
MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
|
||||||
|
MRI.isLiveIn(MFI->getPreloadedReg(
|
||||||
|
AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
|
||||||
|
|
||||||
// Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
|
// Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
|
||||||
// instructions aren't used to access the scratch buffer. Inline assembly may
|
// instructions aren't used to access the scratch buffer. Inline assembly may
|
||||||
|
@ -289,6 +289,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
|
|||||||
FlatGlobalInsts(false),
|
FlatGlobalInsts(false),
|
||||||
FlatScratchInsts(false),
|
FlatScratchInsts(false),
|
||||||
ScalarFlatScratchInsts(false),
|
ScalarFlatScratchInsts(false),
|
||||||
|
HasArchitectedFlatScratch(false),
|
||||||
AddNoCarryInsts(false),
|
AddNoCarryInsts(false),
|
||||||
HasUnpackedD16VMem(false),
|
HasUnpackedD16VMem(false),
|
||||||
LDSMisalignedBug(false),
|
LDSMisalignedBug(false),
|
||||||
@ -327,7 +328,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool GCNSubtarget::enableFlatScratch() const {
|
bool GCNSubtarget::enableFlatScratch() const {
|
||||||
return EnableFlatScratch && hasFlatScratchInsts();
|
return flatScratchIsArchitected() ||
|
||||||
|
(EnableFlatScratch && hasFlatScratchInsts());
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
|
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
|
||||||
|
@ -1373,6 +1373,10 @@ public:
|
|||||||
return getFeatureBits()[AMDGPU::FeatureFlatInstOffsets];
|
return getFeatureBits()[AMDGPU::FeatureFlatInstOffsets];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool hasArchitectedFlatScratch() const {
|
||||||
|
return getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
|
||||||
|
}
|
||||||
|
|
||||||
bool hasSGPR102_SGPR103() const {
|
bool hasSGPR102_SGPR103() const {
|
||||||
return !isVI() && !isGFX9();
|
return !isVI() && !isGFX9();
|
||||||
}
|
}
|
||||||
@ -4549,6 +4553,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
|
|||||||
return OutOfRangeError(ValRange);
|
return OutOfRangeError(ValRange);
|
||||||
KD.kernarg_size = Val;
|
KD.kernarg_size = Val;
|
||||||
} else if (ID == ".amdhsa_user_sgpr_private_segment_buffer") {
|
} else if (ID == ".amdhsa_user_sgpr_private_segment_buffer") {
|
||||||
|
if (hasArchitectedFlatScratch())
|
||||||
|
return Error(IDRange.Start,
|
||||||
|
"directive is not supported with architected flat scratch",
|
||||||
|
IDRange);
|
||||||
PARSE_BITS_ENTRY(KD.kernel_code_properties,
|
PARSE_BITS_ENTRY(KD.kernel_code_properties,
|
||||||
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
|
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
|
||||||
Val, ValRange);
|
Val, ValRange);
|
||||||
@ -4579,6 +4587,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
|
|||||||
if (Val)
|
if (Val)
|
||||||
UserSGPRCount += 2;
|
UserSGPRCount += 2;
|
||||||
} else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") {
|
} else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") {
|
||||||
|
if (hasArchitectedFlatScratch())
|
||||||
|
return Error(IDRange.Start,
|
||||||
|
"directive is not supported with architected flat scratch",
|
||||||
|
IDRange);
|
||||||
PARSE_BITS_ENTRY(KD.kernel_code_properties,
|
PARSE_BITS_ENTRY(KD.kernel_code_properties,
|
||||||
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val,
|
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val,
|
||||||
ValRange);
|
ValRange);
|
||||||
@ -4598,10 +4610,20 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
|
|||||||
KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
|
KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
|
||||||
Val, ValRange);
|
Val, ValRange);
|
||||||
} else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") {
|
} else if (ID == ".amdhsa_system_sgpr_private_segment_wavefront_offset") {
|
||||||
PARSE_BITS_ENTRY(
|
if (hasArchitectedFlatScratch())
|
||||||
KD.compute_pgm_rsrc2,
|
return Error(IDRange.Start,
|
||||||
COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val,
|
"directive is not supported with architected flat scratch",
|
||||||
ValRange);
|
IDRange);
|
||||||
|
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
|
||||||
|
COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, ValRange);
|
||||||
|
} else if (ID == ".amdhsa_enable_private_segment") {
|
||||||
|
if (!hasArchitectedFlatScratch())
|
||||||
|
return Error(
|
||||||
|
IDRange.Start,
|
||||||
|
"directive is not supported without architected flat scratch",
|
||||||
|
IDRange);
|
||||||
|
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
|
||||||
|
COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT, Val, ValRange);
|
||||||
} else if (ID == ".amdhsa_system_sgpr_workgroup_id_x") {
|
} else if (ID == ".amdhsa_system_sgpr_workgroup_id_x") {
|
||||||
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
|
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
|
||||||
COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, Val,
|
COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, Val,
|
||||||
@ -4639,6 +4661,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
|
|||||||
} else if (ID == ".amdhsa_reserve_flat_scratch") {
|
} else if (ID == ".amdhsa_reserve_flat_scratch") {
|
||||||
if (IVersion.Major < 7)
|
if (IVersion.Major < 7)
|
||||||
return Error(IDRange.Start, "directive requires gfx7+", IDRange);
|
return Error(IDRange.Start, "directive requires gfx7+", IDRange);
|
||||||
|
if (hasArchitectedFlatScratch())
|
||||||
|
return Error(IDRange.Start,
|
||||||
|
"directive is not supported with architected flat scratch",
|
||||||
|
IDRange);
|
||||||
if (!isUInt<1>(Val))
|
if (!isUInt<1>(Val))
|
||||||
return OutOfRangeError(ValRange);
|
return OutOfRangeError(ValRange);
|
||||||
ReserveFlatScr = Val;
|
ReserveFlatScr = Val;
|
||||||
|
@ -1457,6 +1457,10 @@ bool AMDGPUDisassembler::isGFX10Plus() const {
|
|||||||
return AMDGPU::isGFX10Plus(STI);
|
return AMDGPU::isGFX10Plus(STI);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool AMDGPUDisassembler::hasArchitectedFlatScratch() const {
|
||||||
|
return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
|
||||||
|
}
|
||||||
|
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
// AMDGPU specific symbol handling
|
// AMDGPU specific symbol handling
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
@ -1516,6 +1520,7 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
|
|||||||
AMDGPU::IsaInfo::getSGPREncodingGranule(&STI);
|
AMDGPU::IsaInfo::getSGPREncodingGranule(&STI);
|
||||||
|
|
||||||
KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n';
|
KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n';
|
||||||
|
if (!hasArchitectedFlatScratch())
|
||||||
KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n';
|
KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n';
|
||||||
KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n';
|
KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n';
|
||||||
KdStream << Indent << ".amdhsa_next_free_sgpr " << NextFreeSGPR << "\n";
|
KdStream << Indent << ".amdhsa_next_free_sgpr " << NextFreeSGPR << "\n";
|
||||||
@ -1567,8 +1572,11 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC2(
|
|||||||
uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
|
uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
|
||||||
using namespace amdhsa;
|
using namespace amdhsa;
|
||||||
StringRef Indent = "\t";
|
StringRef Indent = "\t";
|
||||||
PRINT_DIRECTIVE(
|
if (hasArchitectedFlatScratch())
|
||||||
".amdhsa_system_sgpr_private_segment_wavefront_offset",
|
PRINT_DIRECTIVE(".amdhsa_enable_private_segment",
|
||||||
|
COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
|
||||||
|
else
|
||||||
|
PRINT_DIRECTIVE(".amdhsa_system_sgpr_private_segment_wavefront_offset",
|
||||||
COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
|
COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
|
||||||
PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_x",
|
PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_x",
|
||||||
COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
|
COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
|
||||||
@ -1710,6 +1718,7 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective(
|
|||||||
using namespace amdhsa;
|
using namespace amdhsa;
|
||||||
TwoByteBuffer = DE.getU16(Cursor);
|
TwoByteBuffer = DE.getU16(Cursor);
|
||||||
|
|
||||||
|
if (!hasArchitectedFlatScratch())
|
||||||
PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer",
|
PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer",
|
||||||
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
|
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
|
||||||
PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_ptr",
|
PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_ptr",
|
||||||
@ -1720,6 +1729,7 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective(
|
|||||||
KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
|
KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
|
||||||
PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_id",
|
PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_id",
|
||||||
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
|
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
|
||||||
|
if (!hasArchitectedFlatScratch())
|
||||||
PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init",
|
PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init",
|
||||||
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
|
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
|
||||||
PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
|
PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
|
||||||
|
@ -174,6 +174,8 @@ public:
|
|||||||
bool isGFX9Plus() const;
|
bool isGFX9Plus() const;
|
||||||
bool isGFX10() const;
|
bool isGFX10() const;
|
||||||
bool isGFX10Plus() const;
|
bool isGFX10Plus() const;
|
||||||
|
|
||||||
|
bool hasArchitectedFlatScratch() const;
|
||||||
};
|
};
|
||||||
|
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
@ -167,6 +167,7 @@ protected:
|
|||||||
bool FlatGlobalInsts;
|
bool FlatGlobalInsts;
|
||||||
bool FlatScratchInsts;
|
bool FlatScratchInsts;
|
||||||
bool ScalarFlatScratchInsts;
|
bool ScalarFlatScratchInsts;
|
||||||
|
bool HasArchitectedFlatScratch;
|
||||||
bool AddNoCarryInsts;
|
bool AddNoCarryInsts;
|
||||||
bool HasUnpackedD16VMem;
|
bool HasUnpackedD16VMem;
|
||||||
bool R600ALUInst;
|
bool R600ALUInst;
|
||||||
@ -985,6 +986,10 @@ public:
|
|||||||
return getGeneration() >= AMDGPUSubtarget::GFX9;
|
return getGeneration() >= AMDGPUSubtarget::GFX9;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// \returns true if the flat_scratch register is initialized by the HW.
|
||||||
|
/// In this case it is readonly.
|
||||||
|
bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
|
||||||
|
|
||||||
/// \returns true if the machine has merged shaders in which s0-s7 are
|
/// \returns true if the machine has merged shaders in which s0-s7 are
|
||||||
/// reserved by the hardware and user SGPRs start at s8
|
/// reserved by the hardware and user SGPRs start at s8
|
||||||
bool hasMergedShaders() const {
|
bool hasMergedShaders() const {
|
||||||
|
@ -315,7 +315,9 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
|
|||||||
<< KD.private_segment_fixed_size << '\n';
|
<< KD.private_segment_fixed_size << '\n';
|
||||||
OS << "\t\t.amdhsa_kernarg_size " << KD.kernarg_size << '\n';
|
OS << "\t\t.amdhsa_kernarg_size " << KD.kernarg_size << '\n';
|
||||||
|
|
||||||
PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_buffer", KD,
|
if (!hasArchitectedFlatScratch(STI))
|
||||||
|
PRINT_FIELD(
|
||||||
|
OS, ".amdhsa_user_sgpr_private_segment_buffer", KD,
|
||||||
kernel_code_properties,
|
kernel_code_properties,
|
||||||
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
|
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
|
||||||
PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD,
|
PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_ptr", KD,
|
||||||
@ -330,6 +332,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
|
|||||||
PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_id", KD,
|
PRINT_FIELD(OS, ".amdhsa_user_sgpr_dispatch_id", KD,
|
||||||
kernel_code_properties,
|
kernel_code_properties,
|
||||||
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
|
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
|
||||||
|
if (!hasArchitectedFlatScratch(STI))
|
||||||
PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD,
|
PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD,
|
||||||
kernel_code_properties,
|
kernel_code_properties,
|
||||||
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
|
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
|
||||||
@ -340,9 +343,11 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
|
|||||||
PRINT_FIELD(OS, ".amdhsa_wavefront_size32", KD,
|
PRINT_FIELD(OS, ".amdhsa_wavefront_size32", KD,
|
||||||
kernel_code_properties,
|
kernel_code_properties,
|
||||||
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
|
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
|
||||||
PRINT_FIELD(
|
PRINT_FIELD(OS,
|
||||||
OS, ".amdhsa_system_sgpr_private_segment_wavefront_offset", KD,
|
(hasArchitectedFlatScratch(STI)
|
||||||
compute_pgm_rsrc2,
|
? ".amdhsa_enable_private_segment"
|
||||||
|
: ".amdhsa_system_sgpr_private_segment_wavefront_offset"),
|
||||||
|
KD, compute_pgm_rsrc2,
|
||||||
amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
|
amdhsa::COMPUTE_PGM_RSRC2_ENABLE_PRIVATE_SEGMENT);
|
||||||
PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD,
|
PRINT_FIELD(OS, ".amdhsa_system_sgpr_workgroup_id_x", KD,
|
||||||
compute_pgm_rsrc2,
|
compute_pgm_rsrc2,
|
||||||
@ -372,7 +377,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
|
|||||||
|
|
||||||
if (!ReserveVCC)
|
if (!ReserveVCC)
|
||||||
OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n';
|
OS << "\t\t.amdhsa_reserve_vcc " << ReserveVCC << '\n';
|
||||||
if (IVersion.Major >= 7 && !ReserveFlatScr)
|
if (IVersion.Major >= 7 && !ReserveFlatScr && !hasArchitectedFlatScratch(STI))
|
||||||
OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n';
|
OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n';
|
||||||
|
|
||||||
if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(&STI)) {
|
if (Optional<uint8_t> HsaAbiVer = getHsaAbiVersion(&STI)) {
|
||||||
|
@ -493,7 +493,8 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
|
|||||||
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
|
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (MFI->hasFlatScratchInit() || ScratchRsrcReg) {
|
if ((MFI->hasFlatScratchInit() || ScratchRsrcReg) &&
|
||||||
|
!ST.flatScratchIsArchitected()) {
|
||||||
MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
|
MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
|
||||||
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
|
MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
|
||||||
}
|
}
|
||||||
|
@ -124,6 +124,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
|
|||||||
if (WorkItemIDZ)
|
if (WorkItemIDZ)
|
||||||
WorkItemIDY = true;
|
WorkItemIDY = true;
|
||||||
|
|
||||||
|
if (!ST.flatScratchIsArchitected()) {
|
||||||
PrivateSegmentWaveByteOffset = true;
|
PrivateSegmentWaveByteOffset = true;
|
||||||
|
|
||||||
// HS and GS always have the scratch wave offset in SGPR5 on GFX9.
|
// HS and GS always have the scratch wave offset in SGPR5 on GFX9.
|
||||||
@ -132,6 +133,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
|
|||||||
ArgInfo.PrivateSegmentWaveByteOffset =
|
ArgInfo.PrivateSegmentWaveByteOffset =
|
||||||
ArgDescriptor::createRegister(AMDGPU::SGPR5);
|
ArgDescriptor::createRegister(AMDGPU::SGPR5);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
|
bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
|
||||||
if (isAmdHsaOrMesa) {
|
if (isAmdHsaOrMesa) {
|
||||||
@ -162,7 +164,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
|
|||||||
KernargSegmentPtr = true;
|
KernargSegmentPtr = true;
|
||||||
|
|
||||||
if (ST.hasFlatAddressSpace() && isEntryFunction() &&
|
if (ST.hasFlatAddressSpace() && isEntryFunction() &&
|
||||||
(isAmdHsaOrMesa || ST.enableFlatScratch())) {
|
(isAmdHsaOrMesa || ST.enableFlatScratch()) &&
|
||||||
|
!ST.flatScratchIsArchitected()) {
|
||||||
// TODO: This could be refined a lot. The attribute is a poor way of
|
// TODO: This could be refined a lot. The attribute is a poor way of
|
||||||
// detecting calls or stack objects that may require it before argument
|
// detecting calls or stack objects that may require it before argument
|
||||||
// lowering.
|
// lowering.
|
||||||
|
@ -1459,6 +1459,10 @@ bool isGFX90A(const MCSubtargetInfo &STI) {
|
|||||||
return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts];
|
return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI) {
|
||||||
|
return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
|
||||||
|
}
|
||||||
|
|
||||||
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
|
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
|
||||||
const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
|
const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
|
||||||
const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0);
|
const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0);
|
||||||
|
@ -740,6 +740,7 @@ bool isGCN3Encoding(const MCSubtargetInfo &STI);
|
|||||||
bool isGFX10_BEncoding(const MCSubtargetInfo &STI);
|
bool isGFX10_BEncoding(const MCSubtargetInfo &STI);
|
||||||
bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
|
bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
|
||||||
bool isGFX90A(const MCSubtargetInfo &STI);
|
bool isGFX90A(const MCSubtargetInfo &STI);
|
||||||
|
bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI);
|
||||||
|
|
||||||
/// Is Reg - scalar register
|
/// Is Reg - scalar register
|
||||||
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
|
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
|
||||||
|
@ -1,16 +1,24 @@
|
|||||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,RW-FLAT %s
|
||||||
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch < %s | FileCheck -check-prefixes=GCN,RO-FLAT %s
|
||||||
|
|
||||||
; Make sure flat_scratch_init is set
|
; Make sure flat_scratch_init is set
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}stack_object_addrspacecast_in_kernel_no_calls:
|
; GCN-LABEL: {{^}}stack_object_addrspacecast_in_kernel_no_calls:
|
||||||
; GCN: s_add_u32 flat_scratch_lo, s4, s7
|
; RW-FLAT: s_add_u32 flat_scratch_lo, s4, s7
|
||||||
; GCN: s_addc_u32 flat_scratch_hi, s5, 0
|
; RW-FLAT: s_addc_u32 flat_scratch_hi, s5, 0
|
||||||
|
; RO-FLAT-NOT: flat_scratch
|
||||||
; GCN: flat_store_dword
|
; GCN: flat_store_dword
|
||||||
; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
|
; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
|
||||||
; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset
|
; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 1
|
||||||
|
; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
|
||||||
|
; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset
|
||||||
|
; RW-FLAT-NOT: .amdhsa_enable_private_segment
|
||||||
|
; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
|
||||||
|
; RO-FLAT: .amdhsa_enable_private_segment 1
|
||||||
; GCN-NOT: .amdhsa_reserve_flat_scratch
|
; GCN-NOT: .amdhsa_reserve_flat_scratch
|
||||||
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
|
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
|
||||||
; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 6
|
; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 6
|
||||||
|
; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
|
||||||
define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
|
define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
|
||||||
%alloca = alloca i32, addrspace(5)
|
%alloca = alloca i32, addrspace(5)
|
||||||
%cast = addrspacecast i32 addrspace(5)* %alloca to i32*
|
%cast = addrspacecast i32 addrspace(5)* %alloca to i32*
|
||||||
@ -20,15 +28,23 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
|
|||||||
|
|
||||||
; TODO: Could optimize out in this case
|
; TODO: Could optimize out in this case
|
||||||
; GCN-LABEL: {{^}}stack_object_in_kernel_no_calls:
|
; GCN-LABEL: {{^}}stack_object_in_kernel_no_calls:
|
||||||
; GCN: s_add_u32 flat_scratch_lo, s4, s7
|
; RW-FLAT: s_add_u32 flat_scratch_lo, s4, s7
|
||||||
; GCN: s_addc_u32 flat_scratch_hi, s5, 0
|
; RW-FLAT: s_addc_u32 flat_scratch_hi, s5, 0
|
||||||
; GCN: buffer_store_dword
|
; RO-FLAT-NOT: flat_scratch
|
||||||
; GCN: .amdhsa_user_sgpr_private_segment_buffer 1
|
; RW-FLAT: buffer_store_dword
|
||||||
; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
|
; RO-FLAT: scratch_store_dword
|
||||||
; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
|
; RW-FLAT: .amdhsa_user_sgpr_private_segment_buffer 1
|
||||||
|
; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
|
||||||
|
; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 1
|
||||||
|
; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
|
||||||
|
; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
|
||||||
|
; RW-FLAT-NOT: .amdhsa_enable_private_segment
|
||||||
|
; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
|
||||||
|
; RO-FLAT: .amdhsa_enable_private_segment 1
|
||||||
; GCN-NOT: .amdhsa_reserve_flat_scratch
|
; GCN-NOT: .amdhsa_reserve_flat_scratch
|
||||||
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
|
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
|
||||||
; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 6
|
; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 6
|
||||||
|
; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
|
||||||
define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
|
define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
|
||||||
%alloca = alloca i32, addrspace(5)
|
%alloca = alloca i32, addrspace(5)
|
||||||
store volatile i32 0, i32 addrspace(5)* %alloca
|
store volatile i32 0, i32 addrspace(5)* %alloca
|
||||||
@ -37,12 +53,19 @@ define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
|
|||||||
|
|
||||||
; GCN-LABEL: {{^}}kernel_no_calls_no_stack:
|
; GCN-LABEL: {{^}}kernel_no_calls_no_stack:
|
||||||
; GCN-NOT: flat_scratch
|
; GCN-NOT: flat_scratch
|
||||||
; GCN: .amdhsa_user_sgpr_private_segment_buffer 1
|
; RW-FLAT: .amdhsa_user_sgpr_private_segment_buffer 1
|
||||||
; GCN: .amdhsa_user_sgpr_flat_scratch_init 0
|
; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
|
||||||
; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
|
; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 0
|
||||||
; GCN: .amdhsa_reserve_flat_scratch 0
|
; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
|
||||||
|
; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
|
||||||
|
; RW-FLAT-NOT: .amdhsa_enable_private_segment
|
||||||
|
; RO-FLAT-NOT: .amdhsa_system_sgpr_private_segment_wavefront_offset
|
||||||
|
; RO-FLAT: .amdhsa_enable_private_segment 0
|
||||||
|
; RW-FLAT: .amdhsa_reserve_flat_scratch 0
|
||||||
|
; RO-FLAT-NOT: .amdhsa_reserve_flat_scratch 0
|
||||||
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
|
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
|
||||||
; GCN: COMPUTE_PGM_RSRC2:USER_SGPR: 4
|
; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 4
|
||||||
|
; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
|
||||||
define amdgpu_kernel void @kernel_no_calls_no_stack() {
|
define amdgpu_kernel void @kernel_no_calls_no_stack() {
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user