AMDGPU: Set element_size in private resource descriptor

Introduce a subtarget feature for this, and leave the default at the current behavior, which assumes that loads/stores of up to 16 bytes can be used. The field also appears to support a 2-byte setting, but I'm not sure what that would be used for.

llvm-svn: 260651
2024-11-24 03:33:20 +01:00 · 2016-02-12 02:40:47 +00:00 · 2016-02-12 02:40:47 +00:00 · 628b2818b6
commit 628b2818b6
parent cf66bc968c
11 changed files with 66 additions and 12 deletions
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@ -175,6 +175,18 @@ def FeatureFPExceptions : SubtargetFeature<"fp-exceptions",
  "Enable floating point exceptions"
 >;

+class FeatureMaxPrivateElementSize<int size> : SubtargetFeature<
+  "max-private-element-size-"#size,           // Feature name; size is appended.
+  "MaxPrivateElementSize",                    // Subtarget field the feature sets.
+  !cast<string>(size),                        // Value stored into that field.
+  "Maximum private access size may be "#size  // Description string.
+>;
+
+def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>;
+def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
+def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;
+
+
 def FeatureEnableHugeScratchBuffer : SubtargetFeature<
  "huge-scratch-buffer",
  "EnableHugeScratchBuffer",
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@ -593,6 +593,20 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
  }
 }

+// Maps a private element size in bytes to its amd_element_byte_size_t value.
+static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
+  switch (Size) { // NOTE(review): presumably log2(Size) - 1; confirm vs. the enum.
+  case 4:
+    return AMD_ELEMENT_4_BYTES;
+  case 8:
+    return AMD_ELEMENT_8_BYTES;
+  case 16:
+    return AMD_ELEMENT_16_BYTES;
+  default: // Sizes other than 4/8/16 (e.g. 2) are not expected here.
+    llvm_unreachable("invalid private_element_size");
+  }
+}
+
 void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
                                         const SIProgramInfo &KernelInfo) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@ -606,6 +620,11 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
      (KernelInfo.ComputePGMRSrc2 << 32);
  header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;

+
+  AMD_HSA_BITS_SET(header.code_properties,
+                   AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
+                   getElementByteSizeValue(STM.getMaxPrivateElementSize()));
+
  if (MFI->hasPrivateSegmentBuffer()) {
    header.code_properties |=
      AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@ -58,6 +58,11 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
    FP32Denormals = false;
    FP64Denormals = false;
  }
+
+  // Set defaults if needed.
+  if (MaxPrivateElementSize == 0)
+    MaxPrivateElementSize = 16;
+
  return *this;
 }

@ -74,7 +79,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
      EnableUnsafeDSOffsetFolding(false),
      EnableXNACK(false),
      WavefrontSize(0), CFALUBug(false),
-      LocalMemorySize(0),
+      LocalMemorySize(0), MaxPrivateElementSize(0),
      EnableVGPRSpilling(false), SGPRInitBug(false), IsGCN(false),
      GCN1Encoding(false), GCN3Encoding(false), CIInsts(false), LDSBankCount(0),
      IsaVersion(ISAVersion0_0_0), EnableHugeScratchBuffer(false),
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@ -81,6 +81,7 @@ private:
  unsigned WavefrontSize;
  bool CFALUBug;
  int LocalMemorySize;
+  unsigned MaxPrivateElementSize;
  bool EnableVGPRSpilling;
  bool SGPRInitBug;
  bool IsGCN;
@ -253,6 +254,10 @@ public:
    return LocalMemorySize;
  }

+  unsigned getMaxPrivateElementSize() const {
+    return MaxPrivateElementSize; // In bytes; initializeSubtargetDependencies defaults it to 16.
+  }
+
  bool hasSGPRInitBug() const {
    return SGPRInitBug;
  }
--- a/lib/Target/AMDGPU/AMDKernelCodeT.h
+++ b/lib/Target/AMDGPU/AMDKernelCodeT.h
@ -44,6 +44,15 @@ enum amd_code_version_t {
  AMD_CODE_VERSION_MINOR = 1
 };

+// Sets the mask field of packed dst to val (one expression; dst read twice).
+#define AMD_HSA_BITS_SET(dst, mask, val)                                       \
+  ((dst) = ((dst) & (~(1 << mask ## _SHIFT) & ~(mask))) |                      \
+           (((val) << mask ## _SHIFT) & (mask)))
+
+// Extracts the mask field from packed src, shifted down to bit 0 (an rvalue).
+#define AMD_HSA_BITS_GET(src, mask)                                            \
+  (((src) & (mask)) >> mask ## _SHIFT)
+
 /// The values used to define the number of bytes to use for the
 /// swizzle element size.
 enum amd_element_byte_size_t {
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@ -3059,6 +3059,10 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
                    AMDGPU::RSRC_TID_ENABLE |
                    0xffffffff; // Size;

+  uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
+
+  Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT);
+
  // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
  // Clear them unless we want a huge stride.
  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@ -489,7 +489,7 @@ namespace AMDGPU {

  const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
  const uint64_t RSRC_TID_ENABLE = 1LL << 55;
-
+  const uint64_t RSRC_ELEMENT_SIZE_SHIFT = 51; // Shift of the element-size value within Rsrc23.
 } // End namespace AMDGPU

 namespace SI {
--- a/test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll
@ -10,8 +10,8 @@
 ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GCN: s_mov_b32 s10, -1
-; CI: s_mov_b32 s11, 0x80f000
-; VI: s_mov_b32 s11, 0x800000
+; CI: s_mov_b32 s11, 0x98f000
+; VI: s_mov_b32 s11, 0x980000


 ; GCNHSA: .amd_kernel_code_t
--- a/test/CodeGen/AMDGPU/large-alloca-graphics.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-graphics.ll
@ -5,8 +5,8 @@
 ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GCN: s_mov_b32 s10, -1
-; CI: s_mov_b32 s11, 0x80f000
-; VI: s_mov_b32 s11, 0x800000
+; CI: s_mov_b32 s11, 0x98f000
+; VI: s_mov_b32 s11, 0x980000

 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
@ -26,8 +26,8 @@ define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 {
 ; GCN: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
 ; GCN: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
 ; GCN: s_mov_b32 s10, -1
-; CI: s_mov_b32 s11, 0x80f000
-; VI: s_mov_b32 s11, 0x800000
+; CI: s_mov_b32 s11, 0x98f000
+; VI: s_mov_b32 s11, 0x980000

 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@ -17,8 +17,8 @@
 ; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN-NEXT: s_mov_b32 s14, -1
-; SI-NEXT: s_mov_b32 s15, 0x80f000
-; VI-NEXT: s_mov_b32 s15, 0x800000
+; SI-NEXT: s_mov_b32 s15, 0x98f000
+; VI-NEXT: s_mov_b32 s15, 0x980000


 ; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@ -14,8 +14,8 @@
 ; GCN: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GCN-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCN-NEXT: s_mov_b32 s14, -1
-; SI-NEXT: s_mov_b32 s15, 0x80f000
-; VI-NEXT: s_mov_b32 s15, 0x800000
+; SI-NEXT: s_mov_b32 s15, 0x98f000
+; VI-NEXT: s_mov_b32 s15, 0x980000

 ; s12 is offset user SGPR
 ; GCN: buffer_store_dword {{v[0-9]+}}, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill