Revert "[AMDGPU] Support disassembly for AMDGPU kernel descriptors"

This reverts commit cacfb02d28a3cabd4e45d2535cb0686cef48a2c9. Reverting due to buildbot failures.
2025-01-31 20:51:52 +01:00 · 2020-08-19 13:07:40 +05:30 · 2020-08-19 13:07:40 +05:30 · 4697f34ed6
commit 4697f34ed6
parent f7a1832d69
11 changed files with 50 additions and 674 deletions
--- a/include/llvm/Support/AMDHSAKernelDescriptor.h
+++ b/include/llvm/Support/AMDHSAKernelDescriptor.h
@ -162,49 +162,39 @@ struct kernel_descriptor_t {
  uint8_t reserved2[6];
 };

-enum : uint32_t {
-  GROUP_SEGMENT_FIXED_SIZE_OFFSET = 0,
-  PRIVATE_SEGMENT_FIXED_SIZE_OFFSET = 4,
-  RESERVED0_OFFSET = 8,
-  KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET = 16,
-  RESERVED1_OFFSET = 24,
-  COMPUTE_PGM_RSRC3_OFFSET = 44,
-  COMPUTE_PGM_RSRC1_OFFSET = 48,
-  COMPUTE_PGM_RSRC2_OFFSET = 52,
-  KERNEL_CODE_PROPERTIES_OFFSET = 56,
-  RESERVED2_OFFSET = 58,
-};
-
 static_assert(
    sizeof(kernel_descriptor_t) == 64,
    "invalid size for kernel_descriptor_t");
-static_assert(offsetof(kernel_descriptor_t, group_segment_fixed_size) ==
-                  GROUP_SEGMENT_FIXED_SIZE_OFFSET,
-              "invalid offset for group_segment_fixed_size");
-static_assert(offsetof(kernel_descriptor_t, private_segment_fixed_size) ==
-                  PRIVATE_SEGMENT_FIXED_SIZE_OFFSET,
-              "invalid offset for private_segment_fixed_size");
-static_assert(offsetof(kernel_descriptor_t, reserved0) == RESERVED0_OFFSET,
-              "invalid offset for reserved0");
-static_assert(offsetof(kernel_descriptor_t, kernel_code_entry_byte_offset) ==
-                  KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET,
-              "invalid offset for kernel_code_entry_byte_offset");
-static_assert(offsetof(kernel_descriptor_t, reserved1) == RESERVED1_OFFSET,
-              "invalid offset for reserved1");
-static_assert(offsetof(kernel_descriptor_t, compute_pgm_rsrc3) ==
-                  COMPUTE_PGM_RSRC3_OFFSET,
-              "invalid offset for compute_pgm_rsrc3");
-static_assert(offsetof(kernel_descriptor_t, compute_pgm_rsrc1) ==
-                  COMPUTE_PGM_RSRC1_OFFSET,
-              "invalid offset for compute_pgm_rsrc1");
-static_assert(offsetof(kernel_descriptor_t, compute_pgm_rsrc2) ==
-                  COMPUTE_PGM_RSRC2_OFFSET,
-              "invalid offset for compute_pgm_rsrc2");
-static_assert(offsetof(kernel_descriptor_t, kernel_code_properties) ==
-                  KERNEL_CODE_PROPERTIES_OFFSET,
-              "invalid offset for kernel_code_properties");
-static_assert(offsetof(kernel_descriptor_t, reserved2) == RESERVED2_OFFSET,
-              "invalid offset for reserved2");
+static_assert(
+    offsetof(kernel_descriptor_t, group_segment_fixed_size) == 0,
+    "invalid offset for group_segment_fixed_size");
+static_assert(
+    offsetof(kernel_descriptor_t, private_segment_fixed_size) == 4,
+    "invalid offset for private_segment_fixed_size");
+static_assert(
+    offsetof(kernel_descriptor_t, reserved0) == 8,
+    "invalid offset for reserved0");
+static_assert(
+    offsetof(kernel_descriptor_t, kernel_code_entry_byte_offset) == 16,
+    "invalid offset for kernel_code_entry_byte_offset");
+static_assert(
+    offsetof(kernel_descriptor_t, reserved1) == 24,
+    "invalid offset for reserved1");
+static_assert(
+    offsetof(kernel_descriptor_t, compute_pgm_rsrc3) == 44,
+    "invalid offset for compute_pgm_rsrc3");
+static_assert(
+    offsetof(kernel_descriptor_t, compute_pgm_rsrc1) == 48,
+    "invalid offset for compute_pgm_rsrc1");
+static_assert(
+    offsetof(kernel_descriptor_t, compute_pgm_rsrc2) == 52,
+    "invalid offset for compute_pgm_rsrc2");
+static_assert(
+    offsetof(kernel_descriptor_t, kernel_code_properties) == 56,
+    "invalid offset for kernel_code_properties");
+static_assert(
+    offsetof(kernel_descriptor_t, reserved2) == 58,
+    "invalid offset for reserved2");

 } // end namespace amdhsa
 } // end namespace llvm
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@ -34,7 +34,6 @@
 #include "llvm/MC/MCFixedLenDisassembler.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/AMDHSAKernelDescriptor.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
@ -1216,366 +1215,6 @@ bool AMDGPUDisassembler::isGFX10() const {
  return STI.getFeatureBits()[AMDGPU::FeatureGFX10];
 }

-static void checkError(DataExtractor::Cursor &C) {
-  // For some malformed KD cases, the Cursor tends to hold Error::success().
-  // We check that here to prevent runtime crash in this case.
-  if (!C) {
-    auto Err = C.takeError();
-    assert(!Err);
-  }
-}
-
-//===----------------------------------------------------------------------===//
-// AMDGPU specific symbol handling
-//===----------------------------------------------------------------------===//
-#define PRINT_DIRECTIVE(DIRECTIVE, MASK)                                       \
-  do {                                                                         \
-    KdStream << Indent << DIRECTIVE " "                                        \
-             << ((FourByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n';           \
-  } while (0)
-
-// NOLINTNEXTLINE(readability-identifier-naming)
-MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
-    uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
-  using namespace amdhsa;
-  StringRef Indent = "\t";
-
-  // We cannot accurately backward compute #VGPRs used from
-  // GRANULATED_WORKITEM_VGPR_COUNT. But we are concerned with getting the same
-  // value of GRANULATED_WORKITEM_VGPR_COUNT in the reassembled binary. So we
-  // simply calculate the inverse of what the assembler does.
-
-  uint32_t GranulatedWorkitemVGPRCount =
-      (FourByteBuffer & COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT) >>
-      COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT;
-
-  uint32_t NextFreeVGPR = (GranulatedWorkitemVGPRCount + 1) *
-                          AMDGPU::IsaInfo::getVGPREncodingGranule(&STI);
-
-  KdStream << Indent << ".amdhsa_next_free_vgpr " << NextFreeVGPR << '\n';
-
-  // We cannot backward compute values used to calculate
-  // GRANULATED_WAVEFRONT_SGPR_COUNT. Hence the original values for following
-  // directives can't be computed:
-  // .amdhsa_reserve_vcc
-  // .amdhsa_reserve_flat_scratch
-  // .amdhsa_reserve_xnack_mask
-  // They take their respective default values if not specified in the assembly.
-  //
-  // GRANULATED_WAVEFRONT_SGPR_COUNT
-  //    = f(NEXT_FREE_SGPR + VCC + FLAT_SCRATCH + XNACK_MASK)
-  //
-  // We compute the inverse as though all directives apart from NEXT_FREE_SGPR
-  // are set to 0. So while disassembling we consider that:
-  //
-  // GRANULATED_WAVEFRONT_SGPR_COUNT
-  //    = f(NEXT_FREE_SGPR + 0 + 0 + 0)
-  //
-  // The disassembler cannot recover the original values of those 3 directives.
-
-  uint32_t GranulatedWavefrontSGPRCount =
-      (FourByteBuffer & COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT) >>
-      COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT;
-
-  if (isGFX10() && GranulatedWavefrontSGPRCount)
-    return MCDisassembler::Fail;
-
-  uint32_t NextFreeSGPR = (GranulatedWavefrontSGPRCount + 1) *
-                          AMDGPU::IsaInfo::getSGPREncodingGranule(&STI);
-
-  KdStream << Indent << ".amdhsa_reserve_vcc " << 0 << '\n';
-  KdStream << Indent << ".amdhsa_reserve_flat_scratch " << 0 << '\n';
-  KdStream << Indent << ".amdhsa_reserve_xnack_mask " << 0 << '\n';
-  KdStream << Indent << ".amdhsa_next_free_sgpr " << NextFreeSGPR << "\n";
-
-  if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIORITY)
-    return MCDisassembler::Fail;
-
-  PRINT_DIRECTIVE(".amdhsa_float_round_mode_32",
-                  COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_32);
-  PRINT_DIRECTIVE(".amdhsa_float_round_mode_16_64",
-                  COMPUTE_PGM_RSRC1_FLOAT_ROUND_MODE_16_64);
-  PRINT_DIRECTIVE(".amdhsa_float_denorm_mode_32",
-                  COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_32);
-  PRINT_DIRECTIVE(".amdhsa_float_denorm_mode_16_64",
-                  COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64);
-
-  if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIV)
-    return MCDisassembler::Fail;
-
-  PRINT_DIRECTIVE(".amdhsa_dx10_clamp", COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP);
-
-  if (FourByteBuffer & COMPUTE_PGM_RSRC1_DEBUG_MODE)
-    return MCDisassembler::Fail;
-
-  PRINT_DIRECTIVE(".amdhsa_ieee_mode", COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE);
-
-  if (FourByteBuffer & COMPUTE_PGM_RSRC1_BULKY)
-    return MCDisassembler::Fail;
-
-  if (FourByteBuffer & COMPUTE_PGM_RSRC1_CDBG_USER)
-    return MCDisassembler::Fail;
-
-  PRINT_DIRECTIVE(".amdhsa_fp16_overflow", COMPUTE_PGM_RSRC1_FP16_OVFL);
-
-  if (FourByteBuffer & COMPUTE_PGM_RSRC1_RESERVED0)
-    return MCDisassembler::Fail;
-
-  if (isGFX10()) {
-    PRINT_DIRECTIVE(".amdhsa_workgroup_processor_mode",
-                    COMPUTE_PGM_RSRC1_WGP_MODE);
-    PRINT_DIRECTIVE(".amdhsa_memory_ordered", COMPUTE_PGM_RSRC1_MEM_ORDERED);
-    PRINT_DIRECTIVE(".amdhsa_forward_progress", COMPUTE_PGM_RSRC1_FWD_PROGRESS);
-  }
-  return MCDisassembler::Success;
-}
-
-// NOLINTNEXTLINE(readability-identifier-naming)
-MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC2(
-    uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
-  using namespace amdhsa;
-  StringRef Indent = "\t";
-  PRINT_DIRECTIVE(
-      ".amdhsa_system_sgpr_private_segment_wavefront_offset",
-      COMPUTE_PGM_RSRC2_ENABLE_SGPR_PRIVATE_SEGMENT_WAVEFRONT_OFFSET);
-  PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_x",
-                  COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X);
-  PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_y",
-                  COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Y);
-  PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_id_z",
-                  COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_Z);
-  PRINT_DIRECTIVE(".amdhsa_system_sgpr_workgroup_info",
-                  COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_INFO);
-  PRINT_DIRECTIVE(".amdhsa_system_vgpr_workitem_id",
-                  COMPUTE_PGM_RSRC2_ENABLE_VGPR_WORKITEM_ID);
-
-  if (FourByteBuffer & COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_ADDRESS_WATCH)
-    return MCDisassembler::Fail;
-
-  if (FourByteBuffer & COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_MEMORY)
-    return MCDisassembler::Fail;
-
-  if (FourByteBuffer & COMPUTE_PGM_RSRC2_GRANULATED_LDS_SIZE)
-    return MCDisassembler::Fail;
-
-  PRINT_DIRECTIVE(
-      ".amdhsa_exception_fp_ieee_invalid_op",
-      COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INVALID_OPERATION);
-  PRINT_DIRECTIVE(".amdhsa_exception_fp_denorm_src",
-                  COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_FP_DENORMAL_SOURCE);
-  PRINT_DIRECTIVE(
-      ".amdhsa_exception_fp_ieee_div_zero",
-      COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_DIVISION_BY_ZERO);
-  PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_overflow",
-                  COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_OVERFLOW);
-  PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_underflow",
-                  COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_UNDERFLOW);
-  PRINT_DIRECTIVE(".amdhsa_exception_fp_ieee_inexact",
-                  COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_IEEE_754_FP_INEXACT);
-  PRINT_DIRECTIVE(".amdhsa_exception_int_div_zero",
-                  COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO);
-
-  if (FourByteBuffer & COMPUTE_PGM_RSRC2_RESERVED0)
-    return MCDisassembler::Fail;
-
-  return MCDisassembler::Success;
-}
-
-#undef PRINT_DIRECTIVE
-
-MCDisassembler::DecodeStatus
-AMDGPUDisassembler::decodeKernelDescriptorDirective(
-    DataExtractor::Cursor &Cursor, ArrayRef<uint8_t> Bytes,
-    raw_string_ostream &KdStream) const {
-#define PRINT_DIRECTIVE(DIRECTIVE, MASK)                                       \
-  do {                                                                         \
-    KdStream << Indent << DIRECTIVE " "                                        \
-             << ((TwoByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n';            \
-  } while (0)
-
-  uint16_t TwoByteBuffer = 0;
-  uint32_t FourByteBuffer = 0;
-  uint64_t EightByteBuffer = 0;
-
-  StringRef ReservedBytes;
-  StringRef Indent = "\t";
-
-  DataExtractor DE(Bytes, /*IsLittleEndian=*/true, /*AddressSize=*/8);
-
-  switch (Cursor.tell()) {
-  case amdhsa::GROUP_SEGMENT_FIXED_SIZE_OFFSET:
-    FourByteBuffer = DE.getU32(Cursor);
-    checkError(Cursor);
-    KdStream << Indent << ".amdhsa_group_segment_fixed_size " << FourByteBuffer
-             << '\n';
-    return MCDisassembler::Success;
-
-  case amdhsa::PRIVATE_SEGMENT_FIXED_SIZE_OFFSET:
-    FourByteBuffer = DE.getU32(Cursor);
-    checkError(Cursor);
-    KdStream << Indent << ".amdhsa_private_segment_fixed_size "
-             << FourByteBuffer << '\n';
-    return MCDisassembler::Success;
-
-  case amdhsa::RESERVED0_OFFSET:
-    // 8 reserved bytes, must be 0.
-    EightByteBuffer = DE.getU64(Cursor);
-    checkError(Cursor);
-    if (EightByteBuffer) {
-      return MCDisassembler::Fail;
-    }
-    return MCDisassembler::Success;
-
-  case amdhsa::KERNEL_CODE_ENTRY_BYTE_OFFSET_OFFSET:
-    // KERNEL_CODE_ENTRY_BYTE_OFFSET
-    // So far no directive controls this for Code Object V3, so simply skip for
-    // disassembly.
-    DE.skip(Cursor, 8);
-    checkError(Cursor);
-    return MCDisassembler::Success;
-
-  case amdhsa::RESERVED1_OFFSET:
-    // 20 reserved bytes, must be 0.
-    ReservedBytes = DE.getBytes(Cursor, 20);
-    checkError(Cursor);
-    for (int I = 0; I < 20; ++I) {
-      if (ReservedBytes[I] != 0) {
-        return MCDisassembler::Fail;
-      }
-    }
-    return MCDisassembler::Success;
-
-  case amdhsa::COMPUTE_PGM_RSRC3_OFFSET:
-    // COMPUTE_PGM_RSRC3
-    //  - Only set for GFX10, GFX6-9 have this to be 0.
-    //  - Currently no directives directly control this.
-    FourByteBuffer = DE.getU32(Cursor);
-    checkError(Cursor);
-    if (!isGFX10() && FourByteBuffer) {
-      return MCDisassembler::Fail;
-    }
-    return MCDisassembler::Success;
-
-  case amdhsa::COMPUTE_PGM_RSRC1_OFFSET:
-    FourByteBuffer = DE.getU32(Cursor);
-    checkError(Cursor);
-    if (decodeCOMPUTE_PGM_RSRC1(FourByteBuffer, KdStream) ==
-        MCDisassembler::Fail) {
-      return MCDisassembler::Fail;
-    }
-    return MCDisassembler::Success;
-
-  case amdhsa::COMPUTE_PGM_RSRC2_OFFSET:
-    FourByteBuffer = DE.getU32(Cursor);
-    checkError(Cursor);
-    if (decodeCOMPUTE_PGM_RSRC2(FourByteBuffer, KdStream) ==
-        MCDisassembler::Fail) {
-      return MCDisassembler::Fail;
-    }
-    return MCDisassembler::Success;
-
-  case amdhsa::KERNEL_CODE_PROPERTIES_OFFSET:
-    using namespace amdhsa;
-    TwoByteBuffer = DE.getU16(Cursor);
-    checkError(Cursor);
-
-    PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_buffer",
-                    KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER);
-    PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_ptr",
-                    KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR);
-    PRINT_DIRECTIVE(".amdhsa_user_sgpr_queue_ptr",
-                    KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR);
-    PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_segment_ptr",
-                    KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR);
-    PRINT_DIRECTIVE(".amdhsa_user_sgpr_dispatch_id",
-                    KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID);
-    PRINT_DIRECTIVE(".amdhsa_user_sgpr_flat_scratch_init",
-                    KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
-    PRINT_DIRECTIVE(".amdhsa_user_sgpr_private_segment_size",
-                    KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
-
-    if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED0)
-      return MCDisassembler::Fail;
-
-    // Reserved for GFX9
-    if (isGFX9() &&
-        (TwoByteBuffer & KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32)) {
-      return MCDisassembler::Fail;
-    } else if (isGFX10()) {
-      PRINT_DIRECTIVE(".amdhsa_wavefront_size32",
-                      KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
-    }
-
-    if (TwoByteBuffer & KERNEL_CODE_PROPERTY_RESERVED1)
-      return MCDisassembler::Fail;
-
-    return MCDisassembler::Success;
-
-  case amdhsa::RESERVED2_OFFSET:
-    // 6 bytes from here are reserved, must be 0.
-    ReservedBytes = DE.getBytes(Cursor, 6);
-    checkError(Cursor);
-    for (int I = 0; I < 6; ++I) {
-      if (ReservedBytes[I] != 0)
-        return MCDisassembler::Fail;
-    }
-    return MCDisassembler::Success;
-
-  default:
-    llvm_unreachable("Unhandled index. Case statements cover everything.");
-    return MCDisassembler::Fail;
-  }
-#undef PRINT_DIRECTIVE
-}
-
-MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeKernelDescriptor(
-    StringRef KdName, ArrayRef<uint8_t> Bytes, uint64_t KdAddress) const {
-  // CP microcode requires the kernel descriptor to be 64 aligned.
-  if (Bytes.size() != 64 || KdAddress % 64 != 0)
-    return MCDisassembler::Fail;
-
-  std::string Kd;
-  raw_string_ostream KdStream(Kd);
-  KdStream << ".amdhsa_kernel " << KdName << '\n';
-
-  DataExtractor::Cursor C(0);
-  while (C && C.tell() < Bytes.size()) {
-    MCDisassembler::DecodeStatus Status =
-        decodeKernelDescriptorDirective(C, Bytes, KdStream);
-
-    if (Status == MCDisassembler::Fail)
-      return MCDisassembler::Fail;
-  }
-  KdStream << ".end_amdhsa_kernel\n";
-  outs() << KdStream.str();
-  return MCDisassembler::Success;
-}
-
-Optional<MCDisassembler::DecodeStatus>
-AMDGPUDisassembler::onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
-                                  ArrayRef<uint8_t> Bytes, uint64_t Address,
-                                  raw_ostream &CStream) const {
-  // Right now only kernel descriptor needs to be handled.
-  // We ignore all other symbols for target specific handling.
-  // TODO:
-  // Fix the spurious symbol issue for AMDGPU kernels. Exists for both Code
-  // Object V2 and V3 when symbols are marked protected.
-
-  // amd_kernel_code_t for Code Object V2.
-  if (Symbol.Type == ELF::STT_AMDGPU_HSA_KERNEL) {
-    Size = 256;
-    return MCDisassembler::Fail;
-  }
-
-  // Code Object V3 kernel descriptors.
-  StringRef Name = Symbol.Name;
-  if (Symbol.Type == ELF::STT_OBJECT && Name.endswith(StringRef(".kd"))) {
-    Size = 64; // Size = 64 regardless of success or failure.
-    return decodeKernelDescriptor(Name.drop_back(3), Bytes, Address);
-  }
-  return None;
-}
-
 //===----------------------------------------------------------------------===//
 // AMDGPUSymbolizer
 //===----------------------------------------------------------------------===//
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@ -17,11 +17,10 @@

 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
 #include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
 #include "llvm/MC/MCDisassembler/MCSymbolizer.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/Support/DataExtractor.h"

 #include <algorithm>
 #include <cstdint>
@ -67,33 +66,6 @@ public:
  DecodeStatus tryDecodeInst(const uint8_t* Table, MCInst &MI, uint64_t Inst,
                             uint64_t Address) const;

-  Optional<DecodeStatus> onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
-                                       ArrayRef<uint8_t> Bytes,
-                                       uint64_t Address,
-                                       raw_ostream &CStream) const override;
-
-  DecodeStatus decodeKernelDescriptor(StringRef KdName, ArrayRef<uint8_t> Bytes,
-                                      uint64_t KdAddress) const;
-
-  DecodeStatus
-  decodeKernelDescriptorDirective(DataExtractor::Cursor &Cursor,
-                                  ArrayRef<uint8_t> Bytes,
-                                  raw_string_ostream &KdStream) const;
-
-  /// Decode as directives that handle COMPUTE_PGM_RSRC1.
-  /// \param FourByteBuffer - Bytes holding contents of COMPUTE_PGM_RSRC1.
-  /// \param KdStream       - Stream to write the disassembled directives to.
-  // NOLINTNEXTLINE(readability-identifier-naming)
-  DecodeStatus decodeCOMPUTE_PGM_RSRC1(uint32_t FourByteBuffer,
-                                       raw_string_ostream &KdStream) const;
-
-  /// Decode as directives that handle COMPUTE_PGM_RSRC2.
-  /// \param FourByteBuffer - Bytes holding contents of COMPUTE_PGM_RSRC2.
-  /// \param KdStream       - Stream to write the disassembled directives to.
-  // NOLINTNEXTLINE(readability-identifier-naming)
-  DecodeStatus decodeCOMPUTE_PGM_RSRC2(uint32_t FourByteBuffer,
-                                       raw_string_ostream &KdStream) const;
-
  DecodeStatus convertSDWAInst(MCInst &MI) const;
  DecodeStatus convertDPP8Inst(MCInst &MI) const;
  DecodeStatus convertMIMGInst(MCInst &MI) const;
--- a/test/CodeGen/AMDGPU/nop-data.ll
+++ b/test/CodeGen/AMDGPU/nop-data.ll
@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mattr=-code-object-v3 -mcpu=fiji -filetype=obj < %s | llvm-objdump -d - --mcpu=fiji | FileCheck %s

 ; CHECK: <kernel0>:
-; CHECK: s_endpgm
+; CHECK-NEXT: s_endpgm
 define amdgpu_kernel void @kernel0() align 256 {
 entry:
  ret void
@ -80,7 +80,7 @@ entry:

 ; CHECK-EMPTY:
 ; CHECK-NEXT: <kernel1>:
-; CHECK: s_endpgm
+; CHECK-NEXT: s_endpgm
 define amdgpu_kernel void @kernel1(i32 addrspace(1)* addrspace(4)* %ptr.out) align 256 {
 entry:
  ret void
--- a/test/tools/llvm-objdump/ELF/AMDGPU/kd-failure.s
+++ b/test/tools/llvm-objdump/ELF/AMDGPU/kd-failure.s
@ -1,35 +0,0 @@
-; RUN: llvm-mc %s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t.o
-
-; RUN: printf ".type  my_kernel.kd, @object \nmy_kernel.kd:\n.size my_kernel.kd, 64\n" > %t1.sym_info
-; RUN: llvm-objdump --arch-name=amdgcn --mcpu=gfx908 --disassemble-symbols=my_kernel.kd %t.o \
-; RUN: | tail -n +9 > %t1.sym_content
-; RUN: cat %t1.sym_info %t1.sym_content > %t1.s
-
-; RUN: llvm-mc %t1.s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t-re-assemble.o
-; RUN: diff %t.o %t-re-assemble.o
-
-// Test failure by setting one of the reserved bytes to non-zero value.
-
-.type	my_kernel.kd, @object
-.size my_kernel.kd, 64
-my_kernel.kd:
-  .long 0x00000000           ; group_segment_fixed_size
-  .long 0x00000000           ; private_segment_fixed_size
-  .quad 0x00FF000000000000   ; reserved bytes.
-  .quad 0x0000000000000000   ; kernel_code_entry_byte_offset, any value works.
-
-  ; 20 reserved bytes.
-  .quad 0x0000000000000000
-  .quad 0x0000000000000000
-  .long 0x00000000
-
-  .long 0x00000000           ; compute_PGM_RSRC3
-  .long 0x00000000           ; compute_PGM_RSRC1
-  .long 0x00000000           ; compute_PGM_RSRC2
-  .short 0x0000              ; additional fields.
-
-  ; 6 reserved bytes.
-  .long 0x0000000
-  .short 0x0000
-
-
--- a/test/tools/llvm-objdump/ELF/AMDGPU/kd-sgpr.s
+++ b/test/tools/llvm-objdump/ELF/AMDGPU/kd-sgpr.s
@ -1,35 +0,0 @@
-; RUN: llvm-mc %s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t
-; RUN: llvm-objdump --arch-name=amdgcn --mcpu=gfx908 --disassemble-symbols=my_kernel_1.kd %t | tail -n +8 > %t1.s
-; RUN: llvm-objdump --arch-name=amdgcn --mcpu=gfx908 --disassemble-symbols=my_kernel_2.kd %t | tail -n +8 > %t2.s
-; RUN: llvm-objdump --arch-name=amdgcn --mcpu=gfx908 --disassemble-symbols=my_kernel_3.kd %t | tail -n +8 > %t3.s
-; RUN: cat %t1.s %t2.s %t3.s | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t-re-assemble
-; RUN: diff %t %t-re-assemble
-
-// Test disassembly for GRANULATED_WAVEFRONT_SGPR_COUNT.
-
-// Only set next_free_sgpr
-.amdhsa_kernel my_kernel_1
-  .amdhsa_next_free_vgpr 0
-  .amdhsa_next_free_sgpr 42
-  .amdhsa_reserve_flat_scratch 0
-  .amdhsa_reserve_xnack_mask 0
-  .amdhsa_reserve_vcc 0
-.end_amdhsa_kernel
-
-// Only set other directives.
-.amdhsa_kernel my_kernel_2
-  .amdhsa_next_free_vgpr 0
-  .amdhsa_next_free_sgpr 0
-  .amdhsa_reserve_flat_scratch 1
-  .amdhsa_reserve_xnack_mask 1
-  .amdhsa_reserve_vcc 1
-.end_amdhsa_kernel
-
-// Set all affecting directives.
-.amdhsa_kernel my_kernel_3
-  .amdhsa_next_free_vgpr 0
-  .amdhsa_next_free_sgpr 35
-  .amdhsa_reserve_flat_scratch 1
-  .amdhsa_reserve_xnack_mask 1
-  .amdhsa_reserve_vcc 1
-.end_amdhsa_kernel
--- a/test/tools/llvm-objdump/ELF/AMDGPU/kd-vgpr.s
+++ b/test/tools/llvm-objdump/ELF/AMDGPU/kd-vgpr.s
@ -1,24 +0,0 @@
-; RUN: llvm-mc %s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t
-; RUN: llvm-objdump --arch-name=amdgcn --mcpu=gfx908 --disassemble-symbols=my_kernel_1.kd %t | tail -n +8 > %t1.s
-; RUN: llvm-objdump --arch-name=amdgcn --mcpu=gfx908 --disassemble-symbols=my_kernel_2.kd %t | tail -n +8 > %t2.s
-; RUN: llvm-objdump --arch-name=amdgcn --mcpu=gfx908 --disassemble-symbols=my_kernel_3.kd %t | tail -n +8 > %t3.s
-; RUN: cat %t1.s %t2.s %t3.s | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t-re-assemble
-; RUN: diff %t %t-re-assemble
-
-// Test disassembly for GRANULATED_WORKITEM_VGPR_COUNT.
-
-.amdhsa_kernel my_kernel_1
-  .amdhsa_next_free_vgpr 23
-  .amdhsa_next_free_sgpr 0
-.end_amdhsa_kernel
-
-.amdhsa_kernel my_kernel_2
-  .amdhsa_next_free_vgpr 14
-  .amdhsa_next_free_sgpr 0
-.end_amdhsa_kernel
-
-.amdhsa_kernel my_kernel_3
-  .amdhsa_next_free_vgpr 32
-  .amdhsa_next_free_sgpr 0
-.end_amdhsa_kernel
-
--- a/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s
+++ b/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s
@ -1,56 +0,0 @@
-; RUN: llvm-mc %s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=obj -o %t
-; RUN: llvm-objdump -s -j .text %t | FileCheck --check-prefix=OBJDUMP %s
-
-// TODO:
-// This file and kd-zeroed-raw.s should produce the same output for the kernel
-// descriptor - a block of 64 zeroed bytes. But looks like the assembler sets
-// the FWD_PROGRESS bit in COMPUTE_PGM_RSRC1 to 1 even when the directive
-// mentions 0 (see line 34).
-
-// Check the raw bytes right now
-
-// OBJDUMP:      0000 00000000 00000000 00000000 00000000
-// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
-// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
-// OBJDUMP-NEXT: 0030 01000000 00000000 00000000 00000000
-
-.amdhsa_kernel my_kernel
-  .amdhsa_group_segment_fixed_size 0
-  .amdhsa_private_segment_fixed_size 0
-  .amdhsa_next_free_vgpr 8
-  .amdhsa_reserve_vcc 0
-  .amdhsa_reserve_flat_scratch 0
-  .amdhsa_reserve_xnack_mask 0
-  .amdhsa_next_free_sgpr 8
-  .amdhsa_float_round_mode_32 0
-  .amdhsa_float_round_mode_16_64 0
-  .amdhsa_float_denorm_mode_32 0
-  .amdhsa_float_denorm_mode_16_64 0
-  .amdhsa_dx10_clamp 0
-  .amdhsa_ieee_mode 0
-  .amdhsa_fp16_overflow 0
-  .amdhsa_workgroup_processor_mode 0
-  .amdhsa_memory_ordered 0
-  .amdhsa_forward_progress 0
-  .amdhsa_system_sgpr_private_segment_wavefront_offset 0
-  .amdhsa_system_sgpr_workgroup_id_x 0
-  .amdhsa_system_sgpr_workgroup_id_y 0
-  .amdhsa_system_sgpr_workgroup_id_z 0
-  .amdhsa_system_sgpr_workgroup_info 0
-  .amdhsa_system_vgpr_workitem_id 0
-  .amdhsa_exception_fp_ieee_invalid_op 0
-  .amdhsa_exception_fp_denorm_src 0
-  .amdhsa_exception_fp_ieee_div_zero 0
-  .amdhsa_exception_fp_ieee_overflow 0
-  .amdhsa_exception_fp_ieee_underflow 0
-  .amdhsa_exception_fp_ieee_inexact 0
-  .amdhsa_exception_int_div_zero 0
-  .amdhsa_user_sgpr_private_segment_buffer 0
-  .amdhsa_user_sgpr_dispatch_ptr 0
-  .amdhsa_user_sgpr_queue_ptr 0
-  .amdhsa_user_sgpr_kernarg_segment_ptr 0
-  .amdhsa_user_sgpr_dispatch_id 0
-  .amdhsa_user_sgpr_flat_scratch_init 0
-  .amdhsa_user_sgpr_private_segment_size 0
-  .amdhsa_wavefront_size32 0
-.end_amdhsa_kernel
--- a/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx9.s
+++ b/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx9.s
@ -1,51 +0,0 @@
-; RUN: llvm-mc %s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t1
-; RUN: llvm-objdump --arch-name=amdgcn --mcpu=gfx908 --disassemble-symbols=my_kernel.kd %t1 \
-; RUN: | tail -n +8 | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t2
-; RUN: diff %t1 %t2
-
-; RUN: llvm-objdump -s -j .text %t1 | FileCheck --check-prefix=OBJDUMP %s
-
-; OBJDUMP:      0000 00000000 00000000 00000000 00000000
-; OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
-; OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
-; OBJDUMP-NEXT: 0030 00000000 00000000 00000000 00000000
-
-// This file and kd-zeroed-raw.s produce the same output for the kernel
-// descriptor - a block of 64 zeroed bytes.
-
-.amdhsa_kernel my_kernel
-  .amdhsa_group_segment_fixed_size 0
-  .amdhsa_private_segment_fixed_size 0
-  .amdhsa_next_free_vgpr 0
-  .amdhsa_reserve_vcc 0
-  .amdhsa_reserve_flat_scratch 0
-  .amdhsa_reserve_xnack_mask 0
-  .amdhsa_next_free_sgpr 0
-  .amdhsa_float_round_mode_32 0
-  .amdhsa_float_round_mode_16_64 0
-  .amdhsa_float_denorm_mode_32 0
-  .amdhsa_float_denorm_mode_16_64 0
-  .amdhsa_dx10_clamp 0
-  .amdhsa_ieee_mode 0
-  .amdhsa_fp16_overflow 0
-  .amdhsa_system_sgpr_private_segment_wavefront_offset 0
-  .amdhsa_system_sgpr_workgroup_id_x 0
-  .amdhsa_system_sgpr_workgroup_id_y 0
-  .amdhsa_system_sgpr_workgroup_id_z 0
-  .amdhsa_system_sgpr_workgroup_info 0
-  .amdhsa_system_vgpr_workitem_id 0
-  .amdhsa_exception_fp_ieee_invalid_op 0
-  .amdhsa_exception_fp_denorm_src 0
-  .amdhsa_exception_fp_ieee_div_zero 0
-  .amdhsa_exception_fp_ieee_overflow 0
-  .amdhsa_exception_fp_ieee_underflow 0
-  .amdhsa_exception_fp_ieee_inexact 0
-  .amdhsa_exception_int_div_zero 0
-  .amdhsa_user_sgpr_private_segment_buffer 0
-  .amdhsa_user_sgpr_dispatch_ptr 0
-  .amdhsa_user_sgpr_queue_ptr 0
-  .amdhsa_user_sgpr_kernarg_segment_ptr 0
-  .amdhsa_user_sgpr_dispatch_id 0
-  .amdhsa_user_sgpr_flat_scratch_init 0
-  .amdhsa_user_sgpr_private_segment_size 0
-.end_amdhsa_kernel
--- a/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-raw.s
+++ b/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-raw.s
@ -1,41 +0,0 @@
-; RUN: llvm-mc %s -mattr=+code-object-v3 --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t1
-; RUN: llvm-objdump --arch-name=amdgcn --mcpu=gfx908 --disassemble-symbols=my_kernel.kd %t1 \
-; RUN: | tail -n +8 | llvm-mc --triple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=obj -o %t2
-; RUN: llvm-objdump -s -j .text %t2 | FileCheck --check-prefix=OBJDUMP %s
-
-// Not running lit-test over gfx10 (see kd-zeroed-gfx10.s for details).
-// kd-zeroed-raw.s and kd-zeroed-*.s should produce the same output for the
-// kernel descriptor - a block of 64 zeroed bytes.
-
-// The disassembly will produce the contents of kd-zeroed-*.s which on being
-// assembled contains additional relocation info. A diff over the entire object
-// will fail in this case. So we check by looking the bytes in .text.
-
-// OBJDUMP:      0000 00000000 00000000 00000000 00000000
-// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
-// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
-// OBJDUMP-NEXT: 0030 00000000 00000000 00000000 00000000
-
-// The entire object is zeroed out.
-
-.type	my_kernel.kd, @object
-.size my_kernel.kd, 64
-my_kernel.kd:
-  .long 0x00000000           ; group_segment_fixed_size
-  .long 0x00000000           ; private_segment_fixed_size
-  .quad 0x0000000000000000   ; reserved bytes.
-  .quad 0x0000000000000000   ; kernel_code_entry_byte_offset, any value works.
-
-  ; 20 reserved bytes.
-  .quad 0x0000000000000000
-  .quad 0x0000000000000000
-  .long 0x00000000
-
-  .long 0x00000000           ; compute_PGM_RSRC3
-  .long 0x00000000           ; compute_PGM_RSRC1
-  .long 0x00000000           ; compute_PGM_RSRC2
-  .short 0x0000              ; additional fields.
-
-  ; 6 reserved bytes.
-  .long 0x0000000
-  .short 0x0000
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@ -1849,6 +1849,23 @@ static void disassembleObject(const Target *TheTarget, const ObjectFile *Obj,
        outs() << SectionName << ":\n";
      }

+      if (Obj->isELF() && Obj->getArch() == Triple::amdgcn) {
+        if (Symbols[SI].Type == ELF::STT_AMDGPU_HSA_KERNEL) {
+          // Skip amd_kernel_code_t (256 bytes) at the start of the symbol.
+          Start += 256;
+        }
+        if (SI == SE - 1 ||
+            Symbols[SI + 1].Type == ELF::STT_AMDGPU_HSA_KERNEL) {
+          // Trim trailing zero dwords (4 bytes at a time, at most 256 bytes)
+          // when this is the last symbol or the next symbol is an HSA kernel.
+          const uint64_t EndAlign = 256;
+          const auto Limit = End - (std::min)(EndAlign, End - Start);
+          while (End > Limit &&
+            *reinterpret_cast<const support::ulittle32_t*>(&Bytes[End - 4]) == 0)
+            End -= 4;
+        }
+      }
+
      outs() << '\n';
      if (!NoLeadingAddr)
        outs() << format(Is64Bits ? "%016" PRIx64 " " : "%08" PRIx64 " ",