mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-26 12:43:36 +01:00
c86d684da8
[amdgpu] Implement lower function LDS pass Local variables are allocated at kernel launch. This pass collects global variables that are used from non-kernel functions, moves them into a new struct type, and allocates an instance of that type in every kernel. Uses are then replaced with a constantexpr offset. Prior to this pass, accesses from a function are compiled to trap. With this pass, most such accesses are removed before reaching codegen. The trap logic is left unchanged by this pass. It is still reachable for the cases this pass misses, notably the extern shared construct from hip and variables marked constant which survive the optimizer. This is of interest to the openmp project because the deviceRTL runtime library uses cuda shared variables from functions that cannot be inlined. Trunk llvm therefore cannot compile some openmp kernels for amdgpu. In addition to the unit tests attached, this patch applied to ROCm llvm with fixed-abi enabled and the function pointer hashing scheme deleted passes the openmp suite. This lowering will use more LDS than strictly necessary. It is intended to be a functionally correct fallback for cases that are difficult to target from future optimisation passes. Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D94648
106 lines
3.0 KiB
C++
106 lines
3.0 KiB
C++
//===-- AMDGPUMachineFunctionInfo.h -------------------------------*- C++ -*-=//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
|
|
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUMACHINEFUNCTION_H
|
|
|
|
#include "Utils/AMDGPUBaseInfo.h"
|
|
#include "llvm/ADT/DenseMap.h"
|
|
#include "llvm/CodeGen/MachineFunction.h"
|
|
|
|
namespace llvm {
|
|
|
|
class GCNSubtarget;
|
|
|
|
class AMDGPUMachineFunction : public MachineFunctionInfo {
|
|
/// A map to keep track of local memory objects and their offsets within the
|
|
/// local memory space.
|
|
SmallDenseMap<const GlobalValue *, unsigned, 4> LocalMemoryObjects;
|
|
|
|
protected:
|
|
uint64_t ExplicitKernArgSize = 0; // Cache for this.
|
|
Align MaxKernArgAlign; // Cache for this.
|
|
|
|
/// Number of bytes in the LDS that are being used.
|
|
unsigned LDSSize = 0;
|
|
|
|
/// Number of bytes in the LDS allocated statically. This field is only used
|
|
/// in the instruction selector and not part of the machine function info.
|
|
unsigned StaticLDSSize = 0;
|
|
|
|
/// Align for dynamic shared memory if any. Dynamic shared memory is
|
|
/// allocated directly after the static one, i.e., LDSSize. Need to pad
|
|
/// LDSSize to ensure that dynamic one is aligned accordingly.
|
|
/// The maximal alignment is updated during IR translation or lowering
|
|
/// stages.
|
|
Align DynLDSAlign;
|
|
|
|
// State of MODE register, assumed FP mode.
|
|
AMDGPU::SIModeRegisterDefaults Mode;
|
|
|
|
// Kernels + shaders. i.e. functions called by the hardware and not called
|
|
// by other functions.
|
|
bool IsEntryFunction = false;
|
|
|
|
// Entry points called by other functions instead of directly by the hardware.
|
|
bool IsModuleEntryFunction = false;
|
|
|
|
bool NoSignedZerosFPMath = false;
|
|
|
|
// Function may be memory bound.
|
|
bool MemoryBound = false;
|
|
|
|
// Kernel may need limited waves per EU for better performance.
|
|
bool WaveLimiter = false;
|
|
|
|
public:
|
|
AMDGPUMachineFunction(const MachineFunction &MF);
|
|
|
|
uint64_t getExplicitKernArgSize() const {
|
|
return ExplicitKernArgSize;
|
|
}
|
|
|
|
unsigned getMaxKernArgAlign() const { return MaxKernArgAlign.value(); }
|
|
|
|
unsigned getLDSSize() const {
|
|
return LDSSize;
|
|
}
|
|
|
|
AMDGPU::SIModeRegisterDefaults getMode() const {
|
|
return Mode;
|
|
}
|
|
|
|
bool isEntryFunction() const {
|
|
return IsEntryFunction;
|
|
}
|
|
|
|
bool isModuleEntryFunction() const { return IsModuleEntryFunction; }
|
|
|
|
bool hasNoSignedZerosFPMath() const {
|
|
return NoSignedZerosFPMath;
|
|
}
|
|
|
|
bool isMemoryBound() const {
|
|
return MemoryBound;
|
|
}
|
|
|
|
bool needsWaveLimiter() const {
|
|
return WaveLimiter;
|
|
}
|
|
|
|
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
|
|
void allocateModuleLDSGlobal(const Module *M);
|
|
|
|
Align getDynLDSAlign() const { return DynLDSAlign; }
|
|
|
|
void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV);
|
|
};
|
|
|
|
}
|
|
#endif
|