mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-25 12:12:47 +01:00
47ca9dba51
Update AMDGPU gfx90a memory model to make coarse grain memory allocations consistent when fine grained system scope atomic acquire and release is performed. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D105137
1868 lines
67 KiB
C++
1868 lines
67 KiB
C++
//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
|
|
//
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
//
|
|
/// \file
|
|
/// Memory legalizer - implements memory model. More information can be
|
|
/// found here:
|
|
/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
#include "AMDGPU.h"
|
|
#include "AMDGPUMachineModuleInfo.h"
|
|
#include "GCNSubtarget.h"
|
|
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
|
#include "llvm/ADT/BitmaskEnum.h"
|
|
#include "llvm/CodeGen/MachineBasicBlock.h"
|
|
#include "llvm/IR/DiagnosticInfo.h"
|
|
#include "llvm/Support/AtomicOrdering.h"
|
|
#include "llvm/Support/TargetParser.h"
|
|
|
|
using namespace llvm;
|
|
using namespace llvm::AMDGPU;
|
|
|
|
#define DEBUG_TYPE "si-memory-legalizer"
|
|
#define PASS_NAME "SI Memory Legalizer"
|
|
|
|
static cl::opt<bool> AmdgcnSkipCacheInvalidations(
|
|
"amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
|
|
cl::desc("Use this to skip inserting cache invalidating instructions."));
|
|
|
|
namespace {
|
|
|
|
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
|
|
|
|
/// Memory operation flags. Can be ORed together.
|
|
enum class SIMemOp {
|
|
NONE = 0u,
|
|
LOAD = 1u << 0,
|
|
STORE = 1u << 1,
|
|
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
|
|
};
|
|
|
|
/// Position to insert a new instruction relative to an existing
|
|
/// instruction.
|
|
enum class Position {
|
|
BEFORE,
|
|
AFTER
|
|
};
|
|
|
|
/// The atomic synchronization scopes supported by the AMDGPU target.
|
|
enum class SIAtomicScope {
|
|
NONE,
|
|
SINGLETHREAD,
|
|
WAVEFRONT,
|
|
WORKGROUP,
|
|
AGENT,
|
|
SYSTEM
|
|
};
|
|
|
|
/// The distinct address spaces supported by the AMDGPU target for
|
|
/// atomic memory operation. Can be ORed toether.
|
|
enum class SIAtomicAddrSpace {
|
|
NONE = 0u,
|
|
GLOBAL = 1u << 0,
|
|
LDS = 1u << 1,
|
|
SCRATCH = 1u << 2,
|
|
GDS = 1u << 3,
|
|
OTHER = 1u << 4,
|
|
|
|
/// The address spaces that can be accessed by a FLAT instruction.
|
|
FLAT = GLOBAL | LDS | SCRATCH,
|
|
|
|
/// The address spaces that support atomic instructions.
|
|
ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
|
|
|
|
/// All address spaces.
|
|
ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
|
|
|
|
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
|
|
};
|
|
|
|
class SIMemOpInfo final {
|
|
private:
|
|
|
|
friend class SIMemOpAccess;
|
|
|
|
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
|
|
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
|
|
SIAtomicScope Scope = SIAtomicScope::SYSTEM;
|
|
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
|
|
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
|
|
bool IsCrossAddressSpaceOrdering = false;
|
|
bool IsVolatile = false;
|
|
bool IsNonTemporal = false;
|
|
|
|
SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
|
|
SIAtomicScope Scope = SIAtomicScope::SYSTEM,
|
|
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
|
|
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
|
|
bool IsCrossAddressSpaceOrdering = true,
|
|
AtomicOrdering FailureOrdering =
|
|
AtomicOrdering::SequentiallyConsistent,
|
|
bool IsVolatile = false,
|
|
bool IsNonTemporal = false)
|
|
: Ordering(Ordering), FailureOrdering(FailureOrdering),
|
|
Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
|
|
InstrAddrSpace(InstrAddrSpace),
|
|
IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
|
|
IsVolatile(IsVolatile),
|
|
IsNonTemporal(IsNonTemporal) {
|
|
|
|
if (Ordering == AtomicOrdering::NotAtomic) {
|
|
assert(Scope == SIAtomicScope::NONE &&
|
|
OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
|
|
!IsCrossAddressSpaceOrdering &&
|
|
FailureOrdering == AtomicOrdering::NotAtomic);
|
|
return;
|
|
}
|
|
|
|
assert(Scope != SIAtomicScope::NONE &&
|
|
(OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
|
|
SIAtomicAddrSpace::NONE &&
|
|
(InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
|
|
SIAtomicAddrSpace::NONE &&
|
|
!isStrongerThan(FailureOrdering, Ordering));
|
|
|
|
// There is also no cross address space ordering if the ordering
|
|
// address space is the same as the instruction address space and
|
|
// only contains a single address space.
|
|
if ((OrderingAddrSpace == InstrAddrSpace) &&
|
|
isPowerOf2_32(uint32_t(InstrAddrSpace)))
|
|
this->IsCrossAddressSpaceOrdering = false;
|
|
|
|
// Limit the scope to the maximum supported by the instruction's address
|
|
// spaces.
|
|
if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
|
|
SIAtomicAddrSpace::NONE) {
|
|
this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
|
|
} else if ((InstrAddrSpace &
|
|
~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
|
|
SIAtomicAddrSpace::NONE) {
|
|
this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
|
|
} else if ((InstrAddrSpace &
|
|
~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
|
|
SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
|
|
this->Scope = std::min(Scope, SIAtomicScope::AGENT);
|
|
}
|
|
}
|
|
|
|
public:
|
|
/// \returns Atomic synchronization scope of the machine instruction used to
|
|
/// create this SIMemOpInfo.
|
|
SIAtomicScope getScope() const {
|
|
return Scope;
|
|
}
|
|
|
|
/// \returns Ordering constraint of the machine instruction used to
|
|
/// create this SIMemOpInfo.
|
|
AtomicOrdering getOrdering() const {
|
|
return Ordering;
|
|
}
|
|
|
|
/// \returns Failure ordering constraint of the machine instruction used to
|
|
/// create this SIMemOpInfo.
|
|
AtomicOrdering getFailureOrdering() const {
|
|
return FailureOrdering;
|
|
}
|
|
|
|
/// \returns The address spaces be accessed by the machine
|
|
/// instruction used to create this SiMemOpInfo.
|
|
SIAtomicAddrSpace getInstrAddrSpace() const {
|
|
return InstrAddrSpace;
|
|
}
|
|
|
|
/// \returns The address spaces that must be ordered by the machine
|
|
/// instruction used to create this SiMemOpInfo.
|
|
SIAtomicAddrSpace getOrderingAddrSpace() const {
|
|
return OrderingAddrSpace;
|
|
}
|
|
|
|
/// \returns Return true iff memory ordering of operations on
|
|
/// different address spaces is required.
|
|
bool getIsCrossAddressSpaceOrdering() const {
|
|
return IsCrossAddressSpaceOrdering;
|
|
}
|
|
|
|
/// \returns True if memory access of the machine instruction used to
|
|
/// create this SIMemOpInfo is volatile, false otherwise.
|
|
bool isVolatile() const {
|
|
return IsVolatile;
|
|
}
|
|
|
|
/// \returns True if memory access of the machine instruction used to
|
|
/// create this SIMemOpInfo is nontemporal, false otherwise.
|
|
bool isNonTemporal() const {
|
|
return IsNonTemporal;
|
|
}
|
|
|
|
/// \returns True if ordering constraint of the machine instruction used to
|
|
/// create this SIMemOpInfo is unordered or higher, false otherwise.
|
|
bool isAtomic() const {
|
|
return Ordering != AtomicOrdering::NotAtomic;
|
|
}
|
|
|
|
};
|
|
|
|
class SIMemOpAccess final {
|
|
private:
|
|
AMDGPUMachineModuleInfo *MMI = nullptr;
|
|
|
|
/// Reports unsupported message \p Msg for \p MI to LLVM context.
|
|
void reportUnsupported(const MachineBasicBlock::iterator &MI,
|
|
const char *Msg) const;
|
|
|
|
/// Inspects the target synchronization scope \p SSID and determines
|
|
/// the SI atomic scope it corresponds to, the address spaces it
|
|
/// covers, and whether the memory ordering applies between address
|
|
/// spaces.
|
|
Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
|
|
toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
|
|
|
|
/// \return Return a bit set of the address spaces accessed by \p AS.
|
|
SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
|
|
|
|
/// \returns Info constructed from \p MI, which has at least machine memory
|
|
/// operand.
|
|
Optional<SIMemOpInfo> constructFromMIWithMMO(
|
|
const MachineBasicBlock::iterator &MI) const;
|
|
|
|
public:
|
|
/// Construct class to support accessing the machine memory operands
|
|
/// of instructions in the machine function \p MF.
|
|
SIMemOpAccess(MachineFunction &MF);
|
|
|
|
/// \returns Load info if \p MI is a load operation, "None" otherwise.
|
|
Optional<SIMemOpInfo> getLoadInfo(
|
|
const MachineBasicBlock::iterator &MI) const;
|
|
|
|
/// \returns Store info if \p MI is a store operation, "None" otherwise.
|
|
Optional<SIMemOpInfo> getStoreInfo(
|
|
const MachineBasicBlock::iterator &MI) const;
|
|
|
|
/// \returns Atomic fence info if \p MI is an atomic fence operation,
|
|
/// "None" otherwise.
|
|
Optional<SIMemOpInfo> getAtomicFenceInfo(
|
|
const MachineBasicBlock::iterator &MI) const;
|
|
|
|
/// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
|
|
/// rmw operation, "None" otherwise.
|
|
Optional<SIMemOpInfo> getAtomicCmpxchgOrRmwInfo(
|
|
const MachineBasicBlock::iterator &MI) const;
|
|
};
|
|
|
|
class SICacheControl {
|
|
protected:
|
|
|
|
/// AMDGPU subtarget info.
|
|
const GCNSubtarget &ST;
|
|
|
|
/// Instruction info.
|
|
const SIInstrInfo *TII = nullptr;
|
|
|
|
IsaVersion IV;
|
|
|
|
/// Whether to insert cache invalidating instructions.
|
|
bool InsertCacheInv;
|
|
|
|
SICacheControl(const GCNSubtarget &ST);
|
|
|
|
/// Sets named bit \p BitName to "true" if present in instruction \p MI.
|
|
/// \returns Returns true if \p MI is modified, false otherwise.
|
|
bool enableNamedBit(const MachineBasicBlock::iterator MI,
|
|
AMDGPU::CPol::CPol Bit) const;
|
|
|
|
public:
|
|
|
|
/// Create a cache control for the subtarget \p ST.
|
|
static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
|
|
|
|
/// Update \p MI memory load instruction to bypass any caches up to
|
|
/// the \p Scope memory scope for address spaces \p
|
|
/// AddrSpace. Return true iff the instruction was modified.
|
|
virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace) const = 0;
|
|
|
|
/// Update \p MI memory store instruction to bypass any caches up to
|
|
/// the \p Scope memory scope for address spaces \p
|
|
/// AddrSpace. Return true iff the instruction was modified.
|
|
virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace) const = 0;
|
|
|
|
/// Update \p MI memory read-modify-write instruction to bypass any caches up
|
|
/// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
|
|
/// iff the instruction was modified.
|
|
virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace) const = 0;
|
|
|
|
/// Update \p MI memory instruction of kind \p Op associated with address
|
|
/// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
|
|
/// true iff the instruction was modified.
|
|
virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
SIMemOp Op, bool IsVolatile,
|
|
bool IsNonTemporal) const = 0;
|
|
|
|
/// Inserts any necessary instructions at position \p Pos relative
|
|
/// to instruction \p MI to ensure memory instructions before \p Pos of kind
|
|
/// \p Op associated with address spaces \p AddrSpace have completed. Used
|
|
/// between memory instructions to enforce the order they become visible as
|
|
/// observed by other memory instructions executing in memory scope \p Scope.
|
|
/// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
|
|
/// address spaces. Returns true iff any instructions inserted.
|
|
virtual bool insertWait(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
SIMemOp Op,
|
|
bool IsCrossAddrSpaceOrdering,
|
|
Position Pos) const = 0;
|
|
|
|
/// Inserts any necessary instructions at position \p Pos relative to
|
|
/// instruction \p MI to ensure any subsequent memory instructions of this
|
|
/// thread with address spaces \p AddrSpace will observe the previous memory
|
|
/// operations by any thread for memory scopes up to memory scope \p Scope .
|
|
/// Returns true iff any instructions inserted.
|
|
virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
Position Pos) const = 0;
|
|
|
|
/// Inserts any necessary instructions at position \p Pos relative to
|
|
/// instruction \p MI to ensure previous memory instructions by this thread
|
|
/// with address spaces \p AddrSpace have completed and can be observed by
|
|
/// subsequent memory instructions by any thread executing in memory scope \p
|
|
/// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
|
|
/// between address spaces. Returns true iff any instructions inserted.
|
|
virtual bool insertRelease(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
bool IsCrossAddrSpaceOrdering,
|
|
Position Pos) const = 0;
|
|
|
|
/// Virtual destructor to allow derivations to be deleted.
|
|
virtual ~SICacheControl() = default;
|
|
|
|
};
|
|
|
|
class SIGfx6CacheControl : public SICacheControl {
|
|
protected:
|
|
|
|
/// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
|
|
/// is modified, false otherwise.
|
|
bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
|
|
return enableNamedBit(MI, AMDGPU::CPol::GLC);
|
|
}
|
|
|
|
/// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
|
|
/// is modified, false otherwise.
|
|
bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
|
|
return enableNamedBit(MI, AMDGPU::CPol::SLC);
|
|
}
|
|
|
|
public:
|
|
|
|
SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {};
|
|
|
|
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace) const override;
|
|
|
|
bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace) const override;
|
|
|
|
bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace) const override;
|
|
|
|
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
|
|
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
|
|
bool IsVolatile,
|
|
bool IsNonTemporal) const override;
|
|
|
|
bool insertWait(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
SIMemOp Op,
|
|
bool IsCrossAddrSpaceOrdering,
|
|
Position Pos) const override;
|
|
|
|
bool insertAcquire(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
Position Pos) const override;
|
|
|
|
bool insertRelease(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
bool IsCrossAddrSpaceOrdering,
|
|
Position Pos) const override;
|
|
};
|
|
|
|
class SIGfx7CacheControl : public SIGfx6CacheControl {
|
|
public:
|
|
|
|
SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {};
|
|
|
|
bool insertAcquire(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
Position Pos) const override;
|
|
|
|
};
|
|
|
|
class SIGfx90ACacheControl : public SIGfx7CacheControl {
|
|
public:
|
|
|
|
SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
|
|
|
|
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace) const override;
|
|
|
|
bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace) const override;
|
|
|
|
bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace) const override;
|
|
|
|
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
|
|
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
|
|
bool IsVolatile,
|
|
bool IsNonTemporal) const override;
|
|
|
|
bool insertWait(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
SIMemOp Op,
|
|
bool IsCrossAddrSpaceOrdering,
|
|
Position Pos) const override;
|
|
|
|
bool insertAcquire(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
Position Pos) const override;
|
|
|
|
bool insertRelease(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
bool IsCrossAddrSpaceOrdering,
|
|
Position Pos) const override;
|
|
};
|
|
|
|
class SIGfx10CacheControl : public SIGfx7CacheControl {
|
|
protected:
|
|
|
|
/// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
|
|
/// is modified, false otherwise.
|
|
bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
|
|
return enableNamedBit(MI, AMDGPU::CPol::DLC);
|
|
}
|
|
|
|
public:
|
|
|
|
SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {};
|
|
|
|
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace) const override;
|
|
|
|
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
|
|
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
|
|
bool IsVolatile,
|
|
bool IsNonTemporal) const override;
|
|
|
|
bool insertWait(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
SIMemOp Op,
|
|
bool IsCrossAddrSpaceOrdering,
|
|
Position Pos) const override;
|
|
|
|
bool insertAcquire(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
Position Pos) const override;
|
|
};
|
|
|
|
class SIMemoryLegalizer final : public MachineFunctionPass {
|
|
private:
|
|
|
|
/// Cache Control.
|
|
std::unique_ptr<SICacheControl> CC = nullptr;
|
|
|
|
/// List of atomic pseudo instructions.
|
|
std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
|
|
|
|
/// Return true iff instruction \p MI is a atomic instruction that
|
|
/// returns a result.
|
|
bool isAtomicRet(const MachineInstr &MI) const {
|
|
return SIInstrInfo::isAtomicRet(MI);
|
|
}
|
|
|
|
/// Removes all processed atomic pseudo instructions from the current
|
|
/// function. Returns true if current function is modified, false otherwise.
|
|
bool removeAtomicPseudoMIs();
|
|
|
|
/// Expands load operation \p MI. Returns true if instructions are
|
|
/// added/deleted or \p MI is modified, false otherwise.
|
|
bool expandLoad(const SIMemOpInfo &MOI,
|
|
MachineBasicBlock::iterator &MI);
|
|
/// Expands store operation \p MI. Returns true if instructions are
|
|
/// added/deleted or \p MI is modified, false otherwise.
|
|
bool expandStore(const SIMemOpInfo &MOI,
|
|
MachineBasicBlock::iterator &MI);
|
|
/// Expands atomic fence operation \p MI. Returns true if
|
|
/// instructions are added/deleted or \p MI is modified, false otherwise.
|
|
bool expandAtomicFence(const SIMemOpInfo &MOI,
|
|
MachineBasicBlock::iterator &MI);
|
|
/// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
|
|
/// instructions are added/deleted or \p MI is modified, false otherwise.
|
|
bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
|
|
MachineBasicBlock::iterator &MI);
|
|
|
|
public:
|
|
static char ID;
|
|
|
|
SIMemoryLegalizer() : MachineFunctionPass(ID) {}
|
|
|
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
|
AU.setPreservesCFG();
|
|
MachineFunctionPass::getAnalysisUsage(AU);
|
|
}
|
|
|
|
StringRef getPassName() const override {
|
|
return PASS_NAME;
|
|
}
|
|
|
|
bool runOnMachineFunction(MachineFunction &MF) override;
|
|
};
|
|
|
|
} // end namespace anonymous
|
|
|
|
void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
|
|
const char *Msg) const {
|
|
const Function &Func = MI->getParent()->getParent()->getFunction();
|
|
DiagnosticInfoUnsupported Diag(Func, Msg, MI->getDebugLoc());
|
|
Func.getContext().diagnose(Diag);
|
|
}
|
|
|
|
Optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
|
|
SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
|
|
SIAtomicAddrSpace InstrAddrSpace) const {
|
|
if (SSID == SyncScope::System)
|
|
return std::make_tuple(SIAtomicScope::SYSTEM,
|
|
SIAtomicAddrSpace::ATOMIC,
|
|
true);
|
|
if (SSID == MMI->getAgentSSID())
|
|
return std::make_tuple(SIAtomicScope::AGENT,
|
|
SIAtomicAddrSpace::ATOMIC,
|
|
true);
|
|
if (SSID == MMI->getWorkgroupSSID())
|
|
return std::make_tuple(SIAtomicScope::WORKGROUP,
|
|
SIAtomicAddrSpace::ATOMIC,
|
|
true);
|
|
if (SSID == MMI->getWavefrontSSID())
|
|
return std::make_tuple(SIAtomicScope::WAVEFRONT,
|
|
SIAtomicAddrSpace::ATOMIC,
|
|
true);
|
|
if (SSID == SyncScope::SingleThread)
|
|
return std::make_tuple(SIAtomicScope::SINGLETHREAD,
|
|
SIAtomicAddrSpace::ATOMIC,
|
|
true);
|
|
if (SSID == MMI->getSystemOneAddressSpaceSSID())
|
|
return std::make_tuple(SIAtomicScope::SYSTEM,
|
|
SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
|
|
false);
|
|
if (SSID == MMI->getAgentOneAddressSpaceSSID())
|
|
return std::make_tuple(SIAtomicScope::AGENT,
|
|
SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
|
|
false);
|
|
if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
|
|
return std::make_tuple(SIAtomicScope::WORKGROUP,
|
|
SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
|
|
false);
|
|
if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
|
|
return std::make_tuple(SIAtomicScope::WAVEFRONT,
|
|
SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
|
|
false);
|
|
if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
|
|
return std::make_tuple(SIAtomicScope::SINGLETHREAD,
|
|
SIAtomicAddrSpace::ATOMIC & InstrAddrSpace,
|
|
false);
|
|
return None;
|
|
}
|
|
|
|
SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
|
|
if (AS == AMDGPUAS::FLAT_ADDRESS)
|
|
return SIAtomicAddrSpace::FLAT;
|
|
if (AS == AMDGPUAS::GLOBAL_ADDRESS)
|
|
return SIAtomicAddrSpace::GLOBAL;
|
|
if (AS == AMDGPUAS::LOCAL_ADDRESS)
|
|
return SIAtomicAddrSpace::LDS;
|
|
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
|
|
return SIAtomicAddrSpace::SCRATCH;
|
|
if (AS == AMDGPUAS::REGION_ADDRESS)
|
|
return SIAtomicAddrSpace::GDS;
|
|
|
|
return SIAtomicAddrSpace::OTHER;
|
|
}
|
|
|
|
SIMemOpAccess::SIMemOpAccess(MachineFunction &MF) {
|
|
MMI = &MF.getMMI().getObjFileInfo<AMDGPUMachineModuleInfo>();
|
|
}
|
|
|
|
Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
|
|
const MachineBasicBlock::iterator &MI) const {
|
|
assert(MI->getNumMemOperands() > 0);
|
|
|
|
SyncScope::ID SSID = SyncScope::SingleThread;
|
|
AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
|
|
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
|
|
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
|
|
bool IsNonTemporal = true;
|
|
bool IsVolatile = false;
|
|
|
|
// Validator should check whether or not MMOs cover the entire set of
|
|
// locations accessed by the memory instruction.
|
|
for (const auto &MMO : MI->memoperands()) {
|
|
IsNonTemporal &= MMO->isNonTemporal();
|
|
IsVolatile |= MMO->isVolatile();
|
|
InstrAddrSpace |=
|
|
toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
|
|
AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
|
|
if (OpOrdering != AtomicOrdering::NotAtomic) {
|
|
const auto &IsSyncScopeInclusion =
|
|
MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
|
|
if (!IsSyncScopeInclusion) {
|
|
reportUnsupported(MI,
|
|
"Unsupported non-inclusive atomic synchronization scope");
|
|
return None;
|
|
}
|
|
|
|
SSID = IsSyncScopeInclusion.getValue() ? SSID : MMO->getSyncScopeID();
|
|
Ordering = isStrongerThan(Ordering, OpOrdering)
|
|
? Ordering
|
|
: MMO->getSuccessOrdering();
|
|
assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
|
|
MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
|
|
FailureOrdering =
|
|
isStrongerThan(FailureOrdering, MMO->getFailureOrdering()) ?
|
|
FailureOrdering : MMO->getFailureOrdering();
|
|
}
|
|
}
|
|
|
|
SIAtomicScope Scope = SIAtomicScope::NONE;
|
|
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
|
|
bool IsCrossAddressSpaceOrdering = false;
|
|
if (Ordering != AtomicOrdering::NotAtomic) {
|
|
auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
|
|
if (!ScopeOrNone) {
|
|
reportUnsupported(MI, "Unsupported atomic synchronization scope");
|
|
return None;
|
|
}
|
|
std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
|
|
ScopeOrNone.getValue();
|
|
if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
|
|
((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
|
|
((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
|
|
reportUnsupported(MI, "Unsupported atomic address space");
|
|
return None;
|
|
}
|
|
}
|
|
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
|
|
IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
|
|
IsNonTemporal);
|
|
}
|
|
|
|
Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
|
|
const MachineBasicBlock::iterator &MI) const {
|
|
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
|
|
|
|
if (!(MI->mayLoad() && !MI->mayStore()))
|
|
return None;
|
|
|
|
// Be conservative if there are no memory operands.
|
|
if (MI->getNumMemOperands() == 0)
|
|
return SIMemOpInfo();
|
|
|
|
return constructFromMIWithMMO(MI);
|
|
}
|
|
|
|
Optional<SIMemOpInfo> SIMemOpAccess::getStoreInfo(
|
|
const MachineBasicBlock::iterator &MI) const {
|
|
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
|
|
|
|
if (!(!MI->mayLoad() && MI->mayStore()))
|
|
return None;
|
|
|
|
// Be conservative if there are no memory operands.
|
|
if (MI->getNumMemOperands() == 0)
|
|
return SIMemOpInfo();
|
|
|
|
return constructFromMIWithMMO(MI);
|
|
}
|
|
|
|
Optional<SIMemOpInfo> SIMemOpAccess::getAtomicFenceInfo(
|
|
const MachineBasicBlock::iterator &MI) const {
|
|
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
|
|
|
|
if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
|
|
return None;
|
|
|
|
AtomicOrdering Ordering =
|
|
static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
|
|
|
|
SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
|
|
auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
|
|
if (!ScopeOrNone) {
|
|
reportUnsupported(MI, "Unsupported atomic synchronization scope");
|
|
return None;
|
|
}
|
|
|
|
SIAtomicScope Scope = SIAtomicScope::NONE;
|
|
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
|
|
bool IsCrossAddressSpaceOrdering = false;
|
|
std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
|
|
ScopeOrNone.getValue();
|
|
|
|
if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
|
|
((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace)) {
|
|
reportUnsupported(MI, "Unsupported atomic address space");
|
|
return None;
|
|
}
|
|
|
|
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, SIAtomicAddrSpace::ATOMIC,
|
|
IsCrossAddressSpaceOrdering, AtomicOrdering::NotAtomic);
|
|
}
|
|
|
|
Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
|
|
const MachineBasicBlock::iterator &MI) const {
|
|
assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
|
|
|
|
if (!(MI->mayLoad() && MI->mayStore()))
|
|
return None;
|
|
|
|
// Be conservative if there are no memory operands.
|
|
if (MI->getNumMemOperands() == 0)
|
|
return SIMemOpInfo();
|
|
|
|
return constructFromMIWithMMO(MI);
|
|
}
|
|
|
|
SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
|
|
TII = ST.getInstrInfo();
|
|
IV = getIsaVersion(ST.getCPU());
|
|
InsertCacheInv = !AmdgcnSkipCacheInvalidations;
|
|
}
|
|
|
|
bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
|
|
AMDGPU::CPol::CPol Bit) const {
|
|
MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
|
|
if (!CPol)
|
|
return false;
|
|
|
|
CPol->setImm(CPol->getImm() | Bit);
|
|
return true;
|
|
}
|
|
|
|
/* static */
|
|
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
|
|
GCNSubtarget::Generation Generation = ST.getGeneration();
|
|
if (ST.hasGFX90AInsts())
|
|
return std::make_unique<SIGfx90ACacheControl>(ST);
|
|
if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
|
|
return std::make_unique<SIGfx6CacheControl>(ST);
|
|
if (Generation < AMDGPUSubtarget::GFX10)
|
|
return std::make_unique<SIGfx7CacheControl>(ST);
|
|
return std::make_unique<SIGfx10CacheControl>(ST);
|
|
}
|
|
|
|
bool SIGfx6CacheControl::enableLoadCacheBypass(
|
|
const MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace) const {
|
|
assert(MI->mayLoad() && !MI->mayStore());
|
|
bool Changed = false;
|
|
|
|
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
|
|
switch (Scope) {
|
|
case SIAtomicScope::SYSTEM:
|
|
case SIAtomicScope::AGENT:
|
|
Changed |= enableGLCBit(MI);
|
|
break;
|
|
case SIAtomicScope::WORKGROUP:
|
|
case SIAtomicScope::WAVEFRONT:
|
|
case SIAtomicScope::SINGLETHREAD:
|
|
// No cache to bypass.
|
|
break;
|
|
default:
|
|
llvm_unreachable("Unsupported synchronization scope");
|
|
}
|
|
}
|
|
|
|
/// The scratch address space does not need the global memory caches
|
|
/// to be bypassed as all memory operations by the same thread are
|
|
/// sequentially consistent, and no other thread can access scratch
|
|
/// memory.
|
|
|
|
/// Other address spaces do not have a cache.
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool SIGfx6CacheControl::enableStoreCacheBypass(
|
|
const MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace) const {
|
|
assert(!MI->mayLoad() && MI->mayStore());
|
|
bool Changed = false;
|
|
|
|
/// The L1 cache is write through so does not need to be bypassed. There is no
|
|
/// bypass control for the L2 cache at the isa level.
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool SIGfx6CacheControl::enableRMWCacheBypass(
|
|
const MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace) const {
|
|
assert(MI->mayLoad() && MI->mayStore());
|
|
bool Changed = false;
|
|
|
|
/// The L1 cache is write through so does not need to be bypassed. There is no
|
|
/// bypass control for the L2 cache at the isa level.
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
|
|
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
|
|
bool IsVolatile, bool IsNonTemporal) const {
|
|
// Only handle load and store, not atomic read-modify-write insructions. The
|
|
// latter use glc to indicate if the atomic returns a result and so must not
|
|
// be used for cache control.
|
|
assert(MI->mayLoad() ^ MI->mayStore());
|
|
|
|
// Only update load and store, not LLVM IR atomic read-modify-write
|
|
// instructions. The latter are always marked as volatile so cannot sensibly
|
|
// handle it as do not want to pessimize all atomics. Also they do not support
|
|
// the nontemporal attribute.
|
|
assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
|
|
|
|
bool Changed = false;
|
|
|
|
if (IsVolatile) {
|
|
if (Op == SIMemOp::LOAD)
|
|
Changed |= enableGLCBit(MI);
|
|
|
|
// Ensure operation has completed at system scope to cause all volatile
|
|
// operations to be visible outside the program in a global order. Do not
|
|
// request cross address space as only the global address space can be
|
|
// observable outside the program, so no need to cause a waitcnt for LDS
|
|
// address space operations.
|
|
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
|
|
Position::AFTER);
|
|
|
|
return Changed;
|
|
}
|
|
|
|
if (IsNonTemporal) {
|
|
// Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
|
|
Changed |= enableGLCBit(MI);
|
|
Changed |= enableSLCBit(MI);
|
|
return Changed;
|
|
}
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
SIMemOp Op,
|
|
bool IsCrossAddrSpaceOrdering,
|
|
Position Pos) const {
|
|
bool Changed = false;
|
|
|
|
MachineBasicBlock &MBB = *MI->getParent();
|
|
DebugLoc DL = MI->getDebugLoc();
|
|
|
|
if (Pos == Position::AFTER)
|
|
++MI;
|
|
|
|
bool VMCnt = false;
|
|
bool LGKMCnt = false;
|
|
|
|
if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
|
|
SIAtomicAddrSpace::NONE) {
|
|
switch (Scope) {
|
|
case SIAtomicScope::SYSTEM:
|
|
case SIAtomicScope::AGENT:
|
|
VMCnt |= true;
|
|
break;
|
|
case SIAtomicScope::WORKGROUP:
|
|
case SIAtomicScope::WAVEFRONT:
|
|
case SIAtomicScope::SINGLETHREAD:
|
|
// The L1 cache keeps all memory operations in order for
|
|
// wavefronts in the same work-group.
|
|
break;
|
|
default:
|
|
llvm_unreachable("Unsupported synchronization scope");
|
|
}
|
|
}
|
|
|
|
if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
|
|
switch (Scope) {
|
|
case SIAtomicScope::SYSTEM:
|
|
case SIAtomicScope::AGENT:
|
|
case SIAtomicScope::WORKGROUP:
|
|
// If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
|
|
// not needed as LDS operations for all waves are executed in a total
|
|
// global ordering as observed by all waves. Required if also
|
|
// synchronizing with global/GDS memory as LDS operations could be
|
|
// reordered with respect to later global/GDS memory operations of the
|
|
// same wave.
|
|
LGKMCnt |= IsCrossAddrSpaceOrdering;
|
|
break;
|
|
case SIAtomicScope::WAVEFRONT:
|
|
case SIAtomicScope::SINGLETHREAD:
|
|
// The LDS keeps all memory operations in order for
|
|
// the same wavesfront.
|
|
break;
|
|
default:
|
|
llvm_unreachable("Unsupported synchronization scope");
|
|
}
|
|
}
|
|
|
|
if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
|
|
switch (Scope) {
|
|
case SIAtomicScope::SYSTEM:
|
|
case SIAtomicScope::AGENT:
|
|
// If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
|
|
// is not needed as GDS operations for all waves are executed in a total
|
|
// global ordering as observed by all waves. Required if also
|
|
// synchronizing with global/LDS memory as GDS operations could be
|
|
// reordered with respect to later global/LDS memory operations of the
|
|
// same wave.
|
|
LGKMCnt |= IsCrossAddrSpaceOrdering;
|
|
break;
|
|
case SIAtomicScope::WORKGROUP:
|
|
case SIAtomicScope::WAVEFRONT:
|
|
case SIAtomicScope::SINGLETHREAD:
|
|
// The GDS keeps all memory operations in order for
|
|
// the same work-group.
|
|
break;
|
|
default:
|
|
llvm_unreachable("Unsupported synchronization scope");
|
|
}
|
|
}
|
|
|
|
if (VMCnt || LGKMCnt) {
|
|
unsigned WaitCntImmediate =
|
|
AMDGPU::encodeWaitcnt(IV,
|
|
VMCnt ? 0 : getVmcntBitMask(IV),
|
|
getExpcntBitMask(IV),
|
|
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
|
|
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
|
|
Changed = true;
|
|
}
|
|
|
|
if (Pos == Position::AFTER)
|
|
--MI;
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
Position Pos) const {
|
|
if (!InsertCacheInv)
|
|
return false;
|
|
|
|
bool Changed = false;
|
|
|
|
MachineBasicBlock &MBB = *MI->getParent();
|
|
DebugLoc DL = MI->getDebugLoc();
|
|
|
|
if (Pos == Position::AFTER)
|
|
++MI;
|
|
|
|
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
|
|
switch (Scope) {
|
|
case SIAtomicScope::SYSTEM:
|
|
case SIAtomicScope::AGENT:
|
|
BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
|
|
Changed = true;
|
|
break;
|
|
case SIAtomicScope::WORKGROUP:
|
|
case SIAtomicScope::WAVEFRONT:
|
|
case SIAtomicScope::SINGLETHREAD:
|
|
// No cache to invalidate.
|
|
break;
|
|
default:
|
|
llvm_unreachable("Unsupported synchronization scope");
|
|
}
|
|
}
|
|
|
|
/// The scratch address space does not need the global memory cache
|
|
/// to be flushed as all memory operations by the same thread are
|
|
/// sequentially consistent, and no other thread can access scratch
|
|
/// memory.
|
|
|
|
/// Other address spaces do not have a cache.
|
|
|
|
if (Pos == Position::AFTER)
|
|
--MI;
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
bool IsCrossAddrSpaceOrdering,
|
|
Position Pos) const {
|
|
return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
|
|
IsCrossAddrSpaceOrdering, Pos);
|
|
}
|
|
|
|
bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
Position Pos) const {
|
|
if (!InsertCacheInv)
|
|
return false;
|
|
|
|
bool Changed = false;
|
|
|
|
MachineBasicBlock &MBB = *MI->getParent();
|
|
DebugLoc DL = MI->getDebugLoc();
|
|
|
|
const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
|
|
|
|
const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
|
|
? AMDGPU::BUFFER_WBINVL1
|
|
: AMDGPU::BUFFER_WBINVL1_VOL;
|
|
|
|
if (Pos == Position::AFTER)
|
|
++MI;
|
|
|
|
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
|
|
switch (Scope) {
|
|
case SIAtomicScope::SYSTEM:
|
|
case SIAtomicScope::AGENT:
|
|
BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
|
|
Changed = true;
|
|
break;
|
|
case SIAtomicScope::WORKGROUP:
|
|
case SIAtomicScope::WAVEFRONT:
|
|
case SIAtomicScope::SINGLETHREAD:
|
|
// No cache to invalidate.
|
|
break;
|
|
default:
|
|
llvm_unreachable("Unsupported synchronization scope");
|
|
}
|
|
}
|
|
|
|
/// The scratch address space does not need the global memory cache
|
|
/// to be flushed as all memory operations by the same thread are
|
|
/// sequentially consistent, and no other thread can access scratch
|
|
/// memory.
|
|
|
|
/// Other address spaces do not have a cache.
|
|
|
|
if (Pos == Position::AFTER)
|
|
--MI;
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool SIGfx90ACacheControl::enableLoadCacheBypass(
|
|
const MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace) const {
|
|
assert(MI->mayLoad() && !MI->mayStore());
|
|
bool Changed = false;
|
|
|
|
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
|
|
switch (Scope) {
|
|
case SIAtomicScope::SYSTEM:
|
|
case SIAtomicScope::AGENT:
|
|
Changed |= enableGLCBit(MI);
|
|
break;
|
|
case SIAtomicScope::WORKGROUP:
|
|
// In threadgroup split mode the waves of a work-group can be executing on
|
|
// different CUs. Therefore need to bypass the L1 which is per CU.
|
|
// Otherwise in non-threadgroup split mode all waves of a work-group are
|
|
// on the same CU, and so the L1 does not need to be bypassed.
|
|
if (ST.isTgSplitEnabled()) Changed |= enableGLCBit(MI);
|
|
break;
|
|
case SIAtomicScope::WAVEFRONT:
|
|
case SIAtomicScope::SINGLETHREAD:
|
|
// No cache to bypass.
|
|
break;
|
|
default:
|
|
llvm_unreachable("Unsupported synchronization scope");
|
|
}
|
|
}
|
|
|
|
/// The scratch address space does not need the global memory caches
|
|
/// to be bypassed as all memory operations by the same thread are
|
|
/// sequentially consistent, and no other thread can access scratch
|
|
/// memory.
|
|
|
|
/// Other address spaces do not have a cache.
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool SIGfx90ACacheControl::enableStoreCacheBypass(
|
|
const MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace) const {
|
|
assert(!MI->mayLoad() && MI->mayStore());
|
|
bool Changed = false;
|
|
|
|
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
|
|
switch (Scope) {
|
|
case SIAtomicScope::SYSTEM:
|
|
case SIAtomicScope::AGENT:
|
|
/// Do not set glc for store atomic operations as they implicitly write
|
|
/// through the L1 cache.
|
|
break;
|
|
case SIAtomicScope::WORKGROUP:
|
|
case SIAtomicScope::WAVEFRONT:
|
|
case SIAtomicScope::SINGLETHREAD:
|
|
// No cache to bypass. Store atomics implicitly write through the L1
|
|
// cache.
|
|
break;
|
|
default:
|
|
llvm_unreachable("Unsupported synchronization scope");
|
|
}
|
|
}
|
|
|
|
/// The scratch address space does not need the global memory caches
|
|
/// to be bypassed as all memory operations by the same thread are
|
|
/// sequentially consistent, and no other thread can access scratch
|
|
/// memory.
|
|
|
|
/// Other address spaces do not have a cache.
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool SIGfx90ACacheControl::enableRMWCacheBypass(
|
|
const MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace) const {
|
|
assert(MI->mayLoad() && MI->mayStore());
|
|
bool Changed = false;
|
|
|
|
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
|
|
switch (Scope) {
|
|
case SIAtomicScope::SYSTEM:
|
|
case SIAtomicScope::AGENT:
|
|
/// Do not set glc for RMW atomic operations as they implicitly bypass
|
|
/// the L1 cache, and the glc bit is instead used to indicate if they are
|
|
/// return or no-return.
|
|
break;
|
|
case SIAtomicScope::WORKGROUP:
|
|
case SIAtomicScope::WAVEFRONT:
|
|
case SIAtomicScope::SINGLETHREAD:
|
|
// No cache to bypass. RMW atomics implicitly bypass the L1 cache.
|
|
break;
|
|
default:
|
|
llvm_unreachable("Unsupported synchronization scope");
|
|
}
|
|
}
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
|
|
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
|
|
bool IsVolatile, bool IsNonTemporal) const {
|
|
// Only handle load and store, not atomic read-modify-write insructions. The
|
|
// latter use glc to indicate if the atomic returns a result and so must not
|
|
// be used for cache control.
|
|
assert(MI->mayLoad() ^ MI->mayStore());
|
|
|
|
// Only update load and store, not LLVM IR atomic read-modify-write
|
|
// instructions. The latter are always marked as volatile so cannot sensibly
|
|
// handle it as do not want to pessimize all atomics. Also they do not support
|
|
// the nontemporal attribute.
|
|
assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
|
|
|
|
bool Changed = false;
|
|
|
|
if (IsVolatile) {
|
|
if (Op == SIMemOp::LOAD) {
|
|
Changed |= enableGLCBit(MI);
|
|
}
|
|
|
|
// Ensure operation has completed at system scope to cause all volatile
|
|
// operations to be visible outside the program in a global order. Do not
|
|
// request cross address space as only the global address space can be
|
|
// observable outside the program, so no need to cause a waitcnt for LDS
|
|
// address space operations.
|
|
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
|
|
Position::AFTER);
|
|
|
|
return Changed;
|
|
}
|
|
|
|
if (IsNonTemporal) {
|
|
// Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
|
|
Changed |= enableGLCBit(MI);
|
|
Changed |= enableSLCBit(MI);
|
|
return Changed;
|
|
}
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
SIMemOp Op,
|
|
bool IsCrossAddrSpaceOrdering,
|
|
Position Pos) const {
|
|
if (ST.isTgSplitEnabled()) {
|
|
// In threadgroup split mode the waves of a work-group can be executing on
|
|
// different CUs. Therefore need to wait for global or GDS memory operations
|
|
// to complete to ensure they are visible to waves in the other CUs.
|
|
// Otherwise in non-threadgroup split mode all waves of a work-group are on
|
|
// the same CU, so no need to wait for global memory as all waves in the
|
|
// work-group access the same the L1, nor wait for GDS as access are ordered
|
|
// on a CU.
|
|
if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
|
|
SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
|
|
(Scope == SIAtomicScope::WORKGROUP)) {
|
|
// Same as GFX7 using agent scope.
|
|
Scope = SIAtomicScope::AGENT;
|
|
}
|
|
// In threadgroup split mode LDS cannot be allocated so no need to wait for
|
|
// LDS memory operations.
|
|
AddrSpace &= ~SIAtomicAddrSpace::LDS;
|
|
}
|
|
return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
|
|
IsCrossAddrSpaceOrdering, Pos);
|
|
}
|
|
|
|
bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
Position Pos) const {
|
|
if (!InsertCacheInv)
|
|
return false;
|
|
|
|
bool Changed = false;
|
|
|
|
MachineBasicBlock &MBB = *MI->getParent();
|
|
DebugLoc DL = MI->getDebugLoc();
|
|
|
|
if (Pos == Position::AFTER)
|
|
++MI;
|
|
|
|
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
|
|
switch (Scope) {
|
|
case SIAtomicScope::SYSTEM:
|
|
// Ensures that following loads will not see stale remote VMEM data or
|
|
// stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
|
|
// CC will never be stale due to the local memory probes.
|
|
BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
|
|
// Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
|
|
// hardware does not reorder memory operations by the same wave with
|
|
// respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
|
|
// remove any cache lines of earlier writes by the same wave and ensures
|
|
// later reads by the same wave will refetch the cache lines.
|
|
Changed = true;
|
|
break;
|
|
case SIAtomicScope::AGENT:
|
|
// Same as GFX7.
|
|
break;
|
|
case SIAtomicScope::WORKGROUP:
|
|
// In threadgroup split mode the waves of a work-group can be executing on
|
|
// different CUs. Therefore need to invalidate the L1 which is per CU.
|
|
// Otherwise in non-threadgroup split mode all waves of a work-group are
|
|
// on the same CU, and so the L1 does not need to be invalidated.
|
|
if (ST.isTgSplitEnabled()) {
|
|
// Same as GFX7 using agent scope.
|
|
Scope = SIAtomicScope::AGENT;
|
|
}
|
|
break;
|
|
case SIAtomicScope::WAVEFRONT:
|
|
case SIAtomicScope::SINGLETHREAD:
|
|
// Same as GFX7.
|
|
break;
|
|
default:
|
|
llvm_unreachable("Unsupported synchronization scope");
|
|
}
|
|
}
|
|
|
|
/// The scratch address space does not need the global memory cache
|
|
/// to be flushed as all memory operations by the same thread are
|
|
/// sequentially consistent, and no other thread can access scratch
|
|
/// memory.
|
|
|
|
/// Other address spaces do not have a cache.
|
|
|
|
if (Pos == Position::AFTER)
|
|
--MI;
|
|
|
|
Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
bool IsCrossAddrSpaceOrdering,
|
|
Position Pos) const {
|
|
bool Changed = false;
|
|
|
|
MachineBasicBlock &MBB = *MI->getParent();
|
|
DebugLoc DL = MI->getDebugLoc();
|
|
|
|
if (Pos == Position::AFTER)
|
|
++MI;
|
|
|
|
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
|
|
switch (Scope) {
|
|
case SIAtomicScope::SYSTEM:
|
|
// Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
|
|
// hardware does not reorder memory operations by the same wave with
|
|
// respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
|
|
// to initiate writeback of any dirty cache lines of earlier writes by the
|
|
// same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
|
|
// writeback has completed.
|
|
BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2));
|
|
// Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
|
|
// vmcnt(0)" needed by the "BUFFER_WBL2".
|
|
Changed = true;
|
|
break;
|
|
case SIAtomicScope::AGENT:
|
|
case SIAtomicScope::WORKGROUP:
|
|
case SIAtomicScope::WAVEFRONT:
|
|
case SIAtomicScope::SINGLETHREAD:
|
|
// Same as GFX7.
|
|
break;
|
|
default:
|
|
llvm_unreachable("Unsupported synchronization scope");
|
|
}
|
|
}
|
|
|
|
if (Pos == Position::AFTER)
|
|
--MI;
|
|
|
|
Changed |=
|
|
SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
|
|
IsCrossAddrSpaceOrdering, Pos);
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool SIGfx10CacheControl::enableLoadCacheBypass(
|
|
const MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace) const {
|
|
assert(MI->mayLoad() && !MI->mayStore());
|
|
bool Changed = false;
|
|
|
|
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
|
|
/// TODO Do not set glc for rmw atomic operations as they
|
|
/// implicitly bypass the L0/L1 caches.
|
|
|
|
switch (Scope) {
|
|
case SIAtomicScope::SYSTEM:
|
|
case SIAtomicScope::AGENT:
|
|
Changed |= enableGLCBit(MI);
|
|
Changed |= enableDLCBit(MI);
|
|
break;
|
|
case SIAtomicScope::WORKGROUP:
|
|
// In WGP mode the waves of a work-group can be executing on either CU of
|
|
// the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
|
|
// CU mode all waves of a work-group are on the same CU, and so the L0
|
|
// does not need to be bypassed.
|
|
if (!ST.isCuModeEnabled()) Changed |= enableGLCBit(MI);
|
|
break;
|
|
case SIAtomicScope::WAVEFRONT:
|
|
case SIAtomicScope::SINGLETHREAD:
|
|
// No cache to bypass.
|
|
break;
|
|
default:
|
|
llvm_unreachable("Unsupported synchronization scope");
|
|
}
|
|
}
|
|
|
|
/// The scratch address space does not need the global memory caches
|
|
/// to be bypassed as all memory operations by the same thread are
|
|
/// sequentially consistent, and no other thread can access scratch
|
|
/// memory.
|
|
|
|
/// Other address spaces do not have a cache.
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
|
|
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
|
|
bool IsVolatile, bool IsNonTemporal) const {
|
|
|
|
// Only handle load and store, not atomic read-modify-write insructions. The
|
|
// latter use glc to indicate if the atomic returns a result and so must not
|
|
// be used for cache control.
|
|
assert(MI->mayLoad() ^ MI->mayStore());
|
|
|
|
// Only update load and store, not LLVM IR atomic read-modify-write
|
|
// instructions. The latter are always marked as volatile so cannot sensibly
|
|
// handle it as do not want to pessimize all atomics. Also they do not support
|
|
// the nontemporal attribute.
|
|
assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
|
|
|
|
bool Changed = false;
|
|
|
|
if (IsVolatile) {
|
|
|
|
if (Op == SIMemOp::LOAD) {
|
|
Changed |= enableGLCBit(MI);
|
|
Changed |= enableDLCBit(MI);
|
|
}
|
|
|
|
// Ensure operation has completed at system scope to cause all volatile
|
|
// operations to be visible outside the program in a global order. Do not
|
|
// request cross address space as only the global address space can be
|
|
// observable outside the program, so no need to cause a waitcnt for LDS
|
|
// address space operations.
|
|
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
|
|
Position::AFTER);
|
|
return Changed;
|
|
}
|
|
|
|
if (IsNonTemporal) {
|
|
// Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
|
|
Changed |= enableSLCBit(MI);
|
|
return Changed;
|
|
}
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
SIMemOp Op,
|
|
bool IsCrossAddrSpaceOrdering,
|
|
Position Pos) const {
|
|
bool Changed = false;
|
|
|
|
MachineBasicBlock &MBB = *MI->getParent();
|
|
DebugLoc DL = MI->getDebugLoc();
|
|
|
|
if (Pos == Position::AFTER)
|
|
++MI;
|
|
|
|
bool VMCnt = false;
|
|
bool VSCnt = false;
|
|
bool LGKMCnt = false;
|
|
|
|
if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
|
|
SIAtomicAddrSpace::NONE) {
|
|
switch (Scope) {
|
|
case SIAtomicScope::SYSTEM:
|
|
case SIAtomicScope::AGENT:
|
|
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
|
|
VMCnt |= true;
|
|
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
|
|
VSCnt |= true;
|
|
break;
|
|
case SIAtomicScope::WORKGROUP:
|
|
// In WGP mode the waves of a work-group can be executing on either CU of
|
|
// the WGP. Therefore need to wait for operations to complete to ensure
|
|
// they are visible to waves in the other CU as the L0 is per CU.
|
|
// Otherwise in CU mode and all waves of a work-group are on the same CU
|
|
// which shares the same L0.
|
|
if (!ST.isCuModeEnabled()) {
|
|
if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
|
|
VMCnt |= true;
|
|
if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
|
|
VSCnt |= true;
|
|
}
|
|
break;
|
|
case SIAtomicScope::WAVEFRONT:
|
|
case SIAtomicScope::SINGLETHREAD:
|
|
// The L0 cache keeps all memory operations in order for
|
|
// work-items in the same wavefront.
|
|
break;
|
|
default:
|
|
llvm_unreachable("Unsupported synchronization scope");
|
|
}
|
|
}
|
|
|
|
if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
|
|
switch (Scope) {
|
|
case SIAtomicScope::SYSTEM:
|
|
case SIAtomicScope::AGENT:
|
|
case SIAtomicScope::WORKGROUP:
|
|
// If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
|
|
// not needed as LDS operations for all waves are executed in a total
|
|
// global ordering as observed by all waves. Required if also
|
|
// synchronizing with global/GDS memory as LDS operations could be
|
|
// reordered with respect to later global/GDS memory operations of the
|
|
// same wave.
|
|
LGKMCnt |= IsCrossAddrSpaceOrdering;
|
|
break;
|
|
case SIAtomicScope::WAVEFRONT:
|
|
case SIAtomicScope::SINGLETHREAD:
|
|
// The LDS keeps all memory operations in order for
|
|
// the same wavesfront.
|
|
break;
|
|
default:
|
|
llvm_unreachable("Unsupported synchronization scope");
|
|
}
|
|
}
|
|
|
|
if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
|
|
switch (Scope) {
|
|
case SIAtomicScope::SYSTEM:
|
|
case SIAtomicScope::AGENT:
|
|
// If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
|
|
// is not needed as GDS operations for all waves are executed in a total
|
|
// global ordering as observed by all waves. Required if also
|
|
// synchronizing with global/LDS memory as GDS operations could be
|
|
// reordered with respect to later global/LDS memory operations of the
|
|
// same wave.
|
|
LGKMCnt |= IsCrossAddrSpaceOrdering;
|
|
break;
|
|
case SIAtomicScope::WORKGROUP:
|
|
case SIAtomicScope::WAVEFRONT:
|
|
case SIAtomicScope::SINGLETHREAD:
|
|
// The GDS keeps all memory operations in order for
|
|
// the same work-group.
|
|
break;
|
|
default:
|
|
llvm_unreachable("Unsupported synchronization scope");
|
|
}
|
|
}
|
|
|
|
if (VMCnt || LGKMCnt) {
|
|
unsigned WaitCntImmediate =
|
|
AMDGPU::encodeWaitcnt(IV,
|
|
VMCnt ? 0 : getVmcntBitMask(IV),
|
|
getExpcntBitMask(IV),
|
|
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
|
|
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
|
|
Changed = true;
|
|
}
|
|
|
|
if (VSCnt) {
|
|
BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
|
|
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
|
|
.addImm(0);
|
|
Changed = true;
|
|
}
|
|
|
|
if (Pos == Position::AFTER)
|
|
--MI;
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
|
|
SIAtomicScope Scope,
|
|
SIAtomicAddrSpace AddrSpace,
|
|
Position Pos) const {
|
|
if (!InsertCacheInv)
|
|
return false;
|
|
|
|
bool Changed = false;
|
|
|
|
MachineBasicBlock &MBB = *MI->getParent();
|
|
DebugLoc DL = MI->getDebugLoc();
|
|
|
|
if (Pos == Position::AFTER)
|
|
++MI;
|
|
|
|
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
|
|
switch (Scope) {
|
|
case SIAtomicScope::SYSTEM:
|
|
case SIAtomicScope::AGENT:
|
|
BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
|
|
BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
|
|
Changed = true;
|
|
break;
|
|
case SIAtomicScope::WORKGROUP:
|
|
// In WGP mode the waves of a work-group can be executing on either CU of
|
|
// the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
|
|
// in CU mode and all waves of a work-group are on the same CU, and so the
|
|
// L0 does not need to be invalidated.
|
|
if (!ST.isCuModeEnabled()) {
|
|
BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
|
|
Changed = true;
|
|
}
|
|
break;
|
|
case SIAtomicScope::WAVEFRONT:
|
|
case SIAtomicScope::SINGLETHREAD:
|
|
// No cache to invalidate.
|
|
break;
|
|
default:
|
|
llvm_unreachable("Unsupported synchronization scope");
|
|
}
|
|
}
|
|
|
|
/// The scratch address space does not need the global memory cache
|
|
/// to be flushed as all memory operations by the same thread are
|
|
/// sequentially consistent, and no other thread can access scratch
|
|
/// memory.
|
|
|
|
/// Other address spaces do not have a cache.
|
|
|
|
if (Pos == Position::AFTER)
|
|
--MI;
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
|
|
if (AtomicPseudoMIs.empty())
|
|
return false;
|
|
|
|
for (auto &MI : AtomicPseudoMIs)
|
|
MI->eraseFromParent();
|
|
|
|
AtomicPseudoMIs.clear();
|
|
return true;
|
|
}
|
|
|
|
bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
|
|
MachineBasicBlock::iterator &MI) {
|
|
assert(MI->mayLoad() && !MI->mayStore());
|
|
|
|
bool Changed = false;
|
|
|
|
if (MOI.isAtomic()) {
|
|
if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
|
|
MOI.getOrdering() == AtomicOrdering::Acquire ||
|
|
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
|
|
Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
|
|
MOI.getOrderingAddrSpace());
|
|
}
|
|
|
|
if (MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
|
|
Changed |= CC->insertWait(MI, MOI.getScope(),
|
|
MOI.getOrderingAddrSpace(),
|
|
SIMemOp::LOAD | SIMemOp::STORE,
|
|
MOI.getIsCrossAddressSpaceOrdering(),
|
|
Position::BEFORE);
|
|
|
|
if (MOI.getOrdering() == AtomicOrdering::Acquire ||
|
|
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
|
|
Changed |= CC->insertWait(MI, MOI.getScope(),
|
|
MOI.getInstrAddrSpace(),
|
|
SIMemOp::LOAD,
|
|
MOI.getIsCrossAddressSpaceOrdering(),
|
|
Position::AFTER);
|
|
Changed |= CC->insertAcquire(MI, MOI.getScope(),
|
|
MOI.getOrderingAddrSpace(),
|
|
Position::AFTER);
|
|
}
|
|
|
|
return Changed;
|
|
}
|
|
|
|
// Atomic instructions already bypass caches to the scope specified by the
|
|
// SyncScope operand. Only non-atomic volatile and nontemporal instructions
|
|
// need additional treatment.
|
|
Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
|
|
SIMemOp::LOAD, MOI.isVolatile(),
|
|
MOI.isNonTemporal());
|
|
return Changed;
|
|
}
|
|
|
|
bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
|
|
MachineBasicBlock::iterator &MI) {
|
|
assert(!MI->mayLoad() && MI->mayStore());
|
|
|
|
bool Changed = false;
|
|
|
|
if (MOI.isAtomic()) {
|
|
if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
|
|
MOI.getOrdering() == AtomicOrdering::Release ||
|
|
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
|
|
Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
|
|
MOI.getOrderingAddrSpace());
|
|
}
|
|
|
|
if (MOI.getOrdering() == AtomicOrdering::Release ||
|
|
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
|
|
Changed |= CC->insertRelease(MI, MOI.getScope(),
|
|
MOI.getOrderingAddrSpace(),
|
|
MOI.getIsCrossAddressSpaceOrdering(),
|
|
Position::BEFORE);
|
|
|
|
return Changed;
|
|
}
|
|
|
|
// Atomic instructions already bypass caches to the scope specified by the
|
|
// SyncScope operand. Only non-atomic volatile and nontemporal instructions
|
|
// need additional treatment.
|
|
Changed |= CC->enableVolatileAndOrNonTemporal(
|
|
MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
|
|
MOI.isNonTemporal());
|
|
return Changed;
|
|
}
|
|
|
|
bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
|
|
MachineBasicBlock::iterator &MI) {
|
|
assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
|
|
|
|
AtomicPseudoMIs.push_back(MI);
|
|
bool Changed = false;
|
|
|
|
if (MOI.isAtomic()) {
|
|
if (MOI.getOrdering() == AtomicOrdering::Acquire ||
|
|
MOI.getOrdering() == AtomicOrdering::Release ||
|
|
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
|
|
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
|
|
/// TODO: This relies on a barrier always generating a waitcnt
|
|
/// for LDS to ensure it is not reordered with the completion of
|
|
/// the proceeding LDS operations. If barrier had a memory
|
|
/// ordering and memory scope, then library does not need to
|
|
/// generate a fence. Could add support in this file for
|
|
/// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
|
|
/// adding S_WAITCNT before a S_BARRIER.
|
|
Changed |= CC->insertRelease(MI, MOI.getScope(),
|
|
MOI.getOrderingAddrSpace(),
|
|
MOI.getIsCrossAddressSpaceOrdering(),
|
|
Position::BEFORE);
|
|
|
|
// TODO: If both release and invalidate are happening they could be combined
|
|
// to use the single "BUFFER_WBINV*" instruction. This could be done by
|
|
// reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
|
|
// track cache invalidate and write back instructions.
|
|
|
|
if (MOI.getOrdering() == AtomicOrdering::Acquire ||
|
|
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
|
|
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
|
|
Changed |= CC->insertAcquire(MI, MOI.getScope(),
|
|
MOI.getOrderingAddrSpace(),
|
|
Position::BEFORE);
|
|
|
|
return Changed;
|
|
}
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
|
|
MachineBasicBlock::iterator &MI) {
|
|
assert(MI->mayLoad() && MI->mayStore());
|
|
|
|
bool Changed = false;
|
|
|
|
if (MOI.isAtomic()) {
|
|
if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
|
|
MOI.getOrdering() == AtomicOrdering::Acquire ||
|
|
MOI.getOrdering() == AtomicOrdering::Release ||
|
|
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
|
|
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
|
|
Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
|
|
MOI.getInstrAddrSpace());
|
|
}
|
|
|
|
if (MOI.getOrdering() == AtomicOrdering::Release ||
|
|
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
|
|
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
|
|
MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
|
|
Changed |= CC->insertRelease(MI, MOI.getScope(),
|
|
MOI.getOrderingAddrSpace(),
|
|
MOI.getIsCrossAddressSpaceOrdering(),
|
|
Position::BEFORE);
|
|
|
|
if (MOI.getOrdering() == AtomicOrdering::Acquire ||
|
|
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
|
|
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent ||
|
|
MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
|
|
MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
|
|
Changed |= CC->insertWait(MI, MOI.getScope(),
|
|
MOI.getInstrAddrSpace(),
|
|
isAtomicRet(*MI) ? SIMemOp::LOAD :
|
|
SIMemOp::STORE,
|
|
MOI.getIsCrossAddressSpaceOrdering(),
|
|
Position::AFTER);
|
|
Changed |= CC->insertAcquire(MI, MOI.getScope(),
|
|
MOI.getOrderingAddrSpace(),
|
|
Position::AFTER);
|
|
}
|
|
|
|
return Changed;
|
|
}
|
|
|
|
return Changed;
|
|
}
|
|
|
|
bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
|
|
bool Changed = false;
|
|
|
|
SIMemOpAccess MOA(MF);
|
|
CC = SICacheControl::create(MF.getSubtarget<GCNSubtarget>());
|
|
|
|
for (auto &MBB : MF) {
|
|
for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
|
|
|
|
// Unbundle instructions after the post-RA scheduler.
|
|
if (MI->isBundle() && MI->mayLoadOrStore()) {
|
|
MachineBasicBlock::instr_iterator II(MI->getIterator());
|
|
for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
|
|
I != E && I->isBundledWithPred(); ++I) {
|
|
I->unbundleFromPred();
|
|
for (MachineOperand &MO : I->operands())
|
|
if (MO.isReg())
|
|
MO.setIsInternalRead(false);
|
|
}
|
|
|
|
MI->eraseFromParent();
|
|
MI = II->getIterator();
|
|
}
|
|
|
|
if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
|
|
continue;
|
|
|
|
if (const auto &MOI = MOA.getLoadInfo(MI))
|
|
Changed |= expandLoad(MOI.getValue(), MI);
|
|
else if (const auto &MOI = MOA.getStoreInfo(MI))
|
|
Changed |= expandStore(MOI.getValue(), MI);
|
|
else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
|
|
Changed |= expandAtomicFence(MOI.getValue(), MI);
|
|
else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
|
|
Changed |= expandAtomicCmpxchgOrRmw(MOI.getValue(), MI);
|
|
}
|
|
}
|
|
|
|
Changed |= removeAtomicPseudoMIs();
|
|
return Changed;
|
|
}
|
|
|
|
INITIALIZE_PASS(SIMemoryLegalizer, DEBUG_TYPE, PASS_NAME, false, false)
|
|
|
|
char SIMemoryLegalizer::ID = 0;
|
|
char &llvm::SIMemoryLegalizerID = SIMemoryLegalizer::ID;
|
|
|
|
FunctionPass *llvm::createSIMemoryLegalizerPass() {
|
|
return new SIMemoryLegalizer();
|
|
}
|