1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 20:51:52 +01:00

AMDGPU/SI: Select constant loads with non-uniform addresses to MUBUF instructions

Summary:
We were previously selecting all constant loads to SMRD instructions and legalizing
the SMRDs with non-uniform addresses during the SIFixSGPRCopesPass.

This new solution is more simple and also generates much better code, because
the instruction selector is able to take advantage of all the MUBUF addressing
modes that are legalization pass wasn't able to.

We also no longer need to generate v_add_* instructions when we
have a uniform pointer and a non-uniform offset, as this is now folded into the
MUBUF instruction during instruction selection.

Reviewers: arsenm

Subscribers: arsenm, llvm-commits

Differential Revision: http://reviews.llvm.org/D15425

llvm-svn: 255672
This commit is contained in:
Tom Stellard 2015-12-15 20:55:55 +00:00
parent 50b81b59c2
commit 518c0f18a8
9 changed files with 178 additions and 43 deletions

View File

@ -71,6 +71,7 @@ Pass *createAMDGPUStructurizeCFGPass();
FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
ModulePass *createAMDGPUAlwaysInlinePass();
ModulePass *createAMDGPUOpenCLImageTypeLoweringPass();
FunctionPass *createAMDGPUAnnotateUniformValues();
void initializeSIFixControlFlowLiveIntervalsPass(PassRegistry&);
extern char &SIFixControlFlowLiveIntervalsID;
@ -78,6 +79,8 @@ extern char &SIFixControlFlowLiveIntervalsID;
void initializeSIFixSGPRLiveRangesPass(PassRegistry&);
extern char &SIFixSGPRLiveRangesID;
void initializeAMDGPUAnnotateUniformValuesPass(PassRegistry&);
extern char &AMDGPUAnnotateUniformValuesPassID;
extern Target TheAMDGPUTarget;
extern Target TheGCNTarget;

View File

@ -0,0 +1,84 @@
//===-- AMDGPUAnnotateUniformValues.cpp - ---------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass adds amdgpu.uniform metadata to IR values so this information
/// can be used during instruction selection.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUIntrinsicInfo.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "amdgpu-annotate-uniform"
using namespace llvm;
namespace {
class AMDGPUAnnotateUniformValues : public FunctionPass,
public InstVisitor<AMDGPUAnnotateUniformValues> {
DivergenceAnalysis *DA;
public:
static char ID;
AMDGPUAnnotateUniformValues() :
FunctionPass(ID) { }
bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;
const char *getPassName() const override { return "AMDGPU Annotate Uniform Values"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<DivergenceAnalysis>();
AU.setPreservesAll();
}
void visitLoadInst(LoadInst &I);
};
} // End anonymous namespace
INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
"Add AMDGPU uniform metadata", false, false)
INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
"Add AMDGPU uniform metadata", false, false)
char AMDGPUAnnotateUniformValues::ID = 0;
void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
Value *Ptr = I.getPointerOperand();
if (!DA->isUniform(Ptr))
return;
if (Instruction *PtrI = dyn_cast<Instruction>(Ptr))
PtrI->setMetadata("amdgpu.uniform", MDNode::get(I.getContext(), {}));
}
bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
return false;
}
bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
DA = &getAnalysis<DivergenceAnalysis>();
visit(F);
return true;
}
FunctionPass *
llvm::createAMDGPUAnnotateUniformValues() {
return new AMDGPUAnnotateUniformValues();
}

View File

@ -51,6 +51,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIFixControlFlowLiveIntervalsPass(*PR);
initializeSILoadStoreOptimizerPass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@ -279,6 +280,8 @@ bool GCNPassConfig::addPreISel() {
addPass(createSinkingPass());
addPass(createSITypeRewriter());
addPass(createSIAnnotateControlFlowPass());
addPass(createAMDGPUAnnotateUniformValues());
return false;
}

View File

@ -16,6 +16,7 @@ add_llvm_target(AMDGPUCodeGen
AMDILCFGStructurizer.cpp
AMDGPUAlwaysInlinePass.cpp
AMDGPUAnnotateKernelFeatures.cpp
AMDGPUAnnotateUniformValues.cpp
AMDGPUAsmPrinter.cpp
AMDGPUDiagnosticInfoUnsupported.cpp
AMDGPUFrameLowering.cpp

View File

@ -504,6 +504,21 @@ bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
}
bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
const MemSDNode *MemNode = cast<MemSDNode>(N);
const Value *Ptr = MemNode->getMemOperand()->getValue();
// UndefValue means this is a load of a kernel input. These are uniform.
// Sometimes LDS instructions have constant pointers
if (isa<UndefValue>(Ptr) || isa<Argument>(Ptr) || isa<Constant>(Ptr) ||
isa<GlobalValue>(Ptr))
return true;
const Instruction *I = dyn_cast_or_null<Instruction>(Ptr);
return I && I->getMetadata("amdgpu.uniform");
}
TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(EVT VT) const {
if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
@ -1328,6 +1343,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
switch (Load->getAddressSpace()) {
default: break;
case AMDGPUAS::CONSTANT_ADDRESS:
if (isMemOpUniform(Load))
break;
// Non-uniform loads will be selected to MUBUF instructions, so they
// have the same legalization requires ments as global and private
// loads.
//
// Fall-through
case AMDGPUAS::GLOBAL_ADDRESS:
case AMDGPUAS::PRIVATE_ADDRESS:
if (NumElements >= 8)

View File

@ -80,6 +80,7 @@ public:
bool MemcpyStrSrc,
MachineFunction &MF) const override;
bool isMemOpUniform(const SDNode *N) const;
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
TargetLoweringBase::LegalizeTypeAction

View File

@ -137,6 +137,16 @@ def SIconstdata_ptr : SDNode<
SDTCisVT<0, i64>]>
>;
def mubuf_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
return isGlobalLoad(cast<LoadSDNode>(N)) ||
isConstantLoad(cast<LoadSDNode>(N), -1);
}]>;
def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
return isConstantLoad(cast<LoadSDNode>(N), -1) &&
static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N);
}]>;
//===----------------------------------------------------------------------===//
// SDNodes and PatFrag for local loads and stores to enable s_mov_b32 m0, -1
// to be glued to the memory instructions.

View File

@ -953,13 +953,13 @@ defm BUFFER_LOAD_SSHORT : MUBUF_Load_Helper <
mubuf<0x0b, 0x13>, "buffer_load_sshort", VGPR_32, i32, sextloadi16_global
>;
defm BUFFER_LOAD_DWORD : MUBUF_Load_Helper <
mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, global_load
mubuf<0x0c, 0x14>, "buffer_load_dword", VGPR_32, i32, mubuf_load
>;
defm BUFFER_LOAD_DWORDX2 : MUBUF_Load_Helper <
mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, global_load
mubuf<0x0d, 0x15>, "buffer_load_dwordx2", VReg_64, v2i32, mubuf_load
>;
defm BUFFER_LOAD_DWORDX4 : MUBUF_Load_Helper <
mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, global_load
mubuf<0x0e, 0x17>, "buffer_load_dwordx4", VReg_128, v4i32, mubuf_load
>;
defm BUFFER_STORE_BYTE : MUBUF_Store_Helper <
@ -2087,24 +2087,29 @@ multiclass SMRD_Pattern <string Instr, ValueType vt> {
// 1. IMM offset
def : Pat <
(constant_load (SMRDImm i64:$sbase, i32:$offset)),
(smrd_load (SMRDImm i64:$sbase, i32:$offset)),
(vt (!cast<SMRD>(Instr#"_IMM") $sbase, $offset))
>;
// 2. SGPR offset
def : Pat <
(constant_load (SMRDSgpr i64:$sbase, i32:$offset)),
(smrd_load (SMRDSgpr i64:$sbase, i32:$offset)),
(vt (!cast<SMRD>(Instr#"_SGPR") $sbase, $offset))
>;
def : Pat <
(constant_load (SMRDImm32 i64:$sbase, i32:$offset)),
(smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
(vt (!cast<SMRD>(Instr#"_IMM_ci") $sbase, $offset))
> {
let Predicates = [isCIOnly];
}
}
// Global and constant loads can be selected to either MUBUF or SMRD
// instructions, but SMRD instructions are faster so we want the instruction
// selector to prefer those.
let AddedComplexity = 100 in {
defm : SMRD_Pattern <"S_LOAD_DWORD", i32>;
defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>;
defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>;
@ -2133,6 +2138,8 @@ def : Pat <
} // End Predicates = [isCI]
} // End let AddedComplexity = 10000
//===----------------------------------------------------------------------===//
// SOP1 Patterns
//===----------------------------------------------------------------------===//

View File

@ -77,7 +77,8 @@ endif: ; preds = %else, %if
; Test moving an SMRD with an immediate offset to the VALU
; GCN-LABEL: {{^}}smrd_valu2:
; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
; GCN-NOT: v_add
; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}}
define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x() #0
@ -90,8 +91,10 @@ entry:
; Use a big offset that will use the SMRD literal offset on CI
; GCN-LABEL: {{^}}smrd_valu_ci_offset:
; GCN: s_movk_i32 s[[OFFSET:[0-9]+]], 0x4e20{{$}}
; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET]]:{{[0-9]+}}], 0 addr64{{$}}
; GCN-NOT: v_add
; GCN: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4e20{{$}}
; GCN-NOT: v_add
; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
; GCN: v_add_i32_e32
; GCN: buffer_store_dword
define void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 {
@ -106,8 +109,10 @@ entry:
}
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x2:
; GCN: s_mov_b32 s[[OFFSET:[0-9]+]], 0x9c40{{$}}
; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET]]:{{[0-9]+}}], 0 addr64{{$}}
; GCN-NOT: v_add
; GCN: s_mov_b32 [[OFFSET:s[0-9]+]], 0x9c40{{$}}
; GCN-NOT: v_add
; GCN: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN: buffer_store_dwordx2
@ -123,8 +128,10 @@ entry:
}
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x4:
; GCN: s_movk_i32 s[[OFFSET:[0-9]+]], 0x4d20{{$}}
; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET]]:{{[0-9]+}}], 0 addr64{{$}}
; GCN-NOT: v_add
; GCN: s_movk_i32 [[OFFSET:s[0-9]+]], 0x4d20{{$}}
; GCN-NOT: v_add
; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET]] addr64{{$}}
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
@ -145,14 +152,14 @@ entry:
; CI.
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x8:
; GCN: s_mov_b32 s[[OFFSET0:[0-9]+]], 0x9a40{{$}}
; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET0]]:{{[0-9]+}}], 0 addr64{{$}}
; SI: s_add_i32 s[[OFFSET1:[0-9]+]], s[[OFFSET0]], 16
; SI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET1]]:{{[0-9]+}}], 0 addr64{{$}}
; CI: s_mov_b32 s[[OFFSET1:[0-9]+]], 0x9a50{{$}}
; CI: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET1]]:{{[0-9]+}}], 0 addr64{{$}}
; GCN-NOT: v_add
; GCN: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x9a40{{$}}
; GCN-NOT: v_add
; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
; GCN-NOT: v_add
; GCN: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x9a50{{$}}
; GCN-NOT: v_add
; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
@ -175,22 +182,24 @@ entry:
ret void
}
; FIXME: should use immediate offset instead of using s_add_i32 for adding to constant.
; GCN-LABEL: {{^}}smrd_valu_ci_offset_x16:
; GCN-DAG: s_mov_b32 s[[OFFSET0:[0-9]+]], 0x13480{{$}}
; SI-DAG: s_add_i32 s[[OFFSET1:[0-9]+]], s[[OFFSET0]], 16
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET0]]:{{[0-9]+}}], 0 addr64{{$}}
; CI-DAG: s_mov_b32 s[[OFFSET1:[0-9]+]], 0x13490{{$}}
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET1]]:{{[0-9]+}}], 0 addr64{{$}}
; SI-DAG: s_add_i32 s[[OFFSET2:[0-9]+]], s[[OFFSET0]], 32
; CI-DAG: s_mov_b32 s[[OFFSET2:[0-9]+]], 0x134a0
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET2]]:{{[0-9]+}}], 0 addr64{{$}}
; GCN-DAG: s_add_i32 s[[OFFSET3:[0-9]+]], s[[OFFSET2]], 16
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET3]]:{{[0-9]+}}], 0 addr64{{$}}
; GCN-NOT: v_add
; GCN: s_mov_b32 [[OFFSET0:s[0-9]+]], 0x13480{{$}}
; GCN-NOT: v_add
; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET0]] addr64{{$}}
; GCN-NOT: v_add
; GCN: s_mov_b32 [[OFFSET1:s[0-9]+]], 0x13490{{$}}
; GCN-NOT: v_add
; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET1]] addr64{{$}}
; GCN-NOT: v_add
; GCN: s_mov_b32 [[OFFSET2:s[0-9]+]], 0x134a0{{$}}
; GCN-NOT: v_add
; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET2]] addr64{{$}}
; GCN-NOT: v_add
; GCN: s_mov_b32 [[OFFSET3:s[0-9]+]], 0x134b0{{$}}
; GCN-NOT: v_add
; GCN: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], [[OFFSET3]] addr64{{$}}
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
; GCN: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
@ -244,15 +253,9 @@ entry:
ret void
}
; Offset is too big to fit in SMRD 8-bit offset, but small enough to
; fit in MUBUF offset.
; FIXME: We should be using the offset but we don't
; GCN-LABEL: {{^}}smrd_valu2_mubuf_offset:
; SI: s_movk_i32 s[[OFFSET:[0-9]+]], 0x400{{$}}
; SI: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[OFFSET]]:{{[0-9]+\]}}, 0 addr64{{$}}
; CI: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}}
; GCN-NOT: v_add
; GCN: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}}
define void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
entry:
%tmp = call i32 @llvm.r600.read.tidig.x() #0