mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-25 12:12:47 +01:00
[AMDGPU] Switch AnnotateUniformValues to MemorySSA
This should speed up compilation and also remove the threshold limitations used by memory dependency analysis. It also seems to fix the bug in coalescer_remat.ll where an SMRD load was used in the presence of a potentially clobbering store. Fixes: SWDEV-272132 Differential Revision: https://reviews.llvm.org/D101962
This commit is contained in:
parent
f82032cf1f
commit
eda6209c9f
@ -14,11 +14,8 @@
|
||||
|
||||
#include "AMDGPU.h"
|
||||
#include "Utils/AMDGPUBaseInfo.h"
|
||||
#include "llvm/ADT/DepthFirstIterator.h"
|
||||
#include "llvm/ADT/SetVector.h"
|
||||
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
|
||||
#include "llvm/Analysis/LoopInfo.h"
|
||||
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
|
||||
#include "llvm/Analysis/MemorySSA.h"
|
||||
#include "llvm/IR/InstVisitor.h"
|
||||
#include "llvm/InitializePasses.h"
|
||||
|
||||
@ -31,8 +28,7 @@ namespace {
|
||||
class AMDGPUAnnotateUniformValues : public FunctionPass,
|
||||
public InstVisitor<AMDGPUAnnotateUniformValues> {
|
||||
LegacyDivergenceAnalysis *DA;
|
||||
MemoryDependenceResults *MDR;
|
||||
LoopInfo *LI;
|
||||
MemorySSA *MSSA;
|
||||
DenseMap<Value*, GetElementPtrInst*> noClobberClones;
|
||||
bool isEntryFunc;
|
||||
|
||||
@ -47,8 +43,7 @@ public:
|
||||
}
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
AU.addRequired<LegacyDivergenceAnalysis>();
|
||||
AU.addRequired<MemoryDependenceWrapperPass>();
|
||||
AU.addRequired<LoopInfoWrapperPass>();
|
||||
AU.addRequired<MemorySSAWrapperPass>();
|
||||
AU.setPreservesAll();
|
||||
}
|
||||
|
||||
@ -62,8 +57,7 @@ public:
|
||||
INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
|
||||
"Add AMDGPU uniform metadata", false, false)
|
||||
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
|
||||
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
|
||||
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
|
||||
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
|
||||
INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
|
||||
"Add AMDGPU uniform metadata", false, false)
|
||||
|
||||
@ -77,37 +71,8 @@ static void setNoClobberMetadata(Instruction *I) {
|
||||
}
|
||||
|
||||
bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
|
||||
// 1. get Loop for the Load->getparent();
|
||||
// 2. if it exists, collect all the BBs from the most outer
|
||||
// loop and check for the writes. If NOT - start DFS over all preds.
|
||||
// 3. Start DFS over all preds from the most outer loop header.
|
||||
SetVector<BasicBlock *> Checklist;
|
||||
BasicBlock *Start = Load->getParent();
|
||||
Checklist.insert(Start);
|
||||
const Value *Ptr = Load->getPointerOperand();
|
||||
const Loop *L = LI->getLoopFor(Start);
|
||||
if (L) {
|
||||
const Loop *P = L;
|
||||
do {
|
||||
L = P;
|
||||
P = P->getParentLoop();
|
||||
} while (P);
|
||||
Checklist.insert(L->block_begin(), L->block_end());
|
||||
Start = L->getHeader();
|
||||
}
|
||||
|
||||
Checklist.insert(idf_begin(Start), idf_end(Start));
|
||||
for (auto &BB : Checklist) {
|
||||
BasicBlock::iterator StartIt = (!L && (BB == Load->getParent())) ?
|
||||
BasicBlock::iterator(Load) : BB->end();
|
||||
auto Q = MDR->getPointerDependencyFrom(
|
||||
MemoryLocation::getBeforeOrAfter(Ptr), true, StartIt, BB, Load);
|
||||
if (Q.isClobber() || Q.isUnknown() ||
|
||||
// Store defines the load and thus clobbers it.
|
||||
(Q.isDef() && Q.getInst()->mayWriteToMemory()))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(Load);
|
||||
return !MSSA->isLiveOnEntryDef(MA);
|
||||
}
|
||||
|
||||
void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
|
||||
@ -172,9 +137,8 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
|
||||
if (skipFunction(F))
|
||||
return false;
|
||||
|
||||
DA = &getAnalysis<LegacyDivergenceAnalysis>();
|
||||
MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
|
||||
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
|
||||
DA = &getAnalysis<LegacyDivergenceAnalysis>();
|
||||
MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
|
||||
isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv());
|
||||
|
||||
visit(F);
|
||||
|
@ -12,8 +12,8 @@ declare float @llvm.fma.f32(float, float, float)
|
||||
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
|
||||
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
|
||||
; It's probably OK if this is slightly higher:
|
||||
; CHECK: ; NumVgprs: 4
|
||||
define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
|
||||
; CHECK: ; NumVgprs: 8
|
||||
define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* noalias %out, <4 x float> addrspace(1)* noalias %in, i32 %flag) {
|
||||
entry:
|
||||
%cmpflag = icmp eq i32 %flag, 1
|
||||
br i1 %cmpflag, label %loop, label %exit
|
||||
|
@ -4,7 +4,7 @@
|
||||
; GCN: flat_load_dword
|
||||
; GCN: flat_load_dword
|
||||
; GCN: flat_store_dword
|
||||
define amdgpu_kernel void @unknown_memdep_analysis(float addrspace(1)* nocapture readonly %arg, float %arg1) #0 {
|
||||
define void @unknown_memdep_analysis(float addrspace(1)* nocapture readonly %arg, float %arg1) #0 {
|
||||
bb:
|
||||
%tmp53 = load float, float addrspace(1)* undef, align 4
|
||||
%tmp54 = getelementptr inbounds float, float addrspace(1)* %arg, i32 31
|
||||
|
@ -95,9 +95,8 @@
|
||||
; GCN-O0-NEXT: Code sinking
|
||||
; GCN-O0-NEXT: Post-Dominator Tree Construction
|
||||
; GCN-O0-NEXT: Legacy Divergence Analysis
|
||||
; GCN-O0-NEXT: Phi Values Analysis
|
||||
; GCN-O0-NEXT: Function Alias Analysis Results
|
||||
; GCN-O0-NEXT: Memory Dependence Analysis
|
||||
; GCN-O0-NEXT: Memory SSA
|
||||
; GCN-O0-NEXT: AMDGPU Annotate Uniform Values
|
||||
; GCN-O0-NEXT: SI annotate control flow
|
||||
; GCN-O0-NEXT: Natural Loop Information
|
||||
@ -275,9 +274,8 @@
|
||||
; GCN-O1-NEXT: Code sinking
|
||||
; GCN-O1-NEXT: Post-Dominator Tree Construction
|
||||
; GCN-O1-NEXT: Legacy Divergence Analysis
|
||||
; GCN-O1-NEXT: Phi Values Analysis
|
||||
; GCN-O1-NEXT: Function Alias Analysis Results
|
||||
; GCN-O1-NEXT: Memory Dependence Analysis
|
||||
; GCN-O1-NEXT: Memory SSA
|
||||
; GCN-O1-NEXT: AMDGPU Annotate Uniform Values
|
||||
; GCN-O1-NEXT: SI annotate control flow
|
||||
; GCN-O1-NEXT: Natural Loop Information
|
||||
@ -550,9 +548,8 @@
|
||||
; GCN-O1-OPTS-NEXT: Code sinking
|
||||
; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction
|
||||
; GCN-O1-OPTS-NEXT: Legacy Divergence Analysis
|
||||
; GCN-O1-OPTS-NEXT: Phi Values Analysis
|
||||
; GCN-O1-OPTS-NEXT: Function Alias Analysis Results
|
||||
; GCN-O1-OPTS-NEXT: Memory Dependence Analysis
|
||||
; GCN-O1-OPTS-NEXT: Memory SSA
|
||||
; GCN-O1-OPTS-NEXT: AMDGPU Annotate Uniform Values
|
||||
; GCN-O1-OPTS-NEXT: SI annotate control flow
|
||||
; GCN-O1-OPTS-NEXT: Natural Loop Information
|
||||
@ -833,9 +830,8 @@
|
||||
; GCN-O2-NEXT: Code sinking
|
||||
; GCN-O2-NEXT: Post-Dominator Tree Construction
|
||||
; GCN-O2-NEXT: Legacy Divergence Analysis
|
||||
; GCN-O2-NEXT: Phi Values Analysis
|
||||
; GCN-O2-NEXT: Function Alias Analysis Results
|
||||
; GCN-O2-NEXT: Memory Dependence Analysis
|
||||
; GCN-O2-NEXT: Memory SSA
|
||||
; GCN-O2-NEXT: AMDGPU Annotate Uniform Values
|
||||
; GCN-O2-NEXT: SI annotate control flow
|
||||
; GCN-O2-NEXT: Natural Loop Information
|
||||
@ -1129,9 +1125,8 @@
|
||||
; GCN-O3-NEXT: Code sinking
|
||||
; GCN-O3-NEXT: Post-Dominator Tree Construction
|
||||
; GCN-O3-NEXT: Legacy Divergence Analysis
|
||||
; GCN-O3-NEXT: Phi Values Analysis
|
||||
; GCN-O3-NEXT: Function Alias Analysis Results
|
||||
; GCN-O3-NEXT: Memory Dependence Analysis
|
||||
; GCN-O3-NEXT: Memory SSA
|
||||
; GCN-O3-NEXT: AMDGPU Annotate Uniform Values
|
||||
; GCN-O3-NEXT: SI annotate control flow
|
||||
; GCN-O3-NEXT: Natural Loop Information
|
||||
|
@ -1,11 +1,9 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
||||
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck %s
|
||||
|
||||
; FIXME: The wide loads and bundles introduce so much spilling.
|
||||
define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(float addrspace(4)* %wei_ptr, float addrspace(1)* %out_ptr) {
|
||||
define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(float addrspace(4)* %wei_ptr, float addrspace(1)* %out_ptr, float addrspace(1)* %in) {
|
||||
; CHECK-LABEL: excess_soft_clause_reg_pressure:
|
||||
; CHECK: BB0_1: ; %for.cond28.preheader
|
||||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
|
||||
; CHECK: global_load_dword
|
||||
; CHECK-NEXT: global_load_dword
|
||||
; CHECK-NEXT: global_load_dword
|
||||
@ -14,24 +12,6 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(float addrs
|
||||
; CHECK: s_load_dwordx16
|
||||
; CHECK-NEXT: s_load_dwordx16
|
||||
; CHECK-NEXT: s_load_dwordx16
|
||||
|
||||
; CHECK: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: s_load_dwordx16
|
||||
; CHECK-NEXT: s_load_dwordx16
|
||||
|
||||
; CHECK: v_writelane_b32
|
||||
@ -52,6 +32,23 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(float addrs
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: s_load_dwordx16
|
||||
|
||||
; CHECK: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
; CHECK-NEXT: v_writelane_b32
|
||||
|
||||
; CHECK: v_readlane_b32
|
||||
; CHECK-NEXT: v_readlane_b32
|
||||
; CHECK-NEXT: v_readlane_b32
|
||||
@ -70,8 +67,10 @@ define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(float addrs
|
||||
; CHECK-NEXT: v_readlane_b32
|
||||
|
||||
; CHECK: s_load_dwordx16
|
||||
; CHECK-NEXT: s_load_dwordx16
|
||||
; CHECK-NEXT: v_readlane_b32
|
||||
; CHECK: s_load_dwordx16
|
||||
; CHECK: s_load_dwordx16
|
||||
|
||||
; CHECK: v_readlane_b32
|
||||
; CHECK-NEXT: v_readlane_b32
|
||||
; CHECK-NEXT: v_readlane_b32
|
||||
; CHECK-NEXT: v_readlane_b32
|
||||
@ -100,6 +99,7 @@ entry:
|
||||
%conv.frozen = freeze i32 %conv
|
||||
%div = udiv i32 %conv.frozen, 49
|
||||
%add.ptr22 = getelementptr inbounds float, float addrspace(4)* %wei_ptr, i64 undef
|
||||
%in.ptr1 = getelementptr inbounds float, float addrspace(1)* %in, i32 %i5
|
||||
br label %for.cond28.preheader
|
||||
|
||||
for.cond28.preheader: ; preds = %for.cond28.preheader, %entry
|
||||
@ -135,7 +135,7 @@ for.cond28.preheader: ; preds = %for.cond28.preheade
|
||||
%accum.sroa.118.0 = phi float [ 0.000000e+00, %entry ], [ %i259, %for.cond28.preheader ]
|
||||
%accum.sroa.122.0 = phi float [ 0.000000e+00, %entry ], [ %i263, %for.cond28.preheader ]
|
||||
%accum.sroa.126.0 = phi float [ 0.000000e+00, %entry ], [ %i267, %for.cond28.preheader ]
|
||||
%i_ptr.0288 = phi float addrspace(1)* [ undef, %entry ], [ %add.ptr47.3, %for.cond28.preheader ]
|
||||
%i_ptr.0288 = phi float addrspace(1)* [ %in.ptr1, %entry ], [ %add.ptr47.3, %for.cond28.preheader ]
|
||||
%w_ptr.0287 = phi float addrspace(4)* [ %add.ptr22, %entry ], [ %add.ptr74, %for.cond28.preheader ]
|
||||
%ci.0286 = phi i32 [ 0, %entry ], [ %inc116, %for.cond28.preheader ]
|
||||
%i8 = load float, float addrspace(1)* %i_ptr.0288, align 4
|
||||
|
Loading…
Reference in New Issue
Block a user