1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-26 04:32:44 +01:00

Allow target to decide when to cluster loads/stores in misched

MachineScheduler when clustering loads or stores checks if base
pointers point to the same memory. This check is done through
comparison of base registers of two memory instructions. This
works fine when instructions have separate offset operand. If
they require a full calculated pointer such instructions can
never be clustered according to such logic.

Changed shouldClusterMemOps to accept base registers as well and
let it decide what to do about it.

Differential Revision: https://reviews.llvm.org/D37698

llvm-svn: 313208
This commit is contained in:
Stanislav Mekhanoshin 2017-09-13 22:20:47 +00:00
parent 93fed884cc
commit fbfa163a41
12 changed files with 146 additions and 24 deletions

View File

@ -1097,8 +1097,8 @@ public:
/// or
/// DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
/// to TargetPassConfig::createMachineScheduler() to have an effect.
virtual bool shouldClusterMemOps(MachineInstr &FirstLdSt,
MachineInstr &SecondLdSt,
virtual bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1,
MachineInstr &SecondLdSt, unsigned BaseReg2,
unsigned NumLoads) const {
llvm_unreachable("target did not implement shouldClusterMemOps()");
}

View File

@ -1561,14 +1561,10 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
std::sort(MemOpRecords.begin(), MemOpRecords.end());
unsigned ClusterLength = 1;
for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
if (MemOpRecords[Idx].BaseReg != MemOpRecords[Idx+1].BaseReg) {
ClusterLength = 1;
continue;
}
SUnit *SUa = MemOpRecords[Idx].SU;
SUnit *SUb = MemOpRecords[Idx+1].SU;
if (TII->shouldClusterMemOps(*SUa->getInstr(), *SUb->getInstr(),
if (TII->shouldClusterMemOps(*SUa->getInstr(), MemOpRecords[Idx].BaseReg,
*SUb->getInstr(), MemOpRecords[Idx+1].BaseReg,
ClusterLength) &&
DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("

View File

@ -2104,8 +2104,13 @@ static bool canPairLdStOpc(unsigned FirstOpc, unsigned SecondOpc) {
///
/// Only called for LdSt for which getMemOpBaseRegImmOfs returns true.
bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
unsigned BaseReg1,
MachineInstr &SecondLdSt,
unsigned BaseReg2,
unsigned NumLoads) const {
if (BaseReg1 != BaseReg2)
return false;
// Only cluster up to a single pair.
if (NumLoads > 1)
return false;

View File

@ -242,7 +242,8 @@ public:
bool getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width,
int64_t &MinOffset, int64_t &MaxOffset) const;
bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1,
MachineInstr &SecondLdSt, unsigned BaseReg2,
unsigned NumLoads) const override;
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,

View File

@ -27,6 +27,7 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@ -356,15 +357,52 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
return false;
}
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, unsigned BaseReg1,
const MachineInstr &MI2, unsigned BaseReg2) {
if (BaseReg1 == BaseReg2)
return true;
if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
return false;
auto MO1 = *MI1.memoperands_begin();
auto MO2 = *MI2.memoperands_begin();
if (MO1->getAddrSpace() != MO2->getAddrSpace())
return false;
auto Base1 = MO1->getValue();
auto Base2 = MO2->getValue();
if (!Base1 || !Base2)
return false;
const MachineFunction &MF = *MI1.getParent()->getParent();
const DataLayout &DL = MF.getFunction()->getParent()->getDataLayout();
Base1 = GetUnderlyingObject(Base1, DL);
Base2 = GetUnderlyingObject(Base1, DL);
if (isa<UndefValue>(Base1) || isa<UndefValue>(Base2))
return false;
return Base1 == Base2;
}
bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
unsigned BaseReg1,
MachineInstr &SecondLdSt,
unsigned BaseReg2,
unsigned NumLoads) const {
if (!memOpsHaveSameBasePtr(FirstLdSt, BaseReg1, SecondLdSt, BaseReg2))
return false;
const MachineOperand *FirstDst = nullptr;
const MachineOperand *SecondDst = nullptr;
if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
(isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
(isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
const unsigned MaxGlobalLoadCluster = 6;
if (NumLoads > MaxGlobalLoadCluster)
return false;
FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
if (!FirstDst)
FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);

View File

@ -151,7 +151,8 @@ public:
int64_t &Offset,
const TargetRegisterInfo *TRI) const final;
bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
bool shouldClusterMemOps(MachineInstr &FirstLdSt, unsigned BaseReg1,
MachineInstr &SecondLdSt, unsigned BaseReg2,
unsigned NumLoads) const final;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,

View File

@ -105,7 +105,7 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i1
; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i32:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: buffer_store_dword [[SEXT]]
define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
@ -125,7 +125,7 @@ define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i1
; GCN-LABEL: {{^}}v_test_add_i16_sext_to_i64:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}

View File

@ -42,8 +42,10 @@ define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrs
}
; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32:
; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]],
; GCN: {{buffer|flat}}_load_dword [[VAL1:v[0-9]+]],
; SI: buffer_load_dword [[VAL0:v[0-9]+]],
; SI: buffer_load_dword [[VAL1:v[0-9]+]],
; VI: flat_load_dword [[VAL1:v[0-9]+]],
; VI: flat_load_dword [[VAL0:v[0-9]+]],
; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]]
@ -280,8 +282,8 @@ define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %ou
; SI: buffer_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
; SI: buffer_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64
; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAR]], [[VAL]]
; VI: flat_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
; VI: flat_load_dword [[VAR:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
; VI: flat_load_dword [[VAL:v[0-9]+]], v[{{[0-9]+:[0-9]+}}]
; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm

View File

@ -60,8 +60,10 @@ entry:
}
; GCN-LABEL: {{^}}fadd_v2f16:
; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; VI: flat_load_dword v[[B_V2_F16:[0-9]+]]
; VI: flat_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]

View File

@ -0,0 +1,77 @@
# RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -run-pass machine-scheduler -o - %s | FileCheck -check-prefix=GCN %s
# CGN-LABEL: name: flat_load_clustering
# GCN: FLAT_LOAD_DWORD
# GCN-NEXT: FLAT_LOAD_DWORD
--- |
define amdgpu_kernel void @flat_load_clustering(i32 addrspace(1)* nocapture %arg, i32 addrspace(2)* nocapture readonly %arg1) {
bb:
%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
%idxprom = sext i32 %tid to i64
%gep1 = getelementptr inbounds i32, i32 addrspace(2)* %arg1, i64 %idxprom
%load1 = load i32, i32 addrspace(2)* %gep1, align 4
%gep2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %idxprom
%gep34 = getelementptr inbounds i32, i32 addrspace(2)* %gep1, i64 4
%load2 = load i32, i32 addrspace(2)* %gep34, align 4
%gep4 = getelementptr inbounds i32, i32 addrspace(1)* %gep2, i64 4
store i32 %load1, i32 addrspace(1)* %gep2, align 4
store i32 %load2, i32 addrspace(1)* %gep4, align 4
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x()
...
---
name: flat_load_clustering
alignment: 0
exposesReturnsTwice: false
legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
registers:
- { id: 0, class: vgpr_32 }
- { id: 1, class: sgpr_64 }
- { id: 2, class: vgpr_32 }
- { id: 3, class: sreg_64_xexec }
- { id: 4, class: sreg_64_xexec }
- { id: 5, class: vgpr_32 }
- { id: 6, class: vgpr_32 }
- { id: 7, class: vgpr_32 }
- { id: 8, class: vgpr_32 }
- { id: 9, class: vreg_64 }
- { id: 10, class: vreg_64 }
- { id: 11, class: vgpr_32 }
- { id: 12, class: vreg_64 }
- { id: 13, class: vreg_64 }
liveins:
- { reg: '%vgpr0', virtual-reg: '%0' }
- { reg: '%sgpr4_sgpr5', virtual-reg: '%1' }
body: |
bb.0.bb:
liveins: %vgpr0, %sgpr4_sgpr5
%1 = COPY %sgpr4_sgpr5
%0 = COPY %vgpr0
%3 = S_LOAD_DWORDX2_IMM %1, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
%4 = S_LOAD_DWORDX2_IMM %1, 8, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
%7 = V_LSHLREV_B32_e32 2, %0, implicit %exec
%2 = V_MOV_B32_e32 0, implicit %exec
undef %12.sub0 = V_ADD_I32_e32 %4.sub0, %7, implicit-def %vcc, implicit %exec
%11 = COPY %4.sub1
%12.sub1 = V_ADDC_U32_e32 %11, %2, implicit-def dead %vcc, implicit killed %vcc, implicit %exec
%5 = FLAT_LOAD_DWORD %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.gep1)
undef %9.sub0 = V_ADD_I32_e32 %3.sub0, %7, implicit-def %vcc, implicit %exec
%8 = COPY %3.sub1
%9.sub1 = V_ADDC_U32_e32 %8, %2, implicit-def dead %vcc, implicit killed %vcc, implicit %exec
undef %13.sub0 = V_ADD_I32_e32 16, %12.sub0, implicit-def %vcc, implicit %exec
%13.sub1 = V_ADDC_U32_e32 %12.sub1, %2, implicit-def dead %vcc, implicit killed %vcc, implicit %exec
%6 = FLAT_LOAD_DWORD %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.gep34)
undef %10.sub0 = V_ADD_I32_e32 16, %9.sub0, implicit-def %vcc, implicit %exec
%10.sub1 = V_ADDC_U32_e32 %9.sub1, %2, implicit-def dead %vcc, implicit killed %vcc, implicit %exec
FLAT_STORE_DWORD %9, %5, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.gep2)
FLAT_STORE_DWORD %10, %6, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.gep4)
S_ENDPGM
...

View File

@ -222,11 +222,11 @@ define amdgpu_kernel void @reorder_local_offsets(i32 addrspace(1)* nocapture %ou
; CI: buffer_store_dword
; CI: s_endpgm
; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:400
; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:408
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:12
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:400
; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:408
; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:400
; GFX9-DAG: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:408
; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:12
; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:400
; GFX9-DAG: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, off offset:408
; GFX9: global_store_dword
; GFX9: s_endpgm
define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {

View File

@ -107,7 +107,7 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i1
; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i32:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: buffer_store_dword [[SEXT]]
define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
@ -127,7 +127,7 @@ define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i1
; GCN-LABEL: {{^}}v_test_sub_i16_sext_to_i64:
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
; VI: v_sub_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}