mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-22 18:54:02 +01:00
[AMDGPU] Add SIPreEmitPeephole pass.
This pass can handle all the optimization opportunities found just before code emission. Presently it includes the handling of the vcc branch optimization that was handled earlier in SIInsertSkips.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D76712
This commit is contained in:
parent
7dbe7d4c84
commit
63a2b80308
@ -166,6 +166,9 @@ extern char &SILowerControlFlowID;
|
||||
void initializeSIRemoveShortExecBranchesPass(PassRegistry &);
|
||||
extern char &SIRemoveShortExecBranchesID;
|
||||
|
||||
void initializeSIPreEmitPeepholePass(PassRegistry &);
|
||||
extern char &SIPreEmitPeepholeID;
|
||||
|
||||
void initializeSIInsertSkipsPass(PassRegistry &);
|
||||
extern char &SIInsertSkipsPassID;
|
||||
|
||||
|
@ -240,6 +240,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
|
||||
initializeSIWholeQuadModePass(*PR);
|
||||
initializeSILowerControlFlowPass(*PR);
|
||||
initializeSIRemoveShortExecBranchesPass(*PR);
|
||||
initializeSIPreEmitPeepholePass(*PR);
|
||||
initializeSIInsertSkipsPass(*PR);
|
||||
initializeSIMemoryLegalizerPass(*PR);
|
||||
initializeSIOptimizeExecMaskingPass(*PR);
|
||||
@ -1029,6 +1030,7 @@ void GCNPassConfig::addPreEmitPass() {
|
||||
addPass(&PostRAHazardRecognizerID);
|
||||
|
||||
addPass(&SIRemoveShortExecBranchesID);
|
||||
addPass(&SIPreEmitPeepholeID);
|
||||
addPass(&SIInsertSkipsPassID);
|
||||
addPass(&BranchRelaxationPassID);
|
||||
}
|
||||
|
@ -121,6 +121,7 @@ add_llvm_target(AMDGPUCodeGen
|
||||
SIOptimizeExecMaskingPreRA.cpp
|
||||
SIPeepholeSDWA.cpp
|
||||
SIPostRABundler.cpp
|
||||
SIPreEmitPeephole.cpp
|
||||
SIRegisterInfo.cpp
|
||||
SIRemoveShortExecBranches.cpp
|
||||
SIShrinkInstructions.cpp
|
||||
|
@ -68,8 +68,6 @@ private:
|
||||
|
||||
bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
|
||||
|
||||
bool optimizeVccBranch(MachineInstr &MI) const;
|
||||
|
||||
public:
|
||||
static char ID;
|
||||
|
||||
@ -361,98 +359,6 @@ bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
|
||||
return true;
|
||||
}
|
||||
|
||||
// Rewrite a VCC-conditional branch into an EXEC-conditional branch when the
// vcc value is provably "exec AND all-ones". Returns true if MI was changed
// or erased; may also erase the feeding S_AND / S_MOV instructions.
bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1
  // vcc = S_AND_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;

  // Scan backwards from the branch for the S_AND that defines CondReg (vcc).
  // Give up if exec is clobbered in between, if vcc is touched by anything
  // other than a full def by the expected S_AND, or after a small fixed
  // number of instructions.
  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5; // bound on the backwards scan
  for (++A ; A != E ; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
        return false;
      break;
    }
    // Track whether anything between the AND and the branch reads vcc; if so
    // the AND cannot be deleted later.
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  // A points at: CondReg = S_AND Op1, Op2. Canonicalize so exec is Op1
  // (commute in place if it appears as Op2).
  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  // An immediate second operand is only acceptable if it is -1 (all lanes).
  if (Op2.isImm() && Op2.getImm() != -1)
    return Changed;

  unsigned SReg = AMDGPU::NoRegister;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    // Continue the backwards scan to find the instruction defining SReg;
    // it must be a move of the immediate -1.
    auto M = std::next(A);
    bool ReadsSreg = false;
    for ( ; M != E ; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed; // partial/unknown update of SReg; give up
      ReadsSreg |= M->readsRegister(SReg, TRI);
    }
    if (M == E ||
        !M->isMoveImmediate() ||
        !M->getOperand(1).isImm() ||
        M->getOperand(1).getImm() != -1)
      return Changed;
    // First if sreg is only used in and instruction fold the immediate
    // into that and.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(-1);
      M->eraseFromParent();
    }
  }

  // Delete the S_AND when its results are otherwise unused: vcc is not read
  // between it and the branch, SCC is dead, and the branch kills vcc.
  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
      MI.killsRegister(CondReg, TRI))
    A->eraseFromParent();

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // vcc = exec & exec == exec.
    // NOTE(review): the erase/S_BRANCH rewrite assumes exec is non-zero at
    // this point, so an EXECZ branch is never taken and an EXECNZ branch is
    // always taken -- confirm against pass placement.
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else {
    MI.setDesc(TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ
                               : AMDGPU::S_CBRANCH_EXECNZ));
  }

  // The rewritten branch no longer reads vcc; drop that operand and rebuild
  // the implicit operand list for the new opcode.
  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}
|
||||
|
||||
bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
|
||||
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
||||
TII = ST.getInstrInfo();
|
||||
@ -534,11 +440,6 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
|
||||
}
|
||||
break;
|
||||
|
||||
case AMDGPU::S_CBRANCH_VCCZ:
|
||||
case AMDGPU::S_CBRANCH_VCCNZ:
|
||||
MadeChange |= optimizeVccBranch(MI);
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
169
lib/Target/AMDGPU/SIPreEmitPeephole.cpp
Normal file
169
lib/Target/AMDGPU/SIPreEmitPeephole.cpp
Normal file
@ -0,0 +1,169 @@
|
||||
//===-- SIPreEmitPeephole.cpp ------------------------------------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
/// \file
|
||||
/// This pass performs the peephole optimizations before code emission.
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "AMDGPU.h"
|
||||
#include "AMDGPUSubtarget.h"
|
||||
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
||||
#include "SIInstrInfo.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
#define DEBUG_TYPE "si-pre-emit-peephole"
|
||||
|
||||
namespace {

/// Machine-function pass run just before code emission; performs final
/// peephole optimizations on machine IR. Currently it rewrites VCC-based
/// conditional branches into EXEC-based ones (see optimizeVccBranch).
class SIPreEmitPeephole : public MachineFunctionPass {
private:
  // Cached target info; both are set at the start of runOnMachineFunction.
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  // Try to rewrite "vcc = S_AND exec, -1; S_CBRANCH_VCC[N]Z" into an
  // EXEC-based branch. Returns true if MI was changed or erased.
  bool optimizeVccBranch(MachineInstr &MI) const;

public:
  static char ID;

  SIPreEmitPeephole() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // End anonymous namespace.
|
||||
|
||||
// Register the pass with the LLVM pass registry under the "si-pre-emit-peephole"
// command-line name (DEBUG_TYPE).
INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeephole::ID = 0;

// The address of ID serves as the opaque pass identifier used by
// addPass(&SIPreEmitPeepholeID) in the target's pass pipeline setup.
char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;
|
||||
|
||||
// Rewrite a VCC-conditional branch into an EXEC-conditional branch when the
// vcc value is provably "exec AND all-ones". Returns true if MI was changed
// or erased; may also erase the feeding S_AND / S_MOV instructions.
bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1
  // vcc = S_AND_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We end up with this pattern sometimes after basic block placement.
  // It happens while combining a block which assigns -1 to a saved mask and
  // another block which consumes that saved mask and then a branch.
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;

  // Scan backwards from the branch for the S_AND that defines CondReg (vcc).
  // Give up if exec is clobbered in between, if vcc is touched by anything
  // other than a full def by the expected S_AND, or after a small fixed
  // number of instructions.
  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5; // bound on the backwards scan
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
        return false;
      break;
    }
    // Track whether anything between the AND and the branch reads vcc; if so
    // the AND cannot be deleted later.
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  // A points at: CondReg = S_AND Op1, Op2. Canonicalize so exec is Op1
  // (commute in place if it appears as Op2).
  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  // An immediate second operand is only acceptable if it is -1 (all lanes).
  if (Op2.isImm() && Op2.getImm() != -1)
    return Changed;

  Register SReg; // source register that must hold -1; invalid if Op2 is imm
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    // Continue the backwards scan to find the instruction defining SReg;
    // it must be a move of the immediate -1.
    auto M = std::next(A);
    bool ReadsSreg = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed; // partial/unknown update of SReg; give up
      ReadsSreg |= M->readsRegister(SReg, TRI);
    }
    if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        M->getOperand(1).getImm() != -1)
      return Changed;
    // First if sreg is only used in and instruction fold the immediate
    // into that and.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(-1);
      M->eraseFromParent();
    }
  }

  // Delete the S_AND when its results are otherwise unused: vcc is not read
  // between it and the branch, SCC is dead, and the branch kills vcc.
  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
      MI.killsRegister(CondReg, TRI))
    A->eraseFromParent();

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // vcc = exec & exec == exec.
    // NOTE(review): the erase/S_BRANCH rewrite assumes exec is non-zero at
    // this point, so an EXECZ branch is never taken and an EXECNZ branch is
    // always taken -- confirm against pass placement.
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else {
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  // The rewritten branch no longer reads vcc; drop that operand and rebuild
  // the implicit operand list for the new opcode.
  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}
|
||||
|
||||
/// Entry point: walk every basic block and, if its first terminator is a
/// VCC-conditional branch, attempt the vcc -> exec branch rewrite.
/// \returns true if any instruction was changed or erased.
bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    // Only the block's first terminator is a candidate; blocks without
    // terminators have nothing to do.
    MachineBasicBlock::iterator TermI = MBB.getFirstTerminator();
    if (TermI == MBB.end())
      continue;

    MachineInstr &Term = *TermI;
    const unsigned Opc = Term.getOpcode();
    if (Opc == AMDGPU::S_CBRANCH_VCCZ || Opc == AMDGPU::S_CBRANCH_VCCNZ)
      Changed |= optimizeVccBranch(Term);
  }

  return Changed;
}
|
@ -1,5 +1,5 @@
|
||||
# RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-insert-skips -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass si-insert-skips -verify-machineinstrs -o - %s | FileCheck -check-prefix=W32 %s
|
||||
# RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s | FileCheck -check-prefix=W32 %s
|
||||
|
||||
---
|
||||
# GCN-LABEL: name: and_execz_mov_vccz
|
||||
|
Loading…
Reference in New Issue
Block a user