mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 11:13:28 +01:00
[NVPTX] Move NVPTXPeephole after NVPTXPrologEpilogPass
Summary: The offset of a frame index is calculated by NVPTXPrologEpilogPass. Before that pass runs, the correct offset of stack objects cannot be obtained, which leads to wrong offsets if there are more than 2 frame objects. This patch moves NVPTXPeephole after NVPTXPrologEpilogPass. Because the frame index is already replaced by %VRFrame in NVPTXPrologEpilogPass, we check the VRFrame register instead, and try to remove VRFrame if it has no uses after the NVPTXPeephole pass. Patch by Xuetian Weng. Test Plan: Strengthened test/CodeGen/NVPTX/local-stack-frame.ll to check the offset calculation based on SP and SPL. Reviewers: jholewinski, jingyue Reviewed By: jingyue Subscribers: jholewinski, llvm-commits Differential Revision: http://reviews.llvm.org/D10853 llvm-svn: 241185
This commit is contained in:
parent
2cbe4a84c5
commit
bf15de754a
@ -22,7 +22,7 @@
|
||||
// This peephole pass optimizes these cases, for example
|
||||
//
|
||||
// It will transform the following pattern
|
||||
// %vreg0<def> = LEA_ADDRi64 <fi#0>, 4
|
||||
// %vreg0<def> = LEA_ADDRi64 %VRFrame, 4
|
||||
// %vreg1<def> = cvta_to_local_yes_64 %vreg0
|
||||
//
|
||||
// into
|
||||
@ -36,7 +36,6 @@
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
#include "llvm/CodeGen/MachineFrameInfo.h"
|
||||
#include "llvm/Target/TargetRegisterInfo.h"
|
||||
#include "llvm/Target/TargetInstrInfo.h"
|
||||
|
||||
@ -96,7 +95,7 @@ static bool isCVTAToLocalCombinationCandidate(MachineInstr &Root) {
|
||||
|
||||
// Check the LEA_ADDRi operand is Frame index
|
||||
auto &BaseAddrOp = GenericAddrDef->getOperand(1);
|
||||
if (BaseAddrOp.getType() == MachineOperand::MO_FrameIndex) {
|
||||
if (BaseAddrOp.isReg() && BaseAddrOp.getReg() == NVPTX::VRFrame) {
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -110,16 +109,11 @@ static void CombineCVTAToLocal(MachineInstr &Root) {
|
||||
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
|
||||
auto &Prev = *MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
|
||||
|
||||
// Get the correct offset
|
||||
int FrameIndex = Prev.getOperand(1).getIndex();
|
||||
int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
|
||||
Prev.getOperand(2).getImm();
|
||||
|
||||
MachineInstrBuilder MIB =
|
||||
BuildMI(MF, Root.getDebugLoc(), TII->get(Prev.getOpcode()),
|
||||
Root.getOperand(0).getReg())
|
||||
.addReg(NVPTX::VRFrameLocal)
|
||||
.addOperand(MachineOperand::CreateImm(Offset));
|
||||
.addOperand(Prev.getOperand(2));
|
||||
|
||||
MBB.insert((MachineBasicBlock::iterator)&Root, MIB);
|
||||
|
||||
@ -145,6 +139,15 @@ bool NVPTXPeephole::runOnMachineFunction(MachineFunction &MF) {
|
||||
}
|
||||
} // Instruction
|
||||
} // Basic Block
|
||||
|
||||
// Remove unnecessary %VRFrame = cvta.local %VRFrameLocal
|
||||
const auto &MRI = MF.getRegInfo();
|
||||
if (MRI.use_empty(NVPTX::VRFrame)) {
|
||||
if (auto MI = MRI.getUniqueVRegDef(NVPTX::VRFrame)) {
|
||||
MI->eraseFromParentAndMarkDBGValuesForRemoval();
|
||||
}
|
||||
}
|
||||
|
||||
return Changed;
|
||||
}
|
||||
|
||||
|
@ -205,13 +205,15 @@ bool NVPTXPassConfig::addInstSelector() {
|
||||
if (!ST.hasImageHandles())
|
||||
addPass(createNVPTXReplaceImageHandlesPass());
|
||||
|
||||
addPass(createNVPTXPeephole());
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void NVPTXPassConfig::addPostRegAlloc() {
|
||||
addPass(createNVPTXPrologEpilogPass(), false);
|
||||
// NVPTXPrologEpilogPass calculates frame object offsets and replaces frame
|
||||
// index with the VRFrame register. NVPTXPeephole needs to be run after that and
|
||||
// will replace VRFrame with VRFrameLocal when possible.
|
||||
addPass(createNVPTXPeephole());
|
||||
}
|
||||
|
||||
FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
|
||||
|
@ -59,10 +59,16 @@ define void @foo3(i32 %a) {
|
||||
|
||||
; PTX32: cvta.local.u32 %SP, %SPL;
|
||||
; PTX32: add.u32 {{%r[0-9]+}}, %SP, 0;
|
||||
; PTX32: add.u32 {{%r[0-9]+}}, %SPL, 0;
|
||||
; PTX32: add.u32 {{%r[0-9]+}}, %SP, 4;
|
||||
; PTX32: add.u32 {{%r[0-9]+}}, %SPL, 4;
|
||||
; PTX32: st.local.u32 [{{%r[0-9]+}}], {{%r[0-9]+}}
|
||||
; PTX32: st.local.u32 [{{%r[0-9]+}}], {{%r[0-9]+}}
|
||||
; PTX64: cvta.local.u64 %SP, %SPL;
|
||||
; PTX64: add.u64 {{%rd[0-9]+}}, %SP, 0;
|
||||
; PTX64: add.u64 {{%rd[0-9]+}}, %SPL, 0;
|
||||
; PTX64: add.u64 {{%rd[0-9]+}}, %SP, 4;
|
||||
; PTX64: add.u64 {{%rd[0-9]+}}, %SPL, 4;
|
||||
; PTX64: st.local.u32 [{{%rd[0-9]+}}], {{%r[0-9]+}}
|
||||
; PTX64: st.local.u32 [{{%rd[0-9]+}}], {{%r[0-9]+}}
|
||||
define void @foo4() {
|
||||
|
Loading…
Reference in New Issue
Block a user