//===-- PPCRegisterInfo.cpp - PowerPC Register Information ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the PowerPC implementation of the TargetRegisterInfo
// class.
//
//===----------------------------------------------------------------------===//
#include "PPCRegisterInfo.h"
#include "PPC.h"
#include "PPCFrameLowering.h"
#include "PPCInstrBuilder.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCSubtarget.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <cstdlib>
using namespace llvm;
#define DEBUG_TYPE "reginfo"
#define GET_REGINFO_TARGET_DESC
#include "PPCGenRegisterInfo.inc"
static cl::opt<bool>
EnableBasePointer("ppc-use-base-pointer", cl::Hidden, cl::init(true),
cl::desc("Enable use of a base pointer for complex stack frames"));
static cl::opt<bool>
AlwaysBasePointer("ppc-always-use-base-pointer", cl::Hidden, cl::init(false),
cl::desc("Force the use of a base pointer in every function"));
PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST)
: PPCGenRegisterInfo(ST.isPPC64() ? PPC::LR8 : PPC::LR,
ST.isPPC64() ? 0 : 1,
ST.isPPC64() ? 0 : 1),
Subtarget(ST) {
ImmToIdxMap[PPC::LD] = PPC::LDX; ImmToIdxMap[PPC::STD] = PPC::STDX;
ImmToIdxMap[PPC::LBZ] = PPC::LBZX; ImmToIdxMap[PPC::STB] = PPC::STBX;
ImmToIdxMap[PPC::LHZ] = PPC::LHZX; ImmToIdxMap[PPC::LHA] = PPC::LHAX;
ImmToIdxMap[PPC::LWZ] = PPC::LWZX; ImmToIdxMap[PPC::LWA] = PPC::LWAX;
ImmToIdxMap[PPC::LFS] = PPC::LFSX; ImmToIdxMap[PPC::LFD] = PPC::LFDX;
ImmToIdxMap[PPC::STH] = PPC::STHX; ImmToIdxMap[PPC::STW] = PPC::STWX;
ImmToIdxMap[PPC::STFS] = PPC::STFSX; ImmToIdxMap[PPC::STFD] = PPC::STFDX;
ImmToIdxMap[PPC::ADDI] = PPC::ADD4;
ImmToIdxMap[PPC::LWA_32] = PPC::LWAX_32;
// 64-bit
ImmToIdxMap[PPC::LHA8] = PPC::LHAX8; ImmToIdxMap[PPC::LBZ8] = PPC::LBZX8;
ImmToIdxMap[PPC::LHZ8] = PPC::LHZX8; ImmToIdxMap[PPC::LWZ8] = PPC::LWZX8;
ImmToIdxMap[PPC::STB8] = PPC::STBX8; ImmToIdxMap[PPC::STH8] = PPC::STHX8;
ImmToIdxMap[PPC::STW8] = PPC::STWX8; ImmToIdxMap[PPC::STDU] = PPC::STDUX;
ImmToIdxMap[PPC::ADDI8] = PPC::ADD8;
}
/// getPointerRegClass - Return the register class to use to hold pointers.
/// This is used for addressing modes.
const TargetRegisterClass *
PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
const {
// Note that PPCInstrInfo::FoldImmediate also directly uses this Kind value
// when it checks for ZERO folding.
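// Kind 1 is used for memory base-register operands: in D-form addressing,
// r0 as the base is read as the constant 0, so those operands need the
// NO_R0/NOX0 classes below that exclude r0.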
if (Kind == 1) {
if (Subtarget.isPPC64())
return &PPC::G8RC_NOX0RegClass;
return &PPC::GPRC_NOR0RegClass;
}
if (Subtarget.isPPC64())
return &PPC::G8RCRegClass;
return &PPC::GPRCRegClass;
}
const MCPhysReg*
PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
if (Subtarget.isDarwinABI())
return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ?
CSR_Darwin64_Altivec_SaveList :
CSR_Darwin64_SaveList) :
(Subtarget.hasAltivec() ?
CSR_Darwin32_Altivec_SaveList :
CSR_Darwin32_SaveList);
return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ?
CSR_SVR464_Altivec_SaveList :
CSR_SVR464_SaveList) :
(Subtarget.hasAltivec() ?
CSR_SVR432_Altivec_SaveList :
CSR_SVR432_SaveList);
}
const uint32_t*
PPCRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
if (Subtarget.isDarwinABI())
return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ?
CSR_Darwin64_Altivec_RegMask :
CSR_Darwin64_RegMask) :
(Subtarget.hasAltivec() ?
CSR_Darwin32_Altivec_RegMask :
CSR_Darwin32_RegMask);
return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ?
CSR_SVR464_Altivec_RegMask :
CSR_SVR464_RegMask) :
(Subtarget.hasAltivec() ?
CSR_SVR432_Altivec_RegMask :
CSR_SVR432_RegMask);
}
const uint32_t*
PPCRegisterInfo::getNoPreservedMask() const {
return CSR_NoRegs_RegMask;
}
BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
const PPCFrameLowering *PPCFI =
static_cast<const PPCFrameLowering*>(MF.getTarget().getFrameLowering());
// The ZERO register is not really a register, but the representation of r0
// when used in instructions that treat r0 as the constant 0.
Reserved.set(PPC::ZERO);
Reserved.set(PPC::ZERO8);
// The FP register is also not really a register, but is the representation
// of the frame pointer register used by ISD::FRAMEADDR.
Reserved.set(PPC::FP);
Reserved.set(PPC::FP8);
// The BP register is also not really a register, but is the representation
// of the base pointer register used by setjmp.
Reserved.set(PPC::BP);
Reserved.set(PPC::BP8);
// The counter registers must be reserved so that counter-based loops can
// be correctly formed (and the mtctr instructions are not DCE'd).
Reserved.set(PPC::CTR);
Reserved.set(PPC::CTR8);
Reserved.set(PPC::R1);
Reserved.set(PPC::LR);
Reserved.set(PPC::LR8);
Reserved.set(PPC::RM);
if (!Subtarget.isDarwinABI() || !Subtarget.hasAltivec())
Reserved.set(PPC::VRSAVE);
// The SVR4 ABI reserves r2 and r13
if (Subtarget.isSVR4ABI()) {
Reserved.set(PPC::R2); // System-reserved register
Reserved.set(PPC::R13); // Small Data Area pointer register
}
// On PPC64, r13 is the thread pointer. Never allocate this register.
if (Subtarget.isPPC64()) {
Reserved.set(PPC::R13);
Reserved.set(PPC::X1);
Reserved.set(PPC::X13);
if (PPCFI->needsFP(MF))
Reserved.set(PPC::X31);
if (hasBasePointer(MF))
Reserved.set(PPC::X30);
// The 64-bit SVR4 ABI reserves r2 for the TOC pointer.
if (Subtarget.isSVR4ABI()) {
Reserved.set(PPC::X2);
}
}
if (PPCFI->needsFP(MF))
Reserved.set(PPC::R31);
if (hasBasePointer(MF))
Reserved.set(PPC::R30);
// Reserve Altivec registers when Altivec is unavailable.
if (!Subtarget.hasAltivec())
for (TargetRegisterClass::iterator I = PPC::VRRCRegClass.begin(),
IE = PPC::VRRCRegClass.end(); I != IE; ++I)
Reserved.set(*I);
return Reserved;
}
unsigned
PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
const unsigned DefaultSafety = 1;
switch (RC->getID()) {
default:
return 0;
case PPC::G8RC_NOX0RegClassID:
case PPC::GPRC_NOR0RegClassID:
case PPC::G8RCRegClassID:
case PPC::GPRCRegClassID: {
unsigned FP = TFI->hasFP(MF) ? 1 : 0;
return 32 - FP - DefaultSafety;
}
case PPC::F8RCRegClassID:
case PPC::F4RCRegClassID:
case PPC::VRRCRegClassID:
case PPC::VFRCRegClassID:
case PPC::VSLRCRegClassID:
case PPC::VSHRCRegClassID:
return 32 - DefaultSafety;
case PPC::VSRCRegClassID:
case PPC::VSFRCRegClassID:
return 64 - DefaultSafety;
case PPC::CRRCRegClassID:
return 8 - DefaultSafety;
}
}
const TargetRegisterClass*
PPCRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const {
if (Subtarget.hasVSX()) {
// With VSX, we can inflate various sub-register classes to the full VSX
// register set.
if (RC == &PPC::F8RCRegClass)
return &PPC::VSFRCRegClass;
else if (RC == &PPC::VRRCRegClass)
return &PPC::VSRCRegClass;
}
return TargetRegisterInfo::getLargestLegalSuperClass(RC);
}
//===----------------------------------------------------------------------===//
// Stack Frame Processing methods
//===----------------------------------------------------------------------===//
/// lowerDynamicAlloc - Generate the code for allocating an object in the
/// current frame. The sequence of code will be in the general form
///
/// addi R0, SP, \#frameSize ; get the address of the previous frame
/// stwux R0, SP, Rnegsize ; add and update the SP with the negated size
/// addi Rnew, SP, \#maxCallFrameSize ; get the top of the allocation
///
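/// When extra alignment is required (MaxAlign > TargetAlign), the emitted
/// 64-bit sequence instead looks roughly like:
///
/// ld R0, 0(SP) ; load the previous frame's address
/// li Rmask, -MaxAlign ; build the alignment mask
/// and Rneg, Rnegsize, Rmask ; align the negated allocation size
/// stdux R0, SP, Rneg ; grow the stack and store the back chain
/// addi Rnew, SP, \#maxCallFrameSize ; get the top of the allocation
///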
void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
// Get the instruction.
MachineInstr &MI = *II;
// Get the instruction's basic block.
MachineBasicBlock &MBB = *MI.getParent();
// Get the basic block's function.
MachineFunction &MF = *MBB.getParent();
// Get the frame info.
MachineFrameInfo *MFI = MF.getFrameInfo();
// Get the instruction info.
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
// Determine whether 64-bit pointers are used.
bool LP64 = Subtarget.isPPC64();
DebugLoc dl = MI.getDebugLoc();
// Get the maximum call stack size.
unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
// Get the total frame size.
unsigned FrameSize = MFI->getStackSize();
// Get stack alignments.
unsigned TargetAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
unsigned MaxAlign = MFI->getMaxAlignment();
assert((maxCallFrameSize & (MaxAlign-1)) == 0 &&
"Maximum call-frame size not sufficiently aligned");
// Determine the previous frame's address. If FrameSize can't be
// represented as 16 bits or we need special alignment, then we load the
// previous frame's address from 0(SP). Why not do an addis of the hi?
// Because R0 is our only safe tmp register and addi/addis treat R0 as zero.
// Constructing the constant and adding would take 3 instructions.
// Fortunately, a frame greater than 32K is rare.
const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) {
BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg)
.addReg(PPC::R31)
.addImm(FrameSize);
} else if (LP64) {
BuildMI(MBB, II, dl, TII.get(PPC::LD), Reg)
.addImm(0)
.addReg(PPC::X1);
} else {
BuildMI(MBB, II, dl, TII.get(PPC::LWZ), Reg)
.addImm(0)
.addReg(PPC::R1);
}
bool KillNegSizeReg = MI.getOperand(1).isKill();
unsigned NegSizeReg = MI.getOperand(1).getReg();
// Grow the stack and update the stack pointer link, then determine the
// address of new allocated space.
if (LP64) {
if (MaxAlign > TargetAlign) {
unsigned UnalNegSizeReg = NegSizeReg;
NegSizeReg = MF.getRegInfo().createVirtualRegister(G8RC);
// Unfortunately, there is no andi, only andi., and we can't insert that
// here because we might clobber cr0 while it is live.
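// E.g., with MaxAlign == 32 this loads -32 (all ones except the low five
// bits); ANDing the negated size with it rounds the allocation up to the
// next 32-byte boundary. The 32-bit branch below mirrors this sequence.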
BuildMI(MBB, II, dl, TII.get(PPC::LI8), NegSizeReg)
.addImm(~(MaxAlign-1));
unsigned NegSizeReg1 = NegSizeReg;
NegSizeReg = MF.getRegInfo().createVirtualRegister(G8RC);
BuildMI(MBB, II, dl, TII.get(PPC::AND8), NegSizeReg)
.addReg(UnalNegSizeReg, getKillRegState(KillNegSizeReg))
.addReg(NegSizeReg1, RegState::Kill);
KillNegSizeReg = true;
}
BuildMI(MBB, II, dl, TII.get(PPC::STDUX), PPC::X1)
.addReg(Reg, RegState::Kill)
.addReg(PPC::X1)
.addReg(NegSizeReg, getKillRegState(KillNegSizeReg));
BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), MI.getOperand(0).getReg())
.addReg(PPC::X1)
.addImm(maxCallFrameSize);
} else {
if (MaxAlign > TargetAlign) {
unsigned UnalNegSizeReg = NegSizeReg;
NegSizeReg = MF.getRegInfo().createVirtualRegister(GPRC);
// Unfortunately, there is no andi, only andi., and we can't insert that
// here because we might clobber cr0 while it is live.
BuildMI(MBB, II, dl, TII.get(PPC::LI), NegSizeReg)
.addImm(~(MaxAlign-1));
unsigned NegSizeReg1 = NegSizeReg;
NegSizeReg = MF.getRegInfo().createVirtualRegister(GPRC);
BuildMI(MBB, II, dl, TII.get(PPC::AND), NegSizeReg)
.addReg(UnalNegSizeReg, getKillRegState(KillNegSizeReg))
.addReg(NegSizeReg1, RegState::Kill);
KillNegSizeReg = true;
}
BuildMI(MBB, II, dl, TII.get(PPC::STWUX), PPC::R1)
.addReg(Reg, RegState::Kill)
.addReg(PPC::R1)
.addReg(NegSizeReg, getKillRegState(KillNegSizeReg));
BuildMI(MBB, II, dl, TII.get(PPC::ADDI), MI.getOperand(0).getReg())
.addReg(PPC::R1)
.addImm(maxCallFrameSize);
}
// Discard the DYNALLOC instruction.
MBB.erase(II);
}
/// lowerCRSpilling - Generate the code for spilling a CR register. Instead of
/// reserving a whole register (R0), we scrounge for one here. This generates
/// code like this:
///
/// mfcr rA ; Move the conditional register into GPR rA.
/// rlwinm rA, rA, ShiftBits, 0, 31 ; Shift the bits left so they are in CR0's slot.
/// stw rA, FI ; Store rA to the frame.
///
void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II,
unsigned FrameIndex) const {
// Get the instruction.
MachineInstr &MI = *II; // ; SPILL_CR <SrcReg>, <offset>
// Get the instruction's basic block.
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
DebugLoc dl = MI.getDebugLoc();
bool LP64 = Subtarget.isPPC64();
const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
unsigned SrcReg = MI.getOperand(0).getReg();
// We need to store the CR in the low 4-bits of the saved value. First, issue
// an MFOCRF to save all of the CRBits and, if needed, kill the SrcReg.
BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg)
.addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
// If the saved register wasn't CR0, shift the bits left so that they are in
// CR0's slot.
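// E.g., when SrcReg is CR2 the encoding value is 2, so we rotate left by 8
// and CR2's four bits land in bits 0-3, CR0's position in the mfocrf
// result.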
if (SrcReg != PPC::CR0) {
unsigned Reg1 = Reg;
Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
// rlwinm rA, rA, ShiftBits, 0, 31.
BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg)
.addReg(Reg1, RegState::Kill)
.addImm(getEncodingValue(SrcReg) * 4)
.addImm(0)
.addImm(31);
}
addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::STW8 : PPC::STW))
.addReg(Reg, RegState::Kill),
FrameIndex);
// Discard the pseudo instruction.
MBB.erase(II);
}
void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II,
unsigned FrameIndex) const {
// Get the instruction.
MachineInstr &MI = *II; // ; <DestReg> = RESTORE_CR <offset>
// Get the instruction's basic block.
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
DebugLoc dl = MI.getDebugLoc();
bool LP64 = Subtarget.isPPC64();
const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
unsigned DestReg = MI.getOperand(0).getReg();
assert(MI.definesRegister(DestReg) &&
"RESTORE_CR does not define its destination");
addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LWZ8 : PPC::LWZ),
Reg), FrameIndex);
// If the reloaded register isn't CR0, shift the bits right so that they are
// in the right CR's slot.
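// E.g., when restoring CR2 the bits were saved in CR0's slot, so rotating
// left by 32-8 == 24 (i.e., right by 8) moves them back into CR2's
// position.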
if (DestReg != PPC::CR0) {
unsigned Reg1 = Reg;
Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
unsigned ShiftBits = getEncodingValue(DestReg)*4;
// rlwinm r11, r11, 32-ShiftBits, 0, 31.
BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg)
.addReg(Reg1, RegState::Kill).addImm(32-ShiftBits).addImm(0)
.addImm(31);
}
BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTOCRF8 : PPC::MTOCRF), DestReg)
.addReg(Reg, RegState::Kill);
// Discard the pseudo instruction.
MBB.erase(II);
}
static unsigned getCRFromCRBit(unsigned SrcReg) {
unsigned Reg = 0;
if (SrcReg == PPC::CR0LT || SrcReg == PPC::CR0GT ||
SrcReg == PPC::CR0EQ || SrcReg == PPC::CR0UN)
Reg = PPC::CR0;
else if (SrcReg == PPC::CR1LT || SrcReg == PPC::CR1GT ||
SrcReg == PPC::CR1EQ || SrcReg == PPC::CR1UN)
Reg = PPC::CR1;
else if (SrcReg == PPC::CR2LT || SrcReg == PPC::CR2GT ||
SrcReg == PPC::CR2EQ || SrcReg == PPC::CR2UN)
Reg = PPC::CR2;
else if (SrcReg == PPC::CR3LT || SrcReg == PPC::CR3GT ||
SrcReg == PPC::CR3EQ || SrcReg == PPC::CR3UN)
Reg = PPC::CR3;
else if (SrcReg == PPC::CR4LT || SrcReg == PPC::CR4GT ||
SrcReg == PPC::CR4EQ || SrcReg == PPC::CR4UN)
Reg = PPC::CR4;
else if (SrcReg == PPC::CR5LT || SrcReg == PPC::CR5GT ||
SrcReg == PPC::CR5EQ || SrcReg == PPC::CR5UN)
Reg = PPC::CR5;
else if (SrcReg == PPC::CR6LT || SrcReg == PPC::CR6GT ||
SrcReg == PPC::CR6EQ || SrcReg == PPC::CR6UN)
Reg = PPC::CR6;
else if (SrcReg == PPC::CR7LT || SrcReg == PPC::CR7GT ||
SrcReg == PPC::CR7EQ || SrcReg == PPC::CR7UN)
Reg = PPC::CR7;
assert(Reg != 0 && "Invalid CR bit register");
return Reg;
}
void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II,
unsigned FrameIndex) const {
// Get the instruction.
MachineInstr &MI = *II; // ; SPILL_CRBIT <SrcReg>, <offset>
// Get the instruction's basic block.
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
DebugLoc dl = MI.getDebugLoc();
bool LP64 = Subtarget.isPPC64();
const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
unsigned SrcReg = MI.getOperand(0).getReg();
BuildMI(MBB, II, dl, TII.get(TargetOpcode::KILL),
getCRFromCRBit(SrcReg))
.addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), Reg)
.addReg(getCRFromCRBit(SrcReg));
// If the saved register wasn't CR0LT, shift the bits left so that the bit to
// store is the first one. Mask all but that bit.
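// The encoding value of a CR bit is its index within the 32-bit CR image,
// so rotating left by it makes the spilled bit the MSB, and the 0,0 mask
// below clears the other 31 bits.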
unsigned Reg1 = Reg;
Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
// rlwinm rA, rA, ShiftBits, 0, 0.
BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWINM8 : PPC::RLWINM), Reg)
.addReg(Reg1, RegState::Kill)
.addImm(getEncodingValue(SrcReg))
.addImm(0).addImm(0);
addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::STW8 : PPC::STW))
.addReg(Reg, RegState::Kill),
FrameIndex);
// Discard the pseudo instruction.
MBB.erase(II);
}
void PPCRegisterInfo::lowerCRBitRestore(MachineBasicBlock::iterator II,
unsigned FrameIndex) const {
// Get the instruction.
MachineInstr &MI = *II; // ; <DestReg> = RESTORE_CRBIT <offset>
// Get the instruction's basic block.
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
DebugLoc dl = MI.getDebugLoc();
bool LP64 = Subtarget.isPPC64();
const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
unsigned Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
unsigned DestReg = MI.getOperand(0).getReg();
assert(MI.definesRegister(DestReg) &&
"RESTORE_CRBIT does not define its destination");
addFrameReference(BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::LWZ8 : PPC::LWZ),
Reg), FrameIndex);
BuildMI(MBB, II, dl, TII.get(TargetOpcode::IMPLICIT_DEF), DestReg);
unsigned RegO = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MFOCRF8 : PPC::MFOCRF), RegO)
.addReg(getCRFromCRBit(DestReg));
unsigned ShiftBits = getEncodingValue(DestReg);
// rlwimi r11, r10, 32-ShiftBits, ..., ...
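// The reloaded bit sits in the MSB of Reg; rotating by 32-ShiftBits moves
// it to bit position ShiftBits, and the ShiftBits,ShiftBits mask restricts
// the insert to that single bit of RegO.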
BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::RLWIMI8 : PPC::RLWIMI), RegO)
.addReg(RegO, RegState::Kill).addReg(Reg, RegState::Kill)
.addImm(ShiftBits ? 32-ShiftBits : 0)
.addImm(ShiftBits).addImm(ShiftBits);
BuildMI(MBB, II, dl, TII.get(LP64 ? PPC::MTOCRF8 : PPC::MTOCRF),
getCRFromCRBit(DestReg))
.addReg(RegO, RegState::Kill)
// Make sure we have a use dependency all the way through this
// sequence of instructions. We can't have the other bits in the CR
// modified in between the mfocrf and the mtocrf.
.addReg(getCRFromCRBit(DestReg), RegState::Implicit);
// Discard the pseudo instruction.
MBB.erase(II);
}
void PPCRegisterInfo::lowerVRSAVESpilling(MachineBasicBlock::iterator II,
unsigned FrameIndex) const {
// Get the instruction.
MachineInstr &MI = *II; // ; SPILL_VRSAVE <SrcReg>, <offset>
// Get the instruction's basic block.
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
DebugLoc dl = MI.getDebugLoc();
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
unsigned Reg = MF.getRegInfo().createVirtualRegister(GPRC);
unsigned SrcReg = MI.getOperand(0).getReg();
BuildMI(MBB, II, dl, TII.get(PPC::MFVRSAVEv), Reg)
.addReg(SrcReg, getKillRegState(MI.getOperand(0).isKill()));
addFrameReference(BuildMI(MBB, II, dl, TII.get(PPC::STW))
.addReg(Reg, RegState::Kill),
FrameIndex);
// Discard the pseudo instruction.
MBB.erase(II);
}
void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II,
unsigned FrameIndex) const {
// Get the instruction.
MachineInstr &MI = *II; // ; <DestReg> = RESTORE_VRSAVE <offset>
// Get the instruction's basic block.
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
DebugLoc dl = MI.getDebugLoc();
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
unsigned Reg = MF.getRegInfo().createVirtualRegister(GPRC);
unsigned DestReg = MI.getOperand(0).getReg();
assert(MI.definesRegister(DestReg) &&
"RESTORE_VRSAVE does not define its destination");
addFrameReference(BuildMI(MBB, II, dl, TII.get(PPC::LWZ),
Reg), FrameIndex);
BuildMI(MBB, II, dl, TII.get(PPC::MTVRSAVEv), DestReg)
.addReg(Reg, RegState::Kill);
// Discard the pseudo instruction.
MBB.erase(II);
}
bool
PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
unsigned Reg, int &FrameIdx) const {
// For the nonvolatile condition registers (CR2, CR3, CR4) in an SVR4
// ABI, return true to prevent allocating an additional frame slot.
// For 64-bit, the CR save area is at SP+8; the value of FrameIdx = 0
// is arbitrary and will be subsequently ignored. For 32-bit, we have
// previously created the stack slot if needed, so return its FrameIdx.
if (Subtarget.isSVR4ABI() && PPC::CR2 <= Reg && Reg <= PPC::CR4) {
if (Subtarget.isPPC64())
FrameIdx = 0;
else {
const PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
FrameIdx = FI->getCRSpillFrameIndex();
}
return true;
}
return false;
}
// Figure out if the offset in the instruction must be a multiple of 4.
// This is true for instructions like "STD".
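// These are the DS-form instructions, whose 14-bit displacement field is
// implicitly scaled by 4 by the hardware.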
static bool usesIXAddr(const MachineInstr &MI) {
unsigned OpC = MI.getOpcode();
switch (OpC) {
default:
return false;
case PPC::LWA:
case PPC::LWA_32:
case PPC::LD:
case PPC::STD:
return true;
}
}
// Return the OffsetOperandNo given the FIOperandNum (and the instruction).
static unsigned getOffsetONFromFION(const MachineInstr &MI,
unsigned FIOperandNum) {
// Take into account whether it's an add or mem instruction
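// (memory forms keep the offset at operand 1 and the frame index at
// operand 2; add-immediate forms are the reverse)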
unsigned OffsetOperandNo = (FIOperandNum == 2) ? 1 : 2;
if (MI.isInlineAsm())
OffsetOperandNo = FIOperandNum-1;
return OffsetOperandNo;
}
void
PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
assert(SPAdj == 0 && "Unexpected");
// Get the instruction.
MachineInstr &MI = *II;
// Get the instruction's basic block.
MachineBasicBlock &MBB = *MI.getParent();
// Get the basic block's function.
MachineFunction &MF = *MBB.getParent();
// Get the instruction info.
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
// Get the frame info.
MachineFrameInfo *MFI = MF.getFrameInfo();
DebugLoc dl = MI.getDebugLoc();
unsigned OffsetOperandNo = getOffsetONFromFION(MI, FIOperandNum);
// Get the frame index.
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
// Get the frame pointer save index. Users of this index are primarily
// DYNALLOC instructions.
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
int FPSI = FI->getFramePointerSaveIndex();
// Get the instruction opcode.
unsigned OpC = MI.getOpcode();
// Special case for dynamic alloca.
if (FPSI && FrameIndex == FPSI &&
(OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) {
lowerDynamicAlloc(II);
return;
}
// Special case for pseudo-ops SPILL_CR and RESTORE_CR, etc.
if (OpC == PPC::SPILL_CR) {
lowerCRSpilling(II, FrameIndex);
return;
} else if (OpC == PPC::RESTORE_CR) {
lowerCRRestore(II, FrameIndex);
return;
} else if (OpC == PPC::SPILL_CRBIT) {
lowerCRBitSpilling(II, FrameIndex);
return;
} else if (OpC == PPC::RESTORE_CRBIT) {
lowerCRBitRestore(II, FrameIndex);
return;
} else if (OpC == PPC::SPILL_VRSAVE) {
lowerVRSAVESpilling(II, FrameIndex);
return;
} else if (OpC == PPC::RESTORE_VRSAVE) {
lowerVRSAVERestore(II, FrameIndex);
return;
}
// Replace the FrameIndex with the base register, GPR1 (SP) or GPR31 (FP).
MI.getOperand(FIOperandNum).ChangeToRegister(
FrameIndex < 0 ? getBaseRegister(MF) : getFrameRegister(MF), false);
// Figure out if the offset in the instruction is shifted right two bits.
bool isIXAddr = usesIXAddr(MI);
// If the instruction is not present in ImmToIdxMap, then it has no immediate
// form (and must be r+r).
bool noImmForm = !MI.isInlineAsm() && !ImmToIdxMap.count(OpC);
// Now add the frame object offset to the offset from r1.
int Offset = MFI->getObjectOffset(FrameIndex);
Offset += MI.getOperand(OffsetOperandNo).getImm();
// If we're not using a Frame Pointer that has been set to the value of the
// SP before having the stack size subtracted from it, then add the stack size
// to Offset to get the correct offset.
// Naked functions have stack size 0, although getStackSize may not reflect that
// because we didn't call all the pieces that compute it for naked functions.
if (!MF.getFunction()->getAttributes().
hasAttribute(AttributeSet::FunctionIndex, Attribute::Naked)) {
if (!(hasBasePointer(MF) && FrameIndex < 0))
Offset += MFI->getStackSize();
}
// If we can, encode the offset directly into the instruction. If this is a
// normal PPC "ri" instruction, any 16-bit value can be safely encoded. If
// this is a PPC64 "ix" instruction, only a 16-bit value with the low two bits
// clear can be encoded. This is extremely uncommon, because normally you
// only "std" to a stack slot that is at least 4-byte aligned, but it can
// happen in invalid code.
assert(OpC != PPC::DBG_VALUE &&
"This should be handle in a target independent way");
if (!noImmForm && isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0)) {
MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
return;
}
// The offset doesn't fit in the instruction's immediate field; materialize
// it in a new virtual register (scavenged at register allocation time).
bool is64Bit = Subtarget.isPPC64();
const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
const TargetRegisterClass *RC = is64Bit ? G8RC : GPRC;
unsigned SRegHi = MF.getRegInfo().createVirtualRegister(RC),
SReg = MF.getRegInfo().createVirtualRegister(RC);
// Insert a set of rA with the full offset value before the ld, st, or add
BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::LIS8 : PPC::LIS), SRegHi)
.addImm(Offset >> 16);
BuildMI(MBB, II, dl, TII.get(is64Bit ? PPC::ORI8 : PPC::ORI), SReg)
.addReg(SRegHi, RegState::Kill)
.addImm(Offset);
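// E.g., for Offset == 0x12345678 this emits lis SRegHi, 0x1234 followed by
// ori SReg, SRegHi, 0x5678; only the low 16 bits of the ori immediate are
// encoded.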
// Convert into indexed form of the instruction:
//
// sth 0:rA, 1:imm 2:(rB) ==> sthx 0:rA, 2:rB, 1:r0
// addi 0:rA, 1:rB, 2:imm ==> add 0:rA, 1:rB, 2:r0
unsigned OperandBase;
if (noImmForm)
OperandBase = 1;
else if (OpC != TargetOpcode::INLINEASM) {
assert(ImmToIdxMap.count(OpC) &&
"No indexed form of load or store available!");
unsigned NewOpcode = ImmToIdxMap.find(OpC)->second;
MI.setDesc(TII.get(NewOpcode));
OperandBase = 1;
} else {
OperandBase = OffsetOperandNo;
}
unsigned StackReg = MI.getOperand(FIOperandNum).getReg();
MI.getOperand(OperandBase).ChangeToRegister(StackReg, false);
MI.getOperand(OperandBase + 1).ChangeToRegister(SReg, false, false, true);
}
unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
if (!Subtarget.isPPC64())
return TFI->hasFP(MF) ? PPC::R31 : PPC::R1;
else
return TFI->hasFP(MF) ? PPC::X31 : PPC::X1;
}
unsigned PPCRegisterInfo::getBaseRegister(const MachineFunction &MF) const {
if (!hasBasePointer(MF))
return getFrameRegister(MF);
return Subtarget.isPPC64() ? PPC::X30 : PPC::R30;
}
bool PPCRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
if (!EnableBasePointer)
return false;
if (AlwaysBasePointer)
return true;
// If we need to realign the stack, then the stack pointer can no longer
// serve as an offset into the caller's stack space. As a result, we need a
// base pointer.
return needsStackRealignment(MF);
}
bool PPCRegisterInfo::canRealignStack(const MachineFunction &MF) const {
if (MF.getFunction()->hasFnAttribute("no-realign-stack"))
return false;
return true;
}
bool PPCRegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
const MachineFrameInfo *MFI = MF.getFrameInfo();
const Function *F = MF.getFunction();
unsigned StackAlign = MF.getTarget().getFrameLowering()->getStackAlignment();
bool requiresRealignment =
((MFI->getMaxAlignment() > StackAlign) ||
F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
Attribute::StackAlignment));
return requiresRealignment && canRealignStack(MF);
}
/// Returns true if the instruction's frame index
/// reference would be better served by a base register other than FP
/// or SP. Used by LocalStackFrameAllocation to determine which frame index
/// references it should create new base registers for.
bool PPCRegisterInfo::
needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
assert(Offset < 0 && "Local offset must be negative");
// It's the load/store FI references that cause issues, as it can be difficult
// to materialize the offset if it won't fit in the literal field. Estimate
// based on the size of the local frame and some conservative assumptions
// about the rest of the stack frame (note, this is pre-regalloc, so
// we don't know everything for certain yet) whether this offset is likely
// to be out of range of the immediate. Return true if so.
// We only generate virtual base registers for loads and stores that have
// an r+i form. Return false for everything else.
unsigned OpC = MI->getOpcode();
if (!ImmToIdxMap.count(OpC))
return false;
// Don't generate a new virtual base register just to add zero to it.
if ((OpC == PPC::ADDI || OpC == PPC::ADDI8) &&
MI->getOperand(2).getImm() == 0)
return false;
MachineBasicBlock &MBB = *MI->getParent();
MachineFunction &MF = *MBB.getParent();
const PPCFrameLowering *PPCFI =
static_cast<const PPCFrameLowering*>(MF.getTarget().getFrameLowering());
unsigned StackEst =
PPCFI->determineFrameLayout(MF, false, true);
// If we likely don't need a stack frame, then we probably don't need a
// virtual base register either.
if (!StackEst)
return false;
// Estimate an offset from the stack pointer.
// The incoming offset is relating to the SP at the start of the function,
// but when we access the local it'll be relative to the SP after local
// allocation, so adjust our SP-relative offset by that allocation size.
Offset += StackEst;
// The frame pointer will point to the end of the stack, so estimate the
// offset as the difference between the object offset and the FP location.
return !isFrameOffsetLegal(MI, Offset);
}
/// Insert defining instruction(s) for BaseReg to
/// be a pointer to FrameIdx at the beginning of the basic block.
void PPCRegisterInfo::
materializeFrameBaseRegister(MachineBasicBlock *MBB,
unsigned BaseReg, int FrameIdx,
int64_t Offset) const {
unsigned ADDriOpc = Subtarget.isPPC64() ? PPC::ADDI8 : PPC::ADDI;
MachineBasicBlock::iterator Ins = MBB->begin();
DebugLoc DL; // Defaults to "unknown"
if (Ins != MBB->end())
DL = Ins->getDebugLoc();
const MachineFunction &MF = *MBB->getParent();
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
const MCInstrDesc &MCID = TII.get(ADDriOpc);
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
MRI.constrainRegClass(BaseReg, TII.getRegClass(MCID, 0, this, MF));
BuildMI(*MBB, Ins, DL, MCID, BaseReg)
.addFrameIndex(FrameIdx).addImm(Offset);
}
void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
int64_t Offset) const {
unsigned FIOperandNum = 0;
while (!MI.getOperand(FIOperandNum).isFI()) {
++FIOperandNum;
assert(FIOperandNum < MI.getNumOperands() &&
"Instr doesn't have FrameIndex operand!");
}
MI.getOperand(FIOperandNum).ChangeToRegister(BaseReg, false);
unsigned OffsetOperandNo = getOffsetONFromFION(MI, FIOperandNum);
Offset += MI.getOperand(OffsetOperandNo).getImm();
MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
const MCInstrDesc &MCID = MI.getDesc();
MachineRegisterInfo &MRI = MF.getRegInfo();
MRI.constrainRegClass(BaseReg,
TII.getRegClass(MCID, FIOperandNum, this, MF));
}
bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
int64_t Offset) const {
unsigned FIOperandNum = 0;
while (!MI->getOperand(FIOperandNum).isFI()) {
++FIOperandNum;
assert(FIOperandNum < MI->getNumOperands() &&
"Instr doesn't have FrameIndex operand!");
}
unsigned OffsetOperandNo = getOffsetONFromFION(*MI, FIOperandNum);
Offset += MI->getOperand(OffsetOperandNo).getImm();
return MI->getOpcode() == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm
(isInt<16>(Offset) && (!usesIXAddr(*MI) || (Offset & 3) == 0));
}