//===-- X86ISelSimple.cpp - A simple instruction selector for x86 ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file was developed by the LLVM research group and is distributed under
// the University of Illinois Open Source License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines a simple peephole instruction selector for the x86 target
//
//===----------------------------------------------------------------------===//

#include "X86.h"
|
2002-11-17 22:11:55 +01:00
|
|
|
#include "X86InstrBuilder.h"
|
2003-10-23 18:22:08 +02:00
|
|
|
#include "X86InstrInfo.h"
|
|
|
|
#include "llvm/Constants.h"
|
|
|
|
#include "llvm/DerivedTypes.h"
|
2002-10-26 00:55:53 +02:00
|
|
|
#include "llvm/Function.h"
|
2003-05-13 22:21:19 +02:00
|
|
|
#include "llvm/Instructions.h"
|
2003-10-23 18:22:08 +02:00
|
|
|
#include "llvm/Pass.h"
|
2004-06-20 09:49:54 +02:00
|
|
|
#include "llvm/CodeGen/IntrinsicLowering.h"
|
2003-10-23 18:22:08 +02:00
|
|
|
#include "llvm/CodeGen/MachineConstantPool.h"
|
|
|
|
#include "llvm/CodeGen/MachineFrameInfo.h"
|
2002-10-29 18:43:55 +01:00
|
|
|
#include "llvm/CodeGen/MachineFunction.h"
|
2002-12-25 06:13:53 +01:00
|
|
|
#include "llvm/CodeGen/SSARegMap.h"
|
2002-11-20 01:58:23 +01:00
|
|
|
#include "llvm/Target/MRegisterInfo.h"
|
2003-10-23 18:22:08 +02:00
|
|
|
#include "llvm/Target/TargetMachine.h"
|
2004-02-22 08:04:00 +01:00
|
|
|
#include "llvm/Support/GetElementPtrTypeIterator.h"
|
2003-05-13 22:21:19 +02:00
|
|
|
#include "llvm/Support/InstVisitor.h"
|
2004-09-02 00:55:40 +02:00
|
|
|
#include "llvm/ADT/Statistic.h"
|
2003-12-28 10:47:19 +01:00
|
|
|
using namespace llvm;
|
2003-11-11 23:41:34 +01:00
|
|
|
|
2004-02-22 20:47:26 +01:00
|
|
|
namespace {
  Statistic<>
  NumFPKill("x86-codegen", "Number of FP_REG_KILL instructions added");

  /// TypeClass - Used by the X86 backend to group LLVM types by their basic
  /// X86 representation.
  ///
  enum TypeClass {
    cByte, cShort, cInt, cFP, cLong
  };
}

/// getClass - Turn a primitive type into a "class" number which is based on the
/// size of the type, and whether or not it is floating point.
///
static inline TypeClass getClass(const Type *Ty) {
  switch (Ty->getTypeID()) {
  case Type::SByteTyID:
  case Type::UByteTyID:   return cByte;      // Byte operands are class #0
  case Type::ShortTyID:
  case Type::UShortTyID:  return cShort;     // Short operands are class #1
  case Type::IntTyID:
  case Type::UIntTyID:
  case Type::PointerTyID: return cInt;       // Ints and pointers are class #2

  case Type::FloatTyID:
  case Type::DoubleTyID:  return cFP;        // Floating Point is #3

  case Type::LongTyID:
  case Type::ULongTyID:   return cLong;      // Longs are class #4
  default:
    assert(0 && "Invalid type to getClass!");
    return cByte;  // not reached
  }
}

// getClassB - Just like getClass, but treat boolean values as bytes.
static inline TypeClass getClassB(const Type *Ty) {
  if (Ty == Type::BoolTy) return cByte;
  return getClass(Ty);
}

namespace {
  struct X86ISel : public FunctionPass, InstVisitor<X86ISel> {
    TargetMachine &TM;
    MachineFunction *F;                 // The function we are compiling into
    MachineBasicBlock *BB;              // The current MBB we are compiling
    int VarArgsFrameIndex;              // FrameIndex for start of varargs area
    int ReturnAddressIndex;             // FrameIndex for the return address

    std::map<Value*, unsigned> RegMap;  // Mapping between Values and SSA Regs

    // MBBMap - Mapping between LLVM BB -> Machine BB
    std::map<const BasicBlock*, MachineBasicBlock*> MBBMap;

    // AllocaMap - Mapping from fixed sized alloca instructions to the
    // FrameIndex for the alloca.
    std::map<AllocaInst*, unsigned> AllocaMap;

    X86ISel(TargetMachine &tm) : TM(tm), F(0), BB(0) {}

    /// runOnFunction - Top level implementation of instruction selection for
    /// the entire function.
    ///
    bool runOnFunction(Function &Fn) {
      // Lazily create a stack slot for the return address if needed.
      ReturnAddressIndex = 0;

      // First pass over the function, lower any unknown intrinsic functions
      // with the IntrinsicLowering class.
      LowerUnknownIntrinsicFunctionCalls(Fn);

      F = &MachineFunction::construct(&Fn, TM);

      // Create all of the machine basic blocks for the function...
      for (Function::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I)
        F->getBasicBlockList().push_back(MBBMap[I] = new MachineBasicBlock(I));

      BB = &F->front();

      // Copy incoming arguments off of the stack...
      LoadArgumentsToVirtualRegs(Fn);

      // If this is main, emit special code.
      if (Fn.hasExternalLinkage() && Fn.getName() == "main")
        EmitSpecialCodeForMain();

      // Instruction select everything except PHI nodes
      visit(Fn);

      // Select the PHI nodes
      SelectPHINodes();

      // Insert the FP_REG_KILL instructions into blocks that need them.
      InsertFPRegKills();

      RegMap.clear();
      MBBMap.clear();
      AllocaMap.clear();
      F = 0;
      // We always build a machine code representation for the function
      return true;
    }

    virtual const char *getPassName() const {
      return "X86 Simple Instruction Selection";
    }

    /// EmitSpecialCodeForMain - Emit any code that needs to be executed only in
    /// the main function.
    void EmitSpecialCodeForMain();

    /// visitBasicBlock - This method is called when we are visiting a new basic
    /// block.  It makes the MachineBasicBlock that was created for this block
    /// in runOnFunction the current block to emit code into.  Subsequent
    /// visit* methods will be invoked for all instructions in the basic block.
    ///
    void visitBasicBlock(BasicBlock &LLVM_BB) {
      BB = MBBMap[&LLVM_BB];
    }

    /// LowerUnknownIntrinsicFunctionCalls - This performs a prepass over the
    /// function, lowering any calls to unknown intrinsic functions into the
    /// equivalent LLVM code.
    ///
    void LowerUnknownIntrinsicFunctionCalls(Function &F);

    /// LoadArgumentsToVirtualRegs - Load all of the arguments to this function
    /// from the stack into virtual registers.
    ///
    void LoadArgumentsToVirtualRegs(Function &F);

    /// SelectPHINodes - Insert machine code to generate phis.  This is tricky
    /// because we have to generate our sources into the source basic blocks,
    /// not the current one.
    ///
    void SelectPHINodes();

    /// InsertFPRegKills - Insert FP_REG_KILL instructions into basic blocks
    /// that need them.  This only occurs due to the floating point stackifier
    /// not being aggressive enough to handle arbitrary global stackification.
    ///
    void InsertFPRegKills();

    // Visitation methods for various instructions.  These methods simply emit
    // fixed X86 code for each instruction.
    //

    // Control flow operators
    void visitReturnInst(ReturnInst &RI);
    void visitBranchInst(BranchInst &BI);
    void visitUnreachableInst(UnreachableInst &UI) {}

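    /// ValueRecord - Everything doCall needs to know about one value passed
    /// to or returned from a call: either the LLVM Value itself, or a virtual
    /// register number together with the type of the value it holds.
    ///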
    struct ValueRecord {
      Value *Val;
      unsigned Reg;
      const Type *Ty;
      ValueRecord(unsigned R, const Type *T) : Val(0), Reg(R), Ty(T) {}
      ValueRecord(Value *V) : Val(V), Reg(0), Ty(V->getType()) {}
    };

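    /// doCall - Emit code for a function call: Ret describes where any return
    /// value should be placed, CallMI is the call instruction itself, and
    /// Args lists the outgoing arguments.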
    void doCall(const ValueRecord &Ret, MachineInstr *CallMI,
                const std::vector<ValueRecord> &Args);
    void visitCallInst(CallInst &I);
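    /// visitIntrinsicCall - Emit code directly for the intrinsics this
    /// selector knows how to handle; anything else has already been turned
    /// into ordinary LLVM code by LowerUnknownIntrinsicFunctionCalls.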
    void visitIntrinsicCall(Intrinsic::ID ID, CallInst &I);

    // Arithmetic operators
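    // The OpcodeClass encoding is 0=Add, 1=Sub, 2=And, 3=Or, 4=Xor; the same
    // numbering is used by emitSimpleBinaryOperation and by the fall-through
    // counting over ConstantExprs in copyConstantToRegister below.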
    void visitSimpleBinary(BinaryOperator &B, unsigned OpcodeClass);
    void visitAdd(BinaryOperator &B) { visitSimpleBinary(B, 0); }
    void visitSub(BinaryOperator &B) { visitSimpleBinary(B, 1); }
    void visitMul(BinaryOperator &B);

    void visitDiv(BinaryOperator &B) { visitDivRem(B); }
    void visitRem(BinaryOperator &B) { visitDivRem(B); }
    void visitDivRem(BinaryOperator &B);

    // Bitwise operators
    void visitAnd(BinaryOperator &B) { visitSimpleBinary(B, 2); }
    void visitOr (BinaryOperator &B) { visitSimpleBinary(B, 3); }
    void visitXor(BinaryOperator &B) { visitSimpleBinary(B, 4); }

    // Comparison operators...
    void visitSetCondInst(SetCondInst &I);
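    /// EmitComparison - Emit a comparison of Op0 and Op1 for the SetCC opcode
    /// numbered OpNum, returning the condition number that the caller should
    /// actually branch or set on.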
    unsigned EmitComparison(unsigned OpNum, Value *Op0, Value *Op1,
                            MachineBasicBlock *MBB,
                            MachineBasicBlock::iterator MBBI);
    void visitSelectInst(SelectInst &SI);

    // Memory Instructions
    void visitLoadInst(LoadInst &I);
    void visitStoreInst(StoreInst &I);
    void visitGetElementPtrInst(GetElementPtrInst &I);
    void visitAllocaInst(AllocaInst &I);
    void visitMallocInst(MallocInst &I);
    void visitFreeInst(FreeInst &I);

    // Other operators
    void visitShiftInst(ShiftInst &I);
    void visitPHINode(PHINode &I) {}      // PHI nodes handled by second pass
    void visitCastInst(CastInst &I);
    void visitVANextInst(VANextInst &I);
    void visitVAArgInst(VAArgInst &I);

    void visitInstruction(Instruction &I) {
      std::cerr << "Cannot instruction select: " << I;
      abort();
    }

    /// promote32 - Make a value 32-bits wide, and put it somewhere.
    ///
    void promote32(unsigned targetReg, const ValueRecord &VR);

    /// getAddressingMode - Get the addressing mode to use to address the
    /// specified value.  The returned value should be used with addFullAddress.
    void getAddressingMode(Value *Addr, X86AddressMode &AM);

    /// getGEPIndex - This is used to fold GEP instructions into X86 addressing
    /// expressions.
    void getGEPIndex(MachineBasicBlock *MBB, MachineBasicBlock::iterator IP,
                     std::vector<Value*> &GEPOps,
                     std::vector<const Type*> &GEPTypes,
                     X86AddressMode &AM);

    /// isGEPFoldable - Return true if the specified GEP can be completely
    /// folded into the addressing mode of a load/store or lea instruction.
    bool isGEPFoldable(MachineBasicBlock *MBB,
                       Value *Src, User::op_iterator IdxBegin,
                       User::op_iterator IdxEnd, X86AddressMode &AM);

    /// emitGEPOperation - Common code shared between visitGetElementPtrInst and
    /// constant expression GEP support.
    ///
    void emitGEPOperation(MachineBasicBlock *BB, MachineBasicBlock::iterator IP,
                          Value *Src, User::op_iterator IdxBegin,
                          User::op_iterator IdxEnd, unsigned TargetReg);

    /// emitCastOperation - Common code shared between visitCastInst and
    /// constant expression cast support.
    ///
    void emitCastOperation(MachineBasicBlock *BB,
                           MachineBasicBlock::iterator IP,
                           Value *Src, const Type *DestTy, unsigned TargetReg);

    /// emitSimpleBinaryOperation - Common code shared between visitSimpleBinary
    /// and constant expression support.
    ///
    void emitSimpleBinaryOperation(MachineBasicBlock *BB,
                                   MachineBasicBlock::iterator IP,
                                   Value *Op0, Value *Op1,
                                   unsigned OperatorClass, unsigned TargetReg);

    /// emitBinaryFPOperation - This method handles emission of floating point
    /// Add (0), Sub (1), Mul (2), and Div (3) operations.
    void emitBinaryFPOperation(MachineBasicBlock *BB,
                               MachineBasicBlock::iterator IP,
                               Value *Op0, Value *Op1,
                               unsigned OperatorClass, unsigned TargetReg);

    void emitMultiply(MachineBasicBlock *BB, MachineBasicBlock::iterator IP,
                      Value *Op0, Value *Op1, unsigned TargetReg);

    void doMultiply(MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI,
                    unsigned DestReg, const Type *DestTy,
                    unsigned Op0Reg, unsigned Op1Reg);
    void doMultiplyConst(MachineBasicBlock *MBB,
                         MachineBasicBlock::iterator MBBI,
                         unsigned DestReg, const Type *DestTy,
                         unsigned Op0Reg, unsigned Op1Val);

    void emitDivRemOperation(MachineBasicBlock *BB,
                             MachineBasicBlock::iterator IP,
                             Value *Op0, Value *Op1, bool isDiv,
                             unsigned TargetReg);

    /// emitSetCCOperation - Common code shared between visitSetCondInst and
    /// constant expression support.
    ///
    void emitSetCCOperation(MachineBasicBlock *BB,
                            MachineBasicBlock::iterator IP,
                            Value *Op0, Value *Op1, unsigned Opcode,
                            unsigned TargetReg);

    /// emitShiftOperation - Common code shared between visitShiftInst and
    /// constant expression support.
    ///
    void emitShiftOperation(MachineBasicBlock *MBB,
                            MachineBasicBlock::iterator IP,
                            Value *Op, Value *ShiftAmount, bool isLeftShift,
                            const Type *ResultTy, unsigned DestReg);

    // Emit code for a 'SHLD DestReg, Op0, Op1, Amt' operation, where Amt is a
    // constant.
    void doSHLDConst(MachineBasicBlock *MBB,
                     MachineBasicBlock::iterator MBBI,
                     unsigned DestReg, unsigned Op0Reg, unsigned Op1Reg,
                     unsigned Op1Val);

    /// emitSelectOperation - Common code shared between visitSelectInst and the
    /// constant expression support.
    void emitSelectOperation(MachineBasicBlock *MBB,
                             MachineBasicBlock::iterator IP,
                             Value *Cond, Value *TrueVal, Value *FalseVal,
                             unsigned DestReg);

    /// copyConstantToRegister - Output the instructions required to put the
    /// specified constant into the specified register.
    ///
    void copyConstantToRegister(MachineBasicBlock *MBB,
                                MachineBasicBlock::iterator MBBI,
                                Constant *C, unsigned Reg);

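    /// emitUCOMr - Emit an unordered floating point compare (UCOM) of the two
    /// given FP registers.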
    void emitUCOMr(MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI,
                   unsigned LHS, unsigned RHS);

    /// makeAnotherReg - This method returns the next register number we haven't
    /// yet used.
    ///
    /// Long values are handled somewhat specially.  They are always allocated
    /// as pairs of 32 bit integer values.  The register number returned is the
    /// lower 32 bits of the long value, and the regNum+1 is the upper 32 bits
    /// of the long value.
    ///
    unsigned makeAnotherReg(const Type *Ty) {
      assert(dynamic_cast<const X86RegisterInfo*>(TM.getRegisterInfo()) &&
             "Current target doesn't have X86 reg info??");
      const X86RegisterInfo *MRI =
        static_cast<const X86RegisterInfo*>(TM.getRegisterInfo());
      if (Ty == Type::LongTy || Ty == Type::ULongTy) {
        const TargetRegisterClass *RC = MRI->getRegClassForType(Type::IntTy);
        // Create the lower part
        F->getSSARegMap()->createVirtualRegister(RC);
        // Create the upper part.
        return F->getSSARegMap()->createVirtualRegister(RC)-1;
      }

      // Add the mapping of regnumber => reg class to MachineFunction
      const TargetRegisterClass *RC = MRI->getRegClassForType(Ty);
      return F->getSSARegMap()->createVirtualRegister(RC);
    }

    /// getReg - This method turns an LLVM value into a register number.
    ///
    unsigned getReg(Value &V) { return getReg(&V); }  // Allow references
    unsigned getReg(Value *V) {
      // Just append to the end of the current bb.
      MachineBasicBlock::iterator It = BB->end();
      return getReg(V, BB, It);
    }
    unsigned getReg(Value *V, MachineBasicBlock *MBB,
                    MachineBasicBlock::iterator IPt);

    /// getFixedSizedAllocaFI - Return the frame index for a fixed sized alloca
    /// that is to be statically allocated with the initial stack frame
    /// adjustment.
    unsigned getFixedSizedAllocaFI(AllocaInst *AI);
  };
}

/// dyn_castFixedAlloca - If the specified value is a fixed size alloca
/// instruction in the entry block, return it.  Otherwise, return a null
/// pointer.
static AllocaInst *dyn_castFixedAlloca(Value *V) {
  if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
    BasicBlock *BB = AI->getParent();
    if (isa<ConstantUInt>(AI->getArraySize()) &&
        BB == &BB->getParent()->front())
      return AI;
  }
  return 0;
}

/// getReg - This method turns an LLVM value into a register number.
///
unsigned X86ISel::getReg(Value *V, MachineBasicBlock *MBB,
                         MachineBasicBlock::iterator IPt) {
  // If this operand is a constant, emit the code to copy the constant into
  // the register here...
  if (Constant *C = dyn_cast<Constant>(V)) {
    unsigned Reg = makeAnotherReg(V->getType());
    copyConstantToRegister(MBB, IPt, C, Reg);
    return Reg;
  } else if (CastInst *CI = dyn_cast<CastInst>(V)) {
    // Do not emit noop casts at all, unless it's a double -> float cast.
    if (getClassB(CI->getType()) == getClassB(CI->getOperand(0)->getType()) &&
        (CI->getType() != Type::FloatTy ||
         CI->getOperand(0)->getType() != Type::DoubleTy))
      return getReg(CI->getOperand(0), MBB, IPt);
  } else if (AllocaInst *AI = dyn_castFixedAlloca(V)) {
    // If the alloca address couldn't be folded into the instruction addressing,
    // emit an explicit LEA as appropriate.
    unsigned Reg = makeAnotherReg(V->getType());
    unsigned FI = getFixedSizedAllocaFI(AI);
    addFrameReference(BuildMI(*MBB, IPt, X86::LEA32r, 4, Reg), FI);
    return Reg;
  }

  unsigned &Reg = RegMap[V];
  if (Reg == 0) {
    Reg = makeAnotherReg(V->getType());
    RegMap[V] = Reg;
  }

  return Reg;
}

/// getFixedSizedAllocaFI - Return the frame index for a fixed sized alloca
/// that is to be statically allocated with the initial stack frame
/// adjustment.
unsigned X86ISel::getFixedSizedAllocaFI(AllocaInst *AI) {
  // Already computed this?
  std::map<AllocaInst*, unsigned>::iterator I = AllocaMap.lower_bound(AI);
  if (I != AllocaMap.end() && I->first == AI) return I->second;

  const Type *Ty = AI->getAllocatedType();
  ConstantUInt *CUI = cast<ConstantUInt>(AI->getArraySize());
  unsigned TySize = TM.getTargetData().getTypeSize(Ty);
  TySize *= CUI->getValue();   // Get total allocated size...
  unsigned Alignment = TM.getTargetData().getTypeAlignment(Ty);

  // Create a new stack object using the frame manager...
  int FrameIdx = F->getFrameInfo()->CreateStackObject(TySize, Alignment);
  AllocaMap.insert(I, std::make_pair(AI, FrameIdx));
  return FrameIdx;
}

/// copyConstantToRegister - Output the instructions required to put the
/// specified constant into the specified register.
///
void X86ISel::copyConstantToRegister(MachineBasicBlock *MBB,
                                     MachineBasicBlock::iterator IP,
                                     Constant *C, unsigned R) {
  if (isa<UndefValue>(C)) {
    switch (getClassB(C->getType())) {
    case cFP:
      // FIXME: SHOULD TEACH STACKIFIER ABOUT UNDEF VALUES!
      BuildMI(*MBB, IP, X86::FLD0, 0, R);
      return;
    case cLong:
      BuildMI(*MBB, IP, X86::IMPLICIT_DEF, 0, R+1);
      // FALL THROUGH
    default:
      BuildMI(*MBB, IP, X86::IMPLICIT_DEF, 0, R);
      return;
    }
  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
    unsigned Class = 0;
    switch (CE->getOpcode()) {
    case Instruction::GetElementPtr:
      emitGEPOperation(MBB, IP, CE->getOperand(0),
                       CE->op_begin()+1, CE->op_end(), R);
      return;
    case Instruction::Cast:
      emitCastOperation(MBB, IP, CE->getOperand(0), CE->getType(), R);
      return;

    case Instruction::Xor: ++Class; // FALL THROUGH
    case Instruction::Or:  ++Class; // FALL THROUGH
    case Instruction::And: ++Class; // FALL THROUGH
    case Instruction::Sub: ++Class; // FALL THROUGH
    case Instruction::Add:
      emitSimpleBinaryOperation(MBB, IP, CE->getOperand(0), CE->getOperand(1),
                                Class, R);
      return;

    case Instruction::Mul:
      emitMultiply(MBB, IP, CE->getOperand(0), CE->getOperand(1), R);
      return;

    case Instruction::Div:
    case Instruction::Rem:
      emitDivRemOperation(MBB, IP, CE->getOperand(0), CE->getOperand(1),
                          CE->getOpcode() == Instruction::Div, R);
      return;

    case Instruction::SetNE:
    case Instruction::SetEQ:
    case Instruction::SetLT:
    case Instruction::SetGT:
    case Instruction::SetLE:
    case Instruction::SetGE:
      emitSetCCOperation(MBB, IP, CE->getOperand(0), CE->getOperand(1),
                         CE->getOpcode(), R);
      return;

    case Instruction::Shl:
    case Instruction::Shr:
      emitShiftOperation(MBB, IP, CE->getOperand(0), CE->getOperand(1),
                         CE->getOpcode() == Instruction::Shl, CE->getType(), R);
      return;

    case Instruction::Select:
      emitSelectOperation(MBB, IP, CE->getOperand(0), CE->getOperand(1),
                          CE->getOperand(2), R);
      return;

    default:
      std::cerr << "Offending expr: " << *C << "\n";
      assert(0 && "Constant expression not yet handled!\n");
    }
  }

  if (C->getType()->isIntegral()) {
    unsigned Class = getClassB(C->getType());

    if (Class == cLong) {
      // Copy the value into the register pair.
      uint64_t Val = cast<ConstantInt>(C)->getRawValue();
      BuildMI(*MBB, IP, X86::MOV32ri, 1, R).addImm(Val & 0xFFFFFFFF);
      BuildMI(*MBB, IP, X86::MOV32ri, 1, R+1).addImm(Val >> 32);
      return;
    }

    assert(Class <= cInt && "Type not handled yet!");

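    // IntegralOpcodeTab - The move-immediate opcode to use for each integral
    // class, indexed by cByte, cShort, and cInt.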
    static const unsigned IntegralOpcodeTab[] = {
      X86::MOV8ri, X86::MOV16ri, X86::MOV32ri
    };

    if (C->getType() == Type::BoolTy) {
      BuildMI(*MBB, IP, X86::MOV8ri, 1, R).addImm(C == ConstantBool::True);
    } else {
      ConstantInt *CI = cast<ConstantInt>(C);
      BuildMI(*MBB, IP, IntegralOpcodeTab[Class], 1, R)
        .addImm(CI->getRawValue());
    }
  } else if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
    if (CFP->isExactlyValue(+0.0))
      BuildMI(*MBB, IP, X86::FLD0, 0, R);
    else if (CFP->isExactlyValue(+1.0))
      BuildMI(*MBB, IP, X86::FLD1, 0, R);
    else if (CFP->isExactlyValue(-0.0)) {
      unsigned Tmp = makeAnotherReg(Type::DoubleTy);
      BuildMI(*MBB, IP, X86::FLD0, 0, Tmp);
      BuildMI(*MBB, IP, X86::FCHS, 1, R).addReg(Tmp);
    } else if (CFP->isExactlyValue(-1.0)) {
      unsigned Tmp = makeAnotherReg(Type::DoubleTy);
      BuildMI(*MBB, IP, X86::FLD1, 0, Tmp);
      BuildMI(*MBB, IP, X86::FCHS, 1, R).addReg(Tmp);
    } else {  // FIXME: PI, other native values
      // FIXME: 2*PI -> LDPI + FADD

      // Otherwise we need to spill the constant to memory.
      MachineConstantPool *CP = F->getConstantPool();

      const Type *Ty = CFP->getType();

      // If a FP immediate is precise when represented as a float, we put it
      // into the constant pool as a float, even if it is statically typed as
      // a double.
      if (Ty == Type::DoubleTy)
        if (CFP->isExactlyValue((float)CFP->getValue())) {
          Ty = Type::FloatTy;
          CFP = cast<ConstantFP>(ConstantExpr::getCast(CFP, Ty));
        }

      unsigned CPI = CP->getConstantPoolIndex(CFP);

      assert((Ty == Type::FloatTy || Ty == Type::DoubleTy) &&
             "Unknown FP type!");
      unsigned LoadOpcode = Ty == Type::FloatTy ? X86::FLD32m : X86::FLD64m;
      addConstantPoolReference(BuildMI(*MBB, IP, LoadOpcode, 4, R), CPI);
    }

  } else if (isa<ConstantPointerNull>(C)) {
    // Copy zero (null pointer) to the register.
    BuildMI(*MBB, IP, X86::MOV32ri, 1, R).addImm(0);
  } else if (GlobalValue *GV = dyn_cast<GlobalValue>(C)) {
    BuildMI(*MBB, IP, X86::MOV32ri, 1, R).addGlobalAddress(GV);
  } else {
    std::cerr << "Offending constant: " << *C << "\n";
    assert(0 && "Type not handled yet!");
  }
}

/// LoadArgumentsToVirtualRegs - Load all of the arguments to this function from
/// the stack into virtual registers.
///
void X86ISel::LoadArgumentsToVirtualRegs(Function &Fn) {
  // Emit instructions to load the arguments...  On entry to a function on the
  // X86, the stack frame looks like this:
  //
  // [ESP]     -- return address
  // [ESP + 4] -- first argument (leftmost lexically)
  // [ESP + 8] -- second argument, if first argument is four bytes in size
  //    ...
  //
  unsigned ArgOffset = 0;   // Frame mechanisms handle retaddr slot
  MachineFrameInfo *MFI = F->getFrameInfo();

  for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end();
       I != E; ++I) {
    bool ArgLive = !I->use_empty();
    unsigned Reg = ArgLive ? getReg(*I) : 0;
    int FI;          // Frame object index

    switch (getClassB(I->getType())) {
    case cByte:
      if (ArgLive) {
        FI = MFI->CreateFixedObject(1, ArgOffset);
        addFrameReference(BuildMI(BB, X86::MOV8rm, 4, Reg), FI);
      }
      break;
    case cShort:
      if (ArgLive) {
        FI = MFI->CreateFixedObject(2, ArgOffset);
        addFrameReference(BuildMI(BB, X86::MOV16rm, 4, Reg), FI);
      }
      break;
    case cInt:
      if (ArgLive) {
        FI = MFI->CreateFixedObject(4, ArgOffset);
        addFrameReference(BuildMI(BB, X86::MOV32rm, 4, Reg), FI);
      }
      break;
    case cLong:
      if (ArgLive) {
        FI = MFI->CreateFixedObject(8, ArgOffset);
        addFrameReference(BuildMI(BB, X86::MOV32rm, 4, Reg), FI);
        addFrameReference(BuildMI(BB, X86::MOV32rm, 4, Reg+1), FI, 4);
      }
      ArgOffset += 4;   // longs require 4 additional bytes
      break;
    case cFP:
      if (ArgLive) {
        unsigned Opcode;
        if (I->getType() == Type::FloatTy) {
          Opcode = X86::FLD32m;
          FI = MFI->CreateFixedObject(4, ArgOffset);
        } else {
          Opcode = X86::FLD64m;
          FI = MFI->CreateFixedObject(8, ArgOffset);
        }
        addFrameReference(BuildMI(BB, Opcode, 4, Reg), FI);
      }
      if (I->getType() == Type::DoubleTy)
        ArgOffset += 4;   // doubles require 4 additional bytes
      break;
    default:
      assert(0 && "Unhandled argument type!");
    }
    ArgOffset += 4;  // Each argument takes at least 4 bytes on the stack...
  }

  // If the function takes a variable number of arguments, add a frame offset
  // for the start of the first vararg value... this is used to expand
  // llvm.va_start.
  if (Fn.getFunctionType()->isVarArg())
    VarArgsFrameIndex = MFI->CreateFixedObject(1, ArgOffset);

  // Finally, inform the compiler what our live-outs will be, aka, what we will
  // be returning in registers.
  if (Fn.getReturnType() != Type::VoidTy)
    switch (getClassB(Fn.getReturnType())) {
    default: assert(0 && "Unknown type!");
    case cByte:
    case cShort:
    case cInt:
      F->addLiveOut(X86::EAX);
      break;
    case cLong:
      F->addLiveOut(X86::EAX);
      F->addLiveOut(X86::EDX);
      break;
    case cFP:
      F->addLiveOut(X86::ST0);
      break;
    }
}

/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in
/// the main function.
void X86ISel::EmitSpecialCodeForMain() {
  // Switch the FPU to 64-bit precision mode for better compatibility and speed.
  int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2);
  addFrameReference(BuildMI(BB, X86::FNSTCW16m, 4), CWFrameIdx);

  // Set the high part to be 64-bit precision.
  addFrameReference(BuildMI(BB, X86::MOV8mi, 5),
                    CWFrameIdx, 1).addImm(2);
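  // (The precision control field lives in byte 1 of the x87 control word;
  // writing the value 2 there selects 53-bit-significand double precision,
  // i.e. the IEEE 64-bit double format, rather than the 80-bit extended
  // default.)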

  // Reload the modified control word now.
  addFrameReference(BuildMI(BB, X86::FLDCW16m, 4), CWFrameIdx);
}

/// SelectPHINodes - Insert machine code to generate phis. This is tricky
|
|
|
|
/// because we have to generate our sources into the source basic blocks, not
|
|
|
|
/// the current one.
|
|
|
|
///
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::SelectPHINodes() {
|
2004-06-02 07:55:25 +02:00
|
|
|
const TargetInstrInfo &TII = *TM.getInstrInfo();
|
2002-12-13 11:09:43 +01:00
|
|
|
const Function &LF = *F->getFunction(); // The LLVM function...
|
|
|
|
for (Function::const_iterator I = LF.begin(), E = LF.end(); I != E; ++I) {
|
|
|
|
const BasicBlock *BB = I;
|
2004-02-29 08:10:16 +01:00
|
|
|
MachineBasicBlock &MBB = *MBBMap[I];
|
2002-12-13 11:09:43 +01:00
|
|
|
|
|
|
|
// Loop over all of the PHI nodes in the LLVM basic block...
|
2004-02-29 08:10:16 +01:00
|
|
|
MachineBasicBlock::iterator PHIInsertPoint = MBB.begin();
|
2004-09-15 19:06:42 +02:00
|
|
|
for (BasicBlock::const_iterator I = BB->begin(); isa<PHINode>(I); ++I) {
|
|
|
|
PHINode *PN = const_cast<PHINode*>(dyn_cast<PHINode>(I));
|
2003-01-13 01:32:26 +01:00
|
|
|
|
2002-12-13 11:09:43 +01:00
|
|
|
// Create a new machine instr PHI node, and insert it.
|
2003-01-13 01:32:26 +01:00
|
|
|
unsigned PHIReg = getReg(*PN);
|
2004-02-29 08:10:16 +01:00
|
|
|
MachineInstr *PhiMI = BuildMI(MBB, PHIInsertPoint,
|
|
|
|
X86::PHI, PN->getNumOperands(), PHIReg);
|
2003-01-13 01:32:26 +01:00
|
|
|
|
|
|
|
MachineInstr *LongPhiMI = 0;
|
2004-02-29 08:10:16 +01:00
|
|
|
if (PN->getType() == Type::LongTy || PN->getType() == Type::ULongTy)
|
|
|
        LongPhiMI = BuildMI(MBB, PHIInsertPoint,
                            X86::PHI, PN->getNumOperands(), PHIReg+1);

      // PHIValues - Map of blocks to incoming virtual registers.  We use this
      // so that we only initialize one incoming value for a particular block,
      // even if the block has multiple entries in the PHI node.
      //
      std::map<MachineBasicBlock*, unsigned> PHIValues;

      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
        MachineBasicBlock *PredMBB = MBBMap[PN->getIncomingBlock(i)];
        unsigned ValReg;
        std::map<MachineBasicBlock*, unsigned>::iterator EntryIt =
          PHIValues.lower_bound(PredMBB);

        if (EntryIt != PHIValues.end() && EntryIt->first == PredMBB) {
          // We already inserted an initialization of the register for this
          // predecessor.  Recycle it.
          ValReg = EntryIt->second;
        } else {
          // Get the incoming value into a virtual register.
          //
          Value *Val = PN->getIncomingValue(i);

          // If this is a constant or GlobalValue, we may have to insert code
          // into the basic block to compute it into a virtual register.
          if (isa<Constant>(Val) && !isa<ConstantExpr>(Val)) {
            // Simple constants get emitted at the end of the basic block,
            // before any terminator instructions.  We "know" that the code to
            // move a constant into a register will never clobber any flags.
            ValReg = getReg(Val, PredMBB, PredMBB->getFirstTerminator());
          } else {
            // Because we don't want to clobber any values which might be in
            // physical registers with the computation of this constant (which
            // might be arbitrarily complex if it is a constant expression),
            // just insert the computation at the top of the basic block.
            MachineBasicBlock::iterator PI = PredMBB->begin();

            // Skip over any PHI nodes though!
            while (PI != PredMBB->end() && PI->getOpcode() == X86::PHI)
              ++PI;

            ValReg = getReg(Val, PredMBB, PI);
          }

          // Remember that we inserted a value for this PHI for this
          // predecessor.
          PHIValues.insert(EntryIt, std::make_pair(PredMBB, ValReg));
        }

        PhiMI->addRegOperand(ValReg);
        PhiMI->addMachineBasicBlockOperand(PredMBB);
        if (LongPhiMI) {
          LongPhiMI->addRegOperand(ValReg+1);
          LongPhiMI->addMachineBasicBlockOperand(PredMBB);
        }
      }

      // Now that we emitted all of the incoming values for the PHI node, make
      // sure to reposition the InsertPoint after the PHI that we just added.
      // This is needed because we might have inserted a constant into this
      // block, right after the PHIs, which is before the old insert point!
      PHIInsertPoint = LongPhiMI ? LongPhiMI : PhiMI;
      ++PHIInsertPoint;
    }
  }
}

/// RequiresFPRegKill - The floating point stackifier pass cannot insert
/// compensation code on critical edges.  As such, it requires that we kill all
/// FP registers on the exit from any blocks that either ARE critical edges, or
/// branch to a block that has incoming critical edges.
///
/// Note that this kill instruction will eventually be eliminated when
/// restrictions in the stackifier are relaxed.
///
static bool RequiresFPRegKill(const MachineBasicBlock *MBB) {
#if 0
  const BasicBlock *BB = MBB->getBasicBlock();
  for (succ_const_iterator SI = succ_begin(BB), E = succ_end(BB); SI!=E; ++SI) {
    const BasicBlock *Succ = *SI;
    pred_const_iterator PI = pred_begin(Succ), PE = pred_end(Succ);
    ++PI;  // Blocks have at least one predecessor.
    if (PI != PE) { // If it has exactly one, this isn't a critical edge.
      // If this block has more than one predecessor, check all of the
      // predecessors to see if they have multiple successors.  If so, then the
      // block we are analyzing needs an FPRegKill.
      for (PI = pred_begin(Succ); PI != PE; ++PI) {
        const BasicBlock *Pred = *PI;
        succ_const_iterator SI2 = succ_begin(Pred);
        ++SI2;  // There must be at least one successor of this block.
        if (SI2 != succ_end(Pred))
          return true;   // Yes, we must insert the kill on this edge.
      }
    }
  }
  // If we got this far, there is no need to insert the kill instruction.
  return false;
#else
  return true;
#endif
}
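
// For illustration (a made-up CFG, not from any particular test): the edge
// from A to C below is critical, because A has two successors and C has two
// predecessors, so there is no block on that edge where the stackifier could
// place compensation code:
//
//        A     D
//       / \    |
//      B   \   |
//           v  v
//             C
//
// Killing the FP registers at the bottom of A (and of D, which branches to a
// block with an incoming critical edge) sidesteps the problem at the cost of
// extra FP stack traffic.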

// InsertFPRegKills - Insert FP_REG_KILL instructions into basic blocks that
// need them.  This only occurs due to the floating point stackifier not being
// aggressive enough to handle arbitrary global stackification.
//
// Currently we insert an FP_REG_KILL instruction into each block that uses or
// defines a floating point virtual register.
//
// When the global register allocators (like linear scan) finally update live
// variable analysis, we can keep floating point values in registers across
// portions of the CFG that do not involve critical edges.  This will be a big
// win, but we are waiting on the global allocators before we can do this.
//
// With a bit of work, the floating point stackifier pass can be enhanced to
// break critical edges as needed (to make a place to put compensation code),
// but this will require some infrastructure improvements as well.
//
void X86ISel::InsertFPRegKills() {
  SSARegMap &RegMap = *F->getSSARegMap();

  for (MachineFunction::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
    for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I!=E; ++I)
      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
        MachineOperand& MO = I->getOperand(i);
        if (MO.isRegister() && MO.getReg()) {
          unsigned Reg = MO.getReg();
          if (MRegisterInfo::isVirtualRegister(Reg)) {
            unsigned RegSize = RegMap.getRegClass(Reg)->getSize();
            if (RegSize == 10 || RegSize == 8)
              goto UsesFPReg;
          }
        }
      }
    // If we haven't found an FP register use or def in this basic block, check
    // to see if any of our successors has an FP PHI node, which will cause a
    // copy to be inserted into this block.
    for (MachineBasicBlock::const_succ_iterator SI = BB->succ_begin(),
         SE = BB->succ_end(); SI != SE; ++SI) {
      MachineBasicBlock *SBB = *SI;
      for (MachineBasicBlock::iterator I = SBB->begin();
           I != SBB->end() && I->getOpcode() == X86::PHI; ++I) {
        const TargetRegisterClass *RC =
          RegMap.getRegClass(I->getOperand(0).getReg());
        if (RC->getSize() == 10 || RC->getSize() == 8)
          goto UsesFPReg;
      }
    }
    continue;
  UsesFPReg:
    // Okay, this block uses an FP register.  If the block has successors (ie,
    // it's not an unwind/return), insert the FP_REG_KILL instruction.
    if (BB->succ_size() && RequiresFPRegKill(BB)) {
      BuildMI(*BB, BB->getFirstTerminator(), X86::FP_REG_KILL, 0);
      ++NumFPKill;
    }
  }
}
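
// Note: the getSize() == 10 || getSize() == 8 tests above are how this pass
// spots floating point virtual registers.  The assumption (which holds for
// this backend, where longs live in pairs of 32-bit registers) is that only
// the FP register classes have 8- or 10-byte spill sizes.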

void X86ISel::getAddressingMode(Value *Addr, X86AddressMode &AM) {
  AM.BaseType = X86AddressMode::RegBase;
  AM.Base.Reg = 0; AM.Scale = 1; AM.IndexReg = 0; AM.Disp = 0;
  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Addr)) {
    if (isGEPFoldable(BB, GEP->getOperand(0), GEP->op_begin()+1, GEP->op_end(),
                      AM))
      return;
  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
    if (CE->getOpcode() == Instruction::GetElementPtr)
      if (isGEPFoldable(BB, CE->getOperand(0), CE->op_begin()+1, CE->op_end(),
                        AM))
        return;
  } else if (AllocaInst *AI = dyn_castFixedAlloca(Addr)) {
    AM.BaseType = X86AddressMode::FrameIndexBase;
    AM.Base.FrameIndex = getFixedSizedAllocaFI(AI);
    return;
  } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
    AM.GV = GV;
    return;
  }

  // If it's not foldable, reset addr mode.
  AM.BaseType = X86AddressMode::RegBase;
  AM.Base.Reg = getReg(Addr);
  AM.Scale = 1; AM.IndexReg = 0; AM.Disp = 0;
}
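
// For illustration, a hypothetical GEP that folds completely into a single
// x86 addressing mode of the form Base + Scale*Index + Disp (register
// assignments are made up):
//
//   %P = getelementptr [10 x int]* %A, long 0, long %i
//   %V = load int* %P
//
// could become a single load with Base = reg(%A), Scale = 4,
// IndexReg = reg(%i), and Disp = 0:
//
//   mov %EAX, DWORD PTR [%EBX + 4*%ECX]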

// canFoldSetCCIntoBranchOrSelect - Return the setcc instruction if we can fold
// it into the conditional branch or select instruction which is the only user
// of the cc instruction.  This is the case if the conditional branch is the
// only user of the setcc.  We also don't handle long arguments below, so we
// reject them here as well.
//
static SetCondInst *canFoldSetCCIntoBranchOrSelect(Value *V) {
  if (SetCondInst *SCI = dyn_cast<SetCondInst>(V))
    if (SCI->hasOneUse()) {
      Instruction *User = cast<Instruction>(SCI->use_back());
      if ((isa<BranchInst>(User) || isa<SelectInst>(User)) &&
          (getClassB(SCI->getOperand(0)->getType()) != cLong ||
           SCI->getOpcode() == Instruction::SetEQ ||
           SCI->getOpcode() == Instruction::SetNE) &&
          (isa<BranchInst>(User) || User->getOperand(0) == V))
        return SCI;
    }
  return 0;
}
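
// For example (illustrative):
//
//   %c = setlt int %X, %Y
//   br bool %c, label %T, label %F
//
// Here the setcc's only user is the branch, so it can be folded: the flags
// from the cmp feed a 'jl' directly and no byte-sized setcc result is ever
// materialized.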

// Return a fixed numbering for setcc instructions which does not depend on the
// order of the opcodes.
//
static unsigned getSetCCNumber(unsigned Opcode) {
  switch(Opcode) {
  default: assert(0 && "Unknown setcc instruction!");
  case Instruction::SetEQ: return 0;
  case Instruction::SetNE: return 1;
  case Instruction::SetLT: return 2;
  case Instruction::SetGE: return 3;
  case Instruction::SetGT: return 4;
  case Instruction::SetLE: return 5;
  }
}

// LLVM  -> X86 signed  X86 unsigned
// -----    ----------  ------------
// seteq -> sete        sete
// setne -> setne       setne
// setlt -> setl        setb
// setge -> setge       setae
// setgt -> setg        seta
// setle -> setle       setbe
// ----
//          sets        // Used by comparison with 0 optimization
//          setns
static const unsigned SetCCOpcodeTab[2][8] = {
  { X86::SETEr, X86::SETNEr, X86::SETBr, X86::SETAEr, X86::SETAr, X86::SETBEr,
    0, 0 },
  { X86::SETEr, X86::SETNEr, X86::SETLr, X86::SETGEr, X86::SETGr, X86::SETLEr,
    X86::SETSr, X86::SETNSr },
};
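
// Usage sketch: index this table as SetCCOpcodeTab[isSigned][OpNum], where
// OpNum comes from getSetCCNumber above.  For example, SetLT is number 2,
// which selects X86::SETLr for signed operands and X86::SETBr for unsigned
// ones, matching the table in the comment above.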

/// emitUCOMr - In the future when we support processors before the P6, this
/// wraps the logic for emitting an FUCOMr vs FUCOMIr.
void X86ISel::emitUCOMr(MachineBasicBlock *MBB, MachineBasicBlock::iterator IP,
                        unsigned LHS, unsigned RHS) {
  if (0) { // for processors prior to the P6
    BuildMI(*MBB, IP, X86::FUCOMr, 2).addReg(LHS).addReg(RHS);
    BuildMI(*MBB, IP, X86::FNSTSW8r, 0);
    BuildMI(*MBB, IP, X86::SAHF, 1);
  } else {
    BuildMI(*MBB, IP, X86::FUCOMIr, 2).addReg(LHS).addReg(RHS);
  }
}
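
// The pre-P6 path works because FNSTSW copies the FPU status word into AX and
// SAHF then loads AH into EFLAGS, where the ordinary setcc/jcc instructions
// can test it; FUCOMI (P6 and later) writes EFLAGS directly, saving two
// instructions.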

// EmitComparison - This function emits a comparison of the two operands,
// returning the extended setcc code to use.
unsigned X86ISel::EmitComparison(unsigned OpNum, Value *Op0, Value *Op1,
                                 MachineBasicBlock *MBB,
                                 MachineBasicBlock::iterator IP) {
  // The arguments are already supposed to be of the same type.
  const Type *CompTy = Op0->getType();
  unsigned Class = getClassB(CompTy);

  // Special case handling of: cmp R, i
  if (isa<ConstantPointerNull>(Op1)) {
    unsigned Op0r = getReg(Op0, MBB, IP);
    if (OpNum < 2)    // seteq/setne -> test
      BuildMI(*MBB, IP, X86::TEST32rr, 2).addReg(Op0r).addReg(Op0r);
    else
      BuildMI(*MBB, IP, X86::CMP32ri, 2).addReg(Op0r).addImm(0);
    return OpNum;

  } else if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
    if (Class == cByte || Class == cShort || Class == cInt) {
      unsigned Op1v = CI->getRawValue();

      // Mask off any upper bits of the constant, if there are any...
      Op1v &= (1ULL << (8 << Class)) - 1;

      // If this is a comparison against zero, emit more efficient code.  We
      // can't handle unsigned comparisons against zero unless they are == or
      // !=.  These should have been strength reduced already anyway.
      if (Op1v == 0 && (CompTy->isSigned() || OpNum < 2)) {

        // If this is a comparison against zero and the LHS is an and of a
        // register with a constant, use the test to do the and.
        if (Instruction *Op0I = dyn_cast<Instruction>(Op0))
          if (Op0I->getOpcode() == Instruction::And && Op0->hasOneUse() &&
              isa<ConstantInt>(Op0I->getOperand(1))) {
            static const unsigned TESTTab[] = {
              X86::TEST8ri, X86::TEST16ri, X86::TEST32ri
            };

            // Emit test X, i
            unsigned LHS = getReg(Op0I->getOperand(0), MBB, IP);
            unsigned Imm =
              cast<ConstantInt>(Op0I->getOperand(1))->getRawValue();
            BuildMI(*MBB, IP, TESTTab[Class], 2).addReg(LHS).addImm(Imm);

            if (OpNum == 2) return 6;   // Map jl -> js
            if (OpNum == 3) return 7;   // Map jg -> jns
            return OpNum;
          }

        unsigned Op0r = getReg(Op0, MBB, IP);
        static const unsigned TESTTab[] = {
          X86::TEST8rr, X86::TEST16rr, X86::TEST32rr
        };
        BuildMI(*MBB, IP, TESTTab[Class], 2).addReg(Op0r).addReg(Op0r);

        if (OpNum == 2) return 6;   // Map jl -> js
        if (OpNum == 3) return 7;   // Map jg -> jns
        return OpNum;
      }

      static const unsigned CMPTab[] = {
        X86::CMP8ri, X86::CMP16ri, X86::CMP32ri
      };

      unsigned Op0r = getReg(Op0, MBB, IP);
      BuildMI(*MBB, IP, CMPTab[Class], 2).addReg(Op0r).addImm(Op1v);
      return OpNum;
    } else {
      unsigned Op0r = getReg(Op0, MBB, IP);
      assert(Class == cLong && "Unknown integer class!");
      unsigned LowCst = CI->getRawValue();
      unsigned HiCst = CI->getRawValue() >> 32;
      if (OpNum < 2) {    // seteq, setne
        unsigned LoTmp = Op0r;
        if (LowCst != 0) {
          LoTmp = makeAnotherReg(Type::IntTy);
          BuildMI(*MBB, IP, X86::XOR32ri, 2, LoTmp).addReg(Op0r).addImm(LowCst);
        }
        unsigned HiTmp = Op0r+1;
        if (HiCst != 0) {
          HiTmp = makeAnotherReg(Type::IntTy);
          BuildMI(*MBB, IP, X86::XOR32ri, 2, HiTmp).addReg(Op0r+1).addImm(HiCst);
        }
        unsigned FinalTmp = makeAnotherReg(Type::IntTy);
        BuildMI(*MBB, IP, X86::OR32rr, 2, FinalTmp).addReg(LoTmp).addReg(HiTmp);
        return OpNum;
      } else {
        // Emit a sequence of code which compares the high and low parts once
        // each, then uses a conditional move to handle the overflow case.  For
        // example, a setlt for long would generate code like this:
        //
        // AL = lo(op1) < lo(op2)   // Always unsigned comparison
        // BL = hi(op1) < hi(op2)   // Signedness depends on operands
        // dest = hi(op1) == hi(op2) ? BL : AL;
        //

        // FIXME: This would be much better if we had hierarchical register
        // classes!  Until then, hardcode registers so that we can deal with
        // their aliases (because we don't have conditional byte moves).
        //
        BuildMI(*MBB, IP, X86::CMP32ri, 2).addReg(Op0r).addImm(LowCst);
        BuildMI(*MBB, IP, SetCCOpcodeTab[0][OpNum], 0, X86::AL);
        BuildMI(*MBB, IP, X86::CMP32ri, 2).addReg(Op0r+1).addImm(HiCst);
        BuildMI(*MBB, IP, SetCCOpcodeTab[CompTy->isSigned()][OpNum], 0, X86::BL);
        BuildMI(*MBB, IP, X86::IMPLICIT_DEF, 0, X86::BH);
        BuildMI(*MBB, IP, X86::IMPLICIT_DEF, 0, X86::AH);
        BuildMI(*MBB, IP, X86::CMOVE16rr, 2, X86::BX).addReg(X86::BX)
          .addReg(X86::AX);
        // NOTE: visitSetCondInst knows that the value is dumped into the BL
        // register at this point for long values...
        return OpNum;
      }
    }
  }

  unsigned Op0r = getReg(Op0, MBB, IP);

  // Special case handling of comparison against +/- 0.0
  if (ConstantFP *CFP = dyn_cast<ConstantFP>(Op1))
    if (CFP->isExactlyValue(+0.0) || CFP->isExactlyValue(-0.0)) {
      BuildMI(*MBB, IP, X86::FTST, 1).addReg(Op0r);
      BuildMI(*MBB, IP, X86::FNSTSW8r, 0);
      BuildMI(*MBB, IP, X86::SAHF, 1);
      return OpNum;
    }

  unsigned Op1r = getReg(Op1, MBB, IP);
  switch (Class) {
  default: assert(0 && "Unknown type class!");
    // Emit: cmp <var1>, <var2> (do the comparison).  We can
    // compare 8-bit with 8-bit, 16-bit with 16-bit, 32-bit with
    // 32-bit.
  case cByte:
    BuildMI(*MBB, IP, X86::CMP8rr, 2).addReg(Op0r).addReg(Op1r);
    break;
  case cShort:
    BuildMI(*MBB, IP, X86::CMP16rr, 2).addReg(Op0r).addReg(Op1r);
    break;
  case cInt:
    BuildMI(*MBB, IP, X86::CMP32rr, 2).addReg(Op0r).addReg(Op1r);
    break;
  case cFP:
    emitUCOMr(MBB, IP, Op0r, Op1r);
    break;

  case cLong:
    if (OpNum < 2) {    // seteq, setne
      unsigned LoTmp = makeAnotherReg(Type::IntTy);
      unsigned HiTmp = makeAnotherReg(Type::IntTy);
      unsigned FinalTmp = makeAnotherReg(Type::IntTy);
      BuildMI(*MBB, IP, X86::XOR32rr, 2, LoTmp).addReg(Op0r).addReg(Op1r);
      BuildMI(*MBB, IP, X86::XOR32rr, 2, HiTmp).addReg(Op0r+1).addReg(Op1r+1);
      BuildMI(*MBB, IP, X86::OR32rr,  2, FinalTmp).addReg(LoTmp).addReg(HiTmp);
      break;  // Allow the sete or setne to be generated from flags set by OR
    } else {
      // Emit a sequence of code which compares the high and low parts once
      // each, then uses a conditional move to handle the overflow case.  For
      // example, a setlt for long would generate code like this:
      //
      // AL = lo(op1) < lo(op2)   // Always unsigned comparison
      // BL = hi(op1) < hi(op2)   // Signedness depends on operands
      // dest = hi(op1) == hi(op2) ? BL : AL;
      //

      // FIXME: This would be much better if we had hierarchical register
      // classes!  Until then, hardcode registers so that we can deal with
      // their aliases (because we don't have conditional byte moves).
      //
      BuildMI(*MBB, IP, X86::CMP32rr, 2).addReg(Op0r).addReg(Op1r);
      BuildMI(*MBB, IP, SetCCOpcodeTab[0][OpNum], 0, X86::AL);
      BuildMI(*MBB, IP, X86::CMP32rr, 2).addReg(Op0r+1).addReg(Op1r+1);
      BuildMI(*MBB, IP, SetCCOpcodeTab[CompTy->isSigned()][OpNum], 0, X86::BL);
      BuildMI(*MBB, IP, X86::IMPLICIT_DEF, 0, X86::BH);
      BuildMI(*MBB, IP, X86::IMPLICIT_DEF, 0, X86::AH);
      BuildMI(*MBB, IP, X86::CMOVE16rr, 2, X86::BX).addReg(X86::BX)
        .addReg(X86::AX);
      // NOTE: visitSetCondInst knows that the value is dumped into the BL
      // register at this point for long values...
      return OpNum;
    }
  }
  return OpNum;
}
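
// For illustration: a signed comparison against zero such as
//
//   %c = setlt int %X, 0
//
// takes the TEST path above and returns the extended code 6, so callers use
// the sign flag (sets/js) instead of materializing a 'cmp %EAX, 0'.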

/// SetCC instructions - Here we just emit boilerplate code to set a byte-sized
/// register, then move it to wherever the result should be.
///
void X86ISel::visitSetCondInst(SetCondInst &I) {
  if (canFoldSetCCIntoBranchOrSelect(&I))
    return;  // Fold this into a branch or select.

  unsigned DestReg = getReg(I);
  MachineBasicBlock::iterator MII = BB->end();
  emitSetCCOperation(BB, MII, I.getOperand(0), I.getOperand(1), I.getOpcode(),
                     DestReg);
}

/// emitSetCCOperation - Common code shared between visitSetCondInst and
/// constant expression support.
///
void X86ISel::emitSetCCOperation(MachineBasicBlock *MBB,
                                 MachineBasicBlock::iterator IP,
                                 Value *Op0, Value *Op1, unsigned Opcode,
                                 unsigned TargetReg) {
  unsigned OpNum = getSetCCNumber(Opcode);
  OpNum = EmitComparison(OpNum, Op0, Op1, MBB, IP);

  const Type *CompTy = Op0->getType();
  unsigned CompClass = getClassB(CompTy);
  bool isSigned = CompTy->isSigned() && CompClass != cFP;

  if (CompClass != cLong || OpNum < 2) {
    // Handle normal comparisons with a setcc instruction...
    BuildMI(*MBB, IP, SetCCOpcodeTab[isSigned][OpNum], 0, TargetReg);
  } else {
    // Handle long comparisons by copying the value which is already in BL into
    // the register we want...
    BuildMI(*MBB, IP, X86::MOV8rr, 1, TargetReg).addReg(X86::BL);
  }
}

void X86ISel::visitSelectInst(SelectInst &SI) {
  unsigned DestReg = getReg(SI);
  MachineBasicBlock::iterator MII = BB->end();
  emitSelectOperation(BB, MII, SI.getCondition(), SI.getTrueValue(),
                      SI.getFalseValue(), DestReg);
}

/// emitSelectOperation - Common code shared between visitSelectInst and the
/// constant expression support.
void X86ISel::emitSelectOperation(MachineBasicBlock *MBB,
                                  MachineBasicBlock::iterator IP,
                                  Value *Cond, Value *TrueVal, Value *FalseVal,
                                  unsigned DestReg) {
  unsigned SelectClass = getClassB(TrueVal->getType());

  // We don't support 8-bit conditional moves.  If we have incoming constants,
  // transform them into 16-bit constants to avoid having a run-time
  // conversion.
  if (SelectClass == cByte) {
    if (Constant *T = dyn_cast<Constant>(TrueVal))
      TrueVal = ConstantExpr::getCast(T, Type::ShortTy);
    if (Constant *F = dyn_cast<Constant>(FalseVal))
      FalseVal = ConstantExpr::getCast(F, Type::ShortTy);
  }

  unsigned TrueReg  = getReg(TrueVal, MBB, IP);
  unsigned FalseReg = getReg(FalseVal, MBB, IP);
  if (TrueReg == FalseReg) {
    static const unsigned Opcode[] = {
      X86::MOV8rr, X86::MOV16rr, X86::MOV32rr, X86::FpMOV, X86::MOV32rr
    };
    BuildMI(*MBB, IP, Opcode[SelectClass], 1, DestReg).addReg(TrueReg);
    if (SelectClass == cLong)
      BuildMI(*MBB, IP, X86::MOV32rr, 1, DestReg+1).addReg(TrueReg+1);
    return;
  }

  unsigned Opcode;
  if (SetCondInst *SCI = canFoldSetCCIntoBranchOrSelect(Cond)) {
    // We successfully folded the setcc into the select instruction.
    unsigned OpNum = getSetCCNumber(SCI->getOpcode());
    OpNum = EmitComparison(OpNum, SCI->getOperand(0), SCI->getOperand(1), MBB,
                           IP);

    const Type *CompTy = SCI->getOperand(0)->getType();
    bool isSigned = CompTy->isSigned() && getClassB(CompTy) != cFP;

    // LLVM  -> X86 signed  X86 unsigned
    // -----    ----------  ------------
    // seteq -> cmovNE      cmovNE
    // setne -> cmovE       cmovE
    // setlt -> cmovGE      cmovAE
    // setge -> cmovL       cmovB
    // setgt -> cmovLE      cmovBE
    // setle -> cmovG       cmovA
    // ----
    //          cmovNS      // Used by comparison with 0 optimization
    //          cmovS

    switch (SelectClass) {
    default: assert(0 && "Unknown value class!");
    case cFP: {
      // Annoyingly, we don't have a full set of floating point conditional
      // moves.  :(
      static const unsigned OpcodeTab[2][8] = {
        { X86::FCMOVNE, X86::FCMOVE, X86::FCMOVAE, X86::FCMOVB,
          X86::FCMOVBE, X86::FCMOVA, 0, 0 },
        { X86::FCMOVNE, X86::FCMOVE, 0, 0, 0, 0, 0, 0 },
      };
      Opcode = OpcodeTab[isSigned][OpNum];

      // If opcode == 0, we hit a case that we don't support.  Output a setcc
      // and compare the result against zero.
      if (Opcode == 0) {
        unsigned CompClass = getClassB(CompTy);
        unsigned CondReg;
        if (CompClass != cLong || OpNum < 2) {
          CondReg = makeAnotherReg(Type::BoolTy);
          // Handle normal comparisons with a setcc instruction...
          BuildMI(*MBB, IP, SetCCOpcodeTab[isSigned][OpNum], 0, CondReg);
        } else {
          // Long comparisons end up in the BL register.
          CondReg = X86::BL;
        }

        BuildMI(*MBB, IP, X86::TEST8rr, 2).addReg(CondReg).addReg(CondReg);
        Opcode = X86::FCMOVE;
      }
      break;
    }
    case cByte:
    case cShort: {
      static const unsigned OpcodeTab[2][8] = {
        { X86::CMOVNE16rr, X86::CMOVE16rr, X86::CMOVAE16rr, X86::CMOVB16rr,
          X86::CMOVBE16rr, X86::CMOVA16rr, 0, 0 },
        { X86::CMOVNE16rr, X86::CMOVE16rr, X86::CMOVGE16rr, X86::CMOVL16rr,
          X86::CMOVLE16rr, X86::CMOVG16rr, X86::CMOVNS16rr, X86::CMOVS16rr },
      };
      Opcode = OpcodeTab[isSigned][OpNum];
      break;
    }
    case cInt:
    case cLong: {
      static const unsigned OpcodeTab[2][8] = {
        { X86::CMOVNE32rr, X86::CMOVE32rr, X86::CMOVAE32rr, X86::CMOVB32rr,
          X86::CMOVBE32rr, X86::CMOVA32rr, 0, 0 },
        { X86::CMOVNE32rr, X86::CMOVE32rr, X86::CMOVGE32rr, X86::CMOVL32rr,
          X86::CMOVLE32rr, X86::CMOVG32rr, X86::CMOVNS32rr, X86::CMOVS32rr },
      };
      Opcode = OpcodeTab[isSigned][OpNum];
      break;
    }
    }
  } else {
    // Get the value being branched on, and use it to set the condition codes.
    unsigned CondReg = getReg(Cond, MBB, IP);
    BuildMI(*MBB, IP, X86::TEST8rr, 2).addReg(CondReg).addReg(CondReg);
    switch (SelectClass) {
    default: assert(0 && "Unknown value class!");
    case cFP:    Opcode = X86::FCMOVE; break;
    case cByte:
    case cShort: Opcode = X86::CMOVE16rr; break;
    case cInt:
    case cLong:  Opcode = X86::CMOVE32rr; break;
    }
  }

  unsigned RealDestReg = DestReg;

  // Annoyingly enough, X86 doesn't HAVE 8-bit conditional moves.  Because of
  // this, we have to promote the incoming values to 16 bits, perform a 16-bit
  // cmove, then truncate the result.
  if (SelectClass == cByte) {
    DestReg = makeAnotherReg(Type::ShortTy);
    if (getClassB(TrueVal->getType()) == cByte) {
      // Promote the true value, by storing it into AL, and reading from AX.
      BuildMI(*MBB, IP, X86::MOV8rr, 1, X86::AL).addReg(TrueReg);
      BuildMI(*MBB, IP, X86::MOV8ri, 1, X86::AH).addImm(0);
      TrueReg = makeAnotherReg(Type::ShortTy);
      BuildMI(*MBB, IP, X86::MOV16rr, 1, TrueReg).addReg(X86::AX);
    }
    if (getClassB(FalseVal->getType()) == cByte) {
      // Promote the false value, by storing it into CL, and reading from CX.
      BuildMI(*MBB, IP, X86::MOV8rr, 1, X86::CL).addReg(FalseReg);
      BuildMI(*MBB, IP, X86::MOV8ri, 1, X86::CH).addImm(0);
      FalseReg = makeAnotherReg(Type::ShortTy);
      BuildMI(*MBB, IP, X86::MOV16rr, 1, FalseReg).addReg(X86::CX);
    }
  }

  BuildMI(*MBB, IP, Opcode, 2, DestReg).addReg(TrueReg).addReg(FalseReg);

  switch (SelectClass) {
  case cByte:
    // We did the computation with 16-bit registers.  Truncate back to our
    // result by copying into AX then copying out AL.
    BuildMI(*MBB, IP, X86::MOV16rr, 1, X86::AX).addReg(DestReg);
    BuildMI(*MBB, IP, X86::MOV8rr, 1, RealDestReg).addReg(X86::AL);
    break;
  case cLong:
    // Move the upper half of the value as well.
    BuildMI(*MBB, IP, Opcode, 2, DestReg+1).addReg(TrueReg+1).addReg(FalseReg+1);
    break;
  }
}
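
// For illustration: 'select bool %C, sbyte 1, sbyte 2' (a made-up example) has
// both constants cast to short up front, the cmove is performed on 16-bit
// registers, and the low byte is then copied back out through AX/AL to form
// the 8-bit result.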


/// promote32 - Emit instructions to turn a narrow operand into a 32-bit-wide
/// operand, in the specified target register.
///
void X86ISel::promote32(unsigned targetReg, const ValueRecord &VR) {
  bool isUnsigned = VR.Ty->isUnsigned() || VR.Ty == Type::BoolTy;

  Value *Val = VR.Val;
  const Type *Ty = VR.Ty;
  if (Val) {
    if (Constant *C = dyn_cast<Constant>(Val)) {
      Val = ConstantExpr::getCast(C, Type::IntTy);
      Ty = Type::IntTy;
    }

    // If this is a simple constant, just emit a MOVri directly to avoid the
    // copy.
    if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
      int TheVal = CI->getRawValue() & 0xFFFFFFFF;
      BuildMI(BB, X86::MOV32ri, 1, targetReg).addImm(TheVal);
      return;
    }
  }

  // Make sure we have the register number for this value...
  unsigned Reg = Val ? getReg(Val) : VR.Reg;

  switch (getClassB(Ty)) {
  case cByte:
    // Extend value into target register (8->32)
    if (isUnsigned)
      BuildMI(BB, X86::MOVZX32rr8, 1, targetReg).addReg(Reg);
    else
      BuildMI(BB, X86::MOVSX32rr8, 1, targetReg).addReg(Reg);
    break;
  case cShort:
    // Extend value into target register (16->32)
    if (isUnsigned)
      BuildMI(BB, X86::MOVZX32rr16, 1, targetReg).addReg(Reg);
    else
      BuildMI(BB, X86::MOVSX32rr16, 1, targetReg).addReg(Reg);
    break;
  case cInt:
    // Move value into target register (32->32)
    BuildMI(BB, X86::MOV32rr, 1, targetReg).addReg(Reg);
    break;
  default:
    assert(0 && "Unpromotable operand class in promote32");
  }
}
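
// For illustration: promote32(X86::EAX, ValueRecord(V)) emits 'movsx' for a
// signed sbyte or short V, 'movzx' for the unsigned variants (and bool), a
// plain 'mov' for int, and folds a ConstantInt V directly into a
// 'mov %EAX, <imm>'.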

/// 'ret' instruction - Here we are interested in meeting the x86 ABI.  As such,
/// we have the following possibilities:
///
///   ret void: No return value, simply emit a 'ret' instruction
///   ret sbyte, ubyte : Extend value into EAX and return
///   ret short, ushort: Extend value into EAX and return
///   ret int, uint    : Move value into EAX and return
///   ret pointer      : Move value into EAX and return
///   ret long, ulong  : Move value into EAX/EDX and return
///   ret float/double : Top of FP stack
///
void X86ISel::visitReturnInst(ReturnInst &I) {
  if (I.getNumOperands() == 0) {
    BuildMI(BB, X86::RET, 0); // Just emit a 'ret' instruction
    return;
  }

  Value *RetVal = I.getOperand(0);
  switch (getClassB(RetVal->getType())) {
  case cByte:   // integral return values: extend or move into EAX and return
  case cShort:
  case cInt:
    promote32(X86::EAX, ValueRecord(RetVal));
    break;
  case cFP: {                  // Floats & Doubles: Return in ST(0)
    unsigned RetReg = getReg(RetVal);
    BuildMI(BB, X86::FpSETRESULT, 1).addReg(RetReg);
    break;
  }
  case cLong: {
    unsigned RetReg = getReg(RetVal);
    BuildMI(BB, X86::MOV32rr, 1, X86::EAX).addReg(RetReg);
    BuildMI(BB, X86::MOV32rr, 1, X86::EDX).addReg(RetReg+1);
    break;
  }
  default:
    visitInstruction(I);
  }
  // Emit a 'ret' instruction
  BuildMI(BB, X86::RET, 0);
}

// getBlockAfter - Return the basic block which occurs lexically after the
// specified one.
static inline BasicBlock *getBlockAfter(BasicBlock *BB) {
  Function::iterator I = BB; ++I;  // Get iterator to next block
  return I != BB->getParent()->end() ? &*I : 0;
}
|
|
|
|
|
2002-11-02 20:45:49 +01:00
|
|
|
/// visitBranchInst - Handle conditional and unconditional branches here. Note
|
|
|
|
/// that since code layout is frozen at this point, that if we are trying to
|
|
|
|
/// jump to a block that is the immediate successor of the current block, we can
|
2003-01-16 17:43:00 +01:00
|
|
|
/// just make a fall-through (but we don't currently).
|
2002-11-02 20:45:49 +01:00
|
|
|
///
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::visitBranchInst(BranchInst &BI) {
|
2004-04-28 06:19:37 +02:00
|
|
|
// Update machine-CFG edges
|
|
|
|
BB->addSuccessor (MBBMap[BI.getSuccessor(0)]);
|
|
|
|
if (BI.isConditional())
|
|
|
|
BB->addSuccessor (MBBMap[BI.getSuccessor(1)]);
|
|
|
|
|
2003-01-16 19:07:23 +01:00
|
|
|
BasicBlock *NextBB = getBlockAfter(BI.getParent()); // BB after current one
|
|
|
|
|
|
|
|
if (!BI.isConditional()) { // Unconditional branch?
|
2004-01-30 23:13:44 +01:00
|
|
|
if (BI.getSuccessor(0) != NextBB)
|
2004-05-14 08:54:56 +02:00
|
|
|
BuildMI(BB, X86::JMP, 1).addMBB(MBBMap[BI.getSuccessor(0)]);
|
2003-01-16 17:43:00 +01:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// See if we can fold the setcc into the branch itself...
|
2004-03-31 00:39:09 +02:00
|
|
|
SetCondInst *SCI = canFoldSetCCIntoBranchOrSelect(BI.getCondition());
|
2003-01-16 17:43:00 +01:00
|
|
|
if (SCI == 0) {
|
|
|
|
// Nope, cannot fold setcc into this branch. Emit a branch on a condition
|
|
|
|
// computed some other way...
|
2002-12-28 21:24:02 +01:00
|
|
|
unsigned condReg = getReg(BI.getCondition());
|
2004-04-01 00:22:36 +02:00
|
|
|
BuildMI(BB, X86::TEST8rr, 2).addReg(condReg).addReg(condReg);
|
2003-01-16 19:07:23 +01:00
|
|
|
if (BI.getSuccessor(1) == NextBB) {
|
|
|
|
if (BI.getSuccessor(0) != NextBB)
|
2004-05-14 08:54:56 +02:00
|
|
|
BuildMI(BB, X86::JNE, 1).addMBB(MBBMap[BI.getSuccessor(0)]);
|
2003-01-16 19:07:23 +01:00
|
|
|
} else {
|
2004-05-14 08:54:56 +02:00
|
|
|
BuildMI(BB, X86::JE, 1).addMBB(MBBMap[BI.getSuccessor(1)]);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2003-01-16 19:07:23 +01:00
|
|
|
if (BI.getSuccessor(0) != NextBB)
|
2004-05-14 08:54:56 +02:00
|
|
|
BuildMI(BB, X86::JMP, 1).addMBB(MBBMap[BI.getSuccessor(0)]);
|
2003-01-16 19:07:23 +01:00
|
|
|
}
|
2003-01-16 17:43:00 +01:00
|
|
|
return;
|
2002-12-25 06:13:53 +01:00
|
|
|
}
|
2003-01-16 17:43:00 +01:00
|
|
|
|
|
|
|
unsigned OpNum = getSetCCNumber(SCI->getOpcode());
|
2003-08-24 21:19:47 +02:00
|
|
|
MachineBasicBlock::iterator MII = BB->end();
|
2003-10-23 18:22:08 +02:00
|
|
|
OpNum = EmitComparison(OpNum, SCI->getOperand(0), SCI->getOperand(1), BB,MII);
|
2003-10-19 23:09:10 +02:00
|
|
|
|
|
|
|
const Type *CompTy = SCI->getOperand(0)->getType();
|
|
|
|
bool isSigned = CompTy->isSigned() && getClassB(CompTy) != cFP;
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2003-10-19 23:09:10 +02:00
|
|
|
|
2003-01-16 17:43:00 +01:00
|
|
|
// LLVM -> X86 signed X86 unsigned
|
|
|
|
// ----- ---------- ------------
|
|
|
|
// seteq -> je je
|
|
|
|
// setne -> jne jne
|
|
|
|
// setlt -> jl jb
|
2003-01-16 19:07:23 +01:00
|
|
|
// setge -> jge jae
|
2003-01-16 17:43:00 +01:00
|
|
|
// setgt -> jg ja
|
|
|
|
// setle -> jle jbe
|
2003-10-19 23:09:10 +02:00
|
|
|
// ----
|
|
|
|
// js // Used by comparison with 0 optimization
|
|
|
|
// jns
|
|
|
|
|
|
|
|
static const unsigned OpcodeTab[2][8] = {
|
|
|
|
{ X86::JE, X86::JNE, X86::JB, X86::JAE, X86::JA, X86::JBE, 0, 0 },
|
|
|
|
{ X86::JE, X86::JNE, X86::JL, X86::JGE, X86::JG, X86::JLE,
|
|
|
|
X86::JS, X86::JNS },
|
2003-01-16 17:43:00 +01:00
|
|
|
};
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2003-01-16 19:07:23 +01:00
|
|
|
if (BI.getSuccessor(0) != NextBB) {
|
2004-05-14 08:54:56 +02:00
|
|
|
BuildMI(BB, OpcodeTab[isSigned][OpNum], 1)
|
|
|
|
.addMBB(MBBMap[BI.getSuccessor(0)]);
|
2003-01-16 19:07:23 +01:00
|
|
|
if (BI.getSuccessor(1) != NextBB)
|
2004-05-14 08:54:56 +02:00
|
|
|
BuildMI(BB, X86::JMP, 1).addMBB(MBBMap[BI.getSuccessor(1)]);
|
2003-01-16 19:07:23 +01:00
|
|
|
} else {
|
|
|
|
// Change to the inverse condition...
|
|
|
|
if (BI.getSuccessor(1) != NextBB) {
|
|
|
|
OpNum ^= 1;
|
2004-05-14 08:54:56 +02:00
|
|
|
BuildMI(BB, OpcodeTab[isSigned][OpNum], 1)
|
|
|
|
.addMBB(MBBMap[BI.getSuccessor(1)]);
|
2003-01-16 19:07:23 +01:00
|
|
|
}
|
|
|
|
}
|
2002-11-02 20:27:56 +01:00
|
|
|
}
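// A minimal sketch of the inversion trick above.  The opcode table stores
// each condition next to its complement (0=eq/1=ne, 2=lt/3=ge, 4=gt/5=le,
// 6=s/7=ns), so "OpNum ^= 1" flips a condition to its inverse.  The helper
// name below is hypothetical, written only to spell the invariant out:
//
//   static inline unsigned invertSetCCNumber(unsigned OpNum) {
//     return OpNum ^ 1;   // JE<->JNE, JL<->JGE, JG<->JLE, JS<->JNS
//   }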
|
|
|
|
|
2003-01-13 01:32:26 +01:00
|
|
|
|
|
|
|
/// doCall - This emits an abstract call instruction, setting up the arguments
|
|
|
|
/// and the return value as appropriate. For the actual function call itself,
|
|
|
|
/// it inserts the specified CallMI instruction into the stream.
|
|
|
|
///
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::doCall(const ValueRecord &Ret, MachineInstr *CallMI,
|
|
|
|
const std::vector<ValueRecord> &Args) {
|
2002-12-28 21:24:02 +01:00
|
|
|
// Count how many bytes are to be pushed on the stack...
|
|
|
|
unsigned NumBytes = 0;
|
|
|
|
|
2003-01-13 01:32:26 +01:00
|
|
|
if (!Args.empty()) {
|
|
|
|
for (unsigned i = 0, e = Args.size(); i != e; ++i)
|
|
|
|
switch (getClassB(Args[i].Ty)) {
|
2002-12-28 21:24:02 +01:00
|
|
|
case cByte: case cShort: case cInt:
|
2003-10-23 18:22:08 +02:00
|
|
|
NumBytes += 4; break;
|
2002-12-28 21:24:02 +01:00
|
|
|
case cLong:
|
2003-10-23 18:22:08 +02:00
|
|
|
NumBytes += 8; break;
|
2002-12-28 21:24:02 +01:00
|
|
|
case cFP:
|
2003-10-23 18:22:08 +02:00
|
|
|
NumBytes += Args[i].Ty == Type::FloatTy ? 4 : 8;
|
|
|
|
break;
|
2002-12-28 21:24:02 +01:00
|
|
|
default: assert(0 && "Unknown class!");
|
|
|
|
}
|
|
|
|
|
|
|
|
// Adjust the stack pointer for the new arguments...
|
2004-02-29 08:22:16 +01:00
|
|
|
BuildMI(BB, X86::ADJCALLSTACKDOWN, 1).addImm(NumBytes);
|
2002-12-28 21:24:02 +01:00
|
|
|
|
|
|
|
// Arguments go on the stack in reverse order, as specified by the ABI.
|
|
|
|
unsigned ArgOffset = 0;
|
2003-01-13 01:32:26 +01:00
|
|
|
for (unsigned i = 0, e = Args.size(); i != e; ++i) {
|
Fix a minor code-quality issue. When passing 8- and 16-bit integer constants
to function calls, we would emit dead code, like this:
int Y(int, short, double);
int X() {
Y(4, 123, 4);
}
--- Old
X:
sub %ESP, 20
mov %EAX, 4
mov DWORD PTR [%ESP], %EAX
*** mov %AX, 123
mov %AX, 123
movsx %EAX, %AX
mov DWORD PTR [%ESP + 4], %EAX
fld QWORD PTR [.CPIX_0]
fstp QWORD PTR [%ESP + 8]
call Y
mov %EAX, 0
# IMPLICIT_USE %EAX %ESP
add %ESP, 20
ret
Now we emit:
X:
sub %ESP, 20
mov %EAX, 4
mov DWORD PTR [%ESP], %EAX
mov %AX, 123
movsx %EAX, %AX
mov DWORD PTR [%ESP + 4], %EAX
fld QWORD PTR [.CPIX_0]
fstp QWORD PTR [%ESP + 8]
call Y
mov %EAX, 0
# IMPLICIT_USE %EAX %ESP
add %ESP, 20
ret
Next up, eliminate the mov AX and movsx entirely!
llvm-svn: 12026
2004-03-01 03:34:08 +01:00
|
|
|
unsigned ArgReg;
|
2003-01-13 01:32:26 +01:00
|
|
|
switch (getClassB(Args[i].Ty)) {
|
2002-12-28 21:24:02 +01:00
|
|
|
case cByte:
|
2004-05-12 18:35:04 +02:00
|
|
|
if (Args[i].Val && isa<ConstantBool>(Args[i].Val)) {
|
|
|
|
addRegOffset(BuildMI(BB, X86::MOV32mi, 5), X86::ESP, ArgOffset)
|
|
|
|
.addImm(Args[i].Val == ConstantBool::True);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
// FALL THROUGH
|
2004-03-01 03:42:43 +01:00
|
|
|
case cShort:
|
|
|
|
if (Args[i].Val && isa<ConstantInt>(Args[i].Val)) {
|
|
|
|
// Zero/Sign extend constant, then stuff into memory.
|
|
|
|
ConstantInt *Val = cast<ConstantInt>(Args[i].Val);
|
|
|
|
Val = cast<ConstantInt>(ConstantExpr::getCast(Val, Type::IntTy));
|
|
|
|
addRegOffset(BuildMI(BB, X86::MOV32mi, 5), X86::ESP, ArgOffset)
|
|
|
|
.addImm(Val->getRawValue() & 0xFFFFFFFF);
|
|
|
|
} else {
|
|
|
|
// Promote the argument to 32 bits in a temporary register...
|
|
|
|
ArgReg = makeAnotherReg(Type::UIntTy);
|
|
|
|
promote32(ArgReg, Args[i]);
|
|
|
|
addRegOffset(BuildMI(BB, X86::MOV32mr, 5),
|
|
|
|
X86::ESP, ArgOffset).addReg(ArgReg);
|
|
|
|
}
|
2003-10-23 18:22:08 +02:00
|
|
|
break;
|
2002-12-28 21:24:02 +01:00
|
|
|
case cInt:
|
2004-03-01 03:42:43 +01:00
|
|
|
if (Args[i].Val && isa<ConstantInt>(Args[i].Val)) {
|
|
|
|
unsigned Val = cast<ConstantInt>(Args[i].Val)->getRawValue();
|
|
|
|
addRegOffset(BuildMI(BB, X86::MOV32mi, 5),
|
|
|
|
X86::ESP, ArgOffset).addImm(Val);
|
Two more improvements for null pointer handling: storing a null pointer
and passing a null pointer into a function.
For this testcase:
void %test(int** %X) {
store int* null, int** %X
call void %test(int** null)
ret void
}
we now generate this:
test:
sub %ESP, 12
mov %EAX, DWORD PTR [%ESP + 16]
mov DWORD PTR [%EAX], 0
mov DWORD PTR [%ESP], 0
call test
add %ESP, 12
ret
instead of this:
test:
sub %ESP, 12
mov %EAX, DWORD PTR [%ESP + 16]
mov %ECX, 0
mov DWORD PTR [%EAX], %ECX
mov %EAX, 0
mov DWORD PTR [%ESP], %EAX
call test
add %ESP, 12
ret
llvm-svn: 13558
2004-05-13 17:26:48 +02:00
|
|
|
} else if (Args[i].Val && isa<ConstantPointerNull>(Args[i].Val)) {
|
|
|
|
addRegOffset(BuildMI(BB, X86::MOV32mi, 5),
|
|
|
|
X86::ESP, ArgOffset).addImm(0);
|
2005-04-21 21:11:03 +02:00
|
|
|
} else if (Args[i].Val && isa<GlobalValue>(Args[i].Val)) {
|
|
|
|
addRegOffset(BuildMI(BB, X86::MOV32mi, 5), X86::ESP, ArgOffset)
|
|
|
|
.addGlobalAddress(cast<GlobalValue>(Args[i].Val));
|
2004-03-01 03:42:43 +01:00
|
|
|
} else {
|
|
|
|
ArgReg = Args[i].Val ? getReg(Args[i].Val) : Args[i].Reg;
|
|
|
|
addRegOffset(BuildMI(BB, X86::MOV32mr, 5),
|
|
|
|
X86::ESP, ArgOffset).addReg(ArgReg);
|
|
|
|
}
|
2003-10-23 18:22:08 +02:00
|
|
|
break;
|
2003-01-13 01:32:26 +01:00
|
|
|
case cLong:
|
2004-04-06 05:23:00 +02:00
|
|
|
if (Args[i].Val && isa<ConstantInt>(Args[i].Val)) {
|
|
|
|
uint64_t Val = cast<ConstantInt>(Args[i].Val)->getRawValue();
|
|
|
|
addRegOffset(BuildMI(BB, X86::MOV32mi, 5),
|
|
|
|
X86::ESP, ArgOffset).addImm(Val & ~0U);
|
|
|
|
addRegOffset(BuildMI(BB, X86::MOV32mi, 5),
|
|
|
|
X86::ESP, ArgOffset+4).addImm(Val >> 32ULL);
|
|
|
|
} else {
|
|
|
|
ArgReg = Args[i].Val ? getReg(Args[i].Val) : Args[i].Reg;
|
|
|
|
addRegOffset(BuildMI(BB, X86::MOV32mr, 5),
|
|
|
|
X86::ESP, ArgOffset).addReg(ArgReg);
|
|
|
|
addRegOffset(BuildMI(BB, X86::MOV32mr, 5),
|
|
|
|
X86::ESP, ArgOffset+4).addReg(ArgReg+1);
|
|
|
|
}
|
2003-10-23 18:22:08 +02:00
|
|
|
ArgOffset += 4; // 8 byte entry, not 4.
|
|
|
|
break;
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2002-12-28 21:24:02 +01:00
|
|
|
case cFP:
|
2005-01-08 06:45:24 +01:00
|
|
|
if (ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(Args[i].Val)) {
|
2005-01-08 07:59:50 +01:00
|
|
|
// Store constant FP values with integer instructions to avoid having
|
|
|
|
// to load the constants from the constant pool then do a store.
|
2005-01-08 06:45:24 +01:00
|
|
|
if (CFP->getType() == Type::FloatTy) {
|
|
|
|
union {
|
|
|
|
unsigned I;
|
|
|
|
float F;
|
|
|
|
} V;
|
|
|
|
V.F = CFP->getValue();
|
|
|
|
addRegOffset(BuildMI(BB, X86::MOV32mi, 5),
|
|
|
|
X86::ESP, ArgOffset).addImm(V.I);
|
|
|
|
} else {
|
|
|
|
union {
|
|
|
|
uint64_t I;
|
|
|
|
double F;
|
|
|
|
} V;
|
|
|
|
V.F = CFP->getValue();
|
|
|
|
addRegOffset(BuildMI(BB, X86::MOV32mi, 5),
|
|
|
|
X86::ESP, ArgOffset).addImm((unsigned)V.I);
|
|
|
|
addRegOffset(BuildMI(BB, X86::MOV32mi, 5),
|
|
|
|
X86::ESP, ArgOffset+4).addImm(unsigned(V.I >> 32));
|
|
|
|
ArgOffset += 4; // 8 byte entry, not 4.
|
|
|
|
}
|
2003-10-23 18:22:08 +02:00
|
|
|
} else {
|
2005-01-08 06:45:24 +01:00
|
|
|
ArgReg = Args[i].Val ? getReg(Args[i].Val) : Args[i].Reg;
|
|
|
|
if (Args[i].Ty == Type::FloatTy) {
|
|
|
|
addRegOffset(BuildMI(BB, X86::FST32m, 5),
|
|
|
|
X86::ESP, ArgOffset).addReg(ArgReg);
|
|
|
|
} else {
|
|
|
|
assert(Args[i].Ty == Type::DoubleTy && "Unknown FP type!");
|
|
|
|
addRegOffset(BuildMI(BB, X86::FST64m, 5),
|
|
|
|
X86::ESP, ArgOffset).addReg(ArgReg);
|
|
|
|
ArgOffset += 4; // 8 byte entry, not 4.
|
|
|
|
}
|
2003-10-23 18:22:08 +02:00
|
|
|
}
|
|
|
|
break;
|
2002-12-28 21:24:02 +01:00
|
|
|
|
2003-01-13 01:32:26 +01:00
|
|
|
default: assert(0 && "Unknown class!");
|
2002-12-28 21:24:02 +01:00
|
|
|
}
|
|
|
|
ArgOffset += 4;
|
2002-11-29 13:01:58 +01:00
|
|
|
}
|
2002-12-13 15:13:27 +01:00
|
|
|
} else {
|
2004-02-29 08:22:16 +01:00
|
|
|
BuildMI(BB, X86::ADJCALLSTACKDOWN, 1).addImm(0);
|
2002-12-13 15:13:27 +01:00
|
|
|
}
|
2002-12-04 20:22:53 +01:00
|
|
|
|
2003-01-13 01:32:26 +01:00
|
|
|
BB->push_back(CallMI);
|
|
|
|
|
2004-02-29 08:22:16 +01:00
|
|
|
BuildMI(BB, X86::ADJCALLSTACKUP, 1).addImm(NumBytes);
|
2002-12-05 00:45:28 +01:00
|
|
|
|
|
|
|
// If there is a return value, scavenge the result from the location the call
|
|
|
|
// leaves it in...
|
|
|
|
//
|
2003-01-13 01:32:26 +01:00
|
|
|
if (Ret.Ty != Type::VoidTy) {
|
|
|
|
unsigned DestClass = getClassB(Ret.Ty);
|
|
|
|
switch (DestClass) {
|
This checkin is brought to you by the brian gaeke allnighter fund.
(lib/Target/X86) InstSelectSimple.cpp:
Include llvm/DerivedTypes.h and iostream.
Refactor visitMul out into a wrapper around doMultiply(), so that we
can do multiplications on temporary values when we are doing
getelementptrs.
Refactor part of getReg out into makeAnotherReg, so that we can create
registers willy-nilly to hold temporary values, when we are doing
getelementptrs.
Add stub implementations of visitMallocInst and visitAllocaInst.
Add initial implementation of visitGetElementPtrInst.
In copyConstantToRegister:
We throw a *lot* of our asserts here. So, when we want to throw an
assert, print out to stderr whatever expr or whatever constant made
us barf.
Support copying ConstantPointerNull to register, using a move immediate
of zero.
Rename FLDr4 and FLDr8 to FLDr32 and FLDr64, so that they match the meanings
of the numbers in the other instruction names. All uses modified.
Teach visitCallInst to extract byte- and short-class return values
from subregs of EAX. Add a FIXME note about how we would do it for
float-class return values.
Add a FIXME note about how we would cast float to int and back.
X86InstrInfo.def:
Rename FLDr4 and FLDr8 to FLDr32 and FLDr64, so that they match the meanings
of the numbers in the other instruction names. All uses modified.
(tools/jello) GlobalVars.cpp:
Include iostream.
If we have to emit a floating-point constant to memory, gamble and use
the same method as for ints.
If we have to emit a ConstantPointerNull to memory, try using a "void *"
and "NULL".
Otherwise, if we are going to throw an assert, print out whatever constant
made us barf, first.
llvm-svn: 4973
2002-12-12 16:33:40 +01:00
|
|
|
case cByte:
|
|
|
|
case cShort:
|
|
|
|
case cInt: {
|
|
|
|
// Integral results are in %eax, or the appropriate portion
|
|
|
|
// thereof.
|
|
|
|
static const unsigned regRegMove[] = {
|
A big X86 instruction rename. The instructions are renamed to make
their names more descriptive. A name consists of the base name, a
default operand size followed by a character per operand with an
optional special size. For example:
ADD8rr -> add, 8-bit register, 8-bit register
IMUL16rmi -> imul, 16-bit register, 16-bit memory, 16-bit immediate
IMUL16rmi8 -> imul, 16-bit register, 16-bit memory, 8-bit immediate
MOVSX32rm16 -> movsx, 32-bit register, 16-bit memory
llvm-svn: 11995
2004-02-29 09:50:03 +01:00
|
|
|
X86::MOV8rr, X86::MOV16rr, X86::MOV32rr
|
2002-12-12 16:33:40 +01:00
|
|
|
};
|
|
|
|
static const unsigned AReg[] = { X86::AL, X86::AX, X86::EAX };
|
2003-01-13 01:32:26 +01:00
|
|
|
BuildMI(BB, regRegMove[DestClass], 1, Ret.Reg).addReg(AReg[DestClass]);
|
2002-12-12 16:33:40 +01:00
|
|
|
break;
|
|
|
|
}
|
2002-12-25 06:13:53 +01:00
|
|
|
case cFP: // Floating-point return values live in %ST(0)
|
2003-01-13 01:32:26 +01:00
|
|
|
BuildMI(BB, X86::FpGETRESULT, 1, Ret.Reg);
|
2002-12-05 00:50:28 +01:00
|
|
|
break;
|
2003-01-13 01:32:26 +01:00
|
|
|
case cLong: // Long values are left in EDX:EAX
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(BB, X86::MOV32rr, 1, Ret.Reg).addReg(X86::EAX);
|
|
|
|
BuildMI(BB, X86::MOV32rr, 1, Ret.Reg+1).addReg(X86::EDX);
|
2003-01-13 01:32:26 +01:00
|
|
|
break;
|
|
|
|
default: assert(0 && "Unknown class!");
|
2002-12-05 00:50:28 +01:00
|
|
|
}
|
2002-12-05 00:45:28 +01:00
|
|
|
}
|
2002-11-22 12:07:01 +01:00
|
|
|
}
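// A worked example of the layout computed above, for a hypothetical call
// f(int, long, double) (offsets follow directly from the argument loop):
//   int    -> 4 bytes at [ESP + 0]
//   long   -> 8 bytes, low half at [ESP + 4], high half at [ESP + 8]
//   double -> 8 bytes at [ESP + 12]
// NumBytes = 20, so the stores are bracketed by ADJCALLSTACKDOWN 20 and
// ADJCALLSTACKUP 20 around the CALL itself.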
|
2002-11-02 20:27:56 +01:00
|
|
|
|
2003-01-13 01:32:26 +01:00
|
|
|
|
|
|
|
/// visitCallInst - Push args on stack and do a procedure call instruction.
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::visitCallInst(CallInst &CI) {
|
2003-01-13 01:32:26 +01:00
|
|
|
MachineInstr *TheCall;
|
|
|
|
if (Function *F = CI.getCalledFunction()) {
|
2003-05-08 21:44:13 +02:00
|
|
|
// Is it an intrinsic function call?
|
2003-11-11 23:41:34 +01:00
|
|
|
if (Intrinsic::ID ID = (Intrinsic::ID)F->getIntrinsicID()) {
|
2003-05-08 21:44:13 +02:00
|
|
|
visitIntrinsicCall(ID, CI); // Special intrinsics are not handled here
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2003-01-13 01:32:26 +01:00
|
|
|
// Emit a CALL instruction with PC-relative displacement.
|
|
|
|
TheCall = BuildMI(X86::CALLpcrel32, 1).addGlobalAddress(F, true);
|
|
|
|
} else { // Emit an indirect call...
|
|
|
|
unsigned Reg = getReg(CI.getCalledValue());
|
2004-02-29 09:50:03 +01:00
|
|
|
TheCall = BuildMI(X86::CALL32r, 1).addReg(Reg);
|
2003-01-13 01:32:26 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<ValueRecord> Args;
|
|
|
|
for (unsigned i = 1, e = CI.getNumOperands(); i != e; ++i)
|
2003-08-04 04:12:48 +02:00
|
|
|
Args.push_back(ValueRecord(CI.getOperand(i)));
|
2003-01-13 01:32:26 +01:00
|
|
|
|
|
|
|
unsigned DestReg = CI.getType() != Type::VoidTy ? getReg(CI) : 0;
|
|
|
|
doCall(ValueRecord(DestReg, CI.getType()), TheCall, Args);
|
2005-04-22 01:38:14 +02:00
|
|
|
}
|
2003-01-13 01:32:26 +01:00
|
|
|
|
2003-12-28 10:47:19 +01:00
|
|
|
/// LowerUnknownIntrinsicFunctionCalls - This performs a prepass over the
|
|
|
|
/// function, lowering any calls to unknown intrinsic functions into the
|
|
|
|
/// equivalent LLVM code.
|
2004-03-02 00:53:11 +01:00
|
|
|
///
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::LowerUnknownIntrinsicFunctionCalls(Function &F) {
|
2003-12-28 10:47:19 +01:00
|
|
|
for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
|
|
|
|
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; )
|
|
|
|
if (CallInst *CI = dyn_cast<CallInst>(I++))
|
|
|
|
if (Function *F = CI->getCalledFunction())
|
|
|
|
switch (F->getIntrinsicID()) {
|
2003-12-28 10:53:23 +01:00
|
|
|
case Intrinsic::not_intrinsic:
|
2004-03-13 01:24:00 +01:00
|
|
|
case Intrinsic::vastart:
|
|
|
|
case Intrinsic::vacopy:
|
|
|
|
case Intrinsic::vaend:
|
2004-02-15 02:04:03 +01:00
|
|
|
case Intrinsic::returnaddress:
|
|
|
|
case Intrinsic::frameaddress:
|
2004-02-12 18:53:22 +01:00
|
|
|
case Intrinsic::memcpy:
|
2004-02-14 05:46:05 +01:00
|
|
|
case Intrinsic::memset:
|
2004-06-15 23:36:44 +02:00
|
|
|
case Intrinsic::isunordered:
|
2004-04-08 22:31:47 +02:00
|
|
|
case Intrinsic::readport:
|
|
|
|
case Intrinsic::writeport:
|
2003-12-28 10:47:19 +01:00
|
|
|
// We directly implement these intrinsics
|
|
|
|
break;
|
2004-04-14 00:13:14 +02:00
|
|
|
case Intrinsic::readio: {
|
|
|
|
// On X86, memory operations are in-order. Lower this intrinsic
|
|
|
|
// into a volatile load.
|
2004-06-11 06:31:10 +02:00
|
|
|
LoadInst * LI = new LoadInst(CI->getOperand(1), "", true, CI);
|
|
|
|
CI->replaceAllUsesWith(LI);
|
|
|
|
BB->getInstList().erase(CI);
|
2004-04-14 00:13:14 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
case Intrinsic::writeio: {
|
|
|
|
// On X86, memory operations are in-order. Lower this intrinsic
|
|
|
|
// into a volatile store.
|
2004-06-11 06:31:10 +02:00
|
|
|
StoreInst *SI = new StoreInst(CI->getOperand(1),
|
|
|
|
CI->getOperand(2), true, CI);
|
|
|
|
CI->replaceAllUsesWith(SI);
|
|
|
|
BB->getInstList().erase(CI);
|
2004-04-14 00:13:14 +02:00
|
|
|
break;
|
|
|
|
}
|
2003-12-28 10:47:19 +01:00
|
|
|
default:
|
|
|
|
// All other intrinsic calls we must lower.
|
|
|
|
Instruction *Before = CI->getPrev();
|
2003-12-28 22:23:38 +01:00
|
|
|
TM.getIntrinsicLowering().LowerIntrinsicCall(CI);
|
2003-12-28 10:47:19 +01:00
|
|
|
if (Before) { // Move iterator to instruction after call
|
2004-06-11 06:31:10 +02:00
|
|
|
I = Before; ++I;
|
2003-12-28 10:47:19 +01:00
|
|
|
} else {
|
|
|
|
I = BB->begin();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
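// A before/after sketch of the readio/writeio rewrites above, with
// hypothetical operands:
//   %v = call int %llvm.readio(int* %addr)       ; becomes
//   %v = volatile load int* %addr
//   call void %llvm.writeio(int %v, int* %addr)  ; becomes
//   volatile store int %v, int* %addr
// Both rely on x86 memory operations already being in-order.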
|
|
|
|
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::visitIntrinsicCall(Intrinsic::ID ID, CallInst &CI) {
|
2003-05-08 21:44:13 +02:00
|
|
|
unsigned TmpReg1, TmpReg2;
|
|
|
|
switch (ID) {
|
2004-03-13 01:24:52 +01:00
|
|
|
case Intrinsic::vastart:
|
2003-05-08 21:44:13 +02:00
|
|
|
// Get the address of the first vararg value...
|
2003-10-18 07:56:40 +02:00
|
|
|
TmpReg1 = getReg(CI);
|
2004-02-29 09:50:03 +01:00
|
|
|
addFrameReference(BuildMI(BB, X86::LEA32r, 5, TmpReg1), VarArgsFrameIndex);
|
2003-05-08 21:44:13 +02:00
|
|
|
return;
|
|
|
|
|
2004-03-13 01:24:52 +01:00
|
|
|
case Intrinsic::vacopy:
|
2003-10-18 07:56:40 +02:00
|
|
|
TmpReg1 = getReg(CI);
|
|
|
|
TmpReg2 = getReg(CI.getOperand(1));
|
A big X86 instruction rename. The instructions are renamed to make
their names more decriptive. A name consists of the base name, a
default operand size followed by a character per operand with an
optional special size. For example:
ADD8rr -> add, 8-bit register, 8-bit register
IMUL16rmi -> imul, 16-bit register, 16-bit memory, 16-bit immediate
IMUL16rmi8 -> imul, 16-bit register, 16-bit memory, 8-bit immediate
MOVSX32rm16 -> movsx, 32-bit register, 16-bit memory
llvm-svn: 11995
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(BB, X86::MOV32rr, 1, TmpReg1).addReg(TmpReg2);
|
2003-05-08 21:44:13 +02:00
|
|
|
return;
|
2004-03-13 01:24:52 +01:00
|
|
|
case Intrinsic::vaend: return; // Noop on X86
|
2003-05-08 21:44:13 +02:00
|
|
|
|
2004-02-15 02:04:03 +01:00
|
|
|
case Intrinsic::returnaddress:
|
|
|
|
case Intrinsic::frameaddress:
|
|
|
|
TmpReg1 = getReg(CI);
|
|
|
|
if (cast<Constant>(CI.getOperand(1))->isNullValue()) {
|
2004-12-17 01:46:51 +01:00
|
|
|
if (ReturnAddressIndex == 0) {
|
2004-12-17 01:07:46 +01:00
|
|
|
// Set up a frame object for the return address.
|
|
|
|
ReturnAddressIndex = F->getFrameInfo()->CreateFixedObject(4, -4);
|
|
|
|
}
|
|
|
|
|
2004-02-15 02:04:03 +01:00
|
|
|
if (ID == Intrinsic::returnaddress) {
|
|
|
|
// Just load the return address
|
2004-02-29 09:50:03 +01:00
|
|
|
addFrameReference(BuildMI(BB, X86::MOV32rm, 4, TmpReg1),
|
2004-02-15 02:04:03 +01:00
|
|
|
ReturnAddressIndex);
|
|
|
|
} else {
|
2004-02-29 09:50:03 +01:00
|
|
|
addFrameReference(BuildMI(BB, X86::LEA32r, 4, TmpReg1),
|
2004-02-15 02:04:03 +01:00
|
|
|
ReturnAddressIndex, -4);
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// Values other than zero are not implemented yet.
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(BB, X86::MOV32ri, 1, TmpReg1).addImm(0);
|
2004-02-15 02:04:03 +01:00
|
|
|
}
|
|
|
|
return;
|
|
|
|
|
2004-06-15 23:36:44 +02:00
|
|
|
case Intrinsic::isunordered:
|
|
|
|
TmpReg1 = getReg(CI.getOperand(1));
|
|
|
|
TmpReg2 = getReg(CI.getOperand(2));
|
|
|
|
emitUCOMr(BB, BB->end(), TmpReg2, TmpReg1);
|
|
|
|
TmpReg2 = getReg(CI);
|
|
|
|
BuildMI(BB, X86::SETPr, 0, TmpReg2);
|
|
|
|
return;
|
|
|
|
|
2004-02-12 18:53:22 +01:00
|
|
|
case Intrinsic::memcpy: {
|
|
|
|
assert(CI.getNumOperands() == 5 && "Illegal llvm.memcpy call!");
|
|
|
|
unsigned Align = 1;
|
|
|
|
if (ConstantInt *AlignC = dyn_cast<ConstantInt>(CI.getOperand(4))) {
|
|
|
|
Align = AlignC->getRawValue();
|
|
|
|
if (Align == 0) Align = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Turn the byte count into a number of iterations
|
|
|
|
unsigned CountReg;
|
2004-02-14 05:46:05 +01:00
|
|
|
unsigned Opcode;
|
2004-02-12 18:53:22 +01:00
|
|
|
switch (Align & 3) {
|
|
|
|
case 2: // WORD aligned
|
2004-02-14 00:36:47 +01:00
|
|
|
if (ConstantInt *I = dyn_cast<ConstantInt>(CI.getOperand(3))) {
|
|
|
|
CountReg = getReg(ConstantUInt::get(Type::UIntTy, I->getRawValue()/2));
|
|
|
|
} else {
|
|
|
|
CountReg = makeAnotherReg(Type::IntTy);
|
2004-02-26 02:20:02 +01:00
|
|
|
unsigned ByteReg = getReg(CI.getOperand(3));
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(BB, X86::SHR32ri, 2, CountReg).addReg(ByteReg).addImm(1);
|
2004-02-14 00:36:47 +01:00
|
|
|
}
|
2004-02-14 05:46:05 +01:00
|
|
|
Opcode = X86::REP_MOVSW;
|
2004-02-12 18:53:22 +01:00
|
|
|
break;
|
|
|
|
case 0: // DWORD aligned
|
2004-02-14 00:36:47 +01:00
|
|
|
if (ConstantInt *I = dyn_cast<ConstantInt>(CI.getOperand(3))) {
|
|
|
|
CountReg = getReg(ConstantUInt::get(Type::UIntTy, I->getRawValue()/4));
|
|
|
|
} else {
|
|
|
|
CountReg = makeAnotherReg(Type::IntTy);
|
2004-02-26 02:20:02 +01:00
|
|
|
unsigned ByteReg = getReg(CI.getOperand(3));
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(BB, X86::SHR32ri, 2, CountReg).addReg(ByteReg).addImm(2);
|
2004-02-14 00:36:47 +01:00
|
|
|
}
|
2004-02-14 05:46:05 +01:00
|
|
|
Opcode = X86::REP_MOVSD;
|
2004-02-12 18:53:22 +01:00
|
|
|
break;
|
2004-02-26 02:20:02 +01:00
|
|
|
default: // BYTE aligned
|
2004-02-14 00:36:47 +01:00
|
|
|
CountReg = getReg(CI.getOperand(3));
|
2004-02-14 05:46:05 +01:00
|
|
|
Opcode = X86::REP_MOVSB;
|
2004-02-12 18:53:22 +01:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// No matter what the alignment is, we put the source in ESI, the
|
|
|
|
// destination in EDI, and the count in ECX.
|
|
|
|
TmpReg1 = getReg(CI.getOperand(1));
|
|
|
|
TmpReg2 = getReg(CI.getOperand(2));
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(BB, X86::MOV32rr, 1, X86::ECX).addReg(CountReg);
|
|
|
|
BuildMI(BB, X86::MOV32rr, 1, X86::EDI).addReg(TmpReg1);
|
|
|
|
BuildMI(BB, X86::MOV32rr, 1, X86::ESI).addReg(TmpReg2);
|
2004-02-14 05:46:05 +01:00
|
|
|
BuildMI(BB, Opcode, 0);
|
|
|
|
return;
|
|
|
|
}
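// A worked example of the count scaling above, assuming a dword-aligned
// llvm.memcpy of a constant 32 bytes:
//   CountReg holds 32/4 = 8 and Opcode is REP_MOVSD, so in effect:
//     mov ECX, 8 ; mov EDI, <dest> ; mov ESI, <src> ; rep movsd
// A non-constant length N is scaled at run time with "shr ECX, 2" instead.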
|
|
|
|
case Intrinsic::memset: {
|
|
|
|
assert(CI.getNumOperands() == 5 && "Illegal llvm.memset call!");
|
|
|
|
unsigned Align = 1;
|
|
|
|
if (ConstantInt *AlignC = dyn_cast<ConstantInt>(CI.getOperand(4))) {
|
|
|
|
Align = AlignC->getRawValue();
|
|
|
|
if (Align == 0) Align = 1;
|
|
|
|
}
|
2004-02-12 18:53:22 +01:00
|
|
|
|
2004-02-14 05:46:05 +01:00
|
|
|
// Turn the byte count into a number of iterations
|
|
|
|
unsigned CountReg;
|
|
|
|
unsigned Opcode;
|
|
|
|
if (ConstantInt *ValC = dyn_cast<ConstantInt>(CI.getOperand(2))) {
|
|
|
|
unsigned Val = ValC->getRawValue() & 255;
|
|
|
|
|
|
|
|
// If the value is a constant, then we can potentially use larger copies.
|
|
|
|
switch (Align & 3) {
|
|
|
|
case 2: // WORD aligned
|
|
|
|
if (ConstantInt *I = dyn_cast<ConstantInt>(CI.getOperand(3))) {
|
2004-02-14 07:00:36 +01:00
|
|
|
CountReg = getReg(ConstantUInt::get(Type::UIntTy, I->getRawValue()/2));
|
2004-02-14 05:46:05 +01:00
|
|
|
} else {
|
|
|
|
CountReg = makeAnotherReg(Type::IntTy);
|
2004-02-26 02:20:02 +01:00
|
|
|
unsigned ByteReg = getReg(CI.getOperand(3));
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(BB, X86::SHR32ri, 2, CountReg).addReg(ByteReg).addImm(1);
|
2004-02-14 05:46:05 +01:00
|
|
|
}
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(BB, X86::MOV16ri, 1, X86::AX).addImm((Val << 8) | Val);
|
2004-02-14 05:46:05 +01:00
|
|
|
Opcode = X86::REP_STOSW;
|
|
|
|
break;
|
|
|
|
case 0: // DWORD aligned
|
|
|
|
if (ConstantInt *I = dyn_cast<ConstantInt>(CI.getOperand(3))) {
|
2004-02-14 07:00:36 +01:00
|
|
|
CountReg = getReg(ConstantUInt::get(Type::UIntTy, I->getRawValue()/4));
|
2004-02-14 05:46:05 +01:00
|
|
|
} else {
|
|
|
|
CountReg = makeAnotherReg(Type::IntTy);
|
2004-02-26 02:20:02 +01:00
|
|
|
unsigned ByteReg = getReg(CI.getOperand(3));
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(BB, X86::SHR32ri, 2, CountReg).addReg(ByteReg).addImm(2);
|
2004-02-14 05:46:05 +01:00
|
|
|
}
|
|
|
|
Val = (Val << 8) | Val;
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(BB, X86::MOV32ri, 1, X86::EAX).addImm((Val << 16) | Val);
|
2004-02-14 05:46:05 +01:00
|
|
|
Opcode = X86::REP_STOSD;
|
|
|
|
break;
|
2004-02-26 02:20:02 +01:00
|
|
|
default: // BYTE aligned
|
2004-02-14 05:46:05 +01:00
|
|
|
CountReg = getReg(CI.getOperand(3));
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(BB, X86::MOV8ri, 1, X86::AL).addImm(Val);
|
2004-02-14 05:46:05 +01:00
|
|
|
Opcode = X86::REP_STOSB;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// If it's not a constant value we are storing, just fall back. We could
|
|
|
|
// try to be clever to form 16 bit and 32 bit values, but we don't yet.
|
|
|
|
unsigned ValReg = getReg(CI.getOperand(2));
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(BB, X86::MOV8rr, 1, X86::AL).addReg(ValReg);
|
2004-02-14 05:46:05 +01:00
|
|
|
CountReg = getReg(CI.getOperand(3));
|
|
|
|
Opcode = X86::REP_STOSB;
|
2004-02-12 18:53:22 +01:00
|
|
|
}
|
|
|
|
|
2004-02-14 05:46:05 +01:00
|
|
|
// Regardless of alignment, the fill value is already in AL/AX/EAX; we put
|
|
|
|
// the destination in EDI and the count in ECX.
|
|
|
|
TmpReg1 = getReg(CI.getOperand(1));
|
|
|
|
//TmpReg2 = getReg(CI.getOperand(2));
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(BB, X86::MOV32rr, 1, X86::ECX).addReg(CountReg);
|
|
|
|
BuildMI(BB, X86::MOV32rr, 1, X86::EDI).addReg(TmpReg1);
|
2004-02-14 05:46:05 +01:00
|
|
|
BuildMI(BB, Opcode, 0);
|
2004-02-12 18:53:22 +01:00
|
|
|
return;
|
|
|
|
}
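// A sketch of the value replication above for an arbitrary constant fill
// byte, say 0xAB:
//   word aligned  -> AX  = 0xABAB     and rep stosw
//   dword aligned -> EAX = 0xABABABAB and rep stosd
//   otherwise     -> AL  = 0xAB       and rep stosb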
|
|
|
|
|
2004-04-13 19:20:37 +02:00
|
|
|
case Intrinsic::readport: {
|
|
|
|
// First, determine that the size of the operand falls within the acceptable
|
|
|
|
// range for this architecture.
|
2004-04-08 22:31:47 +02:00
|
|
|
//
|
2004-04-13 19:20:37 +02:00
|
|
|
if (getClassB(CI.getOperand(1)->getType()) != cShort) {
|
2004-04-09 00:39:13 +02:00
|
|
|
std::cerr << "llvm.readport: Address size is not 16 bits\n";
|
2004-04-13 19:20:37 +02:00
|
|
|
exit(1);
|
2004-04-09 00:39:13 +02:00
|
|
|
}
|
2004-04-08 22:31:47 +02:00
|
|
|
|
|
|
|
// Now, move the I/O port address into the DX register and use the IN
|
|
|
|
// instruction to get the input data.
|
|
|
|
//
|
2004-04-13 19:20:37 +02:00
|
|
|
unsigned Class = getClass(CI.getCalledFunction()->getReturnType());
|
|
|
|
unsigned DestReg = getReg(CI);
|
|
|
|
|
|
|
|
// If the port is a single-byte constant, use the immediate form.
|
|
|
|
if (ConstantInt *C = dyn_cast<ConstantInt>(CI.getOperand(1)))
|
|
|
|
if ((C->getRawValue() & 255) == C->getRawValue()) {
|
|
|
|
switch (Class) {
|
|
|
|
case cByte:
|
|
|
|
BuildMI(BB, X86::IN8ri, 1).addImm((unsigned char)C->getRawValue());
|
|
|
|
BuildMI(BB, X86::MOV8rr, 1, DestReg).addReg(X86::AL);
|
|
|
|
return;
|
|
|
|
case cShort:
|
|
|
|
BuildMI(BB, X86::IN16ri, 1).addImm((unsigned char)C->getRawValue());
|
|
|
|
BuildMI(BB, X86::MOV16rr, 1, DestReg).addReg(X86::AX);
|
|
|
|
return;
|
|
|
|
case cInt:
|
|
|
|
BuildMI(BB, X86::IN32ri, 1).addImm((unsigned char)C->getRawValue());
|
|
|
|
BuildMI(BB, X86::MOV32rr, 1, DestReg).addReg(X86::EAX);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned Reg = getReg(CI.getOperand(1));
|
|
|
|
BuildMI(BB, X86::MOV16rr, 1, X86::DX).addReg(Reg);
|
|
|
|
switch (Class) {
|
|
|
|
case cByte:
|
|
|
|
BuildMI(BB, X86::IN8rr, 0);
|
|
|
|
BuildMI(BB, X86::MOV8rr, 1, DestReg).addReg(X86::AL);
|
|
|
|
break;
|
|
|
|
case cShort:
|
|
|
|
BuildMI(BB, X86::IN16rr, 0);
|
|
|
|
BuildMI(BB, X86::MOV16rr, 1, DestReg).addReg(X86::AX);
|
|
|
|
break;
|
|
|
|
case cInt:
|
|
|
|
BuildMI(BB, X86::IN32rr, 0);
|
|
|
|
BuildMI(BB, X86::MOV32rr, 1, DestReg).addReg(X86::EAX);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
std::cerr << "Cannot do input on this data type";
|
|
|
|
exit(1);
|
2004-04-08 22:31:47 +02:00
|
|
|
}
|
|
|
|
return;
|
2004-04-13 19:20:37 +02:00
|
|
|
}
|
2004-04-08 22:31:47 +02:00
|
|
|
|
2004-04-13 19:20:37 +02:00
|
|
|
case Intrinsic::writeport: {
|
2004-04-08 22:31:47 +02:00
|
|
|
// First, determine that the size of the operand falls within the
|
|
|
|
// acceptable range for this architecture.
|
2004-04-13 19:20:37 +02:00
|
|
|
if (getClass(CI.getOperand(2)->getType()) != cShort) {
|
2004-04-09 00:39:13 +02:00
|
|
|
std::cerr << "llvm.writeport: Address size is not 16 bits\n";
|
2004-04-13 19:20:37 +02:00
|
|
|
exit(1);
|
2004-04-09 00:39:13 +02:00
|
|
|
}
|
2004-04-08 22:31:47 +02:00
|
|
|
|
2004-04-13 19:20:37 +02:00
|
|
|
unsigned Class = getClassB(CI.getOperand(1)->getType());
|
|
|
|
unsigned ValReg = getReg(CI.getOperand(1));
|
|
|
|
switch (Class) {
|
|
|
|
case cByte:
|
|
|
|
BuildMI(BB, X86::MOV8rr, 1, X86::AL).addReg(ValReg);
|
|
|
|
break;
|
|
|
|
case cShort:
|
|
|
|
BuildMI(BB, X86::MOV16rr, 1, X86::AX).addReg(ValReg);
|
|
|
|
break;
|
|
|
|
case cInt:
|
|
|
|
BuildMI(BB, X86::MOV32rr, 1, X86::EAX).addReg(ValReg);
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
std::cerr << "llvm.writeport: invalid data type for X86 target";
|
|
|
|
exit(1);
|
2004-04-08 22:31:47 +02:00
|
|
|
}
|
|
|
|
|
2004-04-13 19:20:37 +02:00
|
|
|
|
|
|
|
// If the port is a single-byte constant, use the immediate form.
|
|
|
|
if (ConstantInt *C = dyn_cast<ConstantInt>(CI.getOperand(2)))
|
|
|
|
if ((C->getRawValue() & 255) == C->getRawValue()) {
|
|
|
|
static const unsigned O[] = { X86::OUT8ir, X86::OUT16ir, X86::OUT32ir };
|
|
|
|
BuildMI(BB, O[Class], 1).addImm((unsigned char)C->getRawValue());
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Otherwise, move the I/O port address into the DX register and the value
|
|
|
|
// to write into the AL/AX/EAX register.
|
|
|
|
static const unsigned Opc[] = { X86::OUT8rr, X86::OUT16rr, X86::OUT32rr };
|
|
|
|
unsigned Reg = getReg(CI.getOperand(2));
|
|
|
|
BuildMI(BB, X86::MOV16rr, 1, X86::DX).addReg(Reg);
|
|
|
|
BuildMI(BB, Opc[Class], 0);
|
|
|
|
return;
|
|
|
|
}
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2003-12-28 10:47:19 +01:00
|
|
|
default: assert(0 && "Error: unknown intrinsics should have been lowered!");
|
2003-05-08 21:44:13 +02:00
|
|
|
}
|
|
|
|
}
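// A condensed sketch of the port selection in the readport/writeport cases
// above (the port numbers are illustrative):
//   llvm.writeport(ubyte 1, ushort 128)   ; port fits in a byte:
//     mov AL, 1 ; out 128, AL             ;   immediate form
//   llvm.writeport(ubyte 1, ushort %p)    ; otherwise:
//     mov AL, 1 ; mov DX, %p ; out DX, AL ;   register form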
|
|
|
|
|
Implement folding explicit load instructions into binary operations. For a
testcase like this:
int %test(int* %P, int %A) {
%Pv = load int* %P
%B = add int %A, %Pv
ret int %B
}
We now generate:
test:
mov %ECX, DWORD PTR [%ESP + 4]
mov %EAX, DWORD PTR [%ESP + 8]
add %EAX, DWORD PTR [%ECX]
ret
Instead of:
test:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, DWORD PTR [%ESP + 8]
mov %EAX, DWORD PTR [%EAX]
add %EAX, %ECX
ret
... saving one instruction, and often a register. Note that there are a lot
of other instructions that could use this, but they aren't handled. I'm not
really interested in adding them, but mul/div and all of the FP instructions
could be supported as well if someone wanted to add them.
llvm-svn: 12204
2004-03-08 02:58:35 +01:00
|
|
|
static bool isSafeToFoldLoadIntoInstruction(LoadInst &LI, Instruction &User) {
|
|
|
|
if (LI.getParent() != User.getParent())
|
|
|
|
return false;
|
|
|
|
BasicBlock::iterator It = &LI;
|
|
|
|
// Check all of the instructions between the load and the user. We should
|
|
|
|
// really use alias analysis here, but for now we just do something simple.
|
|
|
|
for (++It; It != BasicBlock::iterator(&User); ++It) {
|
|
|
|
switch (It->getOpcode()) {
|
2004-03-18 07:29:54 +01:00
|
|
|
case Instruction::Free:
|
2004-03-08 02:58:35 +01:00
|
|
|
case Instruction::Store:
|
|
|
|
case Instruction::Call:
|
|
|
|
case Instruction::Invoke:
|
|
|
|
return false;
|
2004-04-12 05:02:48 +02:00
|
|
|
case Instruction::Load:
|
|
|
|
if (cast<LoadInst>(It)->isVolatile() && LI.isVolatile())
|
|
|
|
return false;
|
|
|
|
break;
|
2004-03-08 02:58:35 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
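// A hypothetical IR fragment showing what the scan above accepts and
// rejects:
//   %v = load int* %P
//   %x = add int %A, 1     ; harmless, the scan continues
//   store int %x, int* %Q  ; a store (or call/invoke/free) in between
//   %r = add int %v, %x    ;   rejects the fold, since %Q may alias %P
// With no such instruction between the load and its user, the fold is
// considered safe.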
|
|
|
|
|
2003-05-08 22:49:25 +02:00
|
|
|
/// visitSimpleBinary - Implement simple binary operators for integral types...
|
|
|
|
/// OperatorClass is one of: 0 for Add, 1 for Sub, 2 for And, 3 for Or, 4 for
|
|
|
|
/// Xor.
|
2004-03-02 00:53:11 +01:00
|
|
|
///
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::visitSimpleBinary(BinaryOperator &B, unsigned OperatorClass) {
|
2003-05-08 22:49:25 +02:00
|
|
|
unsigned DestReg = getReg(B);
|
|
|
|
MachineBasicBlock::iterator MI = BB->end();
|
2004-03-08 02:18:36 +01:00
|
|
|
Value *Op0 = B.getOperand(0), *Op1 = B.getOperand(1);
|
2004-05-10 17:15:55 +02:00
|
|
|
unsigned Class = getClassB(B.getType());
|
2004-03-08 02:18:36 +01:00
|
|
|
|
2004-10-17 08:10:40 +02:00
|
|
|
// If this is AND X, C, and it is only used by a setcc instruction, it will
|
|
|
|
// be folded. There is no need to emit this instruction.
|
|
|
|
if (B.hasOneUse() && OperatorClass == 2 && isa<ConstantInt>(Op1))
|
|
|
|
if (Class == cByte || Class == cShort || Class == cInt) {
|
|
|
|
Instruction *Use = cast<Instruction>(B.use_back());
|
|
|
|
if (isa<SetCondInst>(Use) &&
|
|
|
|
Use->getOperand(1) == Constant::getNullValue(B.getType())) {
|
|
|
|
switch (getSetCCNumber(Use->getOpcode())) {
|
|
|
|
case 0:
|
|
|
|
case 1:
|
|
|
|
return;
|
|
|
|
default:
|
|
|
|
if (B.getType()->isSigned()) return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2004-03-08 02:58:35 +01:00
|
|
|
// Special case: op Reg, load [mem]
|
2004-05-10 17:15:55 +02:00
|
|
|
if (isa<LoadInst>(Op0) && !isa<LoadInst>(Op1) && Class != cLong &&
|
2005-04-22 01:38:14 +02:00
|
|
|
Op0->hasOneUse() &&
|
2004-05-10 17:15:55 +02:00
|
|
|
isSafeToFoldLoadIntoInstruction(*cast<LoadInst>(Op0), B))
|
2004-03-08 02:58:35 +01:00
|
|
|
if (!B.swapOperands())
|
|
|
|
std::swap(Op0, Op1); // Make sure any loads are in the RHS.
|
|
|
|
|
2004-06-18 00:15:25 +02:00
|
|
|
if (isa<LoadInst>(Op1) && Class != cLong && Op1->hasOneUse() &&
|
2004-03-08 02:58:35 +01:00
|
|
|
isSafeToFoldLoadIntoInstruction(*cast<LoadInst>(Op1), B)) {
|
|
|
|
|
2004-04-12 00:05:45 +02:00
|
|
|
unsigned Opcode;
|
|
|
|
if (Class != cFP) {
|
|
|
|
static const unsigned OpcodeTab[][3] = {
|
|
|
|
// Arithmetic operators
|
|
|
|
{ X86::ADD8rm, X86::ADD16rm, X86::ADD32rm }, // ADD
|
|
|
|
{ X86::SUB8rm, X86::SUB16rm, X86::SUB32rm }, // SUB
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-12 00:05:45 +02:00
|
|
|
// Bitwise operators
|
|
|
|
{ X86::AND8rm, X86::AND16rm, X86::AND32rm }, // AND
|
|
|
|
{ X86:: OR8rm, X86:: OR16rm, X86:: OR32rm }, // OR
|
|
|
|
{ X86::XOR8rm, X86::XOR16rm, X86::XOR32rm }, // XOR
|
|
|
|
};
|
|
|
|
Opcode = OpcodeTab[OperatorClass][Class];
|
|
|
|
} else {
|
|
|
|
static const unsigned OpcodeTab[][2] = {
|
|
|
|
{ X86::FADD32m, X86::FADD64m }, // ADD
|
|
|
|
{ X86::FSUB32m, X86::FSUB64m }, // SUB
|
|
|
|
};
|
|
|
|
const Type *Ty = Op0->getType();
|
|
|
|
assert((Ty == Type::FloatTy || Ty == Type::DoubleTy) && "Unknown FP type!");
|
|
|
|
Opcode = OpcodeTab[OperatorClass][Ty == Type::DoubleTy];
|
|
|
|
}
|
2004-03-08 02:58:35 +01:00
|
|
|
|
|
|
|
unsigned Op0r = getReg(Op0);
|
Second half of my fixed-sized-alloca patch. This folds the LEA to compute
the alloca address into common operations like loads/stores.
In a simple testcase like this (which is just designed to exercise the
alloca A, nothing more):
int %test(int %X, bool %C) {
%A = alloca int
store int %X, int* %A
store int* %A, int** %G
br bool %C, label %T, label %F
T:
call int %test(int 1, bool false)
%V = load int* %A
ret int %V
F:
call int %test(int 123, bool true)
%V2 = load int* %A
ret int %V2
}
We now generate:
test:
sub %ESP, 12
mov %EAX, DWORD PTR [%ESP + 16]
mov %CL, BYTE PTR [%ESP + 20]
*** mov DWORD PTR [%ESP + 8], %EAX
mov %EAX, OFFSET G
lea %EDX, DWORD PTR [%ESP + 8]
mov DWORD PTR [%EAX], %EDX
test %CL, %CL
je .LBB2 # PC rel: F
.LBB1: # T
mov DWORD PTR [%ESP], 1
mov DWORD PTR [%ESP + 4], 0
call test
*** mov %EAX, DWORD PTR [%ESP + 8]
add %ESP, 12
ret
.LBB2: # F
mov DWORD PTR [%ESP], 123
mov DWORD PTR [%ESP + 4], 1
call test
*** mov %EAX, DWORD PTR [%ESP + 8]
add %ESP, 12
ret
Instead of:
test:
sub %ESP, 20
mov %EAX, DWORD PTR [%ESP + 24]
mov %CL, BYTE PTR [%ESP + 28]
*** lea %EDX, DWORD PTR [%ESP + 16]
*** mov DWORD PTR [%EDX], %EAX
mov %EAX, OFFSET G
mov DWORD PTR [%EAX], %EDX
test %CL, %CL
*** mov DWORD PTR [%ESP + 12], %EDX
je .LBB2 # PC rel: F
.LBB1: # T
mov DWORD PTR [%ESP], 1
mov %EAX, 0
mov DWORD PTR [%ESP + 4], %EAX
call test
*** mov %EAX, DWORD PTR [%ESP + 12]
*** mov %EAX, DWORD PTR [%EAX]
add %ESP, 20
ret
.LBB2: # F
mov DWORD PTR [%ESP], 123
mov %EAX, 1
mov DWORD PTR [%ESP + 4], %EAX
call test
*** mov %EAX, DWORD PTR [%ESP + 12]
*** mov %EAX, DWORD PTR [%EAX]
add %ESP, 20
ret
llvm-svn: 13557
2004-05-13 17:12:43 +02:00
|
|
|
if (AllocaInst *AI =
|
|
|
|
dyn_castFixedAlloca(cast<LoadInst>(Op1)->getOperand(0))) {
|
|
|
|
unsigned FI = getFixedSizedAllocaFI(AI);
|
|
|
|
addFrameReference(BuildMI(BB, Opcode, 5, DestReg).addReg(Op0r), FI);
|
|
|
|
|
|
|
|
} else {
|
2004-08-30 02:13:26 +02:00
|
|
|
X86AddressMode AM;
|
|
|
|
getAddressingMode(cast<LoadInst>(Op1)->getOperand(0), AM);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-08-30 02:13:26 +02:00
|
|
|
addFullAddress(BuildMI(BB, Opcode, 5, DestReg).addReg(Op0r), AM);
|
2004-05-13 17:12:43 +02:00
|
|
|
}
|
2004-03-08 02:58:35 +01:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2004-04-12 00:05:45 +02:00
|
|
|
// If this is a floating point subtract, check to see if we can fold the first
|
|
|
|
// operand in.
|
|
|
|
if (Class == cFP && OperatorClass == 1 &&
|
2005-04-22 01:38:14 +02:00
|
|
|
isa<LoadInst>(Op0) &&
|
2004-04-12 00:05:45 +02:00
|
|
|
isSafeToFoldLoadIntoInstruction(*cast<LoadInst>(Op0), B)) {
|
|
|
|
const Type *Ty = Op0->getType();
|
|
|
|
assert((Ty == Type::FloatTy || Ty == Type::DoubleTy) && "Unknown FP type!");
|
|
|
|
unsigned Opcode = Ty == Type::FloatTy ? X86::FSUBR32m : X86::FSUBR64m;
|
|
|
|
|
|
|
|
unsigned Op1r = getReg(Op1);
|
2004-05-13 17:12:43 +02:00
|
|
|
if (AllocaInst *AI =
|
|
|
|
dyn_castFixedAlloca(cast<LoadInst>(Op0)->getOperand(0))) {
|
|
|
|
unsigned FI = getFixedSizedAllocaFI(AI);
|
|
|
|
addFrameReference(BuildMI(BB, Opcode, 5, DestReg).addReg(Op1r), FI);
|
|
|
|
} else {
|
2004-08-30 02:13:26 +02:00
|
|
|
X86AddressMode AM;
|
|
|
|
getAddressingMode(cast<LoadInst>(Op0)->getOperand(0), AM);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-08-30 02:13:26 +02:00
|
|
|
addFullAddress(BuildMI(BB, Opcode, 5, DestReg).addReg(Op1r), AM);
|
2004-05-13 17:12:43 +02:00
|
|
|
}
|
2004-04-12 00:05:45 +02:00
|
|
|
return;
|
|
|
|
}
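A note on why the reversed opcodes appear above (based on the x87 memory forms): the folded operand is the load (Op0), but it becomes the memory operand of the instruction, so the plain subtract would compute the operands backwards.
// FSUB32m/FSUB64m compute  dest = reg - mem,  which here would be Op1 - Op0.
// The fold wants Op0 - Op1 with Op0 in memory, so the reversed forms
// FSUBR32m/FSUBR64m (dest = mem - reg) are selected instead.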
|
|
|
|
|
2004-03-08 02:18:36 +01:00
|
|
|
emitSimpleBinaryOperation(BB, MI, Op0, Op1, OperatorClass, DestReg);
|
2003-05-08 22:49:25 +02:00
|
|
|
}
|
2003-01-13 01:32:26 +01:00
|
|
|
|
2004-04-11 23:23:56 +02:00
|
|
|
|
|
|
|
/// emitBinaryFPOperation - This method handles emission of floating point
|
|
|
|
/// Add (0), Sub (1), Mul (2), and Div (3) operations.
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::emitBinaryFPOperation(MachineBasicBlock *BB,
|
|
|
|
MachineBasicBlock::iterator IP,
|
|
|
|
Value *Op0, Value *Op1,
|
|
|
|
unsigned OperatorClass, unsigned DestReg) {
|
2004-04-11 23:23:56 +02:00
|
|
|
// Special case: op Reg, <const fp>
|
|
|
|
if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1))
|
|
|
|
if (!Op1C->isExactlyValue(+0.0) && !Op1C->isExactlyValue(+1.0)) {
|
|
|
|
// Create a constant pool entry for this constant.
|
|
|
|
MachineConstantPool *CP = F->getConstantPool();
|
|
|
|
unsigned CPI = CP->getConstantPoolIndex(Op1C);
|
|
|
|
const Type *Ty = Op1->getType();
|
|
|
|
|
|
|
|
static const unsigned OpcodeTab[][4] = {
|
|
|
|
{ X86::FADD32m, X86::FSUB32m, X86::FMUL32m, X86::FDIV32m }, // Float
|
|
|
|
{ X86::FADD64m, X86::FSUB64m, X86::FMUL64m, X86::FDIV64m }, // Double
|
|
|
|
};
|
|
|
|
|
|
|
|
assert((Ty == Type::FloatTy || Ty == Type::DoubleTy) && "Unknown FP type!");
|
|
|
|
unsigned Opcode = OpcodeTab[Ty != Type::FloatTy][OperatorClass];
|
|
|
|
unsigned Op0r = getReg(Op0, BB, IP);
|
|
|
|
addConstantPoolReference(BuildMI(*BB, IP, Opcode, 5,
|
|
|
|
DestReg).addReg(Op0r), CPI);
|
|
|
|
return;
|
|
|
|
}
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-12 02:12:04 +02:00
|
|
|
// Special case: R1 = op <const fp>, R2
|
2004-04-11 23:23:56 +02:00
|
|
|
if (ConstantFP *CFP = dyn_cast<ConstantFP>(Op0))
|
|
|
|
if (CFP->isExactlyValue(-0.0) && OperatorClass == 1) {
|
|
|
|
// -0.0 - X === -X
|
|
|
|
unsigned op1Reg = getReg(Op1, BB, IP);
|
|
|
|
BuildMI(*BB, IP, X86::FCHS, 1, DestReg).addReg(op1Reg);
|
|
|
|
return;
|
|
|
|
} else if (!CFP->isExactlyValue(+0.0) && !CFP->isExactlyValue(+1.0)) {
|
2004-04-12 02:12:04 +02:00
|
|
|
// R1 = op CST, R2 --> R1 = opr R2, CST
|
2004-04-11 23:23:56 +02:00
|
|
|
|
|
|
|
// Create a constant pool entry for this constant.
|
|
|
|
MachineConstantPool *CP = F->getConstantPool();
|
|
|
|
unsigned CPI = CP->getConstantPoolIndex(CFP);
|
|
|
|
const Type *Ty = CFP->getType();
|
|
|
|
|
|
|
|
static const unsigned OpcodeTab[][4] = {
|
|
|
|
{ X86::FADD32m, X86::FSUBR32m, X86::FMUL32m, X86::FDIVR32m }, // Float
|
|
|
|
{ X86::FADD64m, X86::FSUBR64m, X86::FMUL64m, X86::FDIVR64m }, // Double
|
|
|
|
};
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-11 23:23:56 +02:00
|
|
|
assert((Ty == Type::FloatTy || Ty == Type::DoubleTy) && "Unknown FP type!");
|
|
|
|
unsigned Opcode = OpcodeTab[Ty != Type::FloatTy][OperatorClass];
|
|
|
|
unsigned Op1r = getReg(Op1, BB, IP);
|
|
|
|
addConstantPoolReference(BuildMI(*BB, IP, Opcode, 5,
|
|
|
|
DestReg).addReg(Op1r), CPI);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// General case.
|
|
|
|
static const unsigned OpcodeTab[4] = {
|
|
|
|
X86::FpADD, X86::FpSUB, X86::FpMUL, X86::FpDIV
|
|
|
|
};
|
|
|
|
|
|
|
|
unsigned Opcode = OpcodeTab[OperatorClass];
|
|
|
|
unsigned Op0r = getReg(Op0, BB, IP);
|
|
|
|
unsigned Op1r = getReg(Op1, BB, IP);
|
|
|
|
BuildMI(*BB, IP, Opcode, 2, DestReg).addReg(Op0r).addReg(Op1r);
|
|
|
|
}
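A hedged note on the constant-pool special cases above: +0.0 and +1.0 are excluded presumably because those two values can be materialized directly (x87 FLDZ/FLD1) more cheaply than a memory load; every other FP immediate is placed in the constant pool once and folded as the memory operand of the FADDm/FSUBm/FMULm/FDIVm forms.
// Sketch of the selection rule (names as above; illustrative only):
// if (CFP->isExactlyValue(+0.0) || CFP->isExactlyValue(+1.0))
//   materialize directly (FLDZ / FLD1);
// else
//   fold CP slot getConstantPoolIndex(CFP) as the instruction's memory operand.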
|
|
|
|
|
2003-10-19 23:09:10 +02:00
|
|
|
/// emitSimpleBinaryOperation - Implement simple binary operators for integral
|
|
|
|
/// types... OperatorClass is one of: 0 for Add, 1 for Sub, 2 for And, 3 for
|
|
|
|
/// Or, 4 for Xor.
|
2002-11-02 21:13:22 +01:00
|
|
|
///
|
2003-05-08 22:49:25 +02:00
|
|
|
/// emitSimpleBinaryOperation - Common code shared between visitSimpleBinary
|
|
|
|
/// and constant expression support.
|
2003-10-19 23:09:10 +02:00
|
|
|
///
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::emitSimpleBinaryOperation(MachineBasicBlock *MBB,
|
|
|
|
MachineBasicBlock::iterator IP,
|
|
|
|
Value *Op0, Value *Op1,
|
2005-04-22 01:38:14 +02:00
|
|
|
unsigned OperatorClass,
|
2004-09-21 20:21:21 +02:00
|
|
|
unsigned DestReg) {
|
2003-05-08 22:49:25 +02:00
|
|
|
unsigned Class = getClassB(Op0->getType());
|
2003-10-19 23:09:10 +02:00
|
|
|
|
2004-04-11 23:23:56 +02:00
|
|
|
if (Class == cFP) {
|
|
|
|
assert(OperatorClass < 2 && "No logical ops for FP!");
|
|
|
|
emitBinaryFPOperation(MBB, IP, Op0, Op1, OperatorClass, DestReg);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2004-04-11 22:26:20 +02:00
|
|
|
if (ConstantInt *CI = dyn_cast<ConstantInt>(Op0))
|
2004-06-18 02:50:37 +02:00
|
|
|
if (OperatorClass == 1) {
|
2004-04-11 22:26:20 +02:00
|
|
|
static unsigned const NEGTab[] = {
|
|
|
|
X86::NEG8r, X86::NEG16r, X86::NEG32r, 0, X86::NEG32r
|
|
|
|
};
|
2004-06-18 02:50:37 +02:00
|
|
|
|
|
|
|
// sub 0, X -> neg X
|
|
|
|
if (CI->isNullValue()) {
|
|
|
|
unsigned op1Reg = getReg(Op1, MBB, IP);
|
|
|
|
BuildMI(*MBB, IP, NEGTab[Class], 1, DestReg).addReg(op1Reg);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-06-18 02:50:37 +02:00
|
|
|
if (Class == cLong) {
|
|
|
|
// We just emitted: Dl = neg Sl
|
|
|
|
// Now emit : T = addc Sh, 0
|
|
|
|
// : Dh = neg T
|
|
|
|
unsigned T = makeAnotherReg(Type::IntTy);
|
|
|
|
BuildMI(*MBB, IP, X86::ADC32ri, 2, T).addReg(op1Reg+1).addImm(0);
|
|
|
|
BuildMI(*MBB, IP, X86::NEG32r, 1, DestReg+1).addReg(T);
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
} else if (Op1->hasOneUse() && Class != cLong) {
|
|
|
|
// sub C, X -> tmp = neg X; DestReg = add tmp, C. This is better
|
|
|
|
// than copying C into a temporary register, because of register
|
|
|
|
// pressure (tmp and destreg can share a register).
|
2005-04-22 01:38:14 +02:00
|
|
|
static unsigned const ADDRITab[] = {
|
2004-06-18 02:50:37 +02:00
|
|
|
X86::ADD8ri, X86::ADD16ri, X86::ADD32ri, 0, X86::ADD32ri
|
|
|
|
};
|
|
|
|
unsigned op1Reg = getReg(Op1, MBB, IP);
|
|
|
|
unsigned Tmp = makeAnotherReg(Op0->getType());
|
|
|
|
BuildMI(*MBB, IP, NEGTab[Class], 1, Tmp).addReg(op1Reg);
|
2004-06-20 09:49:54 +02:00
|
|
|
BuildMI(*MBB, IP, ADDRITab[Class], 2,
|
|
|
|
DestReg).addReg(Tmp).addImm(CI->getRawValue());
|
2004-06-18 02:50:37 +02:00
|
|
|
return;
|
2004-02-02 20:31:38 +01:00
|
|
|
}
|
2004-04-11 22:26:20 +02:00
|
|
|
}
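The long-negate sequence above uses the standard two-word negation, and the "sub C, X" rewrite just above relies on C - X == (-X) + C. A self-contained check of the two-word case (hypothetical helper, not part of the selector):
#include <cstdint>
uint64_t neg64(uint32_t Lo, uint32_t Hi) {
  uint32_t NLo = -Lo;                    // NEG32r; sets CF = (Lo != 0)
  uint32_t T   = Hi + (Lo != 0);         // ADC32ri Hi, 0
  uint32_t NHi = -T;                     // NEG32r
  return ((uint64_t)NHi << 32) | NLo;    // equals -(((uint64_t)Hi << 32) | Lo)
}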
|
2003-10-19 23:09:10 +02:00
|
|
|
|
2004-04-11 22:26:20 +02:00
|
|
|
// Special case: op Reg, <const int>
|
|
|
|
if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
|
2004-03-08 02:18:36 +01:00
|
|
|
unsigned Op0r = getReg(Op0, MBB, IP);
|
|
|
|
|
|
|
|
// xor X, -1 -> not X
|
|
|
|
if (OperatorClass == 4 && Op1C->isAllOnesValue()) {
|
2004-04-06 04:11:49 +02:00
|
|
|
static unsigned const NOTTab[] = {
|
|
|
|
X86::NOT8r, X86::NOT16r, X86::NOT32r, 0, X86::NOT32r
|
|
|
|
};
|
2004-03-08 02:18:36 +01:00
|
|
|
BuildMI(*MBB, IP, NOTTab[Class], 1, DestReg).addReg(Op0r);
|
2004-04-06 04:11:49 +02:00
|
|
|
if (Class == cLong) // Invert the top part too
|
|
|
|
BuildMI(*MBB, IP, X86::NOT32r, 1, DestReg+1).addReg(Op0r+1);
|
2004-03-08 02:18:36 +01:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// add X, -1 -> dec X
|
2004-04-06 05:36:57 +02:00
|
|
|
if (OperatorClass == 0 && Op1C->isAllOnesValue() && Class != cLong) {
|
|
|
|
// Note that we can't use dec for 64-bit decrements, because it does not
|
|
|
|
// set the carry flag!
|
|
|
|
static unsigned const DECTab[] = { X86::DEC8r, X86::DEC16r, X86::DEC32r };
|
2004-03-08 02:18:36 +01:00
|
|
|
BuildMI(*MBB, IP, DECTab[Class], 1, DestReg).addReg(Op0r);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// add X, 1 -> inc X
|
2004-04-06 05:36:57 +02:00
|
|
|
if (OperatorClass == 0 && Op1C->equalsInt(1) && Class != cLong) {
|
|
|
|
// Note that we can't use inc for 64-bit increments, because it does not
|
|
|
|
// set the carry flag!
|
|
|
|
static unsigned const INCTab[] = { X86::INC8r, X86::INC16r, X86::INC32r };
|
2004-04-02 20:11:32 +02:00
|
|
|
BuildMI(*MBB, IP, INCTab[Class], 1, DestReg).addReg(Op0r);
|
2004-03-08 02:18:36 +01:00
|
|
|
return;
|
|
|
|
}
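The three peepholes above rest on small integer identities; stated portably as comments (a summary, not from the source):
// For 32-bit x:
//   x ^ 0xFFFFFFFF == ~x       (XOR with all-ones -> NOT)
//   x + 0xFFFFFFFF == x - 1    (ADD of -1         -> DEC)
//   x + 1                      (ADD of 1          -> INC)
// INC and DEC are skipped for cLong because they leave CF untouched, and
// the high-word ADC/SBB sequence needs the carry out of the low word.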
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-06 04:11:49 +02:00
|
|
|
static const unsigned OpcodeTab[][5] = {
|
2003-06-05 20:28:55 +02:00
|
|
|
// Arithmetic operators
|
2004-04-06 04:11:49 +02:00
|
|
|
{ X86::ADD8ri, X86::ADD16ri, X86::ADD32ri, 0, X86::ADD32ri }, // ADD
|
|
|
|
{ X86::SUB8ri, X86::SUB16ri, X86::SUB32ri, 0, X86::SUB32ri }, // SUB
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2003-06-05 20:28:55 +02:00
|
|
|
// Bitwise operators
|
2004-04-06 04:11:49 +02:00
|
|
|
{ X86::AND8ri, X86::AND16ri, X86::AND32ri, 0, X86::AND32ri }, // AND
|
|
|
|
{ X86:: OR8ri, X86:: OR16ri, X86:: OR32ri, 0, X86:: OR32ri }, // OR
|
|
|
|
{ X86::XOR8ri, X86::XOR16ri, X86::XOR32ri, 0, X86::XOR32ri }, // XOR
|
2003-06-05 20:28:55 +02:00
|
|
|
};
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2003-06-05 20:28:55 +02:00
|
|
|
unsigned Opcode = OpcodeTab[OperatorClass][Class];
|
2004-04-06 05:15:53 +02:00
|
|
|
unsigned Op1l = cast<ConstantInt>(Op1C)->getRawValue();
|
2002-11-02 21:13:22 +01:00
|
|
|
|
2004-04-06 05:15:53 +02:00
|
|
|
if (Class != cLong) {
|
|
|
|
BuildMI(*MBB, IP, Opcode, 2, DestReg).addReg(Op0r).addImm(Op1l);
|
|
|
|
return;
|
2004-04-06 04:11:49 +02:00
|
|
|
}
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-11 23:23:56 +02:00
|
|
|
// If this is a long value and the high or low bits have a special
|
|
|
|
// property, emit some special cases.
|
|
|
|
unsigned Op1h = cast<ConstantInt>(Op1C)->getRawValue() >> 32LL;
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-11 23:23:56 +02:00
|
|
|
// If the constant is zero in the low 32-bits, just copy the low part
|
|
|
|
// across and apply the normal 32-bit operation to the high parts. There
|
|
|
|
// will be no carry or borrow into the top.
|
|
|
|
if (Op1l == 0) {
|
|
|
|
if (OperatorClass != 2) // All but and...
|
|
|
|
BuildMI(*MBB, IP, X86::MOV32rr, 1, DestReg).addReg(Op0r);
|
|
|
|
else
|
|
|
|
BuildMI(*MBB, IP, X86::MOV32ri, 1, DestReg).addImm(0);
|
|
|
|
BuildMI(*MBB, IP, OpcodeTab[OperatorClass][cLong], 2, DestReg+1)
|
|
|
|
.addReg(Op0r+1).addImm(Op1h);
|
2004-04-11 22:26:20 +02:00
|
|
|
return;
|
|
|
|
}
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-11 23:23:56 +02:00
|
|
|
// If this is a logical operation and the top 32-bits are zero, just
|
|
|
|
// operate on the lower 32.
|
|
|
|
if (Op1h == 0 && OperatorClass > 1) {
|
|
|
|
BuildMI(*MBB, IP, OpcodeTab[OperatorClass][cLong], 2, DestReg)
|
|
|
|
.addReg(Op0r).addImm(Op1l);
|
|
|
|
if (OperatorClass != 2) // All but and
|
|
|
|
BuildMI(*MBB, IP, X86::MOV32rr, 1, DestReg+1).addReg(Op0r+1);
|
|
|
|
else
|
|
|
|
BuildMI(*MBB, IP, X86::MOV32ri, 1, DestReg+1).addImm(0);
|
|
|
|
return;
|
2004-04-11 22:26:20 +02:00
|
|
|
}
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-11 23:23:56 +02:00
|
|
|
// TODO: We could handle lots of other special cases here, such as AND'ing
|
|
|
|
// with 0xFFFFFFFF00000000 -> noop, etc.
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-11 23:23:56 +02:00
|
|
|
// Otherwise, code generate the full operation with a constant.
|
|
|
|
static const unsigned TopTab[] = {
|
|
|
|
X86::ADC32ri, X86::SBB32ri, X86::AND32ri, X86::OR32ri, X86::XOR32ri
|
|
|
|
};
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-11 23:23:56 +02:00
|
|
|
BuildMI(*MBB, IP, Opcode, 2, DestReg).addReg(Op0r).addImm(Op1l);
|
|
|
|
BuildMI(*MBB, IP, TopTab[OperatorClass], 2, DestReg+1)
|
|
|
|
.addReg(Op0r+1).addImm(Op1h);
|
|
|
|
return;
|
|
|
|
}
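The instruction pair just emitted is the usual two-word add, shown here for ADD/ADC; SUB/SBB borrows the same way, and the logical ops need no carry at all. A self-contained sketch of what it computes (illustrative helper, not in the source):
#include <cstdint>
uint64_t add64(uint64_t A, uint64_t B) {
  uint32_t Lo = (uint32_t)A + (uint32_t)B;
  uint32_t Carry = Lo < (uint32_t)A;                               // the CF produced by ADD
  uint32_t Hi = (uint32_t)(A >> 32) + (uint32_t)(B >> 32) + Carry; // ADC
  return ((uint64_t)Hi << 32) | Lo;
}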
|
2004-04-11 22:26:20 +02:00
|
|
|
|
2004-03-08 02:18:36 +01:00
|
|
|
// Finally, handle the general case now.
|
2004-04-06 04:13:25 +02:00
|
|
|
static const unsigned OpcodeTab[][5] = {
|
2003-10-19 23:09:10 +02:00
|
|
|
// Arithmetic operators
|
2004-04-11 23:23:56 +02:00
|
|
|
{ X86::ADD8rr, X86::ADD16rr, X86::ADD32rr, 0, X86::ADD32rr }, // ADD
|
|
|
|
{ X86::SUB8rr, X86::SUB16rr, X86::SUB32rr, 0, X86::SUB32rr }, // SUB
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2003-10-19 23:09:10 +02:00
|
|
|
// Bitwise operators
|
2004-04-06 04:11:49 +02:00
|
|
|
{ X86::AND8rr, X86::AND16rr, X86::AND32rr, 0, X86::AND32rr }, // AND
|
|
|
|
{ X86:: OR8rr, X86:: OR16rr, X86:: OR32rr, 0, X86:: OR32rr }, // OR
|
|
|
|
{ X86::XOR8rr, X86::XOR16rr, X86::XOR32rr, 0, X86::XOR32rr }, // XOR
|
2003-10-19 23:09:10 +02:00
|
|
|
};
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2003-10-19 23:09:10 +02:00
|
|
|
unsigned Opcode = OpcodeTab[OperatorClass][Class];
|
2004-03-08 02:18:36 +01:00
|
|
|
unsigned Op0r = getReg(Op0, MBB, IP);
|
|
|
|
unsigned Op1r = getReg(Op1, MBB, IP);
|
|
|
|
BuildMI(*MBB, IP, Opcode, 2, DestReg).addReg(Op0r).addReg(Op1r);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-06 04:11:49 +02:00
|
|
|
if (Class == cLong) { // Handle the upper 32 bits of long values...
|
2004-03-08 02:18:36 +01:00
|
|
|
static const unsigned TopTab[] = {
|
|
|
|
X86::ADC32rr, X86::SBB32rr, X86::AND32rr, X86::OR32rr, X86::XOR32rr
|
|
|
|
};
|
|
|
|
BuildMI(*MBB, IP, TopTab[OperatorClass], 2,
|
|
|
|
DestReg+1).addReg(Op0r+1).addReg(Op1r+1);
|
|
|
|
}
|
2002-11-02 21:04:26 +01:00
|
|
|
}
|
|
|
|
|
2003-01-13 01:32:26 +01:00
|
|
|
/// doMultiply - Emit appropriate instructions to multiply together the
|
|
|
|
/// registers op0Reg and op1Reg, and put the result in DestReg. The type of the
|
|
|
|
/// result should be given as DestTy.
|
|
|
|
///
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::doMultiply(MachineBasicBlock *MBB,
|
|
|
|
MachineBasicBlock::iterator MBBI,
|
|
|
|
unsigned DestReg, const Type *DestTy,
|
|
|
|
unsigned op0Reg, unsigned op1Reg) {
|
2003-01-13 01:32:26 +01:00
|
|
|
unsigned Class = getClass(DestTy);
|
2002-12-25 06:13:53 +01:00
|
|
|
switch (Class) {
|
2003-06-21 19:16:58 +02:00
|
|
|
case cInt:
|
|
|
|
case cShort:
|
A big X86 instruction rename. The instructions are renamed to make
their names more descriptive. A name consists of the base name, a
default operand size followed by a character per operand with an
optional special size. For example:
ADD8rr -> add, 8-bit register, 8-bit register
IMUL16rmi -> imul, 16-bit register, 16-bit memory, 16-bit immediate
IMUL16rmi8 -> imul, 16-bit register, 16-bit memory, 8-bit immediate
MOVSX32rm16 -> movsx, 32-bit register, 16-bit memory
llvm-svn: 11995
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*MBB, MBBI, Class == cInt ? X86::IMUL32rr:X86::IMUL16rr, 2, DestReg)
|
2003-06-21 19:16:58 +02:00
|
|
|
.addReg(op0Reg).addReg(op1Reg);
|
|
|
|
return;
|
|
|
|
case cByte:
|
|
|
|
// Must use the MUL instruction, which forces use of AL...
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*MBB, MBBI, X86::MOV8rr, 1, X86::AL).addReg(op0Reg);
|
|
|
|
BuildMI(*MBB, MBBI, X86::MUL8r, 1).addReg(op1Reg);
|
|
|
|
BuildMI(*MBB, MBBI, X86::MOV8rr, 1, DestReg).addReg(X86::AL);
|
2003-06-21 19:16:58 +02:00
|
|
|
return;
|
2002-12-25 06:13:53 +01:00
|
|
|
default:
|
2003-01-13 01:32:26 +01:00
|
|
|
case cLong: assert(0 && "doMultiply cannot operate on LONG values!");
|
2002-12-25 06:13:53 +01:00
|
|
|
}
|
This checkin is brought to you by the brian gaeke allnighter fund.
(lib/Target/X86) InstSelectSimple.cpp:
Include llvm/DerivedTypes.h and iostream.
Refactor visitMul out into a wrapper around doMultiply(), so that we
can do multiplications on temporary values when we are doing
getelementptrs.
Refactor part of getReg out into makeAnotherReg, so that we can create
registers willy-nilly to hold temporary values, when we are doing
getelementptrs.
Add stub implementations of visitMallocInst and visitAllocaInst.
Add initial implementation of visitGetElementPtrInst.
In copyConstantToRegister:
We throw a *lot* of our asserts here. So, when we want to throw an
assert, print out to stderr whatever expr or whatever constant made
us barf.
Support copying ConstantPointerNull to register, using a move immediate
of zero.
Rename FLDr4 and FLDr8 to FLDr32 and FLDr64, so that they match the meanings
of the numbers in the other instruction names. All uses modified.
Teach visitCallInst to extract byte- and short-class return values
from subregs of EAX. Add a FIXME note about how we would do it for
float-class return values.
Add a FIXME note about how we would cast float to int and back.
X86InstrInfo.def:
Rename FLDr4 and FLDr8 to FLDr32 and FLDr64, so that they match the meanings
of the numbers in the other instruction names. All uses modified.
(tools/jello) GlobalVars.cpp:
Include iostream.
If we have to emit a floating-point constant to memory, gamble and use
the same method as for ints.
If we have to emit a ConstantPointerNull to memory, try using a "void *"
and "NULL".
Otherwise, if we are going to throw an assert, print out whatever constant
made us barf, first.
llvm-svn: 4973
2002-12-12 16:33:40 +01:00
|
|
|
}
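The cByte case is dictated by the ISA rather than the selector: 8-bit MUL is fixed-form (AX = AL * r/m8), hence the copies through AL. A portable restatement of the result it produces (illustrative only):
unsigned char mul8(unsigned char A, unsigned char B) {
  return (unsigned char)(A * B);   // MOV AL,A ; MUL B ; MOV Dest,AL
}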
|
|
|
|
|
2003-10-19 23:09:10 +02:00
|
|
|
// ExactLog2 - This function solves for (Val == 1 << (N-1)) and returns N. It
|
|
|
|
// returns zero when the input is not exactly a power of two.
|
|
|
|
static unsigned ExactLog2(unsigned Val) {
|
Improve signed division by power of 2 *dramatically* from this:
div:
mov %EDX, DWORD PTR [%ESP + 4]
mov %ECX, 64
mov %EAX, %EDX
sar %EDX, 31
idiv %ECX
ret
to this:
div:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, %EAX
sar %ECX, 5
shr %ECX, 26
mov %EDX, %EAX
add %EDX, %ECX
sar %EAX, 6
ret
Note that the intel compiler is currently making this:
div:
movl 4(%esp), %edx #3.5
movl %edx, %eax #4.14
sarl $5, %eax #4.14
shrl $26, %eax #4.14
addl %edx, %eax #4.14
sarl $6, %eax #4.14
ret #4.14
Which has one less register->register copy. (hint hint alkis :)
llvm-svn: 13354
2004-05-04 21:33:58 +02:00
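The faster sequence in the commit message above is the standard bias trick for signed division by 2^K, which must round toward zero: negative dividends are pre-biased by 2^K - 1 before the arithmetic shift. A sketch (assumes arithmetic right shift of negative ints, implementation-defined in C++ but true of x86 compilers; valid for K in 1..31):
#include <cstdint>
int32_t sdivPow2(int32_t X, unsigned K) {               // computes X / (1 << K)
  uint32_t Bias = (uint32_t)(X >> (K - 1)) >> (32 - K); // 2^K - 1 if X < 0, else 0
  return (X + (int32_t)Bias) >> K;                      // the sar/shr/add/sar sequence
}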
|
|
|
if (Val == 0 || (Val & (Val-1))) return 0;
|
2003-10-19 23:09:10 +02:00
|
|
|
unsigned Count = 0;
|
|
|
|
while (Val != 1) {
|
|
|
|
Val >>= 1;
|
|
|
|
++Count;
|
|
|
|
}
|
|
|
|
return Count+1;
|
|
|
|
}
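A quick sanity check of the off-by-one convention (a hypothetical test, not in the source); callers below compensate with "Shift-1":
#include <cassert>
void checkExactLog2() {
  assert(ExactLog2(1) == 1);    // 1 == 1 << 0
  assert(ExactLog2(8) == 4);    // 8 == 1 << 3
  assert(ExactLog2(12) == 0);   // not a power of two
}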
|
|
|
|
|
2004-04-11 22:56:28 +02:00
|
|
|
|
|
|
|
/// doMultiplyConst - This function is specialized to efficiently codegen an 8,
|
|
|
|
/// 16, or 32-bit integer multiply by a constant.
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::doMultiplyConst(MachineBasicBlock *MBB,
|
|
|
|
MachineBasicBlock::iterator IP,
|
|
|
|
unsigned DestReg, const Type *DestTy,
|
|
|
|
unsigned op0Reg, unsigned ConstRHS) {
|
Handle various other important cases of multiplying a long constant immediate. For
example, multiplying X*(1 + (1LL << 32)) now produces:
test:
mov %ECX, DWORD PTR [%ESP + 4]
mov %EDX, DWORD PTR [%ESP + 8]
mov %EAX, %ECX
add %EDX, %ECX
ret
[[[Note to Alkis: why isn't linear scan generating this code?? This might be a
problem with your intervals being too conservative:
test:
mov %EAX, DWORD PTR [%ESP + 4]
mov %EDX, DWORD PTR [%ESP + 8]
add %EDX, %EAX
ret
end note]]]
Whereas GCC produces this:
T:
sub %esp, 12
mov %edx, DWORD PTR [%esp+16]
mov DWORD PTR [%esp+8], %edi
mov %ecx, DWORD PTR [%esp+20]
xor %edi, %edi
mov DWORD PTR [%esp], %ebx
mov %ebx, %edi
mov %eax, %edx
mov DWORD PTR [%esp+4], %esi
add %ebx, %edx
mov %edi, DWORD PTR [%esp+8]
lea %edx, [%ecx+%ebx]
mov %esi, DWORD PTR [%esp+4]
mov %ebx, DWORD PTR [%esp]
add %esp, 12
ret
I'm not sure what GCC is smoking here, but it looks like it has just
confused itself with a bunch of stack slots or something. The intel compiler
is better, but still not good:
T:
movl 4(%esp), %edx #2.11
movl 8(%esp), %eax #2.11
lea (%eax,%edx), %ecx #3.12
movl $1, %eax #3.12
mull %edx #3.12
addl %ecx, %edx #3.12
ret #3.12
llvm-svn: 12693
2004-04-06 06:55:43 +02:00
|
|
|
static const unsigned MOVrrTab[] = {X86::MOV8rr, X86::MOV16rr, X86::MOV32rr};
|
|
|
|
static const unsigned MOVriTab[] = {X86::MOV8ri, X86::MOV16ri, X86::MOV32ri};
|
2004-05-04 17:47:14 +02:00
|
|
|
static const unsigned ADDrrTab[] = {X86::ADD8rr, X86::ADD16rr, X86::ADD32rr};
|
2004-07-20 01:47:21 +02:00
|
|
|
static const unsigned NEGrTab[] = {X86::NEG8r , X86::NEG16r , X86::NEG32r };
|
2004-04-06 06:55:43 +02:00
|
|
|
|
2003-10-19 23:09:10 +02:00
|
|
|
unsigned Class = getClass(DestTy);
|
2004-07-20 01:47:21 +02:00
|
|
|
unsigned TmpReg;
|
2003-10-19 23:09:10 +02:00
|
|
|
|
2004-05-04 17:47:14 +02:00
|
|
|
// Handle special cases here.
|
|
|
|
switch (ConstRHS) {
|
2004-07-20 01:47:21 +02:00
|
|
|
case -2:
|
|
|
|
TmpReg = makeAnotherReg(DestTy);
|
|
|
|
BuildMI(*MBB, IP, NEGrTab[Class], 1, TmpReg).addReg(op0Reg);
|
|
|
|
BuildMI(*MBB, IP, ADDrrTab[Class], 2, DestReg).addReg(TmpReg).addReg(TmpReg);
|
|
|
|
return;
|
|
|
|
case -1:
|
|
|
|
BuildMI(*MBB, IP, NEGrTab[Class], 1, DestReg).addReg(op0Reg);
|
|
|
|
return;
|
2004-05-04 17:47:14 +02:00
|
|
|
case 0:
|
2004-04-06 06:55:43 +02:00
|
|
|
BuildMI(*MBB, IP, MOVriTab[Class], 1, DestReg).addImm(0);
|
|
|
|
return;
|
2004-05-04 17:47:14 +02:00
|
|
|
case 1:
|
2004-04-06 06:55:43 +02:00
|
|
|
BuildMI(*MBB, IP, MOVrrTab[Class], 1, DestReg).addReg(op0Reg);
|
|
|
|
return;
|
2004-05-04 17:47:14 +02:00
|
|
|
case 2:
|
|
|
|
BuildMI(*MBB, IP, ADDrrTab[Class], 2, DestReg).addReg(op0Reg).addReg(op0Reg);
|
|
|
|
return;
|
|
|
|
case 3:
|
|
|
|
case 5:
|
|
|
|
case 9:
|
|
|
|
if (Class == cInt) {
|
2004-08-30 02:13:26 +02:00
|
|
|
X86AddressMode AM;
|
|
|
|
AM.BaseType = X86AddressMode::RegBase;
|
|
|
|
AM.Base.Reg = op0Reg;
|
|
|
|
AM.Scale = ConstRHS-1;
|
|
|
|
AM.IndexReg = op0Reg;
|
|
|
|
AM.Disp = 0;
|
|
|
|
addFullAddress(BuildMI(*MBB, IP, X86::LEA32r, 5, DestReg), AM);
|
2004-05-04 17:47:14 +02:00
|
|
|
return;
|
|
|
|
}
|
2004-07-20 01:47:21 +02:00
|
|
|
case -3:
|
|
|
|
case -5:
|
|
|
|
case -9:
|
|
|
|
if (Class == cInt) {
|
|
|
|
TmpReg = makeAnotherReg(DestTy);
|
2004-08-30 02:13:26 +02:00
|
|
|
X86AddressMode AM;
|
|
|
|
AM.BaseType = X86AddressMode::RegBase;
|
|
|
|
AM.Base.Reg = op0Reg;
|
|
|
|
AM.Scale = -ConstRHS-1;
|
|
|
|
AM.IndexReg = op0Reg;
|
|
|
|
AM.Disp = 0;
|
|
|
|
addFullAddress(BuildMI(*MBB, IP, X86::LEA32r, 5, TmpReg), AM);
|
2004-07-20 01:47:21 +02:00
|
|
|
BuildMI(*MBB, IP, NEGrTab[Class], 1, DestReg).addReg(TmpReg);
|
|
|
|
return;
|
|
|
|
}
|
2004-04-06 06:55:43 +02:00
|
|
|
}
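Collected in one place, the special cases above amount to these strength reductions (a hypothetical summary helper, 32-bit unsigned wraparound assumed; -3/-5/-9 use the LEA form followed by NEG):
#include <cstdint>
uint32_t mulConstSketch(uint32_t X, int C) {
  switch (C) {
  case -2: { uint32_t T = -X; return T + T; }   // NEG, then ADD
  case -1: return -X;                           // NEG
  case  0: return 0;                            // MOV ri 0
  case  1: return X;                            // MOV rr
  case  2: return X + X;                        // ADD rr
  case  3: return X + 2*X;                      // LEA [X + 2*X]
  case  5: return X + 4*X;                      // LEA [X + 4*X]
  case  9: return X + 8*X;                      // LEA [X + 8*X]
  default: return X * (uint32_t)C;              // IMUL fallback
  }
}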
|
|
|
|
|
2003-10-19 23:09:10 +02:00
|
|
|
// If the element size is exactly a power of 2, use a shift to get it.
|
|
|
|
if (unsigned Shift = ExactLog2(ConstRHS)) {
|
|
|
|
switch (Class) {
|
|
|
|
default: assert(0 && "Unknown class for this function!");
|
|
|
|
case cByte:
|
2004-07-20 01:47:21 +02:00
|
|
|
BuildMI(*MBB, IP, X86::SHL8ri,2, DestReg).addReg(op0Reg).addImm(Shift-1);
|
2003-10-19 23:09:10 +02:00
|
|
|
return;
|
|
|
|
case cShort:
|
2004-07-20 01:47:21 +02:00
|
|
|
BuildMI(*MBB, IP, X86::SHL16ri,2, DestReg).addReg(op0Reg).addImm(Shift-1);
|
2003-10-19 23:09:10 +02:00
|
|
|
return;
|
|
|
|
case cInt:
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*MBB, IP, X86::SHL32ri,2, DestReg).addReg(op0Reg).addImm(Shift-1);
|
2003-10-19 23:09:10 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
2004-07-20 01:47:21 +02:00
|
|
|
|
|
|
|
// If the element size is a negative power of 2, use a shift/neg to get it.
|
|
|
|
if (unsigned Shift = ExactLog2(-ConstRHS)) {
|
|
|
|
TmpReg = makeAnotherReg(DestTy);
|
|
|
|
BuildMI(*MBB, IP, NEGrTab[Class], 1, TmpReg).addReg(op0Reg);
|
|
|
|
switch (Class) {
|
|
|
|
default: assert(0 && "Unknown class for this function!");
|
|
|
|
case cByte:
|
|
|
|
BuildMI(*MBB, IP, X86::SHL8ri,2, DestReg).addReg(TmpReg).addImm(Shift-1);
|
|
|
|
return;
|
|
|
|
case cShort:
|
|
|
|
BuildMI(*MBB, IP, X86::SHL16ri,2, DestReg).addReg(TmpReg).addImm(Shift-1);
|
|
|
|
return;
|
|
|
|
case cInt:
|
|
|
|
BuildMI(*MBB, IP, X86::SHL32ri,2, DestReg).addReg(TmpReg).addImm(Shift-1);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
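The two shift blocks above rely on x * 2^K == x << K and x * -(2^K) == (-x) << K; as code (illustrative, wraparound semantics of unsigned assumed):
#include <cstdint>
uint32_t mulPow2(uint32_t X, unsigned K)    { return X << K; }                   // SHL
uint32_t mulNegPow2(uint32_t X, unsigned K) { uint32_t T = -X; return T << K; }  // NEG, then SHL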
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2003-10-20 05:42:58 +02:00
|
|
|
if (Class == cShort) {
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*MBB, IP, X86::IMUL16rri,2,DestReg).addReg(op0Reg).addImm(ConstRHS);
|
2003-10-20 05:42:58 +02:00
|
|
|
return;
|
|
|
|
} else if (Class == cInt) {
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*MBB, IP, X86::IMUL32rri,2,DestReg).addReg(op0Reg).addImm(ConstRHS);
|
2003-10-20 05:42:58 +02:00
|
|
|
return;
|
|
|
|
}
|
2003-10-19 23:09:10 +02:00
|
|
|
|
|
|
|
// Most general case, emit a normal multiply...
|
2004-07-20 01:47:21 +02:00
|
|
|
TmpReg = makeAnotherReg(DestTy);
|
2004-02-29 08:22:16 +01:00
|
|
|
BuildMI(*MBB, IP, MOVriTab[Class], 1, TmpReg).addImm(ConstRHS);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2003-10-19 23:09:10 +02:00
|
|
|
// Emit a MUL to multiply op0Reg by the constant now held in TmpReg,
|
|
|
|
// putting the result in DestReg.
|
|
|
|
doMultiply(MBB, IP, DestReg, DestTy, op0Reg, TmpReg);
|
|
|
|
}
|
|
|
|
|
2002-12-12 16:33:40 +01:00
|
|
|
/// visitMul - Multiplies are not simple binary operators because they must deal
|
|
|
|
/// with the EAX register explicitly.
|
|
|
|
///
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::visitMul(BinaryOperator &I) {
|
2004-04-11 22:56:28 +02:00
|
|
|
unsigned ResultReg = getReg(I);
|
|
|
|
|
2004-04-12 00:05:45 +02:00
|
|
|
Value *Op0 = I.getOperand(0);
|
|
|
|
Value *Op1 = I.getOperand(1);
|
|
|
|
|
|
|
|
// Fold loads into floating point multiplies.
|
|
|
|
if (getClass(Op0->getType()) == cFP) {
|
|
|
|
if (isa<LoadInst>(Op0) && !isa<LoadInst>(Op1))
|
|
|
|
if (!I.swapOperands())
|
|
|
|
std::swap(Op0, Op1); // Make sure any loads are in the RHS.
|
|
|
|
if (LoadInst *LI = dyn_cast<LoadInst>(Op1))
|
|
|
|
if (isSafeToFoldLoadIntoInstruction(*LI, I)) {
|
|
|
|
const Type *Ty = Op0->getType();
|
|
|
|
assert((Ty == Type::FloatTy || Ty == Type::DoubleTy) && "Unknown FP type!");
|
|
|
|
unsigned Opcode = Ty == Type::FloatTy ? X86::FMUL32m : X86::FMUL64m;
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-12 00:05:45 +02:00
|
|
|
unsigned Op0r = getReg(Op0);
|
2004-05-13 17:12:43 +02:00
|
|
|
if (AllocaInst *AI = dyn_castFixedAlloca(LI->getOperand(0))) {
|
|
|
|
unsigned FI = getFixedSizedAllocaFI(AI);
|
|
|
|
addFrameReference(BuildMI(BB, Opcode, 5, ResultReg).addReg(Op0r), FI);
|
|
|
|
} else {
|
2004-08-30 02:13:26 +02:00
|
|
|
X86AddressMode AM;
|
|
|
|
getAddressingMode(LI->getOperand(0), AM);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-08-30 02:13:26 +02:00
|
|
|
addFullAddress(BuildMI(BB, Opcode, 5, ResultReg).addReg(Op0r), AM);
|
2004-05-13 17:12:43 +02:00
|
|
|
}
|
2004-04-12 00:05:45 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2004-04-11 22:56:28 +02:00
|
|
|
MachineBasicBlock::iterator IP = BB->end();
|
2004-04-12 00:05:45 +02:00
|
|
|
emitMultiply(BB, IP, Op0, Op1, ResultReg);
|
2004-04-11 22:56:28 +02:00
|
|
|
}
|
|
|
|
|
2005-04-22 01:38:14 +02:00
|
|
|
void X86ISel::emitMultiply(MachineBasicBlock *MBB,
|
2004-09-21 20:21:21 +02:00
|
|
|
MachineBasicBlock::iterator IP,
|
|
|
|
Value *Op0, Value *Op1, unsigned DestReg) {
|
2004-04-11 22:56:28 +02:00
|
|
|
MachineBasicBlock &BB = *MBB;
|
|
|
|
TypeClass Class = getClass(Op0->getType());
|
2003-01-13 01:32:26 +01:00
|
|
|
|
|
|
|
// Simple scalar multiply?
|
2004-04-11 23:09:14 +02:00
|
|
|
unsigned Op0Reg = getReg(Op0, &BB, IP);
|
2004-04-11 22:56:28 +02:00
|
|
|
switch (Class) {
|
|
|
|
case cByte:
|
|
|
|
case cShort:
|
|
|
|
case cInt:
|
|
|
|
if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
|
|
|
|
unsigned Val = (unsigned)CI->getRawValue(); // Isn't a 64-bit constant
|
|
|
|
doMultiplyConst(&BB, IP, DestReg, Op0->getType(), Op0Reg, Val);
|
2003-10-19 23:09:10 +02:00
|
|
|
} else {
|
2004-04-11 22:56:28 +02:00
|
|
|
unsigned Op1Reg = getReg(Op1, &BB, IP);
|
|
|
|
doMultiply(&BB, IP, DestReg, Op1->getType(), Op0Reg, Op1Reg);
|
2003-10-19 23:09:10 +02:00
|
|
|
}
|
2004-04-11 22:56:28 +02:00
|
|
|
return;
|
|
|
|
case cFP:
|
2004-04-11 23:23:56 +02:00
|
|
|
emitBinaryFPOperation(MBB, IP, Op0, Op1, 2, DestReg);
|
|
|
|
return;
|
2004-04-11 22:56:28 +02:00
|
|
|
case cLong:
|
|
|
|
break;
|
|
|
|
}
|
2004-04-06 06:55:43 +02:00
|
|
|
|
2004-04-11 22:56:28 +02:00
|
|
|
// Long value. We have to do things the hard way...
|
|
|
|
if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
|
|
|
|
unsigned CLow = CI->getRawValue();
|
|
|
|
unsigned CHi = CI->getRawValue() >> 32;
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-11 22:56:28 +02:00
|
|
|
if (CLow == 0) {
|
|
|
|
// If the low part of the constant is all zeros, things are simple.
|
|
|
|
BuildMI(BB, IP, X86::MOV32ri, 1, DestReg).addImm(0);
|
|
|
|
doMultiplyConst(&BB, IP, DestReg+1, Type::UIntTy, Op0Reg, CHi);
|
|
|
|
return;
|
|
|
|
}
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-11 22:56:28 +02:00
|
|
|
// Multiply the two low parts... capturing carry into EDX
|
|
|
|
unsigned OverflowReg = 0;
|
|
|
|
if (CLow == 1) {
|
|
|
|
BuildMI(BB, IP, X86::MOV32rr, 1, DestReg).addReg(Op0Reg);
|
Efficiently handle a long multiplication by a constant. For this testcase:
long %test(long %X) {
%Y = mul long %X, 123
ret long %Y
}
we used to generate:
test:
sub %ESP, 12
mov DWORD PTR [%ESP + 8], %ESI
mov DWORD PTR [%ESP + 4], %EDI
mov DWORD PTR [%ESP], %EBX
mov %ECX, DWORD PTR [%ESP + 16]
mov %ESI, DWORD PTR [%ESP + 20]
mov %EDI, 123
mov %EBX, 0
mov %EAX, %ECX
mul %EDI
imul %ESI, %EDI
add %ESI, %EDX
imul %ECX, %EBX
add %ESI, %ECX
mov %EDX, %ESI
mov %EBX, DWORD PTR [%ESP]
mov %EDI, DWORD PTR [%ESP + 4]
mov %ESI, DWORD PTR [%ESP + 8]
add %ESP, 12
ret
Now we emit:
test:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, DWORD PTR [%ESP + 8]
mov %EDX, 123
mul %EDX
imul %ECX, %ECX, 123
add %ECX, %EDX
mov %EDX, %ECX
ret
Which, incidentally, is substantially nicer than what GCC manages:
T:
sub %esp, 8
mov %eax, 123
mov DWORD PTR [%esp], %ebx
mov %ebx, DWORD PTR [%esp+16]
mov DWORD PTR [%esp+4], %esi
mov %esi, DWORD PTR [%esp+12]
imul %ecx, %ebx, 123
mov %ebx, DWORD PTR [%esp]
mul %esi
mov %esi, DWORD PTR [%esp+4]
add %esp, 8
lea %edx, [%ecx+%edx]
ret
llvm-svn: 12692
2004-04-06 06:29:36 +02:00
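What the constant-long path below computes, with the constant split as CHi:CLow and the operand as Hi:Lo (a portable sketch with illustrative names; the CLow == 0 and CLow == 1 branches are shortcuts through the same formula):
#include <cstdint>
uint64_t mulLongConst(uint32_t Lo, uint32_t Hi, uint32_t CLow, uint32_t CHi) {
  uint64_t P = (uint64_t)Lo * CLow;                 // MUL32r: product in EDX:EAX
  uint32_t RLo = (uint32_t)P;
  uint32_t RHi = (uint32_t)(P >> 32) + Hi * CLow + Lo * CHi;
  return ((uint64_t)RHi << 32) | RLo;
}
// For C = 123: CHi == 0, so RHi == Hi*123 + (carry of Lo*123), which is
// exactly the mul/imul/add trio shown in the commit message above.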
|
|
|
} else {
|
2004-04-11 22:56:28 +02:00
|
|
|
unsigned Op1RegL = makeAnotherReg(Type::UIntTy);
|
|
|
|
OverflowReg = makeAnotherReg(Type::UIntTy);
|
|
|
|
BuildMI(BB, IP, X86::MOV32ri, 1, Op1RegL).addImm(CLow);
|
|
|
|
BuildMI(BB, IP, X86::MOV32rr, 1, X86::EAX).addReg(Op0Reg);
|
|
|
|
BuildMI(BB, IP, X86::MUL32r, 1).addReg(Op1RegL); // AL*BL
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-11 22:56:28 +02:00
|
|
|
BuildMI(BB, IP, X86::MOV32rr, 1, DestReg).addReg(X86::EAX); // AL*BL
|
|
|
|
BuildMI(BB, IP, X86::MOV32rr, 1,
|
|
|
|
OverflowReg).addReg(X86::EDX); // AL*BL >> 32
|
|
|
|
}
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-11 22:56:28 +02:00
|
|
|
unsigned AHBLReg = makeAnotherReg(Type::UIntTy); // AH*BL
|
|
|
|
doMultiplyConst(&BB, IP, AHBLReg, Type::UIntTy, Op0Reg+1, CLow);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-11 22:56:28 +02:00
|
|
|
unsigned AHBLplusOverflowReg;
|
|
|
|
if (OverflowReg) {
|
|
|
|
AHBLplusOverflowReg = makeAnotherReg(Type::UIntTy);
|
|
|
|
BuildMI(BB, IP, X86::ADD32rr, 2, // AH*BL+(AL*BL >> 32)
|
2004-04-06 06:29:36 +02:00
|
|
|
AHBLplusOverflowReg).addReg(AHBLReg).addReg(OverflowReg);
|
2004-04-11 22:56:28 +02:00
|
|
|
} else {
|
|
|
|
AHBLplusOverflowReg = AHBLReg;
|
|
|
|
}
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-11 22:56:28 +02:00
|
|
|
if (CHi == 0) {
|
|
|
|
BuildMI(BB, IP, X86::MOV32rr, 1, DestReg+1).addReg(AHBLplusOverflowReg);
|
|
|
|
} else {
|
2004-04-06 06:29:36 +02:00
|
|
|
unsigned ALBHReg = makeAnotherReg(Type::UIntTy); // AL*BH
|
2004-04-11 22:56:28 +02:00
|
|
|
doMultiplyConst(&BB, IP, ALBHReg, Type::UIntTy, Op0Reg, CHi);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-11 22:56:28 +02:00
|
|
|
BuildMI(BB, IP, X86::ADD32rr, 2, // AL*BH + AH*BL + (AL*BL >> 32)
|
2004-04-06 06:29:36 +02:00
|
|
|
DestReg+1).addReg(AHBLplusOverflowReg).addReg(ALBHReg);
|
|
|
|
}
|
2004-04-11 22:56:28 +02:00
|
|
|
return;
|
2003-01-13 01:32:26 +01:00
|
|
|
}
|
2004-04-11 22:56:28 +02:00
|
|
|
|
|
|
|
// General 64x64 multiply
|
|
|
|
|
|
|
|
unsigned Op1Reg = getReg(Op1, &BB, IP);
|
|
|
|
// Multiply the two low parts... capturing carry into EDX
|
|
|
|
BuildMI(BB, IP, X86::MOV32rr, 1, X86::EAX).addReg(Op0Reg);
|
|
|
|
BuildMI(BB, IP, X86::MUL32r, 1).addReg(Op1Reg); // AL*BL
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-11 22:56:28 +02:00
|
|
|
unsigned OverflowReg = makeAnotherReg(Type::UIntTy);
|
|
|
|
BuildMI(BB, IP, X86::MOV32rr, 1, DestReg).addReg(X86::EAX); // AL*BL
|
|
|
|
BuildMI(BB, IP, X86::MOV32rr, 1,
|
|
|
|
OverflowReg).addReg(X86::EDX); // AL*BL >> 32
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-11 22:56:28 +02:00
|
|
|
unsigned AHBLReg = makeAnotherReg(Type::UIntTy); // AH*BL
|
|
|
|
BuildMI(BB, IP, X86::IMUL32rr, 2,
|
|
|
|
AHBLReg).addReg(Op0Reg+1).addReg(Op1Reg);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-11 22:56:28 +02:00
|
|
|
unsigned AHBLplusOverflowReg = makeAnotherReg(Type::UIntTy);
|
|
|
|
BuildMI(BB, IP, X86::ADD32rr, 2, // AH*BL+(AL*BL >> 32)
|
|
|
|
AHBLplusOverflowReg).addReg(AHBLReg).addReg(OverflowReg);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-11 22:56:28 +02:00
|
|
|
unsigned ALBHReg = makeAnotherReg(Type::UIntTy); // AL*BH
|
|
|
|
BuildMI(BB, IP, X86::IMUL32rr, 2,
|
|
|
|
ALBHReg).addReg(Op0Reg).addReg(Op1Reg+1);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-11 22:56:28 +02:00
|
|
|
BuildMI(BB, IP, X86::ADD32rr, 2, // AL*BH + AH*BL + (AL*BL >> 32)
|
|
|
|
DestReg+1).addReg(AHBLplusOverflowReg).addReg(ALBHReg);
|
2002-11-02 21:54:46 +01:00
|
|
|
}
|
|
|
|
|
2002-11-17 22:56:38 +01:00
|
|
|
|
2002-11-02 21:54:46 +01:00
|
|
|
/// visitDivRem - Handle division and remainder instructions... these
|
|
|
|
/// instructions both require the same instructions to be generated; they just
|
|
|
|
/// select the result from a different register. Note that both of these
|
|
|
|
/// instructions work differently for signed and unsigned operands.
|
|
|
|
///
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::visitDivRem(BinaryOperator &I) {
|
2003-10-23 19:21:43 +02:00
|
|
|
unsigned ResultReg = getReg(I);
|
2004-04-12 00:05:45 +02:00
|
|
|
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
|
|
|
|
|
|
|
|
// Fold loads into floating point divides.
|
|
|
|
if (getClass(Op0->getType()) == cFP) {
|
|
|
|
if (LoadInst *LI = dyn_cast<LoadInst>(Op1))
|
|
|
|
if (isSafeToFoldLoadIntoInstruction(*LI, I)) {
|
|
|
|
const Type *Ty = Op0->getType();
|
|
|
|
assert((Ty == Type::FloatTy || Ty == Type::DoubleTy) && "Unknown FP type!");
|
|
|
|
unsigned Opcode = Ty == Type::FloatTy ? X86::FDIV32m : X86::FDIV64m;
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-12 00:05:45 +02:00
|
|
|
unsigned Op0r = getReg(Op0);
|
2004-05-13 17:12:43 +02:00
|
|
|
if (AllocaInst *AI = dyn_castFixedAlloca(LI->getOperand(0))) {
|
|
|
|
unsigned FI = getFixedSizedAllocaFI(AI);
|
|
|
|
addFrameReference(BuildMI(BB, Opcode, 5, ResultReg).addReg(Op0r), FI);
|
|
|
|
} else {
|
2004-08-30 02:13:26 +02:00
|
|
|
X86AddressMode AM;
|
|
|
|
getAddressingMode(LI->getOperand(0), AM);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-08-30 02:13:26 +02:00
|
|
|
addFullAddress(BuildMI(BB, Opcode, 5, ResultReg).addReg(Op0r), AM);
|
2004-05-13 17:12:43 +02:00
|
|
|
}
|
2004-04-12 00:05:45 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (LoadInst *LI = dyn_cast<LoadInst>(Op0))
|
|
|
|
if (isSafeToFoldLoadIntoInstruction(*LI, I)) {
|
|
|
|
const Type *Ty = Op0->getType();
|
|
|
|
assert((Ty == Type::FloatTy || Ty == Type::DoubleTy) && "Unknown FP type!");
|
|
|
|
unsigned Opcode = Ty == Type::FloatTy ? X86::FDIVR32m : X86::FDIVR64m;
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-04-12 00:05:45 +02:00
|
|
|
unsigned Op1r = getReg(Op1);
|
2004-05-13 17:12:43 +02:00
|
|
|
if (AllocaInst *AI = dyn_castFixedAlloca(LI->getOperand(0))) {
|
|
|
|
unsigned FI = getFixedSizedAllocaFI(AI);
|
|
|
|
addFrameReference(BuildMI(BB, Opcode, 5, ResultReg).addReg(Op1r), FI);
|
|
|
|
} else {
|
2004-08-30 02:13:26 +02:00
|
|
|
X86AddressMode AM;
|
|
|
|
getAddressingMode(LI->getOperand(0), AM);
|
|
|
|
addFullAddress(BuildMI(BB, Opcode, 5, ResultReg).addReg(Op1r), AM);
|
2004-05-13 17:12:43 +02:00
|
|
|
}
|
2004-04-12 00:05:45 +02:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2003-10-23 19:21:43 +02:00
|
|
|
|
|
|
|
MachineBasicBlock::iterator IP = BB->end();
|
2004-04-12 00:05:45 +02:00
|
|
|
emitDivRemOperation(BB, IP, Op0, Op1,
|
2004-04-11 22:56:28 +02:00
|
|
|
I.getOpcode() == Instruction::Div, ResultReg);
|
2003-10-23 19:21:43 +02:00
|
|
|
}
|
2002-12-25 06:13:53 +01:00
|
|
|
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::emitDivRemOperation(MachineBasicBlock *BB,
|
|
|
|
MachineBasicBlock::iterator IP,
|
|
|
|
Value *Op0, Value *Op1, bool isDiv,
|
|
|
|
unsigned ResultReg) {
|
2004-04-11 23:09:14 +02:00
|
|
|
const Type *Ty = Op0->getType();
|
|
|
|
unsigned Class = getClass(Ty);
|
2002-12-25 06:13:53 +01:00
|
|
|
switch (Class) {
|
2003-01-13 01:32:26 +01:00
|
|
|
case cFP: // Floating point divide
|
2003-10-23 19:21:43 +02:00
|
|
|
if (isDiv) {
|
2004-04-11 23:23:56 +02:00
|
|
|
emitBinaryFPOperation(BB, IP, Op0, Op1, 3, ResultReg);
|
|
|
|
return;
|
2003-08-04 04:12:48 +02:00
|
|
|
} else { // Floating point remainder...
|
2004-04-11 22:56:28 +02:00
|
|
|
unsigned Op0Reg = getReg(Op0, BB, IP);
|
|
|
|
unsigned Op1Reg = getReg(Op1, BB, IP);
|
2003-01-13 01:32:26 +01:00
|
|
|
MachineInstr *TheCall =
|
2003-10-23 18:22:08 +02:00
|
|
|
BuildMI(X86::CALLpcrel32, 1).addExternalSymbol("fmod", true);
|
2003-01-13 01:32:26 +01:00
|
|
|
std::vector<ValueRecord> Args;
|
2003-10-23 19:21:43 +02:00
|
|
|
Args.push_back(ValueRecord(Op0Reg, Type::DoubleTy));
|
|
|
|
Args.push_back(ValueRecord(Op1Reg, Type::DoubleTy));
|
2003-01-13 01:32:26 +01:00
|
|
|
doCall(ValueRecord(ResultReg, Type::DoubleTy), TheCall, Args);
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
case cLong: {
|
|
|
|
static const char *FnName[] =
|
|
|
|
{ "__moddi3", "__divdi3", "__umoddi3", "__udivdi3" };
|
2004-04-11 22:56:28 +02:00
|
|
|
unsigned Op0Reg = getReg(Op0, BB, IP);
|
|
|
|
unsigned Op1Reg = getReg(Op1, BB, IP);
|
2004-04-11 23:09:14 +02:00
|
|
|
unsigned NameIdx = Ty->isUnsigned()*2 + isDiv;
|
2003-01-13 01:32:26 +01:00
|
|
|
MachineInstr *TheCall =
|
|
|
|
BuildMI(X86::CALLpcrel32, 1).addExternalSymbol(FnName[NameIdx], true);
|
|
|
|
|
|
|
|
std::vector<ValueRecord> Args;
|
2003-10-23 19:21:43 +02:00
|
|
|
Args.push_back(ValueRecord(Op0Reg, Type::LongTy));
|
|
|
|
Args.push_back(ValueRecord(Op1Reg, Type::LongTy));
|
2003-01-13 01:32:26 +01:00
|
|
|
doCall(ValueRecord(ResultReg, Type::LongTy), TheCall, Args);
|
2002-12-25 06:13:53 +01:00
|
|
|
return;
|
2003-01-13 01:32:26 +01:00
|
|
|
}
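The NameIdx computation above packs two booleans into a two-bit table index: bit 0 selects divide over remainder, bit 1 selects the unsigned flavor. A minimal standalone check of that decoding, using the same libgcc names:

#include <cassert>
#include <cstring>

int main() {
  static const char *FnName[] =
    { "__moddi3", "__divdi3", "__umoddi3", "__udivdi3" };
  bool isUnsigned = true, isDiv = false;                  // unsigned 64-bit rem
  assert(std::strcmp(FnName[isUnsigned*2 + isDiv], "__umoddi3") == 0);
  assert(std::strcmp(FnName[0*2 + 1], "__divdi3") == 0);  // signed 64-bit div
  return 0;
}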
|
|
|
|
case cByte: case cShort: case cInt:
|
2003-10-10 19:57:28 +02:00
|
|
|
break; // Small integrals, handled below...
|
2003-01-13 01:32:26 +01:00
|
|
|
default: assert(0 && "Unknown class!");
|
2002-12-25 06:13:53 +01:00
|
|
|
}
|
2002-11-02 21:54:46 +01:00
|
|
|
|
A big X86 instruction rename. The instructions are renamed to make
their names more descriptive. A name consists of the base name, a
default operand size followed by a character per operand with an
optional special size. For example:
ADD8rr -> add, 8-bit register, 8-bit register
IMUL16rmi -> imul, 16-bit register, 16-bit memory, 16-bit immediate
IMUL16rmi8 -> imul, 16-bit register, 16-bit memory, 8-bit immediate
MOVSX32rm16 -> movsx, 32-bit register, 16-bit memory
llvm-svn: 11995
2004-02-29 09:50:03 +01:00
|
|
|
static const unsigned MovOpcode[]={ X86::MOV8rr, X86::MOV16rr, X86::MOV32rr };
|
Codegen signed mod by 2 or -2 more efficiently. Instead of generating:
t:
mov %EDX, DWORD PTR [%ESP + 4]
mov %ECX, 2
mov %EAX, %EDX
sar %EDX, 31
idiv %ECX
mov %EAX, %EDX
ret
Generate:
t:
mov %ECX, DWORD PTR [%ESP + 4]
*** mov %EAX, %ECX
cdq
and %ECX, 1
xor %ECX, %EDX
sub %ECX, %EDX
*** mov %EAX, %ECX
ret
Note that the two marked moves are redundant, and should be eliminated by the
register allocator, but aren't.
Compare this to GCC, which generates:
t:
mov %eax, DWORD PTR [%esp+4]
mov %edx, %eax
shr %edx, 31
lea %ecx, [%edx+%eax]
and %ecx, -2
sub %eax, %ecx
ret
or ICC 8.0, which generates:
t:
movl 4(%esp), %ecx #3.5
movl $-2147483647, %eax #3.25
imull %ecx #3.25
movl %ecx, %eax #3.25
sarl $31, %eax #3.25
addl %ecx, %edx #3.25
subl %edx, %eax #3.25
addl %eax, %eax #3.25
negl %eax #3.25
subl %eax, %ecx #3.25
movl %ecx, %eax #3.25
ret #3.25
We would be in great shape if not for the moves.
llvm-svn: 16763
2004-10-06 07:01:07 +02:00
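The sequence above is the branchless identity r = ((x & 1) ^ s) - s, with s the sign mask of x; since C's remainder truncates toward zero, x % -2 equals x % 2, so one code path covers both. A small sketch, assuming 32-bit int and an arithmetic right shift:

#include <cassert>

int srem2(int x) {
  int s = x >> 31;      // cdq: 0 for x >= 0, -1 for x < 0 (assumes arithmetic >>)
  int t = x & 1;        // and: magnitude of the remainder
  return (t ^ s) - s;   // xor/sub: negate t exactly when x is negative
}

int main() {
  assert(srem2(7) == 1 && srem2(-7) == -1 && srem2(-8) == 0);
  return 0;
}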
|
|
|
static const unsigned NEGOpcode[]={ X86::NEG8r, X86::NEG16r, X86::NEG32r };
|
Improve signed division by power of 2 *dramatically* from this:
div:
mov %EDX, DWORD PTR [%ESP + 4]
mov %ECX, 64
mov %EAX, %EDX
sar %EDX, 31
idiv %ECX
ret
to this:
div:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, %EAX
sar %ECX, 5
shr %ECX, 26
mov %EDX, %EAX
add %EDX, %ECX
sar %EAX, 6
ret
Note that the intel compiler is currently making this:
div:
movl 4(%esp), %edx #3.5
movl %edx, %eax #4.14
sarl $5, %eax #4.14
shrl $26, %eax #4.14
addl %edx, %eax #4.14
sarl $6, %eax #4.14
ret #4.14
Which has one less register->register copy. (hint hint alkis :)
llvm-svn: 13354
2004-05-04 21:33:58 +02:00
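What the sar/shr/add/sar sequence computes: truncated division needs a negative dividend biased by (1<<k)-1 before the final arithmetic shift, and the two leading shifts manufacture exactly that bias from the sign bit. A sketch, assuming 32-bit int, an arithmetic right shift, and 1 <= k <= 31:

#include <cassert>

int sdiv_pow2(int x, unsigned k) {
  unsigned bias = (unsigned)(x >> 31) >> (32 - k); // sar/shr: (1<<k)-1 if x<0, else 0
  return (x + (int)bias) >> k;                     // add + sar
}

int main() {
  assert(sdiv_pow2(65, 6) == 1 && sdiv_pow2(-65, 6) == -1);
  assert(sdiv_pow2(-1, 6) == 0);   // a plain sar alone would give -1
  return 0;
}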
|
|
|
static const unsigned SAROpcode[]={ X86::SAR8ri, X86::SAR16ri, X86::SAR32ri };
|
|
|
|
static const unsigned SHROpcode[]={ X86::SHR8ri, X86::SHR16ri, X86::SHR32ri };
|
|
|
|
static const unsigned ADDOpcode[]={ X86::ADD8rr, X86::ADD16rr, X86::ADD32rr };
|
|
|
|
|
|
|
|
// Special case signed division by power of 2.
|
2004-10-06 07:01:07 +02:00
|
|
|
if (ConstantSInt *CI = dyn_cast<ConstantSInt>(Op1))
|
|
|
|
if (isDiv) {
|
2004-05-04 21:33:58 +02:00
|
|
|
assert(Class != cLong && "This doesn't handle 64-bit divides!");
|
|
|
|
int V = CI->getValue();
|
|
|
|
|
|
|
|
if (V == 1) { // X /s 1 => X
|
|
|
|
unsigned Op0Reg = getReg(Op0, BB, IP);
|
|
|
|
BuildMI(*BB, IP, MovOpcode[Class], 1, ResultReg).addReg(Op0Reg);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (V == -1) { // X /s -1 => -X
|
|
|
|
unsigned Op0Reg = getReg(Op0, BB, IP);
|
|
|
|
BuildMI(*BB, IP, NEGOpcode[Class], 1, ResultReg).addReg(Op0Reg);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
Codegen signed divides by 2 and -2 more efficiently. In particular
instead of:
s: ;; X / 2
movl 4(%esp), %eax
movl %eax, %ecx
shrl $31, %ecx
movl %eax, %edx
addl %ecx, %edx
sarl $1, %eax
ret
t: ;; X / -2
movl 4(%esp), %eax
movl %eax, %ecx
shrl $31, %ecx
movl %eax, %edx
addl %ecx, %edx
sarl $1, %eax
negl %eax
ret
Emit:
s:
movl 4(%esp), %eax
cmpl $-2147483648, %eax
sbbl $-1, %eax
sarl $1, %eax
ret
t:
movl 4(%esp), %eax
cmpl $-2147483648, %eax
sbbl $-1, %eax
sarl $1, %eax
negl %eax
ret
llvm-svn: 16760
2004-10-06 06:02:39 +02:00
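The cmp/sbb pair is a branchless "add 1 if negative": cmp against -2147483648 sets the carry flag exactly when the operand's sign bit is clear (it is an unsigned compare against 0x80000000), and sbb ..., -1 computes x + 1 - CF. Equivalent C++, assuming 32-bit int with an arithmetic right shift:

#include <cassert>

int sdiv2(int x) {
  return (x + (x < 0)) >> 1;   // cmp/sbb supply the +1, sar does the divide
}

int main() {
  assert(sdiv2(7) == 3 && sdiv2(-7) == -3 && sdiv2(-1) == 0);
  return 0;
}

For X / -2 the emitted code negates this result afterwards, matching the t: listing.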
|
|
|
if (V == 2 || V == -2) { // X /s 2
|
|
|
|
static const unsigned CMPOpcode[] = {
|
|
|
|
X86::CMP8ri, X86::CMP16ri, X86::CMP32ri
|
|
|
|
};
|
|
|
|
static const unsigned SBBOpcode[] = {
|
|
|
|
X86::SBB8ri, X86::SBB16ri, X86::SBB32ri
|
|
|
|
};
|
|
|
|
unsigned Op0Reg = getReg(Op0, BB, IP);
|
|
|
|
unsigned SignBit = 1 << (CI->getType()->getPrimitiveSize()*8-1);
|
|
|
|
BuildMI(*BB, IP, CMPOpcode[Class], 2).addReg(Op0Reg).addImm(SignBit);
|
|
|
|
|
|
|
|
unsigned TmpReg = makeAnotherReg(Op0->getType());
|
|
|
|
BuildMI(*BB, IP, SBBOpcode[Class], 2, TmpReg).addReg(Op0Reg).addImm(-1);
|
|
|
|
|
|
|
|
unsigned TmpReg2 = V == 2 ? ResultReg : makeAnotherReg(Op0->getType());
|
|
|
|
BuildMI(*BB, IP, SAROpcode[Class], 2, TmpReg2).addReg(TmpReg).addImm(1);
|
|
|
|
if (V == -2) {
|
|
|
|
BuildMI(*BB, IP, NEGOpcode[Class], 1, ResultReg).addReg(TmpReg2);
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2004-05-04 21:33:58 +02:00
|
|
|
bool isNeg = false;
|
|
|
|
if (V < 0) { // Not a positive power of 2?
|
|
|
|
V = -V;
|
|
|
|
isNeg = true; // Maybe it's a negative power of 2.
|
|
|
|
}
|
|
|
|
if (unsigned Log = ExactLog2(V)) {
|
|
|
|
--Log;
|
|
|
|
unsigned Op0Reg = getReg(Op0, BB, IP);
|
|
|
|
unsigned TmpReg = makeAnotherReg(Op0->getType());
|
Fix a scary bug with signed division by a power of two. We used to generate:
s: ;; X / 4
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, %EAX
sar %ECX, 1
shr %ECX, 30
mov %EDX, %EAX
add %EDX, %ECX
sar %EAX, 2
ret
When we really meant:
s:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, %EAX
sar %ECX, 1
shr %ECX, 30
add %EAX, %ECX
sar %EAX, 2
ret
Hey, this also reduces register pressure too :)
llvm-svn: 16761
2004-10-06 06:19:43 +02:00
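A concrete value shows why the bias must feed the register that finally gets shifted (assuming an arithmetic right shift on int):

#include <cassert>

int main() {
  assert((-1 >> 2) == -1);        // bare sar rounds toward minus infinity
  assert(((-1 + 3) >> 2) == 0);   // biased first: -1 / 4 == 0, as C requires
  return 0;
}

In the buggy listing the bias was added into %EDX while the un-biased %EAX was shifted, so every negative dividend that was not evenly divisible came out one too small.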
|
|
|
BuildMI(*BB, IP, SAROpcode[Class], 2, TmpReg)
|
|
|
|
.addReg(Op0Reg).addImm(Log-1);
|
2004-05-04 21:33:58 +02:00
|
|
|
unsigned TmpReg2 = makeAnotherReg(Op0->getType());
|
|
|
|
BuildMI(*BB, IP, SHROpcode[Class], 2, TmpReg2)
|
|
|
|
.addReg(TmpReg).addImm(32-Log);
|
|
|
|
unsigned TmpReg3 = makeAnotherReg(Op0->getType());
|
|
|
|
BuildMI(*BB, IP, ADDOpcode[Class], 2, TmpReg3)
|
|
|
|
.addReg(Op0Reg).addReg(TmpReg2);
|
|
|
|
|
|
|
|
unsigned TmpReg4 = isNeg ? makeAnotherReg(Op0->getType()) : ResultReg;
|
|
|
|
BuildMI(*BB, IP, SAROpcode[Class], 2, TmpReg4)
|
2004-10-06 06:19:43 +02:00
|
|
|
.addReg(TmpReg3).addImm(Log);
|
2004-05-04 21:33:58 +02:00
|
|
|
if (isNeg)
|
|
|
|
BuildMI(*BB, IP, NEGOpcode[Class], 1, ResultReg).addReg(TmpReg4);
|
|
|
|
return;
|
|
|
|
}
|
2004-10-06 07:01:07 +02:00
|
|
|
} else { // X % C
|
|
|
|
assert(Class != cLong && "This doesn't handle 64-bit remainder!");
|
|
|
|
int V = CI->getValue();
|
|
|
|
|
|
|
|
if (V == 2 || V == -2) { // X % 2, X % -2
|
|
|
|
static const unsigned SExtOpcode[] = { X86::CBW, X86::CWD, X86::CDQ };
|
|
|
|
static const unsigned BaseReg[] = { X86::AL , X86::AX , X86::EAX };
|
|
|
|
static const unsigned SExtReg[] = { X86::AH , X86::DX , X86::EDX };
|
|
|
|
static const unsigned ANDOpcode[] = {
|
|
|
|
X86::AND8ri, X86::AND16ri, X86::AND32ri
|
|
|
|
};
|
|
|
|
static const unsigned XOROpcode[] = {
|
|
|
|
X86::XOR8rr, X86::XOR16rr, X86::XOR32rr
|
|
|
|
};
|
|
|
|
static const unsigned SUBOpcode[] = {
|
|
|
|
X86::SUB8rr, X86::SUB16rr, X86::SUB32rr
|
|
|
|
};
|
|
|
|
|
|
|
|
// Sign extend result into reg of -1 or 0.
|
|
|
|
unsigned Op0Reg = getReg(Op0, BB, IP);
|
|
|
|
BuildMI(*BB, IP, MovOpcode[Class], 1, BaseReg[Class]).addReg(Op0Reg);
|
|
|
|
BuildMI(*BB, IP, SExtOpcode[Class], 0);
|
|
|
|
unsigned TmpReg0 = makeAnotherReg(Op0->getType());
|
|
|
|
BuildMI(*BB, IP, MovOpcode[Class], 1, TmpReg0).addReg(SExtReg[Class]);
|
|
|
|
|
|
|
|
unsigned TmpReg1 = makeAnotherReg(Op0->getType());
|
|
|
|
BuildMI(*BB, IP, ANDOpcode[Class], 2, TmpReg1).addReg(Op0Reg).addImm(1);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-10-06 07:01:07 +02:00
|
|
|
unsigned TmpReg2 = makeAnotherReg(Op0->getType());
|
|
|
|
BuildMI(*BB, IP, XOROpcode[Class], 2,
|
|
|
|
TmpReg2).addReg(TmpReg1).addReg(TmpReg0);
|
|
|
|
BuildMI(*BB, IP, SUBOpcode[Class], 2,
|
|
|
|
ResultReg).addReg(TmpReg2).addReg(TmpReg0);
|
|
|
|
return;
|
|
|
|
}
|
2004-05-04 21:33:58 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
static const unsigned Regs[] ={ X86::AL , X86::AX , X86::EAX };
|
2004-02-29 09:50:03 +01:00
|
|
|
static const unsigned ClrOpcode[]={ X86::MOV8ri, X86::MOV16ri, X86::MOV32ri };
|
2002-11-02 21:54:46 +01:00
|
|
|
static const unsigned ExtRegs[] ={ X86::AH , X86::DX , X86::EDX };
|
2005-01-05 17:30:14 +01:00
|
|
|
static const unsigned SExOpcode[]={ X86::CBW , X86::CWD , X86::CDQ };
|
2002-11-02 21:54:46 +01:00
|
|
|
|
|
|
|
static const unsigned DivOpcode[][4] = {
|
A big X86 instruction rename. The instructions are renamed to make
their names more decriptive. A name consists of the base name, a
default operand size followed by a character per operand with an
optional special size. For example:
ADD8rr -> add, 8-bit register, 8-bit register
IMUL16rmi -> imul, 16-bit register, 16-bit memory, 16-bit immediate
IMUL16rmi8 -> imul, 16-bit register, 16-bit memory, 8-bit immediate
MOVSX32rm16 -> movsx, 32-bit register, 16-bit memory
llvm-svn: 11995
2004-02-29 09:50:03 +01:00
|
|
|
{ X86::DIV8r , X86::DIV16r , X86::DIV32r , 0 }, // Unsigned division
|
|
|
|
{ X86::IDIV8r, X86::IDIV16r, X86::IDIV32r, 0 }, // Signed division
|
2002-11-02 21:54:46 +01:00
|
|
|
};
|
2002-11-02 21:28:58 +01:00
|
|
|
|
2002-11-02 21:54:46 +01:00
|
|
|
unsigned Reg = Regs[Class];
|
|
|
|
unsigned ExtReg = ExtRegs[Class];
|
|
|
|
|
|
|
|
// Put the first operand into one of the A registers...
|
2004-04-11 22:56:28 +02:00
|
|
|
unsigned Op0Reg = getReg(Op0, BB, IP);
|
|
|
|
unsigned Op1Reg = getReg(Op1, BB, IP);
|
2004-02-29 08:22:16 +01:00
|
|
|
BuildMI(*BB, IP, MovOpcode[Class], 1, Reg).addReg(Op0Reg);
|
2002-11-02 21:54:46 +01:00
|
|
|
|
2004-05-04 21:33:58 +02:00
|
|
|
if (Ty->isSigned()) {
|
2005-01-05 17:30:14 +01:00
|
|
|
// Emit a sign extension instruction.
|
|
|
|
BuildMI(*BB, IP, SExOpcode[Class], 0);
|
2004-05-04 21:33:58 +02:00
|
|
|
|
|
|
|
// Emit the appropriate divide or remainder instruction...
|
|
|
|
BuildMI(*BB, IP, DivOpcode[1][Class], 1).addReg(Op1Reg);
|
2002-11-02 21:54:46 +01:00
|
|
|
} else {
|
2004-01-12 08:22:45 +01:00
|
|
|
// If unsigned, emit a zeroing instruction... (reg = 0)
|
2004-02-29 08:22:16 +01:00
|
|
|
BuildMI(*BB, IP, ClrOpcode[Class], 2, ExtReg).addImm(0);
|
2002-11-02 21:54:46 +01:00
|
|
|
|
2004-05-04 21:33:58 +02:00
|
|
|
// Emit the appropriate divide or remainder instruction...
|
|
|
|
BuildMI(*BB, IP, DivOpcode[0][Class], 1).addReg(Op1Reg);
|
|
|
|
}
|
2002-11-17 22:56:38 +01:00
|
|
|
|
2002-11-02 21:54:46 +01:00
|
|
|
// Figure out which register we want to pick the result out of...
|
2003-10-23 19:21:43 +02:00
|
|
|
unsigned DestReg = isDiv ? Reg : ExtReg;
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2002-11-02 21:54:46 +01:00
|
|
|
// Put the result into the destination register...
|
2004-02-29 08:22:16 +01:00
|
|
|
BuildMI(*BB, IP, MovOpcode[Class], 1, ResultReg).addReg(DestReg);
|
2002-11-02 21:28:58 +01:00
|
|
|
}
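For the byte/short/int classes the emitted sequence always has the same shape: dividend into the A register, widen into the extension register (a sign-extension for signed, a zeroing mov for unsigned), one div/idiv, then read the quotient out of the A register or the remainder out of the extension register. A behavioral model of that widening divide, in plain C++ rather than machine code:

#include <cassert>
#include <cstdint>

void x86Divide32(int32_t a, int32_t b, bool isSigned,
                 int32_t &quot, int32_t &rem) {
  if (isSigned) {
    int64_t wide = a;                          // cdq: EDX:EAX = sext(EAX)
    quot = (int32_t)(wide / b);                // idiv: quotient in EAX
    rem  = (int32_t)(wide % b);                // idiv: remainder in EDX
  } else {
    uint64_t wide = (uint32_t)a;               // mov EDX, 0
    quot = (int32_t)(wide / (uint32_t)b);      // div: quotient in EAX
    rem  = (int32_t)(wide % (uint32_t)b);      // div: remainder in EDX
  }
}

int main() {
  int32_t q, r;
  x86Divide32(-7, 2, true, q, r);
  assert(q == -3 && r == -1);                  // truncated division semantics
  return 0;
}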
|
2002-11-02 21:04:26 +01:00
|
|
|
|
2002-11-17 22:56:38 +01:00
|
|
|
|
2002-11-01 00:03:59 +01:00
|
|
|
/// Shift instructions: 'shl', 'sar', 'shr' - Some special cases here
|
|
|
|
/// for constant immediate shift values, and for constant immediate
|
|
|
|
/// shift values equal to 1. Even the general case is sort of special,
|
|
|
|
/// because the shift amount has to be in CL, not just any old register.
|
|
|
|
///
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::visitShiftInst(ShiftInst &I) {
|
2003-11-22 07:49:41 +01:00
|
|
|
MachineBasicBlock::iterator IP = BB->end ();
|
|
|
|
emitShiftOperation (BB, IP, I.getOperand (0), I.getOperand (1),
|
|
|
|
I.getOpcode () == Instruction::Shl, I.getType (),
|
|
|
|
getReg (I));
|
|
|
|
}
|
|
|
|
|
shld is a very high latency operation. Instead of emitting it for shifts of
two or three, open code the equivalent operation which is faster on athlon
and P4 (by a substantial margin).
For example, instead of compiling this:
long long X2(long long Y) { return Y << 2; }
to:
X2:
movl 4(%esp), %eax
movl 8(%esp), %edx
shldl $2, %eax, %edx
shll $2, %eax
ret
Compile it to:
X2:
movl 4(%esp), %eax
movl 8(%esp), %ecx
movl %eax, %edx
shrl $30, %edx
leal (%edx,%ecx,4), %edx
shll $2, %eax
ret
Likewise, for << 3, compile to:
X3:
movl 4(%esp), %eax
movl 8(%esp), %ecx
movl %eax, %edx
shrl $29, %edx
leal (%edx,%ecx,8), %edx
shll $3, %eax
ret
This matches icc, except that icc open codes the shifts as adds on the P4.
llvm-svn: 17707
2004-11-13 21:48:57 +01:00
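The replacement works because the two pieces cannot overlap: the real shld computes hi' = (hi << Amt) | (lo >> (32-Amt)), and since the low Amt bits of hi << Amt are zero the OR may be an add, which is exactly what the shr-plus-lea pair produces (the lea scale supplies the << Amt). A sketch of the identity, assuming 1 <= amt <= 31:

#include <cassert>
#include <cstdint>

uint32_t shldOpenCoded(uint32_t hi, uint32_t lo, unsigned amt) {
  return (lo >> (32 - amt)) + (hi << amt);     // shr + lea (base + index * 2^amt)
}

int main() {
  uint32_t hi = 0x12345678, lo = 0x9ABCDEF0;
  for (unsigned amt = 2; amt <= 3; ++amt)      // the cases open-coded below
    assert(shldOpenCoded(hi, lo, amt) ==
           ((hi << amt) | (lo >> (32 - amt)))); // reference shld semantics
  return 0;
}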
|
|
|
/// Emit code for a 'SHLD DestReg, Op0, Op1, Amt' operation, where Amt is a
|
|
|
|
/// constant.
|
2005-04-22 01:38:14 +02:00
|
|
|
void X86ISel::doSHLDConst(MachineBasicBlock *MBB,
|
2004-11-13 21:48:57 +01:00
|
|
|
MachineBasicBlock::iterator IP,
|
|
|
|
unsigned DestReg, unsigned Op0Reg, unsigned Op1Reg,
|
|
|
|
unsigned Amt) {
|
|
|
|
// SHLD is a very inefficient operation on every processor; try to do
|
|
|
|
// something simpler for common values of 'Amt'.
|
|
|
|
if (Amt == 0) {
|
|
|
|
BuildMI(*MBB, IP, X86::MOV32rr, 1, DestReg).addReg(Op0Reg);
|
|
|
|
} else if (Amt == 1) {
|
|
|
|
unsigned Tmp = makeAnotherReg(Type::UIntTy);
|
|
|
|
BuildMI(*MBB, IP, X86::ADD32rr, 2, Tmp).addReg(Op1Reg).addReg(Op1Reg);
|
|
|
|
BuildMI(*MBB, IP, X86::ADC32rr, 2, DestReg).addReg(Op0Reg).addReg(Op0Reg);
|
|
|
|
} else if (Amt == 2 || Amt == 3) {
|
|
|
|
// On the P4 and Athlon it is cheaper to replace shld ..., 2|3 with a
|
|
|
|
// shift/lea pair. NOTE: This should not be done on the P6 family!
|
|
|
|
unsigned Tmp = makeAnotherReg(Type::UIntTy);
|
|
|
|
BuildMI(*MBB, IP, X86::SHR32ri, 2, Tmp).addReg(Op1Reg).addImm(32-Amt);
|
|
|
|
X86AddressMode AM;
|
|
|
|
AM.BaseType = X86AddressMode::RegBase;
|
|
|
|
AM.Base.Reg = Tmp;
|
|
|
|
AM.Scale = 1 << Amt;
|
|
|
|
AM.IndexReg = Op0Reg;
|
|
|
|
AM.Disp = 0;
|
|
|
|
addFullAddress(BuildMI(*MBB, IP, X86::LEA32r, 4, DestReg), AM);
|
|
|
|
} else {
|
|
|
|
// NOTE: It is always cheaper on the P4 to emit SHLD as two shifts and an OR
|
|
|
|
// than it is to emit a real SHLD.
|
|
|
|
|
2005-04-22 01:38:14 +02:00
|
|
|
BuildMI(*MBB, IP, X86::SHLD32rri8, 3,
|
2004-11-13 21:48:57 +01:00
|
|
|
DestReg).addReg(Op0Reg).addReg(Op1Reg).addImm(Amt);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2003-11-22 07:49:41 +01:00
|
|
|
/// emitShiftOperation - Common code shared between visitShiftInst and
|
|
|
|
/// constant expression support.
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::emitShiftOperation(MachineBasicBlock *MBB,
|
|
|
|
MachineBasicBlock::iterator IP,
|
2005-04-22 01:38:14 +02:00
|
|
|
Value *Op, Value *ShiftAmount,
|
|
|
|
bool isLeftShift, const Type *ResultTy,
|
2004-09-21 20:21:21 +02:00
|
|
|
unsigned DestReg) {
|
2003-11-22 07:49:41 +01:00
|
|
|
unsigned SrcReg = getReg (Op, MBB, IP);
|
|
|
|
bool isSigned = ResultTy->isSigned ();
|
|
|
|
unsigned Class = getClass (ResultTy);
|
2004-10-17 08:10:40 +02:00
|
|
|
|
2004-11-13 21:48:57 +01:00
|
|
|
static const unsigned ConstantOperand[][3] = {
|
|
|
|
{ X86::SHR8ri, X86::SHR16ri, X86::SHR32ri }, // SHR
|
|
|
|
{ X86::SAR8ri, X86::SAR16ri, X86::SAR32ri }, // SAR
|
|
|
|
{ X86::SHL8ri, X86::SHL16ri, X86::SHL32ri }, // SHL
|
|
|
|
{ X86::SHL8ri, X86::SHL16ri, X86::SHL32ri }, // SAL = SHL
|
2003-01-13 01:32:26 +01:00
|
|
|
};
|
2002-11-02 02:15:18 +01:00
|
|
|
|
2004-11-13 21:48:57 +01:00
|
|
|
static const unsigned NonConstantOperand[][3] = {
|
2004-02-29 09:50:03 +01:00
|
|
|
{ X86::SHR8rCL, X86::SHR16rCL, X86::SHR32rCL }, // SHR
|
|
|
|
{ X86::SAR8rCL, X86::SAR16rCL, X86::SAR32rCL }, // SAR
|
|
|
|
{ X86::SHL8rCL, X86::SHL16rCL, X86::SHL32rCL }, // SHL
|
|
|
|
{ X86::SHL8rCL, X86::SHL16rCL, X86::SHL32rCL }, // SAL = SHL
|
2003-01-13 01:32:26 +01:00
|
|
|
};
|
2002-11-02 02:15:18 +01:00
|
|
|
|
2004-11-13 21:48:57 +01:00
|
|
|
// Longs, as usual, are handled specially.
|
2003-01-13 01:32:26 +01:00
|
|
|
if (Class == cLong) {
|
2003-11-22 07:49:41 +01:00
|
|
|
if (ConstantUInt *CUI = dyn_cast<ConstantUInt>(ShiftAmount)) {
|
2003-01-13 01:32:26 +01:00
|
|
|
unsigned Amount = CUI->getValue();
|
2004-11-13 21:04:38 +01:00
|
|
|
if (Amount == 1 && isLeftShift) { // X << 1 == X+X
|
Compile:
long long X3_2(long long Y) { return Y+Y; }
int X(int Y) { return Y+Y; }
into:
X3_2:
movl 4(%esp), %eax
movl 8(%esp), %edx
addl %eax, %eax
adcl %edx, %edx
ret
X:
movl 4(%esp), %eax
addl %eax, %eax
ret
instead of:
X3_2:
movl 4(%esp), %eax
movl 8(%esp), %edx
shldl $1, %eax, %edx
shll $1, %eax
ret
X:
movl 4(%esp), %eax
shll $1, %eax
ret
llvm-svn: 17705
2004-11-13 21:03:48 +01:00
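The add/adc pair is a full 64-bit left-shift-by-one: the add doubles the low word and leaves the bit shifted out in CF, and adc folds that carry into the doubled high word. Modeling the carry flag explicitly, assuming uint32_t register halves:

#include <cassert>
#include <cstdint>

uint64_t shl1ViaAdd(uint32_t hi, uint32_t lo) {
  uint32_t newLo = lo + lo;            // addl %eax, %eax
  uint32_t carry = lo >> 31;           // the CF produced by that add
  uint32_t newHi = hi + hi + carry;    // adcl %edx, %edx
  return ((uint64_t)newHi << 32) | newLo;
}

int main() {
  uint64_t y = 0x180000001ULL;
  assert(shl1ViaAdd((uint32_t)(y >> 32), (uint32_t)y) == (y << 1));
  return 0;
}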
|
|
|
BuildMI(*MBB, IP, X86::ADD32rr, 2,
|
|
|
|
DestReg).addReg(SrcReg).addReg(SrcReg);
|
|
|
|
BuildMI(*MBB, IP, X86::ADC32rr, 2,
|
|
|
|
DestReg+1).addReg(SrcReg+1).addReg(SrcReg+1);
|
|
|
|
} else if (Amount < 32) {
|
2003-10-23 18:22:08 +02:00
|
|
|
const unsigned *Opc = ConstantOperand[isLeftShift*2+isSigned];
|
|
|
|
if (isLeftShift) {
|
2004-11-13 21:48:57 +01:00
|
|
|
doSHLDConst(MBB, IP, DestReg+1, SrcReg+1, SrcReg, Amount);
|
2004-02-29 08:22:16 +01:00
|
|
|
BuildMI(*MBB, IP, Opc[2], 2, DestReg).addReg(SrcReg).addImm(Amount);
|
2003-10-23 18:22:08 +02:00
|
|
|
} else {
|
2004-11-13 21:48:57 +01:00
|
|
|
BuildMI(*MBB, IP, X86::SHRD32rri8, 3,
|
|
|
|
DestReg).addReg(SrcReg ).addReg(SrcReg+1).addImm(Amount);
|
2004-02-29 08:22:16 +01:00
|
|
|
BuildMI(*MBB, IP, Opc[2],2,DestReg+1).addReg(SrcReg+1).addImm(Amount);
|
2003-10-23 18:22:08 +02:00
|
|
|
}
|
2004-11-16 00:16:34 +01:00
|
|
|
} else if (Amount == 32) {
|
2003-10-23 18:22:08 +02:00
|
|
|
if (isLeftShift) {
|
2004-11-16 00:16:34 +01:00
|
|
|
BuildMI(*MBB, IP, X86::MOV32rr, 1, DestReg+1).addReg(SrcReg);
|
2004-04-06 05:42:38 +02:00
|
|
|
BuildMI(*MBB, IP, X86::MOV32ri, 1, DestReg).addImm(0);
|
2003-10-23 18:22:08 +02:00
|
|
|
} else {
|
2004-11-16 19:40:52 +01:00
|
|
|
BuildMI(*MBB, IP, X86::MOV32rr, 1, DestReg).addReg(SrcReg+1);
|
2004-11-16 00:16:34 +01:00
|
|
|
if (!isSigned) {
|
|
|
|
BuildMI(*MBB, IP, X86::MOV32ri, 1, DestReg+1).addImm(0);
|
2004-04-06 05:42:38 +02:00
|
|
|
} else {
|
2004-11-16 00:16:34 +01:00
|
|
|
BuildMI(*MBB, IP, X86::SAR32ri, 2,
|
|
|
|
DestReg+1).addReg(SrcReg+1).addImm(31);
|
2004-04-06 05:42:38 +02:00
|
|
|
}
|
2004-11-16 00:16:34 +01:00
|
|
|
}
|
|
|
|
} else { // Shifting more than 32 bits
|
|
|
|
Amount -= 32;
|
|
|
|
if (isLeftShift) {
|
|
|
|
BuildMI(*MBB, IP, X86::SHL32ri, 2,
|
|
|
|
DestReg + 1).addReg(SrcReg).addImm(Amount);
|
|
|
|
BuildMI(*MBB, IP, X86::MOV32ri, 1, DestReg).addImm(0);
|
|
|
|
} else {
|
|
|
|
BuildMI(*MBB, IP, isSigned ? X86::SAR32ri : X86::SHR32ri, 2,
|
|
|
|
DestReg).addReg(SrcReg+1).addImm(Amount);
|
2005-04-06 22:59:35 +02:00
|
|
|
if (isSigned)
|
|
|
|
BuildMI(*MBB, IP, X86::SAR32ri, 2,
|
|
|
|
DestReg+1).addReg(SrcReg+1).addImm(31);
|
|
|
|
else
|
|
|
|
BuildMI(*MBB, IP, X86::MOV32ri, 1, DestReg+1).addImm(0);
|
2003-10-23 18:22:08 +02:00
|
|
|
}
|
2003-01-13 01:32:26 +01:00
|
|
|
}
|
|
|
|
} else {
|
2003-06-01 03:56:54 +02:00
|
|
|
unsigned TmpReg = makeAnotherReg(Type::IntTy);
|
|
|
|
if (!isLeftShift && isSigned) {
|
|
|
|
// If this is a SHR of a Long, then we need to do funny sign extension
|
|
|
|
// stuff. TmpReg gets the value to use as the high-part if we are
|
|
|
|
// shifting more than 32 bits.
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*MBB, IP, X86::SAR32ri, 2, TmpReg).addReg(SrcReg).addImm(31);
|
2003-06-01 03:56:54 +02:00
|
|
|
} else {
|
|
|
|
// Other shifts use a fixed zero value if the shift is more than 32
|
|
|
|
// bits.
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*MBB, IP, X86::MOV32ri, 1, TmpReg).addImm(0);
|
2003-06-01 03:56:54 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// Initialize CL with the shift amount...
|
2003-11-22 07:49:41 +01:00
|
|
|
unsigned ShiftAmountReg = getReg(ShiftAmount, MBB, IP);
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*MBB, IP, X86::MOV8rr, 1, X86::CL).addReg(ShiftAmountReg);
|
2003-06-01 03:56:54 +02:00
|
|
|
|
|
|
|
unsigned TmpReg2 = makeAnotherReg(Type::IntTy);
|
|
|
|
unsigned TmpReg3 = makeAnotherReg(Type::IntTy);
|
|
|
|
if (isLeftShift) {
|
|
|
|
// TmpReg2 = shld inHi, inLo
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*MBB, IP, X86::SHLD32rrCL,2,TmpReg2).addReg(SrcReg+1)
|
2004-02-29 08:22:16 +01:00
|
|
|
.addReg(SrcReg);
|
2003-06-01 03:56:54 +02:00
|
|
|
// TmpReg3 = shl inLo, CL
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*MBB, IP, X86::SHL32rCL, 1, TmpReg3).addReg(SrcReg);
|
2003-06-01 03:56:54 +02:00
|
|
|
|
|
|
|
// Set the flags to indicate whether the shift was by more than 32 bits.
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*MBB, IP, X86::TEST8ri, 2).addReg(X86::CL).addImm(32);
|
2003-06-01 03:56:54 +02:00
|
|
|
|
|
|
|
// DestHi = (>32) ? TmpReg3 : TmpReg2;
|
2005-04-22 01:38:14 +02:00
|
|
|
BuildMI(*MBB, IP, X86::CMOVNE32rr, 2,
|
2003-06-01 03:56:54 +02:00
|
|
|
DestReg+1).addReg(TmpReg2).addReg(TmpReg3);
|
|
|
|
// DestLo = (>32) ? TmpReg : TmpReg3;
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*MBB, IP, X86::CMOVNE32rr, 2,
|
2003-11-22 07:49:41 +01:00
|
|
|
DestReg).addReg(TmpReg3).addReg(TmpReg);
|
2003-06-01 03:56:54 +02:00
|
|
|
} else {
|
|
|
|
// TmpReg2 = shrd inLo, inHi
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*MBB, IP, X86::SHRD32rrCL,2,TmpReg2).addReg(SrcReg)
|
2004-02-29 08:22:16 +01:00
|
|
|
.addReg(SrcReg+1);
|
2003-06-01 03:56:54 +02:00
|
|
|
// TmpReg3 = s[ah]r inHi, CL
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*MBB, IP, isSigned ? X86::SAR32rCL : X86::SHR32rCL, 1, TmpReg3)
|
2003-06-01 03:56:54 +02:00
|
|
|
.addReg(SrcReg+1);
|
|
|
|
|
|
|
|
// Set the flags to indicate whether the shift was by more than 32 bits.
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*MBB, IP, X86::TEST8ri, 2).addReg(X86::CL).addImm(32);
|
2003-06-01 03:56:54 +02:00
|
|
|
|
|
|
|
// DestLo = (>32) ? TmpReg3 : TmpReg2;
|
2005-04-22 01:38:14 +02:00
|
|
|
BuildMI(*MBB, IP, X86::CMOVNE32rr, 2,
|
2003-06-01 03:56:54 +02:00
|
|
|
DestReg).addReg(TmpReg2).addReg(TmpReg3);
|
|
|
|
|
|
|
|
// DestHi = (>32) ? TmpReg : TmpReg3;
|
2005-04-22 01:38:14 +02:00
|
|
|
BuildMI(*MBB, IP, X86::CMOVNE32rr, 2,
|
2003-06-01 03:56:54 +02:00
|
|
|
DestReg+1).addReg(TmpReg3).addReg(TmpReg);
|
|
|
|
}
|
2002-11-01 00:03:59 +01:00
|
|
|
}
|
2003-01-13 01:32:26 +01:00
|
|
|
return;
|
|
|
|
}
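Collecting the constant-amount long-shift cases above: amounts below 32 combine both words, exactly 32 is a pure register move, and more than 32 leaves only one word alive (the right-shift cases mirror this with shrd plus a sign- or zero-fill of the high word). A left-shift sketch over a hi/lo register pair, assuming 0 <= amt < 64:

#include <cassert>
#include <cstdint>

void shl64Const(uint32_t &hi, uint32_t &lo, unsigned amt) {
  if (amt == 0) return;                     // plain copy
  if (amt < 32) {                           // shld (or its open-coded form) + shl
    hi = (hi << amt) | (lo >> (32 - amt));
    lo <<= amt;
  } else if (amt == 32) {                   // mov hi, lo ; mov lo, 0
    hi = lo;
    lo = 0;
  } else {                                  // amt > 32: only the low word survives
    hi = lo << (amt - 32);
    lo = 0;
  }
}

int main() {
  uint32_t hi = 0, lo = 0x80000001u;        // the 64-bit value 0x0000000080000001
  shl64Const(hi, lo, 33);
  assert(hi == 2 && lo == 0);               // == 0x80000001ull << 33
  return 0;
}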
|
2002-11-02 02:41:55 +01:00
|
|
|
|
2003-11-22 07:49:41 +01:00
|
|
|
if (ConstantUInt *CUI = dyn_cast<ConstantUInt>(ShiftAmount)) {
|
2003-01-13 01:32:26 +01:00
|
|
|
// The shift amount is constant, guaranteed to be a ubyte. Get its value.
|
|
|
|
assert(CUI->getType() == Type::UByteTy && "Shift amount not a ubyte?");
|
2002-11-02 02:15:18 +01:00
|
|
|
|
2004-11-13 21:03:48 +01:00
|
|
|
if (CUI->getValue() == 1 && isLeftShift) { // X << 1 -> X+X
|
|
|
|
static const int AddOpC[] = { X86::ADD8rr, X86::ADD16rr, X86::ADD32rr };
|
|
|
|
BuildMI(*MBB, IP, AddOpC[Class], 2,DestReg).addReg(SrcReg).addReg(SrcReg);
|
|
|
|
} else {
|
|
|
|
const unsigned *Opc = ConstantOperand[isLeftShift*2+isSigned];
|
|
|
|
BuildMI(*MBB, IP, Opc[Class], 2,
|
|
|
|
DestReg).addReg(SrcReg).addImm(CUI->getValue());
|
|
|
|
}
|
2003-01-13 01:32:26 +01:00
|
|
|
} else { // The shift amount is non-constant.
|
2003-11-22 07:49:41 +01:00
|
|
|
unsigned ShiftAmountReg = getReg (ShiftAmount, MBB, IP);
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*MBB, IP, X86::MOV8rr, 1, X86::CL).addReg(ShiftAmountReg);
|
2003-01-13 01:32:26 +01:00
|
|
|
|
|
|
|
const unsigned *Opc = NonConstantOperand[isLeftShift*2+isSigned];
|
2004-02-29 08:22:16 +01:00
|
|
|
BuildMI(*MBB, IP, Opc[Class], 1, DestReg).addReg(SrcReg);
|
2003-01-13 01:32:26 +01:00
|
|
|
}
|
|
|
|
}
|
2002-11-02 02:15:18 +01:00
|
|
|
|
|
|
|
|
2004-03-08 02:18:36 +01:00
|
|
|
/// visitLoadInst - Implement LLVM load instructions in terms of the x86 'mov'
|
|
|
|
/// instruction. The load and store instructions are the only place where we
|
|
|
|
/// need to worry about the memory layout of the target machine.
|
|
|
|
///
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::visitLoadInst(LoadInst &I) {
|
Implement folding explicit load instructions into binary operations. For a
testcase like this:
int %test(int* %P, int %A) {
%Pv = load int* %P
%B = add int %A, %Pv
ret int %B
}
We now generate:
test:
mov %ECX, DWORD PTR [%ESP + 4]
mov %EAX, DWORD PTR [%ESP + 8]
add %EAX, DWORD PTR [%ECX]
ret
Instead of:
test:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, DWORD PTR [%ESP + 8]
mov %EAX, DWORD PTR [%EAX]
add %EAX, %ECX
ret
... saving one instruction, and often a register. Note that there are a lot
of other instructions that could use this, but they aren't handled. I'm not
really interested in adding them, but mul/div and all of the FP instructions
could be supported as well if someone wanted to add them.
llvm-svn: 12204
2004-03-08 02:58:35 +01:00
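Whether the fold is legal is a separate question from whether it is profitable. A minimal sketch of the legality test with stand-in types, not the real isSafeToFoldLoadIntoInstruction (the actual routine walks the instructions between the load and its user):

#include <cassert>
#include <vector>

struct Instr { bool mayWriteMemory; };   // stand-in for an LLVM instruction

// The load may become part of its single user's memory operand only if
// nothing between the two can change the loaded value, because folding
// effectively moves the memory access down to the user.
bool safeToFold(bool loadHasOneUse, bool sameBasicBlock,
                const std::vector<Instr> &between) {
  if (!loadHasOneUse || !sameBasicBlock)
    return false;
  for (const Instr &i : between)
    if (i.mayWriteMemory)
      return false;                      // an intervening store could clobber it
  return true;
}

int main() {
  assert(safeToFold(true, true, {{false}, {false}}));
  assert(!safeToFold(true, true, {{true}}));
  return 0;
}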
|
|
|
// Check to see if this load instruction is going to be folded into a binary
|
|
|
|
// instruction, like add. If so, we don't want to emit it. Wouldn't a real
|
|
|
|
// pattern matching instruction selector be nice?
|
2004-04-12 00:05:45 +02:00
|
|
|
unsigned Class = getClassB(I.getType());
|
2004-04-12 01:21:26 +02:00
|
|
|
if (I.hasOneUse()) {
|
2004-03-08 02:58:35 +01:00
|
|
|
Instruction *User = cast<Instruction>(I.use_back());
|
|
|
|
switch (User->getOpcode()) {
|
2004-04-12 01:21:26 +02:00
|
|
|
case Instruction::Cast:
|
|
|
|
// If this is a cast from a signed-integer type to a floating point type,
|
|
|
|
// fold the cast here.
|
2004-06-09 17:18:51 +02:00
|
|
|
if (getClassB(User->getType()) == cFP &&
|
2004-04-12 01:21:26 +02:00
|
|
|
(I.getType() == Type::ShortTy || I.getType() == Type::IntTy ||
|
|
|
|
I.getType() == Type::LongTy)) {
|
|
|
|
unsigned DestReg = getReg(User);
|
|
|
|
static const unsigned Opcode[] = {
|
|
|
|
0/*BYTE*/, X86::FILD16m, X86::FILD32m, 0/*FP*/, X86::FILD64m
|
|
|
|
};
|
2004-05-13 17:12:43 +02:00
|
|
|
|
|
|
|
if (AllocaInst *AI = dyn_castFixedAlloca(I.getOperand(0))) {
|
|
|
|
unsigned FI = getFixedSizedAllocaFI(AI);
|
|
|
|
addFrameReference(BuildMI(BB, Opcode[Class], 4, DestReg), FI);
|
|
|
|
} else {
|
2004-08-30 02:13:26 +02:00
|
|
|
X86AddressMode AM;
|
|
|
|
getAddressingMode(I.getOperand(0), AM);
|
|
|
|
addFullAddress(BuildMI(BB, Opcode[Class], 4, DestReg), AM);
|
2004-05-13 17:12:43 +02:00
|
|
|
}
|
2004-04-12 01:21:26 +02:00
|
|
|
return;
|
|
|
|
} else {
|
|
|
|
User = 0;
|
|
|
|
}
|
|
|
|
break;
|
2004-04-12 02:12:04 +02:00
|
|
|
|
2004-03-08 02:58:35 +01:00
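A minimal sketch of the eligibility test that the annotated lines below implement. The helper name canFoldLoadIntoUser is invented for illustration (the real code clears a User pointer in place), but the opcode cases and class restrictions are copied from the code itself, and the sketch relies on this file's own headers:

    // Condensed from the switch below: a single-use load may be folded
    // into Add/Sub/And/Or/Xor unless the operands are 64-bit integers,
    // and into Mul/Div only for floating point.
    static bool canFoldLoadIntoUser(const Instruction *User, TypeClass Class) {
      switch (User->getOpcode()) {
      case Instruction::Add: case Instruction::Sub:
      case Instruction::And: case Instruction::Or:
      case Instruction::Xor:
        return Class != cLong;
      case Instruction::Mul: case Instruction::Div:
        return Class == cFP;    // folding only implemented for FP
      default:
        return false;
      }
    }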
|
|
|
case Instruction::Add:
|
|
|
|
case Instruction::Sub:
|
|
|
|
case Instruction::And:
|
|
|
|
case Instruction::Or:
|
|
|
|
case Instruction::Xor:
|
2004-04-12 01:21:26 +02:00
|
|
|
if (Class == cLong) User = 0;
|
2004-03-08 02:58:35 +01:00
|
|
|
break;
|
2004-04-12 00:05:45 +02:00
|
|
|
case Instruction::Mul:
|
|
|
|
case Instruction::Div:
|
2004-04-12 02:12:04 +02:00
|
|
|
if (Class != cFP) User = 0;
|
2004-04-12 01:21:26 +02:00
|
|
|
break; // Folding only implemented for floating point.
|
2004-04-12 00:05:45 +02:00
|
|
|
default: User = 0; break;
|
2004-03-08 02:58:35 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
if (User) {
|
|
|
|
// Okay, we found a user. If the load is the first operand and there is
|
|
|
|
// no second operand load, reverse the operand ordering. Note that this
|
|
|
|
// can fail for a subtract (i.e., no change will be made).
|
2004-07-21 23:28:26 +02:00
|
|
|
bool Swapped = false;
|
2004-03-08 02:58:35 +01:00
|
|
|
if (!isa<LoadInst>(User->getOperand(1)))
|
2004-07-21 23:28:26 +02:00
|
|
|
Swapped = !cast<BinaryOperator>(User)->swapOperands();
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-03-08 02:58:35 +01:00
|
|
|
// Okay, now that everything is set up, if this load is used by the second
|
|
|
|
// operand, and if there are no instructions that invalidate the load
|
|
|
|
// before the binary operator, eliminate the load.
|
|
|
|
if (User->getOperand(1) == &I &&
|
|
|
|
isSafeToFoldLoadIntoInstruction(I, *User))
|
|
|
|
return; // Eliminate the load!
|
2004-04-12 00:05:45 +02:00
|
|
|
|
|
|
|
// If this is a floating point sub or div, we won't be able to swap the
|
|
|
|
// operands, but we will still be able to eliminate the load.
|
|
|
|
if (Class == cFP && User->getOperand(0) == &I &&
|
|
|
|
!isa<LoadInst>(User->getOperand(1)) &&
|
|
|
|
(User->getOpcode() == Instruction::Sub ||
|
|
|
|
User->getOpcode() == Instruction::Div) &&
|
|
|
|
isSafeToFoldLoadIntoInstruction(I, *User))
|
|
|
|
return; // Eliminate the load!
|
2004-07-21 23:28:26 +02:00
|
|
|
|
|
|
|
// If we swapped the operands to the instruction, but couldn't fold the
|
2005-04-22 01:38:14 +02:00
|
|
|
// load anyway, swap them back. We don't want to break add X, int
|
2004-07-21 23:28:26 +02:00
|
|
|
// folding.
|
|
|
|
if (Swapped) cast<BinaryOperator>(User)->swapOperands();
|
2004-03-08 02:58:35 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2003-10-20 06:48:06 +02:00
|
|
|
static const unsigned Opcodes[] = {
|
2004-05-13 17:12:43 +02:00
|
|
|
X86::MOV8rm, X86::MOV16rm, X86::MOV32rm, X86::FLD32m, X86::MOV32rm
|
2003-01-13 01:32:26 +01:00
|
|
|
};
|
2003-10-20 06:48:06 +02:00
|
|
|
unsigned Opcode = Opcodes[Class];
|
A big X86 instruction rename. The instructions are renamed to make
their names more descriptive. A name consists of the base name, a
default operand size followed by a character per operand with an
optional special size. For example:
ADD8rr -> add, 8-bit register, 8-bit register
IMUL16rmi -> imul, 16-bit register, 16-bit memory, 16-bit immediate
IMUL16rmi8 -> imul, 16-bit register, 16-bit memory, 8-bit immediate
MOVSX32rm16 -> movsx, 32-bit register, 16-bit memory
llvm-svn: 11995
2004-02-29 09:50:03 +01:00
|
|
|
if (I.getType() == Type::DoubleTy) Opcode = X86::FLD64m;
|
2004-05-13 17:12:43 +02:00
|
|
|
|
|
|
|
unsigned DestReg = getReg(I);
|
|
|
|
|
|
|
|
if (AllocaInst *AI = dyn_castFixedAlloca(I.getOperand(0))) {
|
|
|
|
unsigned FI = getFixedSizedAllocaFI(AI);
|
|
|
|
if (Class == cLong) {
|
|
|
|
addFrameReference(BuildMI(BB, X86::MOV32rm, 4, DestReg), FI);
|
|
|
|
addFrameReference(BuildMI(BB, X86::MOV32rm, 4, DestReg+1), FI, 4);
|
|
|
|
} else {
|
|
|
|
addFrameReference(BuildMI(BB, Opcode, 4, DestReg), FI);
|
|
|
|
}
|
|
|
|
} else {
|
2004-08-30 02:13:26 +02:00
|
|
|
X86AddressMode AM;
|
|
|
|
getAddressingMode(I.getOperand(0), AM);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-05-13 17:12:43 +02:00
|
|
|
if (Class == cLong) {
|
2004-08-30 02:13:26 +02:00
|
|
|
addFullAddress(BuildMI(BB, X86::MOV32rm, 4, DestReg), AM);
|
|
|
|
AM.Disp += 4;
|
|
|
|
addFullAddress(BuildMI(BB, X86::MOV32rm, 4, DestReg+1), AM);
|
2004-05-13 17:12:43 +02:00
|
|
|
} else {
|
2004-08-30 02:13:26 +02:00
|
|
|
addFullAddress(BuildMI(BB, Opcode, 4, DestReg), AM);
|
2004-05-13 17:12:43 +02:00
|
|
|
}
|
|
|
|
}
|
2003-01-13 01:32:26 +01:00
|
|
|
}
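For orientation, the two addressing paths used above, gathered into one illustrative wrapper. dyn_castFixedAlloca, getFixedSizedAllocaFI, getAddressingMode, addFrameReference and addFullAddress are this file's own helpers; the wrapper function itself is made up and ignores the cLong split shown above:

    // Address a pointer either by frame index (a fixed-size alloca, so
    // the LEA folds away and the slot is resolved at frame layout time)
    // or by a full X86 addressing mode (base + scale*index + disp,
    // possibly absorbed from a getelementptr).
    void emitLoadFrom(Value *Ptr, unsigned Opcode, unsigned DestReg) {
      if (AllocaInst *AI = dyn_castFixedAlloca(Ptr)) {
        addFrameReference(BuildMI(BB, Opcode, 4, DestReg),
                          getFixedSizedAllocaFI(AI));
      } else {
        X86AddressMode AM;
        getAddressingMode(Ptr, AM);
        addFullAddress(BuildMI(BB, Opcode, 4, DestReg), AM);
      }
    }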
|
|
|
|
|
2002-11-17 22:11:55 +01:00
|
|
|
/// visitStoreInst - Implement LLVM store instructions in terms of the x86 'mov'
|
|
|
|
/// instruction.
|
|
|
|
///
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::visitStoreInst(StoreInst &I) {
|
2004-08-30 02:13:26 +02:00
|
|
|
X86AddressMode AM;
|
|
|
|
getAddressingMode(I.getOperand(1), AM);
|
* Make the previous patch more efficient by not allocating a temporary MachineInstr
to do analysis.
*** FOLD getelementptr instructions into loads and stores when possible,
making use of some of the crazy X86 addressing modes.
For example, the following C++ program fragment:
struct complex {
double re, im;
complex(double r, double i) : re(r), im(i) {}
};
inline complex operator+(const complex& a, const complex& b) {
return complex(a.re+b.re, a.im+b.im);
}
complex addone(const complex& arg) {
return arg + complex(1,0);
}
Used to be compiled to:
_Z6addoneRK7complex:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, DWORD PTR [%ESP + 8]
*** mov %EDX, %ECX
fld QWORD PTR [%EDX]
fld1
faddp %ST(1)
*** add %ECX, 8
fld QWORD PTR [%ECX]
fldz
faddp %ST(1)
*** mov %ECX, %EAX
fxch %ST(1)
fstp QWORD PTR [%ECX]
*** add %EAX, 8
fstp QWORD PTR [%EAX]
ret
Now it is compiled to:
_Z6addoneRK7complex:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, DWORD PTR [%ESP + 8]
fld QWORD PTR [%ECX]
fld1
faddp %ST(1)
fld QWORD PTR [%ECX + 8]
fldz
faddp %ST(1)
fxch %ST(1)
fstp QWORD PTR [%EAX]
fstp QWORD PTR [%EAX + 8]
ret
Other programs should see similar improvements, across the board. Note that
in addition to reducing instruction count, this also reduces register pressure
a lot, always a good thing on X86. :)
llvm-svn: 11819
2004-02-25 07:13:04 +01:00
|
|
|
|
2003-10-20 06:11:23 +02:00
|
|
|
const Type *ValTy = I.getOperand(0)->getType();
|
|
|
|
unsigned Class = getClassB(ValTy);
|
2003-10-20 06:48:06 +02:00
|
|
|
|
2004-02-25 03:56:58 +01:00
|
|
|
if (ConstantInt *CI = dyn_cast<ConstantInt>(I.getOperand(0))) {
|
|
|
|
uint64_t Val = CI->getRawValue();
|
|
|
|
if (Class == cLong) {
|
2004-08-30 02:13:26 +02:00
|
|
|
addFullAddress(BuildMI(BB, X86::MOV32mi, 5), AM).addImm(Val & ~0U);
|
|
|
|
AM.Disp += 4;
|
|
|
|
addFullAddress(BuildMI(BB, X86::MOV32mi, 5), AM).addImm(Val>>32);
|
2004-02-25 03:56:58 +01:00
|
|
|
} else {
|
|
|
|
static const unsigned Opcodes[] = {
|
2004-02-29 09:50:03 +01:00
|
|
|
X86::MOV8mi, X86::MOV16mi, X86::MOV32mi
|
2004-02-25 03:56:58 +01:00
|
|
|
};
|
|
|
|
unsigned Opcode = Opcodes[Class];
|
2004-08-30 02:13:26 +02:00
|
|
|
addFullAddress(BuildMI(BB, Opcode, 5), AM).addImm(Val);
|
2004-02-25 03:56:58 +01:00
|
|
|
}
|
Two more improvements for null pointer handling: storing a null pointer
and passing a null pointer into a function.
For this testcase:
void %test(int** %X) {
store int* null, int** %X
call void %test(int** null)
ret void
}
we now generate this:
test:
sub %ESP, 12
mov %EAX, DWORD PTR [%ESP + 16]
mov DWORD PTR [%EAX], 0
mov DWORD PTR [%ESP], 0
call test
add %ESP, 12
ret
instead of this:
test:
sub %ESP, 12
mov %EAX, DWORD PTR [%ESP + 16]
mov %ECX, 0
mov DWORD PTR [%EAX], %ECX
mov %EAX, 0
mov DWORD PTR [%ESP], %EAX
call test
add %ESP, 12
ret
llvm-svn: 13558
2004-05-13 17:26:48 +02:00
|
|
|
} else if (isa<ConstantPointerNull>(I.getOperand(0))) {
|
2004-10-15 07:05:29 +02:00
|
|
|
addFullAddress(BuildMI(BB, X86::MOV32mi, 5), AM).addImm(0);
|
2005-04-21 21:11:03 +02:00
|
|
|
} else if (GlobalValue *GV = dyn_cast<GlobalValue>(I.getOperand(0))) {
|
|
|
|
addFullAddress(BuildMI(BB, X86::MOV32mi, 5), AM).addGlobalAddress(GV);
|
2004-02-25 03:56:58 +01:00
|
|
|
} else if (ConstantBool *CB = dyn_cast<ConstantBool>(I.getOperand(0))) {
|
2004-08-30 02:13:26 +02:00
|
|
|
addFullAddress(BuildMI(BB, X86::MOV8mi, 5), AM).addImm(CB->getValue());
|
2004-05-07 23:18:15 +02:00
|
|
|
} else if (ConstantFP *CFP = dyn_cast<ConstantFP>(I.getOperand(0))) {
|
|
|
|
// Store constant FP values with integer instructions to avoid having to
|
|
|
|
// load the constants from the constant pool then do a store.
|
|
|
|
if (CFP->getType() == Type::FloatTy) {
|
|
|
|
union {
|
|
|
|
unsigned I;
|
|
|
|
float F;
|
|
|
|
} V;
|
|
|
|
V.F = CFP->getValue();
|
2004-08-30 02:13:26 +02:00
|
|
|
addFullAddress(BuildMI(BB, X86::MOV32mi, 5), AM).addImm(V.I);
|
2004-02-25 03:56:58 +01:00
|
|
|
} else {
|
2004-05-07 23:18:15 +02:00
|
|
|
union {
|
|
|
|
uint64_t I;
|
|
|
|
double F;
|
|
|
|
} V;
|
|
|
|
V.F = CFP->getValue();
|
2004-08-30 02:13:26 +02:00
|
|
|
addFullAddress(BuildMI(BB, X86::MOV32mi, 5), AM).addImm((unsigned)V.I);
|
|
|
|
AM.Disp += 4;
|
|
|
|
addFullAddress(BuildMI(BB, X86::MOV32mi, 5), AM).addImm(
|
2004-05-07 23:18:15 +02:00
|
|
|
unsigned(V.I >> 32));
|
2004-02-25 03:56:58 +01:00
|
|
|
}
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-05-07 23:18:15 +02:00
|
|
|
} else if (Class == cLong) {
|
|
|
|
unsigned ValReg = getReg(I.getOperand(0));
|
2004-08-30 02:13:26 +02:00
|
|
|
addFullAddress(BuildMI(BB, X86::MOV32mr, 5), AM).addReg(ValReg);
|
|
|
|
AM.Disp += 4;
|
|
|
|
addFullAddress(BuildMI(BB, X86::MOV32mr, 5), AM).addReg(ValReg+1);
|
2004-05-07 23:18:15 +02:00
|
|
|
} else {
|
2004-10-15 07:05:29 +02:00
|
|
|
// FIXME: stop emitting these two instructions:
|
|
|
|
// movl $global,%eax
|
|
|
|
// movl %eax,(%ebx)
|
|
|
|
// when one instruction will suffice. That includes when the global
|
|
|
|
// has an offset applied to it.
|
2004-05-07 23:18:15 +02:00
|
|
|
unsigned ValReg = getReg(I.getOperand(0));
|
|
|
|
static const unsigned Opcodes[] = {
|
|
|
|
X86::MOV8mr, X86::MOV16mr, X86::MOV32mr, X86::FST32m
|
|
|
|
};
|
|
|
|
unsigned Opcode = Opcodes[Class];
|
|
|
|
if (ValTy == Type::DoubleTy) Opcode = X86::FST64m;
|
2004-05-13 17:12:43 +02:00
|
|
|
|
2004-08-30 02:13:26 +02:00
|
|
|
addFullAddress(BuildMI(BB, Opcode, 1+4), AM).addReg(ValReg);
|
2002-12-25 06:13:53 +01:00
|
|
|
}
|
2002-11-17 22:11:55 +01:00
|
|
|
}
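The union trick above just reinterprets the constant's bits as integers so the store needs no constant-pool load. A self-contained sketch of the same bit split (standard C++, not from the source; std::memcpy replaces the union with identical effect):

    #include <cstdint>
    #include <cstring>

    // Split a double into the two 32-bit immediates emitted above:
    // the low word goes at AM.Disp, the high word at AM.Disp + 4.
    static void splitDoubleBits(double F, uint32_t &Lo, uint32_t &Hi) {
      uint64_t I;
      std::memcpy(&I, &F, sizeof I);
      Lo = static_cast<uint32_t>(I);
      Hi = static_cast<uint32_t>(I >> 32);
    }

A float needs only a single 32-bit word, which is why the FloatTy branch emits one MOV32mi while the DoubleTy branch emits two.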
|
|
|
|
|
|
|
|
|
2004-03-02 00:53:11 +01:00
|
|
|
/// visitCastInst - Here we have various kinds of copying with or without sign
|
|
|
|
/// extension going on.
|
|
|
|
///
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::visitCastInst(CastInst &CI) {
|
2003-06-21 18:01:24 +02:00
|
|
|
Value *Op = CI.getOperand(0);
|
2004-04-11 21:21:59 +02:00
|
|
|
|
2004-04-12 02:23:04 +02:00
|
|
|
unsigned SrcClass = getClassB(Op->getType());
|
|
|
|
unsigned DestClass = getClassB(CI.getType());
|
|
|
|
// Noop casts are not emitted: getReg will return the source operand as the
|
|
|
|
// register to use for any uses of the noop cast.
|
2004-06-29 02:14:38 +02:00
|
|
|
if (DestClass == SrcClass) {
|
2005-04-22 01:38:14 +02:00
|
|
|
// The only detail in this plan is that casts from double -> float are
|
2004-06-29 02:14:38 +02:00
|
|
|
// truncating operations that we have to codegen through memory (despite
|
|
|
|
// the fact that the source/dest registers are the same class).
|
|
|
|
if (CI.getType() != Type::FloatTy || Op->getType() != Type::DoubleTy)
|
|
|
|
return;
|
|
|
|
}
|
2004-04-11 21:21:59 +02:00
|
|
|
|
2003-06-21 18:01:24 +02:00
|
|
|
// If this is a cast from a 32-bit integer to a Long type, and the only uses
|
|
|
|
// of the cast are GEP instructions, then the cast does not need to be
|
|
|
|
// generated explicitly, it will be folded into the GEP.
|
2004-04-12 02:23:04 +02:00
|
|
|
if (DestClass == cLong && SrcClass == cInt) {
|
2003-06-21 18:01:24 +02:00
|
|
|
bool AllUsesAreGEPs = true;
|
|
|
|
for (Value::use_iterator I = CI.use_begin(), E = CI.use_end(); I != E; ++I)
|
|
|
|
if (!isa<GetElementPtrInst>(*I)) {
|
|
|
|
AllUsesAreGEPs = false;
|
|
|
|
break;
|
2005-04-22 01:38:14 +02:00
|
|
|
}
|
2003-06-21 18:01:24 +02:00
|
|
|
|
|
|
|
// No need to codegen this cast if all users are getelementptr instrs...
|
|
|
|
if (AllUsesAreGEPs) return;
|
|
|
|
}
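Restating the loop above as a standalone predicate (the helper name is made up; it relies on this file's LLVM headers):

    // True if every user of V is a getelementptr instruction, in which
    // case an int -> long cast needs no code: the GEPs fold the 32-bit
    // value into their own address arithmetic.
    static bool allUsesAreGEPs(Value &V) {
      for (Value::use_iterator I = V.use_begin(), E = V.use_end(); I != E; ++I)
        if (!isa<GetElementPtrInst>(*I))
          return false;
      return true;
    }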
|
|
|
|
|
2004-04-12 02:23:04 +02:00
|
|
|
// If this cast converts a load from a short, int, or long integer to an FP
|
|
|
|
// value, we will have folded this cast away.
|
|
|
|
if (DestClass == cFP && isa<LoadInst>(Op) && Op->hasOneUse() &&
|
|
|
|
(Op->getType() == Type::ShortTy || Op->getType() == Type::IntTy ||
|
|
|
|
Op->getType() == Type::LongTy))
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
2003-04-23 19:22:12 +02:00
|
|
|
unsigned DestReg = getReg(CI);
|
|
|
|
MachineBasicBlock::iterator MI = BB->end();
|
2003-06-21 18:01:24 +02:00
|
|
|
emitCastOperation(BB, MI, Op, CI.getType(), DestReg);
|
2003-04-23 19:22:12 +02:00
|
|
|
}
|
|
|
|
|
2004-03-02 00:53:11 +01:00
|
|
|
/// emitCastOperation - Common code shared between visitCastInst and constant
|
|
|
|
/// expression cast support.
|
|
|
|
///
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::emitCastOperation(MachineBasicBlock *BB,
|
|
|
|
MachineBasicBlock::iterator IP,
|
|
|
|
Value *Src, const Type *DestTy,
|
|
|
|
unsigned DestReg) {
|
2003-01-13 01:32:26 +01:00
|
|
|
const Type *SrcTy = Src->getType();
|
|
|
|
unsigned SrcClass = getClassB(SrcTy);
|
|
|
|
unsigned DestClass = getClassB(DestTy);
|
2004-04-12 01:21:26 +02:00
|
|
|
unsigned SrcReg = getReg(Src, BB, IP);
|
|
|
|
|
2003-01-13 01:32:26 +01:00
|
|
|
// Implement casts to bool by using compare on the operand followed by set if
|
|
|
|
// not zero on the result.
|
|
|
|
if (DestTy == Type::BoolTy) {
|
2003-06-01 05:38:24 +02:00
|
|
|
switch (SrcClass) {
|
|
|
|
case cByte:
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*BB, IP, X86::TEST8rr, 2).addReg(SrcReg).addReg(SrcReg);
|
2003-06-01 05:38:24 +02:00
|
|
|
break;
|
|
|
|
case cShort:
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*BB, IP, X86::TEST16rr, 2).addReg(SrcReg).addReg(SrcReg);
|
2003-06-01 05:38:24 +02:00
|
|
|
break;
|
|
|
|
case cInt:
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*BB, IP, X86::TEST32rr, 2).addReg(SrcReg).addReg(SrcReg);
|
2003-06-01 05:38:24 +02:00
|
|
|
break;
|
|
|
|
case cLong: {
|
|
|
|
unsigned TmpReg = makeAnotherReg(Type::IntTy);
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*BB, IP, X86::OR32rr, 2, TmpReg).addReg(SrcReg).addReg(SrcReg+1);
|
2003-06-01 05:38:24 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
case cFP:
|
2004-02-29 08:22:16 +01:00
|
|
|
BuildMI(*BB, IP, X86::FTST, 1).addReg(SrcReg);
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*BB, IP, X86::FNSTSW8r, 0);
|
2004-02-29 08:22:16 +01:00
|
|
|
BuildMI(*BB, IP, X86::SAHF, 1);
|
2004-02-23 04:21:41 +01:00
|
|
|
break;
|
2003-06-01 05:38:24 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// If the zero flag is not set, then the value is true, set the byte to
|
|
|
|
// true.
|
2004-02-29 08:22:16 +01:00
|
|
|
BuildMI(*BB, IP, X86::SETNEr, 1, DestReg);
|
2003-01-13 01:32:26 +01:00
|
|
|
return;
|
|
|
|
}
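In plain C++ terms, the bool cast above computes (value != 0); a hedged sketch, with the long case OR-ing its two 32-bit halves exactly as the OR32rr does:

    // TESTrr (or OR32rr for longs, FTST/FNSTSW/SAHF for FP) sets the zero
    // flag; SETNE then materializes the comparison result as a byte.
    static bool castToBool(unsigned Lo, unsigned Hi, bool isLong) {
      return (isLong ? (Lo | Hi) : Lo) != 0;
    }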
|
2002-12-13 12:31:59 +01:00
|
|
|
|
2003-01-13 01:32:26 +01:00
|
|
|
static const unsigned RegRegMove[] = {
|
2004-02-29 09:50:03 +01:00
|
|
|
X86::MOV8rr, X86::MOV16rr, X86::MOV32rr, X86::FpMOV, X86::MOV32rr
|
2002-12-06 11:49:33 +01:00
|
|
|
};
|
2002-12-25 06:13:53 +01:00
|
|
|
|
2003-01-13 01:32:26 +01:00
|
|
|
// Implement casts between values of the same type class (as determined by
|
|
|
|
// getClass) by using a register-to-register move.
|
|
|
|
if (SrcClass == DestClass) {
|
|
|
|
if (SrcClass <= cInt || (SrcClass == cFP && SrcTy == DestTy)) {
|
2004-02-29 08:22:16 +01:00
|
|
|
BuildMI(*BB, IP, RegRegMove[SrcClass], 1, DestReg).addReg(SrcReg);
|
2003-01-13 01:32:26 +01:00
|
|
|
} else if (SrcClass == cFP) {
|
|
|
|
if (SrcTy == Type::FloatTy) { // float -> double
|
2003-10-23 18:22:08 +02:00
|
|
|
assert(DestTy == Type::DoubleTy && "Unknown cFP member!");
|
2004-02-29 08:22:16 +01:00
|
|
|
BuildMI(*BB, IP, X86::FpMOV, 1, DestReg).addReg(SrcReg);
|
2003-01-13 01:32:26 +01:00
|
|
|
} else { // double -> float
|
2003-10-23 18:22:08 +02:00
|
|
|
assert(SrcTy == Type::DoubleTy && DestTy == Type::FloatTy &&
|
|
|
|
"Unknown cFP member!");
|
|
|
|
// Truncate from double to float by storing to memory as a float, then
|
|
|
|
// reading it back.
|
|
|
|
unsigned FltAlign = TM.getTargetData().getFloatAlignment();
|
2003-01-13 01:32:26 +01:00
|
|
|
int FrameIdx = F->getFrameInfo()->CreateStackObject(4, FltAlign);
|
2005-01-05 17:30:14 +01:00
|
|
|
addFrameReference(BuildMI(*BB, IP, X86::FST32m, 5),
|
|
|
|
FrameIdx).addReg(SrcReg);
|
2004-02-29 09:50:03 +01:00
|
|
|
addFrameReference(BuildMI(*BB, IP, X86::FLD32m, 5, DestReg), FrameIdx);
|
2003-01-13 01:32:26 +01:00
|
|
|
}
|
|
|
|
} else if (SrcClass == cLong) {
|
A big X86 instruction rename. The instructions are renamed to make
their names more decriptive. A name consists of the base name, a
default operand size followed by a character per operand with an
optional special size. For example:
ADD8rr -> add, 8-bit register, 8-bit register
IMUL16rmi -> imul, 16-bit register, 16-bit memory, 16-bit immediate
IMUL16rmi8 -> imul, 16-bit register, 16-bit memory, 8-bit immediate
MOVSX32rm16 -> movsx, 32-bit register, 16-bit memory
llvm-svn: 11995
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*BB, IP, X86::MOV32rr, 1, DestReg).addReg(SrcReg);
|
|
|
|
BuildMI(*BB, IP, X86::MOV32rr, 1, DestReg+1).addReg(SrcReg+1);
|
2003-01-13 01:32:26 +01:00
|
|
|
} else {
|
2003-05-12 22:16:58 +02:00
|
|
|
assert(0 && "Cannot handle this type of cast instruction!");
|
2003-04-23 19:22:12 +02:00
|
|
|
abort();
|
2003-01-13 01:32:26 +01:00
|
|
|
}
|
2002-12-25 06:13:53 +01:00
|
|
|
return;
|
|
|
|
}
|
2003-01-13 01:32:26 +01:00
|
|
|
|
|
|
|
// Handle cast of SMALLER int to LARGER int using a move with sign extension
|
|
|
|
// or zero extension, depending on whether the source type was signed.
|
|
|
|
if (SrcClass <= cInt && (DestClass <= cInt || DestClass == cLong) &&
|
|
|
|
SrcClass < DestClass) {
|
|
|
|
bool isLong = DestClass == cLong;
|
|
|
|
if (isLong) DestClass = cInt;
|
|
|
|
|
|
|
|
static const unsigned Opc[][4] = {
|
2004-02-29 09:50:03 +01:00
|
|
|
{ X86::MOVSX16rr8, X86::MOVSX32rr8, X86::MOVSX32rr16, X86::MOV32rr }, // s
|
|
|
|
{ X86::MOVZX16rr8, X86::MOVZX32rr8, X86::MOVZX32rr16, X86::MOV32rr } // u
|
2003-01-13 01:32:26 +01:00
|
|
|
};
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-05-10 00:28:45 +02:00
|
|
|
bool isUnsigned = SrcTy->isUnsigned() || SrcTy == Type::BoolTy;
|
2004-02-29 08:22:16 +01:00
|
|
|
BuildMI(*BB, IP, Opc[isUnsigned][SrcClass + DestClass - 1], 1,
|
2003-04-23 19:22:12 +02:00
|
|
|
DestReg).addReg(SrcReg);
|
2003-01-13 01:32:26 +01:00
|
|
|
|
|
|
|
if (isLong) { // Handle upper 32 bits as appropriate...
|
|
|
|
if (isUnsigned) // Zero out top bits...
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*BB, IP, X86::MOV32ri, 1, DestReg+1).addImm(0);
|
2003-01-13 01:32:26 +01:00
|
|
|
else // Sign extend bottom half...
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*BB, IP, X86::SAR32ri, 2, DestReg+1).addReg(DestReg).addImm(31);
|
2002-12-06 11:49:33 +01:00
|
|
|
}
|
2003-01-13 01:32:26 +01:00
|
|
|
return;
|
|
|
|
}
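Condensing the widening-cast table above into one lookup (the helper is hypothetical; the table contents and the SrcClass + DestClass - 1 indexing are taken verbatim from the code, with DestClass already clamped to cInt for long destinations):

    // With cByte=0, cShort=1, cInt=2: a signed short -> int cast picks
    // Opc[0][1 + 2 - 1] = X86::MOVSX32rr16; an unsigned byte -> short
    // cast picks Opc[1][0 + 1 - 1] = X86::MOVZX16rr8.
    static unsigned pickExtendOpcode(bool isUnsigned,
                                     unsigned SrcClass, unsigned DestClass) {
      static const unsigned Opc[][4] = {
        { X86::MOVSX16rr8, X86::MOVSX32rr8, X86::MOVSX32rr16, X86::MOV32rr },
        { X86::MOVZX16rr8, X86::MOVZX32rr8, X86::MOVZX32rr16, X86::MOV32rr }
      };
      return Opc[isUnsigned][SrcClass + DestClass - 1];
    }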
|
|
|
|
|
|
|
|
// Special case long -> int ...
|
|
|
|
if (SrcClass == cLong && DestClass == cInt) {
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*BB, IP, X86::MOV32rr, 1, DestReg).addReg(SrcReg);
|
2003-01-13 01:32:26 +01:00
|
|
|
return;
|
|
|
|
}
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2003-01-13 01:32:26 +01:00
|
|
|
// Handle cast of LARGER int to SMALLER int using a move to EAX followed by a
|
|
|
|
// move out of AX or AL.
|
|
|
|
if ((SrcClass <= cInt || SrcClass == cLong) && DestClass <= cInt
|
|
|
|
&& SrcClass > DestClass) {
|
|
|
|
static const unsigned AReg[] = { X86::AL, X86::AX, X86::EAX, 0, X86::EAX };
|
2004-02-29 08:22:16 +01:00
|
|
|
BuildMI(*BB, IP, RegRegMove[SrcClass], 1, AReg[SrcClass]).addReg(SrcReg);
|
|
|
|
BuildMI(*BB, IP, RegRegMove[DestClass], 1, DestReg).addReg(AReg[DestClass]);
|
2003-01-13 01:32:26 +01:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Handle casts from integer to floating point now...
|
|
|
|
if (DestClass == cFP) {
|
2003-05-12 22:36:13 +02:00
|
|
|
// Promote the integer to a type supported by FLD. We do this because there
|
|
|
|
// are no unsigned FLD instructions, so we must promote an unsigned value to
|
|
|
|
// a larger signed value, then use FLD on the larger value.
|
|
|
|
//
|
|
|
|
const Type *PromoteType = 0;
|
2004-04-10 20:32:01 +02:00
|
|
|
unsigned PromoteOpcode = 0;
|
2004-02-23 04:10:10 +01:00
|
|
|
unsigned RealDestReg = DestReg;
|
2004-06-17 20:19:28 +02:00
|
|
|
switch (SrcTy->getTypeID()) {
|
2003-05-12 22:36:13 +02:00
|
|
|
case Type::BoolTyID:
|
|
|
|
case Type::SByteTyID:
|
|
|
|
// We don't have the facilities for directly loading byte-sized data from
|
|
|
|
// memory (even signed). Promote it to 16 bits.
|
|
|
|
PromoteType = Type::ShortTy;
|
2004-02-29 09:50:03 +01:00
|
|
|
PromoteOpcode = X86::MOVSX16rr8;
|
2003-05-12 22:36:13 +02:00
|
|
|
break;
|
|
|
|
case Type::UByteTyID:
|
|
|
|
PromoteType = Type::ShortTy;
|
2004-02-29 09:50:03 +01:00
|
|
|
PromoteOpcode = X86::MOVZX16rr8;
|
2003-05-12 22:36:13 +02:00
|
|
|
break;
|
|
|
|
case Type::UShortTyID:
|
|
|
|
PromoteType = Type::IntTy;
|
2004-02-29 09:50:03 +01:00
|
|
|
PromoteOpcode = X86::MOVZX32rr16;
|
2003-05-12 22:36:13 +02:00
|
|
|
break;
|
|
|
|
case Type::ULongTyID:
|
Rewrite support for cast uint -> FP. In particular, we used to compile this:
double %test(uint %X) {
%tmp.1 = cast uint %X to double ; <double> [#uses=1]
ret double %tmp.1
}
into:
test:
sub %ESP, 8
mov %EAX, DWORD PTR [%ESP + 12]
mov %ECX, 0
mov DWORD PTR [%ESP], %EAX
mov DWORD PTR [%ESP + 4], %ECX
fild QWORD PTR [%ESP]
add %ESP, 8
ret
... which basically zero extends to 8 bytes, then does an fild for an
8-byte signed int.
Now we generate this:
test:
sub %ESP, 4
mov %EAX, DWORD PTR [%ESP + 8]
mov DWORD PTR [%ESP], %EAX
fild DWORD PTR [%ESP]
shr %EAX, 31
fadd DWORD PTR [.CPItest_0 + 4*%EAX]
add %ESP, 4
ret
.section .rodata
.align 4
.CPItest_0:
.quad 5728578726015270912
This does a 32-bit signed integer load, then adds in an offset if the sign
bit of the integer was set.
It turns out that this is substantially faster than the preceding sequence.
Consider this testcase:
unsigned a[2]={1,2};
volatile double G;
void main() {
int i;
for (i=0; i<100000000; ++i )
G += a[i&1];
}
On zion (a P4 Xeon, 3GHz), this patch speeds up the testcase from 2.140s
to 0.94s.
On apoc, an athlon MP 2100+, this patch speeds up the testcase from 1.72s
to 1.34s.
Note that the program takes 2.5s/1.97s on zion/apoc with GCC 3.3 -O3
-fomit-frame-pointer.
llvm-svn: 17083
2004-10-17 10:01:28 +02:00
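The magic numbers in this path decode as IEEE-754 single-precision bit patterns: 0x4f800000 is 2^32, and 0x5f800000 (used below for the ulong case) is 2^64. A standalone demonstration, independent of the backend, of why adding 2^32 repairs a uint that FILD read back as a signed 32-bit integer:

    #include <cstdio>
    #include <cstring>

    int main() {
      unsigned X = 0x80000001u;        // "sign" bit set
      double D = (double)(int)X;       // what fild produces: low by 2^32
      unsigned Bits = 0x4f800000u;     // IEEE-754 single for 4294967296.0f
      float Offset;
      std::memcpy(&Offset, &Bits, sizeof Offset);
      if (X >> 31)                     // add the offset iff the bit was set
        D += Offset;
      std::printf("%.1f\n", D);        // prints 2147483649.0, the true value
      return 0;
    }

The emitted code is branch-free: the constant-pool entry 0x4f80000000000000ULL holds 0.0f in its low word and 2^32 in its high word, and the FADD32m below indexes it with 4*IsNeg to add one or the other.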
|
|
|
case Type::UIntTyID:
|
2004-02-23 04:10:10 +01:00
|
|
|
// Don't fild into the real destination.
|
|
|
|
DestReg = makeAnotherReg(Type::DoubleTy);
|
|
|
|
break;
|
2003-05-12 22:36:13 +02:00
|
|
|
default: // No promotion needed...
|
|
|
|
break;
|
|
|
|
}
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2003-05-12 22:36:13 +02:00
|
|
|
if (PromoteType) {
|
|
|
|
unsigned TmpReg = makeAnotherReg(PromoteType);
|
2004-04-06 21:29:36 +02:00
|
|
|
BuildMI(*BB, IP, PromoteOpcode, 1, TmpReg).addReg(SrcReg);
|
2003-05-12 22:36:13 +02:00
|
|
|
SrcTy = PromoteType;
|
|
|
|
SrcClass = getClass(PromoteType);
|
2003-01-13 01:32:26 +01:00
|
|
|
SrcReg = TmpReg;
|
2002-12-06 11:49:33 +01:00
|
|
|
}
|
2003-01-13 01:32:26 +01:00
|
|
|
|
|
|
|
// Spill the integer to memory and reload it from there...
|
2004-02-25 07:13:04 +01:00
|
|
|
int FrameIdx =
|
|
|
|
F->getFrameInfo()->CreateStackObject(SrcTy, TM.getTargetData());
|
2003-01-13 01:32:26 +01:00
|
|
|
|
|
|
|
if (SrcClass == cLong) {
|
2004-02-29 09:50:03 +01:00
|
|
|
addFrameReference(BuildMI(*BB, IP, X86::MOV32mr, 5),
|
2004-02-29 08:22:16 +01:00
|
|
|
FrameIdx).addReg(SrcReg);
|
2004-02-29 09:50:03 +01:00
|
|
|
addFrameReference(BuildMI(*BB, IP, X86::MOV32mr, 5),
|
2003-10-23 18:22:08 +02:00
|
|
|
FrameIdx, 4).addReg(SrcReg+1);
|
2003-01-13 01:32:26 +01:00
|
|
|
} else {
|
2004-02-29 09:50:03 +01:00
|
|
|
static const unsigned Op1[] = { X86::MOV8mr, X86::MOV16mr, X86::MOV32mr };
|
2004-02-29 08:22:16 +01:00
|
|
|
addFrameReference(BuildMI(*BB, IP, Op1[SrcClass], 5),
|
|
|
|
FrameIdx).addReg(SrcReg);
|
2003-01-13 01:32:26 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
static const unsigned Op2[] =
|
2004-02-29 09:50:03 +01:00
|
|
|
{ 0/*byte*/, X86::FILD16m, X86::FILD32m, 0/*FP*/, X86::FILD64m };
|
2004-02-29 08:22:16 +01:00
|
|
|
addFrameReference(BuildMI(*BB, IP, Op2[SrcClass], 5, DestReg), FrameIdx);
|
2004-02-23 04:10:10 +01:00
|
|
|
|
2004-10-17 10:01:28 +02:00
|
|
|
if (SrcTy == Type::UIntTy) {
|
|
|
|
// If this is a cast from uint -> double, we need to be careful about whether
|
|
|
|
// the "sign" bit is set. If so, we don't want to make a negative number,
|
|
|
|
// we want to make a positive number. Emit code to add an offset if the
|
|
|
|
// sign bit is set.
|
|
|
|
|
|
|
|
// Compute whether the sign bit is set by shifting the reg right 31 bits.
|
|
|
|
unsigned IsNeg = makeAnotherReg(Type::IntTy);
|
2005-01-09 02:49:29 +01:00
|
|
|
BuildMI(*BB, IP, X86::SHR32ri, 2, IsNeg).addReg(SrcReg).addImm(31);
|
2004-10-17 10:01:28 +02:00
|
|
|
|
|
|
|
// Create a CP value that has the offset in one word and 0 in the other.
|
|
|
|
static ConstantInt *TheOffset = ConstantUInt::get(Type::ULongTy,
|
|
|
|
0x4f80000000000000ULL);
|
|
|
|
unsigned CPI = F->getConstantPool()->getConstantPoolIndex(TheOffset);
|
2005-01-09 02:49:29 +01:00
|
|
|
BuildMI(*BB, IP, X86::FADD32m, 5, RealDestReg).addReg(DestReg)
|
2004-10-17 10:01:28 +02:00
|
|
|
.addConstantPoolIndex(CPI).addZImm(4).addReg(IsNeg).addSImm(0);
|
|
|
|
|
|
|
|
} else if (SrcTy == Type::ULongTy) {
|
|
|
|
// We need special handling for unsigned 64-bit integer sources. If the
|
|
|
|
// input number has the "sign bit" set, then we loaded it incorrectly as a
|
|
|
|
// negative 64-bit number. In this case, add an offset value.
|
|
|
|
|
2004-02-23 04:10:10 +01:00
|
|
|
// Emit a test instruction to see if the dynamic input value was signed.
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*BB, IP, X86::TEST32rr, 2).addReg(SrcReg+1).addReg(SrcReg+1);
|
2004-02-23 04:10:10 +01:00
|
|
|
|
2004-02-25 07:13:04 +01:00
|
|
|
// If the sign bit is set, get a pointer to an offset, otherwise get a
|
|
|
|
// pointer to a zero.
|
2004-02-23 04:10:10 +01:00
|
|
|
MachineConstantPool *CP = F->getConstantPool();
|
|
|
|
unsigned Zero = makeAnotherReg(Type::IntTy);
|
2004-02-25 07:13:04 +01:00
|
|
|
Constant *Null = Constant::getNullValue(Type::UIntTy);
|
2005-04-22 01:38:14 +02:00
|
|
|
addConstantPoolReference(BuildMI(*BB, IP, X86::LEA32r, 5, Zero),
|
2004-02-25 07:13:04 +01:00
|
|
|
CP->getConstantPoolIndex(Null));
|
2004-02-23 04:10:10 +01:00
|
|
|
unsigned Offset = makeAnotherReg(Type::IntTy);
|
2004-02-25 07:13:04 +01:00
|
|
|
Constant *OffsetCst = ConstantUInt::get(Type::UIntTy, 0x5f800000);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-02-29 09:50:03 +01:00
|
|
|
addConstantPoolReference(BuildMI(*BB, IP, X86::LEA32r, 5, Offset),
|
2004-02-25 07:13:04 +01:00
|
|
|
CP->getConstantPoolIndex(OffsetCst));
|
2004-02-23 04:10:10 +01:00
|
|
|
unsigned Addr = makeAnotherReg(Type::IntTy);
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*BB, IP, X86::CMOVS32rr, 2, Addr).addReg(Zero).addReg(Offset);
|
2004-02-23 04:10:10 +01:00
|
|
|
|
|
|
|
// Load the constant for an add. FIXME: this could make an 'fadd' that
|
|
|
|
// reads directly from memory, but we don't support these yet.
|
|
|
|
unsigned ConstReg = makeAnotherReg(Type::DoubleTy);
|
2004-02-29 09:50:03 +01:00
|
|
|
addDirectMem(BuildMI(*BB, IP, X86::FLD32m, 4, ConstReg), Addr);
|
2004-02-23 04:10:10 +01:00
|
|
|
|
2004-02-29 08:22:16 +01:00
|
|
|
BuildMI(*BB, IP, X86::FpADD, 2, RealDestReg)
|
|
|
|
.addReg(ConstReg).addReg(DestReg);
|
2004-02-23 04:10:10 +01:00
|
|
|
}
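The constant 0x5f800000 stored in the constant pool above is the IEEE-754 single-precision bit pattern for 2^64, the bias that must be added back when the value's top bit was set. A small standalone check, an illustration rather than backend code:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  uint32_t Bits = 0x5f800000;        // the constant-pool value used above
  float F;
  std::memcpy(&F, &Bits, sizeof F);  // reinterpret the bits as a float
  std::printf("%.1f\n", (double)F);  // prints 18446744073709551616.0 == 2^64
}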
|
|
|
|
|
2003-01-13 01:32:26 +01:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Handle casts from floating point to integer now...
|
|
|
|
if (SrcClass == cFP) {
|
|
|
|
// Change the floating point control register to use "round towards zero"
|
|
|
|
// mode when truncating to an integer value.
|
|
|
|
//
|
|
|
|
int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2);
|
2004-02-29 09:50:03 +01:00
|
|
|
addFrameReference(BuildMI(*BB, IP, X86::FNSTCW16m, 4), CWFrameIdx);
|
2003-01-13 01:32:26 +01:00
|
|
|
|
|
|
|
// Load the old value of the high byte of the control word...
|
|
|
|
unsigned HighPartOfCW = makeAnotherReg(Type::UByteTy);
|
2004-02-29 09:50:03 +01:00
|
|
|
addFrameReference(BuildMI(*BB, IP, X86::MOV8rm, 4, HighPartOfCW),
|
2004-02-29 08:22:16 +01:00
|
|
|
CWFrameIdx, 1);
|
2003-01-13 01:32:26 +01:00
|
|
|
|
|
|
|
// Set the high part to be round to zero...
|
2004-02-29 09:50:03 +01:00
|
|
|
addFrameReference(BuildMI(*BB, IP, X86::MOV8mi, 5),
|
2004-02-29 08:22:16 +01:00
|
|
|
CWFrameIdx, 1).addImm(12);
|
2003-01-13 01:32:26 +01:00
|
|
|
|
|
|
|
// Reload the modified control word now...
|
2004-02-29 09:50:03 +01:00
|
|
|
addFrameReference(BuildMI(*BB, IP, X86::FLDCW16m, 4), CWFrameIdx);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2003-01-13 01:32:26 +01:00
|
|
|
// Restore the in-memory image of the control word to its original value
|
2004-02-29 09:50:03 +01:00
|
|
|
addFrameReference(BuildMI(*BB, IP, X86::MOV8mr, 5),
|
2003-10-23 18:22:08 +02:00
|
|
|
CWFrameIdx, 1).addReg(HighPartOfCW);
|
2003-01-13 01:32:26 +01:00
|
|
|
|
|
|
|
// We don't have the facilities for directly storing byte sized data to
|
|
|
|
// memory. Promote it to 16 bits. We also must promote unsigned values to
|
|
|
|
// larger classes because we only have signed FP stores.
|
|
|
|
unsigned StoreClass = DestClass;
|
|
|
|
const Type *StoreTy = DestTy;
|
|
|
|
if (StoreClass == cByte || DestTy->isUnsigned())
|
|
|
|
switch (StoreClass) {
|
|
|
|
case cByte: StoreTy = Type::ShortTy; StoreClass = cShort; break;
|
|
|
|
case cShort: StoreTy = Type::IntTy; StoreClass = cInt; break;
|
|
|
|
case cInt: StoreTy = Type::LongTy; StoreClass = cLong; break;
|
2003-07-18 22:23:43 +02:00
|
|
|
// The following treatment of cLong may not be perfectly right,
|
|
|
|
// but it survives chains of casts of the form
|
|
|
|
// double->ulong->double.
|
|
|
|
case cLong: StoreTy = Type::LongTy; StoreClass = cLong; break;
|
2003-01-13 01:32:26 +01:00
|
|
|
default: assert(0 && "Unknown store class!");
|
|
|
|
}
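The mapping produced by this promotion can be restated compactly. A standalone sketch, assuming (as the FIST/FISTP table below implies) that only signed 16/32/64-bit integer stores exist:

// Widen bytes and unsigned values one class so the store never overflows.
static const char *promotedStoreOpcode(unsigned Bits, bool IsUnsigned) {
  if (Bits == 8)  return "FIST16m";                           // byte -> short
  if (Bits == 16) return IsUnsigned ? "FIST32m" : "FIST16m";  // short -> int
  if (Bits == 32) return IsUnsigned ? "FISTP64m" : "FIST32m"; // int -> long
  return "FISTP64m";                                          // long stays long
}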
|
|
|
|
|
|
|
|
// Spill the integer to memory and reload it from there...
|
|
|
|
int FrameIdx =
|
|
|
|
F->getFrameInfo()->CreateStackObject(StoreTy, TM.getTargetData());
|
|
|
|
|
|
|
|
static const unsigned Op1[] =
|
2004-02-29 09:50:03 +01:00
|
|
|
{ 0, X86::FIST16m, X86::FIST32m, 0, X86::FISTP64m };
|
2004-02-29 08:22:16 +01:00
|
|
|
addFrameReference(BuildMI(*BB, IP, Op1[StoreClass], 5),
|
|
|
|
FrameIdx).addReg(SrcReg);
|
2003-01-13 01:32:26 +01:00
|
|
|
|
|
|
|
if (DestClass == cLong) {
|
2004-02-29 09:50:03 +01:00
|
|
|
addFrameReference(BuildMI(*BB, IP, X86::MOV32rm, 4, DestReg), FrameIdx);
|
|
|
|
addFrameReference(BuildMI(*BB, IP, X86::MOV32rm, 4, DestReg+1),
|
2004-02-29 08:22:16 +01:00
|
|
|
FrameIdx, 4);
|
2003-01-13 01:32:26 +01:00
|
|
|
} else {
|
2004-02-29 09:50:03 +01:00
|
|
|
static const unsigned Op2[] = { X86::MOV8rm, X86::MOV16rm, X86::MOV32rm };
|
2004-02-29 08:22:16 +01:00
|
|
|
addFrameReference(BuildMI(*BB, IP, Op2[DestClass], 4, DestReg), FrameIdx);
|
2003-01-13 01:32:26 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
// Reload the original control word now...
|
2004-02-29 09:50:03 +01:00
|
|
|
addFrameReference(BuildMI(*BB, IP, X86::FLDCW16m, 4), CWFrameIdx);
|
2003-01-13 01:32:26 +01:00
|
|
|
return;
|
|
|
|
}
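The FNSTCW/FLDCW sequence above is the classic save/set/restore of the x87 rounding mode around a truncating store. A behavioral sketch in portable C++, assuming <cfenv> is available; it stands in for the emitted instructions rather than reproducing them:

#include <cfenv>
#include <cmath>

long truncateToLong(double D) {
  int OldMode = std::fegetround();   // FNSTCW: save the control word
  std::fesetround(FE_TOWARDZERO);    // FLDCW: force round-toward-zero
  long Result = std::lrint(D);       // FIST-style store honors the mode
  std::fesetround(OldMode);          // FLDCW: restore the original word
  return Result;
}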
|
|
|
|
|
2002-12-06 11:49:33 +01:00
|
|
|
// Anything we haven't handled already, we can't (yet) handle at all.
|
2003-05-12 22:16:58 +02:00
|
|
|
assert(0 && "Unhandled cast instruction!");
|
2003-04-23 19:22:12 +02:00
|
|
|
abort();
|
2002-11-22 12:07:01 +01:00
|
|
|
}
|
2002-11-01 00:03:59 +01:00
|
|
|
|
2003-10-18 07:56:40 +02:00
|
|
|
/// visitVANextInst - Implement the va_next instruction...
|
2003-05-08 21:44:13 +02:00
|
|
|
///
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::visitVANextInst(VANextInst &I) {
|
2003-10-18 07:56:40 +02:00
|
|
|
unsigned VAList = getReg(I.getOperand(0));
|
2003-05-08 21:44:13 +02:00
|
|
|
unsigned DestReg = getReg(I);
|
|
|
|
|
|
|
|
unsigned Size;
|
2004-06-17 20:19:28 +02:00
|
|
|
switch (I.getArgType()->getTypeID()) {
|
2003-05-08 21:44:13 +02:00
|
|
|
default:
|
|
|
|
std::cerr << I;
|
2003-10-18 07:56:40 +02:00
|
|
|
assert(0 && "Error: bad type for va_next instruction!");
|
2003-05-08 21:44:13 +02:00
|
|
|
return;
|
|
|
|
case Type::PointerTyID:
|
|
|
|
case Type::UIntTyID:
|
|
|
|
case Type::IntTyID:
|
|
|
|
Size = 4;
|
|
|
|
break;
|
|
|
|
case Type::ULongTyID:
|
|
|
|
case Type::LongTyID:
|
2003-10-18 07:56:40 +02:00
|
|
|
case Type::DoubleTyID:
|
2003-05-08 21:44:13 +02:00
|
|
|
Size = 8;
|
2003-10-18 07:56:40 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Increment the VAList pointer...
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(BB, X86::ADD32ri, 2, DestReg).addReg(VAList).addImm(Size);
|
2003-10-18 07:56:40 +02:00
|
|
|
}
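Behaviorally, the lowering above is plain pointer arithmetic on the va_list cursor. A minimal model, an assumption-level sketch rather than the selector itself:

// Model of va_next: advance the cursor past the 4- or 8-byte slot consumed.
// The single ADD32ri emitted above computes exactly this.
char *vaNextModel(char *VAListPtr, unsigned ArgSize /* 4 or 8 */) {
  return VAListPtr + ArgSize;
}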
|
|
|
|
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::visitVAArgInst(VAArgInst &I) {
|
2003-10-18 07:56:40 +02:00
|
|
|
unsigned VAList = getReg(I.getOperand(0));
|
|
|
|
unsigned DestReg = getReg(I);
|
|
|
|
|
2004-06-17 20:19:28 +02:00
|
|
|
switch (I.getType()->getTypeID()) {
|
2003-10-18 07:56:40 +02:00
|
|
|
default:
|
|
|
|
std::cerr << I;
|
|
|
|
assert(0 && "Error: bad type for va_next instruction!");
|
|
|
|
return;
|
|
|
|
case Type::PointerTyID:
|
|
|
|
case Type::UIntTyID:
|
|
|
|
case Type::IntTyID:
|
2004-02-29 09:50:03 +01:00
|
|
|
addDirectMem(BuildMI(BB, X86::MOV32rm, 4, DestReg), VAList);
|
2003-10-18 07:56:40 +02:00
|
|
|
break;
|
|
|
|
case Type::ULongTyID:
|
|
|
|
case Type::LongTyID:
|
2004-02-29 09:50:03 +01:00
|
|
|
addDirectMem(BuildMI(BB, X86::MOV32rm, 4, DestReg), VAList);
|
|
|
|
addRegOffset(BuildMI(BB, X86::MOV32rm, 4, DestReg+1), VAList, 4);
|
2003-05-08 21:44:13 +02:00
|
|
|
break;
|
|
|
|
case Type::DoubleTyID:
|
2004-02-29 09:50:03 +01:00
|
|
|
addDirectMem(BuildMI(BB, X86::FLD64m, 4, DestReg), VAList);
|
2003-05-08 21:44:13 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
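Likewise, va_arg reduces to a load through the current cursor, with 64-bit types taking two 32-bit loads. A standalone model of the 32-bit case:

#include <cstdint>
#include <cstring>

// Model of va_arg for a 4-byte type: one MOV32rm from [VAList].
int32_t vaArgIntModel(const char *VAListPtr) {
  int32_t V;
  std::memcpy(&V, VAListPtr, sizeof V);
  return V;
}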
|
|
|
|
|
2004-03-02 00:53:11 +01:00
|
|
|
/// visitGetElementPtrInst - instruction-select GEP instructions
|
|
|
|
///
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::visitGetElementPtrInst(GetElementPtrInst &I) {
|
2004-02-25 07:13:04 +01:00
|
|
|
// If this GEP instruction will be folded into all of its users, we don't need
|
|
|
|
// to explicitly calculate it!
|
2004-08-30 02:13:26 +02:00
|
|
|
X86AddressMode AM;
|
|
|
|
if (isGEPFoldable(0, I.getOperand(0), I.op_begin()+1, I.op_end(), AM)) {
|
2004-02-25 07:13:04 +01:00
|
|
|
// Check all of the users of the instruction to see if they are loads and
|
|
|
|
// stores.
|
|
|
|
bool AllWillFold = true;
|
|
|
|
for (Value::use_iterator UI = I.use_begin(), E = I.use_end(); UI != E; ++UI)
|
|
|
|
if (cast<Instruction>(*UI)->getOpcode() != Instruction::Load)
|
|
|
|
if (cast<Instruction>(*UI)->getOpcode() != Instruction::Store ||
|
|
|
|
cast<Instruction>(*UI)->getOperand(0) == &I) {
|
|
|
|
AllWillFold = false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If the instruction is foldable, and will be folded into all users, don't
|
|
|
|
// emit it!
|
|
|
|
if (AllWillFold) return;
|
|
|
|
}
|
|
|
|
|
2003-01-13 01:32:26 +01:00
|
|
|
unsigned outputReg = getReg(I);
|
2004-02-22 18:05:38 +01:00
|
|
|
emitGEPOperation(BB, BB->end(), I.getOperand(0),
|
2002-12-16 05:23:29 +01:00
|
|
|
I.op_begin()+1, I.op_end(), outputReg);
|
2002-12-13 07:56:29 +01:00
|
|
|
}
|
|
|
|
|
2004-02-25 04:45:50 +01:00
|
|
|
/// getGEPIndex - Inspect the getelementptr operands specified with GEPOps and
|
|
|
|
/// GEPTypes (the derived types being stepped through at each level). On return
|
|
|
|
/// from this function, if some indexes of the instruction are representable as
|
|
|
|
/// an X86 lea instruction, the machine operands are put into the AM
|
|
|
|
/// object and the consumed indexes are popped from the GEPOps/GEPTypes
|
|
|
|
/// lists. Otherwise, GEPOps.size() is returned. If this returns an
|
|
|
|
/// addressing mode that only partially consumes the input, the BaseReg input of
|
|
|
|
/// the addressing mode must be left free.
|
|
|
|
///
|
|
|
|
/// Note that there is one fewer entry in GEPTypes than there is in GEPOps.
|
|
|
|
///
|
2005-04-22 01:38:14 +02:00
|
|
|
void X86ISel::getGEPIndex(MachineBasicBlock *MBB,
|
2004-09-21 20:21:21 +02:00
|
|
|
MachineBasicBlock::iterator IP,
|
|
|
|
std::vector<Value*> &GEPOps,
|
|
|
|
std::vector<const Type*> &GEPTypes,
|
|
|
|
X86AddressMode &AM) {
|
2004-02-25 07:13:04 +01:00
|
|
|
const TargetData &TD = TM.getTargetData();
|
|
|
|
|
2004-02-25 04:45:50 +01:00
|
|
|
// Clear out the state we are working with...
|
2004-08-30 02:13:26 +02:00
|
|
|
AM.BaseType = X86AddressMode::RegBase;
|
|
|
|
AM.Base.Reg = 0; // No base register
|
|
|
|
AM.Scale = 1; // Unit scale
|
|
|
|
AM.IndexReg = 0; // No index register
|
|
|
|
AM.Disp = 0; // No displacement
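The fields initialized above combine into the usual x86 effective-address formula [Base + Scale*Index + Disp]. A hypothetical model mirroring the field names, purely illustrative:

#include <cstdint>

struct AddrModeModel {   // stand-in for X86AddressMode's RegBase case
  uint32_t BaseReg = 0;  // value held in the base register
  uint32_t Scale = 1;    // 1, 2, 4, or 8
  uint32_t IndexReg = 0; // value held in the index register
  int32_t  Disp = 0;     // constant displacement
};

uint32_t effectiveAddress(const AddrModeModel &AM) {
  return AM.BaseReg + AM.Scale * AM.IndexReg + AM.Disp;
}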
|
2004-02-25 07:13:04 +01:00
|
|
|
|
2004-02-25 04:45:50 +01:00
|
|
|
// While there are GEP indexes that can be folded into the current address,
|
|
|
|
// keep processing them.
|
|
|
|
while (!GEPTypes.empty()) {
|
|
|
|
if (const StructType *StTy = dyn_cast<StructType>(GEPTypes.back())) {
|
|
|
|
// It's a struct access. CUI is the index into the structure,
|
|
|
|
// which names the field. This index must have unsigned type.
|
|
|
|
const ConstantUInt *CUI = cast<ConstantUInt>(GEPOps.back());
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-02-25 04:45:50 +01:00
|
|
|
// Use the TargetData structure to pick out what the layout of the
|
|
|
|
// structure is in memory. Since the structure index must be constant, we
|
|
|
|
// can get its value and use it to find the right byte offset from the
|
|
|
|
// StructLayout class's list of structure member offsets.
|
2004-08-30 02:13:26 +02:00
|
|
|
AM.Disp += TD.getStructLayout(StTy)->MemberOffsets[CUI->getValue()];
|
2004-02-25 04:45:50 +01:00
|
|
|
GEPOps.pop_back(); // Consume a GEP operand
|
|
|
|
GEPTypes.pop_back();
|
|
|
|
} else {
|
|
|
|
// It's an array or pointer access: [ArraySize x ElementType].
|
|
|
|
const SequentialType *SqTy = cast<SequentialType>(GEPTypes.back());
|
|
|
|
Value *idx = GEPOps.back();
|
|
|
|
|
|
|
|
// idx is the index into the array. Unlike with structure
|
|
|
|
// indices, we may not know its actual value at code-generation
|
|
|
|
// time.
|
|
|
|
|
|
|
|
// If idx is a constant, fold it into the offset.
|
Teach the instruction selector how to transform 'array' GEP computations into X86
scaled indexes. This allows us to compile GEP's like this:
int* %test([10 x { int, { int } }]* %X, int %Idx) {
%Idx = cast int %Idx to long
%X = getelementptr [10 x { int, { int } }]* %X, long 0, long %Idx, ubyte 1, ubyte 0
ret int* %X
}
Into a single address computation:
test:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, DWORD PTR [%ESP + 8]
lea %EAX, DWORD PTR [%EAX + 8*%ECX + 4]
ret
Before it generated:
test:
mov %EAX, DWORD PTR [%ESP + 4]
mov %ECX, DWORD PTR [%ESP + 8]
shl %ECX, 3
add %EAX, %ECX
lea %EAX, DWORD PTR [%EAX + 4]
ret
This is useful for things like int/float/double arrays, as the indexing can be folded into
the loads&stores, reducing register pressure and decreasing the pressure on the decode unit.
With these changes, I expect our performance on 256.bzip2 and gzip to improve a lot. On
bzip2 for example, we go from this:
10665 asm-printer - Number of machine instrs printed
40 ra-local - Number of loads/stores folded into instructions
1708 ra-local - Number of loads added
1532 ra-local - Number of stores added
1354 twoaddressinstruction - Number of instructions added
1354 twoaddressinstruction - Number of two-address instructions
2794 x86-peephole - Number of peephole optimization performed
to this:
9873 asm-printer - Number of machine instrs printed
41 ra-local - Number of loads/stores folded into instructions
1710 ra-local - Number of loads added
1521 ra-local - Number of stores added
789 twoaddressinstruction - Number of instructions added
789 twoaddressinstruction - Number of two-address instructions
2142 x86-peephole - Number of peephole optimization performed
... and these types of instructions are often in tight loops.
Linear scan is also helped, but not as much. It goes from:
8787 asm-printer - Number of machine instrs printed
2389 liveintervals - Number of identity moves eliminated after coalescing
2288 liveintervals - Number of interval joins performed
3522 liveintervals - Number of intervals after coalescing
5810 liveintervals - Number of original intervals
700 spiller - Number of loads added
487 spiller - Number of stores added
303 spiller - Number of register spills
1354 twoaddressinstruction - Number of instructions added
1354 twoaddressinstruction - Number of two-address instructions
363 x86-peephole - Number of peephole optimization performed
to:
7982 asm-printer - Number of machine instrs printed
1759 liveintervals - Number of identity moves eliminated after coalescing
1658 liveintervals - Number of interval joins performed
3282 liveintervals - Number of intervals after coalescing
4940 liveintervals - Number of original intervals
635 spiller - Number of loads added
452 spiller - Number of stores added
288 spiller - Number of register spills
789 twoaddressinstruction - Number of instructions added
789 twoaddressinstruction - Number of two-address instructions
258 x86-peephole - Number of peephole optimization performed
Though I'm not complaining about the drop in the number of intervals. :)
llvm-svn: 11820
2004-02-25 08:00:55 +01:00
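The scaled-index fold above is what a struct-array access becomes at the source level. A minimal C++ analogue of the commit's example, assuming an 8-byte element with the addressed field at offset 4:

struct Elem { int First; int Second; };  // 8 bytes, Second at offset 4

// &Base[Idx].Second computes Base + 8*Idx + 4, which x86 encodes as a
// single "lea %EAX, DWORD PTR [%EAX + 8*%ECX + 4]".
int *elementSecond(Elem *Base, int Idx) {
  return &Base[Idx].Second;
}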
|
|
|
unsigned TypeSize = TD.getTypeSize(SqTy->getElementType());
|
2004-02-25 04:45:50 +01:00
|
|
|
if (ConstantSInt *CSI = dyn_cast<ConstantSInt>(idx)) {
|
2004-08-30 02:13:26 +02:00
|
|
|
AM.Disp += TypeSize*CSI->getValue();
|
2004-04-05 03:30:19 +02:00
|
|
|
} else if (ConstantUInt *CUI = dyn_cast<ConstantUInt>(idx)) {
|
2004-08-30 02:13:26 +02:00
|
|
|
AM.Disp += TypeSize*CUI->getValue();
|
2004-02-25 04:45:50 +01:00
|
|
|
} else {
|
2004-02-25 08:00:55 +01:00
|
|
|
// If the index reg is already taken, we can't handle this index.
|
2004-08-30 02:13:26 +02:00
|
|
|
if (AM.IndexReg) return;
|
2004-02-25 08:00:55 +01:00
|
|
|
|
2005-04-22 01:38:14 +02:00
|
|
|
// If this is a size that we can handle, then add the index as
|
2004-02-25 08:00:55 +01:00
|
|
|
switch (TypeSize) {
|
|
|
|
case 1: case 2: case 4: case 8:
|
|
|
|
// These are all acceptable scales on X86.
|
2004-08-30 02:13:26 +02:00
|
|
|
AM.Scale = TypeSize;
|
2004-02-25 08:00:55 +01:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
// Otherwise, we can't handle this scale
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (CastInst *CI = dyn_cast<CastInst>(idx))
|
|
|
|
if (CI->getOperand(0)->getType() == Type::IntTy ||
|
|
|
|
CI->getOperand(0)->getType() == Type::UIntTy)
|
|
|
|
idx = CI->getOperand(0);
|
|
|
|
|
2004-08-30 02:13:26 +02:00
|
|
|
AM.IndexReg = MBB ? getReg(idx, MBB, IP) : 1;
|
2004-02-25 04:45:50 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
GEPOps.pop_back(); // Consume a GEP operand
|
|
|
|
GEPTypes.pop_back();
|
|
|
|
}
|
|
|
|
}
|
2004-02-25 07:13:04 +01:00
|
|
|
|
2004-05-23 23:23:12 +02:00
|
|
|
// GEPTypes is empty, which means we have a single operand left. Set it as
|
|
|
|
// the base register.
|
2004-02-25 07:13:04 +01:00
|
|
|
//
|
2004-08-30 02:13:26 +02:00
|
|
|
assert(AM.Base.Reg == 0);
|
2004-05-23 23:23:12 +02:00
|
|
|
|
2004-08-30 02:13:26 +02:00
|
|
|
if (AllocaInst *AI = dyn_castFixedAlloca(GEPOps.back())) {
|
|
|
|
AM.BaseType = X86AddressMode::FrameIndexBase;
|
|
|
|
AM.Base.FrameIndex = getFixedSizedAllocaFI(AI);
|
2004-05-23 23:23:12 +02:00
|
|
|
GEPOps.pop_back();
|
|
|
|
return;
|
2004-08-30 02:13:26 +02:00
|
|
|
}
|
|
|
|
|
2004-10-15 07:05:29 +02:00
|
|
|
if (GlobalValue *GV = dyn_cast<GlobalValue>(GEPOps.back())) {
|
|
|
|
AM.GV = GV;
|
|
|
|
GEPOps.pop_back();
|
|
|
|
return;
|
2004-05-23 23:23:12 +02:00
|
|
|
}
|
|
|
|
|
2004-08-30 02:13:26 +02:00
|
|
|
AM.Base.Reg = MBB ? getReg(GEPOps[0], MBB, IP) : 1;
|
2004-02-25 07:13:04 +01:00
|
|
|
GEPOps.pop_back(); // Consume the last GEP operand
|
2004-02-25 04:45:50 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2004-02-25 07:13:04 +01:00
|
|
|
/// isGEPFoldable - Return true if the specified GEP can be completely
|
|
|
|
/// folded into the addressing mode of a load/store or lea instruction.
|
2004-09-21 20:21:21 +02:00
|
|
|
bool X86ISel::isGEPFoldable(MachineBasicBlock *MBB,
|
|
|
|
Value *Src, User::op_iterator IdxBegin,
|
|
|
|
User::op_iterator IdxEnd, X86AddressMode &AM) {
|
2004-02-25 07:13:04 +01:00
|
|
|
|
|
|
|
std::vector<Value*> GEPOps;
|
|
|
|
GEPOps.resize(IdxEnd-IdxBegin+1);
|
|
|
|
GEPOps[0] = Src;
|
|
|
|
std::copy(IdxBegin, IdxEnd, GEPOps.begin()+1);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-05-23 23:23:12 +02:00
|
|
|
std::vector<const Type*>
|
|
|
|
GEPTypes(gep_type_begin(Src->getType(), IdxBegin, IdxEnd),
|
|
|
|
gep_type_end(Src->getType(), IdxBegin, IdxEnd));
|
2004-02-25 07:13:04 +01:00
|
|
|
|
|
|
|
MachineBasicBlock::iterator IP;
|
|
|
|
if (MBB) IP = MBB->end();
|
2004-08-30 02:13:26 +02:00
|
|
|
getGEPIndex(MBB, IP, GEPOps, GEPTypes, AM);
|
2004-02-25 07:13:04 +01:00
|
|
|
|
|
|
|
// We can fold it away iff the getGEPIndex call eliminated all operands.
|
|
|
|
return GEPOps.empty();
|
|
|
|
}
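For context, a hedged sketch of how a load visitor might use this predicate;
the caller below, its name isGEPFoldable, and the exact operand counts are
our reconstruction, not code shown in this section:

    // Hypothetical caller: fold a GEP feeding a load into the load itself.
    X86AddressMode AM;
    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I.getOperand(0)))
      if (isGEPFoldable(BB, GEP->getOperand(0), GEP->op_begin()+1,
                        GEP->op_end(), AM)) {
        // One folded load instead of an address computation plus a load.
        addFullAddress(BuildMI(BB, X86::MOV32rm, 4, DestReg), AM);
        return;
      }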
|
|
|
|
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::emitGEPOperation(MachineBasicBlock *MBB,
|
|
|
|
MachineBasicBlock::iterator IP,
|
|
|
|
Value *Src, User::op_iterator IdxBegin,
|
|
|
|
User::op_iterator IdxEnd, unsigned TargetReg) {
|
2002-12-13 07:56:29 +01:00
|
|
|
const TargetData &TD = TM.getTargetData();
|
2004-02-22 18:35:42 +01:00
|
|
|
|
2004-07-15 02:58:53 +02:00
|
|
|
// If this is a getelementptr null, with all constant integer indices, just
|
|
|
|
// replace the whole thing with a single constant move (TargetReg = <offset>).
|
|
|
|
if (isa<ConstantPointerNull>(Src)) {
|
|
|
|
User::op_iterator I = IdxBegin;
|
|
|
|
for (; I != IdxEnd; ++I)
|
|
|
|
if (!isa<ConstantInt>(*I))
|
|
|
|
break;
|
|
|
|
if (I == IdxEnd) { // All constant indices
|
|
|
|
unsigned Offset = TD.getIndexedOffset(Src->getType(),
|
|
|
|
std::vector<Value*>(IdxBegin, IdxEnd));
|
|
|
|
BuildMI(*MBB, IP, X86::MOV32ri, 1, TargetReg).addImm(Offset);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
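// (Added illustration, not original code:) for example,
//   getelementptr [10 x int]* null, long 0, long 2
// has all-constant indices; with 4-byte ints, getIndexedOffset returns 8
// and the whole GEP collapses to a single 'MOV32ri TargetReg, 8'.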
|
|
|
|
|
2004-02-22 08:04:00 +01:00
|
|
|
std::vector<Value*> GEPOps;
|
|
|
|
GEPOps.resize(IdxEnd-IdxBegin+1);
|
|
|
|
GEPOps[0] = Src;
|
|
|
|
std::copy(IdxBegin, IdxEnd, GEPOps.begin()+1);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-02-22 08:04:00 +01:00
|
|
|
std::vector<const Type*> GEPTypes;
|
|
|
|
GEPTypes.assign(gep_type_begin(Src->getType(), IdxBegin, IdxEnd),
|
|
|
|
gep_type_end(Src->getType(), IdxBegin, IdxEnd));
|
|
|
|
|
|
|
|
// Keep emitting instructions until we consume the entire GEP instruction.
|
|
|
|
while (!GEPOps.empty()) {
|
|
|
|
unsigned OldSize = GEPOps.size();
|
2004-08-30 02:13:26 +02:00
|
|
|
X86AddressMode AM;
|
|
|
|
getGEPIndex(MBB, IP, GEPOps, GEPTypes, AM);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2004-02-25 04:45:50 +01:00
|
|
|
if (GEPOps.size() != OldSize) {
|
|
|
|
// getGEPIndex consumed some of the input. Build an LEA instruction here.
|
2004-02-25 07:13:04 +01:00
|
|
|
unsigned NextTarget = 0;
|
|
|
|
if (!GEPOps.empty()) {
|
2004-08-30 02:13:26 +02:00
|
|
|
assert(AM.Base.Reg == 0 &&
|
2004-02-25 07:13:04 +01:00
|
|
|
"getGEPIndex should have left the base register open for chaining!");
|
2004-08-30 02:13:26 +02:00
|
|
|
NextTarget = AM.Base.Reg = makeAnotherReg(Type::UIntTy);
|
2004-02-25 04:45:50 +01:00
|
|
|
}
|
2004-02-25 07:13:04 +01:00
|
|
|
|
2004-08-30 02:13:26 +02:00
|
|
|
if (AM.BaseType == X86AddressMode::RegBase &&
|
2004-10-15 07:05:29 +02:00
|
|
|
AM.IndexReg == 0 && AM.Disp == 0 && !AM.GV)
|
2004-08-30 02:13:26 +02:00
|
|
|
BuildMI(*MBB, IP, X86::MOV32rr, 1, TargetReg).addReg(AM.Base.Reg);
|
2004-10-15 07:05:29 +02:00
|
|
|
else if (AM.BaseType == X86AddressMode::RegBase && AM.Base.Reg == 0 &&
|
|
|
|
AM.IndexReg == 0 && AM.Disp == 0)
|
|
|
|
BuildMI(*MBB, IP, X86::MOV32ri, 1, TargetReg).addGlobalAddress(AM.GV);
|
2004-02-25 07:13:04 +01:00
|
|
|
else
|
2004-08-30 02:13:26 +02:00
|
|
|
addFullAddress(BuildMI(*MBB, IP, X86::LEA32r, 5, TargetReg), AM);
|
2004-02-25 07:13:04 +01:00
|
|
|
--IP;
|
|
|
|
TargetReg = NextTarget;
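// (Added note) Operands are consumed from the end of the GEP, so code is
// created back-to-front: IP is backed up one slot and TargetReg is
// redirected, letting the next loop iteration emit the code that defines
// this LEA's base register immediately before the LEA itself.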
|
2004-02-25 04:45:50 +01:00
|
|
|
} else if (GEPTypes.empty()) {
|
2004-02-22 08:04:00 +01:00
|
|
|
// The getGEPIndex operation didn't want to build an LEA. Check to see if
|
|
|
|
// all operands but the base pointer have been consumed. If so, just load it
|
|
|
|
// into the register.
|
2004-02-22 18:35:42 +01:00
|
|
|
if (GlobalValue *GV = dyn_cast<GlobalValue>(GEPOps[0])) {
|
A big X86 instruction rename. The instructions are renamed to make
their names more descriptive. A name consists of the base name, a
default operand size followed by a character per operand with an
optional special size. For example:
ADD8rr -> add, 8-bit register, 8-bit register
IMUL16rmi -> imul, 16-bit register, 16-bit memory, 16-bit immediate
IMUL16rmi8 -> imul, 16-bit register, 16-bit memory, 8-bit immediate
MOVSX32rm16 -> movsx, 32-bit register, 16-bit memory
llvm-svn: 11995
2004-02-29 09:50:03 +01:00
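Applying the same convention to the opcodes used in the surrounding code
(our reading of the names, following the examples above; the LEA32r
decoding in particular is an inference):
MOV32ri -> mov, 32-bit register, 32-bit immediate
MOV32rr -> mov, 32-bit register, 32-bit register
ADD32ri -> add, 32-bit register, 32-bit immediate
LEA32r  -> lea, 32-bit register (the memory address operands follow)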
|
|
|
BuildMI(*MBB, IP, X86::MOV32ri, 1, TargetReg).addGlobalAddress(GV);
|
2004-02-22 18:35:42 +01:00
|
|
|
} else {
|
|
|
|
unsigned BaseReg = getReg(GEPOps[0], MBB, IP);
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*MBB, IP, X86::MOV32rr, 1, TargetReg).addReg(BaseReg);
|
2004-02-22 18:35:42 +01:00
|
|
|
}
|
|
|
|
break; // we are now done
|
2004-02-25 07:13:04 +01:00
|
|
|
|
2004-02-22 08:04:00 +01:00
|
|
|
} else {
|
This checkin is brought to you by the brian gaeke allnighter fund.
(lib/Target/X86) InstSelectSimple.cpp:
Include llvm/DerivedTypes.h and iostream.
Refactor visitMul out into a wrapper around doMultiply(), so that we
can do multiplications on temporary values when we are doing
getelementptrs.
Refactor part of getReg out into makeAnotherReg, so that we can create
registers willy-nilly to hold temporary values, when we are doing
getelementptrs.
Add stub implementations of visitMallocInst and visitAllocaInst.
Add initial implementation of visitGetElementPtrInst.
In copyConstantToRegister:
We throw a *lot* of our asserts here. So, when we want to throw an
assert, print out to stderr whatever expr or whatever constant made
us barf.
Support copying ConstantPointerNull to register, using a move immediate
of zero.
Rename FLDr4 and FLDr8 to FLDr32 and FLDr64, so that they match the meanings
of the numbers in the other instruction names. All uses modified.
Teach visitCallInst to extract byte- and short-class return values
from subregs of EAX. Add a FIXME note about how we would do it for
float-class return values.
Add a FIXME note about how we would cast float to int and back.
X86InstrInfo.def:
Rename FLDr4 and FLDr8 to FLDr32 and FLDr64, so that they match the meanings
of the numbers in the other instruction names. All uses modified.
(tools/jello) GlobalVars.cpp:
Include iostream.
If we have to emit a floating-point constant to memory, gamble and use
the same method as for ints.
If we have to emit a ConstantPointerNull to memory, try using a "void *"
and "NULL".
Otherwise, if we are going to throw an assert, print out whatever constant
made us barf, first.
llvm-svn: 4973
2002-12-12 16:33:40 +01:00
|
|
|
// It's an array or pointer access: [ArraySize x ElementType].
|
2004-02-22 08:04:00 +01:00
|
|
|
const SequentialType *SqTy = cast<SequentialType>(GEPTypes.back());
|
|
|
|
Value *idx = GEPOps.back();
|
|
|
|
GEPOps.pop_back(); // Consume a GEP operand
|
|
|
|
GEPTypes.pop_back();
|
2002-12-16 20:32:50 +01:00
|
|
|
|
2004-04-05 03:30:19 +02:00
|
|
|
// Many GEP instructions use a [cast (int/uint) to LongTy] as their
|
2003-06-21 18:01:24 +02:00
|
|
|
// operand on X86. Handle this case directly now...
|
|
|
|
if (CastInst *CI = dyn_cast<CastInst>(idx))
|
|
|
|
if (CI->getOperand(0)->getType() == Type::IntTy ||
|
|
|
|
CI->getOperand(0)->getType() == Type::UIntTy)
|
|
|
|
idx = CI->getOperand(0);
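// (Added note) GEP array indices are LongTy at the IR level, but on
// 32-bit x86 an index that was just cast up from int/uint can be used
// directly, avoiding materializing a 64-bit index value.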
|
|
|
|
|
2003-01-13 01:32:26 +01:00
|
|
|
// We want to add BaseReg to (idxReg * sizeof ElementType). First, we
|
2002-12-16 20:32:50 +01:00
|
|
|
// must find the size of the pointed-to type (Not coincidentally, the next
|
|
|
|
// type is the type of the elements in the array).
|
2004-02-22 08:04:00 +01:00
|
|
|
const Type *ElTy = SqTy->getElementType();
|
|
|
|
unsigned elementSize = TD.getTypeSize(ElTy);
|
2002-12-16 20:32:50 +01:00
|
|
|
|
|
|
|
// If idxReg is a constant, we don't need to perform the multiply!
|
2004-04-05 03:30:19 +02:00
|
|
|
if (ConstantInt *CSI = dyn_cast<ConstantInt>(idx)) {
|
2003-01-13 01:32:26 +01:00
|
|
|
if (!CSI->isNullValue()) {
|
2004-04-05 03:30:19 +02:00
|
|
|
unsigned Offset = elementSize*CSI->getRawValue();
|
2004-02-22 08:04:00 +01:00
|
|
|
unsigned Reg = makeAnotherReg(Type::UIntTy);
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*MBB, IP, X86::ADD32ri, 2, TargetReg)
|
2004-02-29 08:22:16 +01:00
|
|
|
.addReg(Reg).addImm(Offset);
|
2004-02-22 08:04:00 +01:00
|
|
|
--IP; // Insert the next instruction before this one.
|
|
|
|
TargetReg = Reg; // Codegen the rest of the GEP into this
|
2002-12-16 20:32:50 +01:00
|
|
|
}
|
|
|
|
} else if (elementSize == 1) {
|
|
|
|
// If the element size is 1, we don't have to multiply, just add
|
|
|
|
unsigned idxReg = getReg(idx, MBB, IP);
|
2004-02-22 08:04:00 +01:00
|
|
|
unsigned Reg = makeAnotherReg(Type::UIntTy);
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*MBB, IP, X86::ADD32rr, 2,TargetReg).addReg(Reg).addReg(idxReg);
|
2004-02-22 08:04:00 +01:00
|
|
|
--IP; // Insert the next instruction before this one.
|
|
|
|
TargetReg = Reg; // Codegen the rest of the GEP into this
|
2002-12-16 20:32:50 +01:00
|
|
|
} else {
|
|
|
|
unsigned idxReg = getReg(idx, MBB, IP);
|
|
|
|
unsigned OffsetReg = makeAnotherReg(Type::UIntTy);
|
2003-10-19 23:09:10 +02:00
|
|
|
|
2004-02-22 08:04:00 +01:00
|
|
|
// Make sure we can back the iterator up to point to the first
|
|
|
|
// instruction emitted.
|
|
|
|
MachineBasicBlock::iterator BeforeIt = IP;
|
|
|
|
if (IP == MBB->begin())
|
|
|
|
BeforeIt = MBB->end();
|
|
|
|
else
|
|
|
|
--BeforeIt;
|
2003-10-19 23:09:10 +02:00
|
|
|
doMultiplyConst(MBB, IP, OffsetReg, Type::IntTy, idxReg, elementSize);
|
|
|
|
|
2002-12-16 20:32:50 +01:00
|
|
|
// Emit an ADD to add OffsetReg to the basePtr.
|
2004-02-22 08:04:00 +01:00
|
|
|
unsigned Reg = makeAnotherReg(Type::UIntTy);
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(*MBB, IP, X86::ADD32rr, 2, TargetReg)
|
2004-02-29 08:22:16 +01:00
|
|
|
.addReg(Reg).addReg(OffsetReg);
|
2004-02-22 08:04:00 +01:00
|
|
|
|
|
|
|
// Step to the first instruction of the multiply.
|
|
|
|
if (BeforeIt == MBB->end())
|
|
|
|
IP = MBB->begin();
|
|
|
|
else
|
|
|
|
IP = ++BeforeIt;
|
|
|
|
|
|
|
|
TargetReg = Reg; // Codegen the rest of the GEP into this
|
2002-12-16 20:32:50 +01:00
|
|
|
}
|
2002-12-12 16:33:40 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
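As a worked illustration of the variable-index path above -- our example,
with assumed virtual-register names, not output taken from the original log:

    ; for: %P2 = getelementptr int* %P, long %i      (sizeof(int) == 4)
    shl %offset, %i_low, 2         ; doMultiplyConst scales the index by 4
    add %target, %base, %offset    ; OffsetReg added to the base pointer

assuming doMultiplyConst strength-reduces the power-of-two multiply to a
shift; %base is produced by the code emitted for the remaining GEP operands.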
|
|
|
|
|
2002-12-28 21:24:02 +01:00
|
|
|
/// visitAllocaInst - If this is a fixed size alloca, allocate space from the
|
|
|
|
/// frame manager, otherwise do it the hard way.
|
|
|
|
///
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::visitAllocaInst(AllocaInst &I) {
|
Second half of my fixed-sized-alloca patch. This folds the LEA to compute
the alloca address into common operations like loads/stores.
In a simple testcase like this (which is just designed to exercise the
alloca A, nothing more):
int %test(int %X, bool %C) {
%A = alloca int
store int %X, int* %A
store int* %A, int** %G
br bool %C, label %T, label %F
T:
call int %test(int 1, bool false)
%V = load int* %A
ret int %V
F:
call int %test(int 123, bool true)
%V2 = load int* %A
ret int %V2
}
We now generate:
test:
sub %ESP, 12
mov %EAX, DWORD PTR [%ESP + 16]
mov %CL, BYTE PTR [%ESP + 20]
*** mov DWORD PTR [%ESP + 8], %EAX
mov %EAX, OFFSET G
lea %EDX, DWORD PTR [%ESP + 8]
mov DWORD PTR [%EAX], %EDX
test %CL, %CL
je .LBB2 # PC rel: F
.LBB1: # T
mov DWORD PTR [%ESP], 1
mov DWORD PTR [%ESP + 4], 0
call test
*** mov %EAX, DWORD PTR [%ESP + 8]
add %ESP, 12
ret
.LBB2: # F
mov DWORD PTR [%ESP], 123
mov DWORD PTR [%ESP + 4], 1
call test
*** mov %EAX, DWORD PTR [%ESP + 8]
add %ESP, 12
ret
Instead of:
test:
sub %ESP, 20
mov %EAX, DWORD PTR [%ESP + 24]
mov %CL, BYTE PTR [%ESP + 28]
*** lea %EDX, DWORD PTR [%ESP + 16]
*** mov DWORD PTR [%EDX], %EAX
mov %EAX, OFFSET G
mov DWORD PTR [%EAX], %EDX
test %CL, %CL
*** mov DWORD PTR [%ESP + 12], %EDX
je .LBB2 # PC rel: F
.LBB1: # T
mov DWORD PTR [%ESP], 1
mov %EAX, 0
mov DWORD PTR [%ESP + 4], %EAX
call test
*** mov %EAX, DWORD PTR [%ESP + 12]
*** mov %EAX, DWORD PTR [%EAX]
add %ESP, 20
ret
.LBB2: # F
mov DWORD PTR [%ESP], 123
mov %EAX, 1
mov DWORD PTR [%ESP + 4], %EAX
call test
*** mov %EAX, DWORD PTR [%ESP + 12]
*** mov %EAX, DWORD PTR [%EAX]
add %ESP, 20
ret
llvm-svn: 13557
2004-05-13 17:12:43 +02:00
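In essence (our paraphrase of the patch): once a fixed-size alloca has a
known frame slot, its address never needs to live in a register, so any
access through it can use frame-relative addressing directly:

    ; before folding                    ; after folding
    lea %EDX, DWORD PTR [%ESP + 8]
    mov DWORD PTR [%EDX], %EAX          mov DWORD PTR [%ESP + 8], %EAX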
|
|
|
// If this is a fixed size alloca in the entry block for the function, we
|
|
|
|
// statically stack allocate the space, so we don't need to do anything here.
|
|
|
|
//
|
2004-05-13 09:40:27 +02:00
|
|
|
if (dyn_castFixedAlloca(&I)) return;
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2002-12-28 21:24:02 +01:00
|
|
|
// Find the data size of the alloca inst's getAllocatedType.
|
|
|
|
const Type *Ty = I.getAllocatedType();
|
|
|
|
unsigned TySize = TM.getTargetData().getTypeSize(Ty);
|
2002-12-13 07:46:31 +01:00
|
|
|
|
2002-12-28 21:24:02 +01:00
|
|
|
// Create a register to hold the temporary result of multiplying the type size
|
|
|
|
// constant by the variable amount.
|
|
|
|
unsigned TotalSizeReg = makeAnotherReg(Type::UIntTy);
|
|
|
|
unsigned SrcReg1 = getReg(I.getArraySize());
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2002-12-28 21:24:02 +01:00
|
|
|
// TotalSizeReg = mul <numelements>, <TypeSize>
|
|
|
|
MachineBasicBlock::iterator MBBI = BB->end();
|
2003-10-19 23:09:10 +02:00
|
|
|
doMultiplyConst(BB, MBBI, TotalSizeReg, Type::UIntTy, SrcReg1, TySize);
|
2002-12-12 16:33:40 +01:00
|
|
|
|
2002-12-28 21:24:02 +01:00
|
|
|
// AddedSize = add <TotalSizeReg>, 15
|
|
|
|
unsigned AddedSizeReg = makeAnotherReg(Type::UIntTy);
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(BB, X86::ADD32ri, 2, AddedSizeReg).addReg(TotalSizeReg).addImm(15);
|
2002-12-12 16:33:40 +01:00
|
|
|
|
2002-12-28 21:24:02 +01:00
|
|
|
// AlignedSize = and <AddedSize>, ~15
|
|
|
|
unsigned AlignedSize = makeAnotherReg(Type::UIntTy);
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(BB, X86::AND32ri, 2, AlignedSize).addReg(AddedSizeReg).addImm(~15);
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2002-12-13 07:46:31 +01:00
|
|
|
// Subtract size from stack pointer, thereby allocating some space.
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(BB, X86::SUB32rr, 2, X86::ESP).addReg(X86::ESP).addReg(AlignedSize);
|
2002-12-28 21:24:02 +01:00
|
|
|
|
2002-12-13 07:46:31 +01:00
|
|
|
// Put a pointer to the space into the result register, by copying
|
|
|
|
// the stack pointer.
|
2004-02-29 09:50:03 +01:00
|
|
|
BuildMI(BB, X86::MOV32rr, 1, getReg(I)).addReg(X86::ESP);
|
2002-12-28 21:24:02 +01:00
|
|
|
|
2003-05-03 04:18:17 +02:00
|
|
|
// Inform the Frame Information that we have just allocated a variable-sized
|
2002-12-28 21:24:02 +01:00
|
|
|
// object.
|
|
|
|
F->getFrameInfo()->CreateVariableSizedObject();
|
2002-12-12 16:33:40 +01:00
|
|
|
}
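A worked instance of the round-up above (our numbers): for
'alloca int, uint %n' with %n = 5 at run time, TySize is 4, so:

    TotalSize   = 5 * 4    = 20
    AddedSize   = 20 + 15  = 35
    AlignedSize = 35 & ~15 = 32    ; next multiple of 16

and ESP is decremented by 32, keeping the stack pointer 16-byte aligned.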
|
2003-01-13 01:32:26 +01:00
|
|
|
|
|
|
|
/// visitMallocInst - Malloc instructions are code generated into direct calls
|
|
|
|
/// to the library malloc.
|
|
|
|
///
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::visitMallocInst(MallocInst &I) {
|
2003-01-13 01:32:26 +01:00
|
|
|
unsigned AllocSize = TM.getTargetData().getTypeSize(I.getAllocatedType());
|
|
|
|
unsigned Arg;
|
|
|
|
|
|
|
|
if (ConstantUInt *C = dyn_cast<ConstantUInt>(I.getOperand(0))) {
|
|
|
|
Arg = getReg(ConstantUInt::get(Type::UIntTy, C->getValue() * AllocSize));
|
|
|
|
} else {
|
|
|
|
Arg = makeAnotherReg(Type::UIntTy);
|
2003-10-19 23:09:10 +02:00
|
|
|
unsigned Op0Reg = getReg(I.getOperand(0));
|
2003-01-13 01:32:26 +01:00
|
|
|
MachineBasicBlock::iterator MBBI = BB->end();
|
2003-10-19 23:09:10 +02:00
|
|
|
doMultiplyConst(BB, MBBI, Arg, Type::UIntTy, Op0Reg, AllocSize);
|
2003-01-13 01:32:26 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<ValueRecord> Args;
|
|
|
|
Args.push_back(ValueRecord(Arg, Type::UIntTy));
|
|
|
|
MachineInstr *TheCall = BuildMI(X86::CALLpcrel32,
|
2003-10-23 18:22:08 +02:00
|
|
|
1).addExternalSymbol("malloc", true);
|
2003-01-13 01:32:26 +01:00
|
|
|
doCall(ValueRecord(getReg(I), I.getType()), TheCall, Args);
|
|
|
|
}
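A hedged sketch of the sequence this produces for '%P = malloc int, uint %n';
the register names are placeholders, and the stack-argument store follows the
pattern doCall produces in the listings earlier in this log:

    shl %Arg, %n_reg, 2            ; doMultiplyConst: n * sizeof(int)
    mov DWORD PTR [%ESP], %Arg     ; byte count becomes malloc's argument
    call malloc                    ; CALLpcrel32 to the external symbol
    mov %P_reg, %EAX               ; integer/pointer results come back in EAX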
|
|
|
|
|
|
|
|
|
|
|
|
/// visitFreeInst - Free instructions are code gen'd to call the free libc
|
|
|
|
/// function.
|
|
|
|
///
|
2004-09-21 20:21:21 +02:00
|
|
|
void X86ISel::visitFreeInst(FreeInst &I) {
|
2003-01-13 01:32:26 +01:00
|
|
|
std::vector<ValueRecord> Args;
|
2003-08-04 04:12:48 +02:00
|
|
|
Args.push_back(ValueRecord(I.getOperand(0)));
|
2003-01-13 01:32:26 +01:00
|
|
|
MachineInstr *TheCall = BuildMI(X86::CALLpcrel32,
|
2003-10-23 18:22:08 +02:00
|
|
|
1).addExternalSymbol("free", true);
|
2003-01-13 01:32:26 +01:00
|
|
|
doCall(ValueRecord(0, Type::VoidTy), TheCall, Args);
|
|
|
|
}
|
2005-04-22 01:38:14 +02:00
|
|
|
|
2003-07-27 01:49:58 +02:00
|
|
|
/// createX86SimpleInstructionSelector - This pass converts an LLVM function
|
2002-10-29 23:37:54 +01:00
|
|
|
/// into a machine code representation in a very simple peep-hole fashion. The
|
2002-10-26 00:55:53 +02:00
|
|
|
/// generated code sucks but the implementation is nice and simple.
|
|
|
|
///
|
2003-12-28 22:23:38 +01:00
|
|
|
FunctionPass *llvm::createX86SimpleInstructionSelector(TargetMachine &TM) {
|
2004-09-21 20:21:21 +02:00
|
|
|
return new X86ISel(TM);
|
2002-10-26 00:55:53 +02:00
|
|
|
}
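A minimal registration sketch (the pass-manager variable PM is hypothetical;
this mirrors how a target machine would wire the selector into its pipeline):

    // Lower each LLVM Function to x86 MachineInstrs before register
    // allocation runs.
    PM.add(createX86SimpleInstructionSelector(TM));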
|