1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-19 02:52:53 +02:00
llvm-mirror/lib/Target/AArch64/AArch64InstructionSelector.cpp

4500 lines
160 KiB
C++
Raw Normal View History

//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AArch64.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "aarch64-isel"
using namespace llvm;
namespace {
#define GET_GLOBALISEL_PREDICATE_BITSET
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATE_BITSET
/// AArch64 implementation of the GlobalISel InstructionSelector.
///
/// Selection is split between the tblgen-erated selectImpl() and the
/// hand-written helpers declared below, which cover operations that need
/// custom C++.
class AArch64InstructionSelector : public InstructionSelector {
public:
  AArch64InstructionSelector(const AArch64TargetMachine &TM,
                             const AArch64Subtarget &STI,
                             const AArch64RegisterBankInfo &RBI);

  bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override;

  /// Name of this selector; the same string as DEBUG_TYPE.
  static const char *getName() { return DEBUG_TYPE; }

private:
  /// tblgen-erated 'select' implementation, used as the initial selector for
  /// the patterns that don't require complex C++.
  bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;

  // A lowering phase that runs before any selection attempts.
  void preISelLower(MachineInstr &I) const;

  // An early selection function that runs before the selectImpl() call.
  bool earlySelect(MachineInstr &I) const;
  bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool earlySelectLoad(MachineInstr &I, MachineRegisterInfo &MRI) const;

  /// Eliminate same-sized cross-bank copies into stores before selectImpl().
  void contractCrossBankCopyIntoStore(MachineInstr &I,
                                      MachineRegisterInfo &MRI) const;

  bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
                          MachineRegisterInfo &MRI) const;
  bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI) const;
  bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI) const;

  bool selectVectorASHR(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI) const;

  // Helper to generate an equivalent of scalar_to_vector into a new register.
  // The emitted instruction is returned (presumably nullptr on failure --
  // confirm against the definition).
  MachineInstr *emitScalarToVector(unsigned EltSize,
                                   const TargetRegisterClass *DstRC,
                                   Register Scalar,
                                   MachineIRBuilder &MIRBuilder) const;

  /// Emit a lane insert into \p DstReg, or a new vector register if None is
  /// provided.
  ///
  /// The lane inserted into is defined by \p LaneIdx. The vector source
  /// register is given by \p SrcReg. The register containing the element is
  /// given by \p EltReg.
  MachineInstr *emitLaneInsert(Optional<Register> DstReg, Register SrcReg,
                               Register EltReg, unsigned LaneIdx,
                               const RegisterBank &RB,
                               MachineIRBuilder &MIRBuilder) const;
  bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;

  void collectShuffleMaskIndices(MachineInstr &I, MachineRegisterInfo &MRI,
                                 SmallVectorImpl<Optional<int>> &Idxs) const;
  bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectSplitVectorUnmerge(MachineInstr &I,
                                MachineRegisterInfo &MRI) const;
  bool selectIntrinsicWithSideEffects(MachineInstr &I,
                                      MachineRegisterInfo &MRI) const;
  bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const;

  unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const;
  MachineInstr *emitLoadFromConstantPool(Constant *CPVal,
                                         MachineIRBuilder &MIRBuilder) const;

  // Emit a vector concat operation.
  MachineInstr *emitVectorConcat(Optional<Register> Dst, Register Op1,
                                 Register Op2,
                                 MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                   MachineOperand &Predicate,
                                   MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
                        MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitTST(const Register &LHS, const Register &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitExtractVectorElt(Optional<Register> DstReg,
                                     const RegisterBank &DstRB, LLT ScalarTy,
                                     Register VecReg, unsigned LaneIdx,
                                     MachineIRBuilder &MIRBuilder) const;

  /// Helper function for selecting G_FCONSTANT. If the G_FCONSTANT can be
  /// materialized using a FMOV instruction, then update MI and return it.
  /// Otherwise, do nothing and return a nullptr.
  MachineInstr *emitFMovForFConstant(MachineInstr &MI,
                                     MachineRegisterInfo &MRI) const;

  /// Emit a CSet for a compare.
  MachineInstr *emitCSetForICMP(Register DefReg, unsigned Pred,
                                MachineIRBuilder &MIRBuilder) const;

  // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
  // We use these manually instead of using the importer since it doesn't
  // support SDNodeXForm.
  ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;

  ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
  ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
  ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;

  ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
                                            unsigned Size) const;

  // Fixed-width wrappers around selectAddrModeUnscaled(). The width in the
  // name is in bits; the Size argument forwarded is that width divided by 8.
  ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 1);
  }
  ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 2);
  }
  ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 4);
  }
  ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 8);
  }
  ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 16);
  }

  ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
                                           unsigned Size) const;
  // \p Width is given in bits and converted to bytes before forwarding.
  template <int Width>
  ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
    return selectAddrModeIndexed(Root, Width / 8);
  }

  bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
                                     const MachineRegisterInfo &MRI) const;
  ComplexRendererFns
  selectAddrModeShiftedExtendXReg(MachineOperand &Root,
                                  unsigned SizeInBytes) const;
  ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
                                       unsigned SizeInBytes) const;

  void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const;

  // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
  void materializeLargeCMVal(MachineInstr &I, const Value *V,
                             unsigned OpFlags) const;

  // Optimization methods.
  bool tryOptVectorShuffle(MachineInstr &I) const;
  bool tryOptVectorDup(MachineInstr &MI) const;
  bool tryOptSelect(MachineInstr &MI) const;
  MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                      MachineOperand &Predicate,
                                      MachineIRBuilder &MIRBuilder) const;

  // Target state captured at construction time. Declaration order matters:
  // it is the order in which the constructor's initializer list runs.
  const AArch64TargetMachine &TM;
  const AArch64Subtarget &STI;
  const AArch64InstrInfo &TII;
  const AArch64RegisterInfo &TRI;
  const AArch64RegisterBankInfo &RBI;

#define GET_GLOBALISEL_PREDICATES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_DECL

// We declare the temporaries used by selectImpl() in the class to minimize the
// cost of constructing placeholder values.
#define GET_GLOBALISEL_TEMPORARIES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_DECL
};
} // end anonymous namespace
#define GET_GLOBALISEL_IMPL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL
/// Construct the selector for the given target machine \p TM, subtarget
/// \p STI and register bank info \p RBI. The instruction info and register
/// info references are taken from \p STI.
AArch64InstructionSelector::AArch64InstructionSelector(
const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
const AArch64RegisterBankInfo &RBI)
: InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
TRI(*STI.getRegisterInfo()), RBI(RBI),
// The remaining member initializers (predicate state, then selectImpl()
// temporaries) are supplied by the tblgen-erated include below.
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}
// FIXME: This should be target-independent, inferred from the types declared
// for each class in the bank.
/// Map a type \p Ty living on register bank \p RB to a register class.
///
/// On the GPR bank, anything up to 32 bits maps to a 32-bit class and exactly
/// 64 bits maps to a 64-bit class; \p GetAllRegSet picks the "all" variant of
/// the class. On the FPR bank, anything up to 16 bits maps to FPR16, and
/// 32/64/128 bits map to the class of that width.
///
/// \returns nullptr for any other bank or size. \p RBI is currently unused.
static const TargetRegisterClass *
getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
                         const RegisterBankInfo &RBI,
                         bool GetAllRegSet = false) {
  switch (RB.getID()) {
  case AArch64::GPRRegBankID: {
    const unsigned Bits = Ty.getSizeInBits();
    if (Bits <= 32)
      return GetAllRegSet ? &AArch64::GPR32allRegClass
                          : &AArch64::GPR32RegClass;
    if (Bits == 64)
      return GetAllRegSet ? &AArch64::GPR64allRegClass
                          : &AArch64::GPR64RegClass;
    return nullptr;
  }
  case AArch64::FPRRegBankID: {
    const unsigned Bits = Ty.getSizeInBits();
    if (Bits <= 16)
      return &AArch64::FPR16RegClass;
    switch (Bits) {
    case 32:
      return &AArch64::FPR32RegClass;
    case 64:
      return &AArch64::FPR64RegClass;
    case 128:
      return &AArch64::FPR128RegClass;
    default:
      return nullptr;
    }
  }
  default:
    return nullptr;
  }
}
/// Return the smallest register class on bank \p RB that can hold a value of
/// \p SizeInBits bits.
///
/// GPR values of up to 32 bits use a 32-bit class and 64-bit values use a
/// 64-bit class (the "all" variants when \p GetAllRegSet is set). FPR values
/// must be exactly 8, 16, 32, 64 or 128 bits wide.
///
/// \returns nullptr when no class matches the bank/size combination.
static const TargetRegisterClass *
getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits,
                      bool GetAllRegSet = false) {
  switch (RB.getID()) {
  case AArch64::GPRRegBankID:
    if (SizeInBits <= 32)
      return GetAllRegSet ? &AArch64::GPR32allRegClass
                          : &AArch64::GPR32RegClass;
    if (SizeInBits == 64)
      return GetAllRegSet ? &AArch64::GPR64allRegClass
                          : &AArch64::GPR64RegClass;
    return nullptr;
  case AArch64::FPRRegBankID:
    if (SizeInBits == 8)
      return &AArch64::FPR8RegClass;
    if (SizeInBits == 16)
      return &AArch64::FPR16RegClass;
    if (SizeInBits == 32)
      return &AArch64::FPR32RegClass;
    if (SizeInBits == 64)
      return &AArch64::FPR64RegClass;
    if (SizeInBits == 128)
      return &AArch64::FPR128RegClass;
    return nullptr;
  default:
    return nullptr;
  }
}
/// Find the subregister index that selects a register of class \p RC out of a
/// wider register.
///
/// The index is chosen from the size of \p RC: 8 -> bsub, 16 -> hsub,
/// 64 -> dsub. For 32 bits the index depends on the kind of register: GPR
/// classes use sub_32, everything else uses ssub.
///
/// \param [out] SubReg receives the subregister index on success and is left
/// untouched on failure.
/// \returns true on success, false if \p RC has an unsupported size.
static bool getSubRegForClass(const TargetRegisterClass *RC,
                              const TargetRegisterInfo &TRI, unsigned &SubReg) {
  switch (TRI.getRegSizeInBits(*RC)) {
  case 8:
    SubReg = AArch64::bsub;
    break;
  case 16:
    SubReg = AArch64::hsub;
    break;
  case 32:
    // Callers may hand us any 32-bit GPR class, not just GPR32 (selectCopy()
    // asks for the "all" register set and so passes GPR32all). Treat every
    // class contained in GPR32all as a GPR so it does not get the FPR index.
    if (AArch64::GPR32allRegClass.hasSubClassEq(RC))
      SubReg = AArch64::sub_32;
    else
      SubReg = AArch64::ssub;
    break;
  case 64:
    SubReg = AArch64::dsub;
    break;
  default:
    LLVM_DEBUG(
        dbgs() << "Couldn't find appropriate subregister for register class.");
    return false;
  }
  return true;
}
/// Check whether \p I is a currently unsupported binary operation:
/// - it has an unsized type
/// - an operand is not a vreg
/// - all operands are not in the same bank
/// These are checks that should someday live in the verifier, but right now,
/// these are mostly limitations of the aarch64 selector.
static bool unsupportedBinOp(const MachineInstr &I,
const AArch64RegisterBankInfo &RBI,
const MachineRegisterInfo &MRI,
const AArch64RegisterInfo &TRI) {
LLT Ty = MRI.getType(I.getOperand(0).getReg());
if (!Ty.isValid()) {
LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
return true;
}
const RegisterBank *PrevOpBank = nullptr;
for (auto &MO : I.operands()) {
// FIXME: Support non-register operands.
if (!MO.isReg()) {
LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
return true;
}
// FIXME: Can generic operations have physical registers operands? If
// so, this will need to be taught about that, and we'll need to get the
// bank out of the minimal class for the register.
// Either way, this needs to be documented (and possibly verified).
if (!Register::isVirtualRegister(MO.getReg())) {
LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
return true;
}
const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
if (!OpBank) {
LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
return true;
}
if (PrevOpBank && OpBank != PrevOpBank) {
LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
return true;
}
PrevOpBank = OpBank;
}
return false;
}
/// Select the AArch64 opcode for the basic binary operation \p GenericOpc
/// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
/// and of size \p OpSize.
/// \returns \p GenericOpc if the combination is unsupported.
static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
                               unsigned OpSize) {
  if (RegBankID == AArch64::GPRRegBankID) {
    // Only 32- and 64-bit scalar operations are handled on the GPR bank.
    const bool Is32 = OpSize == 32;
    if (!Is32 && OpSize != 64)
      return GenericOpc;

    switch (GenericOpc) {
    case TargetOpcode::G_GEP:
      // Pointer arithmetic is only selected for the 64-bit case.
      return Is32 ? GenericOpc : AArch64::ADDXrr;
    case TargetOpcode::G_SHL:
      return Is32 ? AArch64::LSLVWr : AArch64::LSLVXr;
    case TargetOpcode::G_LSHR:
      return Is32 ? AArch64::LSRVWr : AArch64::LSRVXr;
    case TargetOpcode::G_ASHR:
      return Is32 ? AArch64::ASRVWr : AArch64::ASRVXr;
    default:
      return GenericOpc;
    }
  }

  if (RegBankID == AArch64::FPRRegBankID) {
    // Only 32- and 64-bit operations are handled on the FPR bank.
    const bool Is64 = OpSize == 64;
    if (!Is64 && OpSize != 32)
      return GenericOpc;

    switch (GenericOpc) {
    case TargetOpcode::G_FADD:
      return Is64 ? AArch64::FADDDrr : AArch64::FADDSrr;
    case TargetOpcode::G_FSUB:
      return Is64 ? AArch64::FSUBDrr : AArch64::FSUBSrr;
    case TargetOpcode::G_FMUL:
      return Is64 ? AArch64::FMULDrr : AArch64::FMULSrr;
    case TargetOpcode::G_FDIV:
      return Is64 ? AArch64::FDIVDrr : AArch64::FDIVSrr;
    case TargetOpcode::G_OR:
      // A bitwise OR on the FPR bank is only available for 64-bit values.
      return Is64 ? AArch64::ORRv8i8 : GenericOpc;
    default:
      return GenericOpc;
    }
  }

  // Any other bank is unsupported.
  return GenericOpc;
}
/// Pick the AArch64 load/store opcode using the base+unsigned-immediate
/// addressing mode (e.g., LDRXui) for a G_LOAD or G_STORE.
///
/// \p GenericOpc is treated as a store if it is G_STORE and as a load
/// otherwise. \p RegBankID is the bank of the value being loaded/stored and
/// \p OpSize is the size of the memory access in bits (8, 16, 32 or 64).
///
/// \returns \p GenericOpc if the combination is unsupported.
static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
                                    unsigned OpSize) {
  const bool IsStore = GenericOpc == TargetOpcode::G_STORE;
  // Choose between the load and the store form of an instruction pair.
  auto Pick = [IsStore](unsigned LoadOpc, unsigned StoreOpc) {
    return IsStore ? StoreOpc : LoadOpc;
  };

  if (RegBankID == AArch64::GPRRegBankID) {
    switch (OpSize) {
    case 8:
      return Pick(AArch64::LDRBBui, AArch64::STRBBui);
    case 16:
      return Pick(AArch64::LDRHHui, AArch64::STRHHui);
    case 32:
      return Pick(AArch64::LDRWui, AArch64::STRWui);
    case 64:
      return Pick(AArch64::LDRXui, AArch64::STRXui);
    default:
      return GenericOpc;
    }
  }

  if (RegBankID == AArch64::FPRRegBankID) {
    switch (OpSize) {
    case 8:
      return Pick(AArch64::LDRBui, AArch64::STRBui);
    case 16:
      return Pick(AArch64::LDRHui, AArch64::STRHui);
    case 32:
      return Pick(AArch64::LDRSui, AArch64::STRSui);
    case 64:
      return Pick(AArch64::LDRDui, AArch64::STRDui);
    default:
      return GenericOpc;
    }
  }

  return GenericOpc;
}
#ifndef NDEBUG
/// Helper function that verifies that we have a valid copy at the end of
/// selectCopy. Verifies that the source and dest have the expected sizes and
/// then returns true.
static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) {
const unsigned DstReg = I.getOperand(0).getReg();
const unsigned SrcReg = I.getOperand(1).getReg();
const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
// Make sure the size of the source and dest line up.
assert(
(DstSize == SrcSize ||
// Copies are a mean to setup initial types, the number of
// bits may not exactly match.
(Register::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) ||
// Copies are a mean to copy bits around, as long as we are
// on the same register class, that's fine. Otherwise, that
// means we need some SUBREG_TO_REG or AND & co.
(((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) &&
"Copy with different width?!");
// Check the size of the destination.
assert((DstSize <= 64 || DstBank.getID() == AArch64::FPRRegBankID) &&
"GPRs cannot get more than 64-bit width values");
return true;
}
#endif
/// Helper for selectCopy. Rewrites the copy \p I so that its source goes
/// through a subregister copy from class \p From to class \p To.
///
/// Given I = "Dst = COPY SrcReg", the result is
///
///   CopyReg (From class) = COPY SrcReg
///   SubRegCopy (To class) = COPY CopyReg:SubReg
///   Dst = COPY SubRegCopy
///
/// The two new copies are built with a MachineIRBuilder positioned at \p I.
/// If the destination of \p I is not a physical register it is additionally
/// constrained to \p To.
///
/// \returns true always.
static bool selectSubregisterCopy(MachineInstr &I, MachineRegisterInfo &MRI,
                                  const RegisterBankInfo &RBI, unsigned SrcReg,
                                  const TargetRegisterClass *From,
                                  const TargetRegisterClass *To,
                                  unsigned SubReg) {
  MachineIRBuilder Builder(I);

  // CopyReg (From class) = COPY SrcReg
  auto FullCopy = Builder.buildCopy({From}, {SrcReg});

  // SubRegCopy (To class) = COPY CopyReg:SubReg
  auto NarrowCopy = Builder.buildInstr(TargetOpcode::COPY, {To}, {});
  NarrowCopy.addReg(FullCopy.getReg(0), 0, SubReg);

  // Make the original copy read the narrowed value.
  I.getOperand(1).setReg(NarrowCopy.getReg(0));

  // It's possible that the destination register won't be constrained. Make
  // sure that happens.
  const Register DstReg = I.getOperand(0).getReg();
  if (!Register::isPhysicalRegister(DstReg))
    RBI.constrainGenericRegister(DstReg, *To, MRI);

  return true;
}
/// Work out which register classes the two sides of the copy \p I should use.
///
/// \returns a pair {source class, destination class}. Either entry is nullptr
/// when no class exists for that register's bank and size. \p TII is unused.
static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
                     MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
                     const RegisterBankInfo &RBI) {
  const Register Dst = I.getOperand(0).getReg();
  const Register Src = I.getOperand(1).getReg();
  const RegisterBank &DstBank = *RBI.getRegBank(Dst, MRI, TRI);
  const RegisterBank &SrcBank = *RBI.getRegBank(Src, MRI, TRI);
  unsigned DstBits = RBI.getSizeInBits(Dst, MRI, TRI);
  unsigned SrcBits = RBI.getSizeInBits(Src, MRI, TRI);

  // Special casing for cross-bank copies of s1s. We can technically represent
  // a 1-bit value with any size of register. The minimum size for a GPR is 32
  // bits. So, we need to put the FPR on 32 bits as well.
  //
  // FIXME: I'm not sure if this case holds true outside of copies. If it does,
  // then we can pull it into the helpers that get the appropriate class for a
  // register bank. Or make a new helper that carries along some constraint
  // information.
  const bool IsCrossBankS1 =
      SrcBank != DstBank && (DstBits == 1 && SrcBits == 1);
  if (IsCrossBankS1) {
    SrcBits = 32;
    DstBits = 32;
  }

  const TargetRegisterClass *SrcRC =
      getMinClassForRegBank(SrcBank, SrcBits, true);
  const TargetRegisterClass *DstRC =
      getMinClassForRegBank(DstBank, DstBits, true);
  return {SrcRC, DstRC};
}
/// Select the copy-like instruction \p I: constrain its destination to a
/// concrete register class and turn it into a target COPY.
///
/// For a real COPY that crosses register banks with differently sized
/// classes, the source is first rewritten: a larger source goes through a
/// subregister copy (see selectSubregisterCopy), and the FPR16 -> GPR32 case
/// goes through a SUBREG_TO_REG into an FPR32 register. A COPY into a physical
/// register is left as it is after that fix-up.
///
/// \returns false if a register class or subregister index cannot be
/// determined, or if constraining the destination fails; true otherwise.
///
/// NOTE(review): the results of RBI.getRegBank() are dereferenced without a
/// null check -- confirm that callers only pass registers that have a bank.
static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) {
unsigned DstReg = I.getOperand(0).getReg();
unsigned SrcReg = I.getOperand(1).getReg();
const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
// Find the correct register classes for the source and destination registers.
const TargetRegisterClass *SrcRC;
const TargetRegisterClass *DstRC;
std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);
if (!DstRC) {
LLVM_DEBUG(dbgs() << "Unexpected dest size "
<< RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
return false;
}
// A couple helpers below, for making sure that the copy we produce is valid.
// Set to true if we insert a SUBREG_TO_REG. If we do this, then we don't want
// to verify that the src and dst are the same size, since that's handled by
// the SUBREG_TO_REG.
bool KnownValid = false;
// Returns true, or asserts if something we don't expect happens. Instead of
// returning true, we return isValidCopy() to ensure that we verify the
// result.
auto CheckCopy = [&]() {
// If we have a bitcast or something, we can't have physical registers.
assert((I.isCopy() ||
(!Register::isPhysicalRegister(I.getOperand(0).getReg()) &&
!Register::isPhysicalRegister(I.getOperand(1).getReg()))) &&
"No phys reg on generic operator!");
assert(KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI));
// KnownValid is only read inside the assert above; silence the unused
// warning when asserts are compiled out.
(void)KnownValid;
return true;
};
// Is this a copy? If so, then we may need to insert a subregister copy, or
// a SUBREG_TO_REG.
if (I.isCopy()) {
// Yes. Check if there's anything to fix up.
if (!SrcRC) {
LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
return false;
}
// Is this a cross-bank copy?
if (DstRegBank.getID() != SrcRegBank.getID()) {
// If we're doing a cross-bank copy on different-sized registers, we need
// to do a bit more work.
unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
if (SrcSize > DstSize) {
// We're doing a cross-bank copy into a smaller register. We need a
// subregister copy. First, get a register class that's on the same bank
// as the destination, but the same size as the source.
const TargetRegisterClass *SubregRC =
getMinClassForRegBank(DstRegBank, SrcSize, true);
assert(SubregRC && "Didn't get a register class for subreg?");
// Get the appropriate subregister for the destination.
unsigned SubReg = 0;
if (!getSubRegForClass(DstRC, TRI, SubReg)) {
LLVM_DEBUG(dbgs() << "Couldn't determine subregister for copy.\n");
return false;
}
// Now, insert a subregister copy using the new register class.
selectSubregisterCopy(I, MRI, RBI, SrcReg, SubregRC, DstRC, SubReg);
return CheckCopy();
}
else if (DstRegBank.getID() == AArch64::GPRRegBankID && DstSize == 32 &&
SrcSize == 16) {
// Special case for FPR16 to GPR32.
// FIXME: This can probably be generalized like the above case.
unsigned PromoteReg =
MRI.createVirtualRegister(&AArch64::FPR32RegClass);
BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
.addImm(0)
.addUse(SrcReg)
.addImm(AArch64::hsub);
// Make the copy read the promoted 32-bit value instead of the original
// source.
MachineOperand &RegOp = I.getOperand(1);
RegOp.setReg(PromoteReg);
// Promise that the copy is implicitly validated by the SUBREG_TO_REG.
KnownValid = true;
}
}
// If the destination is a physical register, then there's nothing to
// change, so we're done.
if (Register::isPhysicalRegister(DstReg))
return CheckCopy();
}
// No need to constrain SrcReg. It will get constrained when we hit another
// of its use or its defs. Copies do not have constraints.
if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
<< " operand\n");
return false;
}
// Whatever generic opcode we started with, the instruction is now a plain
// target COPY.
I.setDesc(TII.get(AArch64::COPY));
return CheckCopy();
}
/// Select the AArch64 opcode for a scalar int<->fp conversion (G_SITOFP,
/// G_UITOFP, G_FPTOSI or G_FPTOUI) from \p SrcTy to \p DstTy.
///
/// Only scalar types of 32 or 64 bits are handled.
/// \returns \p GenericOpc if the combination is unsupported.
static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
  if (!(DstTy.isScalar() && SrcTy.isScalar()))
    return GenericOpc;

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();
  const bool DstOk = DstSize == 32 || DstSize == 64;
  const bool SrcOk = SrcSize == 32 || SrcSize == 64;
  if (!DstOk || !SrcOk)
    return GenericOpc;

  // The tables below are indexed as [destination is 64-bit][source is 64-bit].
  const unsigned DstIdx = DstSize == 64;
  const unsigned SrcIdx = SrcSize == 64;

  switch (GenericOpc) {
  case TargetOpcode::G_SITOFP: {
    static const unsigned Opcs[2][2] = {
        {AArch64::SCVTFUWSri, AArch64::SCVTFUXSri},
        {AArch64::SCVTFUWDri, AArch64::SCVTFUXDri}};
    return Opcs[DstIdx][SrcIdx];
  }
  case TargetOpcode::G_UITOFP: {
    static const unsigned Opcs[2][2] = {
        {AArch64::UCVTFUWSri, AArch64::UCVTFUXSri},
        {AArch64::UCVTFUWDri, AArch64::UCVTFUXDri}};
    return Opcs[DstIdx][SrcIdx];
  }
  case TargetOpcode::G_FPTOSI: {
    static const unsigned Opcs[2][2] = {
        {AArch64::FCVTZSUWSr, AArch64::FCVTZSUWDr},
        {AArch64::FCVTZSUXSr, AArch64::FCVTZSUXDr}};
    return Opcs[DstIdx][SrcIdx];
  }
  case TargetOpcode::G_FPTOUI: {
    static const unsigned Opcs[2][2] = {
        {AArch64::FCVTZUUWSr, AArch64::FCVTZUUWDr},
        {AArch64::FCVTZUUXSr, AArch64::FCVTZUUXDr}};
    return Opcs[DstIdx][SrcIdx];
  }
  default:
    return GenericOpc;
  }
}
/// Choose the conditional-select opcode for \p I from the type and register
/// bank of its result: CSEL on the GPR bank, FCSEL on any other bank.
///
/// \returns 0 if the result is not s32, s64 or a 64-bit pointer in address
/// space 0.
static unsigned selectSelectOpc(MachineInstr &I, MachineRegisterInfo &MRI,
                                const RegisterBankInfo &RBI) {
  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
  const bool OnGPR =
      RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() ==
      AArch64::GPRRegBankID;
  const LLT ResultTy = MRI.getType(I.getOperand(0).getReg());

  if (ResultTy == LLT::scalar(32))
    return OnGPR ? AArch64::CSELWr : AArch64::FCSELSrrr;

  if (ResultTy == LLT::scalar(64) || ResultTy == LLT::pointer(0, 64))
    return OnGPR ? AArch64::CSELXr : AArch64::FCSELDrrr;

  return 0;
}
/// Pick the FCMP opcode for the G_FCMP \p I.
///
/// The register-immediate form is used when the right-hand side (operand 3)
/// is the constant +0.0, so the constant does not have to be materialized;
/// otherwise the register-register form is used.
///
/// \returns 0 if the compared operands are not 32 or 64 bits wide.
static unsigned selectFCMPOpc(MachineInstr &I, MachineRegisterInfo &MRI) {
  const ConstantFP *RHSImm =
      getConstantFPVRegVal(I.getOperand(3).getReg(), MRI);
  const bool IsPlusZero =
      RHSImm && RHSImm->isZero() && !RHSImm->isNegative();

  const unsigned Bits = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
  if (Bits != 32 && Bits != 64)
    return 0;

  const bool Is64 = Bits == 64;
  if (IsPlusZero)
    return Is64 ? AArch64::FCMPDri : AArch64::FCMPSri;
  return Is64 ? AArch64::FCMPDrr : AArch64::FCMPSrr;
}
/// Returns true if \p P is one of the unsigned ordering predicates
/// (UGT, UGE, ULT, ULE); every other predicate yields false.
static bool isUnsignedICMPPred(const CmpInst::Predicate P) {
  return P == CmpInst::ICMP_UGT || P == CmpInst::ICMP_UGE ||
         P == CmpInst::ICMP_ULT || P == CmpInst::ICMP_ULE;
}
/// Translate the integer comparison predicate \p P into the AArch64 condition
/// code that tests it. Any non-integer predicate is unreachable.
static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
  switch (P) {
  // Equality.
  case CmpInst::ICMP_EQ:
    return AArch64CC::EQ;
  case CmpInst::ICMP_NE:
    return AArch64CC::NE;

  // Signed orderings.
  case CmpInst::ICMP_SLT:
    return AArch64CC::LT;
  case CmpInst::ICMP_SLE:
    return AArch64CC::LE;
  case CmpInst::ICMP_SGT:
    return AArch64CC::GT;
  case CmpInst::ICMP_SGE:
    return AArch64CC::GE;

  // Unsigned orderings.
  case CmpInst::ICMP_ULT:
    return AArch64CC::LO;
  case CmpInst::ICMP_ULE:
    return AArch64CC::LS;
  case CmpInst::ICMP_UGT:
    return AArch64CC::HI;
  case CmpInst::ICMP_UGE:
    return AArch64CC::HS;

  default:
    llvm_unreachable("Unknown condition code!");
  }
}
/// Translate the floating-point comparison predicate \p P into AArch64
/// condition codes.
///
/// \param [out] CondCode  the (first) condition to test.
/// \param [out] CondCode2 a second condition to test for predicates that need
/// two (FCMP_ONE and FCMP_UEQ); AArch64CC::AL otherwise.
static void changeFCMPPredToAArch64CC(CmpInst::Predicate P,
                                      AArch64CC::CondCode &CondCode,
                                      AArch64CC::CondCode &CondCode2) {
  // Most predicates map to a single condition code.
  CondCode2 = AArch64CC::AL;

  switch (P) {
  // Ordered predicates.
  case CmpInst::FCMP_OEQ:
    CondCode = AArch64CC::EQ;
    return;
  case CmpInst::FCMP_OGT:
    CondCode = AArch64CC::GT;
    return;
  case CmpInst::FCMP_OGE:
    CondCode = AArch64CC::GE;
    return;
  case CmpInst::FCMP_OLT:
    CondCode = AArch64CC::MI;
    return;
  case CmpInst::FCMP_OLE:
    CondCode = AArch64CC::LS;
    return;
  case CmpInst::FCMP_ONE:
    // Needs two tests.
    CondCode = AArch64CC::MI;
    CondCode2 = AArch64CC::GT;
    return;
  case CmpInst::FCMP_ORD:
    CondCode = AArch64CC::VC;
    return;

  // Unordered predicates.
  case CmpInst::FCMP_UNO:
    CondCode = AArch64CC::VS;
    return;
  case CmpInst::FCMP_UEQ:
    // Needs two tests.
    CondCode = AArch64CC::EQ;
    CondCode2 = AArch64CC::VS;
    return;
  case CmpInst::FCMP_UGT:
    CondCode = AArch64CC::HI;
    return;
  case CmpInst::FCMP_UGE:
    CondCode = AArch64CC::PL;
    return;
  case CmpInst::FCMP_ULT:
    CondCode = AArch64CC::LT;
    return;
  case CmpInst::FCMP_ULE:
    CondCode = AArch64CC::LE;
    return;
  case CmpInst::FCMP_UNE:
    CondCode = AArch64CC::NE;
    return;

  default:
    llvm_unreachable("Unknown FP condition!");
  }
}
/// Select the conditional branch \p I (operand 0: condition register,
/// operand 1: destination block) when its condition comes from a G_ICMP,
/// possibly through a G_TRUNC.
///
/// An equality comparison of a GPR value against constant zero becomes a
/// CBZ/CBNZ. A comparison with no constant-zero operand becomes an integer
/// compare followed by a Bcc. \p I is erased on success.
///
/// \returns false if the condition is not produced by a G_ICMP, or if neither
/// form can be emitted.
bool AArch64InstructionSelector::selectCompareBranch(
    MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
  const Register CondReg = I.getOperand(0).getReg();
  MachineBasicBlock *TargetMBB = I.getOperand(1).getMBB();

  // Find the compare feeding the branch, looking through one truncation.
  MachineInstr *CmpMI = MRI.getVRegDef(CondReg);
  if (CmpMI->getOpcode() == TargetOpcode::G_TRUNC)
    CmpMI = MRI.getVRegDef(CmpMI->getOperand(1).getReg());
  if (CmpMI->getOpcode() != TargetOpcode::G_ICMP)
    return false;

  // Arrange for a constant operand, if there is one, to be on the RHS.
  Register LHS = CmpMI->getOperand(2).getReg();
  Register RHS = CmpMI->getOperand(3).getReg();
  auto RHSCst = getConstantVRegValWithLookThrough(RHS, MRI);
  if (!RHSCst) {
    std::swap(RHS, LHS);
    RHSCst = getConstantVRegValWithLookThrough(RHS, MRI);
  }

  if (!RHSCst || RHSCst->Value != 0) {
    // If we can't select a CBZ then emit a cmp + Bcc.
    MachineIRBuilder MIB(I);
    if (!emitIntegerCompare(CmpMI->getOperand(2), CmpMI->getOperand(3),
                            CmpMI->getOperand(1), MIB))
      return false;
    // Read the predicate only after the compare has been emitted, since the
    // predicate operand is passed to emitIntegerCompare by reference.
    const auto BccPred =
        static_cast<CmpInst::Predicate>(CmpMI->getOperand(1).getPredicate());
    const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(BccPred);
    MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(TargetMBB);
    I.eraseFromParent();
    return true;
  }

  // Compare-and-branch-on-zero is only emitted for values on the GPR bank.
  if (RBI.getRegBank(LHS, MRI, TRI)->getID() != AArch64::GPRRegBankID)
    return false;

  // ...and only for (in)equality tests.
  const auto Pred =
      static_cast<CmpInst::Predicate>(CmpMI->getOperand(1).getPredicate());
  const bool IsEQ = Pred == CmpInst::ICMP_EQ;
  if (!IsEQ && Pred != CmpInst::ICMP_NE)
    return false;

  const unsigned Width = MRI.getType(LHS).getSizeInBits();
  unsigned CBOpc = 0;
  if (Width <= 32)
    CBOpc = IsEQ ? AArch64::CBZW : AArch64::CBNZW;
  else if (Width == 64)
    CBOpc = IsEQ ? AArch64::CBZX : AArch64::CBNZX;
  else
    return false;

  BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(CBOpc))
      .addUse(LHS)
      .addMBB(TargetMBB)
      .constrainAllUses(TII, TRI, RBI);

  I.eraseFromParent();
  return true;
}
/// Select a vector G_SHL into a USHL instruction.
///
/// Only the vector types (4 x 32-bit) and (2 x 32-bit) are handled; any other
/// type makes the function return false without modifying \p I.
///
/// \returns true once the USHL has been built and \p I erased.
bool AArch64InstructionSelector::selectVectorSHL(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  assert(I.getOpcode() == TargetOpcode::G_SHL);

  const Register Dst = I.getOperand(0).getReg();
  const LLT DstTy = MRI.getType(Dst);
  if (!DstTy.isVector())
    return false;

  const bool IsV4S32 = DstTy == LLT::vector(4, 32);
  if (!IsV4S32 && DstTy != LLT::vector(2, 32)) {
    LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
    return false;
  }
  const unsigned ShlOpc = IsV4S32 ? AArch64::USHLv4i32 : AArch64::USHLv2i32;

  const Register Value = I.getOperand(1).getReg();
  const Register Amount = I.getOperand(2).getReg();

  MachineIRBuilder Builder(I);
  auto Shl = Builder.buildInstr(ShlOpc, {Dst}, {Value, Amount});
  constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}
/// Select a vector G_ASHR as a NEG of the shift amount followed by SSHL.
///
/// Only the vector types (4 x 32-bit) and (2 x 32-bit) are handled; any other
/// type makes the function return false without modifying \p I.
///
/// \returns true once both instructions have been built and \p I erased.
bool AArch64InstructionSelector::selectVectorASHR(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  assert(I.getOpcode() == TargetOpcode::G_ASHR);

  const Register Dst = I.getOperand(0).getReg();
  const LLT DstTy = MRI.getType(Dst);
  if (!DstTy.isVector())
    return false;

  const bool IsV4S32 = DstTy == LLT::vector(4, 32);
  if (!IsV4S32 && DstTy != LLT::vector(2, 32)) {
    LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
    return false;
  }

  // There is no shift-right-by-register instruction. The shift-left-by-register
  // instruction takes a signed amount, though, and a negative amount requests
  // a right shift -- so negate the amount and shift left.
  const unsigned ShlOpc = IsV4S32 ? AArch64::SSHLv4i32 : AArch64::SSHLv2i32;
  const unsigned NegateOpc = IsV4S32 ? AArch64::NEGv4i32 : AArch64::NEGv2i32;
  const TargetRegisterClass *NegRC =
      IsV4S32 ? &AArch64::FPR128RegClass : &AArch64::FPR64RegClass;

  const Register Value = I.getOperand(1).getReg();
  const Register Amount = I.getOperand(2).getReg();

  MachineIRBuilder Builder(I);
  auto NegAmount = Builder.buildInstr(NegateOpc, {NegRC}, {Amount});
  constrainSelectedInstRegOperands(*NegAmount, TII, TRI, RBI);
  auto Shl = Builder.buildInstr(ShlOpc, {Dst}, {Value, NegAmount});
  constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}
/// Stub: returns false unconditionally and does not touch \p I, \p MF or
/// \p MRI.
/// NOTE(review): judging by the name this is the AAPCS counterpart of
/// selectVaStartDarwin, and a false return presumably reports "not selected"
/// to the caller -- confirm at the call site.
bool AArch64InstructionSelector::selectVaStartAAPCS(
MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
return false;
}
/// Select the va_start instruction \p I for Darwin.
///
/// Two instructions are emitted in front of \p I:
///  - an ADDXri that forms, in a fresh GPR64 virtual register, the address of
///    the frame index returned by AArch64FunctionInfo::getVarArgsStackIndex();
///  - an STRXui that stores that register at offset 0 from the register in
///    operand 0 of \p I, reusing the first memory operand of \p I.
///
/// \returns true always; \p I is erased.
bool AArch64InstructionSelector::selectVaStartDarwin(
    MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
  MachineBasicBlock &Block = *I.getParent();
  const int VarArgsFI =
      MF.getInfo<AArch64FunctionInfo>()->getVarArgsStackIndex();
  const Register VaListPtr = I.getOperand(0).getReg();
  const Register StackArgsAddr =
      MRI.createVirtualRegister(&AArch64::GPR64RegClass);

  // StackArgsAddr = <varargs frame index> + 0
  auto ComputeAddr =
      BuildMI(Block, I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
          .addDef(StackArgsAddr)
          .addFrameIndex(VarArgsFI)
          .addImm(0)
          .addImm(0);
  constrainSelectedInstRegOperands(*ComputeAddr, TII, TRI, RBI);

  // Store StackArgsAddr through VaListPtr at offset 0.
  auto StoreAddr =
      BuildMI(Block, I, I.getDebugLoc(), TII.get(AArch64::STRXui))
          .addUse(StackArgsAddr)
          .addUse(VaListPtr)
          .addImm(0)
          .addMemOperand(*I.memoperands_begin());
  constrainSelectedInstRegOperands(*StoreAddr, TII, TRI, RBI);

  I.eraseFromParent();
  return true;
}
/// Materialize the address of \p V into the destination register of \p I
/// using one MOVZXi followed by three MOVKXi instructions.
///
/// The MOVZXi receives a copy of operand 1 of \p I tagged with
/// \p OpFlags | MO_G0 | MO_NC and a shift of 0. Each MOVKXi then inserts the
/// next 16-bit group (MO_G1 at shift 16, MO_G2 at shift 32, MO_G3 at shift
/// 48), re-creating the address operand from \p V with the same offset as the
/// MOVZXi operand. The last MOVKXi defines operand 0 of \p I; the
/// intermediate results live in fresh GPR64 virtual registers.
///
/// \param I        Instruction whose operand 0 is the destination register
///                 and whose operand 1 is the address operand. It is not
///                 erased here.
/// \param V        The value being addressed; must be a GlobalValue or a
///                 BlockAddress (anything else fails the cast<> below).
/// \param OpFlags  Extra target flags OR'ed onto the MOVZXi address operand.
void AArch64InstructionSelector::materializeLargeCMVal(
    MachineInstr &I, const Value *V, unsigned OpFlags) const {
  MachineBasicBlock &MBB = *I.getParent();
  MachineFunction &MF = *MBB.getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineIRBuilder MIB(I);

  // Lowest group: MOVZ of the original address operand, shift 0.
  auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
  MovZ->addOperand(MF, I.getOperand(1));
  MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
                                     AArch64II::MO_NC);
  MovZ->addOperand(MF, MachineOperand::CreateImm(0));
  constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);

  // Emit one MOVK that inserts the group selected by Flags at bit Offset on
  // top of SrcReg. The result goes to ForceDstReg when that is non-zero,
  // otherwise to a new GPR64 virtual register. Returns the register defined.
  auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
                       Register ForceDstReg) {
    Register DstReg = ForceDstReg
                          ? ForceDstReg
                          : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
    auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg);
    if (auto *GV = dyn_cast<GlobalValue>(V)) {
      MovI->addOperand(MF, MachineOperand::CreateGA(
                               GV, MovZ->getOperand(1).getOffset(), Flags));
    } else {
      MovI->addOperand(
          MF, MachineOperand::CreateBA(cast<BlockAddress>(V),
                                       MovZ->getOperand(1).getOffset(), Flags));
    }
    MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
    constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
    return DstReg;
  };

  Register DstReg = BuildMovK(MovZ.getReg(0),
                              AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
  DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
  // The final group writes straight into the destination of I.
  BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
}
/// Rewrite \p I in place, before selection proper, into a form that is easier
/// to select. The instruction is never erased here.
///
/// Handled opcodes:
///  - G_SHL / G_ASHR / G_LSHR: for a 32-bit scalar shift whose 64-bit shift
///    amount is not defined by a G_CONSTANT, the amount is replaced by a
///    sub_32 subregister COPY of it.
///  - G_STORE: forwarded to contractCrossBankCopyIntoStore.
/// Every other opcode is left untouched.
void AArch64InstructionSelector::preISelLower(MachineInstr &I) const {
  MachineBasicBlock &MBB = *I.getParent();
  MachineFunction &MF = *MBB.getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  switch (I.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_ASHR:
  case TargetOpcode::G_LSHR: {
    // These shifts are legalized to have 64 bit shift amounts because we want
    // to take advantage of the existing imported selection patterns that assume
    // the immediates are s64s. However, if the shifted type is 32 bits and for
    // some reason we receive input GMIR that has an s64 shift amount that's not
    // a G_CONSTANT, insert a truncate so that we can still select the s32
    // register-register variant.
    const Register SrcReg = I.getOperand(1).getReg();
    const Register ShiftReg = I.getOperand(2).getReg();
    const LLT ShiftTy = MRI.getType(ShiftReg);
    const LLT SrcTy = MRI.getType(SrcReg);
    if (SrcTy.isVector())
      return;
    assert(!ShiftTy.isVector() && "unexpected vector shift ty");
    if (SrcTy.getSizeInBits() != 32 || ShiftTy.getSizeInBits() != 64)
      return;

    auto *AmtMI = MRI.getVRegDef(ShiftReg);
    assert(AmtMI && "could not find a vreg definition for shift amount");
    // Constant amounts are left alone.
    if (AmtMI->getOpcode() == TargetOpcode::G_CONSTANT)
      return;

    // Insert a subregister copy to implement a 64->32 trunc.
    MachineIRBuilder MIB(I);
    auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
                     .addReg(ShiftReg, 0, AArch64::sub_32);
    MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
    I.getOperand(2).setReg(Trunc.getReg(0));
    return;
  }
  case TargetOpcode::G_STORE:
    contractCrossBankCopyIntoStore(I, MRI);
    return;
  default:
    return;
  }
}
/// Select a scalar G_SHL by a constant amount as UBFM (the instruction that
/// the immediate form of LSL is an alias of).
///
/// \returns false, leaving \p I untouched, when the shift amount is not a
/// constant, the result is a vector, or either immediate renderer is
/// unavailable; the imported selector can then match the register variant.
/// Otherwise \p I is erased and the result of constraining the new UBFM's
/// register operands is returned.
bool AArch64InstructionSelector::earlySelectSHL(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");

  // Only constant shift amounts are of interest here.
  const auto &AmountOp = I.getOperand(2);
  if (!getConstantVRegVal(AmountOp.getReg(), MRI))
    return false;

  const Register Dst = I.getOperand(0).getReg();
  const LLT DstTy = MRI.getType(Dst);
  if (DstTy.isVector())
    return false;

  // UBFM takes two immediates; each has its own renderer per register width.
  const bool Is64Bit = DstTy.getSizeInBits() == 64;
  auto FirstImmFns =
      Is64Bit ? selectShiftA_64(AmountOp) : selectShiftA_32(AmountOp);
  auto SecondImmFns =
      Is64Bit ? selectShiftB_64(AmountOp) : selectShiftB_32(AmountOp);
  if (!FirstImmFns || !SecondImmFns)
    return false;

  MachineIRBuilder Builder(I);
  const unsigned UbfmOpc = Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri;
  auto Ubfm = Builder.buildInstr(UbfmOpc, {Dst}, {I.getOperand(1).getReg()});

  for (auto &Render : *FirstImmFns)
    Render(Ubfm);
  for (auto &Render : *SecondImmFns)
    Render(Ubfm);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*Ubfm, TII, TRI, RBI);
}
/// Fold a cross-bank copy feeding the stored value of the G_STORE \p I.
///
/// When storing a scalar, the register bank holding it is irrelevant; only
/// its size matters. So a sequence such as (32-bit scalar as an example)
///
///   %x:gpr(s32) = ... something ...
///   %y:fpr(s32) = COPY %x:gpr(s32)
///   G_STORE %y:fpr(s32)
///
/// is rewritten to
///
///   G_STORE %x:gpr(s32)
///
/// after which selection continues normally. \p I is only modified (its
/// operand 0 is redirected); nothing is erased.
void AArch64InstructionSelector::contractCrossBankCopyIntoStore(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");

  MachineOperand &StoredOp = I.getOperand(0);
  const Register StoredReg = StoredOp.getReg();

  // Find what defines the stored value once copies are skipped.
  MachineInstr *Origin = getDefIgnoringCopies(StoredReg, MRI);
  if (!Origin)
    return;

  const Register OriginReg = Origin->getOperand(0).getReg();
  const LLT OriginTy = MRI.getType(OriginReg);
  const LLT StoredTy = MRI.getType(StoredReg);

  // Something strange such as a physical register has no valid type; give up.
  if (!OriginTy.isValid())
    return;

  // The fold is only done when both values have the same size.
  if (OriginTy.getSizeInBits() != StoredTy.getSizeInBits())
    return;

  // Nothing to do unless the two registers sit on different banks.
  const bool SameBank = RBI.getRegBank(StoredReg, MRI, TRI) ==
                        RBI.getRegBank(OriginReg, MRI, TRI);
  if (SameBank)
    return;

  // A cross-bank copy is entering the store: store the original value instead.
  StoredOp.setReg(OriginReg);
}
/// Try to select the G_LOAD \p I with shifts etc. folded into the addressing
/// mode, using the register-offset load LDRXroX (GPR destination) or LDRDroX
/// (any other bank).
///
/// Currently restricted to non-atomic loads of 8 bytes into a 64-bit,
/// non-vector destination for which selectAddrModeXRO finds something to
/// fold. In every other case false is returned and \p I is left untouched.
///
/// \returns on success, the result of constraining the new load's register
/// operands; \p I has been erased by then.
bool AArch64InstructionSelector::earlySelectLoad(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  assert(I.getOpcode() == TargetOpcode::G_LOAD && "unexpected op");

  // Atomic accesses are not handled yet.
  auto &MMO = **I.memoperands_begin();
  if (MMO.getOrdering() != AtomicOrdering::NotAtomic) {
    LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n");
    return false;
  }

  // Only 64-bit loads are supported for now.
  const unsigned AccessBytes = MMO.getSize();
  if (AccessBytes != 8)
    return false;

  // No vectors, and (TODO) no 32-bit destinations.
  const Register Dst = I.getOperand(0).getReg();
  const LLT DstTy = MRI.getType(Dst);
  if (DstTy.isVector() || DstTy.getSizeInBits() != 64)
    return false;

  // See whether GEPs/shifts etc. can be folded into the address.
  auto AddrFns = selectAddrModeXRO(I.getOperand(1), AccessBytes);
  if (!AddrFns)
    return false;

  // Something can be folded, so emit the load here. The opcode depends on
  // whether the destination lives on the GPR bank.
  MachineIRBuilder Builder(I);
  const bool DstOnGPR =
      RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::GPRRegBankID;
  const unsigned LoadOpc = DstOnGPR ? AArch64::LDRXroX : AArch64::LDRDroX;

  auto Load = Builder.buildInstr(LoadOpc, {Dst}, {});
  for (auto &Render : *AddrFns)
    Render(Load);
  Load.addMemOperand(*I.memoperands_begin());

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
}
/// Hand-written selection attempted for a few opcodes ahead of the general
/// selection code.
///
///  - G_SHL is forwarded to earlySelectSHL.
///  - G_LOAD is forwarded to earlySelectLoad.
///  - A G_CONSTANT whose value is zero and whose type is 32 or 64 bits wide
///    is turned in place into a COPY from WZR or XZR respectively, with the
///    destination constrained to GPR32 / GPR64.
///
/// \returns true if \p I was selected here; false if it should go through the
/// remaining selection code (in which case the G_CONSTANT path has not
/// modified \p I).
bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const {
  assert(I.getParent() && "Instruction should be in a basic block!");
  assert(I.getParent()->getParent() && "Instruction should be in a function!");

  MachineBasicBlock &MBB = *I.getParent();
  MachineFunction &MF = *MBB.getParent();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  switch (I.getOpcode()) {
  case TargetOpcode::G_SHL:
    return earlySelectSHL(I, MRI);
  case TargetOpcode::G_LOAD:
    return earlySelectLoad(I, MRI);
  case TargetOpcode::G_CONSTANT: {
    // The constant may be carried either as a ConstantInt or as a plain
    // immediate operand.
    const MachineOperand &ValOp = I.getOperand(1);
    bool IsZero = false;
    if (ValOp.isCImm())
      IsZero = ValOp.getCImm()->getZExtValue() == 0;
    else if (ValOp.isImm())
      IsZero = ValOp.getImm() == 0;
    if (!IsZero)
      return false;

    const Register DefReg = I.getOperand(0).getReg();
    // Decide on the bit width rather than on the exact type: constants of
    // pointer type and of scalar types narrower than 32 bits can reach the
    // selector as well. A 64-bit pointer must use XZR/GPR64 just like s64,
    // and any width other than 32 or 64 is left to the later selection code.
    const unsigned DefSize = MRI.getType(DefReg).getSizeInBits();
    if (DefSize == 64) {
      I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
      RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
    } else if (DefSize == 32) {
      I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
      RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
    } else {
      return false;
    }

    I.setDesc(TII.get(TargetOpcode::COPY));
    return true;
  }
  default:
    return false;
  }
}
2017-11-16 01:46:35 +01:00
bool AArch64InstructionSelector::select(MachineInstr &I,
CodeGenCoverage &CoverageInfo) const {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
MachineBasicBlock &MBB = *I.getParent();
MachineFunction &MF = *MBB.getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned Opcode = I.getOpcode();
// G_PHI requires same handling as PHI
if (!isPreISelGenericOpcode(Opcode) || Opcode == TargetOpcode::G_PHI) {
// Certain non-generic instructions also need some special handling.
if (Opcode == TargetOpcode::LOAD_STACK_GUARD)
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
const Register DefReg = I.getOperand(0).getReg();
const LLT DefTy = MRI.getType(DefReg);
const RegClassOrRegBank &RegClassOrBank =
MRI.getRegClassOrRegBank(DefReg);
const TargetRegisterClass *DefRC
= RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
if (!DefRC) {
if (!DefTy.isValid()) {
LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
return false;
}
const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI);
if (!DefRC) {
LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
return false;
}
}
I.setDesc(TII.get(TargetOpcode::PHI));
return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
}
if (I.isCopy())
return selectCopy(I, TII, MRI, TRI, RBI);
return true;
}
if (I.getNumOperands() != I.getNumExplicitOperands()) {
LLVM_DEBUG(
dbgs() << "Generic instruction has unexpected implicit operands\n");
return false;
}
// Try to do some lowering before we start instruction selecting. These
// lowerings are purely transformations on the input G_MIR and so selection
// must continue after any modification of the instruction.
preISelLower(I);
// There may be patterns where the importer can't deal with them optimally,
// but does select it to a suboptimal sequence so our custom C++ selection
// code later never has a chance to work on it. Therefore, we have an early
// selection attempt here to give priority to certain selection routines
// over the imported ones.
if (earlySelect(I))
return true;
2017-11-16 01:46:35 +01:00
if (selectImpl(I, CoverageInfo))
return true;
LLT Ty =
I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
MachineIRBuilder MIB(I);
switch (Opcode) {
case TargetOpcode::G_BRCOND: {
if (Ty.getSizeInBits() > 32) {
// We shouldn't need this on AArch64, but it would be implemented as an
// EXTRACT_SUBREG followed by a TBNZW because TBNZX has no encoding if the
// bit being tested is < 32.
LLVM_DEBUG(dbgs() << "G_BRCOND has type: " << Ty
<< ", expected at most 32-bits");
return false;
}
const Register CondReg = I.getOperand(0).getReg();
MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
Introduce control flow speculation tracking pass for AArch64 The pass implements tracking of control flow miss-speculation into a "taint" register. That taint register can then be used to mask off registers with sensitive data when executing under miss-speculation, a.k.a. "transient execution". This pass is aimed at mitigating against SpectreV1-style vulnarabilities. At the moment, it implements the tracking of miss-speculation of control flow into a taint register, but doesn't implement a mechanism yet to then use that taint register to mask off vulnerable data in registers (something for a follow-on improvement). Possible strategies to mask out vulnerable data that can be implemented on top of this are: - speculative load hardening to automatically mask of data loaded in registers. - using intrinsics to mask of data in registers as indicated by the programmer (see https://lwn.net/Articles/759423/). For AArch64, the following implementation choices are made. Some of these are different than the implementation choices made in the similar pass implemented in X86SpeculativeLoadHardening.cpp, as the instruction set characteristics result in different trade-offs. - The speculation hardening is done after register allocation. With a relative abundance of registers, one register is reserved (X16) to be the taint register. X16 is expected to not clash with other register reservation mechanisms with very high probability because: . The AArch64 ABI doesn't guarantee X16 to be retained across any call. . The only way to request X16 to be used as a programmer is through inline assembly. In the rare case a function explicitly demands to use X16/W16, this pass falls back to hardening against speculation by inserting a DSB SYS/ISB barrier pair which will prevent control flow speculation. - It is easy to insert mask operations at this late stage as we have mask operations available that don't set flags. 
- The taint variable contains all-ones when no miss-speculation is detected, and contains all-zeros when miss-speculation is detected. Therefore, when masking, an AND instruction (which only changes the register to be masked, no other side effects) can easily be inserted anywhere that's needed. - The tracking of miss-speculation is done by using a data-flow conditional select instruction (CSEL) to evaluate the flags that were also used to make conditional branch direction decisions. Speculation of the CSEL instruction can be limited with a CSDB instruction - so the combination of CSEL + a later CSDB gives the guarantee that the flags as used in the CSEL aren't speculated. When conditional branch direction gets miss-speculated, the semantics of the inserted CSEL instruction is such that the taint register will contain all zero bits. One key requirement for this to work is that the conditional branch is followed by an execution of the CSEL instruction, where the CSEL instruction needs to use the same flags status as the conditional branch. This means that the conditional branches must not be implemented as one of the AArch64 conditional branches that do not use the flags as input (CB(N)Z and TB(N)Z). This is implemented by ensuring in the instruction selectors to not produce these instructions when speculation hardening is enabled. This pass will assert if it does encounter such an instruction. - On function call boundaries, the miss-speculation state is transferred from the taint register X16 to be encoded in the SP register as value 0. Future extensions/improvements could be: - Implement this functionality using full speculation barriers, akin to the x86-slh-lfence option. This may be more useful for the intrinsics-based approach than for the SLH approach to masking. Note that this pass already inserts the full speculation barriers if the function for some niche reason makes use of X16/W16. 
- no indirect branch misprediction gets protected/instrumented; but this could be done for some indirect branches, such as switch jump tables. Differential Revision: https://reviews.llvm.org/D54896 llvm-svn: 349456
2018-12-18 09:50:02 +01:00
// Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
// instructions will not be produced, as they are conditional branch
// instructions that do not set flags.
bool ProduceNonFlagSettingCondBr =
!MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
if (ProduceNonFlagSettingCondBr && selectCompareBranch(I, MF, MRI))
return true;
Introduce control flow speculation tracking pass for AArch64 The pass implements tracking of control flow miss-speculation into a "taint" register. That taint register can then be used to mask off registers with sensitive data when executing under miss-speculation, a.k.a. "transient execution". This pass is aimed at mitigating against SpectreV1-style vulnarabilities. At the moment, it implements the tracking of miss-speculation of control flow into a taint register, but doesn't implement a mechanism yet to then use that taint register to mask off vulnerable data in registers (something for a follow-on improvement). Possible strategies to mask out vulnerable data that can be implemented on top of this are: - speculative load hardening to automatically mask of data loaded in registers. - using intrinsics to mask of data in registers as indicated by the programmer (see https://lwn.net/Articles/759423/). For AArch64, the following implementation choices are made. Some of these are different than the implementation choices made in the similar pass implemented in X86SpeculativeLoadHardening.cpp, as the instruction set characteristics result in different trade-offs. - The speculation hardening is done after register allocation. With a relative abundance of registers, one register is reserved (X16) to be the taint register. X16 is expected to not clash with other register reservation mechanisms with very high probability because: . The AArch64 ABI doesn't guarantee X16 to be retained across any call. . The only way to request X16 to be used as a programmer is through inline assembly. In the rare case a function explicitly demands to use X16/W16, this pass falls back to hardening against speculation by inserting a DSB SYS/ISB barrier pair which will prevent control flow speculation. - It is easy to insert mask operations at this late stage as we have mask operations available that don't set flags. 
- The taint variable contains all-ones when no miss-speculation is detected, and contains all-zeros when miss-speculation is detected. Therefore, when masking, an AND instruction (which only changes the register to be masked, no other side effects) can easily be inserted anywhere that's needed. - The tracking of miss-speculation is done by using a data-flow conditional select instruction (CSEL) to evaluate the flags that were also used to make conditional branch direction decisions. Speculation of the CSEL instruction can be limited with a CSDB instruction - so the combination of CSEL + a later CSDB gives the guarantee that the flags as used in the CSEL aren't speculated. When conditional branch direction gets miss-speculated, the semantics of the inserted CSEL instruction is such that the taint register will contain all zero bits. One key requirement for this to work is that the conditional branch is followed by an execution of the CSEL instruction, where the CSEL instruction needs to use the same flags status as the conditional branch. This means that the conditional branches must not be implemented as one of the AArch64 conditional branches that do not use the flags as input (CB(N)Z and TB(N)Z). This is implemented by ensuring in the instruction selectors to not produce these instructions when speculation hardening is enabled. This pass will assert if it does encounter such an instruction. - On function call boundaries, the miss-speculation state is transferred from the taint register X16 to be encoded in the SP register as value 0. Future extensions/improvements could be: - Implement this functionality using full speculation barriers, akin to the x86-slh-lfence option. This may be more useful for the intrinsics-based approach than for the SLH approach to masking. Note that this pass already inserts the full speculation barriers if the function for some niche reason makes use of X16/W16. 
- no indirect branch misprediction gets protected/instrumented; but this could be done for some indirect branches, such as switch jump tables. Differential Revision: https://reviews.llvm.org/D54896 llvm-svn: 349456
2018-12-18 09:50:02 +01:00
if (ProduceNonFlagSettingCondBr) {
auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW))
.addUse(CondReg)
.addImm(/*bit offset=*/0)
.addMBB(DestMBB);
Introduce control flow speculation tracking pass for AArch64 The pass implements tracking of control flow miss-speculation into a "taint" register. That taint register can then be used to mask off registers with sensitive data when executing under miss-speculation, a.k.a. "transient execution". This pass is aimed at mitigating against SpectreV1-style vulnarabilities. At the moment, it implements the tracking of miss-speculation of control flow into a taint register, but doesn't implement a mechanism yet to then use that taint register to mask off vulnerable data in registers (something for a follow-on improvement). Possible strategies to mask out vulnerable data that can be implemented on top of this are: - speculative load hardening to automatically mask of data loaded in registers. - using intrinsics to mask of data in registers as indicated by the programmer (see https://lwn.net/Articles/759423/). For AArch64, the following implementation choices are made. Some of these are different than the implementation choices made in the similar pass implemented in X86SpeculativeLoadHardening.cpp, as the instruction set characteristics result in different trade-offs. - The speculation hardening is done after register allocation. With a relative abundance of registers, one register is reserved (X16) to be the taint register. X16 is expected to not clash with other register reservation mechanisms with very high probability because: . The AArch64 ABI doesn't guarantee X16 to be retained across any call. . The only way to request X16 to be used as a programmer is through inline assembly. In the rare case a function explicitly demands to use X16/W16, this pass falls back to hardening against speculation by inserting a DSB SYS/ISB barrier pair which will prevent control flow speculation. - It is easy to insert mask operations at this late stage as we have mask operations available that don't set flags. 
- The taint variable contains all-ones when no miss-speculation is detected, and contains all-zeros when miss-speculation is detected. Therefore, when masking, an AND instruction (which only changes the register to be masked, no other side effects) can easily be inserted anywhere that's needed. - The tracking of miss-speculation is done by using a data-flow conditional select instruction (CSEL) to evaluate the flags that were also used to make conditional branch direction decisions. Speculation of the CSEL instruction can be limited with a CSDB instruction - so the combination of CSEL + a later CSDB gives the guarantee that the flags as used in the CSEL aren't speculated. When conditional branch direction gets miss-speculated, the semantics of the inserted CSEL instruction is such that the taint register will contain all zero bits. One key requirement for this to work is that the conditional branch is followed by an execution of the CSEL instruction, where the CSEL instruction needs to use the same flags status as the conditional branch. This means that the conditional branches must not be implemented as one of the AArch64 conditional branches that do not use the flags as input (CB(N)Z and TB(N)Z). This is implemented by ensuring in the instruction selectors to not produce these instructions when speculation hardening is enabled. This pass will assert if it does encounter such an instruction. - On function call boundaries, the miss-speculation state is transferred from the taint register X16 to be encoded in the SP register as value 0. Future extensions/improvements could be: - Implement this functionality using full speculation barriers, akin to the x86-slh-lfence option. This may be more useful for the intrinsics-based approach than for the SLH approach to masking. Note that this pass already inserts the full speculation barriers if the function for some niche reason makes use of X16/W16. 
- no indirect branch misprediction gets protected/instrumented; but this could be done for some indirect branches, such as switch jump tables. Differential Revision: https://reviews.llvm.org/D54896 llvm-svn: 349456
2018-12-18 09:50:02 +01:00
I.eraseFromParent();
return constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI);
} else {
auto CMP = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ANDSWri))
.addDef(AArch64::WZR)
.addUse(CondReg)
.addImm(1);
constrainSelectedInstRegOperands(*CMP.getInstr(), TII, TRI, RBI);
auto Bcc =
BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::Bcc))
.addImm(AArch64CC::EQ)
.addMBB(DestMBB);
I.eraseFromParent();
return constrainSelectedInstRegOperands(*Bcc.getInstr(), TII, TRI, RBI);
}
}
case TargetOpcode::G_BRINDIRECT: {
I.setDesc(TII.get(AArch64::BR));
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
case TargetOpcode::G_BRJT:
return selectBrJT(I, MRI);
case TargetOpcode::G_BSWAP: {
// Handle vector types for G_BSWAP directly.
Register DstReg = I.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
// We should only get vector types here; everything else is handled by the
// importer right now.
if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) {
LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n");
return false;
}
// Only handle 4 and 2 element vectors for now.
// TODO: 16-bit elements.
unsigned NumElts = DstTy.getNumElements();
if (NumElts != 4 && NumElts != 2) {
LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n");
return false;
}
// Choose the correct opcode for the supported types. Right now, that's
// v2s32, v4s32, and v2s64.
unsigned Opc = 0;
unsigned EltSize = DstTy.getElementType().getSizeInBits();
if (EltSize == 32)
Opc = (DstTy.getNumElements() == 2) ? AArch64::REV32v8i8
: AArch64::REV32v16i8;
else if (EltSize == 64)
Opc = AArch64::REV64v16i8;
// We should always get something by the time we get here...
assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?");
I.setDesc(TII.get(Opc));
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
case TargetOpcode::G_FCONSTANT:
case TargetOpcode::G_CONSTANT: {
const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
const LLT s8 = LLT::scalar(8);
const LLT s16 = LLT::scalar(16);
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
const LLT p0 = LLT::pointer(0, 64);
const Register DefReg = I.getOperand(0).getReg();
const LLT DefTy = MRI.getType(DefReg);
const unsigned DefSize = DefTy.getSizeInBits();
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
// FIXME: Redundant check, but even less readable when factored out.
if (isFP) {
if (Ty != s32 && Ty != s64) {
LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
<< " constant, expected: " << s32 << " or " << s64
<< '\n');
return false;
}
if (RB.getID() != AArch64::FPRRegBankID) {
LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
<< " constant on bank: " << RB
<< ", expected: FPR\n");
return false;
}
// The case when we have 0.0 is covered by tablegen. Reject it here so we
// can be sure tablegen works correctly and isn't rescued by this code.
if (I.getOperand(1).getFPImm()->getValueAPF().isExactlyValue(0.0))
return false;
} else {
// s32 and s64 are covered by tablegen.
if (Ty != p0 && Ty != s8 && Ty != s16) {
LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
<< " constant, expected: " << s32 << ", " << s64
<< ", or " << p0 << '\n');
return false;
}
if (RB.getID() != AArch64::GPRRegBankID) {
LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
<< " constant on bank: " << RB
<< ", expected: GPR\n");
return false;
}
}
// We allow G_CONSTANT of types < 32b.
const unsigned MovOpc =
DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
if (isFP) {
// Either emit a FMOV, or emit a copy to emit a normal mov.
const TargetRegisterClass &GPRRC =
DefSize == 32 ? AArch64::GPR32RegClass : AArch64::GPR64RegClass;
const TargetRegisterClass &FPRRC =
DefSize == 32 ? AArch64::FPR32RegClass : AArch64::FPR64RegClass;
// Can we use a FMOV instruction to represent the immediate?
if (emitFMovForFConstant(I, MRI))
return true;
// Nope. Emit a copy and use a normal mov instead.
const Register DefGPRReg = MRI.createVirtualRegister(&GPRRC);
MachineOperand &RegOp = I.getOperand(0);
RegOp.setReg(DefGPRReg);
MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
MIB.buildCopy({DefReg}, {DefGPRReg});
if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
return false;
}
MachineOperand &ImmOp = I.getOperand(1);
// FIXME: Is going through int64_t always correct?
ImmOp.ChangeToImmediate(
ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
[globalisel] Decouple src pattern operands from dst pattern operands. Summary: This isn't testable for AArch64 by itself so this patch also adds support for constant immediates in the pattern and physical register uses in the result. The new IntOperandMatcher matches the constant in patterns such as '(set $rd:GPR32, (G_XOR $rs:GPR32, -1))'. It's always safe to fold immediates into an instruction so this is the first rule that will match across multiple BB's. The Renderer hierarchy is responsible for adding operands to the result instruction. Renderers can copy operands (CopyRenderer) or add physical registers (in particular %wzr and %xzr) to the result instruction in any order (OperandMatchers now import the operand names from SelectionDAG to allow renderers to access any operand). This allows us to emit the result instruction for: %1 = G_XOR %0, -1 --> %1 = ORNWrr %wzr, %0 %1 = G_XOR -1, %0 --> %1 = ORNWrr %wzr, %0 although the latter is untested since the matcher/importer has not been taught about commutativity yet. Added BuildMIAction which can build new instructions and mutate them where possible. W.r.t the mutation aspect, MatchActions are now told the name of an instruction they can recycle and BuildMIAction will emit mutation code when the renderers are appropriate. They are appropriate when all operands are rendered using CopyRenderer and the indices are the same as the matcher. This currently assumes that all operands have at least one matcher. Finally, this change also fixes a crash in AArch64InstructionSelector::select() caused by an immediate operand passing isImm() rather than isCImm(). This was uncovered by the other changes and was detected by existing tests. Depends on D29711 Reviewers: t.p.northover, ab, qcolombet, rovka, aditya_nandakumar, javed.absar Reviewed By: rovka Subscribers: aemerson, dberris, kristof.beyls, llvm-commits Differential Revision: https://reviews.llvm.org/D29712 llvm-svn: 296131
2017-02-24 16:43:30 +01:00
} else if (I.getOperand(1).isCImm()) {
uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
I.getOperand(1).ChangeToImmediate(Val);
[globalisel] Decouple src pattern operands from dst pattern operands. Summary: This isn't testable for AArch64 by itself so this patch also adds support for constant immediates in the pattern and physical register uses in the result. The new IntOperandMatcher matches the constant in patterns such as '(set $rd:GPR32, (G_XOR $rs:GPR32, -1))'. It's always safe to fold immediates into an instruction so this is the first rule that will match across multiple BB's. The Renderer hierarchy is responsible for adding operands to the result instruction. Renderers can copy operands (CopyRenderer) or add physical registers (in particular %wzr and %xzr) to the result instruction in any order (OperandMatchers now import the operand names from SelectionDAG to allow renderers to access any operand). This allows us to emit the result instruction for: %1 = G_XOR %0, -1 --> %1 = ORNWrr %wzr, %0 %1 = G_XOR -1, %0 --> %1 = ORNWrr %wzr, %0 although the latter is untested since the matcher/importer has not been taught about commutativity yet. Added BuildMIAction which can build new instructions and mutate them where possible. W.r.t the mutation aspect, MatchActions are now told the name of an instruction they can recycle and BuildMIAction will emit mutation code when the renderers are appropriate. They are appropriate when all operands are rendered using CopyRenderer and the indices are the same as the matcher. This currently assumes that all operands have at least one matcher. Finally, this change also fixes a crash in AArch64InstructionSelector::select() caused by an immediate operand passing isImm() rather than isCImm(). This was uncovered by the other changes and was detected by existing tests. Depends on D29711 Reviewers: t.p.northover, ab, qcolombet, rovka, aditya_nandakumar, javed.absar Reviewed By: rovka Subscribers: aemerson, dberris, kristof.beyls, llvm-commits Differential Revision: https://reviews.llvm.org/D29712 llvm-svn: 296131
2017-02-24 16:43:30 +01:00
} else if (I.getOperand(1).isImm()) {
uint64_t Val = I.getOperand(1).getImm();
I.getOperand(1).ChangeToImmediate(Val);
}
I.setDesc(TII.get(MovOpc));
constrainSelectedInstRegOperands(I, TII, TRI, RBI);
return true;
}
case TargetOpcode::G_EXTRACT: {
Register DstReg = I.getOperand(0).getReg();
Register SrcReg = I.getOperand(1).getReg();
LLT SrcTy = MRI.getType(SrcReg);
LLT DstTy = MRI.getType(DstReg);
(void)DstTy;
unsigned SrcSize = SrcTy.getSizeInBits();
if (SrcTy.getSizeInBits() > 64) {
// This should be an extract of an s128, which is like a vector extract.
if (SrcTy.getSizeInBits() != 128)
return false;
// Only support extracting 64 bits from an s128 at the moment.
if (DstTy.getSizeInBits() != 64)
return false;
const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
// Check we have the right regbank always.
assert(SrcRB.getID() == AArch64::FPRRegBankID &&
DstRB.getID() == AArch64::FPRRegBankID &&
"Wrong extract regbank!");
(void)SrcRB;
// Emit the same code as a vector extract.
// Offset must be a multiple of 64.
unsigned Offset = I.getOperand(2).getImm();
if (Offset % 64 != 0)
return false;
unsigned LaneIdx = Offset / 64;
MachineIRBuilder MIB(I);
MachineInstr *Extract = emitExtractVectorElt(
DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
if (!Extract)
return false;
I.eraseFromParent();
return true;
}
I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
Ty.getSizeInBits() - 1);
if (SrcSize < 64) {
assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
"unexpected G_EXTRACT types");
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
.addReg(DstReg, 0, AArch64::sub_32);
RBI.constrainGenericRegister(I.getOperand(0).getReg(),
AArch64::GPR32RegClass, MRI);
I.getOperand(0).setReg(DstReg);
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
case TargetOpcode::G_INSERT: {
LLT SrcTy = MRI.getType(I.getOperand(2).getReg());
LLT DstTy = MRI.getType(I.getOperand(0).getReg());
unsigned DstSize = DstTy.getSizeInBits();
// Larger inserts are vectors, same-size ones should be something else by
// now (split up or turned into COPYs).
if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
return false;
I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
unsigned LSB = I.getOperand(3).getImm();
unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
I.getOperand(3).setImm((DstSize - LSB) % DstSize);
MachineInstrBuilder(MF, I).addImm(Width - 1);
if (DstSize < 64) {
assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
"unexpected G_INSERT types");
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
TII.get(AArch64::SUBREG_TO_REG))
.addDef(SrcReg)
.addImm(0)
.addUse(I.getOperand(2).getReg())
.addImm(AArch64::sub_32);
RBI.constrainGenericRegister(I.getOperand(2).getReg(),
AArch64::GPR32RegClass, MRI);
I.getOperand(2).setReg(SrcReg);
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
case TargetOpcode::G_FRAME_INDEX: {
// allocas and G_FRAME_INDEX are only supported in addrspace(0).
if (Ty != LLT::pointer(0, 64)) {
LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
<< ", expected: " << LLT::pointer(0, 64) << '\n');
return false;
}
I.setDesc(TII.get(AArch64::ADDXri));
// MOs for a #0 shifted immediate.
I.addOperand(MachineOperand::CreateImm(0));
I.addOperand(MachineOperand::CreateImm(0));
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
case TargetOpcode::G_GLOBAL_VALUE: {
auto GV = I.getOperand(1).getGlobal();
if (GV->isThreadLocal()) {
// FIXME: we don't support TLS yet.
return false;
}
unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
if (OpFlags & AArch64II::MO_GOT) {
I.setDesc(TII.get(AArch64::LOADgot));
I.getOperand(1).setTargetFlags(OpFlags);
} else if (TM.getCodeModel() == CodeModel::Large) {
// Materialize the global using movz/movk instructions.
materializeLargeCMVal(I, GV, OpFlags);
I.eraseFromParent();
return true;
} else if (TM.getCodeModel() == CodeModel::Tiny) {
I.setDesc(TII.get(AArch64::ADR));
I.getOperand(1).setTargetFlags(OpFlags);
} else {
I.setDesc(TII.get(AArch64::MOVaddr));
I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
MachineInstrBuilder MIB(MF, I);
MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(),
OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
}
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
case TargetOpcode::G_ZEXTLOAD:
case TargetOpcode::G_LOAD:
case TargetOpcode::G_STORE: {
bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
MachineIRBuilder MIB(I);
LLT PtrTy = MRI.getType(I.getOperand(1).getReg());
if (PtrTy != LLT::pointer(0, 64)) {
LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
<< ", expected: " << LLT::pointer(0, 64) << '\n');
return false;
}
auto &MemOp = **I.memoperands_begin();
if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
LLVM_DEBUG(dbgs() << "Atomic load/store not supported yet\n");
return false;
}
[globalisel] Update GlobalISel emitter to match new representation of extending loads Summary: Previously, an extending load was represented as (G_*EXT (G_LOAD x)). This had a few drawbacks: * G_LOAD had to be legal for all sizes you could extend from, even if registers didn't naturally hold those sizes. * All sizes you could extend from had to be allocatable just in case the extend went missing (e.g. by optimization). * At minimum, G_*EXT and G_TRUNC had to be legal for these sizes. As we improve optimization of extends and truncates, this legality requirement would spread without considerable care w.r.t when certain combines were permitted. * The SelectionDAG importer required some ugly and fragile pattern rewriting to translate patterns into this style. This patch changes the representation to: * (G_[SZ]EXTLOAD x) * (G_LOAD x) any-extends when MMO.getSize() * 8 < ResultTy.getSizeInBits() which resolves these issues by allowing targets to work entirely in their native register sizes, and by having a more direct translation from SelectionDAG patterns. Each extending load can be lowered by the legalizer into separate extends and loads, however a target that supports s1 will need the any-extending load to extend to at least s8 since LLVM does not represent memory accesses smaller than 8 bits. The legalizer can widenScalar G_LOAD into an any-extending load but sign/zero-extending loads need help from something else like a combiner pass. A follow-up patch that adds combiner helpers for this will follow. The new representation requires that the MMO correctly reflect the memory access so this has been corrected in a couple of tests. I've also moved the extending loads to their own tests since they are (mostly) separate opcodes now. Additionally, the re-write appears to have invalidated two tests from select-with-no-legality-check.mir since the matcher table no longer contains loads that result in s1's and they aren't legal in AArch64 anymore. 
Depends on D45540 Reviewers: ab, aditya_nandakumar, bogner, rtereshin, volkan, rovka, javed.absar Reviewed By: rtereshin Subscribers: javed.absar, llvm-commits, kristof.beyls Differential Revision: https://reviews.llvm.org/D45541 llvm-svn: 331601
2018-05-05 22:53:24 +02:00
unsigned MemSizeInBits = MemOp.getSize() * 8;
const Register PtrReg = I.getOperand(1).getReg();
#ifndef NDEBUG
const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
// Sanity-check the pointer register.
assert(PtrRB.getID() == AArch64::GPRRegBankID &&
"Load/Store pointer operand isn't a GPR");
assert(MRI.getType(PtrReg).isPointer() &&
"Load/Store pointer operand isn't a pointer");
#endif
const Register ValReg = I.getOperand(0).getReg();
const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
const unsigned NewOpc =
[globalisel] Update GlobalISel emitter to match new representation of extending loads Summary: Previously, an extending load was represented as (G_*EXT (G_LOAD x)). This had a few drawbacks: * G_LOAD had to be legal for all sizes you could extend from, even if registers didn't naturally hold those sizes. * All sizes you could extend from had to be allocatable just in case the extend went missing (e.g. by optimization). * At minimum, G_*EXT and G_TRUNC had to be legal for these sizes. As we improve optimization of extends and truncates, this legality requirement would spread without considerable care w.r.t when certain combines were permitted. * The SelectionDAG importer required some ugly and fragile pattern rewriting to translate patterns into this style. This patch changes the representation to: * (G_[SZ]EXTLOAD x) * (G_LOAD x) any-extends when MMO.getSize() * 8 < ResultTy.getSizeInBits() which resolves these issues by allowing targets to work entirely in their native register sizes, and by having a more direct translation from SelectionDAG patterns. Each extending load can be lowered by the legalizer into separate extends and loads, however a target that supports s1 will need the any-extending load to extend to at least s8 since LLVM does not represent memory accesses smaller than 8 bits. The legalizer can widenScalar G_LOAD into an any-extending load but sign/zero-extending loads need help from something else like a combiner pass. A follow-up patch that adds combiner helpers for this will follow. The new representation requires that the MMO correctly reflect the memory access so this has been corrected in a couple of tests. I've also moved the extending loads to their own tests since they are (mostly) separate opcodes now. Additionally, the re-write appears to have invalidated two tests from select-with-no-legality-check.mir since the matcher table no longer contains loads that result in s1's and they aren't legal in AArch64 anymore. 
Depends on D45540 Reviewers: ab, aditya_nandakumar, bogner, rtereshin, volkan, rovka, javed.absar Reviewed By: rtereshin Subscribers: javed.absar, llvm-commits, kristof.beyls Differential Revision: https://reviews.llvm.org/D45541 llvm-svn: 331601
2018-05-05 22:53:24 +02:00
selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
if (NewOpc == I.getOpcode())
return false;
I.setDesc(TII.get(NewOpc));
uint64_t Offset = 0;
auto *PtrMI = MRI.getVRegDef(PtrReg);
// Try to fold a GEP into our unsigned immediate addressing mode.
if (PtrMI->getOpcode() == TargetOpcode::G_GEP) {
if (auto COff = getConstantVRegVal(PtrMI->getOperand(2).getReg(), MRI)) {
int64_t Imm = *COff;
[globalisel] Update GlobalISel emitter to match new representation of extending loads Summary: Previously, an extending load was represented as (G_*EXT (G_LOAD x)). This had a few drawbacks: * G_LOAD had to be legal for all sizes you could extend from, even if registers didn't naturally hold those sizes. * All sizes you could extend from had to be allocatable just in case the extend went missing (e.g. by optimization). * At minimum, G_*EXT and G_TRUNC had to be legal for these sizes. As we improve optimization of extends and truncates, this legality requirement would spread without considerable care w.r.t when certain combines were permitted. * The SelectionDAG importer required some ugly and fragile pattern rewriting to translate patterns into this style. This patch changes the representation to: * (G_[SZ]EXTLOAD x) * (G_LOAD x) any-extends when MMO.getSize() * 8 < ResultTy.getSizeInBits() which resolves these issues by allowing targets to work entirely in their native register sizes, and by having a more direct translation from SelectionDAG patterns. Each extending load can be lowered by the legalizer into separate extends and loads, however a target that supports s1 will need the any-extending load to extend to at least s8 since LLVM does not represent memory accesses smaller than 8 bits. The legalizer can widenScalar G_LOAD into an any-extending load but sign/zero-extending loads need help from something else like a combiner pass. A follow-up patch that adds combiner helpers for this will follow. The new representation requires that the MMO correctly reflect the memory access so this has been corrected in a couple of tests. I've also moved the extending loads to their own tests since they are (mostly) separate opcodes now. Additionally, the re-write appears to have invalidated two tests from select-with-no-legality-check.mir since the matcher table no longer contains loads that result in s1's and they aren't legal in AArch64 anymore. 
Depends on D45540 Reviewers: ab, aditya_nandakumar, bogner, rtereshin, volkan, rovka, javed.absar Reviewed By: rtereshin Subscribers: javed.absar, llvm-commits, kristof.beyls Differential Revision: https://reviews.llvm.org/D45541 llvm-svn: 331601
2018-05-05 22:53:24 +02:00
const unsigned Size = MemSizeInBits / 8;
const unsigned Scale = Log2_32(Size);
if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) {
unsigned Ptr2Reg = PtrMI->getOperand(1).getReg();
I.getOperand(1).setReg(Ptr2Reg);
PtrMI = MRI.getVRegDef(Ptr2Reg);
Offset = Imm / Size;
}
}
}
// If we haven't folded anything into our addressing mode yet, try to fold
// a frame index into the base+offset.
if (!Offset && PtrMI->getOpcode() == TargetOpcode::G_FRAME_INDEX)
I.getOperand(1).ChangeToFrameIndex(PtrMI->getOperand(1).getIndex());
I.addOperand(MachineOperand::CreateImm(Offset));
// If we're storing a 0, use WZR/XZR.
if (auto CVal = getConstantVRegVal(ValReg, MRI)) {
if (*CVal == 0 && Opcode == TargetOpcode::G_STORE) {
if (I.getOpcode() == AArch64::STRWui)
I.getOperand(0).setReg(AArch64::WZR);
else if (I.getOpcode() == AArch64::STRXui)
I.getOperand(0).setReg(AArch64::XZR);
}
}
if (IsZExtLoad) {
// The zextload from a smaller type to i32 should be handled by the importer.
if (MRI.getType(ValReg).getSizeInBits() != 64)
return false;
// If we have a ZEXTLOAD then change the load's type to be a narrower reg
//and zero_extend with SUBREG_TO_REG.
Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
Register DstReg = I.getOperand(0).getReg();
I.getOperand(0).setReg(LdReg);
MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {})
.addImm(0)
.addUse(LdReg)
.addImm(AArch64::sub_32);
constrainSelectedInstRegOperands(I, TII, TRI, RBI);
return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass,
MRI);
}
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
case TargetOpcode::G_SMULH:
case TargetOpcode::G_UMULH: {
// Reject the various things we don't support yet.
if (unsupportedBinOp(I, RBI, MRI, TRI))
return false;
const Register DefReg = I.getOperand(0).getReg();
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
if (RB.getID() != AArch64::GPRRegBankID) {
LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
return false;
}
if (Ty != LLT::scalar(64)) {
LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
<< ", expected: " << LLT::scalar(64) << '\n');
return false;
}
unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr
: AArch64::UMULHrr;
I.setDesc(TII.get(NewOpc));
// Now that we selected an opcode, we need to constrain the register
// operands to use appropriate classes.
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
case TargetOpcode::G_FADD:
case TargetOpcode::G_FSUB:
case TargetOpcode::G_FMUL:
case TargetOpcode::G_FDIV:
case TargetOpcode::G_ASHR:
if (MRI.getType(I.getOperand(0).getReg()).isVector())
return selectVectorASHR(I, MRI);
LLVM_FALLTHROUGH;
case TargetOpcode::G_SHL:
if (Opcode == TargetOpcode::G_SHL &&
MRI.getType(I.getOperand(0).getReg()).isVector())
return selectVectorSHL(I, MRI);
LLVM_FALLTHROUGH;
case TargetOpcode::G_OR:
case TargetOpcode::G_LSHR: {
// Reject the various things we don't support yet.
if (unsupportedBinOp(I, RBI, MRI, TRI))
return false;
const unsigned OpSize = Ty.getSizeInBits();
const Register DefReg = I.getOperand(0).getReg();
const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize);
if (NewOpc == I.getOpcode())
return false;
I.setDesc(TII.get(NewOpc));
// FIXME: Should the type be always reset in setDesc?
// Now that we selected an opcode, we need to constrain the register
// operands to use appropriate classes.
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
case TargetOpcode::G_GEP: {
MachineIRBuilder MIRBuilder(I);
emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2),
MIRBuilder);
I.eraseFromParent();
return true;
}
case TargetOpcode::G_UADDO: {
// TODO: Support other types.
unsigned OpSize = Ty.getSizeInBits();
if (OpSize != 32 && OpSize != 64) {
LLVM_DEBUG(
dbgs()
<< "G_UADDO currently only supported for 32 and 64 b types.\n");
return false;
}
// TODO: Support vectors.
if (Ty.isVector()) {
LLVM_DEBUG(dbgs() << "G_UADDO currently only supported for scalars.\n");
return false;
}
// Add and set the set condition flag.
unsigned AddsOpc = OpSize == 32 ? AArch64::ADDSWrr : AArch64::ADDSXrr;
MachineIRBuilder MIRBuilder(I);
auto AddsMI = MIRBuilder.buildInstr(
AddsOpc, {I.getOperand(0).getReg()},
{I.getOperand(2).getReg(), I.getOperand(3).getReg()});
constrainSelectedInstRegOperands(*AddsMI, TII, TRI, RBI);
// Now, put the overflow result in the register given by the first operand
// to the G_UADDO. CSINC increments the result when the predicate is false,
// so to get the increment when it's true, we need to use the inverse. In
// this case, we want to increment when carry is set.
auto CsetMI = MIRBuilder
.buildInstr(AArch64::CSINCWr, {I.getOperand(1).getReg()},
{Register(AArch64::WZR), Register(AArch64::WZR)})
.addImm(getInvertedCondCode(AArch64CC::HS));
constrainSelectedInstRegOperands(*CsetMI, TII, TRI, RBI);
I.eraseFromParent();
return true;
}
case TargetOpcode::G_PTR_MASK: {
uint64_t Align = I.getOperand(2).getImm();
if (Align >= 64 || Align == 0)
return false;
uint64_t Mask = ~((1ULL << Align) - 1);
I.setDesc(TII.get(AArch64::ANDXri));
I.getOperand(2).setImm(AArch64_AM::encodeLogicalImmediate(Mask, 64));
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
case TargetOpcode::G_PTRTOINT:
case TargetOpcode::G_TRUNC: {
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
const Register DstReg = I.getOperand(0).getReg();
const Register SrcReg = I.getOperand(1).getReg();
const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
if (DstRB.getID() != SrcRB.getID()) {
LLVM_DEBUG(
dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
return false;
}
if (DstRB.getID() == AArch64::GPRRegBankID) {
const TargetRegisterClass *DstRC =
getRegClassForTypeOnBank(DstTy, DstRB, RBI);
if (!DstRC)
return false;
const TargetRegisterClass *SrcRC =
getRegClassForTypeOnBank(SrcTy, SrcRB, RBI);
if (!SrcRC)
return false;
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
return false;
}
if (DstRC == SrcRC) {
// Nothing to be done
} else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) &&
SrcTy == LLT::scalar(64)) {
llvm_unreachable("TableGen can import this case");
return false;
} else if (DstRC == &AArch64::GPR32RegClass &&
SrcRC == &AArch64::GPR64RegClass) {
I.getOperand(1).setSubReg(AArch64::sub_32);
} else {
LLVM_DEBUG(
dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
return false;
}
I.setDesc(TII.get(TargetOpcode::COPY));
return true;
} else if (DstRB.getID() == AArch64::FPRRegBankID) {
if (DstTy == LLT::vector(4, 16) && SrcTy == LLT::vector(4, 32)) {
I.setDesc(TII.get(AArch64::XTNv4i16));
constrainSelectedInstRegOperands(I, TII, TRI, RBI);
return true;
}
if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
MachineIRBuilder MIB(I);
MachineInstr *Extract = emitExtractVectorElt(
DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
if (!Extract)
return false;
I.eraseFromParent();
return true;
}
}
return false;
}
case TargetOpcode::G_ANYEXT: {
const Register DstReg = I.getOperand(0).getReg();
const Register SrcReg = I.getOperand(1).getReg();
const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
if (RBDst.getID() != AArch64::GPRRegBankID) {
LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
<< ", expected: GPR\n");
return false;
}
const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
if (RBSrc.getID() != AArch64::GPRRegBankID) {
LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
<< ", expected: GPR\n");
return false;
}
const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
if (DstSize == 0) {
LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
return false;
}
if (DstSize != 64 && DstSize > 32) {
LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
<< ", expected: 32 or 64\n");
return false;
}
// At this point G_ANYEXT is just like a plain COPY, but we need
// to explicitly form the 64-bit value if any.
if (DstSize > 32) {
Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
.addDef(ExtSrc)
.addImm(0)
.addUse(SrcReg)
.addImm(AArch64::sub_32);
I.getOperand(1).setReg(ExtSrc);
}
return selectCopy(I, TII, MRI, TRI, RBI);
}
case TargetOpcode::G_ZEXT:
case TargetOpcode::G_SEXT: {
unsigned Opcode = I.getOpcode();
const bool IsSigned = Opcode == TargetOpcode::G_SEXT;
const Register DefReg = I.getOperand(0).getReg();
const Register SrcReg = I.getOperand(1).getReg();
const LLT DstTy = MRI.getType(DefReg);
const LLT SrcTy = MRI.getType(SrcReg);
unsigned DstSize = DstTy.getSizeInBits();
unsigned SrcSize = SrcTy.getSizeInBits();
assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
AArch64::GPRRegBankID &&
"Unexpected ext regbank");
MachineIRBuilder MIB(I);
MachineInstr *ExtI;
if (DstTy.isVector())
return false; // Should be handled by imported patterns.
// First check if we're extending the result of a load which has a dest type
// smaller than 32 bits, then this zext is redundant. GPR32 is the smallest
// GPR register on AArch64 and all loads which are smaller automatically
// zero-extend the upper bits. E.g.
// %v(s8) = G_LOAD %p, :: (load 1)
// %v2(s32) = G_ZEXT %v(s8)
if (!IsSigned) {
auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI);
if (LoadMI &&
RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID) {
const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
unsigned BytesLoaded = MemOp->getSize();
if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
return selectCopy(I, TII, MRI, TRI, RBI);
}
}
if (DstSize == 64) {
// FIXME: Can we avoid manually doing this?
if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, MRI)) {
LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
<< " operand\n");
return false;
}
auto SubregToReg =
MIB.buildInstr(AArch64::SUBREG_TO_REG, {&AArch64::GPR64RegClass}, {})
.addImm(0)
.addUse(SrcReg)
.addImm(AArch64::sub_32);
ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
{DefReg}, {SubregToReg})
.addImm(0)
.addImm(SrcSize - 1);
} else if (DstSize <= 32) {
ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
{DefReg}, {SrcReg})
.addImm(0)
.addImm(SrcSize - 1);
} else {
return false;
}
constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
I.eraseFromParent();
return true;
}
case TargetOpcode::G_SITOFP:
case TargetOpcode::G_UITOFP:
case TargetOpcode::G_FPTOSI:
case TargetOpcode::G_FPTOUI: {
const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
SrcTy = MRI.getType(I.getOperand(1).getReg());
const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy);
if (NewOpc == Opcode)
return false;
I.setDesc(TII.get(NewOpc));
constrainSelectedInstRegOperands(I, TII, TRI, RBI);
return true;
}
case TargetOpcode::G_INTTOPTR:
// The importer is currently unable to import pointer types since they
// didn't exist in SelectionDAG.
return selectCopy(I, TII, MRI, TRI, RBI);
case TargetOpcode::G_BITCAST:
// Imported SelectionDAG rules can handle every bitcast except those that
// bitcast from a type to the same type. Ideally, these shouldn't occur
// but we might not run an optimizer that deletes them. The other exception
// is bitcasts involving pointer types, as SelectionDAG has no knowledge
// of them.
return selectCopy(I, TII, MRI, TRI, RBI);
case TargetOpcode::G_SELECT: {
if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) {
LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty
<< ", expected: " << LLT::scalar(1) << '\n');
return false;
}
const Register CondReg = I.getOperand(1).getReg();
const Register TReg = I.getOperand(2).getReg();
const Register FReg = I.getOperand(3).getReg();
if (tryOptSelect(I))
return true;
Register CSelOpc = selectSelectOpc(I, MRI, RBI);
MachineInstr &TstMI =
*BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ANDSWri))
.addDef(AArch64::WZR)
.addUse(CondReg)
.addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
MachineInstr &CSelMI = *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CSelOpc))
.addDef(I.getOperand(0).getReg())
.addUse(TReg)
.addUse(FReg)
.addImm(AArch64CC::NE);
constrainSelectedInstRegOperands(TstMI, TII, TRI, RBI);
constrainSelectedInstRegOperands(CSelMI, TII, TRI, RBI);
I.eraseFromParent();
return true;
}
case TargetOpcode::G_ICMP: {
if (Ty.isVector())
return selectVectorICmp(I, MRI);
if (Ty != LLT::scalar(32)) {
LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
<< ", expected: " << LLT::scalar(32) << '\n');
return false;
}
MachineIRBuilder MIRBuilder(I);
if (!emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1),
MIRBuilder))
return false;
emitCSetForICMP(I.getOperand(0).getReg(), I.getOperand(1).getPredicate(),
MIRBuilder);
I.eraseFromParent();
return true;
}
case TargetOpcode::G_FCMP: {
if (Ty != LLT::scalar(32)) {
LLVM_DEBUG(dbgs() << "G_FCMP result has type: " << Ty
<< ", expected: " << LLT::scalar(32) << '\n');
return false;
}
unsigned CmpOpc = selectFCMPOpc(I, MRI);
if (!CmpOpc)
return false;
// FIXME: regbank
AArch64CC::CondCode CC1, CC2;
changeFCMPPredToAArch64CC(
(CmpInst::Predicate)I.getOperand(1).getPredicate(), CC1, CC2);
// Partially build the compare. Decide if we need to add a use for the
// third operand based off whether or not we're comparing against 0.0.
auto CmpMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(CmpOpc))
.addUse(I.getOperand(2).getReg());
// If we don't have an immediate compare, then we need to add a use of the
// register which wasn't used for the immediate.
// Note that the immediate will always be the last operand.
if (CmpOpc != AArch64::FCMPSri && CmpOpc != AArch64::FCMPDri)
CmpMI = CmpMI.addUse(I.getOperand(3).getReg());
const Register DefReg = I.getOperand(0).getReg();
Register Def1Reg = DefReg;
if (CC2 != AArch64CC::AL)
Def1Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
MachineInstr &CSetMI =
*BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr))
.addDef(Def1Reg)
.addUse(AArch64::WZR)
.addUse(AArch64::WZR)
.addImm(getInvertedCondCode(CC1));
if (CC2 != AArch64CC::AL) {
Register Def2Reg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
MachineInstr &CSet2MI =
*BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CSINCWr))
.addDef(Def2Reg)
.addUse(AArch64::WZR)
.addUse(AArch64::WZR)
.addImm(getInvertedCondCode(CC2));
MachineInstr &OrMI =
*BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::ORRWrr))
.addDef(DefReg)
.addUse(Def1Reg)
.addUse(Def2Reg);
constrainSelectedInstRegOperands(OrMI, TII, TRI, RBI);
constrainSelectedInstRegOperands(CSet2MI, TII, TRI, RBI);
}
constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
constrainSelectedInstRegOperands(CSetMI, TII, TRI, RBI);
I.eraseFromParent();
return true;
}
case TargetOpcode::G_VASTART:
return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
: selectVaStartAAPCS(I, MF, MRI);
case TargetOpcode::G_INTRINSIC:
return selectIntrinsic(I, MRI);
case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
return selectIntrinsicWithSideEffects(I, MRI);
case TargetOpcode::G_IMPLICIT_DEF: {
I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
const Register DstReg = I.getOperand(0).getReg();
const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
const TargetRegisterClass *DstRC =
getRegClassForTypeOnBank(DstTy, DstRB, RBI);
RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
return true;
}
case TargetOpcode::G_BLOCK_ADDR: {
if (TM.getCodeModel() == CodeModel::Large) {
materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0);
I.eraseFromParent();
return true;
} else {
I.setDesc(TII.get(AArch64::MOVaddrBA));
auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA),
I.getOperand(0).getReg())
.addBlockAddress(I.getOperand(1).getBlockAddress(),
/* Offset */ 0, AArch64II::MO_PAGE)
.addBlockAddress(
I.getOperand(1).getBlockAddress(), /* Offset */ 0,
AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
I.eraseFromParent();
return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
}
}
case TargetOpcode::G_INTRINSIC_TRUNC:
return selectIntrinsicTrunc(I, MRI);
case TargetOpcode::G_INTRINSIC_ROUND:
return selectIntrinsicRound(I, MRI);
case TargetOpcode::G_BUILD_VECTOR:
return selectBuildVector(I, MRI);
case TargetOpcode::G_MERGE_VALUES:
return selectMergeValues(I, MRI);
case TargetOpcode::G_UNMERGE_VALUES:
return selectUnmergeValues(I, MRI);
case TargetOpcode::G_SHUFFLE_VECTOR:
return selectShuffleVector(I, MRI);
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
return selectExtractElt(I, MRI);
case TargetOpcode::G_INSERT_VECTOR_ELT:
return selectInsertElt(I, MRI);
case TargetOpcode::G_CONCAT_VECTORS:
return selectConcatVectors(I, MRI);
case TargetOpcode::G_JUMP_TABLE:
return selectJumpTable(I, MRI);
}
return false;
}
/// Select a G_BRJT (branch through a jump table).
///
/// Operand 0 is the jump table address register, operand 1 the jump table
/// index (JTI) and operand 2 the register holding the entry index. The
/// instruction is replaced by a JumpTableDest32 pseudo, which defines the
/// branch target (plus a scratch register), followed by an indirect BR to
/// that target.
///
/// \returns true if the emitted JumpTableDest32 could be constrained.
bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
                                            MachineRegisterInfo &MRI) const {
  assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
  Register JTAddr = I.getOperand(0).getReg();
  unsigned JTI = I.getOperand(1).getIndex();
  Register Index = I.getOperand(2).getReg();
  MachineIRBuilder MIB(I);

  Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
  Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
  auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
                                      {TargetReg, ScratchReg}, {JTAddr, Index})
                           .addJumpTableIndex(JTI);
  // Build the indirect branch.
  MIB.buildInstr(AArch64::BR, {}, {TargetReg});
  I.eraseFromParent();
  // JTAddr and Index come straight from the generic instruction, so they may
  // not have a register class yet. Constrain the pseudo's operands, as
  // selectJumpTable does for MOVaddrJT.
  return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
}
/// Select a G_JUMP_TABLE by materializing the table address with a
/// MOVaddrJT pseudo carrying the page and page-offset references to the
/// jump table index found in operand 1.
///
/// \returns the result of constraining the new instruction's operands.
bool AArch64InstructionSelector::selectJumpTable(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
  assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");

  const Register Dest = I.getOperand(0).getReg();
  const unsigned TableIdx = I.getOperand(1).getIndex();

  // The MOVaddrJT pseudo is expanded to an ADRP + ADD pair later on.
  MachineIRBuilder Builder(I);
  auto AddrMI = Builder.buildInstr(AArch64::MOVaddrJT, {Dest}, {});
  AddrMI.addJumpTableIndex(TableIdx, AArch64II::MO_PAGE);
  AddrMI.addJumpTableIndex(TableIdx,
                           AArch64II::MO_NC | AArch64II::MO_PAGEOFF);

  I.eraseFromParent();
  return constrainSelectedInstRegOperands(*AddrMI, TII, TRI, RBI);
}
/// Select a G_INTRINSIC_TRUNC by mutating it in place into the FRINTZ
/// variant that matches the type of operand 0.
///
/// Scalars of 16, 32 and 64 bits and the vector shapes v4f16, v8f16, v2f32,
/// v4f32 and v2f64 are handled. Any other type makes selection fail.
///
/// \returns false if no opcode exists for the type; otherwise the result of
/// constraining the instruction's register operands.
bool AArch64InstructionSelector::selectIntrinsicTrunc(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());

  // Select the correct opcode.
  unsigned Opc = 0;
  if (!SrcTy.isVector()) {
    switch (SrcTy.getSizeInBits()) {
    default:
      // Unsupported scalar width: leave Opc at 0 so that we bail out below
      // instead of silently selecting the half-precision instruction.
      break;
    case 16:
      Opc = AArch64::FRINTZHr;
      break;
    case 32:
      Opc = AArch64::FRINTZSr;
      break;
    case 64:
      Opc = AArch64::FRINTZDr;
      break;
    }
  } else {
    unsigned NumElts = SrcTy.getNumElements();
    switch (SrcTy.getElementType().getSizeInBits()) {
    default:
      break;
    case 16:
      if (NumElts == 4)
        Opc = AArch64::FRINTZv4f16;
      else if (NumElts == 8)
        Opc = AArch64::FRINTZv8f16;
      break;
    case 32:
      if (NumElts == 2)
        Opc = AArch64::FRINTZv2f32;
      else if (NumElts == 4)
        Opc = AArch64::FRINTZv4f32;
      break;
    case 64:
      if (NumElts == 2)
        Opc = AArch64::FRINTZv2f64;
      break;
    }
  }

  if (!Opc) {
    // Didn't get an opcode above, bail.
    LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_TRUNC!\n");
    return false;
  }

  // Legalization would have set us up perfectly for this; we just need to
  // set the opcode and move on.
  I.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
/// Select a G_INTRINSIC_ROUND by mutating it in place into the FRINTA
/// variant that matches the type of operand 0.
///
/// Scalars of 16, 32 and 64 bits and the vector shapes v4f16, v8f16, v2f32,
/// v4f32 and v2f64 are handled. Any other type makes selection fail.
///
/// \returns false if no opcode exists for the type; otherwise the result of
/// constraining the instruction's register operands.
bool AArch64InstructionSelector::selectIntrinsicRound(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());

  // Select the correct opcode.
  unsigned Opc = 0;
  if (!SrcTy.isVector()) {
    switch (SrcTy.getSizeInBits()) {
    default:
      // Unsupported scalar width: leave Opc at 0 so that we bail out below
      // instead of silently selecting the half-precision instruction.
      break;
    case 16:
      Opc = AArch64::FRINTAHr;
      break;
    case 32:
      Opc = AArch64::FRINTASr;
      break;
    case 64:
      Opc = AArch64::FRINTADr;
      break;
    }
  } else {
    unsigned NumElts = SrcTy.getNumElements();
    switch (SrcTy.getElementType().getSizeInBits()) {
    default:
      break;
    case 16:
      if (NumElts == 4)
        Opc = AArch64::FRINTAv4f16;
      else if (NumElts == 8)
        Opc = AArch64::FRINTAv8f16;
      break;
    case 32:
      if (NumElts == 2)
        Opc = AArch64::FRINTAv2f32;
      else if (NumElts == 4)
        Opc = AArch64::FRINTAv4f32;
      break;
    case 64:
      if (NumElts == 2)
        Opc = AArch64::FRINTAv2f64;
      break;
    }
  }

  if (!Opc) {
    // Didn't get an opcode above, bail.
    LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_ROUND!\n");
    return false;
  }

  // Legalization would have set us up perfectly for this; we just need to
  // set the opcode and move on.
  I.setDesc(TII.get(Opc));
  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
/// Select a vector G_ICMP.
///
/// Operand 0 is the destination, operand 1 the predicate and operands 2 and 3
/// the values being compared. The compare opcode is looked up in a table
/// indexed by source element size, destination element count and predicate.
/// The "less than" style predicates reuse the "greater than" opcodes with the
/// operands swapped, and 'ne' is produced by inverting the result of 'eq'.
///
/// \returns false if the type/predicate combination has no opcode or no
/// register class could be found for the source type, true otherwise.
bool AArch64InstructionSelector::selectVectorICmp(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  Register DstReg = I.getOperand(0).getReg();
  LLT DstTy = MRI.getType(DstReg);
  Register SrcReg = I.getOperand(2).getReg();
  Register Src2Reg = I.getOperand(3).getReg();
  LLT SrcTy = MRI.getType(SrcReg);

  unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
  unsigned NumElts = DstTy.getNumElements();

  // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b
  // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16
  // Third index is cc opcode:
  // 0 == eq
  // 1 == ugt
  // 2 == uge
  // 3 == ult
  // 4 == ule
  // 5 == sgt
  // 6 == sge
  // 7 == slt
  // 8 == sle
  // ne is done by negating 'eq' result.

  // This table below assumes that for some comparisons the operands will be
  // commuted.
  // ult op == commute + ugt op
  // ule op == commute + uge op
  // slt op == commute + sgt op
  // sle op == commute + sge op
  unsigned PredIdx = 0;
  bool SwapOperands = false;
  CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
  switch (Pred) {
  case CmpInst::ICMP_NE:
  case CmpInst::ICMP_EQ:
    PredIdx = 0;
    break;
  case CmpInst::ICMP_UGT:
    PredIdx = 1;
    break;
  case CmpInst::ICMP_UGE:
    PredIdx = 2;
    break;
  case CmpInst::ICMP_ULT:
    PredIdx = 3;
    SwapOperands = true;
    break;
  case CmpInst::ICMP_ULE:
    PredIdx = 4;
    SwapOperands = true;
    break;
  case CmpInst::ICMP_SGT:
    PredIdx = 5;
    break;
  case CmpInst::ICMP_SGE:
    PredIdx = 6;
    break;
  case CmpInst::ICMP_SLT:
    PredIdx = 7;
    SwapOperands = true;
    break;
  case CmpInst::ICMP_SLE:
    PredIdx = 8;
    SwapOperands = true;
    break;
  default:
    llvm_unreachable("Unhandled icmp predicate");
  }

  // This table obviously should be tablegen'd when we have our GISel native
  // tablegen selector.
  static const unsigned NumEltSizes = 4;
  static const unsigned NumEltCounts = 4;
  static const unsigned OpcTable[NumEltSizes][NumEltCounts][9] = {
      // 8-bit elements.
      {
          {0, 0, 0, 0, 0, 0, 0, 0, 0}, // v2: invalid
          {0, 0, 0, 0, 0, 0, 0, 0, 0}, // v4: invalid
          {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8,
           AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8,
           AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8},
          {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8,
           AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8,
           AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8},
      },
      // 16-bit elements.
      {
          {0, 0, 0, 0, 0, 0, 0, 0, 0}, // v2: invalid
          {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16,
           AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16,
           AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16},
          {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16,
           AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16,
           AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16},
          {0, 0, 0, 0, 0, 0, 0, 0, 0}, // v16: invalid
      },
      // 32-bit elements.
      {
          {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32,
           AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32,
           AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32},
          {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32,
           AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32,
           AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32},
          {0, 0, 0, 0, 0, 0, 0, 0, 0}, // v8: invalid
          {0, 0, 0, 0, 0, 0, 0, 0, 0}, // v16: invalid
      },
      // 64-bit elements.
      {
          {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64,
           AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64,
           AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64},
          {0, 0, 0, 0, 0, 0, 0, 0, 0}, // v4: invalid
          {0, 0, 0, 0, 0, 0, 0, 0, 0}, // v8: invalid
          {0, 0, 0, 0, 0, 0, 0, 0, 0}, // v16: invalid
      },
  };

  unsigned EltIdx = Log2_32(SrcEltSize / 8);
  unsigned NumEltsIdx = Log2_32(NumElts / 2);
  // Log2_32(0) yields -1, which wraps to a huge unsigned value, and element
  // sizes above 64 bits or more than 16 elements also fall outside the table.
  // Reject those shapes rather than reading out of bounds.
  if (EltIdx >= NumEltSizes || NumEltsIdx >= NumEltCounts) {
    LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode");
    return false;
  }
  unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx];
  if (!Opc) {
    LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode");
    return false;
  }

  const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI);
  const TargetRegisterClass *SrcRC =
      getRegClassForTypeOnBank(SrcTy, VecRB, RBI, true);
  if (!SrcRC) {
    LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
    return false;
  }

  // Only 'ne' needs an inverting NOT; pick the 128-bit form for 128-bit
  // sources and the 64-bit form otherwise.
  unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0;
  if (SrcTy.getSizeInBits() == 128)
    NotOpc = NotOpc ? AArch64::NOTv16i8 : 0;

  if (SwapOperands)
    std::swap(SrcReg, Src2Reg);

  MachineIRBuilder MIB(I);
  auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg});
  constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);

  // Invert if we had a 'ne' cc.
  if (NotOpc) {
    Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp});
    constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
  } else {
    MIB.buildCopy(DstReg, Cmp.getReg(0));
  }
  RBI.constrainGenericRegister(DstReg, *SrcRC, MRI);
  I.eraseFromParent();
  return true;
}
/// Place \p Scalar into the low subregister of a fresh register of class
/// \p DstRC by emitting an IMPLICIT_DEF followed by an INSERT_SUBREG.
///
/// \p EltSize selects the subregister index: 16 -> hsub, 32 -> ssub,
/// 64 -> dsub.
///
/// \returns the INSERT_SUBREG instruction, or nullptr for any other
/// \p EltSize. Note that the IMPLICIT_DEF is emitted even in that case.
MachineInstr *AArch64InstructionSelector::emitScalarToVector(
    unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
    MachineIRBuilder &MIRBuilder) const {
  auto ImpDef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});

  // Map the element size onto the subregister that will receive the scalar.
  unsigned SubIdx = 0;
  switch (EltSize) {
  case 16:
    SubIdx = AArch64::hsub;
    break;
  case 32:
    SubIdx = AArch64::ssub;
    break;
  case 64:
    SubIdx = AArch64::dsub;
    break;
  default:
    return nullptr;
  }

  auto Insert =
      MIRBuilder
          .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {ImpDef, Scalar})
          .addImm(SubIdx);
  constrainSelectedInstRegOperands(*ImpDef, TII, TRI, RBI);
  constrainSelectedInstRegOperands(*Insert, TII, TRI, RBI);
  return &*Insert;
}
/// Select a two-source scalar G_MERGE_VALUES.
///
/// Two cases are handled:
///  * two 64-bit sources merged into an s128, done with two lane inserts
///    into an IMPLICIT_DEF;
///  * two 32-bit sources on the GPR bank merged into a 64-bit value, done by
///    widening both sources with SUBREG_TO_REG and combining them with
///    BFMXri.
///
/// \returns false for any other shape, or if a lane insert could not be
/// emitted.
bool AArch64InstructionSelector::selectMergeValues(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
  const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
  const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
  assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
  const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);

  // Only merges of exactly two sources are supported.
  if (I.getNumOperands() != 3)
    return false;

  // Merging 2 s64s into an s128.
  if (DstTy == LLT::scalar(128)) {
    if (SrcTy.getSizeInBits() != 64)
      return false;

    MachineIRBuilder MIB(I);
    const Register Result = I.getOperand(0).getReg();
    const Register LowSrc = I.getOperand(1).getReg();
    const Register HighSrc = I.getOperand(2).getReg();

    auto Undef = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {});
    MachineInstr *LowIns =
        emitLaneInsert(None, Undef.getReg(0), LowSrc, /* LaneIdx */ 0, RB, MIB);
    if (!LowIns)
      return false;

    MachineInstr *HighIns =
        emitLaneInsert(Result, LowIns->getOperand(0).getReg(), HighSrc,
                       /* LaneIdx */ 1, RB, MIB);
    if (!HighIns)
      return false;

    constrainSelectedInstRegOperands(*LowIns, TII, TRI, RBI);
    constrainSelectedInstRegOperands(*HighIns, TII, TRI, RBI);
    I.eraseFromParent();
    return true;
  }

  // What remains is the s32 + s32 -> s64 merge on GPRs.
  if (RB.getID() != AArch64::GPRRegBankID)
    return false;
  if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
    return false;

  MachineBasicBlock &MBB = *I.getParent();

  // Widen a 32-bit source into a new 64-bit register via SUBREG_TO_REG.
  auto WidenTo64 = [&](Register Src32) -> MachineInstr * {
    Register Wide = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
    return BuildMI(MBB, I, I.getDebugLoc(),
                   TII.get(TargetOpcode::SUBREG_TO_REG))
        .addDef(Wide)
        .addImm(0)
        .addUse(Src32)
        .addImm(AArch64::sub_32);
  };

  MachineInstr *LowWide = WidenTo64(I.getOperand(1).getReg());
  // The second scalar also has to be extended before BFM can consume it.
  MachineInstr *HighWide = WidenTo64(I.getOperand(2).getReg());

  MachineInstr *Merge =
      BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::BFMXri))
          .addDef(I.getOperand(0).getReg())
          .addUse(LowWide->getOperand(0).getReg())
          .addUse(HighWide->getOperand(0).getReg())
          .addImm(32)
          .addImm(31);

  constrainSelectedInstRegOperands(*LowWide, TII, TRI, RBI);
  constrainSelectedInstRegOperands(*HighWide, TII, TRI, RBI);
  constrainSelectedInstRegOperands(*Merge, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}
/// Pick the lane copy opcode and the matching extract subregister index for
/// vector elements of \p EltSize bits (16, 32 or 64).
///
/// \returns true and fills in \p CopyOpc and \p ExtractSubReg on success;
/// returns false and leaves both untouched for any other size.
static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
                              const unsigned EltSize) {
  if (EltSize == 16) {
    CopyOpc = AArch64::CPYi16;
    ExtractSubReg = AArch64::hsub;
    return true;
  }
  if (EltSize == 32) {
    CopyOpc = AArch64::CPYi32;
    ExtractSubReg = AArch64::ssub;
    return true;
  }
  if (EltSize == 64) {
    CopyOpc = AArch64::CPYi64;
    ExtractSubReg = AArch64::dsub;
    return true;
  }

  // No lane copy is known for this size; let the caller bail out.
  LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
  return false;
}
/// Emit the instructions that extract lane \p LaneIdx of \p VecReg as a
/// value of type \p ScalarTy on bank \p DstRB.
///
/// If \p DstReg is not provided, a new virtual register is created for the
/// result. Lane 0 is extracted with a subregister COPY; other lanes use a
/// lane copy instruction, first widening the source to an FPR128 register
/// when it is not already 128 bits wide.
///
/// \returns the emitted copy instruction, or nullptr if no opcode or
/// register class could be determined or the widening failed.
MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
    Optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
    Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();

  unsigned LaneCopyOpc = 0;
  unsigned SubRegIdx = 0;
  const bool HaveOpc =
      getLaneCopyOpcode(LaneCopyOpc, SubRegIdx, ScalarTy.getSizeInBits());
  if (!HaveOpc) {
    LLVM_DEBUG(
        dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
    return nullptr;
  }

  const TargetRegisterClass *ResultRC =
      getRegClassForTypeOnBank(ScalarTy, DstRB, RBI, true);
  if (!ResultRC) {
    LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
    return nullptr;
  }

  const RegisterBank &SourceRB = *RBI.getRegBank(VecReg, MRI, TRI);
  const LLT SourceTy = MRI.getType(VecReg);
  const TargetRegisterClass *SourceRC =
      getRegClassForTypeOnBank(SourceTy, SourceRB, RBI, true);
  if (!SourceRC) {
    LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
    return nullptr;
  }

  // Make up a destination if the caller did not hand us one.
  if (!DstReg)
    DstReg = MRI.createVirtualRegister(ResultRC);

  // Lane 0 is simply a subregister of the vector, so a COPY is enough.
  if (LaneIdx == 0) {
    auto SubCopy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {})
                       .addReg(VecReg, 0, SubRegIdx);
    RBI.constrainGenericRegister(*DstReg, *ResultRC, MRI);
    return &*SubCopy;
  }

  // Lane copies read from 128-bit registers. A narrower source is first
  // inserted into an undefined 128-bit register.
  Register CopyFrom = VecReg;
  if (SourceTy.getSizeInBits() != 128) {
    MachineInstr *Widened = emitScalarToVector(
        SourceTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder);
    if (!Widened)
      return nullptr;
    CopyFrom = Widened->getOperand(0).getReg();
  }

  MachineInstr *LaneCopy =
      MIRBuilder.buildInstr(LaneCopyOpc, {*DstReg}, {CopyFrom}).addImm(LaneIdx);
  constrainSelectedInstRegOperands(*LaneCopy, TII, TRI, RBI);

  // The destination still needs to be constrained explicitly.
  RBI.constrainGenericRegister(*DstReg, *ResultRC, MRI);
  return LaneCopy;
}
/// Select a G_EXTRACT_VECTOR_ELT whose destination lives on the FPR bank and
/// whose lane index (operand 2) resolves to a constant.
///
/// \returns false if the destination is not on the FPR bank, the lane index
/// is not a known constant, or the extract could not be emitted.
bool AArch64InstructionSelector::selectExtractElt(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
         "unexpected opcode!");
  Register DstReg = I.getOperand(0).getReg();
  const Register VecReg = I.getOperand(1).getReg();
  const LLT EltTy = MRI.getType(DstReg);
  const LLT VecTy = MRI.getType(VecReg);
  (void)VecTy;
  assert(VecTy.getSizeInBits() >= EltTy.getSizeInBits() &&
         "source register size too small!");
  assert(EltTy.isScalar() && "cannot extract vector into vector!");

  // The lane index decides which copy gets emitted.
  MachineOperand &LaneIdxOp = I.getOperand(2);
  assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");

  const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, TRI);
  if (DstBank->getID() != AArch64::FPRRegBankID) {
    LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
    return false;
  }

  // Only constant lane indices are supported.
  auto LaneConst = getConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI);
  if (!LaneConst)
    return false;
  unsigned LaneIdx = LaneConst->Value;

  MachineIRBuilder MIRBuilder(I);
  const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
  if (!emitExtractVectorElt(DstReg, DstRB, EltTy, VecReg, LaneIdx, MIRBuilder))
    return false;

  I.eraseFromParent();
  return true;
}
/// Select a G_UNMERGE_VALUES that splits a vector into smaller vectors.
///
/// The last operand is the source; every preceding operand is a destination.
/// Each destination is produced by treating the sub-vector as a scalar and
/// extracting it from the lane that matches its operand position.
///
/// \returns false if the source is wider than 128 bits or any extract could
/// not be emitted.
bool AArch64InstructionSelector::selectSplitVectorUnmerge(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  const unsigned NumDsts = I.getNumOperands() - 1;
  const Register WideReg = I.getOperand(NumDsts).getReg();
  const LLT PieceTy = MRI.getType(I.getOperand(0).getReg());
  const LLT WideTy = MRI.getType(WideReg);

  assert(PieceTy.isVector() && "Expected an unmerge into vectors");
  if (WideTy.getSizeInBits() > 128) {
    LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
    return false;
  }

  MachineIRBuilder MIB(I);

  // All pieces are assumed to share the bank of the first destination.
  const RegisterBank &PieceRB =
      *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI);
  for (unsigned Piece = 0; Piece < NumDsts; ++Piece) {
    const Register PieceReg = I.getOperand(Piece).getReg();
    if (!emitExtractVectorElt(PieceReg, PieceRB, PieceTy, WideReg, Piece, MIB))
      return false;
  }

  I.eraseFromParent();
  return true;
}
/// Select a G_UNMERGE_VALUES whose first two operands are on the FPR bank.
///
/// Unmerges into vectors are forwarded to selectSplitVectorUnmerge. For
/// scalar destinations, the first element is taken with a subregister COPY
/// and each remaining element with a lane copy. When the elements do not add
/// up to exactly 128 bits, the source is first inserted into undefined
/// FPR128 registers so that the lane copies have a 128-bit source.
///
/// \returns false if the banks are unsupported, no lane copy opcode exists
/// for the element size, or the first copy's destination cannot be
/// constrained.
bool AArch64InstructionSelector::selectUnmergeValues(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
         "unexpected opcode");

  // TODO: Handle unmerging into GPRs and from scalars to scalars.
  const auto IsOnFPR = [&](unsigned OpIdx) {
    return RBI.getRegBank(I.getOperand(OpIdx).getReg(), MRI, TRI)->getID() ==
           AArch64::FPRRegBankID;
  };
  if (!IsOnFPR(0) || !IsOnFPR(1)) {
    LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
                         "currently unsupported.\n");
    return false;
  }

  // Operands [0, NumElts) are the destinations; operand NumElts is the
  // vector being unpacked.
  const unsigned NumElts = I.getNumOperands() - 1;
  const Register SrcReg = I.getOperand(NumElts).getReg();
  const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
  const LLT WideTy = MRI.getType(SrcReg);
  (void)WideTy;
  assert(WideTy.isVector() && "can only unmerge from vector types!");
  assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
         "source register size too small!");

  if (!NarrowTy.isScalar())
    return selectSplitVectorUnmerge(I, MRI);

  MachineIRBuilder MIB(I);

  // The element size determines both the lane copy opcode and the
  // subregister used for the first element.
  unsigned CopyOpc = 0;
  unsigned ExtractSubReg = 0;
  if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits()))
    return false;

  MachineBasicBlock &MBB = *I.getParent();

  // Registers the copies will read from. The first one serves two copies, so
  // NumElts - 1 of them are enough.
  SmallVector<Register, 4> InsertRegs;
  const unsigned NumInsertRegs = NumElts - 1;

  if (NarrowTy.getSizeInBits() * NumElts == 128) {
    // The source already is a full 128-bit register; copy from it directly.
    InsertRegs.assign(NumInsertRegs, SrcReg);
  } else {
    // Otherwise build, per needed register, an IMPLICIT_DEF plus an
    // INSERT_SUBREG of the source, and remember the resulting register.
    for (unsigned N = 0; N != NumInsertRegs; ++N) {
      const Register UndefReg =
          MRI.createVirtualRegister(&AArch64::FPR128RegClass);
      MachineInstr &UndefMI =
          *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
                   UndefReg);

      const Register WidenedReg =
          MRI.createVirtualRegister(&AArch64::FPR128RegClass);
      MachineInstr &InsertMI =
          *BuildMI(MBB, I, I.getDebugLoc(),
                   TII.get(TargetOpcode::INSERT_SUBREG), WidenedReg)
               .addUse(UndefReg)
               .addUse(SrcReg)
               .addImm(AArch64::dsub);

      constrainSelectedInstRegOperands(UndefMI, TII, TRI, RBI);
      constrainSelectedInstRegOperands(InsertMI, TII, TRI, RBI);
      InsertRegs.push_back(WidenedReg);
    }
  }

  // With the sources in place, emit the copies. The first element is a plain
  // subregister copy.
  const Register FirstDst = I.getOperand(0).getReg();
  auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {FirstDst}, {})
                       .addReg(InsertRegs[0], 0, ExtractSubReg);
  constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);

  // Every other element is a lane copy out of its insert register.
  for (unsigned Lane = 1; Lane < NumElts; ++Lane) {
    const Register LaneDst = I.getOperand(Lane).getReg();
    MachineInstr &LaneCopy =
        *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), LaneDst)
             .addUse(InsertRegs[Lane - 1])
             .addImm(Lane);
    constrainSelectedInstRegOperands(LaneCopy, TII, TRI, RBI);
  }

  // Constraining the first copy's destination is not guaranteed to happen
  // above because of a limitation in constrainOperandRegClass, so reuse the
  // register class that the second destination ended up with.
  const TargetRegisterClass *RC =
      MRI.getRegClassOrNull(I.getOperand(1).getReg());
  if (!RC) {
    LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
    return false;
  }

  RBI.constrainGenericRegister(FirstDst, *RC, MRI);
  I.eraseFromParent();
  return true;
}
/// Select a G_CONCAT_VECTORS by delegating to emitVectorConcat with
/// operand 0 as the destination and operands 1 and 2 as the two halves.
///
/// \returns false if the concat could not be emitted; the original
/// instruction is only erased on success.
bool AArch64InstructionSelector::selectConcatVectors(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
         "Unexpected opcode");
  const Register Result = I.getOperand(0).getReg();
  const Register LowHalf = I.getOperand(1).getReg();
  const Register HighHalf = I.getOperand(2).getReg();

  MachineIRBuilder MIRBuilder(I);
  if (!emitVectorConcat(Result, LowHalf, HighHalf, MIRBuilder))
    return false;

  I.eraseFromParent();
  return true;
}
/// Append the mask indices of the G_SHUFFLE_VECTOR \p I to \p Idxs.
///
/// The mask (operand 3) is expected to be defined by a G_BUILD_VECTOR. Each
/// of its sources contributes one entry: the sign-extended constant when the
/// source is a G_CONSTANT (looking through copies), or None when it is a
/// G_IMPLICIT_DEF.
void AArch64InstructionSelector::collectShuffleMaskIndices(
    MachineInstr &I, MachineRegisterInfo &MRI,
    SmallVectorImpl<Optional<int>> &Idxs) const {
  MachineInstr *MaskDef = MRI.getVRegDef(I.getOperand(3).getReg());
  assert(
      MaskDef->getOpcode() == TargetOpcode::G_BUILD_VECTOR &&
      "G_SHUFFLE_VECTOR should have a constant mask operand as G_BUILD_VECTOR");

  // Operand 0 is the build vector's def; the mask elements follow it.
  const unsigned NumOps = MaskDef->getNumOperands();
  for (unsigned OpIdx = 1; OpIdx < NumOps; ++OpIdx) {
    // Look through copies.
    MachineInstr *EltDef =
        getDefIgnoringCopies(MaskDef->getOperand(OpIdx).getReg(), MRI);
    assert(EltDef && "Could not find vreg def of shufflevec index op");

    if (EltDef->getOpcode() == TargetOpcode::G_CONSTANT) {
      Idxs.push_back(EltDef->getOperand(1).getCImm()->getSExtValue());
      continue;
    }

    // Anything that is not a constant must be an undef.
    assert(EltDef->getOpcode() == TargetOpcode::G_IMPLICIT_DEF);
    Idxs.push_back(None);
  }
}
/// Register \p CPVal in the constant pool of \p MF and return its index.
///
/// The entry uses the preferred alignment of the constant's type, falling
/// back to the type's allocation size when that alignment is reported as 0.
unsigned
AArch64InstructionSelector::emitConstantPoolEntry(Constant *CPVal,
                                                  MachineFunction &MF) const {
  const auto &Layout = MF.getDataLayout();
  Type *ValTy = CPVal->getType();

  unsigned Alignment = Layout.getPrefTypeAlignment(ValTy);
  if (Alignment == 0)
    Alignment = Layout.getTypeAllocSize(ValTy);

  return MF.getConstantPool()->getConstantPoolIndex(CPVal, Alignment);
}
/// Emit an ADRP + load pair that loads \p CPVal from the constant pool.
///
/// Constants with a store size of 16 bytes are loaded with LDRQui into an
/// FPR128 register and 8-byte constants with LDRDui into an FPR64 register.
///
/// \returns the load instruction, or nullptr for any other store size. In
/// that case nothing is emitted and no constant pool entry is created.
MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
    Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
  // Work out how to load the constant before touching the function, so that
  // an unsupported size does not leave an orphaned ADRP or pool entry behind.
  unsigned LoadOpc = 0;
  const TargetRegisterClass *LoadRC = nullptr;
  switch (MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType())) {
  case 16:
    LoadOpc = AArch64::LDRQui;
    LoadRC = &AArch64::FPR128RegClass;
    break;
  case 8:
    LoadOpc = AArch64::LDRDui;
    LoadRC = &AArch64::FPR64RegClass;
    break;
  default:
    LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
                      << *CPVal->getType());
    return nullptr;
  }

  unsigned CPIdx = emitConstantPoolEntry(CPVal, MIRBuilder.getMF());

  // ADRP supplies the page address; the load adds the page offset.
  auto Adrp =
      MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
          .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
  auto Load = MIRBuilder.buildInstr(LoadOpc, {LoadRC}, {Adrp})
                  .addConstantPoolIndex(
                      CPIdx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);

  constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
  constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
  return &*Load;
}
/// Look up how to insert an element of \p EltSize bits coming from register
/// bank \p RB into a vector.
///
/// \returns an <Opcode, SubregIndex> pair. Sources on the GPR bank support 32
/// and 64 bit elements; all other banks support 8, 16, 32 and 64 bit
/// elements. Any other size is unreachable.
static std::pair<unsigned, unsigned>
getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
  unsigned Opc, SubregIdx;
  const bool FromGPR = RB.getID() == AArch64::GPRRegBankID;

  if (FromGPR) {
    switch (EltSize) {
    case 32:
      Opc = AArch64::INSvi32gpr;
      SubregIdx = AArch64::ssub;
      break;
    case 64:
      Opc = AArch64::INSvi64gpr;
      SubregIdx = AArch64::dsub;
      break;
    default:
      llvm_unreachable("invalid elt size!");
    }
  } else {
    switch (EltSize) {
    case 8:
      Opc = AArch64::INSvi8lane;
      SubregIdx = AArch64::bsub;
      break;
    case 16:
      Opc = AArch64::INSvi16lane;
      SubregIdx = AArch64::hsub;
      break;
    case 32:
      Opc = AArch64::INSvi32lane;
      SubregIdx = AArch64::ssub;
      break;
    case 64:
      Opc = AArch64::INSvi64lane;
      SubregIdx = AArch64::dsub;
      break;
    default:
      llvm_unreachable("invalid elt size!");
    }
  }

  return std::make_pair(Opc, SubregIdx);
}
/// Emit an ADD of \p LHS and \p RHS that defines \p DefReg.
///
/// The 32-bit form is used when \p LHS is 32 bits wide and the 64-bit form
/// otherwise. If selectArithImmed matches \p RHS, the immediate form is
/// emitted and the matched operands are rendered onto it; otherwise \p RHS
/// is added as a register use.
///
/// \returns the emitted instruction.
MachineInstr *
AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
                                    MachineOperand &RHS,
                                    MachineIRBuilder &MIRBuilder) const {
  assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();

  const bool Is32Bit = MRI.getType(LHS.getReg()).getSizeInBits() == 32;
  auto ImmFns = selectArithImmed(RHS);

  unsigned Opc;
  if (ImmFns)
    Opc = Is32Bit ? AArch64::ADDWri : AArch64::ADDXri;
  else
    Opc = Is32Bit ? AArch64::ADDWrr : AArch64::ADDXrr;

  auto AddMI = MIRBuilder.buildInstr(Opc, {DefReg}, {LHS.getReg()});

  // Either render the matched immediate operands or fall back to a register.
  if (!ImmFns) {
    AddMI.addUse(RHS.getReg());
  } else {
    for (auto &RenderFn : *ImmFns)
      RenderFn(AddMI);
  }

  constrainSelectedInstRegOperands(*AddMI, TII, TRI, RBI);
  return &*AddMI;
}
/// Emit a CMN of \p LHS and \p RHS, i.e. an ADDS whose result is written to
/// the zero register.
///
/// The 32-bit form (writing WZR) is used when \p LHS is 32 bits wide and the
/// 64-bit form (writing XZR) otherwise. If selectArithImmed matches \p RHS,
/// the immediate form is emitted; otherwise \p RHS is added as a register
/// use.
///
/// \returns the emitted instruction.
MachineInstr *
AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
                                    MachineIRBuilder &MIRBuilder) const {
  assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();

  const bool Is32Bit = MRI.getType(LHS.getReg()).getSizeInBits() == 32;
  auto ImmFns = selectArithImmed(RHS);

  unsigned Opc;
  if (ImmFns)
    Opc = Is32Bit ? AArch64::ADDSWri : AArch64::ADDSXri;
  else
    Opc = Is32Bit ? AArch64::ADDSWrr : AArch64::ADDSXrr;

  // The sum itself is discarded by targeting the zero register.
  Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
  auto CmpMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS.getReg()});

  // Either render the matched immediate operands or fall back to a register.
  if (!ImmFns) {
    CmpMI.addUse(RHS.getReg());
  } else {
    for (auto &RenderFn : *ImmFns)
      RenderFn(CmpMI);
  }

  constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
  return &*CmpMI;
}
/// Emit a TST of \p LHS and \p RHS, i.e. an ANDS whose result is written to
/// the zero register.
///
/// The 32-bit form is used when \p LHS is 32 bits wide and the 64-bit form
/// otherwise. When \p RHS is a known constant that is a valid logical
/// immediate for that register size, it is encoded into the immediate form;
/// otherwise \p RHS is used as a register.
///
/// \returns the emitted instruction.
MachineInstr *
AArch64InstructionSelector::emitTST(const Register &LHS, const Register &RHS,
                                    MachineIRBuilder &MIRBuilder) const {
  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
  const unsigned RegSize = MRI.getType(LHS).getSizeInBits();
  const bool Is32Bit = RegSize == 32;

  // ANDS only accepts logical immediates, so a constant RHS can be folded
  // only when it is encodable as one.
  auto ValAndVReg = getConstantVRegValWithLookThrough(RHS, MRI);
  const bool IsImmForm =
      ValAndVReg.hasValue() &&
      AArch64_AM::isLogicalImmediate(ValAndVReg->Value, RegSize);

  unsigned Opc;
  if (IsImmForm)
    Opc = Is32Bit ? AArch64::ANDSWri : AArch64::ANDSXri;
  else
    Opc = Is32Bit ? AArch64::ANDSWrr : AArch64::ANDSXrr;

  Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
  auto TstMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS});

  if (!IsImmForm) {
    TstMI.addUse(RHS);
  } else {
    TstMI.addImm(
        AArch64_AM::encodeLogicalImmediate(ValAndVReg->Value, RegSize));
  }

  constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
  return &*TstMI;
}
/// Emit an integer compare of \p LHS and \p RHS for \p Predicate.
///
/// tryFoldIntegerCompare is given the first chance to produce the compare.
/// If it does not, a SUBS writing to the zero register is emitted: the
/// 32-bit form for s32 operands and the 64-bit form for s64 or pointer
/// operands, using the immediate variant when selectArithImmed matches
/// \p RHS.
///
/// \returns the emitted compare, or nullptr if the operand type is not one
/// of the above.
MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
    MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
    MachineIRBuilder &MIRBuilder) const {
  assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();

  // Prefer a folded compare when one is available.
  if (MachineInstr *Folded =
          tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
    return Folded;

  // No fold was possible; emit a plain compare.
  const LLT CmpTy = MRI.getType(LHS.getReg());
  assert((CmpTy.isScalar() || CmpTy.isPointer()) &&
         "Expected scalar or pointer");

  bool Is32Bit;
  if (CmpTy == LLT::scalar(32))
    Is32Bit = true;
  else if (CmpTy == LLT::scalar(64) || CmpTy.isPointer())
    Is32Bit = false;
  else
    return nullptr;

  // Use the immediate form whenever the RHS can be rendered as one.
  auto ImmFns = selectArithImmed(RHS);
  unsigned CmpOpc;
  if (ImmFns)
    CmpOpc = Is32Bit ? AArch64::SUBSWri : AArch64::SUBSXri;
  else
    CmpOpc = Is32Bit ? AArch64::SUBSWrr : AArch64::SUBSXrr;

  Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
  auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addDef(ZReg).addUse(LHS.getReg());

  if (!ImmFns) {
    CmpMI.addUse(RHS.getReg());
  } else {
    for (auto &RenderFn : *ImmFns)
      RenderFn(CmpMI);
  }

  // Constrain the operands of the compare that was just emitted.
  constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
  return &*CmpMI;
}
/// Concatenate two 64-bit vectors \p Op1 (low half) and \p Op2 (high half).
///
/// Both inputs are widened with emitScalarToVector, and the widened \p Op2 is
/// then inserted into lane 1 of the widened \p Op1. If \p Dst is not provided,
/// a fresh virtual register is created for the result.
///
/// \returns the insert instruction, or nullptr if the operand types differ,
/// are not 64 bits wide, or widening failed.
MachineInstr *AArch64InstructionSelector::emitVectorConcat(
    Optional<Register> Dst, Register Op1, Register Op2,
    MachineIRBuilder &MIRBuilder) const {
  // TODO: some of this code is common with G_BUILD_VECTOR handling.
  MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();

  const LLT Op1Ty = MRI.getType(Op1);
  const LLT Op2Ty = MRI.getType(Op2);
  if (Op1Ty != Op2Ty) {
    LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
    return nullptr;
  }
  assert(Op1Ty.isVector() && "Expected a vector for vector concat");

  const unsigned HalfBits = Op1Ty.getSizeInBits();
  if (HalfBits >= 128) {
    LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
    return nullptr;
  }

  // Only 64-bit inputs are handled for now.
  if (HalfBits != 64) {
    LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors");
    return nullptr;
  }

  // Each half is treated as a single HalfBits-wide element of the result.
  const RegisterBank &Bank = *RBI.getRegBank(Op1, MRI, TRI);
  const TargetRegisterClass *WideRC = getMinClassForRegBank(Bank, HalfBits * 2);

  MachineInstr *LowHalf = emitScalarToVector(HalfBits, WideRC, Op1, MIRBuilder);
  MachineInstr *HighHalf =
      emitScalarToVector(HalfBits, WideRC, Op2, MIRBuilder);
  if (!(LowHalf && HighHalf)) {
    LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
    return nullptr;
  }

  // Insert the high half into the upper element of the widened low half.
  const unsigned InsertOpc = getInsertVecEltOpInfo(Bank, HalfBits).first;
  Register Result = Dst ? *Dst : MRI.createVirtualRegister(WideRC);
  auto Insert =
      MIRBuilder
          .buildInstr(InsertOpc, {Result}, {LowHalf->getOperand(0).getReg()})
          .addImm(1) /* Lane index */
          .addUse(HighHalf->getOperand(0).getReg())
          .addImm(0);
  constrainSelectedInstRegOperands(*Insert, TII, TRI, RBI);
  return &*Insert;
}
/// Try to turn the G_FCONSTANT \p I into an FMOV-immediate in place.
///
/// \returns \p I (mutated into FMOVSi/FMOVDi) on success, or nullptr if the
/// def is not 32 or 64 bits wide, the constant is a null value, or the
/// constant has no FMOV immediate encoding.
MachineInstr *AArch64InstructionSelector::emitFMovForFConstant(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  assert(I.getOpcode() == TargetOpcode::G_FCONSTANT &&
         "Expected a G_FCONSTANT!");
  MachineOperand &ImmOp = I.getOperand(1);
  const unsigned DefSize =
      MRI.getType(I.getOperand(0).getReg()).getSizeInBits();

  // Only single and double precision defs are handled for now.
  const bool IsSingle = DefSize == 32;
  if (!IsSingle && DefSize != 64)
    return nullptr;

  // Null values are deliberately not materialized with FMOV.
  if (ImmOp.getFPImm()->isNullValue())
    return nullptr;

  // Compute the FMOV encoding; -1 signals "not representable".
  const APFloat &FPVal = ImmOp.getFPImm()->getValueAPF();
  int Encoded;
  if (IsSingle)
    Encoded = AArch64_AM::getFP32Imm(FPVal);
  else
    Encoded = AArch64_AM::getFP64Imm(FPVal);
  if (Encoded == -1)
    return nullptr;

  // Rewrite the instruction in place as the FMOV and constrain it.
  ImmOp.ChangeToImmediate(Encoded);
  unsigned MovOpc = IsSingle ? AArch64::FMOVSi : AArch64::FMOVDi;
  I.setDesc(TII.get(MovOpc));
  constrainSelectedInstRegOperands(I, TII, TRI, RBI);
  return &I;
}
/// Materialize the result of an integer compare with predicate \p Pred into
/// \p DefReg using CSINC WZR, WZR with the inverted condition.
///
/// \returns the emitted CSINCWr instruction.
MachineInstr *
AArch64InstructionSelector::emitCSetForICMP(Register DefReg, unsigned Pred,
                                            MachineIRBuilder &MIRBuilder) const {
  // CSINC performs the increment when its condition is false, so the
  // condition code has to be that of the inverse predicate.
  const CmpInst::Predicate Inverse =
      CmpInst::getInversePredicate((CmpInst::Predicate)Pred);
  const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(Inverse);

  Register Zero(AArch64::WZR);
  auto CSet = MIRBuilder.buildInstr(AArch64::CSINCWr, {DefReg}, {Zero, Zero})
                  .addImm(InvCC);
  constrainSelectedInstRegOperands(*CSet, TII, TRI, RBI);
  return &*CSet;
}
/// Try to select the G_SELECT \p I together with the compare that defines its
/// condition, emitting a compare followed by a conditional select.
///
/// \returns true if \p I was replaced (and erased), false if the pattern did
/// not match or the compare could not be emitted.
bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const {
MachineIRBuilder MIB(I);
MachineRegisterInfo &MRI = *MIB.getMRI();
// NOTE(review): this local is named like the TRI that sibling functions use
// without declaring it, so it presumably shadows a class member - confirm.
const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
// We want to recognize this pattern:
//
// $z = G_FCMP pred, $x, $y
// ...
// $w = G_SELECT $z, $a, $b
//
// Where the value of $z is *only* ever used by the G_SELECT (possibly with
// some copies/truncs in between.)
//
// If we see this, then we can emit something like this:
//
// fcmp $x, $y
// fcsel $w, $a, $b, pred
//
// Rather than emitting both of the rather long sequences in the standard
// G_FCMP/G_SELECT select methods. The same is done for G_ICMP below.
// First, check if the condition is defined by a compare.
MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg());
while (CondDef) {
// We can only fold if all of the defs have one use.
if (!MRI.hasOneUse(CondDef->getOperand(0).getReg()))
return false;
// We can skip over G_TRUNC since the condition is 1-bit.
// Truncating/extending can have no impact on the value.
unsigned Opc = CondDef->getOpcode();
if (Opc != TargetOpcode::COPY && Opc != TargetOpcode::G_TRUNC)
break;
// Can't see past copies from physregs.
if (Opc == TargetOpcode::COPY &&
Register::isPhysicalRegister(CondDef->getOperand(1).getReg()))
return false;
// Keep walking up through the copy/trunc source.
CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg());
}
// Is the condition defined by a compare?
if (!CondDef)
return false;
unsigned CondOpc = CondDef->getOpcode();
if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP)
return false;
AArch64CC::CondCode CondCode;
if (CondOpc == TargetOpcode::G_ICMP) {
// Integer compare: operand 1 is the predicate, operands 2 and 3 are the
// values being compared.
CondCode = changeICMPPredToAArch64CC(
(CmpInst::Predicate)CondDef->getOperand(1).getPredicate());
if (!emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
CondDef->getOperand(1), MIB)) {
LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
return false;
}
} else {
// Get the condition code for the select.
AArch64CC::CondCode CondCode2;
changeFCMPPredToAArch64CC(
(CmpInst::Predicate)CondDef->getOperand(1).getPredicate(), CondCode,
CondCode2);
// Bail out unless CondCode2 came back as AL. Judging by the TODO below, a
// non-AL CondCode2 means the predicate needs a second condition check
// (NOTE(review): confirm against changeFCMPPredToAArch64CC).
// TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
// unnecessary.
if (CondCode2 != AArch64CC::AL)
return false;
// Make sure we'll be able to select the compare.
unsigned CmpOpc = selectFCMPOpc(*CondDef, MRI);
if (!CmpOpc)
return false;
// Emit a new compare. The FCMPSri/FCMPDri forms take no second register
// operand, so the RHS is only added for the other opcodes.
auto Cmp = MIB.buildInstr(CmpOpc, {}, {CondDef->getOperand(2).getReg()});
if (CmpOpc != AArch64::FCMPSri && CmpOpc != AArch64::FCMPDri)
Cmp.addUse(CondDef->getOperand(3).getReg());
constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
}
// Emit the select.
unsigned CSelOpc = selectSelectOpc(I, MRI, RBI);
auto CSel =
MIB.buildInstr(CSelOpc, {I.getOperand(0).getReg()},
{I.getOperand(2).getReg(), I.getOperand(3).getReg()})
.addImm(CondCode);
constrainSelectedInstRegOperands(*CSel, TII, TRI, RBI);
// The original G_SELECT has been replaced.
I.eraseFromParent();
return true;
}
/// Try to fold an operation feeding an integer compare into the compare
/// itself.
///
/// Two folds are attempted:
///  * a negation (G_SUB 0, y) on either side of an EQ/NE compare becomes a
///    CMN of the other operand and y;
///  * a G_AND on the LHS compared against constant 0 with a predicate that is
///    not unsigned becomes a TST of the AND's operands.
///
/// \returns the emitted instruction, or nullptr if no fold applies.
MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
    MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
    MachineIRBuilder &MIRBuilder) const {
  assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
         "Unexpected MachineOperand");
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();

  // We want to find this sort of thing:
  // x = G_SUB 0, y
  // G_ICMP z, x
  //
  // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
  // e.g:
  //
  // cmn z, y

  // Helper lambda to detect the subtract followed by the compare.
  // Takes in the def of the LHS or RHS, and checks if it's a subtract from 0.
  auto IsCMN = [&](MachineInstr *DefMI, const AArch64CC::CondCode &CC) {
    // We only match against SUBs.
    if (!DefMI || DefMI->getOpcode() != TargetOpcode::G_SUB)
      return false;

    // Need to make sure NZCV is the same at the end of the transformation.
    if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
      return false;

    // Make sure that we're getting
    // x = G_SUB 0, y
    auto ValAndVReg =
        getConstantVRegValWithLookThrough(DefMI->getOperand(1).getReg(), MRI);

    // If the minuend is a constant zero, this can safely become a CMN.
    return ValAndVReg && ValAndVReg->Value == 0;
  };

  // Check if the RHS or LHS of the G_ICMP is defined by a SUB
  MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
  MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
  CmpInst::Predicate P = (CmpInst::Predicate)Predicate.getPredicate();
  const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(P);

  // Given this:
  //
  // x = G_SUB 0, y
  // G_ICMP x, z
  //
  // Produce this:
  //
  // cmn y, z
  if (IsCMN(LHSDef, CC))
    return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);

  // Same idea here, but with the RHS of the compare instead:
  //
  // Given this:
  //
  // x = G_SUB 0, y
  // G_ICMP z, x
  //
  // Produce this:
  //
  // cmn z, y
  if (IsCMN(RHSDef, CC))
    return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);

  // Given this:
  //
  // z = G_AND x, y
  // G_ICMP z, 0
  //
  // Produce this if the compare is signed:
  //
  // tst x, y
  if (!isUnsignedICMPPred(P) && LHSDef &&
      LHSDef->getOpcode() == TargetOpcode::G_AND) {
    // Make sure that the RHS is 0.
    auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI);
    if (!ValAndVReg || ValAndVReg->Value != 0)
      return nullptr;

    return emitTST(LHSDef->getOperand(1).getReg(),
                   LHSDef->getOperand(2).getReg(), MIRBuilder);
  }

  return nullptr;
}
/// Try to select the G_SHUFFLE_VECTOR \p I as a single DUP when it is a splat
/// of a scalar.
///
/// The pattern being matched is:
///   %scalar:gpr(s64) = COPY $x0
///   %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF
///   %cst0:gpr(s32) = G_CONSTANT i32 0
///   %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32)
///   %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32)
///   %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef,
///                                            %zerovec(<2 x s32>)
/// which becomes:
///   %splat = DUP %scalar
///
/// The register bank of the scalar decides which DUP flavour is used.
///
/// \returns true if \p I was replaced (and erased), false otherwise.
bool AArch64InstructionSelector::tryOptVectorDup(MachineInstr &I) const {
  MachineIRBuilder MIB(I);
  MachineRegisterInfo &MRI = *MIB.getMRI();
  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
  using namespace TargetOpcode;
  using namespace MIPatternMatch;

  // The first shuffle source must be an insert into an undef vector.
  MachineInstr *Insert =
      getOpcodeDef(G_INSERT_VECTOR_ELT, I.getOperand(1).getReg(), MRI);
  if (!Insert)
    return false;
  if (!getOpcodeDef(G_IMPLICIT_DEF, Insert->getOperand(1).getReg(), MRI))
    return false;

  // This is the value being splatted.
  Register ScalarReg = Insert->getOperand(2).getReg();
  const RegisterBank *ScalarRB = RBI.getRegBank(ScalarReg, MRI, TRI);

  // The scalar has to be inserted at constant lane 0.
  int64_t InsertLane = 0;
  const bool LaneIsCst =
      mi_match(Insert->getOperand(3).getReg(), MRI, m_ICst(InsertLane));
  if (!LaneIsCst || InsertLane != 0)
    return false;

  // The mask must be a G_BUILD_VECTOR whose elements are all the same
  // constant-zero register. (With an all-zero mask the second shuffle source
  // is irrelevant.)
  MachineInstr *MaskDef =
      getOpcodeDef(G_BUILD_VECTOR, I.getOperand(3).getReg(), MRI);
  if (!MaskDef)
    return false;
  const Register FirstMaskElt = MaskDef->getOperand(1).getReg();
  int64_t MaskCst = 0;
  if (!mi_match(FirstMaskElt, MRI, m_ICst(MaskCst)) || MaskCst != 0)
    return false;
  for (unsigned Idx = 2, End = MaskDef->getNumOperands(); Idx < End; ++Idx) {
    if (MaskDef->getOperand(Idx).getReg() != FirstMaskElt)
      return false; // This wasn't an all zeros vector.
  }

  // Pattern matched; work out which splat instruction is required.
  const LLT VecTy = MRI.getType(I.getOperand(0).getReg());
  const unsigned EltBits = VecTy.getElementType().getSizeInBits();
  if (VecTy.getSizeInBits() != 128 || EltBits < 32) {
    LLVM_DEBUG(dbgs() << "Could not optimize splat pattern < 128b yet");
    return false;
  }

  const bool IsFP = ScalarRB->getID() == AArch64::FPRRegBankID;
  const bool Is64BitElt = EltBits == 64;
  unsigned Opc;
  if (IsFP)
    Opc = Is64BitElt ? AArch64::DUPv2i64lane : AArch64::DUPv4i32lane;
  else
    Opc = Is64BitElt ? AArch64::DUPv2i64gpr : AArch64::DUPv4i32gpr;

  // The lane form reads from a vector register, so an FPR scalar is first
  // widened into an FPR128.
  if (IsFP) {
    MachineInstr *Widened = emitScalarToVector(
        EltBits, &AArch64::FPR128RegClass, ScalarReg, MIB);
    if (!Widened)
      return false;
    ScalarReg = Widened->getOperand(0).getReg();
  }

  auto Dup = MIB.buildInstr(Opc, {I.getOperand(0).getReg()}, {ScalarReg});
  if (IsFP)
    Dup.addImm(0); // Source lane.
  constrainSelectedInstRegOperands(*Dup, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}
/// Attempt the optimized G_SHUFFLE_VECTOR selections for \p I. Nothing is
/// attempted when the optimization level is CodeGenOpt::None.
///
/// \returns true if \p I was selected by one of the optimized paths.
bool AArch64InstructionSelector::tryOptVectorShuffle(MachineInstr &I) const {
  const bool Optimizing = TM.getOptLevel() != CodeGenOpt::None;
  // Currently the only optimized form is the splat -> DUP match.
  return Optimizing && tryOptVectorDup(I);
}
/// Select the G_SHUFFLE_VECTOR \p I.
///
/// The optimized selections are tried first. Failing that, the shuffle is
/// lowered to a TBL whose byte-index vector is loaded from the constant pool:
/// TBL1 over the concatenation of both sources for a 64-bit result, or TBL2
/// over a REG_SEQUENCE of both sources for a 128-bit result.
///
/// \returns true if \p I was selected (and erased), false otherwise.
bool AArch64InstructionSelector::selectShuffleVector(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  if (tryOptVectorShuffle(I))
    return true;
  const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
  Register Src1Reg = I.getOperand(1).getReg();
  const LLT Src1Ty = MRI.getType(Src1Reg);
  Register Src2Reg = I.getOperand(2).getReg();
  const LLT Src2Ty = MRI.getType(Src2Reg);

  MachineBasicBlock &MBB = *I.getParent();
  MachineFunction &MF = *MBB.getParent();
  LLVMContext &Ctx = MF.getFunction().getContext();

  // G_SHUFFLE_VECTOR doesn't really have a strictly enforced constant mask
  // operand, it comes in as a normal vector value which we have to analyze to
  // find the mask indices. If the mask element is undef, then
  // collectShuffleMaskIndices() will add a None entry for that index into
  // the list.
  SmallVector<Optional<int>, 8> Mask;
  collectShuffleMaskIndices(I, MRI, Mask);
  assert(!Mask.empty() && "Expected to find mask indices");

  // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
  // it's originated from a <1 x T> type. Those should have been lowered into
  // G_BUILD_VECTOR earlier.
  if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
    LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
    return false;
  }

  // TBL works on bytes, so expand every element index into the indices of the
  // bytes that make up that element.
  unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;

  SmallVector<Constant *, 64> CstIdxs;
  for (auto &MaybeVal : Mask) {
    // For now, any undef indexes we'll just assume to be 0. This should be
    // optimized in future, e.g. to select DUP etc.
    int Val = MaybeVal.hasValue() ? *MaybeVal : 0;
    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
      unsigned Offset = Byte + Val * BytesPerElt;
      CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
    }
  }

  MachineIRBuilder MIRBuilder(I);

  // Use a constant pool to load the index vector for TBL.
  Constant *CPVal = ConstantVector::get(CstIdxs);
  MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIRBuilder);
  if (!IndexLoad) {
    LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
    return false;
  }

  if (DstTy.getSizeInBits() != 128) {
    assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
    // This case can be done with TBL1.
    MachineInstr *Concat = emitVectorConcat(None, Src1Reg, Src2Reg, MIRBuilder);
    if (!Concat) {
      LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
      return false;
    }

    // The constant pool load will be 64 bits, so need to convert to FPR128 reg.
    IndexLoad =
        emitScalarToVector(64, &AArch64::FPR128RegClass,
                           IndexLoad->getOperand(0).getReg(), MIRBuilder);
    // emitScalarToVector's result is null-checked at its other call sites;
    // do the same here rather than dereferencing a null pointer below.
    if (!IndexLoad) {
      LLVM_DEBUG(dbgs() << "Could not widen the index vector for tbl1\n");
      return false;
    }

    auto TBL1 = MIRBuilder.buildInstr(
        AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
        {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
    constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);

    // The result lives in the low 64 bits of the TBL1 def; copy out the dsub
    // subregister into the original destination.
    auto Copy =
        MIRBuilder
            .buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
            .addReg(TBL1.getReg(0), 0, AArch64::dsub);
    RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
    I.eraseFromParent();
    return true;
  }

  // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
  // Q registers for regalloc.
  auto RegSeq = MIRBuilder
                    .buildInstr(TargetOpcode::REG_SEQUENCE,
                                {&AArch64::QQRegClass}, {Src1Reg})
                    .addImm(AArch64::qsub0)
                    .addUse(Src2Reg)
                    .addImm(AArch64::qsub1);

  auto TBL2 =
      MIRBuilder.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0).getReg()},
                            {RegSeq, IndexLoad->getOperand(0).getReg()});
  constrainSelectedInstRegOperands(*RegSeq, TII, TRI, RBI);
  constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
  I.eraseFromParent();
  return true;
}
/// Emit an insert of \p EltReg into lane \p LaneIdx of the 128-bit vector
/// \p SrcReg.
///
/// \param DstReg register to define; a new FPR128 virtual register is created
///        when None is passed.
/// \param RB register bank of the element. For an FPR element the value is
///        first widened with emitScalarToVector and inserted from lane 0 of
///        the widened register; otherwise the element register is inserted
///        directly.
///
/// \returns the insert instruction, or nullptr if the FPR element could not
/// be widened.
MachineInstr *AArch64InstructionSelector::emitLaneInsert(
    Optional<Register> DstReg, Register SrcReg, Register EltReg,
    unsigned LaneIdx, const RegisterBank &RB,
    MachineIRBuilder &MIRBuilder) const {
  MachineInstr *InsElt = nullptr;
  const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();

  // Create a register to define with the insert if one wasn't passed in.
  if (!DstReg)
    DstReg = MRI.createVirtualRegister(DstRC);

  unsigned EltSize = MRI.getType(EltReg).getSizeInBits();
  unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;

  if (RB.getID() == AArch64::FPRRegBankID) {
    auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder);
    // emitScalarToVector's result is null-checked at its other call sites;
    // bail out here instead of dereferencing a null pointer.
    if (!InsSub)
      return nullptr;
    InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
                 .addImm(LaneIdx)
                 .addUse(InsSub->getOperand(0).getReg())
                 .addImm(0);
  } else {
    InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
                 .addImm(LaneIdx)
                 .addUse(EltReg);
  }

  constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
  return InsElt;
}
/// Select the G_INSERT_VECTOR_ELT \p I.
///
/// Only element sizes from 16 to 64 bits and constant lane indices are
/// handled. Vectors narrower than 128 bits are widened to an FPR128 for the
/// insert and the result is copied back out through a subregister.
///
/// \returns true if \p I was selected (and erased), false otherwise.
bool AArch64InstructionSelector::selectInsertElt(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);

  // Get information on the destination.
  Register DstReg = I.getOperand(0).getReg();
  const LLT DstTy = MRI.getType(DstReg);
  unsigned VecSize = DstTy.getSizeInBits();

  // Get information on the element we want to insert into the destination.
  Register EltReg = I.getOperand(2).getReg();
  const LLT EltTy = MRI.getType(EltReg);
  unsigned EltSize = EltTy.getSizeInBits();
  if (EltSize < 16 || EltSize > 64)
    return false; // Don't support all element types yet.

  // Find the definition of the index. Bail out if it's not defined by a
  // G_CONSTANT.
  Register IdxReg = I.getOperand(3).getReg();
  auto VRegAndVal = getConstantVRegValWithLookThrough(IdxReg, MRI);
  if (!VRegAndVal)
    return false;
  unsigned LaneIdx = VRegAndVal->Value;

  // Perform the lane insert.
  Register SrcReg = I.getOperand(1).getReg();
  const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
  MachineIRBuilder MIRBuilder(I);

  if (VecSize < 128) {
    // If the vector we're inserting into is smaller than 128 bits, widen it
    // to 128 to do the insert.
    MachineInstr *ScalarToVec = emitScalarToVector(
        VecSize, &AArch64::FPR128RegClass, SrcReg, MIRBuilder);
    if (!ScalarToVec)
      return false;
    SrcReg = ScalarToVec->getOperand(0).getReg();
  }

  // Create an insert into a new FPR128 register.
  // Note that if our vector is already 128 bits, we end up emitting an extra
  // register.
  MachineInstr *InsMI =
      emitLaneInsert(None, SrcReg, EltReg, LaneIdx, EltRB, MIRBuilder);
  // emitLaneInsert hands back a pointer; don't dereference it blindly.
  if (!InsMI)
    return false;

  if (VecSize < 128) {
    // If we had to widen to perform the insert, then we have to demote back to
    // the original size to get the result we want.
    Register DemoteVec = InsMI->getOperand(0).getReg();
    const TargetRegisterClass *RC =
        getMinClassForRegBank(*RBI.getRegBank(DemoteVec, MRI, TRI), VecSize);
    if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
      LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
      return false;
    }
    unsigned SubReg = 0;
    if (!getSubRegForClass(RC, TRI, SubReg))
      return false;
    if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
      LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << VecSize
                        << ")\n");
      return false;
    }
    MIRBuilder.buildInstr(TargetOpcode::COPY, {DstReg}, {})
        .addReg(DemoteVec, 0, SubReg);
    RBI.constrainGenericRegister(DstReg, *RC, MRI);
  } else {
    // No widening needed.
    InsMI->getOperand(0).setReg(DstReg);
    constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
  }

  I.eraseFromParent();
  return true;
}
/// Select the G_BUILD_VECTOR \p I as a scalar-to-vector of the first element
/// followed by one lane insert per remaining element.
///
/// Only element sizes from 16 to 64 bits are handled. The sequence is built
/// in FPR128 registers; a result narrower than 128 bits is copied out through
/// a subregister.
///
/// \returns true if \p I was selected (and erased), false otherwise.
bool AArch64InstructionSelector::selectBuildVector(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
  // Until we port more of the optimized selections, for now just use a vector
  // insert sequence.
  const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
  const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
  unsigned EltSize = EltTy.getSizeInBits();
  if (EltSize < 16 || EltSize > 64)
    return false; // Don't support all element types yet.
  const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
  MachineIRBuilder MIRBuilder(I);

  const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
  MachineInstr *ScalarToVec =
      emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
                         I.getOperand(1).getReg(), MIRBuilder);
  if (!ScalarToVec)
    return false;

  Register DstVec = ScalarToVec->getOperand(0).getReg();
  unsigned DstSize = DstTy.getSizeInBits();

  // Keep track of the last MI we inserted. Later on, we might be able to save
  // a copy using it.
  MachineInstr *PrevMI = nullptr;
  // Operand 0 is the def and operand 1 was handled above, so source operand i
  // goes into lane i - 1.
  for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
    // Note that if we don't do a subregister copy, we can end up making an
    // extra register.
    PrevMI = emitLaneInsert(None, DstVec, I.getOperand(i).getReg(), i - 1, RB,
                            MIRBuilder);
    // emitLaneInsert hands back a pointer; don't dereference it blindly.
    if (!PrevMI)
      return false;
    DstVec = PrevMI->getOperand(0).getReg();
  }

  // If DstTy's size in bits is less than 128, then emit a subregister copy
  // from DstVec to the last register we've defined.
  if (DstSize < 128) {
    // Force this to be FPR using the destination vector.
    const TargetRegisterClass *RC =
        getMinClassForRegBank(*RBI.getRegBank(DstVec, MRI, TRI), DstSize);
    if (!RC)
      return false;
    if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
      LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
      return false;
    }

    unsigned SubReg = 0;
    if (!getSubRegForClass(RC, TRI, SubReg))
      return false;
    if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
      LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
                        << ")\n");
      return false;
    }

    // NOTE(review): Reg is only ever written into operand 1 of I, which is
    // erased below, so this looks like leftover code. It is kept as-is
    // because dropping it would change virtual register numbering.
    Register Reg = MRI.createVirtualRegister(RC);
    Register DstReg = I.getOperand(0).getReg();

    MIRBuilder.buildInstr(TargetOpcode::COPY, {DstReg}, {})
        .addReg(DstVec, 0, SubReg);
    MachineOperand &RegOp = I.getOperand(1);
    RegOp.setReg(Reg);
    RBI.constrainGenericRegister(DstReg, *RC, MRI);
  } else {
    // We don't need a subregister copy. Save a copy by re-using the
    // destination register on the final insert.
    assert(PrevMI && "PrevMI was null?");
    PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
    constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
  }

  I.eraseFromParent();
  return true;
}
/// Look up the intrinsic ID carried by \p I.
///
/// \returns the ID of the first intrinsic-ID operand of \p I, or 0 if \p I
/// has no such operand.
static unsigned findIntrinsicID(MachineInstr &I) {
  for (const MachineOperand &Op : I.operands())
    if (Op.isIntrinsicID())
      return Op.getIntrinsicID();
  return 0;
}
/// Map the store width of a llvm.aarch64.stlxr intrinsic to the matching STLXR
/// opcode.
///
/// \returns STLXRW for 4 bytes, STLXRX for 8 bytes, and 0 for any other width.
static unsigned getStlxrOpcode(unsigned NumBytesToStore) {
  // TODO: 1 and 2 byte stores
  if (NumBytesToStore == 4)
    return AArch64::STLXRW;
  if (NumBytesToStore == 8)
    return AArch64::STLXRX;

  LLVM_DEBUG(dbgs() << "Unexpected number of bytes to store! ("
                    << NumBytesToStore << ")\n");
  return 0;
}
/// Select an intrinsic call with side effects.
///
/// Handles llvm.trap (BRK #1), llvm.debugtrap (BRK #0xF000, only when the
/// subtarget reports a Windows target), and llvm.aarch64.stlxr with a 4- or
/// 8-byte memory operand.
///
/// \returns true if \p I was selected (and erased), false otherwise.
bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
MachineInstr &I, MachineRegisterInfo &MRI) const {
// Find the intrinsic ID.
unsigned IntrinID = findIntrinsicID(I);
if (!IntrinID)
return false;
MachineIRBuilder MIRBuilder(I);
// Select the instruction.
switch (IntrinID) {
default:
return false;
case Intrinsic::trap:
MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(1);
break;
case Intrinsic::debugtrap:
// Only handled for Windows targets; everything else is rejected here.
if (!STI.isTargetWindows())
return false;
MIRBuilder.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000);
break;
// NOTE: this case declares locals without a braced scope, which only works
// because it is the last label in the switch. Add braces before appending
// another case after it.
case Intrinsic::aarch64_stlxr:
// Operand 0 receives the store status; operand 2 is the value to store and
// operand 3 the address.
Register StatReg = I.getOperand(0).getReg();
assert(RBI.getSizeInBits(StatReg, MRI, TRI) == 32 &&
"Status register must be 32 bits!");
Register SrcReg = I.getOperand(2).getReg();
if (RBI.getSizeInBits(SrcReg, MRI, TRI) != 64) {
LLVM_DEBUG(dbgs() << "Only support 64-bit sources right now.\n");
return false;
}
Register PtrReg = I.getOperand(3).getReg();
assert(MRI.getType(PtrReg).isPointer() && "Expected pointer operand");
// Expect only one memory operand.
if (!I.hasOneMemOperand())
return false;
// The memory operand's size decides which STLXR opcode is used.
const MachineMemOperand *MemOp = *I.memoperands_begin();
unsigned NumBytesToStore = MemOp->getSize();
unsigned Opc = getStlxrOpcode(NumBytesToStore);
if (!Opc)
return false;
unsigned NumBitsToStore = NumBytesToStore * 8;
if (NumBitsToStore != 64) {
// The intrinsic always has a 64-bit source, but we might actually want
// a differently-sized source for the instruction. Try to get it.
// TODO: For 1 and 2-byte stores, this will have a G_AND. For now, let's
// just handle 4-byte stores.
// TODO: If we don't find a G_ZEXT, we'll have to truncate the value down
// to the right size for the STLXR.
MachineInstr *Zext = getOpcodeDef(TargetOpcode::G_ZEXT, SrcReg, MRI);
if (!Zext)
return false;
// Store the pre-extension value instead of the 64-bit one.
SrcReg = Zext->getOperand(1).getReg();
// We should get an appropriately-sized register here.
if (RBI.getSizeInBits(SrcReg, MRI, TRI) != NumBitsToStore)
return false;
}
// Build the store, carrying over the original memory operand.
auto StoreMI = MIRBuilder.buildInstr(Opc, {StatReg}, {SrcReg, PtrReg})
.addMemOperand(*I.memoperands_begin());
constrainSelectedInstRegOperands(*StoreMI, TII, TRI, RBI);
}
// Every case that reaches this point has emitted a replacement for I.
I.eraseFromParent();
return true;
}
/// Select an intrinsic call without side effects. Only
/// llvm.aarch64.crypto.sha1h on 32-bit operands is handled.
///
/// \returns true if \p I was selected (and erased), false otherwise.
bool AArch64InstructionSelector::selectIntrinsic(
    MachineInstr &I, MachineRegisterInfo &MRI) const {
  const unsigned IntrinID = findIntrinsicID(I);
  if (!IntrinID)
    return false;
  MachineIRBuilder MIRBuilder(I);

  if (IntrinID != Intrinsic::aarch64_crypto_sha1h)
    return false;

  const Register OrigDst = I.getOperand(0).getReg();
  const Register OrigSrc = I.getOperand(2).getReg();

  // FIXME: Should this be an assert?
  if (MRI.getType(OrigDst).getSizeInBits() != 32 ||
      MRI.getType(OrigSrc).getSizeInBits() != 32)
    return false;

  const auto LivesOnFPR = [&](Register Reg) {
    return RBI.getRegBank(Reg, MRI, TRI)->getID() == AArch64::FPRRegBankID;
  };

  // SHA1H operates on FPRs. A source living elsewhere is copied into a fresh
  // FPR32 first.
  Register SrcReg = OrigSrc;
  if (!LivesOnFPR(OrigSrc)) {
    SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
    MIRBuilder.buildCopy({SrcReg}, {I.getOperand(2)});

    // Make sure the copy ends up getting constrained properly.
    RBI.constrainGenericRegister(OrigSrc, AArch64::GPR32RegClass, MRI);
  }

  // Likewise, a destination living elsewhere is produced into a temporary
  // FPR32 and copied back afterwards.
  Register DstReg = OrigDst;
  if (!LivesOnFPR(OrigDst))
    DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);

  // Emit the actual instruction.
  auto SHA1 = MIRBuilder.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
  constrainSelectedInstRegOperands(*SHA1, TII, TRI, RBI);

  // If a temporary destination was used, move the result into the original
  // destination register.
  if (DstReg != OrigDst) {
    MIRBuilder.buildCopy({I.getOperand(0)}, {DstReg});
    RBI.constrainGenericRegister(OrigDst, AArch64::GPR32RegClass, MRI);
  }

  I.eraseFromParent();
  return true;
}
/// Extract a constant from \p Root.
///
/// \returns the value of an immediate or CImm operand, or the constant found
/// by getConstantVRegValWithLookThrough for a register operand; None for any
/// other operand kind or a register with no such constant.
static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
  auto &MRI = Root.getParent()->getParent()->getParent()->getRegInfo();

  if (Root.isImm())
    return static_cast<uint64_t>(Root.getImm());

  if (Root.isCImm())
    return static_cast<uint64_t>(Root.getCImm()->getZExtValue());

  if (!Root.isReg())
    return None;

  auto MaybeCst = getConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
  if (!MaybeCst)
    return None;
  return static_cast<uint64_t>(MaybeCst->Value);
}
/// Render the operand (32 - shift) & 0x1f for a constant shift amount in
/// [0, 31] taken from \p Root. Returns None if \p Root is not such a constant.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
  const Optional<uint64_t> ShiftAmt = getImmedFromMO(Root);
  if (!ShiftAmt || *ShiftAmt > 31)
    return None;
  const uint64_t Enc = (32 - *ShiftAmt) & 0x1f;
  return {{[Enc](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
}
/// Render the operand 31 - shift for a constant shift amount in [0, 31] taken
/// from \p Root. Returns None if \p Root is not such a constant.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
  const Optional<uint64_t> ShiftAmt = getImmedFromMO(Root);
  if (!ShiftAmt || *ShiftAmt > 31)
    return None;
  const uint64_t Enc = 31 - *ShiftAmt;
  return {{[Enc](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
}
/// Render the operand (64 - shift) & 0x3f for a constant shift amount in
/// [0, 63] taken from \p Root. Returns None if \p Root is not such a constant.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
  const Optional<uint64_t> ShiftAmt = getImmedFromMO(Root);
  if (!ShiftAmt || *ShiftAmt > 63)
    return None;
  const uint64_t Enc = (64 - *ShiftAmt) & 0x3f;
  return {{[Enc](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
}
/// Render the operand 63 - shift for a constant shift amount in [0, 63] taken
/// from \p Root. Returns None if \p Root is not such a constant.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
  const Optional<uint64_t> ShiftAmt = getImmedFromMO(Root);
  if (!ShiftAmt || *ShiftAmt > 63)
    return None;
  const uint64_t Enc = 63 - *ShiftAmt;
  return {{[Enc](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
}
/// Try to express \p Immed as a 12-bit value shifted left by 0 or by 12.
///
/// On success the returned renderers add, in order, the 12-bit value and the
/// LSL shifter operand. Returns None when \p Immed fits neither form.
///
/// Used by selectArithImmed and selectNegArithImmed.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::select12BitValueWithLeftShift(
    uint64_t Immed) const {
  // Unshifted form: everything above bit 11 is clear.
  const bool FitsUnshifted = (Immed >> 12) == 0;
  // Shifted form: low 12 bits clear and nothing above bit 23.
  const bool FitsShifted =
      !FitsUnshifted && (Immed & 0xfff) == 0 && (Immed >> 24) == 0;
  if (!FitsUnshifted && !FitsShifted)
    return None;

  const unsigned ShiftAmt = FitsShifted ? 12 : 0;
  const uint64_t Value = Immed >> ShiftAmt;
  const unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Value); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
  }};
}
/// SelectArithImmed - Match \p Root as a constant that can be encoded as a
/// 12-bit value shifted left by 0 or 12. On success the renderers add the
/// 12-bit value followed by the shifter operand; otherwise None is returned.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
  // The addsub_shifted_imm ComplexPattern that calls this lists [imm] as the
  // opcodes it is interested in, but that list is only consulted for
  // root-level opcode matching, so the operand still has to be checked for
  // actually being an immediate here.
  if (auto MaybeImmed = getImmedFromMO(Root))
    return select12BitValueWithLeftShift(*MaybeImmed);
  return None;
}
/// SelectNegArithImmed - Like selectArithImmed, but the constant is negated
/// (at the width of \p Root's type) before it is matched. Returns None for a
/// non-register operand, a non-constant, a zero constant, or a negated value
/// that does not fit the 12-bit-with-shift encoding.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
  // A register operand is required: its type tells us whether the immediate
  // is 32 or 64 bits wide.
  if (!Root.isReg())
    return None;

  const auto MaybeImmed = getImmedFromMO(Root);
  if (!MaybeImmed)
    return None;

  // Negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0" have
  // the opposite effect on the C flag, so zero must not match.
  if (*MaybeImmed == 0)
    return None;

  // Negate in two's complement at the width of the root's type.
  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
  const bool Is32Bit = MRI.getType(Root.getReg()).getSizeInBits() == 32;
  uint64_t Negated;
  if (Is32Bit) {
    const uint32_t Low = static_cast<uint32_t>(*MaybeImmed);
    Negated = static_cast<uint32_t>(0u - Low);
  } else {
    Negated = 0ULL - *MaybeImmed;
  }

  // Anything with bits set above bit 23 cannot be encoded.
  if ((Negated >> 24) != 0)
    return None;

  return select12BitValueWithLeftShift(Negated);
}
/// Decide whether \p MI should be folded into an extended-register operand,
/// i.e. pulled into the addressing mode of a load or store as a shift.
///
/// \returns true if the def of \p MI has a single use, if the function is
/// optimized for minimum size, or if the subtarget has a fast LSL path and
/// every user of the def is a memory operation.
bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
    MachineInstr &MI, const MachineRegisterInfo &MRI) const {
  const Register DefReg = MI.getOperand(0).getReg();

  // With a single use nothing gets recomputed, so folding is always fine.
  if (MRI.hasOneUse(DefReg))
    return true;

  // When optimizing for size, fold regardless.
  if (MI.getParent()->getParent()->getFunction().hasMinSize())
    return true;

  // Without a fast path it is better not to fold and recompute the shift.
  if (!STI.hasLSLFast())
    return false;

  // With a fast path, recomputing the shift in several places may still pay
  // off, but only if every user is a load or store.
  for (MachineInstr &Use : MRI.use_instructions(DefReg))
    if (!Use.mayLoadOrStore())
      return false;
  return true;
}
/// This is used for computing addresses like this:
///
/// ldr x1, [x2, x3, lsl #3]
///
/// Where x2 is the base register, and x3 is an offset register. The shift-left
/// is a constant value specific to this load instruction. That is, we'll never
/// see anything other than a 3 here (which corresponds to the size of the
/// element being loaded.)
///
/// \param Root        The address operand of the memory operation.
/// \param SizeInBytes The size of the memory access; its log base-2 is the
///                    only shift amount that may be folded.
/// \returns None if the address is not a G_GEP whose offset is a G_SHL by (or
/// a G_MUL by 2 to the power of) exactly that amount, or if folding is not
/// considered worthwhile. Otherwise returns renderers that add the base, the
/// offset register, an immediate 0 and the shift flag set to 1.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
    MachineOperand &Root, unsigned SizeInBytes) const {
  if (!Root.isReg())
    return None;
  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();

  // Make sure that the memory op is a valid size.
  int64_t LegalShiftVal = Log2_32(SizeInBytes);
  if (LegalShiftVal == 0)
    return None;

  // We want to find something like this:
  //
  // val = G_CONSTANT LegalShiftVal
  // shift = G_SHL off_reg val
  // ptr = G_GEP base_reg shift
  // x = G_LOAD ptr
  //
  // And fold it into this addressing mode:
  //
  // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]

  // Check if we can find the G_GEP.
  MachineInstr *Gep = getOpcodeDef(TargetOpcode::G_GEP, Root.getReg(), MRI);
  if (!Gep || !isWorthFoldingIntoExtendedReg(*Gep, MRI))
    return None;

  // Now, try to match an opcode which will match our specific offset.
  // We want a G_SHL or a G_MUL.
  MachineInstr *OffsetInst =
      getDefIgnoringCopies(Gep->getOperand(2).getReg(), MRI);
  if (!OffsetInst)
    return None;

  unsigned OffsetOpc = OffsetInst->getOpcode();
  if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
    return None;

  if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
    return None;

  // Now, try to find the specific G_CONSTANT. Start by assuming that the
  // register we will offset is the LHS, and the register containing the
  // constant is the RHS.
  Register OffsetReg = OffsetInst->getOperand(1).getReg();
  Register ConstantReg = OffsetInst->getOperand(2).getReg();
  auto ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI);
  if (!ValAndVReg) {
    // We didn't get a constant on the RHS. If the opcode is a shift, then
    // we're done.
    if (OffsetOpc == TargetOpcode::G_SHL)
      return None;

    // If we have a G_MUL, we can use either register. Try looking at the LHS.
    std::swap(OffsetReg, ConstantReg);
    ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI);
    if (!ValAndVReg)
      return None;
  }

  int64_t ImmVal = ValAndVReg->Value;

  // Since we're going to pull this into a shift, the constant value must be
  // a power of 2. If we got a multiply, then we need to check this.
  if (OffsetOpc == TargetOpcode::G_MUL) {
    // The constant is a 64-bit value, so the 64-bit helpers are required:
    // the 32-bit ones would silently drop the upper half and, for example,
    // treat a multiply by 0x100000008 as a multiply by 8.
    if (!isPowerOf2_64(ImmVal))
      return None;

    // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
    ImmVal = Log2_64(ImmVal);
  }

  // The shift amount must fit into 3 bits, and must not be negative. Make
  // sure that is true.
  if ((ImmVal & 0x7) != ImmVal)
    return None;

  // We are only allowed to shift by LegalShiftVal. This shift value is built
  // into the instruction, so we can't just use whatever we want.
  if (ImmVal != LegalShiftVal)
    return None;

  // We can use the LHS of the GEP as the base, and the non-constant operand
  // of the shift/multiply as an offset. Signify that we are shifting by
  // setting the shift flag to 1.
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Gep->getOperand(1)); },
      [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(1); },
  }};
}
/// This is used for computing addresses like this:
///
/// ldr x1, [x2, x3]
///
/// Where x2 is the base register, and x3 is an offset register.
///
/// When possible (or profitable) to fold a G_GEP into the address calculation,
/// this will do so. Otherwise, it will return None.
///
/// \param Root The address operand of the memory operation.
/// \returns None if \p Root is not a register, is not defined by a G_GEP, or
/// the G_GEP result has more than one non-debug use. Otherwise returns
/// renderers that add the G_GEP's two source operands followed by two zero
/// immediates.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeRegisterOffset(
    MachineOperand &Root) const {
  // Only a register can be traced back to its defining instruction.
  if (!Root.isReg())
    return None;
  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();

  // We need a GEP.
  MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
  if (!Gep || Gep->getOpcode() != TargetOpcode::G_GEP)
    return None;

  // If this is used more than once, let's not bother folding. Debug uses are
  // not counted, so that debug info cannot change what gets selected.
  // TODO: Check if they are memory ops. If they are, then we can still fold
  // without having to recompute anything.
  if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg()))
    return None;

  // Base is the GEP's LHS, offset is its RHS.
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Gep->getOperand(1)); },
      [=](MachineInstrBuilder &MIB) { MIB.add(Gep->getOperand(2)); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
  }};
}
/// Counterpart of selectAddrModeXRO in AArch64ISelDAGtoDAG; it is used for
/// selecting loads that take an X register offset.
///
/// Addresses made of a base plus a constant offset are rejected up front.
/// Otherwise the shifted-offset form is tried first and the plain
/// register-offset form second.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
                                              unsigned SizeInBytes) const {
  MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();

  // With a constant offset we most likely do not want a register offset at
  // all, so leave such addresses alone.
  if (isBaseWithConstantOffset(Root, MRI))
    return None;

  // First choice: fold a shift into the addressing mode.
  if (auto ShiftedFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes))
    return ShiftedFns;

  // Second choice: fold in the registers of a GEP, if there is one.
  return selectAddrModeRegisterOffset(Root);
}
/// Select a "register plus unscaled signed 9-bit immediate" address. A match
/// is only produced for offsets that a scaled immediate addressing mode cannot
/// encode. \p Size is the size in bytes of the memory reference; it decides
/// which offsets count as valid scaled immediates.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
                                                   unsigned Size) const {
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  // The address has to be a register computed as "base + constant".
  if (!Root.isReg() || !isBaseWithConstantOffset(Root, MRI))
    return None;

  MachineInstr *AddrDef = MRI.getVRegDef(Root.getReg());
  if (!AddrDef)
    return None;

  // The offset must be a register that is defined by a G_CONSTANT.
  MachineOperand &OffsetOp = AddrDef->getOperand(2);
  if (!OffsetOp.isReg())
    return None;

  MachineInstr *OffsetDef = MRI.getVRegDef(OffsetOp.getReg());
  if (!OffsetDef || OffsetDef->getOpcode() != TargetOpcode::G_CONSTANT)
    return None;

  // Only constants of at most 64 bits can be read out as an int64_t.
  MachineOperand &CstOp = OffsetDef->getOperand(1);
  if (!CstOp.isCImm() || CstOp.getCImm()->getBitWidth() > 64)
    return None;
  const int64_t Offset = CstOp.getCImm()->getSExtValue();

  // An offset that is a non-negative, in-range multiple of the access size is
  // valid as a scaled immediate; don't match those here.
  const bool FitsScaledImm = (Offset & (Size - 1)) == 0 && Offset >= 0 &&
                             Offset < (0x1000 << Log2_32(Size));
  if (FitsScaledImm)
    return None;

  // What is left must fit into a signed 9-bit immediate.
  if (Offset < -256 || Offset >= 256)
    return None;

  MachineOperand &BaseOp = AddrDef->getOperand(1);
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(BaseOp); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
  }};
}
/// Select a "register plus scaled unsigned 12-bit immediate" address. \p Size
/// is the size in bytes of the memory reference and determines the scale that
/// is applied to the immediate.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
                                                  unsigned Size) const {
  MachineRegisterInfo &MRI =
      Root.getParent()->getParent()->getParent()->getRegInfo();

  if (!Root.isReg())
    return None;

  MachineInstr *AddrDef = MRI.getVRegDef(Root.getReg());
  if (!AddrDef)
    return None;

  // A bare frame index is used directly as the base, with a zero offset.
  if (AddrDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
    return {{
        [=](MachineInstrBuilder &MIB) { MIB.add(AddrDef->getOperand(1)); },
        [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
    }};
  }

  if (isBaseWithConstantOffset(Root, MRI)) {
    MachineOperand &BaseOp = AddrDef->getOperand(1);
    MachineOperand &OffsetOp = AddrDef->getOperand(2);
    MachineInstr *BaseDef = MRI.getVRegDef(BaseOp.getReg());
    MachineInstr *OffsetDef = MRI.getVRegDef(OffsetOp.getReg());

    if (BaseDef && OffsetDef) {
      const int64_t Offset = static_cast<int64_t>(
          OffsetDef->getOperand(1).getCImm()->getZExtValue());
      const unsigned Scale = Log2_32(Size);

      // The offset is usable when it is a non-negative multiple of the access
      // size that is below 0x1000 scaled units.
      const bool FitsScaledImm = (Offset & (Size - 1)) == 0 && Offset >= 0 &&
                                 Offset < (0x1000 << Scale);
      if (FitsScaledImm) {
        const int64_t ScaledOffset = Offset >> Scale;

        // When the base is itself a frame index, fold that in as well.
        if (BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
          return {{
              [=](MachineInstrBuilder &MIB) {
                MIB.add(BaseDef->getOperand(1));
              },
              [=](MachineInstrBuilder &MIB) { MIB.addImm(ScaledOffset); },
          }};

        return {{
            [=](MachineInstrBuilder &MIB) { MIB.add(BaseOp); },
            [=](MachineInstrBuilder &MIB) { MIB.addImm(ScaledOffset); },
        }};
      }
    }
  }

  // If the unscaled instructions can handle this address, that is preferable
  // to the general case below, so decline to match.
  if (selectAddrModeUnscaled(Root, Size))
    return None;

  // General case: use the address register as is, with a zero offset.
  return {{
      [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
      [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
  }};
}
/// Add the value of the G_CONSTANT \p MI to \p MIB as an immediate operand.
/// \p MI must be a G_CONSTANT whose value can be looked up through the
/// register it defines; both conditions are asserted.
void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
                                                const MachineInstr &MI) const {
  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
  assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");

  // Look the constant up via the register that MI defines.
  const Register DstReg = MI.getOperand(0).getReg();
  Optional<int64_t> CstVal = getConstantVRegVal(DstReg, MRI);
  assert(CstVal && "Expected constant value");

  MIB.addImm(*CstVal);
}
namespace llvm {
/// Factory for the AArch64 instruction selector: returns a newly allocated
/// AArch64InstructionSelector constructed from \p TM, \p Subtarget and
/// \p RBI.
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &TM,
                                 AArch64Subtarget &Subtarget,
                                 AArch64RegisterBankInfo &RBI) {
  InstructionSelector *Selector =
      new AArch64InstructionSelector(TM, Subtarget, RBI);
  return Selector;
}
} // end namespace llvm