1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-19 02:52:53 +02:00
llvm-mirror/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
Chandler Carruth ae65e281f3 Update the file headers across all of the LLVM projects in the monorepo
to reflect the new license.

We understand that people may be surprised that we're moving the header
entirely to discuss the new license. We checked this carefully with the
Foundation's lawyer and we believe this is the correct approach.

Essentially, all code in the project is now made available by the LLVM
project under our new license, so you will see that the license headers
include that license only. Some of our contributors have contributed
code under our old license, and accordingly, we have retained a copy of
our old license notice in the top-level files in each project and
repository.

llvm-svn: 351636
2019-01-19 08:50:56 +00:00

741 lines
26 KiB
C++

//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a pass that performs optimization on SIMD instructions
// with high latency by splitting them into more efficient series of
// instructions.
//
// 1. Rewrite certain SIMD instructions with vector element due to their
// inefficiency on some targets.
//
// For example:
// fmla v0.4s, v1.4s, v2.s[1]
//
// Is rewritten into:
// dup v3.4s, v2.s[1]
// fmla v0.4s, v1.4s, v3.4s
//
// 2. Rewrite interleaved memory access instructions due to their
// inefficiency on some targets.
//
// For example:
// st2 {v0.4s, v1.4s}, addr
//
// Is rewritten into:
// zip1 v2.4s, v0.4s, v1.4s
// zip2 v3.4s, v0.4s, v1.4s
// stp q2, q3, addr
//
//===----------------------------------------------------------------------===//
#include "AArch64InstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include <unordered_map>
using namespace llvm;
#define DEBUG_TYPE "aarch64-simdinstr-opt"
STATISTIC(NumModifiedInstr,
"Number of SIMD instructions modified");
#define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME \
"AArch64 SIMD instructions optimization pass"
namespace {
struct AArch64SIMDInstrOpt : public MachineFunctionPass {
static char ID;
const TargetInstrInfo *TII;
MachineRegisterInfo *MRI;
TargetSchedModel SchedModel;
// The two maps below are used to cache decisions instead of recomputing:
// This is used to cache instruction replacement decisions within function
// units and across function units.
std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
// This is used to cache the decision of whether to leave the interleaved
// store instructions replacement pass early or not for a particular target.
std::unordered_map<std::string, bool> InterlEarlyExit;
typedef enum {
VectorElem,
Interleave
} Subpass;
// Instruction represented by OrigOpc is replaced by instructions in ReplOpc.
struct InstReplInfo {
unsigned OrigOpc;
std::vector<unsigned> ReplOpc;
const TargetRegisterClass RC;
};
#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
{OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
OpcR7, OpcR8, OpcR9, RC) \
{OpcOrg, \
{OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, OpcR8, OpcR9}, RC}
// The Instruction Replacement Table:
std::vector<InstReplInfo> IRT = {
// ST2 instructions
RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
AArch64::STPQi, AArch64::FPR128RegClass),
RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
AArch64::STPQi, AArch64::FPR128RegClass),
RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
AArch64::STPDi, AArch64::FPR64RegClass),
RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
AArch64::STPQi, AArch64::FPR128RegClass),
RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
AArch64::STPDi, AArch64::FPR64RegClass),
RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
AArch64::STPQi, AArch64::FPR128RegClass),
RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
AArch64::STPDi, AArch64::FPR64RegClass),
// ST4 instructions
RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
};
// A costly instruction is replaced in this work by N efficient instructions
// The maximum of N is curently 10 and it is for ST4 case.
static const unsigned MaxNumRepl = 10;
AArch64SIMDInstrOpt() : MachineFunctionPass(ID) {
initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry());
}
/// Based only on latency of instructions, determine if it is cost efficient
/// to replace the instruction InstDesc by the instructions stored in the
/// array InstDescRepl.
/// Return true if replacement is expected to be faster.
bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID);
/// Determine if we need to exit the instruction replacement optimization
/// passes early. This makes sure that no compile time is spent in this pass
/// for targets with no need for any of these optimizations.
/// Return true if early exit of the pass is recommended.
bool shouldExitEarly(MachineFunction *MF, Subpass SP);
/// Check whether an equivalent DUP instruction has already been
/// created or not.
/// Return true when the DUP instruction already exists. In this case,
/// DestReg will point to the destination of the already created DUP.
bool reuseDUP(MachineInstr &MI, unsigned DupOpcode, unsigned SrcReg,
unsigned LaneNumber, unsigned *DestReg) const;
/// Certain SIMD instructions with vector element operand are not efficient.
/// Rewrite them into SIMD instructions with vector operands. This rewrite
/// is driven by the latency of the instructions.
/// Return true if the SIMD instruction is modified.
bool optimizeVectElement(MachineInstr &MI);
/// Process The REG_SEQUENCE instruction, and extract the source
/// operands of the ST2/4 instruction from it.
/// Example of such instructions.
/// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
/// Return true when the instruction is processed successfully.
bool processSeqRegInst(MachineInstr *DefiningMI, unsigned* StReg,
unsigned* StRegKill, unsigned NumArg) const;
/// Load/Store Interleaving instructions are not always beneficial.
/// Replace them by ZIP instructionand classical load/store.
/// Return true if the SIMD instruction is modified.
bool optimizeLdStInterleave(MachineInstr &MI);
/// Return the number of useful source registers for this
/// instruction (2 for ST2 and 4 for ST4).
unsigned determineSrcReg(MachineInstr &MI) const;
bool runOnMachineFunction(MachineFunction &Fn) override;
StringRef getPassName() const override {
return AARCH64_VECTOR_BY_ELEMENT_OPT_NAME;
}
};
char AArch64SIMDInstrOpt::ID = 0;
} // end anonymous namespace
INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt",
AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)
/// Based only on latency of instructions, determine if it is cost efficient
/// to replace the instruction InstDesc by the instructions stored in the
/// array InstDescRepl.
/// Return true if replacement is expected to be faster.
bool AArch64SIMDInstrOpt::
shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) {
// Check if replacement decision is already available in the cached table.
// if so, return it.
std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU();
auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
if (SIMDInstrTable.find(InstID) != SIMDInstrTable.end())
return SIMDInstrTable[InstID];
unsigned SCIdx = InstDesc->getSchedClass();
const MCSchedClassDesc *SCDesc =
SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
// If a target does not define resources for the instructions
// of interest, then return false for no replacement.
const MCSchedClassDesc *SCDescRepl;
if (!SCDesc->isValid() || SCDesc->isVariant())
{
SIMDInstrTable[InstID] = false;
return false;
}
for (auto IDesc : InstDescRepl)
{
SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
IDesc->getSchedClass());
if (!SCDescRepl->isValid() || SCDescRepl->isVariant())
{
SIMDInstrTable[InstID] = false;
return false;
}
}
// Replacement cost.
unsigned ReplCost = 0;
for (auto IDesc :InstDescRepl)
ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());
if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost)
{
SIMDInstrTable[InstID] = true;
return true;
}
else
{
SIMDInstrTable[InstID] = false;
return false;
}
}
/// Determine if we need to exit this pass for a kind of instruction replacement
/// early. This makes sure that no compile time is spent in this pass for
/// targets with no need for any of these optimizations beyond performing this
/// check.
/// Return true if early exit of this pass for a kind of instruction
/// replacement is recommended for a target.
bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
const MCInstrDesc* OriginalMCID;
SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;
switch (SP) {
// For this optimization, check by comparing the latency of a representative
// instruction to that of the replacement instructions.
// TODO: check for all concerned instructions.
case VectorElem:
OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
ReplInstrMCID.push_back(&TII->get(AArch64::FMLAv4f32));
if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
return false;
break;
// For this optimization, check for all concerned instructions.
case Interleave:
std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU();
if (InterlEarlyExit.find(Subtarget) != InterlEarlyExit.end())
return InterlEarlyExit[Subtarget];
for (auto &I : IRT) {
OriginalMCID = &TII->get(I.OrigOpc);
for (auto &Repl : I.ReplOpc)
ReplInstrMCID.push_back(&TII->get(Repl));
if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
InterlEarlyExit[Subtarget] = false;
return false;
}
ReplInstrMCID.clear();
}
InterlEarlyExit[Subtarget] = true;
break;
}
return true;
}
/// Check whether an equivalent DUP instruction has already been
/// created or not.
/// Return true when the DUP instruction already exists. In this case,
/// DestReg will point to the destination of the already created DUP.
bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
unsigned SrcReg, unsigned LaneNumber,
unsigned *DestReg) const {
for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
MII != MIE;) {
MII--;
MachineInstr *CurrentMI = &*MII;
if (CurrentMI->getOpcode() == DupOpcode &&
CurrentMI->getNumOperands() == 3 &&
CurrentMI->getOperand(1).getReg() == SrcReg &&
CurrentMI->getOperand(2).getImm() == LaneNumber) {
*DestReg = CurrentMI->getOperand(0).getReg();
return true;
}
}
return false;
}
/// Certain SIMD instructions with vector element operand are not efficient.
/// Rewrite them into SIMD instructions with vector operands. This rewrite
/// is driven by the latency of the instructions.
/// The instruction of concerns are for the time being FMLA, FMLS, FMUL,
/// and FMULX and hence they are hardcoded.
///
/// For example:
/// fmla v0.4s, v1.4s, v2.s[1]
///
/// Is rewritten into
/// dup v3.4s, v2.s[1] // DUP not necessary if redundant
/// fmla v0.4s, v1.4s, v3.4s
///
/// Return true if the SIMD instruction is modified.
bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
const MCInstrDesc *MulMCID, *DupMCID;
const TargetRegisterClass *RC = &AArch64::FPR128RegClass;
switch (MI.getOpcode()) {
default:
return false;
// 4X32 instructions
case AArch64::FMLAv4i32_indexed:
DupMCID = &TII->get(AArch64::DUPv4i32lane);
MulMCID = &TII->get(AArch64::FMLAv4f32);
break;
case AArch64::FMLSv4i32_indexed:
DupMCID = &TII->get(AArch64::DUPv4i32lane);
MulMCID = &TII->get(AArch64::FMLSv4f32);
break;
case AArch64::FMULXv4i32_indexed:
DupMCID = &TII->get(AArch64::DUPv4i32lane);
MulMCID = &TII->get(AArch64::FMULXv4f32);
break;
case AArch64::FMULv4i32_indexed:
DupMCID = &TII->get(AArch64::DUPv4i32lane);
MulMCID = &TII->get(AArch64::FMULv4f32);
break;
// 2X64 instructions
case AArch64::FMLAv2i64_indexed:
DupMCID = &TII->get(AArch64::DUPv2i64lane);
MulMCID = &TII->get(AArch64::FMLAv2f64);
break;
case AArch64::FMLSv2i64_indexed:
DupMCID = &TII->get(AArch64::DUPv2i64lane);
MulMCID = &TII->get(AArch64::FMLSv2f64);
break;
case AArch64::FMULXv2i64_indexed:
DupMCID = &TII->get(AArch64::DUPv2i64lane);
MulMCID = &TII->get(AArch64::FMULXv2f64);
break;
case AArch64::FMULv2i64_indexed:
DupMCID = &TII->get(AArch64::DUPv2i64lane);
MulMCID = &TII->get(AArch64::FMULv2f64);
break;
// 2X32 instructions
case AArch64::FMLAv2i32_indexed:
RC = &AArch64::FPR64RegClass;
DupMCID = &TII->get(AArch64::DUPv2i32lane);
MulMCID = &TII->get(AArch64::FMLAv2f32);
break;
case AArch64::FMLSv2i32_indexed:
RC = &AArch64::FPR64RegClass;
DupMCID = &TII->get(AArch64::DUPv2i32lane);
MulMCID = &TII->get(AArch64::FMLSv2f32);
break;
case AArch64::FMULXv2i32_indexed:
RC = &AArch64::FPR64RegClass;
DupMCID = &TII->get(AArch64::DUPv2i32lane);
MulMCID = &TII->get(AArch64::FMULXv2f32);
break;
case AArch64::FMULv2i32_indexed:
RC = &AArch64::FPR64RegClass;
DupMCID = &TII->get(AArch64::DUPv2i32lane);
MulMCID = &TII->get(AArch64::FMULv2f32);
break;
}
SmallVector<const MCInstrDesc*, 2> ReplInstrMCID;
ReplInstrMCID.push_back(DupMCID);
ReplInstrMCID.push_back(MulMCID);
if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
ReplInstrMCID))
return false;
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock &MBB = *MI.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
// Get the operands of the current SIMD arithmetic instruction.
unsigned MulDest = MI.getOperand(0).getReg();
unsigned SrcReg0 = MI.getOperand(1).getReg();
unsigned Src0IsKill = getKillRegState(MI.getOperand(1).isKill());
unsigned SrcReg1 = MI.getOperand(2).getReg();
unsigned Src1IsKill = getKillRegState(MI.getOperand(2).isKill());
unsigned DupDest;
// Instructions of interest have either 4 or 5 operands.
if (MI.getNumOperands() == 5) {
unsigned SrcReg2 = MI.getOperand(3).getReg();
unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
unsigned LaneNumber = MI.getOperand(4).getImm();
// Create a new DUP instruction. Note that if an equivalent DUP instruction
// has already been created before, then use that one instead of creating
// a new one.
if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg2, LaneNumber, &DupDest)) {
DupDest = MRI.createVirtualRegister(RC);
BuildMI(MBB, MI, DL, *DupMCID, DupDest)
.addReg(SrcReg2, Src2IsKill)
.addImm(LaneNumber);
}
BuildMI(MBB, MI, DL, *MulMCID, MulDest)
.addReg(SrcReg0, Src0IsKill)
.addReg(SrcReg1, Src1IsKill)
.addReg(DupDest, Src2IsKill);
} else if (MI.getNumOperands() == 4) {
unsigned LaneNumber = MI.getOperand(3).getImm();
if (!reuseDUP(MI, DupMCID->getOpcode(), SrcReg1, LaneNumber, &DupDest)) {
DupDest = MRI.createVirtualRegister(RC);
BuildMI(MBB, MI, DL, *DupMCID, DupDest)
.addReg(SrcReg1, Src1IsKill)
.addImm(LaneNumber);
}
BuildMI(MBB, MI, DL, *MulMCID, MulDest)
.addReg(SrcReg0, Src0IsKill)
.addReg(DupDest, Src1IsKill);
} else {
return false;
}
++NumModifiedInstr;
return true;
}
/// Load/Store Interleaving instructions are not always beneficial.
/// Replace them by ZIP instructions and classical load/store.
///
/// For example:
/// st2 {v0.4s, v1.4s}, addr
///
/// Is rewritten into:
/// zip1 v2.4s, v0.4s, v1.4s
/// zip2 v3.4s, v0.4s, v1.4s
/// stp q2, q3, addr
//
/// For example:
/// st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
///
/// Is rewritten into:
/// zip1 v4.4s, v0.4s, v2.4s
/// zip2 v5.4s, v0.4s, v2.4s
/// zip1 v6.4s, v1.4s, v3.4s
/// zip2 v7.4s, v1.4s, v3.4s
/// zip1 v8.4s, v4.4s, v6.4s
/// zip2 v9.4s, v4.4s, v6.4s
/// zip1 v10.4s, v5.4s, v7.4s
/// zip2 v11.4s, v5.4s, v7.4s
/// stp q8, q9, addr
/// stp q10, q11, addr+32
///
/// Currently only instructions related to ST2 and ST4 are considered.
/// Other may be added later.
/// Return true if the SIMD instruction is modified.
bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
unsigned SeqReg, AddrReg;
unsigned StReg[4], StRegKill[4];
MachineInstr *DefiningMI;
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock &MBB = *MI.getParent();
SmallVector<unsigned, MaxNumRepl> ZipDest;
SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;
// If current instruction matches any of the rewriting rules, then
// gather information about parameters of the new instructions.
bool Match = false;
for (auto &I : IRT) {
if (MI.getOpcode() == I.OrigOpc) {
SeqReg = MI.getOperand(0).getReg();
AddrReg = MI.getOperand(1).getReg();
DefiningMI = MRI->getUniqueVRegDef(SeqReg);
unsigned NumReg = determineSrcReg(MI);
if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
return false;
for (auto &Repl : I.ReplOpc) {
ReplInstrMCID.push_back(&TII->get(Repl));
// Generate destination registers but only for non-store instruction.
if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
}
Match = true;
break;
}
}
if (!Match)
return false;
// Determine if it is profitable to replace MI by the series of instructions
// represented in ReplInstrMCID.
if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
ReplInstrMCID))
return false;
// Generate the replacement instructions composed of ZIP1, ZIP2, and STP (at
// this point, the code generation is hardcoded and does not rely on the IRT
// table used above given that code generation for ST2 replacement is somewhat
// different than for ST4 replacement. We could have added more info into the
// table related to how we build new instructions but we may be adding more
// complexity with that).
switch (MI.getOpcode()) {
default:
return false;
case AArch64::ST2Twov16b:
case AArch64::ST2Twov8b:
case AArch64::ST2Twov8h:
case AArch64::ST2Twov4h:
case AArch64::ST2Twov4s:
case AArch64::ST2Twov2s:
case AArch64::ST2Twov2d:
// ZIP instructions
BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
.addReg(StReg[0])
.addReg(StReg[1]);
BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
.addReg(StReg[0], StRegKill[0])
.addReg(StReg[1], StRegKill[1]);
// STP instructions
BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
.addReg(ZipDest[0])
.addReg(ZipDest[1])
.addReg(AddrReg)
.addImm(0);
break;
case AArch64::ST4Fourv16b:
case AArch64::ST4Fourv8b:
case AArch64::ST4Fourv8h:
case AArch64::ST4Fourv4h:
case AArch64::ST4Fourv4s:
case AArch64::ST4Fourv2s:
case AArch64::ST4Fourv2d:
// ZIP instructions
BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
.addReg(StReg[0])
.addReg(StReg[2]);
BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
.addReg(StReg[0], StRegKill[0])
.addReg(StReg[2], StRegKill[2]);
BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
.addReg(StReg[1])
.addReg(StReg[3]);
BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
.addReg(StReg[1], StRegKill[1])
.addReg(StReg[3], StRegKill[3]);
BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4])
.addReg(ZipDest[0])
.addReg(ZipDest[2]);
BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5])
.addReg(ZipDest[0])
.addReg(ZipDest[2]);
BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6])
.addReg(ZipDest[1])
.addReg(ZipDest[3]);
BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7])
.addReg(ZipDest[1])
.addReg(ZipDest[3]);
// stp instructions
BuildMI(MBB, MI, DL, *ReplInstrMCID[8])
.addReg(ZipDest[4])
.addReg(ZipDest[5])
.addReg(AddrReg)
.addImm(0);
BuildMI(MBB, MI, DL, *ReplInstrMCID[9])
.addReg(ZipDest[6])
.addReg(ZipDest[7])
.addReg(AddrReg)
.addImm(2);
break;
}
++NumModifiedInstr;
return true;
}
/// Process The REG_SEQUENCE instruction, and extract the source
/// operands of the ST2/4 instruction from it.
/// Example of such instruction.
/// %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
/// Return true when the instruction is processed successfully.
bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
unsigned* StReg, unsigned* StRegKill, unsigned NumArg) const {
assert (DefiningMI != NULL);
if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
return false;
for (unsigned i=0; i<NumArg; i++) {
StReg[i] = DefiningMI->getOperand(2*i+1).getReg();
StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill());
// Sanity check for the other arguments.
if (DefiningMI->getOperand(2*i+2).isImm()) {
switch (DefiningMI->getOperand(2*i+2).getImm()) {
default:
return false;
case AArch64::dsub0:
case AArch64::dsub1:
case AArch64::dsub2:
case AArch64::dsub3:
case AArch64::qsub0:
case AArch64::qsub1:
case AArch64::qsub2:
case AArch64::qsub3:
break;
}
}
else
return false;
}
return true;
}
/// Return the number of useful source registers for this instruction
/// (2 for ST2 and 4 for ST4).
unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
switch (MI.getOpcode()) {
default:
llvm_unreachable("Unsupported instruction for this pass");
case AArch64::ST2Twov16b:
case AArch64::ST2Twov8b:
case AArch64::ST2Twov8h:
case AArch64::ST2Twov4h:
case AArch64::ST2Twov4s:
case AArch64::ST2Twov2s:
case AArch64::ST2Twov2d:
return 2;
case AArch64::ST4Fourv16b:
case AArch64::ST4Fourv8b:
case AArch64::ST4Fourv8h:
case AArch64::ST4Fourv4h:
case AArch64::ST4Fourv4s:
case AArch64::ST4Fourv2s:
case AArch64::ST4Fourv2d:
return 4;
}
}
bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
TII = MF.getSubtarget().getInstrInfo();
MRI = &MF.getRegInfo();
const TargetSubtargetInfo &ST = MF.getSubtarget();
const AArch64InstrInfo *AAII =
static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
if (!AAII)
return false;
SchedModel.init(&ST);
if (!SchedModel.hasInstrSchedModel())
return false;
bool Changed = false;
for (auto OptimizationKind : {VectorElem, Interleave}) {
if (!shouldExitEarly(&MF, OptimizationKind)) {
SmallVector<MachineInstr *, 8> RemoveMIs;
for (MachineBasicBlock &MBB : MF) {
for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
MII != MIE;) {
MachineInstr &MI = *MII;
bool InstRewrite;
if (OptimizationKind == VectorElem)
InstRewrite = optimizeVectElement(MI) ;
else
InstRewrite = optimizeLdStInterleave(MI);
if (InstRewrite) {
// Add MI to the list of instructions to be removed given that it
// has been replaced.
RemoveMIs.push_back(&MI);
Changed = true;
}
++MII;
}
}
for (MachineInstr *MI : RemoveMIs)
MI->eraseFromParent();
}
}
return Changed;
}
/// Returns an instance of the high cost ASIMD instruction replacement
/// optimization pass.
FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
return new AArch64SIMDInstrOpt();
}