1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-22 02:33:06 +01:00

[AArch64][GlobalISel] Create a new minimal combiner pass just for -O0.

We never bothered to have a separate set of combines for -O0 in the prelegalizer
before. This results in some minor performance hits for a mode where performance
isn't a concern (although not regressing code size significantly is still preferable).

This also removes the CSE option since we don't need it for -O0.

Through experiments, I've arrived at a set of combines that gets the most code
size improvement at -O0, while reducing the amount of time spent in the combiner
by around 35% give or take.

Differential Revision: https://reviews.llvm.org/D102038
This commit is contained in:
Amara Emerson 2021-05-06 17:14:04 -07:00
parent 818c390c9c
commit 9146866d14
12 changed files with 256 additions and 81 deletions

View File

@ -658,3 +658,10 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine,
div_rem_to_divrem, funnel_shift_combines]>;
// A combine group used for prelegalizer combiners at -O0. The combines in
// this group have been selected based on experiments to balance code size and
// compile time performance.
def optnone_combines : GICombineGroup<[trivial_combines,
ptr_add_immed_chain, combines_for_extload,
not_cmp_fold, opt_brcond_by_inverting_cond]>;

View File

@ -59,7 +59,8 @@ ModulePass *createSVEIntrinsicOptsPass();
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &,
AArch64Subtarget &, AArch64RegisterBankInfo &);
FunctionPass *createAArch64PreLegalizerCombiner(bool IsOptNone);
FunctionPass *createAArch64O0PreLegalizerCombiner();
FunctionPass *createAArch64PreLegalizerCombiner();
FunctionPass *createAArch64PostLegalizerCombiner(bool IsOptNone);
FunctionPass *createAArch64PostLegalizerLowering();
FunctionPass *createAArch64PostSelectOptimize();
@ -82,6 +83,7 @@ void initializeAArch64SpeculationHardeningPass(PassRegistry&);
void initializeAArch64LoadStoreOptPass(PassRegistry&);
void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &);
void initializeAArch64SIMDInstrOptPass(PassRegistry&);
void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &);
void initializeAArch64PreLegalizerCombinerPass(PassRegistry&);
void initializeAArch64PostLegalizerCombinerPass(PassRegistry &);
void initializeAArch64PostLegalizerLoweringPass(PassRegistry &);

View File

@ -43,6 +43,13 @@ def AArch64PreLegalizerCombinerHelper: GICombinerHelper<
let AdditionalArguments = [];
}
// Combiner helper definition for the -O0 prelegalizer combiner. Its only rule
// source is the optnone_combines group, and it has its own command-line option
// name for disabling individual rules.
def AArch64O0PreLegalizerCombinerHelper: GICombinerHelper<
"AArch64GenO0PreLegalizerCombinerHelper", [optnone_combines]> {
let DisableRuleOption = "aarch64O0prelegalizercombiner-disable-rule";
let StateClass = "AArch64O0PreLegalizerCombinerHelperState";
let AdditionalArguments = [];
}
// Matchdata for combines which replace a G_SHUFFLE_VECTOR with a
// target-specific opcode.
def shuffle_matchdata : GIDefMatchData<"ShuffleVectorPseudo">;

View File

@ -184,6 +184,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
initializeAArch64ExpandPseudoPass(*PR);
initializeAArch64LoadStoreOptPass(*PR);
initializeAArch64SIMDInstrOptPass(*PR);
initializeAArch64O0PreLegalizerCombinerPass(*PR);
initializeAArch64PreLegalizerCombinerPass(*PR);
initializeAArch64PostLegalizerCombinerPass(*PR);
initializeAArch64PostLegalizerLoweringPass(*PR);
@ -562,8 +563,10 @@ bool AArch64PassConfig::addIRTranslator() {
}
void AArch64PassConfig::addPreLegalizeMachineIR() {
bool IsOptNone = getOptLevel() == CodeGenOpt::None;
addPass(createAArch64PreLegalizerCombiner(IsOptNone));
if (getOptLevel() == CodeGenOpt::None)
addPass(createAArch64O0PreLegalizerCombiner());
else
addPass(createAArch64PreLegalizerCombiner());
}
bool AArch64PassConfig::addLegalizeMachineIR() {

View File

@ -10,6 +10,8 @@ tablegen(LLVM AArch64GenDAGISel.inc -gen-dag-isel)
tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler)
tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel)
tablegen(LLVM AArch64GenGlobalISel.inc -gen-global-isel)
tablegen(LLVM AArch64GenO0PreLegalizeGICombiner.inc -gen-global-isel-combiner
-combiners="AArch64O0PreLegalizerCombinerHelper")
tablegen(LLVM AArch64GenPreLegalizeGICombiner.inc -gen-global-isel-combiner
-combiners="AArch64PreLegalizerCombinerHelper")
tablegen(LLVM AArch64GenPostLegalizeGICombiner.inc -gen-global-isel-combiner
@ -32,6 +34,7 @@ add_llvm_target(AArch64CodeGen
GISel/AArch64GlobalISelUtils.cpp
GISel/AArch64InstructionSelector.cpp
GISel/AArch64LegalizerInfo.cpp
GISel/AArch64O0PreLegalizerCombiner.cpp
GISel/AArch64PreLegalizerCombiner.cpp
GISel/AArch64PostLegalizerCombiner.cpp
GISel/AArch64PostLegalizerLowering.cpp

View File

@ -11,6 +11,7 @@
#include "AArch64GlobalISelUtils.h"
#include "AArch64InstrInfo.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@ -57,3 +58,38 @@ bool AArch64GISelUtils::isCMN(const MachineInstr *MaybeSub,
getConstantVRegValWithLookThrough(MaybeSub->getOperand(1).getReg(), MRI);
return MaybeZero && MaybeZero->Value.getZExtValue() == 0;
}
/// Try to turn a zero-valued G_MEMSET into a G_BZERO.
///
/// The rewrite is skipped when the target lowering reports no bzero libcall,
/// when the stored value is not a known constant zero, or (unless \p MinSize
/// is set) when the length is a known constant of at most 256.
///
/// \returns true if \p MI was erased and replaced by a G_BZERO.
bool AArch64GISelUtils::tryEmitBZero(MachineInstr &MI,
                                     MachineIRBuilder &MIRBuilder,
                                     bool MinSize) {
  assert(MI.getOpcode() == TargetOpcode::G_MEMSET);
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();

  // Without a bzero libcall name there is nothing to lower G_BZERO to.
  const auto &Lowering =
      *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  if (Lowering.getLibcallName(RTLIB::BZERO) == nullptr)
    return false;

  // Operand 1 is the value being stored; it must be a known constant zero.
  const auto StoredVal =
      getConstantVRegValWithLookThrough(MI.getOperand(1).getReg(), MRI);
  const bool StoresZero = StoredVal && StoredVal->Value.getSExtValue() == 0;
  if (!StoresZero)
    return false;

  // It's not faster to use bzero rather than memset for sizes <= 256.
  // However, it *does* save us a mov from wzr, so when going for minsize the
  // size check is skipped and bzero is used even if it's slower.
  if (!MinSize) {
    // An unknown (non-constant) size is assumed to favour bzero.
    const auto KnownLen =
        getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
    if (KnownLen && KnownLen->Value.getSExtValue() <= 256)
      return false;
  }

  // Build the G_BZERO at MI's position from MI's operands 0 and 2, carrying
  // over the immediate in operand 3 and the first memory operand, then drop
  // the original G_MEMSET.
  MIRBuilder.setInstrAndDebugLoc(MI);
  auto BZero = MIRBuilder.buildInstr(TargetOpcode::G_BZERO, {},
                                     {MI.getOperand(0), MI.getOperand(2)});
  BZero.addImm(MI.getOperand(3).getImm());
  BZero.addMemOperand(*MI.memoperands_begin());
  MI.eraseFromParent();
  return true;
}

View File

@ -13,6 +13,7 @@
#define LLVM_LIB_TARGET_AARCH64_GISEL_AARCH64GLOBALISELUTILS_H
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/Register.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
@ -44,6 +45,14 @@ Optional<int64_t> getAArch64VectorSplatScalar(const MachineInstr &MI,
bool isCMN(const MachineInstr *MaybeSub, const CmpInst::Predicate &Pred,
const MachineRegisterInfo &MRI);
/// Replace a G_MEMSET with a value of 0 with a G_BZERO instruction if it is
/// supported and beneficial to do so.
///
/// \note This only applies on Darwin.
///
/// \returns true if \p MI was replaced with a G_BZERO.
bool tryEmitBZero(MachineInstr &MI, MachineIRBuilder &MIRBuilder, bool MinSize);
} // namespace AArch64GISelUtils
} // namespace llvm

View File

@ -0,0 +1,171 @@
//=== AArch64O0PreLegalizerCombiner.cpp -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// before the legalizer. It is the minimal variant of the prelegalizer combiner,
// used at -O0.
//
//===----------------------------------------------------------------------===//
#include "AArch64GlobalISelUtils.h"
#include "AArch64TargetMachine.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Debug.h"
#define DEBUG_TYPE "aarch64-O0-prelegalizer-combiner"
using namespace llvm;
using namespace MIPatternMatch;
/// Holds the CombinerHelper reference for the -O0 prelegalizer combiner. This
/// class is named as the `StateClass` of AArch64O0PreLegalizerCombinerHelper
/// in the combiner TableGen definition.
class AArch64O0PreLegalizerCombinerHelperState {
public:
  AArch64O0PreLegalizerCombinerHelperState(CombinerHelper &Helper)
      : Helper(Helper) {}

protected:
  /// The shared combiner helper, stored by reference.
  CombinerHelper &Helper;
};
#define AARCH64O0PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
#include "AArch64GenO0PreLegalizeGICombiner.inc"
#undef AARCH64O0PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
namespace {
#define AARCH64O0PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
#include "AArch64GenO0PreLegalizeGICombiner.inc"
#undef AARCH64O0PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
class AArch64O0PreLegalizerCombinerInfo : public CombinerInfo {
GISelKnownBits *KB;
MachineDominatorTree *MDT;
AArch64GenO0PreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
public:
AArch64O0PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
GISelKnownBits *KB,
MachineDominatorTree *MDT)
: CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
/*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
KB(KB), MDT(MDT) {
if (!GeneratedRuleCfg.parseCommandLineOption())
report_fatal_error("Invalid rule identifier");
}
virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
MachineIRBuilder &B) const override;
};
/// Try to combine \p MI: first through the generated rule set, then through a
/// few opcode-specific combines.
///
/// \returns true if any combine reported a change.
bool AArch64O0PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
                                                MachineInstr &MI,
                                                MachineIRBuilder &B) const {
  CombinerHelper Helper(Observer, B, KB, MDT);
  AArch64GenO0PreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper);

  // The generated rules get the first shot at the instruction.
  if (Generated.tryCombineAll(Observer, MI, B))
    return true;

  // At -O0 a maxlen of 32 is used when inlining memcpy-family operations.
  constexpr unsigned InlineMaxLen = 32;

  switch (MI.getOpcode()) {
  default:
    return false;
  case TargetOpcode::G_CONCAT_VECTORS:
    return Helper.tryCombineConcatVectors(MI);
  case TargetOpcode::G_SHUFFLE_VECTOR:
    return Helper.tryCombineShuffleVector(MI);
  case TargetOpcode::G_MEMCPY:
  case TargetOpcode::G_MEMMOVE:
    // Try to inline the memcpy-type operation.
    return Helper.tryCombineMemCpyFamily(MI, InlineMaxLen);
  case TargetOpcode::G_MEMSET:
    // Inlining is tried first; only if that does not apply is the memset
    // considered for conversion to G_BZERO.
    return Helper.tryCombineMemCpyFamily(MI, InlineMaxLen) ||
           llvm::AArch64GISelUtils::tryEmitBZero(MI, B, EnableMinSize);
  }
}
#define AARCH64O0PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
#include "AArch64GenO0PreLegalizeGICombiner.inc"
#undef AARCH64O0PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
// Pass boilerplate
// ================
/// Machine function pass that runs the -O0 prelegalizer combiner.
class AArch64O0PreLegalizerCombiner : public MachineFunctionPass {
public:
  AArch64O0PreLegalizerCombiner();

  void getAnalysisUsage(AnalysisUsage &AU) const override;
  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "AArch64O0PreLegalizerCombiner";
  }

  /// Pass ID handed to the MachineFunctionPass base by the constructor.
  static char ID;
};
} // end anonymous namespace
/// Declare the analyses this pass requires and the ones it preserves.
void AArch64O0PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
// TargetPassConfig is fetched in runOnMachineFunction() and handed to the
// Combiner.
AU.addRequired<TargetPassConfig>();
AU.setPreservesCFG();
getSelectionDAGFallbackAnalysisUsage(AU);
// Known-bits is the only analysis passed to the combiner info; this pass
// requests neither a dominator tree nor CSE info (both are passed as nullptr
// in runOnMachineFunction()).
AU.addRequired<GISelKnownBitsAnalysis>();
AU.addPreserved<GISelKnownBitsAnalysis>();
MachineFunctionPass::getAnalysisUsage(AU);
}
/// Construct the pass and run its registration hook against the global
/// PassRegistry.
AArch64O0PreLegalizerCombiner::AArch64O0PreLegalizerCombiner()
    : MachineFunctionPass(ID) {
  PassRegistry &Registry = *PassRegistry::getPassRegistry();
  initializeAArch64O0PreLegalizerCombinerPass(Registry);
}
/// Run the -O0 combiner over \p MF.
///
/// \returns the result of Combiner::combineMachineInstrs, or false without
/// doing anything when \p MF already carries the FailedISel property.
bool AArch64O0PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  const bool ISelFailed = MF.getProperties().hasProperty(
      MachineFunctionProperties::Property::FailedISel);
  if (ISelFailed)
    return false;

  auto &TPC = getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);

  // EnableOpt is hard-wired to false, and neither a dominator tree nor CSE
  // info is supplied to the combiner.
  AArch64O0PreLegalizerCombinerInfo PCInfo(/*EnableOpt*/ false, F.hasOptSize(),
                                           F.hasMinSize(), KB,
                                           /*MDT*/ nullptr);
  Combiner C(PCInfo, &TPC);
  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}
// Storage for the pass's static ID member, which the constructor hands to
// MachineFunctionPass.
char AArch64O0PreLegalizerCombiner::ID = 0;
// Pass registration. DEBUG_TYPE ("aarch64-O0-prelegalizer-combiner") is used
// as the pass argument string.
INITIALIZE_PASS_BEGIN(AArch64O0PreLegalizerCombiner, DEBUG_TYPE,
"Combine AArch64 machine instrs before legalization",
false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
// NOTE(review): getAnalysisUsage() does not require GISelCSEAnalysisWrapperPass
// and runOnMachineFunction() passes a null CSEInfo, so this dependency looks
// like a leftover from the non-O0 combiner -- confirm before removing.
INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
INITIALIZE_PASS_END(AArch64O0PreLegalizerCombiner, DEBUG_TYPE,
"Combine AArch64 machine instrs before legalization", false,
false)
/// Factory for the -O0 prelegalizer combiner pass; returns a newly allocated
/// pass object.
FunctionPass *llvm::createAArch64O0PreLegalizerCombiner() {
  return new AArch64O0PreLegalizerCombiner();
}

View File

@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------------===//
#include "AArch64GlobalISelUtils.h"
#include "AArch64TargetMachine.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
@ -219,46 +220,6 @@ static bool applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
return true;
}
/// Rewrite a G_MEMSET that stores zero into a G_BZERO instruction when a bzero
/// libcall is available and the rewrite is considered worthwhile.
///
/// \note This only applies on Darwin.
///
/// \returns true if \p MI was replaced with a G_BZERO.
static bool tryEmitBZero(MachineInstr &MI, MachineIRBuilder &MIRBuilder,
                         bool MinSize) {
  assert(MI.getOpcode() == TargetOpcode::G_MEMSET);
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();

  // Give up if the target lowering has no name for the bzero libcall.
  const auto &Lowering =
      *MIRBuilder.getMF().getSubtarget().getTargetLowering();
  if (Lowering.getLibcallName(RTLIB::BZERO) == nullptr)
    return false;

  // The stored value (operand 1) has to be a known constant zero.
  const auto FillVal =
      getConstantVRegValWithLookThrough(MI.getOperand(1).getReg(), MRI);
  const bool FillsWithZero = FillVal && FillVal->Value.getSExtValue() == 0;
  if (!FillsWithZero)
    return false;

  // It's not faster to use bzero rather than memset for sizes <= 256.
  // However, it *does* save us a mov from wzr, so for minsize the size check
  // is skipped and bzero is used even if it's slower.
  if (!MinSize) {
    // If the size is not a known constant, bzero is assumed to be better.
    const auto ConstLen =
        getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
    if (ConstLen && ConstLen->Value.getSExtValue() <= 256)
      return false;
  }

  // Emit the G_BZERO in place of MI, reusing MI's operands 0 and 2, the
  // immediate in operand 3 and the first memory operand.
  MIRBuilder.setInstrAndDebugLoc(MI);
  auto NewBZero = MIRBuilder.buildInstr(TargetOpcode::G_BZERO, {},
                                        {MI.getOperand(0), MI.getOperand(2)});
  NewBZero.addImm(MI.getOperand(3).getImm());
  NewBZero.addMemOperand(*MI.memoperands_begin());
  MI.eraseFromParent();
  return true;
}
class AArch64PreLegalizerCombinerHelperState {
protected:
CombinerHelper &Helper;
@ -321,7 +282,7 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
if (!EnableMinSize && Helper.tryCombineMemCpyFamily(MI, MaxLen))
return true;
if (Opc == TargetOpcode::G_MEMSET)
return tryEmitBZero(MI, B, EnableMinSize);
return llvm::AArch64GISelUtils::tryEmitBZero(MI, B, EnableMinSize);
return false;
}
}
@ -340,15 +301,13 @@ class AArch64PreLegalizerCombiner : public MachineFunctionPass {
public:
static char ID;
AArch64PreLegalizerCombiner(bool IsOptNone = false);
AArch64PreLegalizerCombiner();
StringRef getPassName() const override { return "AArch64PreLegalizerCombiner"; }
bool runOnMachineFunction(MachineFunction &MF) override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
private:
bool IsOptNone;
};
} // end anonymous namespace
@ -358,17 +317,15 @@ void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
getSelectionDAGFallbackAnalysisUsage(AU);
AU.addRequired<GISelKnownBitsAnalysis>();
AU.addPreserved<GISelKnownBitsAnalysis>();
if (!IsOptNone) {
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
}
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
AU.addRequired<GISelCSEAnalysisWrapperPass>();
AU.addPreserved<GISelCSEAnalysisWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner(bool IsOptNone)
: MachineFunctionPass(ID), IsOptNone(IsOptNone) {
AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner()
: MachineFunctionPass(ID) {
initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}
@ -387,8 +344,7 @@ bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
bool EnableOpt =
MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
MachineDominatorTree *MDT =
IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
AArch64PreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
F.hasMinSize(), KB, MDT);
Combiner C(PCInfo, &TPC);
@ -408,7 +364,7 @@ INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE,
namespace llvm {
FunctionPass *createAArch64PreLegalizerCombiner(bool IsOptNone) {
return new AArch64PreLegalizerCombiner(IsOptNone);
FunctionPass *createAArch64PreLegalizerCombiner() {
return new AArch64PreLegalizerCombiner();
}
} // end namespace llvm

View File

@ -56,9 +56,11 @@
; VERIFY-NEXT: Verify generated machine code
; ENABLED-NEXT: Analysis for ComputingKnownBits
; ENABLED-O1-NEXT: MachineDominator Tree Construction
; ENABLED-NEXT: Analysis containing CSE Info
; ENABLED-NEXT: PreLegalizerCombiner
; ENABLED-O1-NEXT: Analysis containing CSE Info
; ENABLED-O1-NEXT: PreLegalizerCombiner
; VERIFY-O0-NEXT: AArch64O0PreLegalizerCombiner
; VERIFY-NEXT: Verify generated machine code
; VERIFY-O0-NEXT: Analysis containing CSE Info
; ENABLED-NEXT: Legalizer
; VERIFY-NEXT: Verify generated machine code
; ENABLED: RegBankSelect

View File

@ -34,8 +34,8 @@
; CHECK-NEXT: Analysis containing CSE Info
; CHECK-NEXT: IRTranslator
; CHECK-NEXT: Analysis for ComputingKnownBits
; CHECK-NEXT: AArch64O0PreLegalizerCombiner
; CHECK-NEXT: Analysis containing CSE Info
; CHECK-NEXT: AArch64PreLegalizerCombiner
; CHECK-NEXT: Legalizer
; CHECK-NEXT: AArch64PostLegalizerLowering
; CHECK-NEXT: RegBankSelect

View File

@ -1,21 +0,0 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -O0 -mtriple=aarch64-unknown-unknown | FileCheck %s
define <2 x i64> @z(i64* nocapture nonnull readonly %p) {
; CHECK-LABEL: z:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d1, [x0]
; CHECK-NEXT: ldr d2, [x0, #8]
; CHECK-NEXT: // implicit-def: $q0
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: // implicit-def: $q1
; CHECK-NEXT: mov v1.16b, v2.16b
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret
%b = load i64, i64* %p
%p2 = getelementptr i64, i64* %p, i64 1
%bb = load i64, i64* %p2
%r1 = insertelement <2 x i64> zeroinitializer, i64 %b, i32 0
%r2 = insertelement <2 x i64> %r1, i64 %bb, i32 1
ret <2 x i64> %r2
}