AArch64: Do not test for CPUs, use SubtargetFeatures
Testing for specific CPUs has a number of problems; it is better to use subtarget features:
- When a tweak is added for a specific CPU, it is often desirable for the next version of that CPU as well, yet we often forget to add it.
- It is hard to keep track of checks scattered around the target code; declaring all target specifics together with the CPU in the tablegen file is a clearer representation.
- Subtarget features can be tweaked from the command line.

To discourage people from using CPU checks in the future, I removed the isCortexXX(), isCyclone(), ... functions. I added a getProcFamily() function for exceptional circumstances, but made it clear in the comment that its use is discouraged.

Reformatted the feature lists in AArch64.td to one feature per line, in alphabetical order, to simplify merging and sorting for out-of-tree tweaks.

No functional change intended.

Differential Revision: http://reviews.llvm.org/D20762

llvm-svn: 271555
parent 647e745fb4
commit 5a2d283ab8
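Before the diff, a brief sketch (not part of the commit) of the mechanism the change relies on, traced through merge-narrow-ld, one of the features the patch introduces. The identifiers are quoted from the diff below; the comments are explanatory only.

// Sketch only: how a SubtargetFeature replaces an explicit CPU check.
// This is the definition the patch adds to AArch64.td.
//  * "merge-narrow-ld" is the command-line spelling, e.g.
//      llc -mcpu=cortex-a57 -mattr=-merge-narrow-ld   (or +merge-narrow-ld)
//  * "MergeNarrowLoads" names the bool declared in AArch64Subtarget and set
//    by the tablegen-generated ParseSubtargetFeatures() when the feature is on.
//  * Backend code then asks Subtarget->mergeNarrowLoads() instead of
//    checking isCortexA57() || isKryo().
def FeatureMergeNarrowLd : SubtargetFeature<"merge-narrow-ld",
                                            "MergeNarrowLoads", "true",
                                            "Merge narrow load instructions">;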
@@ -58,6 +58,50 @@ def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true",
                                          "Reserve X18, making it unavailable "
                                          "as a GPR">;
 
+def FeatureMergeNarrowLd : SubtargetFeature<"merge-narrow-ld",
+                                            "MergeNarrowLoads", "true",
+                                            "Merge narrow load instructions">;
+
+def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
+                                    "Use alias analysis during codegen">;
+
+def FeatureBalanceFPOps : SubtargetFeature<"balance-fp-ops", "BalanceFPOps",
+    "true",
+    "balance mix of odd and even D-registers for fp multiply(-accumulate) ops">;
+
+def FeaturePredictableSelectIsExpensive : SubtargetFeature<
+    "predictable-select-expensive", "PredictableSelectIsExpensive", "true",
+    "Prefer likely predicted branches over selects">;
+
+def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
+    "CustomAsCheapAsMove", "true",
+    "Use custom code for TargetInstrInfo::isAsCheapAsAMove()">;
+
+def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
+    "UsePostRAScheduler", "true", "Schedule again after register allocation">;
+
+def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store",
+    "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">;
+
+def FeatureAvoidQuadLdStPairs : SubtargetFeature<"no-quad-ldst-pairs",
+    "AvoidQuadLdStPairs", "true",
+    "Do not form quad load/store pair operations">;
+
+def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
+    "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern",
+    "true", "Use alternative pattern for sextload convert to f32">;
+
+def FeatureMacroOpFusion : SubtargetFeature<
+    "macroop-fusion", "HasMacroOpFusion", "true",
+    "CPU supports macro op fusion">;
+
+def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
+    "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
+    "Disable latency scheduling heuristic">;
+
+def FeatureUseRSqrt : SubtargetFeature<
+    "use-reverse-square-root", "UseRSqrt", "true", "Use reverse square root">;
+
 //===----------------------------------------------------------------------===//
 // Architectures.
 //
@@ -94,57 +138,87 @@ include "AArch64SchedM1.td"
 include "AArch64SchedKryo.td"
 
 def ProcA35     : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
-                                   "Cortex-A35 ARM processors",
-                                   [FeatureFPARMv8,
-                                   FeatureNEON,
-                                   FeatureCrypto,
+                                   "Cortex-A35 ARM processors", [
                                    FeatureCRC,
-                                   FeaturePerfMon]>;
+                                   FeatureCrypto,
+                                   FeatureFPARMv8,
+                                   FeatureNEON,
+                                   FeaturePerfMon
+                                   ]>;
 
 def ProcA53     : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
-                                   "Cortex-A53 ARM processors",
-                                   [FeatureFPARMv8,
-                                   FeatureNEON,
-                                   FeatureCrypto,
+                                   "Cortex-A53 ARM processors", [
+                                   FeatureBalanceFPOps,
                                    FeatureCRC,
-                                   FeaturePerfMon]>;
+                                   FeatureCrypto,
+                                   FeatureCustomCheapAsMoveHandling,
+                                   FeatureFPARMv8,
+                                   FeatureNEON,
+                                   FeaturePerfMon,
+                                   FeaturePostRAScheduler,
+                                   FeatureUseAA
+                                   ]>;
 
 def ProcA57     : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
-                                   "Cortex-A57 ARM processors",
-                                   [FeatureFPARMv8,
-                                   FeatureNEON,
-                                   FeatureCrypto,
+                                   "Cortex-A57 ARM processors", [
+                                   FeatureBalanceFPOps,
                                    FeatureCRC,
-                                   FeaturePerfMon]>;
+                                   FeatureCrypto,
+                                   FeatureCustomCheapAsMoveHandling,
+                                   FeatureFPARMv8,
+                                   FeatureMergeNarrowLd,
+                                   FeatureNEON,
+                                   FeaturePerfMon,
+                                   FeaturePostRAScheduler,
+                                   FeaturePredictableSelectIsExpensive
+                                   ]>;
 
 def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
-                                   "Cyclone",
-                                   [FeatureFPARMv8,
-                                   FeatureNEON,
+                                   "Cyclone", [
+                                   FeatureAlternateSExtLoadCVTF32Pattern,
                                    FeatureCrypto,
+                                   FeatureDisableLatencySchedHeuristic,
+                                   FeatureFPARMv8,
+                                   FeatureMacroOpFusion,
+                                   FeatureNEON,
                                    FeaturePerfMon,
-                                   FeatureZCRegMove, FeatureZCZeroing]>;
+                                   FeatureSlowMisaligned128Store,
+                                   FeatureZCRegMove,
+                                   FeatureZCZeroing
+                                   ]>;
 
 def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
-                                    "Samsung Exynos-M1 processors",
-                                    [FeatureFPARMv8,
-                                    FeatureNEON,
-                                    FeatureCrypto,
+                                    "Samsung Exynos-M1 processors", [
+                                    FeatureAvoidQuadLdStPairs,
                                     FeatureCRC,
-                                    FeaturePerfMon]>;
+                                    FeatureCrypto,
+                                    FeatureCustomCheapAsMoveHandling,
+                                    FeatureFPARMv8,
+                                    FeatureNEON,
+                                    FeaturePerfMon,
+                                    FeatureUseRSqrt
+                                    ]>;
 
 def ProcKryo    : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
-                                   "Qualcomm Kryo processors",
-                                   [FeatureFPARMv8,
-                                   FeatureNEON,
-                                   FeatureCrypto,
+                                   "Qualcomm Kryo processors", [
                                    FeatureCRC,
-                                   FeaturePerfMon]>;
+                                   FeatureCrypto,
+                                   FeatureCustomCheapAsMoveHandling,
+                                   FeatureFPARMv8,
+                                   FeatureMergeNarrowLd,
+                                   FeatureNEON,
+                                   FeaturePerfMon,
+                                   FeaturePostRAScheduler,
+                                   FeaturePredictableSelectIsExpensive
+                                   ]>;
 
-def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
-                                               FeatureNEON,
-                                               FeatureCRC,
-                                               FeaturePerfMon]>;
+def : ProcessorModel<"generic", NoSchedModel, [
+                     FeatureCRC,
+                     FeatureFPARMv8,
+                     FeatureNEON,
+                     FeaturePerfMon,
+                     FeaturePostRAScheduler
+                     ]>;
 
 // FIXME: Cortex-A35 is currently modelled as a Cortex-A53
 def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
@@ -314,9 +314,7 @@ bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) {
   if (skipFunction(*F.getFunction()))
     return false;
 
-  // Don't do anything if this isn't an A53 or A57.
-  if (!(F.getSubtarget<AArch64Subtarget>().isCortexA53() ||
-        F.getSubtarget<AArch64Subtarget>().isCortexA57()))
+  if (!F.getSubtarget<AArch64Subtarget>().balanceFPOps())
     return false;
 
   bool Changed = false;

@@ -634,9 +634,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     }
   }
 
-  // Prefer likely predicted branches to selects on out-of-order cores.
-  if (Subtarget->isCortexA57() || Subtarget->isKryo())
-    PredictableSelectIsExpensive = true;
+  PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
 }
 
 void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {

@@ -814,12 +812,9 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   if (Subtarget->requiresStrictAlign())
     return false;
 
-  // FIXME: This is mostly true for Cyclone, but not necessarily others.
   if (Fast) {
-    // FIXME: Define an attribute for slow unaligned accesses instead of
-    // relying on the CPU type as a proxy.
-    // On Cyclone, unaligned 128-bit stores are slow.
-    *Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 ||
+    // Some CPUs are fine with unaligned stores except for 128-bit ones.
+    *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
             // See comments in performSTORECombine() for more details about
             // these conditions.

@@ -8792,9 +8787,7 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
   // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
   // a call to that function here.
 
-  // Cyclone has bad performance on unaligned 16B stores when crossing line and
-  // page boundaries. We want to split such stores.
-  if (!Subtarget->isCyclone())
+  if (!Subtarget->isMisaligned128StoreSlow())
     return SDValue();
 
   // Don't split at -Oz.
@@ -544,8 +544,7 @@ static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) {
 // FIXME: this implementation should be micro-architecture dependent, so a
 // micro-architecture target hook should be introduced here in future.
 bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
-  if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53() &&
-      !Subtarget.isExynosM1() && !Subtarget.isKryo())
+  if (!Subtarget.hasCustomCheapAsMoveHandling())
     return MI->isAsCheapAsAMove();
 
   unsigned Imm;

@@ -559,7 +558,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
   case AArch64::ADDXri:
   case AArch64::SUBWri:
   case AArch64::SUBXri:
-    return (Subtarget.isExynosM1() ||
+    return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 ||
             MI->getOperand(3).getImm() == 0);
 
   // add/sub on register with shift

@@ -568,7 +567,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
   case AArch64::SUBWrs:
   case AArch64::SUBXrs:
     Imm = MI->getOperand(3).getImm();
-    return (Subtarget.isExynosM1() &&
+    return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
             AArch64_AM::getArithShiftValue(Imm) < 4);
 
   // logical ops on immediate

@@ -609,7 +608,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
   case AArch64::ORRWrs:
   case AArch64::ORRXrs:
     Imm = MI->getOperand(3).getImm();
-    return (Subtarget.isExynosM1() &&
+    return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
             AArch64_AM::getShiftValue(Imm) < 4 &&
             AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL);

@@ -1522,8 +1521,8 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr *MI) const {
   if (isLdStPairSuppressed(MI))
     return false;
 
-  // Do not pair quad ld/st for Exynos.
-  if (Subtarget.isExynosM1()) {
+  // On some CPUs quad load/store pairs are slower than two single load/stores.
+  if (Subtarget.avoidQuadLdStPairs()) {
     switch (MI->getOpcode()) {
     default:
       break;

@@ -1801,8 +1800,8 @@ bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr *FirstLdSt,
 
 bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
                                               MachineInstr *Second) const {
-  if (Subtarget.isCyclone()) {
-    // Cyclone can fuse CMN, CMP, TST followed by Bcc.
+  if (Subtarget.hasMacroOpFusion()) {
+    // Fuse CMN, CMP, TST followed by Bcc.
     unsigned SecondOpcode = Second->getOpcode();
     if (SecondOpcode == AArch64::Bcc) {
       switch (First->getOpcode()) {

@@ -1817,7 +1816,7 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
         return true;
       }
     }
-    // Cyclone B0 also supports ALU operations followed by CBZ/CBNZ.
+    // Fuse ALU operations followed by CBZ/CBNZ.
    if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
        SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
      switch (First->getOpcode()) {
@@ -34,7 +34,8 @@ def HasSPE           : Predicate<"Subtarget->hasSPE()">,
 
 def IsLE             : Predicate<"Subtarget->isLittleEndian()">;
 def IsBE             : Predicate<"!Subtarget->isLittleEndian()">;
-def IsCyclone        : Predicate<"Subtarget->isCyclone()">;
+def UseAlternateSExtLoadCVTF32
+    : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
 
 //===----------------------------------------------------------------------===//
 // AArch64-specific DAG Nodes.

@@ -4957,7 +4958,8 @@ class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
                             0),
                           dsub)),
                        0),
-                     ssub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+                     ssub)))>,
+          Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
 
 def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
                           (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;

@@ -5010,7 +5012,8 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
                             0),
                           dsub)),
                        0),
-                     dsub)))>, Requires<[NotForCodeSize, IsCyclone]>;
+                     dsub)))>,
+          Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
 
 def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
                            (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
@@ -160,10 +160,6 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Find and promote load instructions which read directly from store.
   bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
 
-  // Check if converting two narrow loads into a single wider load with
-  // bitfield extracts could be enabled.
-  bool enableNarrowLdMerge(MachineFunction &Fn);
-
   bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;

@@ -1912,15 +1908,6 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
   return Modified;
 }
 
-bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
-  bool ProfitableArch = Subtarget->isCortexA57() || Subtarget->isKryo();
-  // FIXME: The benefit from converting narrow loads into a wider load could be
-  // microarchitectural as it assumes that a single load with two bitfield
-  // extracts is cheaper than two narrow loads. Currently, this conversion is
-  // enabled only in cortex-a57 on which performance benefits were verified.
-  return ProfitableArch && !Subtarget->requiresStrictAlign();
-}
-
 bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   if (skipFunction(*Fn.getFunction()))
     return false;

@@ -1936,7 +1923,8 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   UsedRegs.resize(TRI->getNumRegs());
 
   bool Modified = false;
-  bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
+  bool enableNarrowLdOpt =
+      Subtarget->mergeNarrowLoads() && !Subtarget->requiresStrictAlign();
   for (auto &MBB : Fn)
     Modified |= optimizeBlock(MBB, enableNarrowLdOpt);
@@ -44,9 +44,36 @@ AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) {
     CPUString = "generic";
 
   ParseSubtargetFeatures(CPUString, FS);
+  initializeProperties();
 
   return *this;
 }
 
+void AArch64Subtarget::initializeProperties() {
+  // Initialize CPU specific properties. We should add a tablegen feature for
+  // this in the future so we can specify it together with the subtarget
+  // features.
+  switch (ARMProcFamily) {
+  case Cyclone:
+    CacheLineSize = 64;
+    PrefetchDistance = 280;
+    MinPrefetchStride = 2048;
+    MaxPrefetchIterationsAhead = 3;
+    break;
+  case CortexA57:
+    MaxInterleaveFactor = 4;
+    break;
+  case Kryo:
+    MaxInterleaveFactor = 4;
+    VectorInsertExtractBaseCost = 2;
+    break;
+  case Others: break;
+  case CortexA35: break;
+  case CortexA53: break;
+  case ExynosM1: break;
+  }
+}
+
 AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
                                    const std::string &FS,
                                    const TargetMachine &TM, bool LittleEndian)

@@ -110,8 +137,7 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
   // Enabling or Disabling the latency heuristic is a close call: It seems to
   // help nearly no benchmark on out-of-order architectures, on the other hand
   // it regresses register pressure on a few benchmarking.
-  if (isCyclone())
-    Policy.DisableLatencyHeuristic = true;
+  Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
 }
 
 bool AArch64Subtarget::enableEarlyIfConversion() const {

@@ -133,8 +159,5 @@ bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
 
 std::unique_ptr<PBQPRAConstraint>
 AArch64Subtarget::getCustomPBQPConstraints() const {
-  if (!isCortexA57())
-    return nullptr;
-
-  return llvm::make_unique<A57ChainingConstraint>();
+  return balanceFPOps() ? llvm::make_unique<A57ChainingConstraint>() : nullptr;
 }
@@ -33,8 +33,8 @@ class StringRef;
 class Triple;
 
 class AArch64Subtarget : public AArch64GenSubtargetInfo {
-protected:
-  enum ARMProcFamilyEnum {
+public:
+  enum ARMProcFamilyEnum : uint8_t {
     Others,
     CortexA35,
     CortexA53,

@@ -44,6 +44,7 @@ protected:
     Kryo
   };
 
+protected:
   /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
   ARMProcFamilyEnum ARMProcFamily = Others;
 

@@ -66,6 +67,24 @@ protected:
 
   // StrictAlign - Disallow unaligned memory accesses.
   bool StrictAlign = false;
+  bool MergeNarrowLoads = false;
+  bool UseAA = false;
+  bool PredictableSelectIsExpensive = false;
+  bool BalanceFPOps = false;
+  bool CustomAsCheapAsMove = false;
+  bool UsePostRAScheduler = false;
+  bool Misaligned128StoreIsSlow = false;
+  bool AvoidQuadLdStPairs = false;
+  bool UseAlternateSExtLoadCVTF32Pattern = false;
+  bool HasMacroOpFusion = false;
+  bool DisableLatencySchedHeuristic = false;
+  bool UseRSqrt = false;
+  uint8_t MaxInterleaveFactor = 2;
+  uint8_t VectorInsertExtractBaseCost = 3;
+  uint16_t CacheLineSize = 0;
+  uint16_t PrefetchDistance = 0;
+  uint16_t MinPrefetchStride = 1;
+  unsigned MaxPrefetchIterationsAhead = UINT_MAX;
 
   // ReserveX18 - X18 is not available as a general purpose register.
   bool ReserveX18;

@@ -93,6 +112,9 @@ private:
   /// subtarget initialization.
   AArch64Subtarget &initializeSubtargetDependencies(StringRef FS);
 
+  /// Initialize properties based on the selected processor family.
+  void initializeProperties();
+
 public:
   /// This constructor initializes the data members to match that
   /// of the specified triple.

@@ -123,7 +145,15 @@ public:
   const Triple &getTargetTriple() const { return TargetTriple; }
   bool enableMachineScheduler() const override { return true; }
   bool enablePostRAScheduler() const override {
-    return isGeneric() || isCortexA53() || isCortexA57() || isKryo();
+    return UsePostRAScheduler;
+  }
+
+  /// Returns ARM processor family.
+  /// Avoid this function! CPU specifics should be kept local to this class
+  /// and preferably modeled with SubtargetFeatures or properties in
+  /// initializeProperties().
+  ARMProcFamilyEnum getProcFamily() const {
+    return ARMProcFamily;
   }
 
   bool hasV8_1aOps() const { return HasV8_1aOps; }

@@ -140,6 +170,30 @@ public:
   bool hasNEON() const { return HasNEON; }
   bool hasCrypto() const { return HasCrypto; }
   bool hasCRC() const { return HasCRC; }
+  bool mergeNarrowLoads() const { return MergeNarrowLoads; }
+  bool balanceFPOps() const { return BalanceFPOps; }
+  bool predictableSelectIsExpensive() const {
+    return PredictableSelectIsExpensive;
+  }
+  bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
+  bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
+  bool avoidQuadLdStPairs() const { return AvoidQuadLdStPairs; }
+  bool useAlternateSExtLoadCVTF32Pattern() const {
+    return UseAlternateSExtLoadCVTF32Pattern;
+  }
+  bool hasMacroOpFusion() const { return HasMacroOpFusion; }
+  bool useRSqrt() const { return UseRSqrt; }
+  unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
+  unsigned getVectorInsertExtractBaseCost() const {
+    return VectorInsertExtractBaseCost;
+  }
+  unsigned getCacheLineSize() const { return CacheLineSize; }
+  unsigned getPrefetchDistance() const { return PrefetchDistance; }
+  unsigned getMinPrefetchStride() const { return MinPrefetchStride; }
+  unsigned getMaxPrefetchIterationsAhead() const {
+    return MaxPrefetchIterationsAhead;
+  }
 
   /// CPU has TBI (top byte of addresses is ignored during HW address
   /// translation) and OS enables it.
   bool supportsAddressTopByteIgnored() const;

@@ -160,14 +214,7 @@ public:
   bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
   bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
 
-  bool isGeneric() const { return CPUString == "generic"; }
-  bool isCyclone() const { return CPUString == "cyclone"; }
-  bool isCortexA57() const { return CPUString == "cortex-a57"; }
-  bool isCortexA53() const { return CPUString == "cortex-a53"; }
-  bool isExynosM1() const { return CPUString == "exynos-m1"; }
-  bool isKryo() const { return CPUString == "kryo"; }
-
-  bool useAA() const override { return isCortexA53(); }
+  bool useAA() const override { return UseAA; }
 
   /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
   /// that still makes it profitable to inline the call.
@@ -147,8 +147,7 @@ static void initReciprocals(AArch64TargetMachine& TM, AArch64Subtarget& ST)
   // (52 mantissa bits) are 2 and 3, respectively.
   unsigned ExtraStepsF = 2,
            ExtraStepsD = ExtraStepsF + 1;
-  // FIXME: Enable x^-1/2 only for Exynos M1 at the moment.
-  bool UseRsqrt = ST.isExynosM1();
+  bool UseRsqrt = ST.useRSqrt();
 
   TM.Options.Reciprocals.setDefaults("sqrtf", UseRsqrt, ExtraStepsF);
   TM.Options.Reciprocals.setDefaults("sqrtd", UseRsqrt, ExtraStepsD);
@@ -368,9 +368,7 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
   }
 
   // All other insert/extracts cost this much.
-  if (ST->isKryo())
-    return 2;
-  return 3;
+  return ST->getVectorInsertExtractBaseCost();
 }
 
 int AArch64TTIImpl::getArithmeticInstrCost(

@@ -529,9 +527,7 @@ int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
 }
 
 unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
-  if (ST->isCortexA57() || ST->isKryo())
-    return 4;
-  return 2;
+  return ST->getMaxInterleaveFactor();
 }
 
 void AArch64TTIImpl::getUnrollingPreferences(Loop *L,

@@ -630,28 +626,17 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
 }
 
 unsigned AArch64TTIImpl::getCacheLineSize() {
-  if (ST->isCyclone())
-    return 64;
-  return BaseT::getCacheLineSize();
+  return ST->getCacheLineSize();
 }
 
 unsigned AArch64TTIImpl::getPrefetchDistance() {
-  if (ST->isCyclone())
-    return 280;
-  return BaseT::getPrefetchDistance();
+  return ST->getPrefetchDistance();
 }
 
 unsigned AArch64TTIImpl::getMinPrefetchStride() {
-  if (ST->isCyclone())
-    // The HW prefetcher handles accesses with strides up to 2KB.
-    return 2048;
-  return BaseT::getMinPrefetchStride();
+  return ST->getMinPrefetchStride();
 }
 
 unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
-  if (ST->isCyclone())
-    // Be conservative for now and don't prefetch ahead too much since the loop
-    // may terminate early.
-    return 3;
-  return BaseT::getMaxPrefetchIterationsAhead();
+  return ST->getMaxPrefetchIterationsAhead();
 }
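As a closing illustration of the out-of-tree tweaking that the one-feature-per-line layout is meant to ease, the sketch below is hypothetical and not part of this commit: a downstream tree could describe its own CPU by combining the features defined above instead of adding new CPU-string checks. The "mycore" name and its feature mix are made up; CortexA53Model is reused the same way the patch does for cortex-a35.

// Hypothetical downstream addition to AArch64.td, shown only to illustrate
// the intended workflow. One feature per line, in alphabetical order, keeps
// merges against upstream conflict-free.
def ProcMyCore : SubtargetFeature<"mycore", "ARMProcFamily", "Others",
                                  "Example out-of-tree core", [
                                  FeatureCRC,
                                  FeatureCrypto,
                                  FeatureFPARMv8,
                                  FeatureNEON,
                                  FeaturePerfMon,
                                  FeaturePostRAScheduler,
                                  FeatureUseAA
                                  ]>;

def : ProcessorModel<"mycore", CortexA53Model, [ProcMyCore]>;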