
AArch64: Do not test for CPUs, use SubtargetFeatures

Testing for specific CPUs has a number of problems; it is better to use
subtarget features:
- When a tweak is added for a specific CPU, it is often desirable for
  the next version of that CPU as well, yet it is easy to forget to add it.
- It is hard to keep track of checks scattered around the target code;
  declaring all target specifics together with the CPU in the tablegen
  file is a clearer representation.
- Subtarget features can be tweaked from the command line (see the sketch
  below).
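
To make the intent concrete, here is a minimal self-contained sketch (toy
code, not part of this patch; ToySubtarget and its members are invented for
illustration). Callers query the property they need instead of enumerating
CPU names, and the property is just a bit that per-CPU defaults, or an
-mattr style override, can flip. The real feature names added in AArch64.td
below (e.g. balance-fp-ops, use-aa) can be toggled the same way via
llc -mattr=+.../-....

    #include <iostream>
    #include <string>

    // Toy model of the idea (illustration only, not LLVM code): expose a
    // semantic feature bit instead of forcing callers to compare CPU names.
    struct ToySubtarget {
      std::string CPU;
      bool BalanceFPOps = false; // per-CPU default, overridable by a flag

      // Old style: every caller lists CPUs and silently misses newer ones.
      bool isCortexA53() const { return CPU == "cortex-a53"; }
      bool isCortexA57() const { return CPU == "cortex-a57"; }

      // New style: callers ask for the property they actually care about.
      bool balanceFPOps() const { return BalanceFPOps; }
    };

    int main() {
      // Hypothetical newer core that should inherit the tweak.
      ToySubtarget ST{"cortex-a72", /*BalanceFPOps=*/true};
      std::cout << "CPU-name check: "
                << (ST.isCortexA53() || ST.isCortexA57()) << "\n"; // prints 0
      std::cout << "feature check:  " << ST.balanceFPOps() << "\n"; // prints 1
    }

The patch itself follows the same shape, e.g. the A57 FP load balancing pass
now tests balanceFPOps() instead of isCortexA53()/isCortexA57().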

To discourage people from using CPU checks in the future, I removed the
isCortexXX(), isCyclone(), ... functions. I added a getProcFamily()
function for exceptional circumstances, but made it clear in the comment
that its use is discouraged.
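
As a sketch of what that escape hatch looks like (again toy code, not the
real classes; the in-tree use is the ExynosM1 comparison in the
AArch64InstrInfo.cpp hunk further down):

    #include <cstdint>

    // Toy mirror of the discouraged escape hatch (illustration only): an
    // explicit processor-family enum for the rare CPU-specific quirk that
    // has no sensible subtarget feature yet.
    enum class ProcFamily : std::uint8_t {
      Others, CortexA35, CortexA53, CortexA57, Cyclone, ExynosM1, Kryo
    };

    struct ToySubtarget {
      ProcFamily Family = ProcFamily::Others;
      // Avoid where possible; prefer a SubtargetFeature or a property set
      // in something like initializeProperties().
      ProcFamily getProcFamily() const { return Family; }
    };

    // Hypothetical quirk that really is specific to one micro-architecture.
    bool hasShiftedOperandQuirk(const ToySubtarget &ST) {
      return ST.getProcFamily() == ProcFamily::ExynosM1;
    }

    int main() {
      ToySubtarget ST{ProcFamily::ExynosM1};
      return hasShiftedOperandQuirk(ST) ? 0 : 1; // exits 0 on the quirky core
    }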

Reformat the feature list in AArch64.td to one feature per line, in
alphabetical order, to simplify merging and sorting for out-of-tree
tweaks.

No functional change intended.

Differential Revision: http://reviews.llvm.org/D20762

llvm-svn: 271555
Matthias Braun 2016-06-02 18:03:53 +00:00
parent 647e745fb4
commit 5a2d283ab8
10 changed files with 223 additions and 114 deletions

lib/Target/AArch64/AArch64.td

@@ -58,6 +58,50 @@ def FeatureReserveX18 : SubtargetFeature<"reserve-x18", "ReserveX18", "true",
"Reserve X18, making it unavailable "
"as a GPR">;
def FeatureMergeNarrowLd : SubtargetFeature<"merge-narrow-ld",
"MergeNarrowLoads", "true",
"Merge narrow load instructions">;
def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
"Use alias analysis during codegen">;
def FeatureBalanceFPOps : SubtargetFeature<"balance-fp-ops", "BalanceFPOps",
"true",
"balance mix of odd and even D-registers for fp multiply(-accumulate) ops">;
def FeaturePredictableSelectIsExpensive : SubtargetFeature<
"predictable-select-expensive", "PredictableSelectIsExpensive", "true",
"Prefer likely predicted branches over selects">;
def FeatureCustomCheapAsMoveHandling : SubtargetFeature<"custom-cheap-as-move",
"CustomAsCheapAsMove", "true",
"Use custom code for TargetInstrInfo::isAsCheapAsAMove()">;
def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
"UsePostRAScheduler", "true", "Schedule again after register allocation">;
def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store",
"Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">;
def FeatureAvoidQuadLdStPairs : SubtargetFeature<"no-quad-ldst-pairs",
"AvoidQuadLdStPairs", "true",
"Do not form quad load/store pair operations">;
def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
"alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern",
"true", "Use alternative pattern for sextload convert to f32">;
def FeatureMacroOpFusion : SubtargetFeature<
"macroop-fusion", "HasMacroOpFusion", "true",
"CPU supports macro op fusion">;
def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
"disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
"Disable latency scheduling heuristic">;
def FeatureUseRSqrt : SubtargetFeature<
"use-reverse-square-root", "UseRSqrt", "true", "Use reverse square root">;
//===----------------------------------------------------------------------===//
// Architectures.
//
@@ -94,57 +138,87 @@ include "AArch64SchedM1.td"
include "AArch64SchedKryo.td"
def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
"Cortex-A35 ARM processors",
[FeatureFPARMv8,
FeatureNEON,
FeatureCrypto,
"Cortex-A35 ARM processors", [
FeatureCRC,
FeaturePerfMon]>;
FeatureCrypto,
FeatureFPARMv8,
FeatureNEON,
FeaturePerfMon
]>;
def ProcA53 : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
"Cortex-A53 ARM processors",
[FeatureFPARMv8,
FeatureNEON,
FeatureCrypto,
"Cortex-A53 ARM processors", [
FeatureBalanceFPOps,
FeatureCRC,
FeaturePerfMon]>;
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
FeatureUseAA
]>;
def ProcA57 : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
"Cortex-A57 ARM processors",
[FeatureFPARMv8,
FeatureNEON,
FeatureCrypto,
"Cortex-A57 ARM processors", [
FeatureBalanceFPOps,
FeatureCRC,
FeaturePerfMon]>;
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
FeatureMergeNarrowLd,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive
]>;
def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
"Cyclone",
[FeatureFPARMv8,
FeatureNEON,
"Cyclone", [
FeatureAlternateSExtLoadCVTF32Pattern,
FeatureCrypto,
FeatureDisableLatencySchedHeuristic,
FeatureFPARMv8,
FeatureMacroOpFusion,
FeatureNEON,
FeaturePerfMon,
FeatureZCRegMove, FeatureZCZeroing]>;
FeatureSlowMisaligned128Store,
FeatureZCRegMove,
FeatureZCZeroing
]>;
def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
"Samsung Exynos-M1 processors",
[FeatureFPARMv8,
FeatureNEON,
FeatureCrypto,
"Samsung Exynos-M1 processors", [
FeatureAvoidQuadLdStPairs,
FeatureCRC,
FeaturePerfMon]>;
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
FeatureNEON,
FeaturePerfMon,
FeatureUseRSqrt
]>;
def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
"Qualcomm Kryo processors",
[FeatureFPARMv8,
FeatureNEON,
FeatureCrypto,
"Qualcomm Kryo processors", [
FeatureCRC,
FeaturePerfMon]>;
FeatureCrypto,
FeatureCustomCheapAsMoveHandling,
FeatureFPARMv8,
FeatureMergeNarrowLd,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive
]>;
def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
FeatureNEON,
FeatureCRC,
FeaturePerfMon]>;
def : ProcessorModel<"generic", NoSchedModel, [
FeatureCRC,
FeatureFPARMv8,
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler
]>;
// FIXME: Cortex-A35 is currently modelled as a Cortex-A53
def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;

lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp

@@ -314,9 +314,7 @@ bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) {
if (skipFunction(*F.getFunction()))
return false;
// Don't do anything if this isn't an A53 or A57.
if (!(F.getSubtarget<AArch64Subtarget>().isCortexA53() ||
F.getSubtarget<AArch64Subtarget>().isCortexA57()))
if (!F.getSubtarget<AArch64Subtarget>().balanceFPOps())
return false;
bool Changed = false;

lib/Target/AArch64/AArch64ISelLowering.cpp

@@ -634,9 +634,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
}
// Prefer likely predicted branches to selects on out-of-order cores.
if (Subtarget->isCortexA57() || Subtarget->isKryo())
PredictableSelectIsExpensive = true;
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
}
void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
@@ -814,12 +812,9 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
if (Subtarget->requiresStrictAlign())
return false;
// FIXME: This is mostly true for Cyclone, but not necessarily others.
if (Fast) {
// FIXME: Define an attribute for slow unaligned accesses instead of
// relying on the CPU type as a proxy.
// On Cyclone, unaligned 128-bit stores are slow.
*Fast = !Subtarget->isCyclone() || VT.getStoreSize() != 16 ||
// Some CPUs are fine with unaligned stores except for 128-bit ones.
*Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
// See comments in performSTORECombine() for more details about
// these conditions.
@@ -8792,9 +8787,7 @@ static SDValue split16BStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
// be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
// a call to that function here.
// Cyclone has bad performance on unaligned 16B stores when crossing line and
// page boundaries. We want to split such stores.
if (!Subtarget->isCyclone())
if (!Subtarget->isMisaligned128StoreSlow())
return SDValue();
// Don't split at -Oz.

lib/Target/AArch64/AArch64InstrInfo.cpp

@@ -544,8 +544,7 @@ static bool canBeExpandedToORR(const MachineInstr *MI, unsigned BitSize) {
// FIXME: this implementation should be micro-architecture dependent, so a
// micro-architecture target hook should be introduced here in future.
bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
if (!Subtarget.isCortexA57() && !Subtarget.isCortexA53() &&
!Subtarget.isExynosM1() && !Subtarget.isKryo())
if (!Subtarget.hasCustomCheapAsMoveHandling())
return MI->isAsCheapAsAMove();
unsigned Imm;
@@ -559,7 +558,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
case AArch64::ADDXri:
case AArch64::SUBWri:
case AArch64::SUBXri:
return (Subtarget.isExynosM1() ||
return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 ||
MI->getOperand(3).getImm() == 0);
// add/sub on register with shift
@@ -568,7 +567,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
case AArch64::SUBWrs:
case AArch64::SUBXrs:
Imm = MI->getOperand(3).getImm();
return (Subtarget.isExynosM1() &&
return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
AArch64_AM::getArithShiftValue(Imm) < 4);
// logical ops on immediate
@@ -609,7 +608,7 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr *MI) const {
case AArch64::ORRWrs:
case AArch64::ORRXrs:
Imm = MI->getOperand(3).getImm();
return (Subtarget.isExynosM1() &&
return (Subtarget.getProcFamily() == AArch64Subtarget::ExynosM1 &&
AArch64_AM::getShiftValue(Imm) < 4 &&
AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL);
@@ -1522,8 +1521,8 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr *MI) const {
if (isLdStPairSuppressed(MI))
return false;
// Do not pair quad ld/st for Exynos.
if (Subtarget.isExynosM1()) {
// On some CPUs quad load/store pairs are slower than two single load/stores.
if (Subtarget.avoidQuadLdStPairs()) {
switch (MI->getOpcode()) {
default:
break;
@@ -1801,8 +1800,8 @@ bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr *FirstLdSt,
bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
MachineInstr *Second) const {
if (Subtarget.isCyclone()) {
// Cyclone can fuse CMN, CMP, TST followed by Bcc.
if (Subtarget.hasMacroOpFusion()) {
// Fuse CMN, CMP, TST followed by Bcc.
unsigned SecondOpcode = Second->getOpcode();
if (SecondOpcode == AArch64::Bcc) {
switch (First->getOpcode()) {
@@ -1817,7 +1816,7 @@ bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
return true;
}
}
// Cyclone B0 also supports ALU operations followed by CBZ/CBNZ.
// Fuse ALU operations followed by CBZ/CBNZ.
if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
switch (First->getOpcode()) {

lib/Target/AArch64/AArch64InstrInfo.td

@@ -34,7 +34,8 @@ def HasSPE : Predicate<"Subtarget->hasSPE()">,
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
def IsCyclone : Predicate<"Subtarget->isCyclone()">;
def UseAlternateSExtLoadCVTF32
: Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
//===----------------------------------------------------------------------===//
// AArch64-specific DAG Nodes.
@@ -4957,7 +4958,8 @@ class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
0),
dsub)),
0),
ssub)))>, Requires<[NotForCodeSize, IsCyclone]>;
ssub)))>,
Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
(LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
@@ -5010,7 +5012,8 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
0),
dsub)),
0),
dsub)))>, Requires<[NotForCodeSize, IsCyclone]>;
dsub)))>,
Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;

lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

@@ -160,10 +160,6 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Find and promote load instructions which read directly from store.
bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
// Check if converting two narrow loads into a single wider load with
// bitfield extracts could be enabled.
bool enableNarrowLdMerge(MachineFunction &Fn);
bool optimizeBlock(MachineBasicBlock &MBB, bool enableNarrowLdOpt);
bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -1912,15 +1908,6 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
return Modified;
}
bool AArch64LoadStoreOpt::enableNarrowLdMerge(MachineFunction &Fn) {
bool ProfitableArch = Subtarget->isCortexA57() || Subtarget->isKryo();
// FIXME: The benefit from converting narrow loads into a wider load could be
// microarchitectural as it assumes that a single load with two bitfield
// extracts is cheaper than two narrow loads. Currently, this conversion is
// enabled only in cortex-a57 on which performance benefits were verified.
return ProfitableArch && !Subtarget->requiresStrictAlign();
}
bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
if (skipFunction(*Fn.getFunction()))
return false;
@@ -1936,7 +1923,8 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
UsedRegs.resize(TRI->getNumRegs());
bool Modified = false;
bool enableNarrowLdOpt = enableNarrowLdMerge(Fn);
bool enableNarrowLdOpt =
Subtarget->mergeNarrowLoads() && !Subtarget->requiresStrictAlign();
for (auto &MBB : Fn)
Modified |= optimizeBlock(MBB, enableNarrowLdOpt);

lib/Target/AArch64/AArch64Subtarget.cpp

@@ -44,9 +44,36 @@ AArch64Subtarget::initializeSubtargetDependencies(StringRef FS) {
CPUString = "generic";
ParseSubtargetFeatures(CPUString, FS);
initializeProperties();
return *this;
}
void AArch64Subtarget::initializeProperties() {
// Initialize CPU specific properties. We should add a tablegen feature for
// this in the future so we can specify it together with the subtarget
// features.
switch (ARMProcFamily) {
case Cyclone:
CacheLineSize = 64;
PrefetchDistance = 280;
MinPrefetchStride = 2048;
MaxPrefetchIterationsAhead = 3;
break;
case CortexA57:
MaxInterleaveFactor = 4;
break;
case Kryo:
MaxInterleaveFactor = 4;
VectorInsertExtractBaseCost = 2;
break;
case Others: break;
case CortexA35: break;
case CortexA53: break;
case ExynosM1: break;
}
}
AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
const std::string &FS,
const TargetMachine &TM, bool LittleEndian)
@@ -110,8 +137,7 @@ void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
// Enabling or Disabling the latency heuristic is a close call: It seems to
// help nearly no benchmark on out-of-order architectures, on the other hand
// it regresses register pressure on a few benchmarking.
if (isCyclone())
Policy.DisableLatencyHeuristic = true;
Policy.DisableLatencyHeuristic = DisableLatencySchedHeuristic;
}
bool AArch64Subtarget::enableEarlyIfConversion() const {
@@ -133,8 +159,5 @@ bool AArch64Subtarget::supportsAddressTopByteIgnored() const {
std::unique_ptr<PBQPRAConstraint>
AArch64Subtarget::getCustomPBQPConstraints() const {
if (!isCortexA57())
return nullptr;
return llvm::make_unique<A57ChainingConstraint>();
return balanceFPOps() ? llvm::make_unique<A57ChainingConstraint>() : nullptr;
}

lib/Target/AArch64/AArch64Subtarget.h

@@ -33,8 +33,8 @@ class StringRef;
class Triple;
class AArch64Subtarget : public AArch64GenSubtargetInfo {
protected:
enum ARMProcFamilyEnum {
public:
enum ARMProcFamilyEnum : uint8_t {
Others,
CortexA35,
CortexA53,
@@ -44,6 +44,7 @@ protected:
Kryo
};
protected:
/// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
ARMProcFamilyEnum ARMProcFamily = Others;
@@ -66,6 +67,24 @@ protected:
// StrictAlign - Disallow unaligned memory accesses.
bool StrictAlign = false;
bool MergeNarrowLoads = false;
bool UseAA = false;
bool PredictableSelectIsExpensive = false;
bool BalanceFPOps = false;
bool CustomAsCheapAsMove = false;
bool UsePostRAScheduler = false;
bool Misaligned128StoreIsSlow = false;
bool AvoidQuadLdStPairs = false;
bool UseAlternateSExtLoadCVTF32Pattern = false;
bool HasMacroOpFusion = false;
bool DisableLatencySchedHeuristic = false;
bool UseRSqrt = false;
uint8_t MaxInterleaveFactor = 2;
uint8_t VectorInsertExtractBaseCost = 3;
uint16_t CacheLineSize = 0;
uint16_t PrefetchDistance = 0;
uint16_t MinPrefetchStride = 1;
unsigned MaxPrefetchIterationsAhead = UINT_MAX;
// ReserveX18 - X18 is not available as a general purpose register.
bool ReserveX18;
@@ -93,6 +112,9 @@ private:
/// subtarget initialization.
AArch64Subtarget &initializeSubtargetDependencies(StringRef FS);
/// Initialize properties based on the selected processor family.
void initializeProperties();
public:
/// This constructor initializes the data members to match that
/// of the specified triple.
@@ -123,7 +145,15 @@ public:
const Triple &getTargetTriple() const { return TargetTriple; }
bool enableMachineScheduler() const override { return true; }
bool enablePostRAScheduler() const override {
return isGeneric() || isCortexA53() || isCortexA57() || isKryo();
return UsePostRAScheduler;
}
/// Returns ARM processor family.
/// Avoid this function! CPU specifics should be kept local to this class
/// and preferably modeled with SubtargetFeatures or properties in
/// initializeProperties().
ARMProcFamilyEnum getProcFamily() const {
return ARMProcFamily;
}
bool hasV8_1aOps() const { return HasV8_1aOps; }
@@ -140,6 +170,30 @@ public:
bool hasNEON() const { return HasNEON; }
bool hasCrypto() const { return HasCrypto; }
bool hasCRC() const { return HasCRC; }
bool mergeNarrowLoads() const { return MergeNarrowLoads; }
bool balanceFPOps() const { return BalanceFPOps; }
bool predictableSelectIsExpensive() const {
return PredictableSelectIsExpensive;
}
bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
bool avoidQuadLdStPairs() const { return AvoidQuadLdStPairs; }
bool useAlternateSExtLoadCVTF32Pattern() const {
return UseAlternateSExtLoadCVTF32Pattern;
}
bool hasMacroOpFusion() const { return HasMacroOpFusion; }
bool useRSqrt() const { return UseRSqrt; }
unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
unsigned getVectorInsertExtractBaseCost() const {
return VectorInsertExtractBaseCost;
}
unsigned getCacheLineSize() const { return CacheLineSize; }
unsigned getPrefetchDistance() const { return PrefetchDistance; }
unsigned getMinPrefetchStride() const { return MinPrefetchStride; }
unsigned getMaxPrefetchIterationsAhead() const {
return MaxPrefetchIterationsAhead;
}
/// CPU has TBI (top byte of addresses is ignored during HW address
/// translation) and OS enables it.
bool supportsAddressTopByteIgnored() const;
@@ -160,14 +214,7 @@ public:
bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
bool isGeneric() const { return CPUString == "generic"; }
bool isCyclone() const { return CPUString == "cyclone"; }
bool isCortexA57() const { return CPUString == "cortex-a57"; }
bool isCortexA53() const { return CPUString == "cortex-a53"; }
bool isExynosM1() const { return CPUString == "exynos-m1"; }
bool isKryo() const { return CPUString == "kryo"; }
bool useAA() const override { return isCortexA53(); }
bool useAA() const override { return UseAA; }
/// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
/// that still makes it profitable to inline the call.

lib/Target/AArch64/AArch64TargetMachine.cpp

@@ -147,8 +147,7 @@ static void initReciprocals(AArch64TargetMachine& TM, AArch64Subtarget& ST)
// (52 mantissa bits) are 2 and 3, respectively.
unsigned ExtraStepsF = 2,
ExtraStepsD = ExtraStepsF + 1;
// FIXME: Enable x^-1/2 only for Exynos M1 at the moment.
bool UseRsqrt = ST.isExynosM1();
bool UseRsqrt = ST.useRSqrt();
TM.Options.Reciprocals.setDefaults("sqrtf", UseRsqrt, ExtraStepsF);
TM.Options.Reciprocals.setDefaults("sqrtd", UseRsqrt, ExtraStepsD);

lib/Target/AArch64/AArch64TargetTransformInfo.cpp

@@ -368,9 +368,7 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
}
// All other insert/extracts cost this much.
if (ST->isKryo())
return 2;
return 3;
return ST->getVectorInsertExtractBaseCost();
}
int AArch64TTIImpl::getArithmeticInstrCost(
@@ -529,9 +527,7 @@ int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
}
unsigned AArch64TTIImpl::getMaxInterleaveFactor(unsigned VF) {
if (ST->isCortexA57() || ST->isKryo())
return 4;
return 2;
return ST->getMaxInterleaveFactor();
}
void AArch64TTIImpl::getUnrollingPreferences(Loop *L,
@@ -630,28 +626,17 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
}
unsigned AArch64TTIImpl::getCacheLineSize() {
if (ST->isCyclone())
return 64;
return BaseT::getCacheLineSize();
return ST->getCacheLineSize();
}
unsigned AArch64TTIImpl::getPrefetchDistance() {
if (ST->isCyclone())
return 280;
return BaseT::getPrefetchDistance();
return ST->getPrefetchDistance();
}
unsigned AArch64TTIImpl::getMinPrefetchStride() {
if (ST->isCyclone())
// The HW prefetcher handles accesses with strides up to 2KB.
return 2048;
return BaseT::getMinPrefetchStride();
return ST->getMinPrefetchStride();
}
unsigned AArch64TTIImpl::getMaxPrefetchIterationsAhead() {
if (ST->isCyclone())
// Be conservative for now and don't prefetch ahead too much since the loop
// may terminate early.
return 3;
return BaseT::getMaxPrefetchIterationsAhead();
return ST->getMaxPrefetchIterationsAhead();
}