commit 5b2aa25659 (parent 7b7fe56333)

[ExpandMemCmp] Move all options to TargetTransformInfo.

Split off from D60318.

llvm-svn: 364281
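In short: memcmp-expansion tuning used to be split between a pointer returned by TTI (the load sizes) and separate TargetLowering hooks (the load-count limits); after this commit a single by-value MemCmpExpansionOptions carries everything. A sketch of the change from a caller's point of view, with the surrounding pass code elided (operator bool() on the struct tests MaxNumLoads > 0, per the first hunk below):

    // Before: pointer result; limits fetched separately from TargetLowering.
    //   const auto *Options = TTI->enableMemCmpExpansion(IsZeroCmp);
    //   if (!Options) return false;  // nullptr meant "expansion disabled"
    //   unsigned MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);

    // After: one query; the returned value carries all the knobs.
    TargetTransformInfo::MemCmpExpansionOptions Options =
        TTI->enableMemCmpExpansion(F.hasOptSize(), IsZeroCmp);
    if (!Options)
      return false;  // MaxNumLoads == 0 means expansion is disabled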
@@ -630,17 +630,35 @@ public:
   /// Don't restrict interleaved unrolling to small loops.
   bool enableAggressiveInterleaving(bool LoopHasReductions) const;
 
-  /// If not nullptr, enable inline expansion of memcmp. IsZeroCmp is
-  /// true if this is the expansion of memcmp(p1, p2, s) == 0.
+  /// Returns options for expansion of memcmp. IsZeroCmp is
+  // true if this is the expansion of memcmp(p1, p2, s) == 0.
   struct MemCmpExpansionOptions {
+    // Return true if memcmp expansion is enabled.
+    operator bool() const { return MaxNumLoads > 0; }
+
+    // Maximum number of load operations.
+    unsigned MaxNumLoads = 0;
+
     // The list of available load sizes (in bytes), sorted in decreasing order.
     SmallVector<unsigned, 8> LoadSizes;
+
+    // For memcmp expansion when the memcmp result is only compared equal or
+    // not-equal to 0, allow up to this number of load pairs per block. As an
+    // example, this may allow 'memcmp(a, b, 3) == 0' in a single block:
+    //  a0 = load2bytes &a[0]
+    //  b0 = load2bytes &b[0]
+    //  a2 = load1byte &a[2]
+    //  b2 = load1byte &b[2]
+    //  r = cmp eq (a0 ^ b0 | a2 ^ b2), 0
+    unsigned NumLoadsPerBlock = 1;
+
     // Set to true to allow overlapping loads. For example, 7-byte compares can
     // be done with two 4-byte compares instead of 4+2+1-byte compares. This
     // requires all loads in LoadSizes to be doable in an unaligned way.
     bool AllowOverlappingLoads = false;
   };
-  const MemCmpExpansionOptions *enableMemCmpExpansion(bool IsZeroCmp) const;
+  MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
+                                               bool IsZeroCmp) const;
 
   /// Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;
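With the struct above, a target expresses its whole memcmp-expansion policy in one TTI hook. A minimal sketch for a hypothetical target (MyTTIImpl and the chosen numbers are illustrative, not from this patch; the in-tree targets below pull MaxNumLoads from TLI->getMaxExpandSizeMemcmp):

    TTI::MemCmpExpansionOptions
    MyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
      TTI::MemCmpExpansionOptions Options;
      Options.MaxNumLoads = OptSize ? 4 : 8; // nonzero => expansion enabled
      Options.LoadSizes = {8, 4, 2, 1};      // decreasing order, as documented
      if (IsZeroCmp)
        Options.NumLoadsPerBlock = 2;        // only used for memcmp(...) == 0
      return Options;
    }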
@@ -1162,8 +1180,8 @@ public:
                                      unsigned VF) = 0;
   virtual bool supportsEfficientVectorElementLoadStore() = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
-  virtual const MemCmpExpansionOptions *enableMemCmpExpansion(
-      bool IsZeroCmp) const = 0;
+  virtual MemCmpExpansionOptions
+  enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
   virtual bool enableMaskedInterleavedAccessVectorization() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
@@ -1464,9 +1482,9 @@ public:
   bool enableAggressiveInterleaving(bool LoopHasReductions) override {
     return Impl.enableAggressiveInterleaving(LoopHasReductions);
   }
-  const MemCmpExpansionOptions *enableMemCmpExpansion(
-      bool IsZeroCmp) const override {
-    return Impl.enableMemCmpExpansion(IsZeroCmp);
+  MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
+                                               bool IsZeroCmp) const override {
+    return Impl.enableMemCmpExpansion(OptSize, IsZeroCmp);
   }
   bool enableInterleavedAccessVectorization() override {
     return Impl.enableInterleavedAccessVectorization();
@@ -296,9 +296,9 @@ public:
 
   bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
 
-  const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
-      bool IsZeroCmp) const {
-    return nullptr;
+  TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
+                                                    bool IsZeroCmp) const {
+    return {};
   }
 
   bool enableInterleavedAccessVectorization() { return false; }
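The default implementation's `return {};` leans on the new in-class initializers: value-initialization leaves MaxNumLoads == 0, so the result converts to false and expansion stays disabled for any target that does not override the hook. A standalone sketch of just that mechanism (mirrors the struct's shape; not the actual header):

    #include <cassert>

    struct MemCmpExpansionOptions {        // shape mirrors the TTI struct
      operator bool() const { return MaxNumLoads > 0; }
      unsigned MaxNumLoads = 0;
    };

    int main() {
      MemCmpExpansionOptions Defaulted{};  // what the base TTIImpl returns
      assert(!Defaulted);                  // expansion disabled by default
      Defaulted.MaxNumLoads = 8;
      assert(Defaulted);                   // any nonzero budget enables it
    }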
@@ -1385,18 +1385,6 @@ public:
     return OptSize ? MaxLoadsPerMemcmpOptSize : MaxLoadsPerMemcmp;
   }
 
-  /// For memcmp expansion when the memcmp result is only compared equal or
-  /// not-equal to 0, allow up to this number of load pairs per block. As an
-  /// example, this may allow 'memcmp(a, b, 3) == 0' in a single block:
-  ///  a0 = load2bytes &a[0]
-  ///  b0 = load2bytes &b[0]
-  ///  a2 = load1byte &a[2]
-  ///  b2 = load1byte &b[2]
-  ///  r = cmp eq (a0 ^ b0 | a2 ^ b2), 0
-  virtual unsigned getMemcmpEqZeroLoadsPerBlock() const {
-    return 1;
-  }
-
   /// Get maximum # of store operations permitted for llvm.memmove
   ///
   /// This function returns the maximum number of store operations permitted
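The comment deleted from TargetLowering here (it now lives on MemCmpExpansionOptions::NumLoadsPerBlock, see the first hunk) describes the single-block shape. Written out as portable C++ rather than the comment's pseudo-IR (a sketch, not code from the patch):

    #include <cstdint>
    #include <cstring>

    // 'memcmp(a, b, 3) == 0' with two load pairs in one block: XOR exposes
    // differing bits, OR merges both pairs, one compare against zero.
    bool memcmp3_eq(const uint8_t *a, const uint8_t *b) {
      uint16_t a0, b0;
      std::memcpy(&a0, a, 2);              // a0 = load2bytes &a[0]
      std::memcpy(&b0, b, 2);              // b0 = load2bytes &b[0]
      const uint8_t a2 = a[2], b2 = b[2];  // 1-byte loads of the tails
      return ((a0 ^ b0) | (a2 ^ b2)) == 0; // r = cmp eq (a0^b0 | a2^b2), 0
    }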
@@ -374,9 +374,9 @@ bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) c
   return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
 }
 
-const TargetTransformInfo::MemCmpExpansionOptions *
-TargetTransformInfo::enableMemCmpExpansion(bool IsZeroCmp) const {
-  return TTIImpl->enableMemCmpExpansion(IsZeroCmp);
+TargetTransformInfo::MemCmpExpansionOptions
+TargetTransformInfo::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
+  return TTIImpl->enableMemCmpExpansion(OptSize, IsZeroCmp);
 }
 
 bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
@@ -113,8 +113,7 @@ class MemCmpExpansion {
 public:
   MemCmpExpansion(CallInst *CI, uint64_t Size,
                   const TargetTransformInfo::MemCmpExpansionOptions &Options,
-                  unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
-                  unsigned MaxLoadsPerBlockForZeroCmp, const DataLayout &TheDataLayout);
+                  const bool IsUsedForZeroCmp, const DataLayout &TheDataLayout);
 
   unsigned getNumBlocks();
   uint64_t getNumLoads() const { return LoadSequence.size(); }
@@ -203,16 +202,10 @@ MemCmpExpansion::computeOverlappingLoadSequence(uint64_t Size,
 MemCmpExpansion::MemCmpExpansion(
     CallInst *const CI, uint64_t Size,
     const TargetTransformInfo::MemCmpExpansionOptions &Options,
-    const unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
-    const unsigned MaxLoadsPerBlockForZeroCmp, const DataLayout &TheDataLayout)
-    : CI(CI),
-      Size(Size),
-      MaxLoadSize(0),
-      NumLoadsNonOneByte(0),
-      NumLoadsPerBlockForZeroCmp(MaxLoadsPerBlockForZeroCmp),
-      IsUsedForZeroCmp(IsUsedForZeroCmp),
-      DL(TheDataLayout),
-      Builder(CI) {
+    const bool IsUsedForZeroCmp, const DataLayout &TheDataLayout)
+    : CI(CI), Size(Size), MaxLoadSize(0), NumLoadsNonOneByte(0),
+      NumLoadsPerBlockForZeroCmp(Options.NumLoadsPerBlock),
+      IsUsedForZeroCmp(IsUsedForZeroCmp), DL(TheDataLayout), Builder(CI) {
   assert(Size > 0 && "zero blocks");
   // Scale the max size down if the target can load more bytes than we need.
   llvm::ArrayRef<unsigned> LoadSizes(Options.LoadSizes);
@@ -223,17 +216,17 @@ MemCmpExpansion::MemCmpExpansion(
   MaxLoadSize = LoadSizes.front();
   // Compute the decomposition.
   unsigned GreedyNumLoadsNonOneByte = 0;
-  LoadSequence = computeGreedyLoadSequence(Size, LoadSizes, MaxNumLoads,
+  LoadSequence = computeGreedyLoadSequence(Size, LoadSizes, Options.MaxNumLoads,
                                            GreedyNumLoadsNonOneByte);
   NumLoadsNonOneByte = GreedyNumLoadsNonOneByte;
-  assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");
+  assert(LoadSequence.size() <= Options.MaxNumLoads && "broken invariant");
   // If we allow overlapping loads and the load sequence is not already optimal,
   // use overlapping loads.
   if (Options.AllowOverlappingLoads &&
       (LoadSequence.empty() || LoadSequence.size() > 2)) {
     unsigned OverlappingNumLoadsNonOneByte = 0;
     auto OverlappingLoads = computeOverlappingLoadSequence(
-        Size, MaxLoadSize, MaxNumLoads, OverlappingNumLoadsNonOneByte);
+        Size, MaxLoadSize, Options.MaxNumLoads, OverlappingNumLoadsNonOneByte);
     if (!OverlappingLoads.empty() &&
         (LoadSequence.empty() ||
          OverlappingLoads.size() < LoadSequence.size())) {
@@ -241,7 +234,7 @@ MemCmpExpansion::MemCmpExpansion(
       NumLoadsNonOneByte = OverlappingNumLoadsNonOneByte;
     }
   }
-  assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");
+  assert(LoadSequence.size() <= Options.MaxNumLoads && "broken invariant");
 }
 
 unsigned MemCmpExpansion::getNumBlocks() {
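To make the AllowOverlappingLoads / Options.MaxNumLoads interplay in the constructor concrete: for Size = 7 with MaxLoadSize = 4, the greedy decomposition needs three loads per buffer (4+2+1), while overlapping loads cover it with two 4-byte loads at offsets 0 and 3, re-comparing one byte. A standalone sketch of that count (the patch's computeOverlappingLoadSequence does this plus the bookkeeping):

    #include <cstdint>
    #include <iostream>

    // Loads of width MaxLoadSize covering Size bytes when overlap is allowed:
    // ceil(Size / MaxLoadSize), the last load shifted back to stay in bounds.
    uint64_t numOverlappingLoads(uint64_t Size, uint64_t MaxLoadSize) {
      return (Size + MaxLoadSize - 1) / MaxLoadSize;
    }

    int main() {
      std::cout << numOverlappingLoads(7, 4) << "\n"; // 2, vs. 3 for 4+2+1
    }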
@@ -748,23 +741,21 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
   // TTI call to check if target would like to expand memcmp. Also, get the
   // available load sizes.
   const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
-  const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp);
+  auto Options = TTI->enableMemCmpExpansion(CI->getFunction()->hasOptSize(),
+                                            IsUsedForZeroCmp);
   if (!Options) return false;
 
-  const unsigned MaxNumLoads = CI->getFunction()->hasOptSize()
-                                   ? (MaxLoadsPerMemcmpOptSize.getNumOccurrences()
-                                          ? MaxLoadsPerMemcmpOptSize
-                                          : TLI->getMaxExpandSizeMemcmp(true))
-                                   : (MaxLoadsPerMemcmp.getNumOccurrences()
-                                          ? MaxLoadsPerMemcmp
-                                          : TLI->getMaxExpandSizeMemcmp(false));
+  if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences())
+    Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock;
 
-  unsigned NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences()
-                                  ? MemCmpEqZeroNumLoadsPerBlock
-                                  : TLI->getMemcmpEqZeroLoadsPerBlock();
+  if (CI->getFunction()->hasOptSize() &&
+      MaxLoadsPerMemcmpOptSize.getNumOccurrences())
+    Options.MaxNumLoads = MaxLoadsPerMemcmpOptSize;
 
-  MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads,
-                            IsUsedForZeroCmp, NumLoadsPerBlock, *DL);
+  if (!CI->getFunction()->hasOptSize() && MaxLoadsPerMemcmp.getNumOccurrences())
+    Options.MaxNumLoads = MaxLoadsPerMemcmp;
+
+  MemCmpExpansion Expansion(CI, SizeVal, Options, IsUsedForZeroCmp, *DL);
 
   // Don't expand if this will require more loads than desired by the target.
   if (Expansion.getNumLoads() == 0) {
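Note the inverted defaulting in expandMemCmp above: the target's Options are now the baseline, and a command-line flag only overrides a field when the user actually passed it (getNumOccurrences() != 0), because the cl::init value would otherwise silently clobber the target's choice. The pattern in isolation (the real cl::opt declarations live at the top of the pass; the flag string here is an assumption for illustration):

    #include "llvm/Analysis/TargetTransformInfo.h"
    #include "llvm/Support/CommandLine.h"
    using namespace llvm;

    // Only honored when explicitly passed; cl::init(1) is never forced onto
    // targets that picked a different NumLoadsPerBlock.
    static cl::opt<unsigned> MemCmpEqZeroNumLoadsPerBlock(
        "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
        cl::desc("Loads per block for memcmp()==0 expansion (override)"));

    static void applyFlagOverrides(
        TargetTransformInfo::MemCmpExpansionOptions &Options) {
      if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences())
        Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock;
    }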
@@ -582,17 +582,12 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
   return LoopHasReductions;
 }
 
-const PPCTTIImpl::TTI::MemCmpExpansionOptions *
-PPCTTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
-  static const auto Options = []() {
-    TTI::MemCmpExpansionOptions Options;
-    Options.LoadSizes.push_back(8);
-    Options.LoadSizes.push_back(4);
-    Options.LoadSizes.push_back(2);
-    Options.LoadSizes.push_back(1);
-    return Options;
-  }();
-  return &Options;
+PPCTTIImpl::TTI::MemCmpExpansionOptions
+PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
+  TTI::MemCmpExpansionOptions Options;
+  Options.LoadSizes = {8, 4, 2, 1};
+  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
+  return Options;
 }
 
 bool PPCTTIImpl::enableInterleavedAccessVectorization() {
@@ -66,8 +66,8 @@ public:
   /// @{
   bool useColdCCForColdCall(Function &F);
   bool enableAggressiveInterleaving(bool LoopHasReductions);
-  const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
-      bool IsZeroCmp) const;
+  TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
+                                                    bool IsZeroCmp) const;
   bool enableInterleavedAccessVectorization();
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector) const;
@@ -879,11 +879,6 @@ namespace llvm {
     /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
     MVT hasFastEqualityCompare(unsigned NumBits) const override;
 
-    /// Allow multiple load pairs per block for smaller and faster code.
-    unsigned getMemcmpEqZeroLoadsPerBlock() const override {
-      return 2;
-    }
-
    /// Return the value type to use for ISD::SETCC.
    EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                           EVT VT) const override;
@@ -3291,38 +3291,29 @@ bool X86TTIImpl::areFunctionArgsABICompatible(
          TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs();
 }
 
-const X86TTIImpl::TTI::MemCmpExpansionOptions *
-X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
-  // Only enable vector loads for equality comparison.
-  // Right now the vector version is not as fast, see #33329.
-  static const auto ThreeWayOptions = [this]() {
-    TTI::MemCmpExpansionOptions Options;
-    if (ST->is64Bit()) {
-      Options.LoadSizes.push_back(8);
-    }
-    Options.LoadSizes.push_back(4);
-    Options.LoadSizes.push_back(2);
-    Options.LoadSizes.push_back(1);
-    return Options;
-  }();
-  static const auto EqZeroOptions = [this]() {
-    TTI::MemCmpExpansionOptions Options;
+X86TTIImpl::TTI::MemCmpExpansionOptions
+X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
+  TTI::MemCmpExpansionOptions Options;
+  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
+  Options.NumLoadsPerBlock = 2;
+  if (IsZeroCmp) {
+    // Only enable vector loads for equality comparison. Right now the vector
+    // version is not as fast for three way compare (see #33329).
    // TODO: enable AVX512 when the DAG is ready.
    // if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
    if (ST->hasAVX2()) Options.LoadSizes.push_back(32);
    if (ST->hasSSE2()) Options.LoadSizes.push_back(16);
-    if (ST->is64Bit()) {
-      Options.LoadSizes.push_back(8);
-    }
-    Options.LoadSizes.push_back(4);
-    Options.LoadSizes.push_back(2);
-    Options.LoadSizes.push_back(1);
    // All GPR and vector loads can be unaligned. SIMD compare requires integer
    // vectors (SSE2/AVX2).
    Options.AllowOverlappingLoads = true;
-    return Options;
-  }();
-  return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions;
+  }
+  if (ST->is64Bit()) {
+    Options.LoadSizes.push_back(8);
+  }
+  Options.LoadSizes.push_back(4);
+  Options.LoadSizes.push_back(2);
+  Options.LoadSizes.push_back(1);
+  return Options;
 }
 
 bool X86TTIImpl::enableInterleavedAccessVectorization() {
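What the X86 eq-zero settings buy in practice: with SSE2, memcmp(a, b, 16) == 0 becomes one 16-byte load per side plus a byte-wise compare and a movemask test, and AllowOverlappingLoads lets odd sizes like 7 use two overlapping GPR loads. The 16-byte case as an intrinsics sketch (illustrative only; the pass emits the equivalent IR, not intrinsics):

    #include <emmintrin.h> // SSE2

    // Shape of the expanded 'memcmp(a, b, 16) == 0' on an SSE2 target.
    bool memcmp16_eq(const void *a, const void *b) {
      const __m128i va = _mm_loadu_si128(static_cast<const __m128i *>(a));
      const __m128i vb = _mm_loadu_si128(static_cast<const __m128i *>(b));
      const __m128i eq = _mm_cmpeq_epi8(va, vb); // PCMPEQB
      return _mm_movemask_epi8(eq) == 0xFFFF;    // PMOVMSKB: all 16 bytes equal
    }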
@@ -199,8 +199,8 @@ public:
   bool areFunctionArgsABICompatible(const Function *Caller,
                                     const Function *Callee,
                                     SmallPtrSetImpl<Argument *> &Args) const;
-  const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
-      bool IsZeroCmp) const;
+  TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
+                                                    bool IsZeroCmp) const;
   bool enableInterleavedAccessVectorization();
 private:
   int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
@@ -866,7 +866,7 @@ static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
 
   // We only try merging comparisons if the target wants to expand memcmp later.
   // The rationale is to avoid turning small chains into memcmp calls.
-  if (!TTI.enableMemCmpExpansion(true))
+  if (!TTI.enableMemCmpExpansion(F.hasOptSize(), true))
    return false;
 
   // If we don't have memcmp avaiable we can't emit calls to it.
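Because the returned options convert to bool, MergeICmps' early exit keeps its shape; only the OptSize argument is new. The gate, reduced to a helper for clarity (a sketch; the pass tests the call result inline, as shown above):

    // True iff the target enables memcmp()==0 expansion for this function;
    // the temporary options object is used only for its bool conversion.
    static bool targetWantsMemCmpExpansion(const llvm::TargetTransformInfo &TTI,
                                           const llvm::Function &F) {
      return static_cast<bool>(
          TTI.enableMemCmpExpansion(F.hasOptSize(), /*IsZeroCmp=*/true));
    }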