1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-22 18:54:02 +01:00

[ExpandMemCmp] Move all options to TargetTransformInfo.

Split off from D60318.

llvm-svn: 364281
This commit is contained in:
Clement Courbet 2019-06-25 08:04:13 +00:00
parent 7b7fe56333
commit 5b2aa25659
11 changed files with 79 additions and 101 deletions

View File

@ -630,17 +630,35 @@ public:
/// Don't restrict interleaved unrolling to small loops.
bool enableAggressiveInterleaving(bool LoopHasReductions) const;
/// If not nullptr, enable inline expansion of memcmp. IsZeroCmp is
/// true if this is the expansion of memcmp(p1, p2, s) == 0.
/// Returns options for expansion of memcmp. IsZeroCmp is
// true if this is the expansion of memcmp(p1, p2, s) == 0.
struct MemCmpExpansionOptions {
// Return true if memcmp expansion is enabled.
operator bool() const { return MaxNumLoads > 0; }
// Maximum number of load operations.
unsigned MaxNumLoads = 0;
// The list of available load sizes (in bytes), sorted in decreasing order.
SmallVector<unsigned, 8> LoadSizes;
// For memcmp expansion when the memcmp result is only compared equal or
// not-equal to 0, allow up to this number of load pairs per block. As an
// example, this may allow 'memcmp(a, b, 3) == 0' in a single block:
// a0 = load2bytes &a[0]
// b0 = load2bytes &b[0]
// a2 = load1byte &a[2]
// b2 = load1byte &b[2]
// r = cmp eq (a0 ^ b0 | a2 ^ b2), 0
unsigned NumLoadsPerBlock = 1;
// Set to true to allow overlapping loads. For example, 7-byte compares can
// be done with two 4-byte compares instead of 4+2+1-byte compares. This
// requires all loads in LoadSizes to be doable in an unaligned way.
bool AllowOverlappingLoads = false;
};
const MemCmpExpansionOptions *enableMemCmpExpansion(bool IsZeroCmp) const;
MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
/// Enable matching of interleaved access groups.
bool enableInterleavedAccessVectorization() const;
@ -1162,8 +1180,8 @@ public:
unsigned VF) = 0;
virtual bool supportsEfficientVectorElementLoadStore() = 0;
virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
virtual const MemCmpExpansionOptions *enableMemCmpExpansion(
bool IsZeroCmp) const = 0;
virtual MemCmpExpansionOptions
enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const = 0;
virtual bool enableInterleavedAccessVectorization() = 0;
virtual bool enableMaskedInterleavedAccessVectorization() = 0;
virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
@ -1464,9 +1482,9 @@ public:
bool enableAggressiveInterleaving(bool LoopHasReductions) override {
return Impl.enableAggressiveInterleaving(LoopHasReductions);
}
const MemCmpExpansionOptions *enableMemCmpExpansion(
bool IsZeroCmp) const override {
return Impl.enableMemCmpExpansion(IsZeroCmp);
MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const override {
return Impl.enableMemCmpExpansion(OptSize, IsZeroCmp);
}
bool enableInterleavedAccessVectorization() override {
return Impl.enableInterleavedAccessVectorization();

View File

@ -296,9 +296,9 @@ public:
bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
bool IsZeroCmp) const {
return nullptr;
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const {
return {};
}
bool enableInterleavedAccessVectorization() { return false; }

View File

@ -1385,18 +1385,6 @@ public:
return OptSize ? MaxLoadsPerMemcmpOptSize : MaxLoadsPerMemcmp;
}
/// For memcmp expansion when the memcmp result is only compared equal or
/// not-equal to 0, allow up to this number of load pairs per block. As an
/// example, this may allow 'memcmp(a, b, 3) == 0' in a single block:
/// a0 = load2bytes &a[0]
/// b0 = load2bytes &b[0]
/// a2 = load1byte &a[2]
/// b2 = load1byte &b[2]
/// r = cmp eq (a0 ^ b0 | a2 ^ b2), 0
virtual unsigned getMemcmpEqZeroLoadsPerBlock() const {
return 1;
}
/// Get maximum # of store operations permitted for llvm.memmove
///
/// This function returns the maximum number of store operations permitted

View File

@ -374,9 +374,9 @@ bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) c
return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
}
const TargetTransformInfo::MemCmpExpansionOptions *
TargetTransformInfo::enableMemCmpExpansion(bool IsZeroCmp) const {
return TTIImpl->enableMemCmpExpansion(IsZeroCmp);
TargetTransformInfo::MemCmpExpansionOptions
TargetTransformInfo::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
return TTIImpl->enableMemCmpExpansion(OptSize, IsZeroCmp);
}
bool TargetTransformInfo::enableInterleavedAccessVectorization() const {

View File

@ -113,8 +113,7 @@ class MemCmpExpansion {
public:
MemCmpExpansion(CallInst *CI, uint64_t Size,
const TargetTransformInfo::MemCmpExpansionOptions &Options,
unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
unsigned MaxLoadsPerBlockForZeroCmp, const DataLayout &TheDataLayout);
const bool IsUsedForZeroCmp, const DataLayout &TheDataLayout);
unsigned getNumBlocks();
uint64_t getNumLoads() const { return LoadSequence.size(); }
@ -203,16 +202,10 @@ MemCmpExpansion::computeOverlappingLoadSequence(uint64_t Size,
MemCmpExpansion::MemCmpExpansion(
CallInst *const CI, uint64_t Size,
const TargetTransformInfo::MemCmpExpansionOptions &Options,
const unsigned MaxNumLoads, const bool IsUsedForZeroCmp,
const unsigned MaxLoadsPerBlockForZeroCmp, const DataLayout &TheDataLayout)
: CI(CI),
Size(Size),
MaxLoadSize(0),
NumLoadsNonOneByte(0),
NumLoadsPerBlockForZeroCmp(MaxLoadsPerBlockForZeroCmp),
IsUsedForZeroCmp(IsUsedForZeroCmp),
DL(TheDataLayout),
Builder(CI) {
const bool IsUsedForZeroCmp, const DataLayout &TheDataLayout)
: CI(CI), Size(Size), MaxLoadSize(0), NumLoadsNonOneByte(0),
NumLoadsPerBlockForZeroCmp(Options.NumLoadsPerBlock),
IsUsedForZeroCmp(IsUsedForZeroCmp), DL(TheDataLayout), Builder(CI) {
assert(Size > 0 && "zero blocks");
// Scale the max size down if the target can load more bytes than we need.
llvm::ArrayRef<unsigned> LoadSizes(Options.LoadSizes);
@ -223,17 +216,17 @@ MemCmpExpansion::MemCmpExpansion(
MaxLoadSize = LoadSizes.front();
// Compute the decomposition.
unsigned GreedyNumLoadsNonOneByte = 0;
LoadSequence = computeGreedyLoadSequence(Size, LoadSizes, MaxNumLoads,
LoadSequence = computeGreedyLoadSequence(Size, LoadSizes, Options.MaxNumLoads,
GreedyNumLoadsNonOneByte);
NumLoadsNonOneByte = GreedyNumLoadsNonOneByte;
assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");
assert(LoadSequence.size() <= Options.MaxNumLoads && "broken invariant");
// If we allow overlapping loads and the load sequence is not already optimal,
// use overlapping loads.
if (Options.AllowOverlappingLoads &&
(LoadSequence.empty() || LoadSequence.size() > 2)) {
unsigned OverlappingNumLoadsNonOneByte = 0;
auto OverlappingLoads = computeOverlappingLoadSequence(
Size, MaxLoadSize, MaxNumLoads, OverlappingNumLoadsNonOneByte);
Size, MaxLoadSize, Options.MaxNumLoads, OverlappingNumLoadsNonOneByte);
if (!OverlappingLoads.empty() &&
(LoadSequence.empty() ||
OverlappingLoads.size() < LoadSequence.size())) {
@ -241,7 +234,7 @@ MemCmpExpansion::MemCmpExpansion(
NumLoadsNonOneByte = OverlappingNumLoadsNonOneByte;
}
}
assert(LoadSequence.size() <= MaxNumLoads && "broken invariant");
assert(LoadSequence.size() <= Options.MaxNumLoads && "broken invariant");
}
unsigned MemCmpExpansion::getNumBlocks() {
@ -748,23 +741,21 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
// TTI call to check if target would like to expand memcmp. Also, get the
// available load sizes.
const bool IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
const auto *const Options = TTI->enableMemCmpExpansion(IsUsedForZeroCmp);
auto Options = TTI->enableMemCmpExpansion(CI->getFunction()->hasOptSize(),
IsUsedForZeroCmp);
if (!Options) return false;
const unsigned MaxNumLoads = CI->getFunction()->hasOptSize()
? (MaxLoadsPerMemcmpOptSize.getNumOccurrences()
? MaxLoadsPerMemcmpOptSize
: TLI->getMaxExpandSizeMemcmp(true))
: (MaxLoadsPerMemcmp.getNumOccurrences()
? MaxLoadsPerMemcmp
: TLI->getMaxExpandSizeMemcmp(false));
if (MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences())
Options.NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock;
unsigned NumLoadsPerBlock = MemCmpEqZeroNumLoadsPerBlock.getNumOccurrences()
? MemCmpEqZeroNumLoadsPerBlock
: TLI->getMemcmpEqZeroLoadsPerBlock();
if (CI->getFunction()->hasOptSize() &&
MaxLoadsPerMemcmpOptSize.getNumOccurrences())
Options.MaxNumLoads = MaxLoadsPerMemcmpOptSize;
MemCmpExpansion Expansion(CI, SizeVal, *Options, MaxNumLoads,
IsUsedForZeroCmp, NumLoadsPerBlock, *DL);
if (!CI->getFunction()->hasOptSize() && MaxLoadsPerMemcmp.getNumOccurrences())
Options.MaxNumLoads = MaxLoadsPerMemcmp;
MemCmpExpansion Expansion(CI, SizeVal, Options, IsUsedForZeroCmp, *DL);
// Don't expand if this will require more loads than desired by the target.
if (Expansion.getNumLoads() == 0) {

View File

@ -582,17 +582,12 @@ bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
return LoopHasReductions;
}
const PPCTTIImpl::TTI::MemCmpExpansionOptions *
PPCTTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
static const auto Options = []() {
TTI::MemCmpExpansionOptions Options;
Options.LoadSizes.push_back(8);
Options.LoadSizes.push_back(4);
Options.LoadSizes.push_back(2);
Options.LoadSizes.push_back(1);
return Options;
}();
return &Options;
PPCTTIImpl::TTI::MemCmpExpansionOptions
PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
TTI::MemCmpExpansionOptions Options;
Options.LoadSizes = {8, 4, 2, 1};
Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
return Options;
}
bool PPCTTIImpl::enableInterleavedAccessVectorization() {

View File

@ -66,8 +66,8 @@ public:
/// @{
bool useColdCCForColdCall(Function &F);
bool enableAggressiveInterleaving(bool LoopHasReductions);
const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
bool IsZeroCmp) const;
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
bool enableInterleavedAccessVectorization();
unsigned getNumberOfRegisters(bool Vector);
unsigned getRegisterBitWidth(bool Vector) const;

View File

@ -879,11 +879,6 @@ namespace llvm {
/// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
MVT hasFastEqualityCompare(unsigned NumBits) const override;
/// Allow multiple load pairs per block for smaller and faster code.
unsigned getMemcmpEqZeroLoadsPerBlock() const override {
return 2;
}
/// Return the value type to use for ISD::SETCC.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;

View File

@ -3291,38 +3291,29 @@ bool X86TTIImpl::areFunctionArgsABICompatible(
TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs();
}
const X86TTIImpl::TTI::MemCmpExpansionOptions *
X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
// Only enable vector loads for equality comparison.
// Right now the vector version is not as fast, see #33329.
static const auto ThreeWayOptions = [this]() {
TTI::MemCmpExpansionOptions Options;
if (ST->is64Bit()) {
Options.LoadSizes.push_back(8);
}
Options.LoadSizes.push_back(4);
Options.LoadSizes.push_back(2);
Options.LoadSizes.push_back(1);
return Options;
}();
static const auto EqZeroOptions = [this]() {
TTI::MemCmpExpansionOptions Options;
X86TTIImpl::TTI::MemCmpExpansionOptions
X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
TTI::MemCmpExpansionOptions Options;
Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
Options.NumLoadsPerBlock = 2;
if (IsZeroCmp) {
// Only enable vector loads for equality comparison. Right now the vector
// version is not as fast for three way compare (see #33329).
// TODO: enable AVX512 when the DAG is ready.
// if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
if (ST->hasAVX2()) Options.LoadSizes.push_back(32);
if (ST->hasSSE2()) Options.LoadSizes.push_back(16);
if (ST->is64Bit()) {
Options.LoadSizes.push_back(8);
}
Options.LoadSizes.push_back(4);
Options.LoadSizes.push_back(2);
Options.LoadSizes.push_back(1);
// All GPR and vector loads can be unaligned. SIMD compare requires integer
// vectors (SSE2/AVX2).
Options.AllowOverlappingLoads = true;
return Options;
}();
return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions;
}
if (ST->is64Bit()) {
Options.LoadSizes.push_back(8);
}
Options.LoadSizes.push_back(4);
Options.LoadSizes.push_back(2);
Options.LoadSizes.push_back(1);
return Options;
}
bool X86TTIImpl::enableInterleavedAccessVectorization() {

View File

@ -199,8 +199,8 @@ public:
bool areFunctionArgsABICompatible(const Function *Caller,
const Function *Callee,
SmallPtrSetImpl<Argument *> &Args) const;
const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
bool IsZeroCmp) const;
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
bool enableInterleavedAccessVectorization();
private:
int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,

View File

@ -866,7 +866,7 @@ static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
// We only try merging comparisons if the target wants to expand memcmp later.
// The rationale is to avoid turning small chains into memcmp calls.
if (!TTI.enableMemCmpExpansion(true))
if (!TTI.enableMemCmpExpansion(F.hasOptSize(), true))
return false;
// If we don't have memcmp avaiable we can't emit calls to it.