mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-10-18 18:42:46 +02:00
[AArch64] Expand bcmp() for small block lengths
Patch D56593 by @courbet results in calls to `bcmp()` in some cases, should the target support it, unless `TTI::MemCmpExpansionOptions()` is overridden by the target. In a proprietary benchmark we see a performance drop of about 12% on PNG compression before this patch, though it passes all tests. This patch mirrors X86 for AArch64 and initializes `TTI::MemCmpExpansionOptions()` to then expand calls to `bcmp()` when appropriate. No tuning of the parameters was performed, but, at this point, it's enough to recover the performance drop above. This problem also exists on ARM. Once a consensus is reached for AArch64, we can work to fix ARM as well. Authors: - Evandro Menezes (@evandro) <e.menezes@samsung.com> - Brian Rzycki (@brzycki) <b.rzycki@samsung.com> Differential revision: https://reviews.llvm.org/D64805 llvm-svn: 367898
This commit is contained in:
parent
0492dd709d
commit
66a85dd4d4
@ -629,6 +629,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
|
||||
|
||||
MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;
|
||||
|
||||
MaxLoadsPerMemcmpOptSize = 4;
|
||||
MaxLoadsPerMemcmp = Subtarget->requiresStrictAlign()
|
||||
? MaxLoadsPerMemcmpOptSize : 8;
|
||||
|
||||
setStackPointerRegisterToSaveRestore(AArch64::SP);
|
||||
|
||||
setSchedulingPreference(Sched::Hybrid);
|
||||
|
@ -618,6 +618,19 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
|
||||
return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
|
||||
}
|
||||
|
||||
/// Build the memcmp()/bcmp() expansion options for AArch64.
///
/// \param OptSize   forwarded to TLI->getMaxExpandSizeMemcmp() to pick the
///                  load budget.
/// \param IsZeroCmp not consulted by this implementation.
/// \returns options that allow overlapping loads unless the subtarget requires
///          strict alignment, put the whole load budget into a single block,
///          and offer only the load sizes 8, 4, 2 and 1.
AArch64TTIImpl::TTI::MemCmpExpansionOptions
|
||||
AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
|
||||
TTI::MemCmpExpansionOptions Options;
|
||||
// Overlapping loads are only enabled when strict alignment is not required.
Options.AllowOverlappingLoads = !ST->requiresStrictAlign();
|
||||
Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
|
||||
// A single block may use the entire load budget.
Options.NumLoadsPerBlock = Options.MaxNumLoads;
|
||||
// TODO: Though vector loads usually perform well on AArch64, in some targets
|
||||
// they may wake up the FP unit, which raises the power consumption. Perhaps
|
||||
// they could be used with no holds barred (-O3).
|
||||
Options.LoadSizes = {8, 4, 2, 1};
|
||||
return Options;
|
||||
}
|
||||
|
||||
int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
|
||||
unsigned Alignment, unsigned AddressSpace,
|
||||
const Instruction *I) {
|
||||
|
@ -130,6 +130,9 @@ public:
|
||||
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
|
||||
const Instruction *I = nullptr);
|
||||
|
||||
/// Returns the options controlling inline expansion of memcmp()/bcmp() calls.
/// \p OptSize selects the load budget; \p IsZeroCmp is accepted but the
/// AArch64 implementation does not consult it.
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
|
||||
bool IsZeroCmp) const;
|
||||
|
||||
int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
|
||||
unsigned AddressSpace, const Instruction *I = nullptr);
|
||||
|
||||
|
44
test/CodeGen/AArch64/bcmp-inline-small.ll
Normal file
44
test/CodeGen/AArch64/bcmp-inline-small.ll
Normal file
@ -0,0 +1,44 @@
|
||||
; RUN: llc -O2 < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECKN
|
||||
; RUN: llc -O2 < %s -mtriple=aarch64-linux-gnu -mattr=strict-align | FileCheck %s --check-prefixes=CHECK,CHECKS
|
||||
|
||||
declare i32 @bcmp(i8*, i8*, i64) nounwind readonly
|
||||
declare i32 @memcmp(i8*, i8*, i64) nounwind readonly
|
||||
|
||||
; Equality test on a 15-byte bcmp() call. Both runs expect no call to bcmp to
; be emitted. The run without strict-align expects two 8-byte loads per
; operand, the second pair being unscaled (ldur); the strict-align run expects
; the 8/4/2/1 sequence of loads instead.
define i1 @bcmp_b2(i8* %s1, i8* %s2) {
|
||||
entry:
|
||||
%bcmp = call i32 @bcmp(i8* %s1, i8* %s2, i64 15)
|
||||
%ret = icmp eq i32 %bcmp, 0
|
||||
ret i1 %ret
|
||||
|
||||
; CHECK-LABEL: bcmp_b2:
|
||||
; CHECK-NOT: bl bcmp
|
||||
; CHECKN: ldr x
|
||||
; CHECKN-NEXT: ldr x
|
||||
; CHECKN-NEXT: ldur x
|
||||
; CHECKN-NEXT: ldur x
|
||||
; CHECKS: ldr x
|
||||
; CHECKS-NEXT: ldr x
|
||||
; CHECKS-NEXT: ldr w
|
||||
; CHECKS-NEXT: ldr w
|
||||
; CHECKS-NEXT: ldrh w
|
||||
; CHECKS-NEXT: ldrh w
|
||||
; CHECKS-NEXT: ldrb w
|
||||
; CHECKS-NEXT: ldrb w
|
||||
}
|
||||
|
||||
; Equality test on a 31-byte comparison in an optsize function. The run without
; strict-align expects the call to be expanded inline (ldp/ldr/ldur loads); the
; strict-align run expects the library call to remain.
; NOTE(review): despite the bcmp_ prefix in its name, this function calls
; @memcmp rather than @bcmp - confirm that is intended.
; NOTE(review): the strict-align fallback is presumably due to the optsize load
; budget being too small for 31 bytes without overlapping loads - verify.
define i1 @bcmp_bs(i8* %s1, i8* %s2) optsize {
|
||||
entry:
|
||||
%memcmp = call i32 @memcmp(i8* %s1, i8* %s2, i64 31)
|
||||
%ret = icmp eq i32 %memcmp, 0
|
||||
ret i1 %ret
|
||||
|
||||
; CHECK-LABEL: bcmp_bs:
|
||||
; CHECKN-NOT: bl memcmp
|
||||
; CHECKN: ldp x
|
||||
; CHECKN-NEXT: ldp x
|
||||
; CHECKN-NEXT: ldr x
|
||||
; CHECKN-NEXT: ldr x
|
||||
; CHECKN-NEXT: ldur x
|
||||
; CHECKN-NEXT: ldur x
|
||||
; CHECKS: bl memcmp
|
||||
}
|
Loading…
Reference in New Issue
Block a user