mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 03:02:36 +01:00
[ARM] Don't expand sdiv when optimising for minsize
Don't expand SDIV with an immediate that is a power of 2 if we optimise for minimum code size. For example: sdiv i32 %1, 4 gets expanded to a sequence of 3 instructions, but this is suboptimal for minimum code size, so instead we just generate a MOV and a SDIV if integer division is supported. Differential Revision: https://reviews.llvm.org/D54546 llvm-svn: 347965
This commit is contained in:
parent
ff5a1e1583
commit
532a78148a
@ -7794,6 +7794,50 @@ SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
|
||||
return LowerCallTo(CLI).first;
|
||||
}
|
||||
|
||||
// This is a code size optimisation: return the original SDIV node to
|
||||
// DAGCombiner when we don't want to expand SDIV into a sequence of
|
||||
// instructions, and an empty node otherwise which will cause the
|
||||
// SDIV to be expanded in DAGCombine.
|
||||
SDValue
|
||||
ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
|
||||
SelectionDAG &DAG,
|
||||
SmallVectorImpl<SDNode *> &Created) const {
|
||||
// TODO: Support SREM
|
||||
if (N->getOpcode() != ISD::SDIV)
|
||||
return SDValue();
|
||||
|
||||
const auto &ST = static_cast<const ARMSubtarget&>(DAG.getSubtarget());
|
||||
const auto &MF = DAG.getMachineFunction();
|
||||
const bool MinSize = MF.getFunction().optForMinSize();
|
||||
const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
|
||||
: ST.hasDivideInARMMode();
|
||||
|
||||
// Don't touch vector types; rewriting this may lead to scalarizing
|
||||
// the int divs.
|
||||
if (N->getOperand(0).getValueType().isVector())
|
||||
return SDValue();
|
||||
|
||||
// Bail if MinSize is not set, and also for both ARM and Thumb mode we need
|
||||
// hwdiv support for this to be really profitable.
|
||||
if (!(MinSize && HasDivide))
|
||||
return SDValue();
|
||||
|
||||
// ARM mode is a bit simpler than Thumb: we can handle large power
|
||||
// of 2 immediates with 1 mov instruction; no further checks required,
|
||||
// just return the sdiv node.
|
||||
if (!ST.isThumb())
|
||||
return SDValue(N, 0);
|
||||
|
||||
// In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
|
||||
// and thus lose the code size benefits of a MOVS that requires only 2.
|
||||
// TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
|
||||
// but as it's doing exactly this, it's not worth the trouble to get TTI.
|
||||
if (Divisor.sgt(128))
|
||||
return SDValue();
|
||||
|
||||
return SDValue(N, 0);
|
||||
}
|
||||
|
||||
SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
|
||||
bool Signed) const {
|
||||
assert(Op.getValueType() == MVT::i32 &&
|
||||
|
@ -694,6 +694,9 @@ class VectorType;
|
||||
unsigned getRegisterByName(const char* RegName, EVT VT,
|
||||
SelectionDAG &DAG) const override;
|
||||
|
||||
/// BuildSDIVPow2 - Code size hook for an SDIV by a power of 2. Returns the
/// original SDIV node when it should not be expanded into a sequence of
/// instructions, and an empty SDValue otherwise so that DAGCombiner expands
/// it.
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
|
||||
SmallVectorImpl<SDNode *> &Created) const override;
|
||||
|
||||
/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
|
||||
/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
|
||||
/// expanded to FMAs when this method returns true, otherwise fmuladd is
|
||||
|
79
test/CodeGen/ARM/sdiv-pow2-arm-size.ll
Normal file
79
test/CodeGen/ARM/sdiv-pow2-arm-size.ll
Normal file
@ -0,0 +1,79 @@
|
||||
; RUN: llc -mtriple=armv7a -mattr=+hwdiv-arm %s -o - | FileCheck %s --check-prefixes=CHECK,DIV
|
||||
; RUN: llc -mtriple=armv7a -mattr=-hwdiv-arm %s -o - | FileCheck %s --check-prefixes=CHECK,NODIV
|
||||
|
||||
; Check SREM, which is not supported yet, so it is still expanded even with minsize.
|
||||
define dso_local i32 @test_rem(i32 %F) local_unnamed_addr #0 {
|
||||
; CHECK-LABEL: test_rem
|
||||
; CHECK: asr r1, r0, #31
|
||||
; CHECK-NEXT: add r1, r0, r1, lsr #30
|
||||
; CHECK-NEXT: bic r1, r1, #3
|
||||
; CHECK-NEXT: sub r0, r0, r1
|
||||
|
||||
entry:
|
||||
%div = srem i32 %F, 4
|
||||
ret i32 %div
|
||||
}
|
||||
|
||||
; Try an i16 sdiv, with a small immediate.
|
||||
define dso_local signext i16 @f0(i16 signext %F) local_unnamed_addr #0 {
|
||||
; CHECK-LABEL: f0
|
||||
|
||||
; DIV: mov r1, #2
|
||||
; DIV-NEXT: sdiv r0, r0, r1
|
||||
; DIV-NEXT: sxth r0, r0
|
||||
; DIV-NEXT: bx lr
|
||||
|
||||
; NODIV: uxth r1, r0
|
||||
; NODIV-NEXT: add r0, r0, r1, lsr #15
|
||||
; NODIV-NEXT: sxth r0, r0
|
||||
; NODIV-NEXT: asr r0, r0, #1
|
||||
; NODIV-NEXT: bx lr
|
||||
|
||||
entry:
|
||||
%0 = sdiv i16 %F, 2
|
||||
ret i16 %0
|
||||
}
|
||||
|
||||
; Try an i32 sdiv, with a small immediate.
|
||||
define dso_local i32 @f1(i32 %F) local_unnamed_addr #0 {
|
||||
; CHECK-LABEL: f1
|
||||
|
||||
; DIV: mov r1, #4
|
||||
; DIV-NEXT: sdiv r0, r0, r1
|
||||
; DIV-NEXT: bx lr
|
||||
|
||||
; NODIV: asr r1, r0, #31
|
||||
; NODIV-NEXT: add r0, r0, r1, lsr #30
|
||||
; NODIV-NEXT: asr r0, r0, #2
|
||||
; NODIV-NEXT: bx lr
|
||||
|
||||
entry:
|
||||
%div = sdiv i32 %F, 4
|
||||
ret i32 %div
|
||||
}
|
||||
|
||||
; Try a large power of 2 immediate, which should also be materialised with 1
|
||||
; move immediate instruction.
|
||||
define dso_local i32 @f2(i32 %F) local_unnamed_addr #0 {
|
||||
; CHECK-LABEL: f2
|
||||
; DIV: mov r1, #131072
|
||||
; DIV-NEXT: sdiv r0, r0, r1
|
||||
; DIV-NEXT: bx lr
|
||||
entry:
|
||||
%div = sdiv i32 %F, 131072
|
||||
ret i32 %div
|
||||
}
|
||||
|
||||
; MinSize not set, so should expand to the faster but longer sequence.
|
||||
define dso_local i32 @f3(i32 %F) {
|
||||
; CHECK-LABEL: f3
|
||||
; CHECK: asr r1, r0, #31
|
||||
; CHECK-NEXT: add r0, r0, r1, lsr #30
|
||||
; CHECK-NEXT: asr r0, r0, #2
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%div = sdiv i32 %F, 4
|
||||
ret i32 %div
|
||||
}
|
||||
|
||||
attributes #0 = { minsize norecurse nounwind optsize readnone }
|
105
test/CodeGen/ARM/sdiv-pow2-thumb-size.ll
Normal file
105
test/CodeGen/ARM/sdiv-pow2-thumb-size.ll
Normal file
@ -0,0 +1,105 @@
|
||||
; RUN: llc -mtriple=thumbv8 %s -o - | FileCheck %s --check-prefixes=CHECK,T2
|
||||
; RUN: llc -mtriple=thumbv8m.main %s -o - | FileCheck %s --check-prefixes=CHECK,T2
|
||||
; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s --check-prefixes=CHECK,T1
|
||||
; RUN: llc -mtriple=thumbv7em %s -o - | FileCheck %s --check-prefixes=CHECK,T2
|
||||
; RUN: llc -mtriple=thumbv6m %s -o - | FileCheck %s --check-prefixes=V6M
|
||||
|
||||
; Armv6m targets don't have a sdiv instruction, so sdiv should not appear at
|
||||
; all in the output:
|
||||
|
||||
; V6M: .file {{.*}}
|
||||
; V6M-NOT: sdiv
|
||||
; V6M-NOT: idiv
|
||||
|
||||
; Test sdiv i16
|
||||
define dso_local signext i16 @f0(i16 signext %F) local_unnamed_addr #0 {
|
||||
; CHECK-LABEL: f0
|
||||
; CHECK: movs r1, #2
|
||||
; CHECK-NEXT: sdiv r0, r0, r1
|
||||
; CHECK-NEXT: sxth r0, r0
|
||||
; CHECK-NEXT: bx lr
|
||||
|
||||
entry:
|
||||
%0 = sdiv i16 %F, 2
|
||||
ret i16 %0
|
||||
}
|
||||
|
||||
; Same as above, but now with i32
|
||||
define dso_local i32 @f1(i32 %F) local_unnamed_addr #0 {
|
||||
; CHECK-LABEL: f1
|
||||
; CHECK: movs r1, #4
|
||||
; CHECK-NEXT: sdiv r0, r0, r1
|
||||
; CHECK-NEXT: bx lr
|
||||
|
||||
entry:
|
||||
%div = sdiv i32 %F, 4
|
||||
ret i32 %div
|
||||
}
|
||||
|
||||
; The immediate is not a power of 2, so we expect a sdiv.
|
||||
define dso_local i32 @f2(i32 %F) local_unnamed_addr #0 {
|
||||
; CHECK-LABEL: f2
|
||||
; CHECK: movs r1, #5
|
||||
; CHECK-NEXT: sdiv r0, r0, r1
|
||||
; CHECK-NEXT: bx lr
|
||||
|
||||
entry:
|
||||
%div = sdiv i32 %F, 5
|
||||
ret i32 %div
|
||||
}
|
||||
|
||||
; Try a larger power of 2 immediate: immediates larger than
|
||||
; 128 don't give any code size savings.
|
||||
define dso_local i32 @f3(i32 %F) local_unnamed_addr #0 {
|
||||
; CHECK-LABEL: f3
|
||||
; CHECK-NOT: sdiv
|
||||
entry:
|
||||
%div = sdiv i32 %F, 256
|
||||
ret i32 %div
|
||||
}
|
||||
|
||||
attributes #0 = { minsize norecurse nounwind optsize readnone }
|
||||
|
||||
|
||||
; These functions don't have the minsize attribute set, so the sdiv should not
|
||||
; be kept as an sdiv instruction, but expanded to the faster instruction sequence.
|
||||
|
||||
define dso_local signext i16 @f4(i16 signext %F) {
|
||||
; T2-LABEL: f4
|
||||
; T2: uxth r1, r0
|
||||
; T2-NEXT: add.w r0, r0, r1, lsr #15
|
||||
; T2-NEXT: sxth r0, r0
|
||||
; T2-NEXT: asrs r0, r0, #1
|
||||
; T2-NEXT: bx lr
|
||||
|
||||
; T1-LABEL: f4
|
||||
; T1: uxth r1, r0
|
||||
; T1-NEXT: lsrs r1, r1, #15
|
||||
; T1-NEXT: adds r0, r0, r1
|
||||
; T1-NEXT: sxth r0, r0
|
||||
; T1-NEXT: asrs r0, r0, #1
|
||||
; T1-NEXT: bx lr
|
||||
|
||||
entry:
|
||||
%0 = sdiv i16 %F, 2
|
||||
ret i16 %0
|
||||
}
|
||||
|
||||
define dso_local i32 @f5(i32 %F) {
|
||||
; T2-LABEL: f5
|
||||
; T2: asrs r1, r0, #31
|
||||
; T2-NEXT: add.w r0, r0, r1, lsr #30
|
||||
; T2-NEXT: asrs r0, r0, #2
|
||||
; T2-NEXT: bx lr
|
||||
|
||||
; T1-LABEL: f5
|
||||
; T1: asrs r1, r0, #31
|
||||
; T1-NEXT: lsrs r1, r1, #30
|
||||
; T1-NEXT: adds r0, r0, r1
|
||||
; T1-NEXT: asrs r0, r0, #2
|
||||
; T1-NEXT: bx lr
|
||||
|
||||
entry:
|
||||
%div = sdiv i32 %F, 4
|
||||
ret i32 %div
|
||||
}
|
Loading…
Reference in New Issue
Block a user