mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 11:13:28 +01:00
[ARM] Code size optimisation to lower udiv+urem to udiv+mls instead of a
library call to __aeabi_uidivmod. This is an improved implementation of r280808 (see also D24133), which got reverted because isel was stuck in a loop. That was caused by the optimisation incorrectly triggering on i64 ints, which shouldn't happen because there is no 64-bit hwdiv support; that put isel's type legalization and this optimisation in a loop. A native ARM compiler and testing now show that this is fixed. Patch mostly by Pablo Barrio. Differential Revision: https://reviews.llvm.org/D25077 llvm-svn: 283098
This commit is contained in:
parent
cab8bd9b77
commit
5a6c252c8d
@ -12436,6 +12436,25 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
|
||||
bool isSigned = (Opcode == ISD::SDIVREM);
|
||||
EVT VT = Op->getValueType(0);
|
||||
Type *Ty = VT.getTypeForEVT(*DAG.getContext());
|
||||
SDLoc dl(Op);
|
||||
|
||||
// If the target has hardware divide, use divide + multiply + subtract:
|
||||
// div = a / b
|
||||
// rem = a - b * div
|
||||
// return {div, rem}
|
||||
// This should be lowered into UDIV/SDIV + MLS later on.
|
||||
if (Subtarget->hasDivide() && Op->getValueType(0).isSimple() &&
|
||||
Op->getSimpleValueType(0) == MVT::i32) {
|
||||
unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
|
||||
const SDValue Dividend = Op->getOperand(0);
|
||||
const SDValue Divisor = Op->getOperand(1);
|
||||
SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
|
||||
SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
|
||||
SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
|
||||
|
||||
SDValue Values[2] = {Div, Rem};
|
||||
return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
|
||||
}
|
||||
|
||||
RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
|
||||
VT.getSimpleVT().SimpleTy);
|
||||
@ -12449,7 +12468,6 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
|
||||
|
||||
Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr);
|
||||
|
||||
SDLoc dl(Op);
|
||||
TargetLowering::CallLoweringInfo CLI(DAG);
|
||||
CLI.setDebugLoc(dl).setChain(InChain)
|
||||
.setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
|
||||
|
@ -3,7 +3,12 @@
|
||||
; expanded to a sequence of umull, lsrs, muls and sub instructions, but
|
||||
; just a call to __aeabi_uidivmod.
|
||||
;
|
||||
; When the processor features hardware division, UDIV + UREM can be turned
|
||||
; into UDIV + MLS. This prevents the library function __aeabi_uidivmod from being
|
||||
; pulled into the binary. The test uses ARMv7-M.
|
||||
;
|
||||
; RUN: llc -mtriple=armv7a-eabi -mattr=-neon -verify-machineinstrs %s -o - | FileCheck %s
|
||||
; RUN: llc -mtriple=thumbv7m-eabi -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=V7M
|
||||
|
||||
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
|
||||
target triple = "thumbv7m-arm-none-eabi"
|
||||
@ -28,11 +33,16 @@ entry:
|
||||
ret i32 %div
|
||||
}
|
||||
|
||||
; Test for unsigned remainder
|
||||
define i32 @foo3() local_unnamed_addr #0 {
|
||||
entry:
|
||||
; CHECK-LABEL: foo3:
|
||||
; CHECK: __aeabi_uidivmod
|
||||
; CHECK-NOT: umull
|
||||
; V7M-LABEL: foo3:
|
||||
; V7M: udiv [[R2:r[0-9]+]], [[R0:r[0-9]+]], [[R1:r[0-9]+]]
|
||||
; V7M: mls {{r[0-9]+}}, [[R2]], [[R1]], [[R0]]
|
||||
; V7M-NOT: __aeabi_uidivmod
|
||||
%call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)()
|
||||
%rem = urem i32 %call, 1000000
|
||||
%cmp = icmp eq i32 %rem, 0
|
||||
@ -40,6 +50,68 @@ entry:
|
||||
ret i32 %conv
|
||||
}
|
||||
|
||||
; Test for signed remainder
|
||||
define i32 @foo4() local_unnamed_addr #0 {
|
||||
entry:
|
||||
; CHECK-LABEL: foo4:
|
||||
; CHECK:__aeabi_idivmod
|
||||
; V7M-LABEL: foo4:
|
||||
; V7M: sdiv [[R2:r[0-9]+]], [[R0:r[0-9]+]], [[R1:r[0-9]+]]
|
||||
; V7M: mls {{r[0-9]+}}, [[R2]], [[R1]], [[R0]]
|
||||
; V7M-NOT: __aeabi_idivmod
|
||||
%call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)()
|
||||
%rem = srem i32 %call, 1000000
|
||||
ret i32 %rem
|
||||
}
|
||||
|
||||
; Check that doing a sdiv+srem has the same effect as only the srem,
|
||||
; as the division needs to be computed anyway in order to calculate
|
||||
; the remainder (i.e. make sure we don't end up with two divisions).
|
||||
define i32 @foo5() local_unnamed_addr #0 {
|
||||
entry:
|
||||
; CHECK-LABEL: foo5:
|
||||
; CHECK:__aeabi_idivmod
|
||||
; V7M-LABEL: foo5:
|
||||
; V7M: sdiv [[R2:r[0-9]+]], [[R0:r[0-9]+]], [[R1:r[0-9]+]]
|
||||
; V7M-NOT: sdiv
|
||||
; V7M: mls {{r[0-9]+}}, [[R2]], [[R1]], [[R0]]
|
||||
; V7M-NOT: __aeabi_idivmod
|
||||
%call = tail call i32 bitcast (i32 (...)* @GetValue to i32 ()*)()
|
||||
%div = sdiv i32 %call, 1000000
|
||||
%rem = srem i32 %call, 1000000
|
||||
%add = add i32 %div, %rem
|
||||
ret i32 %add
|
||||
}
|
||||
|
||||
; An early version of this patch caused isel to hang. The reason
|
||||
; was that it shouldn't do the rewrite for i64 because that's not
|
||||
; supported by hardware. Isel was stuck in a loop with type
|
||||
; legalization and this optimisation.
|
||||
; Function Attrs: norecurse nounwind
|
||||
define i64 @isel_dont_hang(i32 %bar) local_unnamed_addr #4 {
|
||||
entry:
|
||||
; CHECK-LABEL: isel_dont_hang:
|
||||
; CHECK: __aeabi_uldivmod
|
||||
%temp.0 = sext i32 %bar to i64
|
||||
%mul83 = shl i64 %temp.0, 1
|
||||
%add84 = add i64 %temp.0, 2
|
||||
%div85 = udiv i64 %mul83, %add84
|
||||
ret i64 %div85
|
||||
}
|
||||
|
||||
; i16 types are promoted to i32, and we expect a normal udiv here:
|
||||
define i16 @isel_dont_hang_2(i16 %bar) local_unnamed_addr #4 {
|
||||
entry:
|
||||
; CHECK-LABEL: isel_dont_hang_2:
|
||||
; CHECK: udiv
|
||||
; CHECK-NOT: __aeabi_
|
||||
%mul83 = shl i16 %bar, 1
|
||||
%add84 = add i16 %bar, 2
|
||||
%div85 = udiv i16 %mul83, %add84
|
||||
ret i16 %div85
|
||||
}
|
||||
declare i32 @GetValue(...) local_unnamed_addr
|
||||
|
||||
attributes #0 = { minsize nounwind optsize }
|
||||
attributes #4 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-jump-tables"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a15" "target-features"="+dsp,+hwdiv,+hwdiv-arm,+neon,+vfp4" "use-soft-float"="false" }
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user