/* Mirror of https://github.com/pmret/gcc-papermario.git
   (synced 2024-11-08 20:02:47 +01:00; 487 lines, 10 KiB).
   The hosting page labels this file NASM; the content is SPARC assembler.  */
/* This is an assembly language implementation of libgcc1.c for the sparc
   processor.

   These routines are derived from the Sparc Architecture Manual, version 8,
   slightly edited to match the desired calling convention, and also to
   optimize them for our purposes.  */

#ifdef L_mulsi3
! .umul -- 32-bit integer multiply built from mulscc multiply steps.
!
!   In:   %o0 = multiplier, %o1 = multiplicand
!   Out:  %o0 = low 32 bits of the product
!   Uses: %o4 (partial product), %o5 (scratch), %y, integer condition codes
!
! Leaf routine: no save/restore is executed and it returns with retl.
! Two paths:
!   long  - 32 multiply steps plus one shift-only step, result read from %y
!   short - taken when neither operand has a bit set above bit 11;
!           12 multiply steps plus one shift-only step, result reassembled
!           from %o4 and %y
! Three instructions follow the write to %y before the first mulscc on
! either path.
! Instructions in branch delay slots are indented by one extra space.
        .text
        .align  4
        .global .umul
        .proc   4
.umul:
        or      %o0, %o1, %o4   ! logical or of multiplier and multiplicand
        mov     %o0, %y         ! multiplier to Y register
        andncc  %o4, 0xfff, %o5 ! mask out lower 12 bits
        be      mul_shortway    ! can do it the short way
         andcc  %g0, %g0, %o4   ! zero the partial product and clear NV cc
        !
        ! long multiply
        !
        mulscc  %o4, %o1, %o4   ! first iteration of 33
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4   ! 32nd iteration
        mulscc  %o4, %g0, %o4   ! last iteration only shifts
        ! the upper 32 bits of product are wrong, but we do not care
        retl
         rd     %y, %o0         ! result is whatever has accumulated in %y
        !
        ! short multiply
        !
mul_shortway:
        mulscc  %o4, %o1, %o4   ! first iteration of 13
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4
        mulscc  %o4, %o1, %o4   ! 12th iteration
        mulscc  %o4, %g0, %o4   ! last iteration only shifts
        ! after 13 steps the product bits are split between %o4 and %y
        rd      %y, %o5
        sll     %o4, 12, %o4    ! left shift partial product by 12 bits
        srl     %o5, 20, %o5    ! right shift partial product by 20 bits
        retl
         or     %o5, %o4, %o0   ! merge for true product
#endif

#ifdef L_divsi3
! .udiv / .div -- 32-bit unsigned / signed integer division.
!
!   In:   %o0 = dividend, %o1 = divisor   (%i0 / %i1 after the save)
!   Out:  %o0 = quotient
!   A zero divisor raises software trap 2 (te 2).
!
! Both entry points execute save %sp, -64, %sp and then share the code at
! divide.  The restore sits in the delay slot of the final sign test, so
! the result is read from %o2 and the return is a plain retl.
!
! Register roles after the save:
!   %i0  dividend (negated by .div when negative)
!   %i1  divisor  (negated by .div when negative)
!   %i2  quotient being accumulated
!   %i3  working remainder
!   %l0  number of 3-bit divloop iterations still to run
!   %l1  scaled copy of the divisor
!   %l2  sign word: the quotient is negated when %l2 is negative
!   %l3  1<<27, threshold for the big-dividend path
!   %l4  number of single-bit steps (big-dividend path only)
!
! The labels used here (divide, divloop, got_result, ...) are also defined
! by the L_modsi3 section of this file; presumably each section is
! assembled on its own with only its own L_ macro defined.
! Instructions in branch delay slots are indented by one extra space.
        .text
        .align  4
        .global .udiv
        .proc   4
.udiv:
        save    %sp, -64, %sp
        b       divide
         mov    0, %l2          ! result always positive
        .global .div
        .proc   4
.div:
        save    %sp, -64, %sp
        orcc    %i1, %i0, %g0   ! is either operand negative
        bge     divide          ! if not, skip this junk
         xor    %i1, %i0, %l2   ! record sign of result in sign of %l2
        tst     %i1
        bge     2f
         tst    %i0
        !       %i1 < 0
        bge     divide
         neg    %i1
2:      !       %i0 < 0
        neg     %i0
        !       FALL THROUGH
divide:
        ! Compute size of quotient, scale comparand.
        orcc    %i1, %g0, %l1   ! movcc %i1, %l1
        te      2               ! if %i1 = 0
        mov     %i0, %i3
        mov     0, %i2
        sethi   %hi(1<<(32-4-1)), %l3
        cmp     %i3, %l3
        blu     not_really_big
         mov    0, %l0
        !
        ! Here, the %i0 is >= 2^(31-3) or so.  We must be careful here,
        ! as our usual 3-at-a-shot divide step will cause overflow and havoc.
        ! The total number of bits in the result here is 3*%l0+%l4, where
        ! %l4 <= 3.
        ! Compute %l0 in an unorthodox manner: know we need to Shift %l1 into
        ! the top decade: so do not even bother to compare to %i3.
1:      cmp     %l1, %l3
        bgeu    3f
         mov    1, %l4
        sll     %l1, 3, %l1
        b       1b
         inc    %l0
        !
        ! Now compute %l4
        !
2:      addcc   %l1, %l1, %l1
        bcc     not_too_big
         add    %l4, 1, %l4
        !
        ! We are here if the %i1 overflowed when Shifting.
        ! This means that %i3 has the high-order bit set.
        ! Restore %l1 and subtract from %i3.
        sll     %l3, 4, %l3     ! %l3 becomes 1<<31, the bit that was carried out
        srl     %l1, 1, %l1
        add     %l1, %l3, %l1
        b       do_single_div
         dec    %l4
not_too_big:
3:      cmp     %l1, %i3
        blu     2b
         nop
        be      do_single_div
         nop
        ! %l1 > %i3: went too far: back up 1 step
        !       srl     %l1, 1, %l1
        !       dec     %l4
        ! do single-bit divide steps
        !
        ! We have to be careful here.  We know that %i3 >= %l1, so we can do the
        ! first divide step without thinking.  BUT, the others are conditional,
        ! and are only done if %i3 >= 0.  Because both %i3 and %l1 may have the
        ! high-order bit set in the first step, just falling into the regular
        ! division loop will mess up the first time around.
        ! So we unroll slightly...
do_single_div:
        deccc   %l4
        bl      end_regular_divide
         nop
        sub     %i3, %l1, %i3
        mov     1, %i2
        b       end_single_divloop
         nop
single_divloop:
        sll     %i2, 1, %i2     ! sll leaves the condition codes alone
        bl      1f              ! tests the tst %i3 done in the loop branch delay slot
         srl    %l1, 1, %l1
        ! %i3 >= 0
        sub     %i3, %l1, %i3
        b       2f
         inc    %i2
1:      ! %i3 < 0
        add     %i3, %l1, %i3
        dec     %i2
end_single_divloop:
2:      deccc   %l4
        bge     single_divloop
         tst    %i3
        b       end_regular_divide
         nop
not_really_big:
1:      sll     %l1, 3, %l1
        cmp     %l1, %i3
        bleu    1b
         inccc  %l0
        be      got_result
         dec    %l0
do_regular_divide:
        ! Do the main division iteration
        tst     %i3
        ! Fall through into divide loop
divloop:
        ! Each pass produces three quotient bits.  The branches test the sign
        ! of the working remainder; sll leaves the condition codes alone.
        sll     %i2, 3, %i2
        ! depth 1, accumulated bits 0
        bl      L.1.8
         srl    %l1,1,%l1
        ! remainder is positive
        subcc   %i3,%l1,%i3
        ! depth 2, accumulated bits 1
        bl      L.2.9
         srl    %l1,1,%l1
        ! remainder is positive
        subcc   %i3,%l1,%i3
        ! depth 3, accumulated bits 3
        bl      L.3.11
         srl    %l1,1,%l1
        ! remainder is positive
        subcc   %i3,%l1,%i3
        b       9f
         add    %i2, (3*2+1), %i2
L.3.11: ! remainder is negative
        addcc   %i3,%l1,%i3
        b       9f
         add    %i2, (3*2-1), %i2
L.2.9:  ! remainder is negative
        addcc   %i3,%l1,%i3
        ! depth 3, accumulated bits 1
        bl      L.3.9
         srl    %l1,1,%l1
        ! remainder is positive
        subcc   %i3,%l1,%i3
        b       9f
         add    %i2, (1*2+1), %i2
L.3.9:  ! remainder is negative
        addcc   %i3,%l1,%i3
        b       9f
         add    %i2, (1*2-1), %i2
L.1.8:  ! remainder is negative
        addcc   %i3,%l1,%i3
        ! depth 2, accumulated bits -1
        bl      L.2.7
         srl    %l1,1,%l1
        ! remainder is positive
        subcc   %i3,%l1,%i3
        ! depth 3, accumulated bits -1
        bl      L.3.7
         srl    %l1,1,%l1
        ! remainder is positive
        subcc   %i3,%l1,%i3
        b       9f
         add    %i2, (-1*2+1), %i2
L.3.7:  ! remainder is negative
        addcc   %i3,%l1,%i3
        b       9f
         add    %i2, (-1*2-1), %i2
L.2.7:  ! remainder is negative
        addcc   %i3,%l1,%i3
        ! depth 3, accumulated bits -3
        bl      L.3.5
         srl    %l1,1,%l1
        ! remainder is positive
        subcc   %i3,%l1,%i3
        b       9f
         add    %i2, (-3*2+1), %i2
L.3.5:  ! remainder is negative
        addcc   %i3,%l1,%i3
        b       9f
         add    %i2, (-3*2-1), %i2
end_regular_divide:
9:      deccc   %l0
        bge     divloop
         tst    %i3
        bge     got_result
         nop
        ! non-restoring fixup here
        dec     %i2             ! remainder ended negative: quotient is one too large
got_result:
        tst     %l2
        bge     1f
         restore                ! back in the caller window: %i2 is now %o2
        ! answer < 0
        retl                    ! leaf-routine return
         neg    %o2, %o0        ! quotient <- -%i2
1:      retl                    ! leaf-routine return
         mov    %o2, %o0        ! quotient <- %i2
#endif

#ifdef L_modsi3
! .urem / .rem -- 32-bit unsigned / signed integer remainder.
!
!   In:   %o0 = dividend, %o1 = divisor   (%i0 / %i1 after the save)
!   Out:  %o0 = remainder; for .rem it is negated when the dividend was
!         negative
!   A zero divisor raises software trap 2 (te 2).
!
! Both entry points execute save %sp, -64, %sp and then share the code at
! divide.  The restore sits in the delay slot of the final sign test, so
! the result is read from %o3 and the return is a plain retl.
!
! Register roles after the save:
!   %i0  dividend (negated by .rem when negative)
!   %i1  divisor  (negated by .rem when negative)
!   %i2  quotient being accumulated (not returned)
!   %i3  working remainder, returned to the caller
!   %l0  number of 3-bit divloop iterations still to run
!   %l1  scaled copy of the divisor
!   %l2  sign word: the remainder is negated when %l2 is negative
!   %l3  1<<27, threshold for the big-dividend path
!   %l4  number of single-bit steps (big-dividend path only)
!
! The labels used here (divide, divloop, got_result, ...) are also defined
! by the L_divsi3 section of this file; presumably each section is
! assembled on its own with only its own L_ macro defined.
! Instructions in branch delay slots are indented by one extra space.
        .text
        .align  4
        .global .urem
        .proc   4
.urem:
        save    %sp, -64, %sp
        b       divide
         mov    0, %l2          ! result always positive
        .global .rem
        .proc   4
.rem:
        save    %sp, -64, %sp
        orcc    %i1, %i0, %g0   ! is either operand negative
        bge     divide          ! if not, skip this junk
         mov    %i0, %l2        ! record sign of result in sign of %l2
        tst     %i1
        bge     2f
         tst    %i0
        !       %i1 < 0
        bge     divide
         neg    %i1
2:      !       %i0 < 0
        neg     %i0
        !       FALL THROUGH
divide:
        ! Compute size of quotient, scale comparand.
        orcc    %i1, %g0, %l1   ! movcc %i1, %l1
        te      2               ! if %i1 = 0
        mov     %i0, %i3
        mov     0, %i2
        sethi   %hi(1<<(32-4-1)), %l3
        cmp     %i3, %l3
        blu     not_really_big
         mov    0, %l0
        !
        ! Here, the %i0 is >= 2^(31-3) or so.  We must be careful here,
        ! as our usual 3-at-a-shot divide step will cause overflow and havoc.
        ! The total number of bits in the result here is 3*%l0+%l4, where
        ! %l4 <= 3.
        ! Compute %l0 in an unorthodox manner: know we need to Shift %l1 into
        ! the top decade: so do not even bother to compare to %i3.
1:      cmp     %l1, %l3
        bgeu    3f
         mov    1, %l4
        sll     %l1, 3, %l1
        b       1b
         inc    %l0
        !
        ! Now compute %l4
        !
2:      addcc   %l1, %l1, %l1
        bcc     not_too_big
         add    %l4, 1, %l4
        !
        ! We are here if the %i1 overflowed when Shifting.
        ! This means that %i3 has the high-order bit set.
        ! Restore %l1 and subtract from %i3.
        sll     %l3, 4, %l3     ! %l3 becomes 1<<31, the bit that was carried out
        srl     %l1, 1, %l1
        add     %l1, %l3, %l1
        b       do_single_div
         dec    %l4
not_too_big:
3:      cmp     %l1, %i3
        blu     2b
         nop
        be      do_single_div
         nop
        ! %l1 > %i3: went too far: back up 1 step
        !       srl     %l1, 1, %l1
        !       dec     %l4
        ! do single-bit divide steps
        !
        ! We have to be careful here.  We know that %i3 >= %l1, so we can do the
        ! first divide step without thinking.  BUT, the others are conditional,
        ! and are only done if %i3 >= 0.  Because both %i3 and %l1 may have the
        ! high-order bit set in the first step, just falling into the regular
        ! division loop will mess up the first time around.
        ! So we unroll slightly...
do_single_div:
        deccc   %l4
        bl      end_regular_divide
         nop
        sub     %i3, %l1, %i3
        mov     1, %i2
        b       end_single_divloop
         nop
single_divloop:
        sll     %i2, 1, %i2     ! sll leaves the condition codes alone
        bl      1f              ! tests the tst %i3 done in the loop branch delay slot
         srl    %l1, 1, %l1
        ! %i3 >= 0
        sub     %i3, %l1, %i3
        b       2f
         inc    %i2
1:      ! %i3 < 0
        add     %i3, %l1, %i3
        dec     %i2
end_single_divloop:
2:      deccc   %l4
        bge     single_divloop
         tst    %i3
        b       end_regular_divide
         nop
not_really_big:
1:      sll     %l1, 3, %l1
        cmp     %l1, %i3
        bleu    1b
         inccc  %l0
        be      got_result
         dec    %l0
do_regular_divide:
        ! Do the main division iteration
        tst     %i3
        ! Fall through into divide loop
divloop:
        ! Each pass produces three quotient bits.  The branches test the sign
        ! of the working remainder; sll leaves the condition codes alone.
        sll     %i2, 3, %i2
        ! depth 1, accumulated bits 0
        bl      L.1.8
         srl    %l1,1,%l1
        ! remainder is positive
        subcc   %i3,%l1,%i3
        ! depth 2, accumulated bits 1
        bl      L.2.9
         srl    %l1,1,%l1
        ! remainder is positive
        subcc   %i3,%l1,%i3
        ! depth 3, accumulated bits 3
        bl      L.3.11
         srl    %l1,1,%l1
        ! remainder is positive
        subcc   %i3,%l1,%i3
        b       9f
         add    %i2, (3*2+1), %i2
L.3.11: ! remainder is negative
        addcc   %i3,%l1,%i3
        b       9f
         add    %i2, (3*2-1), %i2
L.2.9:  ! remainder is negative
        addcc   %i3,%l1,%i3
        ! depth 3, accumulated bits 1
        bl      L.3.9
         srl    %l1,1,%l1
        ! remainder is positive
        subcc   %i3,%l1,%i3
        b       9f
         add    %i2, (1*2+1), %i2
L.3.9:  ! remainder is negative
        addcc   %i3,%l1,%i3
        b       9f
         add    %i2, (1*2-1), %i2
L.1.8:  ! remainder is negative
        addcc   %i3,%l1,%i3
        ! depth 2, accumulated bits -1
        bl      L.2.7
         srl    %l1,1,%l1
        ! remainder is positive
        subcc   %i3,%l1,%i3
        ! depth 3, accumulated bits -1
        bl      L.3.7
         srl    %l1,1,%l1
        ! remainder is positive
        subcc   %i3,%l1,%i3
        b       9f
         add    %i2, (-1*2+1), %i2
L.3.7:  ! remainder is negative
        addcc   %i3,%l1,%i3
        b       9f
         add    %i2, (-1*2-1), %i2
L.2.7:  ! remainder is negative
        addcc   %i3,%l1,%i3
        ! depth 3, accumulated bits -3
        bl      L.3.5
         srl    %l1,1,%l1
        ! remainder is positive
        subcc   %i3,%l1,%i3
        b       9f
         add    %i2, (-3*2+1), %i2
L.3.5:  ! remainder is negative
        addcc   %i3,%l1,%i3
        b       9f
         add    %i2, (-3*2-1), %i2
end_regular_divide:
9:      deccc   %l0
        bge     divloop
         tst    %i3
        bge     got_result
         nop
        ! non-restoring fixup here
        add     %i3, %i1, %i3   ! remainder ended negative: add the divisor back once
got_result:
        tst     %l2
        bge     1f
         restore                ! back in the caller window: %i3 is now %o3
        ! answer < 0
        retl                    ! leaf-routine return
         neg    %o3, %o0        ! remainder <- -%i3
1:      retl                    ! leaf-routine return
         mov    %o3, %o0        ! remainder <- %i3
#endif