/* Copyright (C) 2006-2020 Free Software Foundation, Inc.
This file is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.
This file is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
Under Section 7 of GPL version 3, you are granted additional
permissions described in the GCC Runtime Library Exception, version
3.1, as published by the Free Software Foundation.
You should have received a copy of the GNU General Public License and
a copy of the GCC Runtime Library Exception along with this program;
see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
<http://www.gnu.org/licenses/>. */
/* Moderately Space-optimized libgcc routines for the Renesas SH /
STMicroelectronics ST40 CPUs.
Contributed by Joern Rennecke joern.rennecke@st.com. */
#include "lib1funcs.h"
#ifdef L_udivsi3_i4i
/* 88 bytes; sh4-200 cycle counts:
divisor >= 2G: 11 cycles
dividend < 2G: 48 cycles
dividend >= 2G: divisor != 1: 54 cycles
dividend >= 2G, divisor == 1: 22 cycles */
#if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
!! args in r4 and r5, result in r0, clobber r1
! Unsigned 32 / 32 bit division using the double-precision FPU.
! The constant pool at L1 holds the FPSCR value selecting double
! precision, followed by 2^32 as a double.  `float' converts FPUL as
! a *signed* value, so dividends >= 2^31 come out negative and are
! fixed up by adding 2^32 before the divide.
.global GLOBAL(udivsi3_i4i)
FUNC(GLOBAL(udivsi3_i4i))
GLOBAL(udivsi3_i4i):
mova L1,r0 ! r0 -> constant pool (FPSCR value, then 2^32)
cmp/pz r5 ! T = (divisor < 2^31)
sts fpscr,r1 ! save caller's FPSCR in r1
lds.l @r0+,fpscr ! enter double-precision mode; r0 now -> 2^32 constant
sts.l fpul,@-r15 ! save FPUL
bf LOCAL(huge_divisor) ! divisor >= 2^31: result is 0 or 1, skip FPU divide
mov.l r1,@-r15 ! save caller's FPSCR on the stack
lds r4,fpul ! dividend -> FPUL for conversion
cmp/pz r4 ! T = (dividend < 2^31), i.e. `float' below is exact
#ifdef FMOVD_WORKS
fmov.d dr0,@-r15 ! save caller's dr0
float fpul,dr0 ! dr0 = (double)(signed) dividend
fmov.d dr2,@-r15 ! save caller's dr2
bt LOCAL(dividend_adjusted) ! dividend < 2^31: no fix-up needed
mov #1,r1
fmov.d @r0,dr2 ! dr2 = 2^32
cmp/eq r1,r5 ! divisor == 1?
bt LOCAL(div_by_1) ! yes: ftrc of the signed value gives the dividend bits back
fadd dr2,dr0 ! fix-up: dr0 += 2^32, now the true unsigned dividend
LOCAL(dividend_adjusted):
lds r5,fpul
float fpul,dr2 ! dr2 = (double) divisor (divisor < 2^31 here, so exact)
fdiv dr2,dr0 ! dr0 = dividend / divisor
LOCAL(div_by_1):
fmov.d @r15+,dr2 ! restore caller's FP registers
ftrc dr0,fpul ! FPUL = (int) quotient
fmov.d @r15+,dr0
#else /* !FMOVD_WORKS */
! Same algorithm, but doubles are saved / restored as two
! single-precision halves (DRn0 / DRn1 are the endian-corrected
! half-register names from lib1funcs.h).
fmov.s DR01,@-r15
mov #1,r1
fmov.s DR00,@-r15
float fpul,dr0 ! dr0 = (double)(signed) dividend
fmov.s DR21,@-r15
bt/s LOCAL(dividend_adjusted) ! dividend < 2^31: no fix-up needed
fmov.s DR20,@-r15 ! delay slot
cmp/eq r1,r5 ! divisor == 1?
bt LOCAL(div_by_1) ! yes: ftrc of the signed value gives the dividend bits back
fmov.s @r0+,DR20 ! dr2 = 2^32, loaded in two halves
fmov.s @r0,DR21
fadd dr2,dr0 ! fix-up: dr0 += 2^32, now the true unsigned dividend
LOCAL(dividend_adjusted):
lds r5,fpul
float fpul,dr2 ! dr2 = (double) divisor (divisor < 2^31 here, so exact)
fdiv dr2,dr0 ! dr0 = dividend / divisor
LOCAL(div_by_1):
fmov.s @r15+,DR20 ! restore caller's FP registers
fmov.s @r15+,DR21
ftrc dr0,fpul ! FPUL = (int) quotient
fmov.s @r15+,DR00
fmov.s @r15+,DR01
#endif /* !FMOVD_WORKS */
lds.l @r15+,fpscr ! restore caller's FPSCR
sts fpul,r0 ! r0 = quotient
rts
lds.l @r15+,fpul ! delay slot: restore caller's FPUL
#ifdef FMOVD_WORKS
.p2align 3 ! make double below 8 byte aligned.
#endif
LOCAL(huge_divisor):
! Divisor >= 2^31: the quotient is 1 iff dividend >= divisor, else 0.
lds r1,fpscr ! restore caller's FPSCR
add #4,r15 ! discard the (unmodified) FPUL save slot
cmp/hs r5,r4 ! T = (dividend >= divisor), unsigned
rts
movt r0 ! delay slot: r0 = T
.p2align 2
L1:
#ifndef FMOVD_WORKS
.long 0x80000 ! FPSCR: PR set - double-precision operations
#else
.long 0x180000 ! FPSCR: PR | SZ set - double precision plus fmov.d transfers
#endif
.double 4294967296 ! 2^32, fix-up constant for dividends >= 2^31
ENDFUNC(GLOBAL(udivsi3_i4i))
#elif !defined (__sh1__) /* !__SH_FPU_DOUBLE__ */
#if 0
/* With 36 bytes, the following would probably be the most compact
implementation, but with 139 cycles on an sh4-200, it is extremely slow. */
! Disabled reference version: a bit-serial division loop built on the
! div1 step instruction.  Kept for documentation only; never assembled.
GLOBAL(udivsi3_i4i):
mov.l r2,@-r15 ! save caller's r2/r3 scratch registers
mov #0,r1
div0u ! initialize the div1 step state (clears M, Q, T)
mov r1,r2
mov.l r3,@-r15
mov r1,r3
sett
mov r4,r0 ! r0 accumulates quotient bits via rotcl
LOCAL(loop):
rotcr r2 ! loop counter: shifts T in; presumably terminates after 32+ steps - TODO confirm
;
bt/s LOCAL(end)
cmp/gt r2,r3 ! delay slot: loop-exit test for the next iteration
rotcl r0 ! shift the latest quotient bit into r0
bra LOCAL(loop)
div1 r5,r1 ! delay slot: one division step
LOCAL(end):
rotcl r0 ! shift in the final quotient bit
mov.l @r15+,r3 ! restore r3
rts
mov.l @r15+,r2 ! delay slot: restore r2
#endif /* 0 */
/* Size: 186 bytes jointly for udivsi3_i4i and sdivsi3_i4i
sh4-200 run times:
udiv small divisor: 55 cycles
udiv large divisor: 52 cycles
sdiv small divisor, positive result: 59 cycles
sdiv large divisor, positive result: 56 cycles
sdiv small divisor, negative result: 65 cycles (*)
sdiv large divisor, negative result: 62 cycles (*)
(*): r2 is restored in the rts delay slot and has a lingering latency
of two more cycles. */
.balign 4
! Unsigned 32 / 32 bit division without the FPU, built from div1 step
! instructions.  Args in r4 (dividend) and r5 (divisor), result in r0.
! The caller's return address is moved from pr into r1 so that bsr can
! reuse pr for the local div6/div7/divx3 step subroutines; the final
! return is `jmp @r1'.  sdivsi3_i4i below shares the
! sdiv_small_divisor / sdiv_large_divisor tails.
.global GLOBAL(udivsi3_i4i)
FUNC(GLOBAL(udivsi3_i4i))
FUNC(GLOBAL(sdivsi3_i4i))
GLOBAL(udivsi3_i4i):
sts pr,r1 ! park return address in r1 (pr is clobbered by bsr below)
mov.l r4,@-r15 ! save dividend; restored just before returning
extu.w r5,r0
cmp/eq r5,r0 ! T = (divisor fits in 16 bits)
swap.w r4,r0 ! r0 = dividend with its 16-bit halves swapped
shlr16 r4 ! r4 = high half of dividend
bf/s LOCAL(large_divisor) ! divisor >= 2^16: take the 32-step path
div0u ! delay slot: initialize the div1 step state
mov.l r5,@-r15 ! save divisor
shll16 r5 ! left-align the 16-bit divisor for the div1 steps
LOCAL(sdiv_small_divisor):
div1 r5,r4
bsr LOCAL(div6)
div1 r5,r4 ! delay slot (executed before div6)
div1 r5,r4
bsr LOCAL(div6)
div1 r5,r4 ! delay slot
xtrct r4,r0 ! interleave partial quotient / working words
xtrct r0,r4
bsr LOCAL(div7)
swap.w r4,r4 ! delay slot
div1 r5,r4
bsr LOCAL(div7)
div1 r5,r4 ! delay slot
xtrct r4,r0 ! assemble the quotient from both halves
mov.l @r15+,r5 ! restore divisor
swap.w r0,r0
mov.l @r15+,r4 ! restore dividend
jmp @r1 ! return via the address parked in r1
rotcl r0 ! delay slot: shift in the last quotient bit
LOCAL(div7): ! subroutine: 7 division steps (fall-through entry div6: 6 steps)
div1 r5,r4
LOCAL(div6):
div1 r5,r4; div1 r5,r4; div1 r5,r4
div1 r5,r4; div1 r5,r4; rts; div1 r5,r4
LOCAL(divx3): ! subroutine: 3 quotient-bit shifts interleaved with 3 steps
rotcl r0
div1 r5,r4
rotcl r0
div1 r5,r4
rotcl r0
rts
div1 r5,r4 ! delay slot
LOCAL(large_divisor):
mov.l r5,@-r15 ! save divisor
LOCAL(sdiv_large_divisor): ! shared with sdivsi3_i4i (divisor >= 2^16)
xor r4,r0 ! combine with the swapped word so r0's low half clears
.rept 4 ! 4 x (1 + bsr-delay-slot step + divx3's 3) = 32 steps total
rotcl r0
bsr LOCAL(divx3)
div1 r5,r4 ! delay slot (executed before divx3)
.endr
mov.l @r15+,r5 ! restore divisor
mov.l @r15+,r4 ! restore dividend
jmp @r1 ! return via the address parked in r1
rotcl r0 ! delay slot: shift in the last quotient bit
ENDFUNC(GLOBAL(udivsi3_i4i))
.global GLOBAL(sdivsi3_i4i)
! Signed 32 / 32 bit division: negate operands to their magnitudes as
! needed, then fall into the shared unsigned division code above.
! When the result must be negative, r1 (the "return address" used by
! that shared code) is pointed at LOCAL(negate_result), which negates
! r0 and returns through the real return address kept in r2; the
! caller's r2 is parked in macl meanwhile.
GLOBAL(sdivsi3_i4i):
mov.l r4,@-r15 ! save dividend (shared code restores it)
cmp/pz r5 ! T = (divisor >= 0)
mov.l r5,@-r15 ! save divisor (shared code restores it)
bt/s LOCAL(pos_divisor)
cmp/pz r4 ! delay slot: T = (dividend >= 0)
neg r5,r5 ! divisor < 0: use its magnitude
extu.w r5,r0
bt/s LOCAL(neg_result) ! dividend >= 0, divisor < 0 -> negative quotient
cmp/eq r5,r0 ! delay slot: T = (|divisor| fits in 16 bits)
neg r4,r4 ! both negative -> positive quotient; use |dividend|
LOCAL(pos_result):
swap.w r4,r0 ! set up swapped dividend as the shared code expects
bra LOCAL(sdiv_check_divisor)
sts pr,r1 ! delay slot: plain return - r1 = real return address
LOCAL(pos_divisor):
extu.w r5,r0
bt/s LOCAL(pos_result) ! both operands non-negative
cmp/eq r5,r0 ! delay slot: T = (divisor fits in 16 bits)
neg r4,r4 ! dividend < 0, divisor >= 0 -> negative quotient; use |dividend|
LOCAL(neg_result):
mova LOCAL(negate_result),r0
;
mov r0,r1 ! shared code will "return" into negate_result
swap.w r4,r0 ! set up swapped dividend as the shared code expects
lds r2,macl ! park caller's r2 in macl
sts pr,r2 ! r2 = real return address, used by negate_result
LOCAL(sdiv_check_divisor):
shlr16 r4 ! r4 = high half of |dividend|
bf/s LOCAL(sdiv_large_divisor) ! |divisor| >= 2^16
div0u ! delay slot: initialize the div1 step state
bra LOCAL(sdiv_small_divisor)
shll16 r5 ! delay slot: left-align the 16-bit divisor
.balign 4 ! mova above needs a 4-byte aligned target
LOCAL(negate_result):
neg r0,r0 ! apply the quotient's sign
jmp @r2 ! return to the real caller
sts macl,r2 ! delay slot: restore caller's r2
ENDFUNC(GLOBAL(sdivsi3_i4i))
#endif /* !__SH_FPU_DOUBLE__ */
#endif /* L_udivsi3_i4i */
#ifdef L_sdivsi3_i4i
#if defined (__SH_FPU_DOUBLE__) || defined (__SH4_SINGLE_ONLY__)
/* 48 bytes, 45 cycles on sh4-200 */
!! args in r4 and r5, result in r0, clobber r1
! Signed 32 / 32 bit division using the double-precision FPU.  Both
! signed 32-bit operands convert exactly to double, so a single
! float / fdiv / ftrc sequence suffices - no sign fix-ups needed.
.global GLOBAL(sdivsi3_i4i)
FUNC(GLOBAL(sdivsi3_i4i))
GLOBAL(sdivsi3_i4i):
sts.l fpscr,@-r15 ! save caller's FPSCR
sts fpul,r1 ! save caller's FPUL in r1
mova L1,r0 ! r0 -> FPSCR value for double-precision mode
lds.l @r0+,fpscr ! enter double-precision mode
lds r4,fpul ! dividend -> FPUL for conversion
#ifdef FMOVD_WORKS
fmov.d dr0,@-r15 ! save caller's dr0
float fpul,dr0 ! dr0 = (double) dividend
lds r5,fpul
fmov.d dr2,@-r15 ! save caller's dr2
#else
! Doubles saved as two single-precision halves (DRn0 / DRn1 are the
! endian-corrected half-register names from lib1funcs.h).
fmov.s DR01,@-r15
fmov.s DR00,@-r15
float fpul,dr0 ! dr0 = (double) dividend
lds r5,fpul
fmov.s DR21,@-r15
fmov.s DR20,@-r15
#endif
float fpul,dr2 ! dr2 = (double) divisor
fdiv dr2,dr0 ! dr0 = dividend / divisor
#ifdef FMOVD_WORKS
fmov.d @r15+,dr2 ! restore caller's FP registers
#else
fmov.s @r15+,DR20
fmov.s @r15+,DR21
#endif
ftrc dr0,fpul ! FPUL = (int) quotient (truncated toward zero)
#ifdef FMOVD_WORKS
fmov.d @r15+,dr0
#else
fmov.s @r15+,DR00
fmov.s @r15+,DR01
#endif
lds.l @r15+,fpscr ! restore caller's FPSCR
sts fpul,r0 ! r0 = quotient
rts
lds r1,fpul ! delay slot: restore caller's FPUL
.p2align 2
L1:
#ifndef FMOVD_WORKS
.long 0x80000 ! FPSCR: PR set - double-precision operations
#else
.long 0x180000 ! FPSCR: PR | SZ set - double precision plus fmov.d transfers
#endif
ENDFUNC(GLOBAL(sdivsi3_i4i))
#endif /* __SH_FPU_DOUBLE__ */
#endif /* L_sdivsi3_i4i */