dnl PowerPC-64 mpn_mod_1_1p
dnl Copyright 2010, 2011 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl * the GNU Lesser General Public License as published by the Free
dnl Software Foundation; either version 3 of the License, or (at your
dnl option) any later version.
dnl
dnl or
dnl
dnl * the GNU General Public License as published by the Free Software
dnl Foundation; either version 2 of the License, or (at your option) any
dnl later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
dnl see https://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C POWER3/PPC630 ?
C POWER4/PPC970 17
C POWER5 16
C POWER6 30
C POWER7 10.2
C TODO
C * Optimise, in particular the cps function. This was compiler-generated and
C then hand optimised.
C INPUT PARAMETERS
C Symbolic names for the four argument registers of mpn_mod_1_1p:
C   ap  = r3  pointer to the limbs of the dividend
C   n   = r4  number of limbs at ap
C   d   = r5  the divisor limb
C   cps = r6  pointer to the precomputed table filled in by mpn_mod_1_1p_cps
C NOTE(review): the instructions in this file name r3-r6 directly, so these
C aliases appear to serve as documentation only -- confirm before relying on
C them.
define(`ap', `r3')
define(`n', `r4')
define(`d', `r5')
define(`cps', `r6')
ASM_START()
C mpn_invert_limb is defined elsewhere; it is called from mpn_mod_1_1p_cps.
EXTERN_FUNC(mpn_invert_limb)
C mpn_mod_1_1p -- fold the n limbs at ap down to a two-limb value using the
C precomputed constants in the table at cps, then reduce that value modulo d
C and return the result in r3.
C
C Arguments: r3 = ap, r4 = n, r5 = d, r6 = cps.
C Table slots as read by this function:
C    0(cps)  multiplier used for the quotient estimate in the final step
C    8(cps)  shift count, only the low 32 bits are loaded
C   16(cps)  B1modb
C   24(cps)  B2modb
C
C Register roles in the main loop:
C   r0 = B1modb, r12 = B2modb, r9:r11 = two-limb accumulator, high:low
C
C NOTE(review): the two highest limbs are loaded unconditionally and the loop
C runs n-2 times, so this appears to require n >= 2 -- confirm against callers.
C NOTE(review): the result is shifted right by the count before returning,
C which suggests d is expected to arrive already shifted left by that count --
C confirm against callers.
PROLOGUE(mpn_mod_1_1p)
sldi r10, r4, 3 C r10 = 8*n, the operand size in bytes
addi r4, r4, -1 C r4 = n-1, initial loop counter
add r3, r3, r10 C r3 = ap + 8*n, one limb past the top
ld r0, 16(r6) C B1modb
ld r12, 24(r6) C B2modb
ld r9, -8(r3) C r9 = ap[n-1]
ld r10, -16(r3) C r10 = ap[n-2]
mtctr r4
C Seed the accumulator: r9:r11 = ap[n-1] * B1modb + ap[n-2]
mulhdu r8, r9, r0
mulld r7, r9, r0
addc r11, r7, r10
addze r9, r8
C CTR becomes n-2 here; when that is zero there are no further limbs to fold.
bdz L(end)
ALIGN(16)
C Each iteration consumes the next lower limb and computes
C   r9:r11 = r11 * B1modb + r9 * B2modb + limb
C The carry bit links addc->addze and addc->adde, so the order of those four
C instructions must be preserved.
L(top): ld r4, -24(r3) C r4 = next lower limb, r4 is free since CTR counts
addi r3, r3, -8 C step the pointer down one limb
nop C NOTE(review): presumably scheduling padding -- confirm
mulld r10, r11, r0 C low  half of r11 * B1modb
mulld r8, r9, r12 C low  half of r9  * B2modb
mulhdu r11, r11, r0 C high half of r11 * B1modb
mulhdu r9, r9, r12 C high half of r9  * B2modb
addc r7, r10, r4 C add the new limb to the first low half
addze r10, r11 C propagate carry into the first high half
addc r11, r8, r7 C new low limb of the accumulator
adde r9, r9, r10 C new high limb of the accumulator
bdnz L(top)
L(end):
C Load the shift count from the low 32 bits of the 64-bit slot at 8(cps); the
C byte offset of that low word depends on limb endianness.
ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
` lwz r0, 8(r6)',
` lwz r0, 12(r6)')
ld r3, 0(r6) C r3 = multiplier from slot 0 of the table
cmpdi cr7, r0, 0
beq- cr7, L(4) C count is zero: high limb needs no shifting
C Nonzero count: r9 = high limb of the accumulator shifted left by the count.
subfic r10, r0, 64
sld r9, r9, r0
srd r10, r11, r10
or r9, r10, r9
C Branch-free conditional subtract: r9 -= d when r9 >= d.
C subfc leaves the carry set exactly when r9 >= d, subfe turns that into
C 0 or -1, and nand inverts it into the mask that selects d.
L(4): subfc r10, r5, r9
subfe r10, r10, r10
nand r10, r10, r10
sld r11, r11, r0 C low limb shifted left by the count
and r10, r10, r5
subf r9, r10, r9
C Divide r9:r11 by d using the multiplier in r3.
C Quotient estimate = high half of r9*r3, plus r9+1, plus the carry out of
C adding r11 to the low half of that product; the low sum is kept in r8.
mulhdu r10, r9, r3
mulld r3, r9, r3
addi r9, r9, 1
addc r8, r3, r11
adde r3, r10, r9
mulld r3, r3, r5 C low half of estimate * d
subf r3, r3, r11 C candidate remainder = r11 - estimate * d
C First adjustment: add d back when the low sum r8 is below the candidate.
cmpld cr7, r8, r3
bge cr7, L(5) C FIXME: Make branch-less
add r3, r3, r5
C Second adjustment: subtract d once if the candidate is still >= d.
L(5): cmpld cr7, r3, r5
bge- cr7, L(10)
srd r3, r3, r0 C undo the shift and return
blr
L(10): subf r3, r5, r3
srd r3, r3, r0 C undo the shift and return
blr
EPILOGUE()
C mpn_mod_1_1p_cps -- fill the four-limb table at r3 with the constants that
C mpn_mod_1_1p reads, derived from the divisor passed in r4.
C
C Arguments: r3 = pointer to the table, r4 = divisor.
C Table written:
C    0  value returned by mpn_invert_limb for the divisor shifted left by cnt
C    8  cnt, the leading-zero count of the divisor, stored as a full limb
C   16  the value mpn_mod_1_1p loads as B1modb, shifted right by cnt
C   24  the value mpn_mod_1_1p loads as B2modb, shifted right by cnt
C
C Registers kept live across the call:
C   r29 = table pointer, r30 = shifted divisor, r31 = cnt
C The prologue and epilogue are interleaved with the computation, so the
C order of the stack and link register instructions matters.
PROLOGUE(mpn_mod_1_1p_cps,toc)
mflr r0
C Save r29-r31 below the incoming stack pointer before the frame is pushed.
std r29, -24(r1)
std r30, -16(r1)
std r31, -8(r1)
cntlzd r31, r4 C r31 = cnt, leading zeros of the divisor
std r0, 16(r1) C save the return address in the caller frame
extsw r31, r31 C sign-extend the low word of cnt
mr r29, r3 C keep the table pointer across the call
stdu r1, -144(r1) C push a 144-byte frame
sld r30, r4, r31 C r30 = divisor shifted left by cnt
mr r3, r30
CALL( mpn_invert_limb)
C From here on r3 holds the value returned by mpn_invert_limb.
cmpdi cr7, r31, 0
neg r0, r30 C value used for B1modb when cnt is zero
beq- cr7, L(13)
C cnt nonzero:
C   r0 = ((r3 >> (64-cnt)) | (1 << cnt)) * -r30, low 64 bits of the product
subfic r11, r31, 64
li r0, 1
neg r9, r30
srd r11, r3, r11
sld r0, r0, r31
or r0, r11, r0
mulld r0, r0, r9
C Derive the second constant from r0:
C   r9 = ~(r0 + high half of r0*r3) * r30, low 64 bits,
C   then r9 += r30 when the low half of r0*r3 is below r9.
L(13): mulhdu r9, r0, r3
mulld r11, r0, r3
add r9, r0, r9
nor r9, r9, r9
mulld r9, r9, r30
cmpld cr7, r11, r9
bge cr7, L(14)
add r9, r9, r30
C Pop the frame, store the four table entries and restore saved registers.
L(14): addi r1, r1, 144
srd r0, r0, r31 C first constant shifted right by cnt
std r31, 8(r29) C table slot 1 = cnt
std r3, 0(r29) C table slot 0 = mpn_invert_limb result
std r0, 16(r29) C table slot 2, read as B1modb
ld r0, 16(r1) C reload the return address
srd r9, r9, r31 C second constant shifted right by cnt
ld r30, -16(r1)
ld r31, -8(r1)
std r9, 24(r29) C table slot 3, read as B2modb
ld r29, -24(r1) C restored last because r29 is the store base above
mtlr r0
blr
EPILOGUE()