dnl Intel Pentium 4 mpn_mod_34lsub1 -- remainder modulo 2^24-1.
dnl Copyright 2000-2003 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl * the GNU Lesser General Public License as published by the Free
dnl Software Foundation; either version 3 of the License, or (at your
dnl option) any later version.
dnl
dnl or
dnl
dnl * the GNU General Public License as published by the Free Software
dnl Foundation; either version 2 of the License, or (at your option) any
dnl later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
dnl see https://www.gnu.org/licenses/.
include(`../config.m4')
C Pentium4: 1.0 cycles/limb
C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C
C Enhancements:
C
C There might be a couple of cycles to save by using plain integer code for
C more small sizes. 2 limbs measures about 20 cycles, but 3 limbs jumps to
C about 46 (inclusive of some function call overheads).
dnl Incoming arguments: the src pointer is registered at frame offset 4 and
dnl the limb count at frame offset 8. defframe itself is defined outside this
dnl file, presumably yielding stack-pointer-relative operands -- TODO confirm.
defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC, 4)
dnl re-use parameter space
dnl NOTE(review): SAVE_EBX and SAVE_ESI are not referenced by any instruction
dnl in this file as shown, and neither ebx nor esi is written below. Confirm
dnl nothing else relies on them before treating them as dead.
define(SAVE_EBX, `PARAM_SRC')
define(SAVE_ESI, `PARAM_SIZE')
TEXT
ALIGN(16)
C mpn_mod_34lsub1 -- return a value congruent to the limb vector at src,
C size limbs long, modulo 2^24-1.
C
C In: PARAM_SRC pointer to 32-bit limbs, src[0] carrying weight 1
C PARAM_SIZE number of limbs. src[0] is loaded unconditionally, so size
C is presumably at least 1 -- TODO confirm against callers.
C Out: eax. The value is congruent to the input but is not reduced below
C 2^24-1: the one limb path returns src[0] unchanged and the other paths
C return an unnormalised sum of folded pieces.
C Uses: eax, ecx, edx and the integer flags on every path. Sizes of three
C or more also use mm0-mm7 and finish with emms. ebx and esi are untouched.
C
C Method: 2^24 == 1 modulo 2^24-1, so limb i, whose weight is 2^[32*i], acts
C like weight 2^0, 2^8 or 2^16 according to i mod 3. Limbs are summed into
C three accumulators by i mod 3 and the sums are combined at those offsets.
PROLOGUE(mpn_mod_34lsub1)
deflit(`FRAME',0)
movl PARAM_SIZE, %ecx
movl PARAM_SRC, %edx
movl (%edx), %eax
C Dispatch on size with a single subtract: ja is taken when size-2 is above
C zero, which is three or more limbs. jne is then taken when size-2 is non
C zero, which is the one limb case. Falling through is exactly two limbs.
subl $2, %ecx
ja L(three_or_more)
jne L(one)
C Two limbs, plain integer code. The result is the sum of four pieces:
C src[0]>>24, src[0] low 24 bits, src[1]>>16, and src[1] low 16 bits <<8.
movl 4(%edx), %edx
movl %eax, %ecx
shrl $24, %eax C src[0] high
andl $0x00FFFFFF, %ecx C src[0] low
addl %ecx, %eax
movl %edx, %ecx
shll $8, %edx
shrl $16, %ecx C src[1] high 16 bits, weight 2^48 == 1, added at bit 0
addl %ecx, %eax
andl $0x00FFFF00, %edx C src[1] low 16 bits, weight 2^32 == 2^8, at bit 8
addl %edx, %eax
C One limb arrives here directly with eax still holding src[0]. No MMX
C register has been touched on either short path, so no emms is needed.
L(one):
ret
L(three_or_more):
C Clear the three 64-bit accumulators.
pxor %mm0, %mm0
pxor %mm1, %mm1
pxor %mm2, %mm2
C Build both masks from all-ones with a right shift instead of loading
C constants from memory.
pcmpeqd %mm7, %mm7
psrlq $32, %mm7 C 0x00000000FFFFFFFF, low 32 bits
pcmpeqd %mm6, %mm6
psrlq $40, %mm6 C 0x0000000000FFFFFF, low 24 bits
L(top):
C eax
C ebx
C ecx counter, size-2 to 0, -1 or -2
C edx src, incrementing
C
C mm0 sum 0mod3
C mm1 sum 1mod3
C mm2 sum 2mod3
C mm3
C mm4
C mm5
C mm6 0x0000000000FFFFFF
C mm7 0x00000000FFFFFFFF
C
C Each pass adds three consecutive limbs, one to each accumulator. mm3 is
C the scratch register holding the limb just loaded.
movd (%edx), %mm3
paddq %mm3, %mm0
movd 4(%edx), %mm3
paddq %mm3, %mm1
movd 8(%edx), %mm3
paddq %mm3, %mm2
addl $12, %edx
C Loop again only while the counter stays above zero after the subtract,
C that is, no borrow and a non zero result.
subl $3, %ecx
ja L(top)
C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
addl $1, %ecx
js L(combine) C 0 more
movd (%edx), %mm3
paddq %mm3, %mm0
C The jz below still tests the flags left by the addl above. This relies on
C movd and paddq not modifying the integer flags.
jz L(combine) C 1 more
movd 4(%edx), %mm3
paddq %mm3, %mm1
L(combine):
C Split each accumulator into 32-bit halves. A high half carries a further
C factor of 2^32 == 2^8, so it belongs with the next accumulator in the
C rotation: mm0 high joins mm1 low, mm1 high joins mm2 low, and mm2 high
C wraps around to join mm0 low.
movq %mm7, %mm3 C low halves
pand %mm0, %mm3
movq %mm7, %mm4
pand %mm1, %mm4
movq %mm7, %mm5
pand %mm2, %mm5
psrlq $32, %mm0 C high halves
psrlq $32, %mm1
psrlq $32, %mm2
paddq %mm0, %mm4 C fold high halves to give 33 bits each
paddq %mm1, %mm5
paddq %mm2, %mm3
C mm3 holds the weight 2^0 sum, mm4 the weight 2^8 sum and mm5 the weight
C 2^16 sum. Shift the latter two into position and add all three.
psllq $8, %mm4 C combine at respective offsets
psllq $16, %mm5
paddq %mm4, %mm3
paddq %mm5, %mm3 C 0x000cxxxxxxxxxxxx, 50 bits
C One last fold: bits 24 and up have weight 2^24 == 1, so add them onto
C the low 24 bits. The sum is returned as is, without further reduction.
pand %mm3, %mm6 C fold at 24 bits
psrlq $24, %mm3
paddq %mm6, %mm3
movd %mm3, %eax
C Debug check on the folded sum. ASSERT is defined outside this file;
C presumably it emits the quoted code only in assertion builds and then
C requires the z condition -- TODO confirm. Note the quoted code shifts mm3
C and overwrites ecx, after eax has already been taken from mm3.
ASSERT(z, C nothing left in high dword
`psrlq $32, %mm3
movd %mm3, %ecx
orl %ecx, %ecx')
C Leave MMX state before returning so the caller can use x87 floating point.
emms
ret
EPILOGUE()