dnl AMD64 mpn_modexact_1_odd -- Hensel norm remainder.
dnl Copyright 2000-2006, 2011, 2012 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl * the GNU Lesser General Public License as published by the Free
dnl Software Foundation; either version 3 of the License, or (at your
dnl option) any later version.
dnl
dnl or
dnl
dnl * the GNU General Public License as published by the Free Software
dnl Foundation; either version 2 of the License, or (at your option) any
dnl later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
dnl see https://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C AMD K8,K9 10
C AMD K10 10
C Intel P4 33
C Intel core2 13
C Intel corei 14.5
C Intel atom 35
C VIA nano ?
C The dependent chain in the main loop is
C
C cycles
C sub %rdx, %rax 1
C imul %r9, %rax 4
C mul %r8 5
C ----
C total 10
C
C The mov load from src seems to need to be scheduled back before the jz to
C achieve this speed; out-of-order execution apparently cannot completely hide
C the latency otherwise.
C
C The l=src[i]-cbit step is rotated back too, since that allows us to avoid it
C for the first iteration (where there's no cbit).
C
C The code alignment used (ALIGN(32) at the function entry, ALIGN(16) at the
C top of the loop) also seems necessary. Without that the non-PIC case has adc
C crossing the 0x60 offset, apparently making it run at 11 cycles instead of 10.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)
ASM_START()
TEXT
ALIGN(32)
C -----------------------------------------------------------------------------
C mpn_modexact_1_odd  (src, size, divisor)
C mpn_modexact_1c_odd (src, size, divisor, carry)
C
C Two entry points sharing one body. The 3-argument entry zeroes the carry
C register and then continues at L(ent) inside the 4-argument entry.
C
C In (at L(ent)):  rdi = src, rsi = size, rdx = divisor, rcx = carry
C Out:             rax = final climb + cbit
C Overwritten:     rcx, rdx, rsi, rdi, r8, r9, r10, r11 and the flags
C
C src[0] is loaded unconditionally, so size is assumed to be at least 1.
C The divisor is presumably required to be odd (function name, and the
C d*inv == 1 check further down) -- TODO confirm against the callers.
C -----------------------------------------------------------------------------
PROLOGUE(mpn_modexact_1_odd)
FUNC_ENTRY(3)
C No carry-in for this entry point.
mov $0, R32(%rcx)
C NOTE(review): IFDOS and FUNC_ENTRY are defined outside this file. Presumably
C on DOS64 this jump skips the second FUNC_ENTRY so that the argument setup is
C not performed twice, while other ABIs simply fall through -- confirm.
IFDOS(` jmp L(ent) ')
PROLOGUE(mpn_modexact_1c_odd)
FUNC_ENTRY(4)
L(ent):
C rdi src
C rsi size
C rdx divisor
C rcx carry
C Keep a full copy of d in r8. rdx is reused below, first as the table index
C and then as working storage for the inverse.
mov %rdx, %r8 C d
shr R32(%rdx) C d/2
LEA( binvert_limb_table, %r9)
C The table index is bits 1..7 of d (bit 0 was dropped by the shift above).
and $127, R32(%rdx)
C rcx is needed as scratch while the inverse is computed, so park the carry.
mov %rcx, %r10 C initial carry
movzbl (%r9,%rdx), R32(%rdx) C inv 8 bits
C Start the first limb load early and set up end-relative addressing:
C r11 = src + 8*size, and rsi (negated below) becomes a negative index that
C counts up towards zero. rdi is free for other use once r11 is formed.
mov (%rdi), %rax C src[0]
lea (%rdi,%rsi,8), %r11 C src end
mov %r8, %rdi C d, made available to imull
C Three refinement steps of the form inv = 2*inv - inv*inv*d follow. Per the
C comments on the sub instructions, the valid width goes 8 -> 16 -> 32 -> 64
C bits. The first two steps use 32-bit operations, the last one 64-bit.
lea (%rdx,%rdx), R32(%rcx) C 2*inv
imul R32(%rdx), R32(%rdx) C inv*inv
neg %rsi C -size
imul R32(%rdi), R32(%rdx) C inv*inv*d
sub R32(%rdx), R32(%rcx) C inv = 2*inv - inv*inv*d, 16 bits
lea (%rcx,%rcx), R32(%rdx) C 2*inv
imul R32(%rcx), R32(%rcx) C inv*inv
imul R32(%rdi), R32(%rcx) C inv*inv*d
sub R32(%rcx), R32(%rdx) C inv = 2*inv - inv*inv*d, 32 bits
C rcx is finished as scratch. From here on it holds the borrow bit.
xor R32(%rcx), R32(%rcx) C initial cbit
lea (%rdx,%rdx), %r9 C 2*inv
imul %rdx, %rdx C inv*inv
imul %r8, %rdx C inv*inv*d
sub %rdx, %r9 C inv = 2*inv - inv*inv*d, 64 bits
C The carry parked in r10 becomes the first climb subtracted from src[0].
mov %r10, %rdx C initial climb
C NOTE(review): ASSERT is defined outside this file. Presumably it only emits
C code in assertion-enabled builds -- confirm. r10 is free as a temporary here
C because its value was just copied to rdx and it is not read again.
ASSERT(e,` C d*inv == 1 mod 2^64
mov %r8, %r10
imul %r9, %r10
cmp $1, %r10')
C rsi is -(size-1) after the inc. Zero means there is only one limb, so the
C loop is skipped and the single limb is handled at L(one).
inc %rsi
jz L(one)
ALIGN(16)
L(top):
C rax l = src[i]-cbit
C rcx new cbit, 0 or 1
C rdx climb, high of last product
C rsi counter, limbs, negative
C rdi
C r8 divisor
C r9 inverse
C r11 src end ptr
C One pass per limb except the last. The last limb is handled at L(one).
sub %rdx, %rax C l = src[i]-cbit - climb
C Fold the borrow from the sub above into cbit.
adc $0, %rcx C more cbit
imul %r9, %rax C q = l * inverse
C mul leaves the double-width product in rdx:rax. Only the high half in rdx is
C used, since rax is overwritten by the next load.
mul %r8 C climb = high (q * d)
C r11 + 8*rsi addresses src[size + rsi], which is the next limb because rsi
C is negative here.
mov (%r11,%rsi,8), %rax C src[i+1]
sub %rcx, %rax C next l = src[i+1] - cbit
C setc writes only the low byte of rcx.
setc R8(%rcx) C new cbit
inc %rsi
jnz L(top)
L(one):
C Last limb: the same step as the loop body, but with no following limb to
C load. Also the direct target when size is 1.
sub %rdx, %rax C l = src[i]-cbit - climb
adc $0, %rcx C more cbit
imul %r9, %rax C q = l * inverse
mul %r8 C climb = high (q * d)
C The return value is the last climb plus the outstanding cbit.
lea (%rcx,%rdx), %rax C climb+cbit
FUNC_EXIT()
ret
EPILOGUE(mpn_modexact_1c_odd)
EPILOGUE(mpn_modexact_1_odd)