dnl IA-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
dnl Copyright 2010, 2013 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl * the GNU Lesser General Public License as published by the Free
dnl Software Foundation; either version 3 of the License, or (at your
dnl option) any later version.
dnl
dnl or
dnl
dnl * the GNU General Public License as published by the Free Software
dnl Foundation; either version 2 of the License, or (at your option) any
dnl later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
dnl see https://www.gnu.org/licenses/.
include(`../config.m4')
C norm frac
C itanium 1
C itanium 2 29 29
C TODO
C * Inline and interleave limb inversion code with loop setup code.
C * We should use explicit bundling in much of the code, since it typically
C cuts some cycles with the GNU assembler.
ASM_START()
C HP's assembler requires these declarations for importing mpn_invert_limb
.global mpn_invert_limb
.type mpn_invert_limb,@function
C INPUT PARAMETERS
C qp = r32
C fn = r33
C np = r34
C nn = r35
C dp = r36
define(`f0x1', `f15')
ASM_START()
PROLOGUE(mpn_divrem_2)
.prologue
ifdef(`HAVE_ABI_32',
` addp4 r32 = 0, r32 C M I
addp4 r34 = 0, r34 C M I
zxt4 r35 = r35 C I
addp4 r36 = 0, r36 C M I
nop.m 0
zxt4 r33 = r33 C I
;;
')
.save ar.pfs, r42
alloc r42 = ar.pfs, 5, 9, 1, 0
shladd r34 = r35, 3, r34
adds r14 = 8, r36
mov r43 = r1
;;
adds r15 = -8, r34
ld8 r39 = [r14]
.save ar.lc, r45
mov r45 = ar.lc
adds r14 = -16, r34
mov r40 = r0
adds r34 = -24, r34
;;
ld8 r38 = [r15]
.save rp, r41
mov r41 = b0
.body
ld8 r36 = [r36]
ld8 r37 = [r14]
;;
cmp.gtu p6, p7 = r39, r38
(p6) br.cond.dptk .L8
;;
cmp.leu p8, p9 = r36, r37
cmp.geu p6, p7 = r39, r38
;;
(p8) cmp4.ne.and.orcm p6, p7 = 0, r0
(p7) br.cond.dptk .L51
.L8:
add r14 = r33, r35 // un + fn
mov r46 = r39 // argument to mpn_invert_limb
;;
adds r35 = -3, r14
;;
cmp.gt p12, p0 = r0, r35
(p12) br.cond.dpnt L(end)
br.call.sptk.many b0 = mpn_invert_limb
;;
setf.sig f11 = r8 // di (non-final)
setf.sig f34 = r39 // d1
setf.sig f33 = r36 // d0
mov r1 = r43
;;
mov r17 = 1
setf.sig f9 = r38 // n2
xma.l f6 = f11, f34, f0 // t0 = LO(di * d1)
;;
setf.sig f10 = r37 // n1
setf.sig f15 = r17 // 1
xma.hu f8 = f11, f33, f0 // s0 = HI(di * d0)
;;
getf.sig r17 = f6
getf.sig r16 = f8
mov ar.lc = r35
;;
sub r18 = r0, r39 // -d1
add r14 = r17, r36
;;
setf.sig f14 = r18 // -d1
cmp.leu p8, p9 = r17, r14
add r16 = r14, r16
;;
(p9) adds r19 = 0, r0
(p8) adds r19 = -1, r0
cmp.gtu p6, p7 = r14, r16
;;
(p6) adds r19 = 1, r19
;;
ifelse(1,1,`
cmp.gt p7, p6 = r0, r19
;;
(p6) adds r8 = -1, r8 // di--
(p6) sub r14 = r16, r39 // t0 -= d1
(p6) cmp.ltu p6, p7 = r16, r39 // cy for: t0 - d1
;;
(p6) cmp.gt p9, p8 = 1, r19
(p7) cmp.gt p9, p8 = 0, r19
(p6) adds r19 = -1, r19 // t1 -= cy
mov r16 = r14
;;
(p8) adds r8 = -1, r8 // di--
(p8) sub r14 = r16, r39 // t0 -= d1
(p8) cmp.ltu p8, p9 = r16, r39 // cy for: t0 - d1
;;
(p8) cmp.gt p7, p6 = 1, r19
(p9) cmp.gt p7, p6 = 0, r19
(p8) adds r19 = -1, r19 // t1 -= cy
mov r16 = r14
;;
(p6) adds r8 = -1, r8 // di--
(p6) sub r14 = r16, r39 // t0 -= d1
(p6) cmp.ltu p6, p7 = r16, r39 // cy for: t0 - d1
;;
(p6) cmp.gt p9, p8 = 1, r19
(p7) cmp.gt p9, p8 = 0, r19
(p6) adds r19 = -1, r19 // t1 -= cy
mov r16 = r14
;;
(p8) adds r8 = -1, r8 // di--
(p8) sub r14 = r16, r39 // t0 -= d1
(p8) cmp.ltu p8, p9 = r16, r39 // cy for: t0 - d1
;;
(p8) adds r19 = -1, r19 // t1 -= cy
mov r16 = r14
',`
cmp.gt p8, p9 = r0, r19
(p8) br.cond.dpnt .L46
.L52:
cmp.leu p6, p7 = r39, r16
sub r14 = r16, r39
adds r8 = -1, r8
;;
(p7) adds r19 = -1, r19
mov r16 = r14
;;
(p7) cmp.gt p8, p9 = r0, r19
(p9) br.cond.dptk .L52
.L46:
')
setf.sig f32 = r8 // di
shladd r32 = r35, 3, r32
;;
ALIGN(16)
L(top): nop 0
nop 0
cmp.gt p8, p9 = r33, r35
;;
(p8) mov r37 = r0
(p9) ld8 r37 = [r34], -8
xma.hu f8 = f9, f32, f10 // 0,29
xma.l f12 = f9, f32, f10 // 0
;;
getf.sig r20 = f12 // q0 4
xma.l f13 = f15, f8, f9 // q += n2 4
sub r8 = -1, r36 // bitnot d0
;;
getf.sig r18 = f13 // 8
xma.l f7 = f14, f13, f10 // 8
xma.l f6 = f33, f13, f33 // t0 = LO(d0*q+d0) 8
xma.hu f9 = f33, f13, f33 // t1 = HI(d0*q+d0) 9
;;
getf.sig r38 = f7 // n1 12
getf.sig r16 = f6 // 13
getf.sig r19 = f9 // 14
;;
sub r38 = r38, r39 // n1 -= d1 17
;;
cmp.ne p9, p0 = r0, r0 // clear p9
cmp.leu p10, p11 = r16, r37 // cy for: n0 - t0 18
;;
sub r37 = r37, r16 // n0 -= t0 19
(p11) sub r38 = r38, r19, 1 // n1 -= t1 - cy 19
(p10) sub r38 = r38, r19 // n1 -= t1 19
;;
cmp.gtu p6, p7 = r20, r38 // n1 >= q0 20
;;
(p7) cmp.ltu p9, p0 = r8, r37 // 21
(p6) add r18 = 1, r18 //
(p7) add r37 = r37, r36 // 21
(p7) add r38 = r38, r39 // 21
;;
setf.sig f10 = r37 // n1 22
(p9) add r38 = 1, r38 // 22
;;
setf.sig f9 = r38 // n2 23
cmp.gtu p6, p7 = r39, r38 // 23
(p7) br.cond.spnt L(fix)
L(bck): st8 [r32] = r18, -8
adds r35 = -1, r35
br.cloop.sptk.few L(top)
;;
L(end): add r14 = 8, r34
add r15 = 16, r34
mov b0 = r41
;;
st8 [r14] = r37
st8 [r15] = r38
mov ar.pfs = r42
mov r8 = r40
mov ar.lc = r45
br.ret.sptk.many b0
;;
.L51:
.pred.rel "mutex", p8, p9
sub r37 = r37, r36
(p9) sub r38 = r38, r39, 1
(p8) sub r38 = r38, r39
adds r40 = 1, r0
br .L8
;;
L(fix): cmp.geu p6, p7 = r39, r38
cmp.leu p8, p9 = r36, r37
;;
(p8) cmp4.ne.and.orcm p6, p7 = 0, r0
(p6) br.cond.dptk L(bck)
sub r37 = r37, r36
(p9) sub r38 = r38, r39, 1
(p8) sub r38 = r38, r39
adds r18 = 1, r18
;;
setf.sig f9 = r38 // n2
setf.sig f10 = r37 // n1
br L(bck)
EPILOGUE()
ASM_END()