dnl IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and
dnl store the result in a second limb vector.
dnl Contributed to the GNU project by Torbjorn Granlund.
dnl Copyright 2000-2004, 2006, 2007 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl * the GNU Lesser General Public License as published by the Free
dnl Software Foundation; either version 3 of the License, or (at your
dnl option) any later version.
dnl
dnl or
dnl
dnl * the GNU General Public License as published by the Free Software
dnl Foundation; either version 2 of the License, or (at your option) any
dnl later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
dnl see https://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C Itanium: 4.0
C Itanium 2: 2.0
C TODO
C * Further optimize feed-in and wind-down code, both for speed and code size.
C * Handle low limb input and results specially, using a common stf8 in the
C epilogue.
C * Use 1 c/l carry propagation scheme in wind-down code.
C * Use extra pointer register for `up' to speed up feed-in loads.
C * Work out final differences with addmul_1.asm.
C INPUT PARAMETERS
C Symbolic names for the registers that carry the arguments.  The names are
C used instead of raw register numbers in all code below.
C   rp  destination limb pointer (results are stored through it)
C   up  source limb pointer (limbs are loaded through it)
C   n   limb count
C   vl  the single multiplier limb
define(`rp', `r32')
define(`up', `r33')
define(`n', `r34')
define(`vl', `r35')
define(`cy', `r36') C for mpn_mul_1c
ASM_START()
C mpn_mul_1 (rp, up, n, vl)
C
C Multiply the n-limb vector at up by the single limb vl and store the n low
C result limbs at rp.  The most significant limb of the product is in r8 when
C the function returns.
C
C Structure:
C  * Common set-up and a dispatch on n mod 4.
C  * Feed-in code (.Lb01, .Lb10, .Lb11, .Lb00) that starts the software
C    pipeline and handles the shortest operands completely.
C  * A main loop (.Loop) that processes four limbs per iteration and is
C    entered at .LL00, .LL11, .LL10 or .LL01.
C  * Wind-down code (.Lcj9 ... .Lcj2) that drains the pipeline.
C
C Register usage, as visible in the code below:
C   f6       vl
C   f7       first limb loaded from up
C   f9       addend of the first product: zero here, cy when entered through
C            mpn_mul_1c
C   f32-f35  further limbs loaded from up
C   f36-f39  low product halves (xma.l)
C   f40-f43  high product halves (xma.hu)
C   r16-r19  low halves moved to general registers
C   r20-r23  high halves moved to general registers (one of them is zeroed by
C            the feed-in code to serve as the term added to the first limb)
C   r24      low half + previous high half + carry; this is the stored limb
C   p6-p9    carry state passed between consecutive additions
C   r2       saved ar.lc, restored before every return
C
C NOTE(review): the set-up computes (n-1)/4 with an unsigned shift, so the
C code presumably requires n >= 1 -- confirm against the callers.
PROLOGUE(mpn_mul_1)
.prologue
.save ar.lc, r2
.body
ifdef(`HAVE_ABI_32',
` addp4 rp = 0, rp C M I
addp4 up = 0, up C M I
zxt4 n = n C I
;;
')
C r15 = n - 1, f9 = 0 (no carry-in limb for mpn_mul_1), r2 = saved ar.lc.
{.mfi
adds r15 = -1, n C M I
mov f9 = f0 C F
mov.i r2 = ar.lc C I0
}
C f7 = first source limb, r14 = n mod 4.
{.mmi
ldf8 f7 = [up], 8 C M
nop.m 0 C M
and r14 = 3, n C M I
;;
}
C mpn_mul_1c branches here after doing the equivalent set-up with f9 = cy.
.Lcommon:
C f6 = vl, r31 = (n-1)/4 (the loop count), p10 = (n mod 4 == 0).
{.mii
setf.sig f6 = vl C M2 M3
shr.u r31 = r15, 2 C I0
cmp.eq p10, p0 = 0, r14 C M I
}
C p11 = (n mod 4 == 2), p12 = (n mod 4 == 3).
{.mii
cmp.eq p11, p0 = 2, r14 C M I
cmp.eq p12, p0 = 3, r14 C M I
nop.i 0 C I
;;
}
C Comparing r0 with itself for inequality clears p6 and p8 and sets p7 and p9,
C that is, no carry is pending when the first addition is done.
{.mii
cmp.ne p6, p7 = r0, r0 C M I
mov.i ar.lc = r31 C I0
cmp.ne p8, p9 = r0, r0 C M I
}
C Dispatch on n mod 4.  The remaining case, n mod 4 == 1, falls through.
{.bbb
(p10) br.dptk .Lb00 C B
(p11) br.dptk .Lb10 C B
(p12) br.dptk .Lb11 C B
;;
}
C ---- n mod 4 == 1 ----
C r20 = 0 is the term added to the first low half when the loop is entered at
C .LL01.  The br.cloop is taken when the loop count is nonzero (n >= 5).
.Lb01: mov r20 = 0
br.cloop.dptk .grt1 C B
C n = 1: store the low half of up[0]*vl + f9 and return the high half in r8.
xma.l f39 = f7, f6, f9 C F
xma.hu f43 = f7, f6, f9 C F
;;
getf.sig r8 = f43 C M2
stf8 [rp] = f39 C M2 M3
mov.i ar.lc = r2 C I0
br.ret.sptk.many b0 C B
C n >= 5: load four more limbs.  The second br.cloop separates n = 5 (falls
C through) from n >= 9 (branches to .grt5).
.grt1:
ldf8 f32 = [up], 8
;;
ldf8 f33 = [up], 8
;;
ldf8 f34 = [up], 8
xma.l f39 = f7, f6, f9
xma.hu f43 = f7, f6, f9
;;
ldf8 f35 = [up], 8
br.cloop.dptk .grt5
C n = 5: the first limb is stored directly from f39; the remaining four are
C finished by the wind-down code starting at .Lcj5.
xma.l f36 = f32, f6, f0
xma.hu f40 = f32, f6, f0
;;
stf8 [rp] = f39, 8
xma.l f37 = f33, f6, f0
xma.hu f41 = f33, f6, f0
;;
getf.sig r21 = f43
getf.sig r18 = f36
xma.l f38 = f34, f6, f0
xma.hu f42 = f34, f6, f0
;;
getf.sig r22 = f40
getf.sig r19 = f37
xma.l f39 = f35, f6, f0
xma.hu f43 = f35, f6, f0
;;
getf.sig r23 = f41
getf.sig r16 = f38
br .Lcj5
C n >= 9: fill the pipeline and join the main loop at .LL01.
.grt5:
xma.l f36 = f32, f6, f0
xma.hu f40 = f32, f6, f0
;;
getf.sig r17 = f39
ldf8 f32 = [up], 8
xma.l f37 = f33, f6, f0
xma.hu f41 = f33, f6, f0
;;
getf.sig r21 = f43
ldf8 f33 = [up], 8
xma.l f38 = f34, f6, f0
;;
getf.sig r18 = f36
xma.hu f42 = f34, f6, f0
;;
getf.sig r22 = f40
ldf8 f34 = [up], 8
xma.l f39 = f35, f6, f0
;;
getf.sig r19 = f37
xma.hu f43 = f35, f6, f0
br .LL01
C ---- n mod 4 == 2 ----
C r23 = 0 is the term added to the first low half when the loop is entered at
C .LL10.  The br.cloop is taken when the loop count is nonzero (n >= 6).
.Lb10: ldf8 f35 = [up], 8
mov r23 = 0
br.cloop.dptk .grt2
C n = 2: the high half of the first product (f42) is fed in as the addend of
C the second xma pair, so no integer carry handling is needed.  The high half
C of the second product is returned in r8.
xma.l f38 = f7, f6, f9
xma.hu f42 = f7, f6, f9
;;
stf8 [rp] = f38, 8
xma.l f39 = f35, f6, f42
xma.hu f43 = f35, f6, f42
;;
getf.sig r8 = f43
stf8 [rp] = f39
mov.i ar.lc = r2
br.ret.sptk.many b0
C n >= 6: load more limbs.  The second br.cloop separates n = 6 (falls
C through) from n >= 10 (branches to .grt6).
.grt2:
ldf8 f32 = [up], 8
;;
ldf8 f33 = [up], 8
xma.l f38 = f7, f6, f9
xma.hu f42 = f7, f6, f9
;;
ldf8 f34 = [up], 8
xma.l f39 = f35, f6, f0
xma.hu f43 = f35, f6, f0
;;
ldf8 f35 = [up], 8
br.cloop.dptk .grt6
C n = 6: the first limb is stored directly from f38; continue at .Lcj6.
stf8 [rp] = f38, 8
xma.l f36 = f32, f6, f0
xma.hu f40 = f32, f6, f0
;;
getf.sig r20 = f42
getf.sig r17 = f39
xma.l f37 = f33, f6, f0
xma.hu f41 = f33, f6, f0
;;
getf.sig r21 = f43
getf.sig r18 = f36
xma.l f38 = f34, f6, f0
xma.hu f42 = f34, f6, f0
;;
getf.sig r22 = f40
getf.sig r19 = f37
xma.l f39 = f35, f6, f0
xma.hu f43 = f35, f6, f0
br .Lcj6
C n >= 10: fill the pipeline and join the main loop at .LL10.
.grt6:
getf.sig r16 = f38
xma.l f36 = f32, f6, f0
xma.hu f40 = f32, f6, f0
;;
getf.sig r20 = f42
ldf8 f32 = [up], 8
xma.l f37 = f33, f6, f0
;;
getf.sig r17 = f39
xma.hu f41 = f33, f6, f0
;;
getf.sig r21 = f43
ldf8 f33 = [up], 8
xma.l f38 = f34, f6, f0
;;
getf.sig r18 = f36
xma.hu f42 = f34, f6, f0
br .LL10
C ---- n mod 4 == 3 ----
C r22 = 0 is the term added to the first low half when the loop is entered at
C .LL11.  The br.cloop is taken when the loop count is nonzero (n >= 7).
.Lb11: ldf8 f34 = [up], 8
mov r22 = 0
;;
ldf8 f35 = [up], 8
br.cloop.dptk .grt3
;;
C n = 3: compute all three products, store the first limb directly, and let
C the wind-down code at .Lcj3 produce the other two.  r8 already holds the
C high half of the last product.
xma.l f37 = f7, f6, f9
xma.hu f41 = f7, f6, f9
xma.l f38 = f34, f6, f0
xma.hu f42 = f34, f6, f0
xma.l f39 = f35, f6, f0
xma.hu f43 = f35, f6, f0
;;
getf.sig r23 = f41
stf8 [rp] = f37, 8
getf.sig r16 = f38
getf.sig r20 = f42
getf.sig r17 = f39
getf.sig r8 = f43
br .Lcj3
C n >= 7: load more limbs.  The second br.cloop separates n = 7 (falls
C through) from n >= 11 (branches to .grt7).
.grt3:
ldf8 f32 = [up], 8
xma.l f37 = f7, f6, f9
xma.hu f41 = f7, f6, f9
;;
ldf8 f33 = [up], 8
xma.l f38 = f34, f6, f0
xma.hu f42 = f34, f6, f0
;;
getf.sig r19 = f37
ldf8 f34 = [up], 8
xma.l f39 = f35, f6, f0
xma.hu f43 = f35, f6, f0
;;
getf.sig r23 = f41
ldf8 f35 = [up], 8
br.cloop.dptk .grt7
C n = 7: the first limb is stored from r19; continue at .Lcj7.
getf.sig r16 = f38
xma.l f36 = f32, f6, f0
getf.sig r20 = f42
xma.hu f40 = f32, f6, f0
;;
getf.sig r17 = f39
xma.l f37 = f33, f6, f0
getf.sig r21 = f43
xma.hu f41 = f33, f6, f0
;;
getf.sig r18 = f36
st8 [rp] = r19, 8
xma.l f38 = f34, f6, f0
xma.hu f42 = f34, f6, f0
br .Lcj7
C n >= 11: fill the pipeline and join the main loop at .LL11.
.grt7:
getf.sig r16 = f38
xma.l f36 = f32, f6, f0
xma.hu f40 = f32, f6, f0
;;
getf.sig r20 = f42
ldf8 f32 = [up], 8
xma.l f37 = f33, f6, f0
;;
getf.sig r17 = f39
xma.hu f41 = f33, f6, f0
br .LL11
C ---- n mod 4 == 0 ----
C r21 = 0 is the term added to the first low half when the loop is entered at
C .LL00.  The br.cloop is taken when the loop count is nonzero (n >= 8).
.Lb00: ldf8 f33 = [up], 8
mov r21 = 0
;;
ldf8 f34 = [up], 8
;;
ldf8 f35 = [up], 8
xma.l f36 = f7, f6, f9
xma.hu f40 = f7, f6, f9
br.cloop.dptk .grt4
C n = 4: the first limb is stored directly from f36; continue at .Lcj4.
xma.l f37 = f33, f6, f0
xma.hu f41 = f33, f6, f0
xma.l f38 = f34, f6, f0
xma.hu f42 = f34, f6, f0
;;
getf.sig r22 = f40
stf8 [rp] = f36, 8
xma.l f39 = f35, f6, f0
getf.sig r19 = f37
xma.hu f43 = f35, f6, f0
;;
getf.sig r23 = f41
getf.sig r16 = f38
getf.sig r20 = f42
getf.sig r17 = f39
br .Lcj4
C n >= 8: load more limbs.  The second br.cloop separates n = 8 (falls
C through) from n >= 12 (branches to .grt8).
.grt4:
ldf8 f32 = [up], 8
xma.l f37 = f33, f6, f0
xma.hu f41 = f33, f6, f0
;;
getf.sig r18 = f36
ldf8 f33 = [up], 8
xma.l f38 = f34, f6, f0
xma.hu f42 = f34, f6, f0
;;
getf.sig r22 = f40
ldf8 f34 = [up], 8
xma.l f39 = f35, f6, f0
;;
getf.sig r19 = f37
getf.sig r23 = f41
xma.hu f43 = f35, f6, f0
ldf8 f35 = [up], 8
br.cloop.dptk .grt8
C n = 8: the first limb is stored from r18; continue at .Lcj8.
getf.sig r16 = f38
xma.l f36 = f32, f6, f0
getf.sig r20 = f42
xma.hu f40 = f32, f6, f0
;;
getf.sig r17 = f39
st8 [rp] = r18, 8
xma.l f37 = f33, f6, f0
xma.hu f41 = f33, f6, f0
br .Lcj8
C n >= 12: fill the pipeline and join the main loop at .LL00.
.grt8:
getf.sig r16 = f38
xma.l f36 = f32, f6, f0
xma.hu f40 = f32, f6, f0
br .LL00
C *** MAIN LOOP START ***
C Four limbs per iteration.  Every step forms
C   r24 = low half of product i + high half of product i-1 + carry
C and stores r24 through rp.  The carry is kept in a predicate pair that
C alternates between p6/p7 and p8/p9:
C   carry in  (p6 or p8): add ..., 1  and carry out iff r24 <= low half (leu)
C   no carry  (p7 or p9): plain add   and carry out iff r24 <  low half (ltu)
C Loads, multiplies and moves for later limbs are interleaved with this.
ALIGN(32)
.Loop:
.pred.rel "mutex",p6,p7
getf.sig r16 = f38
xma.l f36 = f32, f6, f0
(p6) cmp.leu p8, p9 = r24, r17
st8 [rp] = r24, 8
xma.hu f40 = f32, f6, f0
(p7) cmp.ltu p8, p9 = r24, r17
;;
C Loop entry point used by the n mod 4 == 0 feed-in code.
.LL00:
.pred.rel "mutex",p8,p9
getf.sig r20 = f42
(p8) add r24 = r18, r21, 1
nop.b 0
ldf8 f32 = [up], 8
(p9) add r24 = r18, r21
nop.b 0
;;
.pred.rel "mutex",p8,p9
getf.sig r17 = f39
xma.l f37 = f33, f6, f0
(p8) cmp.leu p6, p7 = r24, r18
st8 [rp] = r24, 8
xma.hu f41 = f33, f6, f0
(p9) cmp.ltu p6, p7 = r24, r18
;;
C Loop entry point used by the n mod 4 == 3 feed-in code.
.LL11:
.pred.rel "mutex",p6,p7
getf.sig r21 = f43
(p6) add r24 = r19, r22, 1
nop.b 0
ldf8 f33 = [up], 8
(p7) add r24 = r19, r22
nop.b 0
;;
.pred.rel "mutex",p6,p7
getf.sig r18 = f36
xma.l f38 = f34, f6, f0
(p6) cmp.leu p8, p9 = r24, r19
st8 [rp] = r24, 8
xma.hu f42 = f34, f6, f0
(p7) cmp.ltu p8, p9 = r24, r19
;;
C Loop entry point used by the n mod 4 == 2 feed-in code.
.LL10:
.pred.rel "mutex",p8,p9
getf.sig r22 = f40
(p8) add r24 = r16, r23, 1
nop.b 0
ldf8 f34 = [up], 8
(p9) add r24 = r16, r23
nop.b 0
;;
.pred.rel "mutex",p8,p9
getf.sig r19 = f37
xma.l f39 = f35, f6, f0
(p8) cmp.leu p6, p7 = r24, r16
st8 [rp] = r24, 8
xma.hu f43 = f35, f6, f0
(p9) cmp.ltu p6, p7 = r24, r16
;;
C Loop entry point used by the n mod 4 == 1 feed-in code.
.LL01:
.pred.rel "mutex",p6,p7
getf.sig r23 = f41
(p6) add r24 = r17, r20, 1
nop.b 0
ldf8 f35 = [up], 8
(p7) add r24 = r17, r20
br.cloop.dptk .Loop
C *** MAIN LOOP END ***
;;
C Wind-down.  The same add/compare/store steps as in the loop, without any
C further loads, until all pending products have been stored.  The labels
C .Lcj3 through .Lcj8 are also branched to directly by the feed-in code for
C n = 3 ... 8.
.Lcj9:
.pred.rel "mutex",p6,p7
getf.sig r16 = f38
xma.l f36 = f32, f6, f0
(p6) cmp.leu p8, p9 = r24, r17
st8 [rp] = r24, 8
xma.hu f40 = f32, f6, f0
(p7) cmp.ltu p8, p9 = r24, r17
;;
.pred.rel "mutex",p8,p9
getf.sig r20 = f42
(p8) add r24 = r18, r21, 1
(p9) add r24 = r18, r21
;;
.pred.rel "mutex",p8,p9
getf.sig r17 = f39
xma.l f37 = f33, f6, f0
(p8) cmp.leu p6, p7 = r24, r18
st8 [rp] = r24, 8
xma.hu f41 = f33, f6, f0
(p9) cmp.ltu p6, p7 = r24, r18
;;
.Lcj8:
.pred.rel "mutex",p6,p7
getf.sig r21 = f43
(p6) add r24 = r19, r22, 1
(p7) add r24 = r19, r22
;;
.pred.rel "mutex",p6,p7
getf.sig r18 = f36
xma.l f38 = f34, f6, f0
(p6) cmp.leu p8, p9 = r24, r19
st8 [rp] = r24, 8
xma.hu f42 = f34, f6, f0
(p7) cmp.ltu p8, p9 = r24, r19
;;
.Lcj7:
.pred.rel "mutex",p8,p9
getf.sig r22 = f40
(p8) add r24 = r16, r23, 1
(p9) add r24 = r16, r23
;;
.pred.rel "mutex",p8,p9
getf.sig r19 = f37
xma.l f39 = f35, f6, f0
(p8) cmp.leu p6, p7 = r24, r16
st8 [rp] = r24, 8
xma.hu f43 = f35, f6, f0
(p9) cmp.ltu p6, p7 = r24, r16
;;
C From here on no more multiplies are issued; only pending results are moved
C to general registers, added and stored.
.Lcj6:
.pred.rel "mutex",p6,p7
getf.sig r23 = f41
(p6) add r24 = r17, r20, 1
(p7) add r24 = r17, r20
;;
.pred.rel "mutex",p6,p7
(p6) cmp.leu p8, p9 = r24, r17
(p7) cmp.ltu p8, p9 = r24, r17
getf.sig r16 = f38
st8 [rp] = r24, 8
;;
.Lcj5:
.pred.rel "mutex",p8,p9
getf.sig r20 = f42
(p8) add r24 = r18, r21, 1
(p9) add r24 = r18, r21
;;
.pred.rel "mutex",p8,p9
(p8) cmp.leu p6, p7 = r24, r18
(p9) cmp.ltu p6, p7 = r24, r18
getf.sig r17 = f39
st8 [rp] = r24, 8
;;
C r8 receives the high half of the last product; the final carry is added to
C it just before the return below.
.Lcj4:
.pred.rel "mutex",p6,p7
getf.sig r8 = f43
(p6) add r24 = r19, r22, 1
(p7) add r24 = r19, r22
;;
.pred.rel "mutex",p6,p7
st8 [rp] = r24, 8
(p6) cmp.leu p8, p9 = r24, r19
(p7) cmp.ltu p8, p9 = r24, r19
;;
.Lcj3:
.pred.rel "mutex",p8,p9
(p8) add r24 = r16, r23, 1
(p9) add r24 = r16, r23
;;
.pred.rel "mutex",p8,p9
st8 [rp] = r24, 8
(p8) cmp.leu p6, p7 = r24, r16
(p9) cmp.ltu p6, p7 = r24, r16
;;
.Lcj2:
.pred.rel "mutex",p6,p7
(p6) add r24 = r17, r20, 1
(p7) add r24 = r17, r20
;;
.pred.rel "mutex",p6,p7
st8 [rp] = r24, 8
(p6) cmp.leu p8, p9 = r24, r17
(p7) cmp.ltu p8, p9 = r24, r17
;;
C Add the carry out of the last stored limb to the high limb in r8, restore
C ar.lc from r2 and return.
(p8) add r8 = 1, r8
mov.i ar.lc = r2
br.ret.sptk.many b0
EPILOGUE()
C mpn_mul_1c (rp, up, n, vl, cy)
C
C Same operation as mpn_mul_1 with an extra limb cy that is used as the addend
C of the first product (f9 = cy instead of zero).  This entry point only
C performs the set-up that precedes .Lcommon in mpn_mul_1 and then branches to
C .Lcommon, so all feed-in, loop and wind-down code is shared.
PROLOGUE(mpn_mul_1c)
.prologue
.save ar.lc, r2
.body
ifdef(`HAVE_ABI_32',
` addp4 rp = 0, rp C M I
addp4 up = 0, up C M I
zxt4 n = n C I
;;
')
C r15 = n - 1, f9 = cy, r2 = saved ar.lc.
{.mmi
adds r15 = -1, n C M I
setf.sig f9 = cy C M2 M3
mov.i r2 = ar.lc C I0
}
C f7 = first source limb, r14 = n mod 4, then continue in the shared code.
{.mmb
ldf8 f7 = [up], 8 C M
and r14 = 3, n C M I
br.sptk .Lcommon
;;
}
EPILOGUE()
C Counterpart of the ASM_START() call near the top of this file; both macros
C presumably come from the included config.m4 -- confirm there.
ASM_END()