dnl ARM64 mpn_lshift.
dnl Copyright 2013, 2014 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of the GNU Lesser General Public License as published
dnl by the Free Software Foundation; either version 3 of the License, or (at
dnl your option) any later version.
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
dnl License for more details.
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C Cortex-A53 ?
C Cortex-A57 ?
changecom(@&*$)
define(`rp_arg', `x0')
define(`up', `x1')
define(`n', `x2')
define(`cnt', `x3')
define(`rp', `x16')
define(`tnc',`x8')
ASM_START()
PROLOGUE(mpn_lshift)
add rp, rp_arg, n, lsl #3
add up, up, n, lsl #3
sub tnc, xzr, cnt
tbz n, #0, L(bx0)
L(bx1): ldr x4, [up,#-8]
tbnz n, #1, L(b11)
L(b01): lsr x0, x4, tnc
lsl x18, x4, cnt
sub n, n, #1
cbnz n, L(gt1)
str x18, [rp,#-8]
ret
L(gt1): ldp x4, x5, [up,#-24]
sub up, up, #8
add rp, rp, #16
b L(lo2)
L(b11): lsr x0, x4, tnc
lsl x9, x4, cnt
ldp x6, x7, [up,#-24]
add n, n, #1
add up, up, #8
add rp, rp, #32
b L(lo0)
L(bx0): ldp x4, x5, [up,#-16]
tbz n, #1, L(b00)
L(b10): lsr x0, x5, tnc
lsl x13, x5, cnt
lsr x10, x4, tnc
lsl x18, x4, cnt
sub n, n, #2
cbnz n, L(gt2)
orr x10, x10, x13
stp x18, x10, [rp,#-16]
ret
L(gt2): ldp x4, x5, [up,#-32]
orr x10, x10, x13
str x10, [rp,#-8]
sub up, up, #16
add rp, rp, #8
b L(lo2)
L(b00): lsr x0, x5, tnc
lsl x13, x5, cnt
lsr x10, x4, tnc
lsl x9, x4, cnt
ldp x6, x7, [up,#-32]
orr x10, x10, x13
str x10, [rp,#-8]
add rp, rp, #24
b L(lo0)
ALIGN(16)
L(top): ldp x4, x5, [up,#-48]
sub rp, rp, #32 C integrate with stp?
sub up, up, #32 C integrate with ldp?
orr x11, x11, x9
orr x10, x10, x13
stp x10, x11, [rp,#-16]
L(lo2): lsr x11, x5, tnc
lsl x13, x5, cnt
lsr x10, x4, tnc
lsl x9, x4, cnt
ldp x6, x7, [up,#-32]
orr x11, x11, x18
orr x10, x10, x13
stp x10, x11, [rp,#-32]
L(lo0): sub n, n, #4
lsr x11, x7, tnc
lsl x13, x7, cnt
lsr x10, x6, tnc
lsl x18, x6, cnt
cbnz n, L(top)
L(end): orr x11, x11, x9
orr x10, x10, x13
stp x10, x11, [rp,#-48]
str x18, [rp,#-56]
ret
EPILOGUE()