dnl ARM64 Neon mpn_hamdist -- mpn bit hamming distance.
dnl Copyright 2013, 2014 Free Software Foundation, Inc.
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or modify
dnl it under the terms of either:
dnl
dnl * the GNU Lesser General Public License as published by the Free
dnl Software Foundation; either version 3 of the License, or (at your
dnl option) any later version.
dnl
dnl or
dnl
dnl * the GNU General Public License as published by the Free Software
dnl Foundation; either version 2 of the License, or (at your option) any
dnl later version.
dnl
dnl or both in parallel, as here.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful, but
dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl for more details.
dnl
dnl You should have received copies of the GNU General Public License and the
dnl GNU Lesser General Public License along with the GNU MP Library. If not,
dnl see https://www.gnu.org/licenses/.
include(`../config.m4')
C cycles/limb
C Cortex-A53 ?
C Cortex-A57 ?
C TODO
C * Consider greater unrolling.
C * Arrange to align the pointer, if that helps performance. Use the same
C read-and-mask trick we use on PCs, for simplicity and performance. (Sorry
C valgrind!)
C * Explore if explicit align directives, e.g., "[ptr:128]" help.
C * See rth's gmp-devel 2013-02/03 messages about final summation tricks.
changecom(@&*$)
C mpn_hamdist -- count the bit positions in which {ap,n} and {bp,n} differ.
C
C In:       ap (x0), bp (x1) = limb pointers, n (x2) = limb count
C Out:      x0 = number of differing bits
C Clobbers: x1, x2, x11, v0-v7, v16-v19, flags; for n > maxsize also x4-x10
C           (x30 is copied to x8 and restored).  No stack is used.
C
C Method: eor the operands 16 bytes at a time, take per-byte population
C counts with cnt, and accumulate byte pairs into 16-bit counters with uadalp.
C The counters are widened to 32 and then 64 bits only at the very end.

C INPUT PARAMETERS
define(`ap', x0)
define(`bp', x1)
define(`n', x2)

C We sum into 16 16-bit counters in v4,v5, and at the end add them into 8
C 16-bit counters.  Each uadalp adds at most 16 to a counter, so a final
C counter is safe only while at most 4095 16-byte vectors feed it.  The bulk
C of the operand is split evenly between v4 and v5, but the leading odd limb
C and the leading limb pair are both accumulated into v4, and the odd limb
C only reaches the low four counters.  For n = 0x1fff that makes
C 1 + 1 + 4094 = 4096 accumulations into each low counter, i.e. up to 65536,
C which wraps to zero when every bit differs.  The largest count that can be
C handled in one pass is therefore 0x1ffe limbs (at most 4095 accumulations).
C We use a chunksize close to that, but which is a multiple of 8 so that the
C huge count code can jump deep into the code (at L(chu)).
define(`maxsize', 0x1ffe)
define(`chunksize',0x1ff0)

ASM_START()
PROLOGUE(mpn_hamdist)
	mov	x11, #maxsize
	cmp	n, x11
	b.hi	L(gt8k)				C too many limbs for one pass

C Single pass entry.  Requires n <= maxsize.  Also called via bl from the
C L(gt8k) code for the final partial chunk.
L(lt8k):
	movi	v4.16b, #0			C clear summation register
	movi	v5.16b, #0			C clear summation register

C Peel off n mod 8 limbs, one bit of n at a time.
	tbz	n, #0, L(xx0)
	sub	n, n, #1
	ld1	{v0.1d}, [ap], #8		C load 1 limb
	ld1	{v16.1d}, [bp], #8		C load 1 limb
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h, v6.16b			C could also splat

L(xx0):	tbz	n, #1, L(x00)
	sub	n, n, #2
	ld1	{v0.2d}, [ap], #16		C load 2 limbs
	ld1	{v16.2d}, [bp], #16		C load 2 limbs
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h, v6.16b

L(x00):	tbz	n, #2, L(000)
	subs	n, n, #4
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	b.ls	L(sum)				C taken when those were the last limbs

C n is now a nonzero multiple of 8 and 4 limbs are pending in v0,v1/v16,v17.
C Load 4 more and enter the loop in the middle.  The extra sub biases n so
C that the subs at L(mid) borrows once the last 4 limbs have been loaded.
L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	sub	n, n, #4
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	b	L(mid)

L(000):	subs	n, n, #8
	b.lo	L(e0)				C nothing left; v5 is still zero here

C Chunk entry.  Expects n = (limbs to process) - 8, a multiple of 8, and
C v4,v5 holding the running 16-bit counters.
L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	cnt	v6.16b, v2.16b
	cnt	v7.16b, v3.16b
	subs	n, n, #8
	b.lo	L(end)

C Main loop, 8 limbs per iteration.  The loads, the eor/cnt and the uadalp of
C consecutive 4-limb groups are interleaved: v6,v7 always hold byte counts
C that have not yet been accumulated into v4,v5.
L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	uadalp	v4.8h, v6.16b
	cnt	v6.16b, v0.16b
	uadalp	v5.8h, v7.16b
	cnt	v7.16b, v1.16b
L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	subs	n, n, #8
	uadalp	v4.8h, v6.16b
	cnt	v6.16b, v2.16b
	uadalp	v5.8h, v7.16b
	cnt	v7.16b, v3.16b
	b.hs	L(top)

C Drain: accumulate the pending byte counts, then handle the last 4 limbs
C that were loaded into v0,v1/v16,v17 but not yet combined.
L(end):	uadalp	v4.8h, v6.16b
	uadalp	v5.8h, v7.16b
L(sum):	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	uadalp	v4.8h, v6.16b
	uadalp	v5.8h, v7.16b
	add	v4.8h, v4.8h, v5.8h		C cannot wrap since n <= maxsize
						C we have 8 16-bit counts
L(e0):	uaddlp	v4.4s, v4.8h			C we have 4 32-bit counts
	uaddlp	v4.2d, v4.4s			C we have 2 64-bit counts
	mov	x0, v4.d[0]
	mov	x1, v4.d[1]			C x1 (bp) is dead here, reuse as scratch
	add	x0, x0, x1
	ret

C Code for count > maxsize.  Splits operand and calls above code.
C The code above is used as a local subroutine through bl, so the return
C address of our own caller is parked in x8 meanwhile.  Each call returns a
C chunk count in x0 and leaves x1 clobbered, hence the ap2/bp2 copies.
define(`ap2', x5)			C caller-saves reg not used above
define(`bp2', x6)			C caller-saves reg not used above
L(gt8k):
	mov	x8, x30				C save return address, bl overwrites x30
	mov	x7, n				C full count (caller-saves reg not used above)
	mov	x4, #0				C total sum (caller-saves reg not used above)
	mov	x9, #chunksize*8		C caller-saves reg not used above
	mov	x10, #chunksize			C caller-saves reg not used above

1:	add	ap2, ap, x9			C point at subsequent block
	add	bp2, bp, x9			C point at subsequent block
	mov	n, #chunksize-8			C count for this invocation, adjusted for entry pt
	movi	v4.16b, #0			C clear chunk summation register
	movi	v5.16b, #0			C clear chunk summation register
	bl	L(chu)				C jump deep inside code
	add	x4, x4, x0
	mov	ap, ap2				C put chunk pointer in place for calls
	mov	bp, bp2				C put chunk pointer in place for calls
	sub	x7, x7, x10
	cmp	x7, x11				C x11 still holds maxsize
	b.hi	1b

	mov	n, x7				C count for final invocation
	bl	L(lt8k)
	add	x0, x4, x0
	mov	x30, x8				C restore return address
	ret
EPILOGUE()