#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
# define STPTR stx
# define SIZE_T 8
#else
# define STPTR st
# define SIZE_T 4
#endif
#define LOCALS (STACK_BIAS+STACK_FRAME)
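! Poly1305 MAC for SPARCv9 in three flavours: a vanilla 32x32->64-bit
! integer path (poly1305_blocks), a VIS3 path built on 64x64->128-bit
! multiplications (poly1305_blocks_vis3), and a floating-point FMA path
! (poly1305_blocks_fma). poly1305_init probes OPENSSL_sparcv9cap_P and,
! when an accelerated path applies, publishes its blocks/emit entry
! points through the caller-provided function table, returning 1.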
.section ".text",#alloc,#execinstr
#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif
.globl poly1305_init
.align 32
poly1305_init:
save %sp,-STACK_FRAME-16,%sp
nop
SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
ld [%g1],%g1
and %g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1
cmp %g1,SPARCV9_FMADD
be .Lpoly1305_init_fma
nop
stx %g0,[%i0+0]
stx %g0,[%i0+8] ! zero hash value
brz,pn %i1,.Lno_key
stx %g0,[%i0+16]
and %i1,7,%i5 ! alignment factor
andn %i1,7,%i1
sll %i5,3,%i5 ! *8
neg %i5,%i4
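! Clamp the key half "r" as the Poly1305 spec requires: clear the top
! four bits of every 32-bit word and the low two bits of the upper
! three words, i.e. r &= 0x0ffffffc0ffffffc0ffffffc0fffffff.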
sethi %hi(0x0ffffffc),%o4
set 8,%o1
or %o4,%lo(0x0ffffffc),%o4
set 16,%o2
sllx %o4,32,%o5
or %o4,%o5,%o5 ! 0x0ffffffc0ffffffc
or %o5,3,%o4 ! 0x0ffffffc0fffffff
ldxa [%i1+%g0]0x88,%o0 ! load little-endian key
brz,pt %i5,.Lkey_aligned
ldxa [%i1+%o1]0x88,%o1
ldxa [%i1+%o2]0x88,%o2
srlx %o0,%i5,%o0
sllx %o1,%i4,%o7
srlx %o1,%i5,%o1
or %o7,%o0,%o0
sllx %o2,%i4,%o2
or %o2,%o1,%o1
.Lkey_aligned:
and %o4,%o0,%o0
and %o5,%o1,%o1
stx %o0,[%i0+32+0] ! store key
stx %o1,[%i0+32+8]
andcc %g1,SPARCV9_VIS3,%g0
be .Lno_key
nop
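! SPARC "call" deposits its own address in %o7, so "call .+8" plus the
! PC-relative adds below recover the absolute addresses of
! poly1305_blocks_vis3 and poly1305_emit without relocations; they are
! then stored into the function table supplied by the caller.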
1: call .+8
add %o7,poly1305_blocks_vis3-1b,%o7
add %o7,poly1305_emit-poly1305_blocks_vis3,%o5
STPTR %o7,[%i2]
STPTR %o5,[%i2+SIZE_T]
ret
restore %g0,1,%o0 ! return 1
.Lno_key:
ret
restore %g0,%g0,%o0 ! return 0
.type poly1305_init,#function
.size poly1305_init,.-poly1305_init
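! poly1305_blocks: generic SPARCv9 path. h and r are held as four
! 32-bit limbs (plus the 2-bit h4), and the 130x128-bit multiplication
! is schoolbook, built from 32x32->64-bit umul partial products.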
.globl poly1305_blocks
.align 32
poly1305_blocks:
save %sp,-STACK_FRAME,%sp
srln %i2,4,%i2 ! len/=16, number of complete blocks
brz,pn %i2,.Lno_data
nop
ld [%i0+32+0],%l1 ! load key
ld [%i0+32+4],%l0
ld [%i0+32+8],%l3
ld [%i0+32+12],%l2
ld [%i0+0],%o1 ! load hash value
ld [%i0+4],%o0
ld [%i0+8],%o3
ld [%i0+12],%o2
ld [%i0+16],%l7
and %i1,7,%i5 ! alignment factor
andn %i1,7,%i1
set 8,%g2
sll %i5,3,%i5 ! *8
set 16,%g3
neg %i5,%i4
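! Precompute s_i = r_i + (r_i>>2) = 5*r_i/4 for i=1..3: since
! 2^128 == 5/4 (mod 2^130-5) and clamping makes r1,r2,r3 multiples of
! 4, s_i substitutes for r_i in product terms that would spill past
! 2^128.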
srl %l1,2,%l4
srl %l2,2,%l5
add %l1,%l4,%l4
srl %l3,2,%l6
add %l2,%l5,%l5
add %l3,%l6,%l6
.Loop:
ldxa [%i1+%g0]0x88,%g1 ! load little-endian input
brz,pt %i5,.Linp_aligned
ldxa [%i1+%g2]0x88,%g2
ldxa [%i1+%g3]0x88,%g3
srlx %g1,%i5,%g1
sllx %g2,%i4,%o5
srlx %g2,%i5,%g2
or %o5,%g1,%g1
sllx %g3,%i4,%g3
or %g3,%g2,%g2
.Linp_aligned:
srlx %g1,32,%o4
addcc %g1,%o0,%o0 ! accumulate input
srlx %g2,32,%o5
addccc %o4,%o1,%o1
addccc %g2,%o2,%o2
addccc %o5,%o3,%o3
addc %i3,%l7,%l7
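! %i3 is the padbit argument: the 16-byte block, read little-endian,
! gets padbit*2^128 added on top as the Poly1305 padding bit.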
umul %l0,%o0,%g1
umul %l1,%o0,%g2
umul %l2,%o0,%g3
umul %l3,%o0,%g4
sub %i2,1,%i2
add %i1,16,%i1
umul %l6,%o1,%o4
umul %l0,%o1,%o5
umul %l1,%o1,%o7
add %o4,%g1,%g1
add %o5,%g2,%g2
umul %l2,%o1,%o4
add %o7,%g3,%g3
add %o4,%g4,%g4
umul %l5,%o2,%o5
umul %l6,%o2,%o7
umul %l0,%o2,%o4
add %o5,%g1,%g1
add %o7,%g2,%g2
umul %l1,%o2,%o5
add %o4,%g3,%g3
add %o5,%g4,%g4
umul %l4,%o3,%o7
umul %l5,%o3,%o4
umul %l6,%o3,%o5
add %o7,%g1,%g1
add %o4,%g2,%g2
umul %l0,%o3,%o7
add %o5,%g3,%g3
add %o7,%g4,%g4
umul %l4,%l7,%o4
umul %l5,%l7,%o5
umul %l6,%l7,%o7
umul %l0,%l7,%l7
add %o4,%g2,%g2
add %o5,%g3,%g3
srlx %g1,32,%o1
add %o7,%g4,%g4
srlx %g2,32,%o2
addcc %g2,%o1,%o1
srlx %g3,32,%o3
set 8,%g2
addccc %g3,%o2,%o2
srlx %g4,32,%o4
set 16,%g3
addccc %g4,%o3,%o3
addc %o4,%l7,%l7
srl %l7,2,%o4 ! final reduction step
andn %l7,3,%o5
and %l7,3,%l7
add %o5,%o4,%o4
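! %o4 = (h4&~3) + (h4>>2) = 5*(h4>>2): the bits of h at 2^130 and
! above, times 5 (2^130 == 5 mod 2^130-5), are folded back into the
! low limb by the carry chain below, while h4 keeps its low two bits.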
addcc %o4,%g1,%o0
addccc %g0,%o1,%o1
addccc %g0,%o2,%o2
addccc %g0,%o3,%o3
brnz,pt %i2,.Loop
addc %g0,%l7,%l7
st %o1,[%i0+0] ! store hash value
st %o0,[%i0+4]
st %o3,[%i0+8]
st %o2,[%i0+12]
st %l7,[%i0+16]
.Lno_data:
ret
restore
.type poly1305_blocks,#function
.size poly1305_blocks,.-poly1305_blocks
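! VIS3 code path: hash and key are handled as two 64-bit limbs plus a
! 2-bit top word, with 64x64->128-bit products via mulx/umulxhi. The
! VIS3-only instructions are hand-encoded as .word (decoded mnemonic in
! the comment) so the module assembles even with VIS3-unaware
! assemblers.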
.align 32
poly1305_blocks_vis3:
save %sp,-STACK_FRAME,%sp
srln %i2,4,%i2 ! len/=16
brz,pn %i2,.Lno_data
nop
ldx [%i0+32+0],%o3 ! load key
ldx [%i0+32+8],%o4
ldx [%i0+0],%o0 ! load hash value
ldx [%i0+8],%o1
ld [%i0+16],%o2
and %i1,7,%i5 ! alignment factor
andn %i1,7,%i1
set 8,%l1
sll %i5,3,%i5 ! *8
set 16,%l2
neg %i5,%i4
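! Precompute s1 = r1 + (r1>>2) = 5*r1/4 (the add sits in the branch
! delay slot), folding the 2^130 == 5 wrap-around as in the integer
! path.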
srlx %o4,2,%o5
b .Loop_vis3
add %o4,%o5,%o5
.Loop_vis3:
ldxa [%i1+%g0]0x88,%g1 ! load little-endian input
brz,pt %i5,.Linp_aligned_vis3
ldxa [%i1+%l1]0x88,%g2
ldxa [%i1+%l2]0x88,%g3
srlx %g1,%i5,%g1
sllx %g2,%i4,%o7
srlx %g2,%i5,%g2
or %o7,%g1,%g1
sllx %g3,%i4,%g3
or %g3,%g2,%g2
.Linp_aligned_vis3:
addcc %g1,%o0,%o0 ! accumulate input
sub %i2,1,%i2
.word 0x93b08269 !addxccc %g2,%o1,%o1
add %i1,16,%i1
mulx %o3,%o0,%g1 ! r0*h0
.word 0x95b6c22a !addxc %i3,%o2,%o2
.word 0x85b2c2c8 !umulxhi %o3,%o0,%g2
mulx %o5,%o1,%g4 ! s1*h1
.word 0x9fb342c9 !umulxhi %o5,%o1,%o7
addcc %g4,%g1,%g1
mulx %o4,%o0,%g4 ! r1*h0
.word 0x85b3c222 !addxc %o7,%g2,%g2
.word 0x87b302c8 !umulxhi %o4,%o0,%g3
addcc %g4,%g2,%g2
mulx %o3,%o1,%g4 ! r0*h1
.word 0x87b00223 !addxc %g0,%g3,%g3
.word 0x9fb2c2c9 !umulxhi %o3,%o1,%o7
addcc %g4,%g2,%g2
mulx %o5,%o2,%g4 ! s1*h2
.word 0x87b3c223 !addxc %o7,%g3,%g3
mulx %o3,%o2,%o7 ! r0*h2
addcc %g4,%g2,%g2
.word 0x87b3c223 !addxc %o7,%g3,%g3
srlx %g3,2,%g4 ! final reduction step
andn %g3,3,%o7
and %g3,3,%o2
add %o7,%g4,%g4
addcc %g4,%g1,%o0
.word 0x93b00262 !addxccc %g0,%g2,%o1
brnz,pt %i2,.Loop_vis3
.word 0x95b0022a !addxc %g0,%o2,%o2
stx %o0,[%i0+0] ! store hash value
stx %o1,[%i0+8]
st %o2,[%i0+16]
ret
restore
.type poly1305_blocks_vis3,#function
.size poly1305_blocks_vis3,.-poly1305_blocks_vis3
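! poly1305_emit: finish the reduction modulo 2^130-5, add the 128-bit
! nonce and store the 16-byte tag little-endian, one byte at a time so
! no alignment is assumed for the output pointer.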
.globl poly1305_emit
.align 32
poly1305_emit:
save %sp,-STACK_FRAME,%sp
ld [%i0+0],%o1 ! load hash value
ld [%i0+4],%o0
ld [%i0+8],%o3
ld [%i0+12],%o2
ld [%i0+16],%l7
addcc %o0,5,%l0 ! compare to modulus
addccc %o1,0,%l1
addccc %o2,0,%l2
addccc %o3,0,%l3
addc %l7,0,%l7
andcc %l7,4,%g0 ! did h+5 carry into 2^130?
movnz %icc,%l0,%o0
ld [%i2+0],%l0 ! load nonce
movnz %icc,%l1,%o1
ld [%i2+4],%l1
movnz %icc,%l2,%o2
ld [%i2+8],%l2
movnz %icc,%l3,%o3
ld [%i2+12],%l3
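! If h+5 carried into bit 2^130 (bit 2 of %l7), then h >= 2^130-5 and
! the conditional moves select the reduced value h+5-2^130; otherwise
! h is already fully reduced and kept as is.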
addcc %l0,%o0,%o0 ! accumulate nonce
addccc %l1,%o1,%o1
addccc %l2,%o2,%o2
addc %l3,%o3,%o3
srl %o0,8,%l0
stb %o0,[%i1+0] ! store little-endian result
srl %o0,16,%l1
stb %l0,[%i1+1]
srl %o0,24,%l2
stb %l1,[%i1+2]
stb %l2,[%i1+3]
srl %o1,8,%l0
stb %o1,[%i1+4]
srl %o1,16,%l1
stb %l0,[%i1+5]
srl %o1,24,%l2
stb %l1,[%i1+6]
stb %l2,[%i1+7]
srl %o2,8,%l0
stb %o2,[%i1+8]
srl %o2,16,%l1
stb %l0,[%i1+9]
srl %o2,24,%l2
stb %l1,[%i1+10]
stb %l2,[%i1+11]
srl %o3,8,%l0
stb %o3,[%i1+12]
srl %o3,16,%l1
stb %l0,[%i1+13]
srl %o3,24,%l2
stb %l1,[%i1+14]
stb %l2,[%i1+15]
ret
restore
.type poly1305_emit,#function
.size poly1305_emit,.-poly1305_emit
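! FMA code path, taken when the CPU reports SPARCV9_FMADD but not
! SPARCV9_VIS3 (see poly1305_init). Limbs live in double-precision
! registers biased by the powers of two in .Lconsts_fma, so the integer
! arithmetic rides the FP mantissas; %fsr is switched to
! round-toward-zero for the duration and restored before returning.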
.align 32
poly1305_init_fma:
save %sp,-STACK_FRAME-16,%sp
nop
.Lpoly1305_init_fma:
1: call .+8
add %o7,.Lconsts_fma-1b,%o7
ldd [%o7+8*0],%f16 ! load constants
ldd [%o7+8*1],%f18
ldd [%o7+8*2],%f20
ldd [%o7+8*3],%f22
ldd [%o7+8*5],%f26
std %f16,[%i0+8*0] ! initial hash value, biased 0
std %f18,[%i0+8*1]
std %f20,[%i0+8*2]
std %f22,[%i0+8*3]
brz,pn %i1,.Lno_key_fma
nop
stx %fsr,[%sp+LOCALS] ! save original %fsr
ldx [%o7+8*6],%fsr ! load new %fsr
std %f16,[%i0+8*4] ! key "template"
std %f18,[%i0+8*5]
std %f20,[%i0+8*6]
std %f22,[%i0+8*7]
and %i1,7,%l2
andn %i1,7,%i1 ! align pointer
mov 8,%l0
sll %l2,3,%l2
mov 16,%l1
neg %l2,%l3
ldxa [%i1+%g0]0x88,%o0 ! load little-endian key
ldxa [%i1+%l0]0x88,%o2
brz %l2,.Lkey_aligned_fma
sethi %hi(0xf0000000),%l0 ! 0xf0000000
ldxa [%i1+%l1]0x88,%o4
srlx %o0,%l2,%o0 ! align data
sllx %o2,%l3,%o1
srlx %o2,%l2,%o2
or %o1,%o0,%o0
sllx %o4,%l3,%o3
or %o3,%o2,%o2
.Lkey_aligned_fma:
or %l0,3,%l1 ! 0xf0000003
srlx %o0,32,%o1
andn %o0,%l0,%o0 ! &=0x0fffffff
andn %o1,%l1,%o1 ! &=0x0ffffffc
srlx %o2,32,%o3
andn %o2,%l1,%o2
andn %o3,%l1,%o3
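! Same clamping as the integer path, expressed as andn: 0xf0000000
! keeps 28 bits of the bottom word, 0xf0000003 gives the
! 0x0ffffffc-shaped masks for the other three words.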
st %o0,[%i0+36] ! fill "template"
st %o1,[%i0+44]
st %o2,[%i0+52]
st %o3,[%i0+60]
ldd [%i0+8*4],%f0 ! load [biased] key
ldd [%i0+8*5],%f4
ldd [%i0+8*6],%f8
ldd [%i0+8*7],%f12
fsubd %f0,%f16,%f0 ! r0
ldd [%o7+8*7],%f16 ! more constants
fsubd %f4,%f18,%f4 ! r1
ldd [%o7+8*8],%f18
fsubd %f8,%f20,%f8 ! r2
ldd [%o7+8*9],%f20
fsubd %f12,%f22,%f12 ! r3
ldd [%o7+8*10],%f22
fmuld %f26,%f4,%f52 ! s1
fmuld %f26,%f8,%f40 ! s2
fmuld %f26,%f12,%f44 ! s3
faddd %f0,%f16,%f2
faddd %f4,%f18,%f6
faddd %f8,%f20,%f10
faddd %f12,%f22,%f14
fsubd %f2,%f16,%f2
ldd [%o7+8*11],%f16 ! more constants
fsubd %f6,%f18,%f6
ldd [%o7+8*12],%f18
fsubd %f10,%f20,%f10
ldd [%o7+8*13],%f20
fsubd %f14,%f22,%f14
fsubd %f0,%f2,%f0
std %f2,[%i0+8*5] ! r0hi
fsubd %f4,%f6,%f4
std %f6,[%i0+8*7] ! r1hi
fsubd %f8,%f10,%f8
std %f10,[%i0+8*9] ! r2hi
fsubd %f12,%f14,%f12
std %f14,[%i0+8*11] ! r3hi
faddd %f52,%f16,%f54
faddd %f40,%f18,%f42
faddd %f44,%f20,%f46
fsubd %f54,%f16,%f54
fsubd %f42,%f18,%f42
fsubd %f46,%f20,%f46
fsubd %f52,%f54,%f52
fsubd %f40,%f42,%f40
fsubd %f44,%f46,%f44
ldx [%sp+LOCALS],%fsr ! restore %fsr
std %f0,[%i0+8*4] ! r0lo
std %f4,[%i0+8*6] ! r1lo
std %f8,[%i0+8*8] ! r2lo
std %f12,[%i0+8*10] ! r3lo
std %f54,[%i0+8*13]
std %f42,[%i0+8*15]
std %f46,[%i0+8*17]
std %f52,[%i0+8*12]
std %f40,[%i0+8*14]
std %f44,[%i0+8*16]
add %o7,poly1305_blocks_fma-.Lconsts_fma,%o0
add %o7,poly1305_emit_fma-.Lconsts_fma,%o1
STPTR %o0,[%i2]
STPTR %o1,[%i2+SIZE_T]
ret
restore %g0,1,%o0 ! return 1
.Lno_key_fma:
ret
restore %g0,%g0,%o0 ! return 0
.type poly1305_init_fma,#function
.size poly1305_init_fma,.-poly1305_init_fma
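! poly1305_blocks_fma: in registers the hash expands to a low and a
! high double per 32-bit limb; after each round of fmaddd ops, the
! add-then-subtract ladders marked "base 2^48 -> base 2^32" below
! renormalize the limbs using the large constants from .Lconsts_fma.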
.align 32
poly1305_blocks_fma:
save %sp,-STACK_FRAME-48,%sp
srln %i2,4,%i2 ! len/=16
brz,pn %i2,.Labort
sub %i2,1,%i2
1: call .+8
add %o7,.Lconsts_fma-1b,%o7
ldd [%o7+8*0],%f16 ! load constants
ldd [%o7+8*1],%f18
ldd [%o7+8*2],%f20
ldd [%o7+8*3],%f22
ldd [%o7+8*4],%f24
ldd [%o7+8*5],%f26
ldd [%i0+8*0],%f0 ! load [biased] hash value
ldd [%i0+8*1],%f4
ldd [%i0+8*2],%f8
ldd [%i0+8*3],%f12
std %f16,[%sp+LOCALS+8*0] ! input "template"
sethi %hi((1023+52+96)<<20),%o3
std %f18,[%sp+LOCALS+8*1]
or %i3,%o3,%o3
std %f20,[%sp+LOCALS+8*2]
st %o3,[%sp+LOCALS+8*3]
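! Input "template": the high word of each stacked double is pre-set to
! the matching bias exponent, with the padbit or-ed into the top one
! (mantissa bit 32 of a 2^(52+96)-biased double is worth 2^128), so
! that storing the four little-endian message words into the low halves
! yields already-biased doubles without integer-to-FP conversion.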
and %i1,7,%l2
andn %i1,7,%i1 ! align pointer
mov 8,%l0
sll %l2,3,%l2
mov 16,%l1
neg %l2,%l3
ldxa [%i1+%g0]0x88,%o0 ! load little-endian input
brz %l2,.Linp_aligned_fma
ldxa [%i1+%l0]0x88,%o2
ldxa [%i1+%l1]0x88,%o4
add %i1,8,%i1
srlx %o0,%l2,%o0 ! align data
sllx %o2,%l3,%o1
srlx %o2,%l2,%o2
or %o1,%o0,%o0
sllx %o4,%l3,%o3
srlx %o4,%l2,%o4 ! pre-shift
or %o3,%o2,%o2
.Linp_aligned_fma:
srlx %o0,32,%o1
movrz %i2,0,%l1
srlx %o2,32,%o3
add %l1,%i1,%i1 ! conditional advance
st %o0,[%sp+LOCALS+8*0+4] ! fill "template"
st %o1,[%sp+LOCALS+8*1+4]
st %o2,[%sp+LOCALS+8*2+4]
st %o3,[%sp+LOCALS+8*3+4]
ldd [%i0+8*4],%f28 ! load key
ldd [%i0+8*5],%f30
ldd [%i0+8*6],%f32
ldd [%i0+8*7],%f34
ldd [%i0+8*8],%f36
ldd [%i0+8*9],%f38
ldd [%i0+8*10],%f48
ldd [%i0+8*11],%f50
ldd [%i0+8*12],%f52
ldd [%i0+8*13],%f54
ldd [%i0+8*14],%f40
ldd [%i0+8*15],%f42
ldd [%i0+8*16],%f44
ldd [%i0+8*17],%f46
stx %fsr,[%sp+LOCALS+8*4] ! save original %fsr
ldx [%o7+8*6],%fsr ! load new %fsr
subcc %i2,1,%i2
movrz %i2,0,%l1
ldd [%sp+LOCALS+8*0],%f56 ! load biased input
ldd [%sp+LOCALS+8*1],%f58
ldd [%sp+LOCALS+8*2],%f60
ldd [%sp+LOCALS+8*3],%f62
fsubd %f0,%f16,%f0 ! de-bias hash value
fsubd %f4,%f18,%f4
ldxa [%i1+%g0]0x88,%o0 ! modulo-scheduled input load
fsubd %f8,%f20,%f8
fsubd %f12,%f22,%f12
ldxa [%i1+%l0]0x88,%o2
fsubd %f56,%f16,%f56 ! de-bias input
fsubd %f58,%f18,%f58
fsubd %f60,%f20,%f60
fsubd %f62,%f22,%f62
brz %l2,.Linp_aligned_fma2
add %l1,%i1,%i1 ! conditional advance
sllx %o0,%l3,%o1 ! align data
srlx %o0,%l2,%o3
or %o1,%o4,%o0
sllx %o2,%l3,%o1
srlx %o2,%l2,%o4 ! pre-shift
or %o3,%o1,%o2
.Linp_aligned_fma2:
srlx %o0,32,%o1
srlx %o2,32,%o3
faddd %f0,%f56,%f56 ! accumulate input
stw %o0,[%sp+LOCALS+8*0+4]
faddd %f4,%f58,%f58
stw %o1,[%sp+LOCALS+8*1+4]
faddd %f8,%f60,%f60
stw %o2,[%sp+LOCALS+8*2+4]
faddd %f12,%f62,%f62
stw %o3,[%sp+LOCALS+8*3+4]
b .Lentry_fma
nop
.align 16
.Loop_fma:
ldxa [%i1+%g0]0x88,%o0 ! modulo-scheduled input load
ldxa [%i1+%l0]0x88,%o2
movrz %i2,0,%l1
faddd %f52,%f0,%f0 ! accumulate input
faddd %f54,%f2,%f2
faddd %f62,%f8,%f8
faddd %f60,%f10,%f10
brz,pn %l2,.Linp_aligned_fma3
add %l1,%i1,%i1 ! conditional advance
sllx %o0,%l3,%o1 ! align data
srlx %o0,%l2,%o3
or %o1,%o4,%o0
sllx %o2,%l3,%o1
srlx %o2,%l2,%o4 ! pre-shift
or %o3,%o1,%o2
.Linp_aligned_fma3:
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
faddd %f20,%f4,%f52
srlx %o0,32,%o1
faddd %f20,%f6,%f54
srlx %o2,32,%o3
faddd %f24,%f12,%f60
st %o0,[%sp+LOCALS+8*0+4] ! fill "template"
faddd %f24,%f14,%f62
st %o1,[%sp+LOCALS+8*1+4]
faddd %f18,%f0,%f48
st %o2,[%sp+LOCALS+8*2+4]
faddd %f18,%f2,%f50
st %o3,[%sp+LOCALS+8*3+4]
faddd %f22,%f8,%f56
faddd %f22,%f10,%f58
fsubd %f52,%f20,%f52
fsubd %f54,%f20,%f54
fsubd %f60,%f24,%f60
fsubd %f62,%f24,%f62
fsubd %f48,%f18,%f48
fsubd %f50,%f18,%f50
fsubd %f56,%f22,%f56
fsubd %f58,%f22,%f58
fsubd %f4,%f52,%f4
fsubd %f6,%f54,%f6
fsubd %f12,%f60,%f12
fsubd %f14,%f62,%f14
fsubd %f8,%f56,%f8
fsubd %f10,%f58,%f10
fsubd %f0,%f48,%f0
fsubd %f2,%f50,%f2
faddd %f4,%f48,%f4
faddd %f6,%f50,%f6
faddd %f12,%f56,%f12
faddd %f14,%f58,%f14
faddd %f8,%f52,%f8
faddd %f10,%f54,%f10
.word 0x81be805d !fmaddd %f26,%f60,%f0,%f0
.word 0x85be845f !fmaddd %f26,%f62,%f2,%f2
faddd %f4,%f6,%f58
ldd [%i0+8*12],%f52 ! reload constants
faddd %f12,%f14,%f62
ldd [%i0+8*13],%f54
faddd %f8,%f10,%f60
ldd [%i0+8*10],%f48
faddd %f0,%f2,%f56
ldd [%i0+8*11],%f50
.Lentry_fma:
fmuld %f58,%f44,%f0
fmuld %f58,%f46,%f2
fmuld %f58,%f32,%f8
fmuld %f58,%f34,%f10
fmuld %f58,%f28,%f4
fmuld %f58,%f30,%f6
fmuld %f58,%f36,%f12
fmuld %f58,%f38,%f14
.word 0x81bfc055 !fmaddd %f62,%f52,%f0,%f0
.word 0x85bfc457 !fmaddd %f62,%f54,%f2,%f2
.word 0x91bfd04d !fmaddd %f62,%f44,%f8,%f8
.word 0x95bfd44f !fmaddd %f62,%f46,%f10,%f10
.word 0x89bfc849 !fmaddd %f62,%f40,%f4,%f4
.word 0x8dbfcc4b !fmaddd %f62,%f42,%f6,%f6
.word 0x99bfd85c !fmaddd %f62,%f28,%f12,%f12
.word 0x9dbfdc5e !fmaddd %f62,%f30,%f14,%f14
.word 0x81bf4049 !fmaddd %f60,%f40,%f0,%f0
.word 0x85bf444b !fmaddd %f60,%f42,%f2,%f2
.word 0x91bf505c !fmaddd %f60,%f28,%f8,%f8
.word 0x95bf545e !fmaddd %f60,%f30,%f10,%f10
.word 0x89bf484d !fmaddd %f60,%f44,%f4,%f4
ldd [%sp+LOCALS+8*0],%f52 ! load [biased] input
.word 0x8dbf4c4f !fmaddd %f60,%f46,%f6,%f6
ldd [%sp+LOCALS+8*1],%f54
.word 0x99bf5841 !fmaddd %f60,%f32,%f12,%f12
ldd [%sp+LOCALS+8*2],%f62
.word 0x9dbf5c43 !fmaddd %f60,%f34,%f14,%f14
ldd [%sp+LOCALS+8*3],%f60
.word 0x81be405c !fmaddd %f56,%f28,%f0,%f0
fsubd %f52,%f16,%f52 ! de-bias input
.word 0x85be445e !fmaddd %f56,%f30,%f2,%f2
fsubd %f54,%f18,%f54
.word 0x91be5045 !fmaddd %f56,%f36,%f8,%f8
fsubd %f62,%f20,%f62
.word 0x95be5447 !fmaddd %f56,%f38,%f10,%f10
fsubd %f60,%f22,%f60
.word 0x89be4841 !fmaddd %f56,%f32,%f4,%f4
.word 0x8dbe4c43 !fmaddd %f56,%f34,%f6,%f6
.word 0x99be5851 !fmaddd %f56,%f48,%f12,%f12
.word 0x9dbe5c53 !fmaddd %f56,%f50,%f14,%f14
bcc SIZE_T_CC,.Loop_fma
subcc %i2,1,%i2
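! SIZE_T_CC (%icc or %xcc, from sparc_arch.h) matches the loop test to
! the width of size_t; the subcc in the delay slot counts down and sets
! the condition codes consumed by the bcc on the next pass.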
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
faddd %f0,%f18,%f48
faddd %f2,%f18,%f50
faddd %f8,%f22,%f56
faddd %f10,%f22,%f58
faddd %f4,%f20,%f52
faddd %f6,%f20,%f54
faddd %f12,%f24,%f60
faddd %f14,%f24,%f62
fsubd %f48,%f18,%f48
fsubd %f50,%f18,%f50
fsubd %f56,%f22,%f56
fsubd %f58,%f22,%f58
fsubd %f52,%f20,%f52
fsubd %f54,%f20,%f54
fsubd %f60,%f24,%f60
fsubd %f62,%f24,%f62
fsubd %f4,%f52,%f4
fsubd %f6,%f54,%f6
fsubd %f12,%f60,%f12
fsubd %f14,%f62,%f14
fsubd %f8,%f56,%f8
fsubd %f10,%f58,%f10
fsubd %f0,%f48,%f0
fsubd %f2,%f50,%f2
faddd %f4,%f48,%f4
faddd %f6,%f50,%f6
faddd %f12,%f56,%f12
faddd %f14,%f58,%f14
faddd %f8,%f52,%f8
faddd %f10,%f54,%f10
.word 0x81be805d !fmaddd %f26,%f60,%f0,%f0
.word 0x85be845f !fmaddd %f26,%f62,%f2,%f2
faddd %f4,%f6,%f58
faddd %f12,%f14,%f62
faddd %f8,%f10,%f60
faddd %f0,%f2,%f56
faddd %f58,%f18,%f58 ! bias
faddd %f62,%f22,%f62
faddd %f60,%f20,%f60
faddd %f56,%f16,%f56
ldx [%sp+LOCALS+8*4],%fsr ! restore saved %fsr
std %f58,[%i0+8*1] ! store [biased] hash value
std %f62,[%i0+8*3]
std %f60,[%i0+8*2]
std %f56,[%i0+8*0]
.Labort:
ret
restore
.type poly1305_blocks_fma,#function
.size poly1305_blocks_fma,.-poly1305_blocks_fma
.align 32
poly1305_emit_fma:
save %sp,-STACK_FRAME,%sp
ld [%i0+8*0+0],%l5 ! load hash
ld [%i0+8*0+4],%l0
ld [%i0+8*1+0],%o0
ld [%i0+8*1+4],%l1
ld [%i0+8*2+0],%o1
ld [%i0+8*2+4],%l2
ld [%i0+8*3+0],%o2
ld [%i0+8*3+4],%l3
sethi %hi(0xfff00000),%o3
andn %l5,%o3,%l5 ! mask exponent
andn %o0,%o3,%o0
andn %o1,%o3,%o1
andn %o2,%o3,%o2 ! can be partially reduced...
mov 3,%o3
srl %o2,2,%i3 ! ... so reduce
and %o2,%o3,%l4
andn %o2,%o3,%o2
add %i3,%o2,%o2
addcc %o2,%l0,%l0
addccc %l5,%l1,%l1
addccc %o0,%l2,%l2
addccc %o1,%l3,%l3
addc %g0,%l4,%l4
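! The doubles were stored big-endian, so the word at +0 is the biased
! high half: after masking the exponent it is the carry of one 32-bit
! limb into the next. The top carry sits at 2^128; its bits >= 2^130
! were folded back as 5*(t>>2), t&3 was kept as h4, and the adds above
! propagated every carry in a single chain.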
addcc %l0,5,%l5 ! compare to modulus
addccc %l1,0,%o0
addccc %l2,0,%o1
addccc %l3,0,%o2
addc %l4,0,%o3
srl %o3,2,%o3 ! did h+5 carry into 2^130?
neg %o3,%o3
sra %o3,31,%o3 ! mask
andn %l0,%o3,%l0
and %l5,%o3,%l5
andn %l1,%o3,%l1
and %o0,%o3,%o0
or %l5,%l0,%l0
ld [%i2+0],%l5 ! load nonce
andn %l2,%o3,%l2
and %o1,%o3,%o1
or %o0,%l1,%l1
ld [%i2+4],%o0
andn %l3,%o3,%l3
and %o2,%o3,%o2
or %o1,%l2,%l2
ld [%i2+8],%o1
or %o2,%l3,%l3
ld [%i2+12],%o2
addcc %l5,%l0,%l0 ! accumulate nonce
addccc %o0,%l1,%l1
addccc %o1,%l2,%l2
addc %o2,%l3,%l3
stb %l0,[%i1+0] ! write little-endian result
srl %l0,8,%l0
stb %l1,[%i1+4]
srl %l1,8,%l1
stb %l2,[%i1+8]
srl %l2,8,%l2
stb %l3,[%i1+12]
srl %l3,8,%l3
stb %l0,[%i1+1]
srl %l0,8,%l0
stb %l1,[%i1+5]
srl %l1,8,%l1
stb %l2,[%i1+9]
srl %l2,8,%l2
stb %l3,[%i1+13]
srl %l3,8,%l3
stb %l0,[%i1+2]
srl %l0,8,%l0
stb %l1,[%i1+6]
srl %l1,8,%l1
stb %l2,[%i1+10]
srl %l2,8,%l2
stb %l3,[%i1+14]
srl %l3,8,%l3
stb %l0,[%i1+3]
stb %l1,[%i1+7]
stb %l2,[%i1+11]
stb %l3,[%i1+15]
ret
restore
.type poly1305_emit_fma,#function
.size poly1305_emit_fma,.-poly1305_emit_fma
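! Adding and then subtracting one of the constants below, under
! round-toward-zero, truncates a double to an integer multiple of that
! constant's ulp; this is how limbs are split and carried without any
! integer<->FP transfers.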
.align 64
.Lconsts_fma:
.word 0x43300000,0x00000000 ! 2^(52+0)
.word 0x45300000,0x00000000 ! 2^(52+32)
.word 0x47300000,0x00000000 ! 2^(52+64)
.word 0x49300000,0x00000000 ! 2^(52+96)
.word 0x4b500000,0x00000000 ! 2^(52+130)
.word 0x37f40000,0x00000000 ! 5/2^130
.word 0,1<<30 ! fsr: truncate, no exceptions
.word 0x44300000,0x00000000 ! 2^(52+16+0)
.word 0x46300000,0x00000000 ! 2^(52+16+32)
.word 0x48300000,0x00000000 ! 2^(52+16+64)
.word 0x4a300000,0x00000000 ! 2^(52+16+96)
.word 0x3e300000,0x00000000 ! 2^(52+16+0-96)
.word 0x40300000,0x00000000 ! 2^(52+16+32-96)
.word 0x42300000,0x00000000 ! 2^(52+16+64-96)
.asciz "Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro@openssl.org>"
.align 4