#include "sparc_arch.h"
.section ".text",#alloc,#execinstr
.global bn_mul_mont_fpu
.align 32
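! bn_mul_mont_fpu(rp=%i0, ap=%i1, bp=%i2, np=%i3, n0=%i4, num=%i5),
! the usual bn_mul_mont argument order (consistent with the code:
! n0 is fetched through %i4, np is subtracted through %i3, and the
! result is written through %i0).
!
! Montgomery multiplication done (mostly) on the FPU: operands are
! split into 16-bit limbs held exactly in doubles, so the fmuld/faddd
! work below never rounds. Per outer word b[i] the computation is
!
!	m  = (tp[0] + a[0]*b[i])*n0 mod 2^64
!	tp = (tp + a*b[i] + m*n) / 2^64
!
! with the integer unit reassembling 16-bit columns and carries.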
bn_mul_mont_fpu:
save %sp,-STACK_FRAME-64,%sp
cmp %i5,4
bl,a,pn %icc,.Lret
clr %i0
andcc %i5,1,%g0 ! %i5 has to be even...
bnz,a,pn %icc,.Lret
clr %i0 ! signal "unsupported input value"
srl %i5,1,%i5
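! everything below works on 64-bit limbs, i.e. two 32-bit BN_ULONGs
! at a time, hence the evenness check above and the halving of num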
sethi %hi(0xffff),%l7
ld [%i4+0],%g4 ! %g4 holds n0 for now, it's reassigned as mask near the end
or %l7,%lo(0xffff),%l7
ld [%i4+4],%o0
sllx %o0,32,%o0
or %o0,%g4,%g4 ! %g4=n0[1].n0[0]
sll %i5,3,%i5 ! num*=8
add %sp,STACK_BIAS,%o0 ! real top of stack
sll %i5,2,%o1
add %o1,%i5,%o1 ! %o1=num*5
sub %o0,%o1,%o0
and %o0,-2048,%o0 ! optimize TLB utilization
sub %o0,STACK_BIAS,%sp ! alloca(5*num*8)
rd %asi,%o7 ! save %asi
add %sp,STACK_BIAS+STACK_FRAME+64,%l0
add %l0,%i5,%l1
add %l1,%i5,%l1 ! [an]p_[lh] point at the vectors' ends !
add %l1,%i5,%l2
add %l2,%i5,%l3
add %l3,%i5,%l4
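! scratch layout, 5*num*8 bytes in all: tp[num] at %l0, then ap_l,
! ap_h, np_l, np_h - the low/high 32-bit halves of a[] and n[] in
! double format, num doubles each; %l1-%l4 point at those vectors'
! ends so that the same negative index j can walk all of them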
wr %g0,210,%asi ! %asi=0xd2, ASI_FL16_P, for 16-bit FP loads
add %i0,%i5,%i0 ! readjust input pointers to point
add %i1,%i5,%i1 ! at the ends too...
add %i2,%i5,%i2
add %i3,%i5,%i3
stx %o7,[%sp+STACK_BIAS+STACK_FRAME+48] ! save %asi
sub %g0,%i5,%l5 ! i=-num
sub %g0,%i5,%l6 ! j=-num
add %i1,%l6,%o3
add %i2,%l5,%o4
ld [%o3+4],%g1 ! ap[0]
ld [%o3+0],%o0
ld [%o4+4],%g5 ! bp[0]
sllx %g1,32,%g1
ld [%o4+0],%o1
sllx %g5,32,%g5
or %g1,%o0,%o0
or %g5,%o1,%o1
add %i3,%l6,%o5
mulx %o1,%o0,%o0 ! ap[0]*bp[0]
mulx %g4,%o0,%o0 ! ap[0]*bp[0]*n0
stx %o0,[%sp+STACK_BIAS+STACK_FRAME+0]
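! m is parked on the stack so that it can be pulled back in through
! the FL16 %asi as 4x16-bit limbs, just like b[0] below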
ld [%o3+0],%f17 ! load a[j] as pair of 32-bit words
.word 0xa1b00c20 ! fzeros %f16
ld [%o3+4],%f19
.word 0xa5b00c20 ! fzeros %f18
ld [%o5+0],%f21 ! load n[j] as pair of 32-bit words
.word 0xa9b00c20 ! fzeros %f20
ld [%o5+4],%f23
.word 0xadb00c20 ! fzeros %f22
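! each 32-bit word sits in the low half of an fzeros-ed register
! pair, a 64-bit integer pattern that fxtod below turns into an
! exact double (fzeros is VIS1; the .word encodings are presumably
! for assemblers that don't know VIS)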
! transfer b[i] to FPU as 4x16-bit values
ldda [%o4+2]%asi,%f0
fxtod %f16,%f16
ldda [%o4+0]%asi,%f2
fxtod %f18,%f18
ldda [%o4+6]%asi,%f4
fxtod %f20,%f20
ldda [%o4+4]%asi,%f6
fxtod %f22,%f22
! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
ldda [%sp+STACK_BIAS+STACK_FRAME+6]%asi,%f8
fxtod %f0,%f0
ldda [%sp+STACK_BIAS+STACK_FRAME+4]%asi,%f10
fxtod %f2,%f2
ldda [%sp+STACK_BIAS+STACK_FRAME+2]%asi,%f12
fxtod %f4,%f4
ldda [%sp+STACK_BIAS+STACK_FRAME+0]%asi,%f14
fxtod %f6,%f6
std %f16,[%l1+%l6] ! save smashed ap[j] in double format
fxtod %f8,%f8
std %f18,[%l2+%l6]
fxtod %f10,%f10
std %f20,[%l3+%l6] ! save smashed np[j] in double format
fxtod %f12,%f12
std %f22,[%l4+%l6]
fxtod %f14,%f14
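! now %f0-%f6 hold b[0] and %f8-%f14 hold m, each as four 16-bit
! limbs converted to doubles, least significant limb in %f0/%f8;
! every 32x16-bit fmuld product below is <2^48 and the faddd sums
! stay well under 2^53, so no FP rounding can occur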
fmuld %f16,%f0,%f32
fmuld %f20,%f8,%f48
fmuld %f16,%f2,%f34
fmuld %f20,%f10,%f50
fmuld %f16,%f4,%f36
faddd %f32,%f48,%f48
fmuld %f20,%f12,%f52
fmuld %f16,%f6,%f38
faddd %f34,%f50,%f50
fmuld %f20,%f14,%f54
fmuld %f18,%f0,%f40
faddd %f36,%f52,%f52
fmuld %f22,%f8,%f56
fmuld %f18,%f2,%f42
faddd %f38,%f54,%f54
fmuld %f22,%f10,%f58
fmuld %f18,%f4,%f44
faddd %f40,%f56,%f56
fmuld %f22,%f12,%f60
fmuld %f18,%f6,%f46
faddd %f42,%f58,%f58
fmuld %f22,%f14,%f62
faddd %f44,%f60,%f24 ! %f60
faddd %f46,%f62,%f26 ! %f62
faddd %f52,%f56,%f52
faddd %f54,%f58,%f54
fdtox %f48,%f48
fdtox %f50,%f50
fdtox %f52,%f52
fdtox %f54,%f54
std %f48,[%sp+STACK_BIAS+STACK_FRAME+0]
add %l6,8,%l6
std %f50,[%sp+STACK_BIAS+STACK_FRAME+8]
add %i1,%l6,%o4
std %f52,[%sp+STACK_BIAS+STACK_FRAME+16]
add %i3,%l6,%o5
std %f54,[%sp+STACK_BIAS+STACK_FRAME+24]
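! the four column sums are converted to 64-bit integers and parked
! on the stack; from here on the next round's FP work is overlapped
! with integer code reassembling them 16 bits at a time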
ld [%o4+0],%f17 ! load a[j] as pair of 32-bit words
.word 0xa1b00c20 ! fzeros %f16
ld [%o4+4],%f19
.word 0xa5b00c20 ! fzeros %f18
ld [%o5+0],%f21 ! load n[j] as pair of 32-bit words
.word 0xa9b00c20 ! fzeros %f20
ld [%o5+4],%f23
.word 0xadb00c20 ! fzeros %f22
fxtod %f16,%f16
fxtod %f18,%f18
fxtod %f20,%f20
fxtod %f22,%f22
ldx [%sp+STACK_BIAS+STACK_FRAME+0],%o0
fmuld %f16,%f0,%f32
ldx [%sp+STACK_BIAS+STACK_FRAME+8],%o1
fmuld %f20,%f8,%f48
ldx [%sp+STACK_BIAS+STACK_FRAME+16],%o2
fmuld %f16,%f2,%f34
ldx [%sp+STACK_BIAS+STACK_FRAME+24],%o3
fmuld %f20,%f10,%f50
srlx %o0,16,%o7
std %f16,[%l1+%l6] ! save smashed ap[j] in double format
fmuld %f16,%f4,%f36
add %o7,%o1,%o1
std %f18,[%l2+%l6]
faddd %f32,%f48,%f48
fmuld %f20,%f12,%f52
srlx %o1,16,%o7
std %f20,[%l3+%l6] ! save smashed np[j] in double format
fmuld %f16,%f6,%f38
add %o7,%o2,%o2
std %f22,[%l4+%l6]
faddd %f34,%f50,%f50
fmuld %f20,%f14,%f54
srlx %o2,16,%o7
fmuld %f18,%f0,%f40
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
faddd %f36,%f52,%f52
fmuld %f22,%f8,%f56
!and %o0,%l7,%o0
!and %o1,%l7,%o1
!and %o2,%l7,%o2
!sllx %o1,16,%o1
!sllx %o2,32,%o2
!sllx %o3,48,%o7
!or %o1,%o0,%o0
!or %o2,%o0,%o0
!or %o7,%o0,%o0 ! 64-bit result
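! the block above is intentionally dead: a[0]*b[0]+m*n[0] is
! 0 mod 2^64 by choice of m (given the usual n0 = -n^-1 mod 2^64
! contract), so tp[0] is never assembled, only the carry is kept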
srlx %o3,16,%g1 ! 34-bit carry
fmuld %f18,%f2,%f42
faddd %f38,%f54,%f54
fmuld %f22,%f10,%f58
fmuld %f18,%f4,%f44
faddd %f40,%f56,%f56
fmuld %f22,%f12,%f60
fmuld %f18,%f6,%f46
faddd %f42,%f58,%f58
fmuld %f22,%f14,%f62
faddd %f24,%f48,%f48
faddd %f26,%f50,%f50
faddd %f44,%f60,%f24 ! %f60
faddd %f46,%f62,%f26 ! %f62
faddd %f52,%f56,%f52
faddd %f54,%f58,%f54
fdtox %f48,%f48
fdtox %f50,%f50
fdtox %f52,%f52
fdtox %f54,%f54
std %f48,[%sp+STACK_BIAS+STACK_FRAME+0]
std %f50,[%sp+STACK_BIAS+STACK_FRAME+8]
addcc %l6,8,%l6
std %f52,[%sp+STACK_BIAS+STACK_FRAME+16]
bz,pn %icc,.L1stskip
std %f54,[%sp+STACK_BIAS+STACK_FRAME+24]
.align 32 ! incidentally already aligned !
.L1st:
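! 1st pass: compute a[j]*b[0]+m*n[j] for all j, storing 64-bit words
! to tp[]; the FP pipeline runs one iteration ahead of the integer
! reassembly, hence the tp[j-1] store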
add %i1,%l6,%o4
add %i3,%l6,%o5
ld [%o4+0],%f17 ! load a[j] as pair of 32-bit words
.word 0xa1b00c20 ! fzeros %f16
ld [%o4+4],%f19
.word 0xa5b00c20 ! fzeros %f18
ld [%o5+0],%f21 ! load n[j] as pair of 32-bit words
.word 0xa9b00c20 ! fzeros %f20
ld [%o5+4],%f23
.word 0xadb00c20 ! fzeros %f22
fxtod %f16,%f16
fxtod %f18,%f18
fxtod %f20,%f20
fxtod %f22,%f22
ldx [%sp+STACK_BIAS+STACK_FRAME+0],%o0
fmuld %f16,%f0,%f32
ldx [%sp+STACK_BIAS+STACK_FRAME+8],%o1
fmuld %f20,%f8,%f48
ldx [%sp+STACK_BIAS+STACK_FRAME+16],%o2
fmuld %f16,%f2,%f34
ldx [%sp+STACK_BIAS+STACK_FRAME+24],%o3
fmuld %f20,%f10,%f50
srlx %o0,16,%o7
std %f16,[%l1+%l6] ! save smashed ap[j] in double format
fmuld %f16,%f4,%f36
add %o7,%o1,%o1
std %f18,[%l2+%l6]
faddd %f32,%f48,%f48
fmuld %f20,%f12,%f52
srlx %o1,16,%o7
std %f20,[%l3+%l6] ! save smashed np[j] in double format
fmuld %f16,%f6,%f38
add %o7,%o2,%o2
std %f22,[%l4+%l6]
faddd %f34,%f50,%f50
fmuld %f20,%f14,%f54
srlx %o2,16,%o7
fmuld %f18,%f0,%f40
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,%l7,%o0
faddd %f36,%f52,%f52
fmuld %f22,%f8,%f56
and %o1,%l7,%o1
and %o2,%l7,%o2
fmuld %f18,%f2,%f42
sllx %o1,16,%o1
faddd %f38,%f54,%f54
fmuld %f22,%f10,%f58
sllx %o2,32,%o2
fmuld %f18,%f4,%f44
sllx %o3,48,%o7
or %o1,%o0,%o0
faddd %f40,%f56,%f56
fmuld %f22,%f12,%f60
or %o2,%o0,%o0
fmuld %f18,%f6,%f46
or %o7,%o0,%o0 ! 64-bit result
faddd %f42,%f58,%f58
fmuld %f22,%f14,%f62
addcc %g1,%o0,%o0
faddd %f24,%f48,%f48
srlx %o3,16,%g1 ! 34-bit carry
faddd %f26,%f50,%f50
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[%l0] ! tp[j-1]=
faddd %f44,%f60,%f24 ! %f60
faddd %f46,%f62,%f26 ! %f62
faddd %f52,%f56,%f52
faddd %f54,%f58,%f54
fdtox %f48,%f48
fdtox %f50,%f50
fdtox %f52,%f52
fdtox %f54,%f54
std %f48,[%sp+STACK_BIAS+STACK_FRAME+0]
std %f50,[%sp+STACK_BIAS+STACK_FRAME+8]
std %f52,[%sp+STACK_BIAS+STACK_FRAME+16]
std %f54,[%sp+STACK_BIAS+STACK_FRAME+24]
addcc %l6,8,%l6
bnz,pt %icc,.L1st
add %l0,8,%l0
.L1stskip:
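! tail of the 1st pass: fold in the two leftover upper columns
! %f24/%f26, emit tp[num-1] and keep the topmost carry word in %i4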
fdtox %f24,%f24
fdtox %f26,%f26
ldx [%sp+STACK_BIAS+STACK_FRAME+0],%o0
ldx [%sp+STACK_BIAS+STACK_FRAME+8],%o1
ldx [%sp+STACK_BIAS+STACK_FRAME+16],%o2
ldx [%sp+STACK_BIAS+STACK_FRAME+24],%o3
srlx %o0,16,%o7
std %f24,[%sp+STACK_BIAS+STACK_FRAME+32]
add %o7,%o1,%o1
std %f26,[%sp+STACK_BIAS+STACK_FRAME+40]
srlx %o1,16,%o7
add %o7,%o2,%o2
srlx %o2,16,%o7
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,%l7,%o0
and %o1,%l7,%o1
and %o2,%l7,%o2
sllx %o1,16,%o1
sllx %o2,32,%o2
sllx %o3,48,%o7
or %o1,%o0,%o0
or %o2,%o0,%o0
or %o7,%o0,%o0 ! 64-bit result
ldx [%sp+STACK_BIAS+STACK_FRAME+32],%o4
addcc %g1,%o0,%o0
ldx [%sp+STACK_BIAS+STACK_FRAME+40],%o5
srlx %o3,16,%g1 ! 34-bit carry
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[%l0] ! tp[j-1]=
add %l0,8,%l0
srlx %o4,16,%o7
add %o7,%o5,%o5
and %o4,%l7,%o4
sllx %o5,16,%o7
or %o7,%o4,%o4
addcc %g1,%o4,%o4
srlx %o5,48,%g1
bcs,a %xcc,.+8
add %g1,1,%g1
mov %g1,%i4
stx %o4,[%l0] ! tp[num-1]=
ba .Louter
add %l5,8,%l5
.align 32
.Louter:
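! outer loop: for each 64-bit b[i] compute m=(tp[0]+a[0]*b[i])*n0
! mod 2^64 and run the inner pass; a[j] and n[j] are now picked up
! ready-made in double format from the scratch vectors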
sub %g0,%i5,%l6 ! j=-num
add %sp,STACK_BIAS+STACK_FRAME+64,%l0
add %i1,%l6,%o3
add %i2,%l5,%o4
ld [%o3+4],%g1 ! ap[0]
ld [%o3+0],%o0
ld [%o4+4],%g5 ! bp[i]
sllx %g1,32,%g1
ld [%o4+0],%o1
sllx %g5,32,%g5
or %g1,%o0,%o0
or %g5,%o1,%o1
ldx [%l0],%o2 ! tp[0]
mulx %o1,%o0,%o0
addcc %o2,%o0,%o0
mulx %g4,%o0,%o0 ! (ap[0]*bp[i]+t[0])*n0
stx %o0,[%sp+STACK_BIAS+STACK_FRAME+0]
! transfer b[i] to FPU as 4x16-bit values
ldda [%o4+2]%asi,%f0
ldda [%o4+0]%asi,%f2
ldda [%o4+6]%asi,%f4
ldda [%o4+4]%asi,%f6
! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
ldda [%sp+STACK_BIAS+STACK_FRAME+6]%asi,%f8
fxtod %f0,%f0
ldda [%sp+STACK_BIAS+STACK_FRAME+4]%asi,%f10
fxtod %f2,%f2
ldda [%sp+STACK_BIAS+STACK_FRAME+2]%asi,%f12
fxtod %f4,%f4
ldda [%sp+STACK_BIAS+STACK_FRAME+0]%asi,%f14
fxtod %f6,%f6
ldd [%l1+%l6],%f16 ! load a[j] in double format
fxtod %f8,%f8
ldd [%l2+%l6],%f18
fxtod %f10,%f10
ldd [%l3+%l6],%f20 ! load n[j] in double format
fxtod %f12,%f12
ldd [%l4+%l6],%f22
fxtod %f14,%f14
fmuld %f16,%f0,%f32
fmuld %f20,%f8,%f48
fmuld %f16,%f2,%f34
fmuld %f20,%f10,%f50
fmuld %f16,%f4,%f36
faddd %f32,%f48,%f48
fmuld %f20,%f12,%f52
fmuld %f16,%f6,%f38
faddd %f34,%f50,%f50
fmuld %f20,%f14,%f54
fmuld %f18,%f0,%f40
faddd %f36,%f52,%f52
fmuld %f22,%f8,%f56
fmuld %f18,%f2,%f42
faddd %f38,%f54,%f54
fmuld %f22,%f10,%f58
fmuld %f18,%f4,%f44
faddd %f40,%f56,%f56
fmuld %f22,%f12,%f60
fmuld %f18,%f6,%f46
faddd %f42,%f58,%f58
fmuld %f22,%f14,%f62
faddd %f44,%f60,%f24 ! %f60
faddd %f46,%f62,%f26 ! %f62
faddd %f52,%f56,%f52
faddd %f54,%f58,%f54
fdtox %f48,%f48
fdtox %f50,%f50
fdtox %f52,%f52
fdtox %f54,%f54
std %f48,[%sp+STACK_BIAS+STACK_FRAME+0]
std %f50,[%sp+STACK_BIAS+STACK_FRAME+8]
std %f52,[%sp+STACK_BIAS+STACK_FRAME+16]
add %l6,8,%l6
std %f54,[%sp+STACK_BIAS+STACK_FRAME+24]
ldd [%l1+%l6],%f16 ! load a[j] in double format
ldd [%l2+%l6],%f18
ldd [%l3+%l6],%f20 ! load n[j] in double format
ldd [%l4+%l6],%f22
fmuld %f16,%f0,%f32
fmuld %f20,%f8,%f48
fmuld %f16,%f2,%f34
fmuld %f20,%f10,%f50
fmuld %f16,%f4,%f36
ldx [%sp+STACK_BIAS+STACK_FRAME+0],%o0
faddd %f32,%f48,%f48
fmuld %f20,%f12,%f52
ldx [%sp+STACK_BIAS+STACK_FRAME+8],%o1
fmuld %f16,%f6,%f38
ldx [%sp+STACK_BIAS+STACK_FRAME+16],%o2
faddd %f34,%f50,%f50
fmuld %f20,%f14,%f54
ldx [%sp+STACK_BIAS+STACK_FRAME+24],%o3
fmuld %f18,%f0,%f40
srlx %o0,16,%o7
faddd %f36,%f52,%f52
fmuld %f22,%f8,%f56
add %o7,%o1,%o1
fmuld %f18,%f2,%f42
srlx %o1,16,%o7
faddd %f38,%f54,%f54
fmuld %f22,%f10,%f58
add %o7,%o2,%o2
fmuld %f18,%f4,%f44
srlx %o2,16,%o7
faddd %f40,%f56,%f56
fmuld %f22,%f12,%f60
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
! why? because this word is known to be 0 mod 2^64 by choice of m
! (tp[0]+a[0]*b[i]+m*n[0] == 0 mod 2^64), yet it must be assembled
! anyway: the carry out of the tp[0] addition below is needed
and %o0,%l7,%o0
fmuld %f18,%f6,%f46
and %o1,%l7,%o1
and %o2,%l7,%o2
faddd %f42,%f58,%f58
fmuld %f22,%f14,%f62
sllx %o1,16,%o1
faddd %f24,%f48,%f48
sllx %o2,32,%o2
faddd %f26,%f50,%f50
sllx %o3,48,%o7
or %o1,%o0,%o0
faddd %f44,%f60,%f24 ! %f60
or %o2,%o0,%o0
faddd %f46,%f62,%f26 ! %f62
or %o7,%o0,%o0 ! 64-bit result
ldx [%l0],%o7
faddd %f52,%f56,%f52
addcc %o7,%o0,%o0
! end-of-why? (only the carry in %g1 survives this block)
faddd %f54,%f58,%f54
srlx %o3,16,%g1 ! 34-bit carry
fdtox %f48,%f48
bcs,a %xcc,.+8
add %g1,1,%g1
fdtox %f50,%f50
fdtox %f52,%f52
fdtox %f54,%f54
std %f48,[%sp+STACK_BIAS+STACK_FRAME+0]
std %f50,[%sp+STACK_BIAS+STACK_FRAME+8]
addcc %l6,8,%l6
std %f52,[%sp+STACK_BIAS+STACK_FRAME+16]
bz,pn %icc,.Linnerskip
std %f54,[%sp+STACK_BIAS+STACK_FRAME+24]
ba .Linner
nop
.align 32
.Linner:
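! inner loop: tp[j-1] = tp[j] + a[j]*b[i] + m*n[j] + carry; writing
! the result one slot down is what divides tp by 2^64 each round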
ldd [%l1+%l6],%f16 ! load a[j] in double format
ldd [%l2+%l6],%f18
ldd [%l3+%l6],%f20 ! load n[j] in double format
ldd [%l4+%l6],%f22
fmuld %f16,%f0,%f32
fmuld %f20,%f8,%f48
fmuld %f16,%f2,%f34
fmuld %f20,%f10,%f50
fmuld %f16,%f4,%f36
ldx [%sp+STACK_BIAS+STACK_FRAME+0],%o0
faddd %f32,%f48,%f48
fmuld %f20,%f12,%f52
ldx [%sp+STACK_BIAS+STACK_FRAME+8],%o1
fmuld %f16,%f6,%f38
ldx [%sp+STACK_BIAS+STACK_FRAME+16],%o2
faddd %f34,%f50,%f50
fmuld %f20,%f14,%f54
ldx [%sp+STACK_BIAS+STACK_FRAME+24],%o3
fmuld %f18,%f0,%f40
srlx %o0,16,%o7
faddd %f36,%f52,%f52
fmuld %f22,%f8,%f56
add %o7,%o1,%o1
fmuld %f18,%f2,%f42
srlx %o1,16,%o7
faddd %f38,%f54,%f54
fmuld %f22,%f10,%f58
add %o7,%o2,%o2
fmuld %f18,%f4,%f44
srlx %o2,16,%o7
faddd %f40,%f56,%f56
fmuld %f22,%f12,%f60
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,%l7,%o0
fmuld %f18,%f6,%f46
and %o1,%l7,%o1
and %o2,%l7,%o2
faddd %f42,%f58,%f58
fmuld %f22,%f14,%f62
sllx %o1,16,%o1
faddd %f24,%f48,%f48
sllx %o2,32,%o2
faddd %f26,%f50,%f50
sllx %o3,48,%o7
or %o1,%o0,%o0
faddd %f44,%f60,%f24 ! %f60
or %o2,%o0,%o0
faddd %f46,%f62,%f26 ! %f62
or %o7,%o0,%o0 ! 64-bit result
faddd %f52,%f56,%f52
addcc %g1,%o0,%o0
ldx [%l0+8],%o7 ! tp[j]
faddd %f54,%f58,%f54
srlx %o3,16,%g1 ! 34-bit carry
fdtox %f48,%f48
bcs,a %xcc,.+8
add %g1,1,%g1
fdtox %f50,%f50
addcc %o7,%o0,%o0
fdtox %f52,%f52
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[%l0] ! tp[j-1]
fdtox %f54,%f54
std %f48,[%sp+STACK_BIAS+STACK_FRAME+0]
std %f50,[%sp+STACK_BIAS+STACK_FRAME+8]
std %f52,[%sp+STACK_BIAS+STACK_FRAME+16]
addcc %l6,8,%l6
std %f54,[%sp+STACK_BIAS+STACK_FRAME+24]
bnz,pt %icc,.Linner
add %l0,8,%l0
.Linnerskip:
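! tail of the inner pass: as in .L1stskip, except that the previous
! top word %i4 is folded into tp[num-1] and a new top carry replaces it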
fdtox %f24,%f24
fdtox %f26,%f26
ldx [%sp+STACK_BIAS+STACK_FRAME+0],%o0
ldx [%sp+STACK_BIAS+STACK_FRAME+8],%o1
ldx [%sp+STACK_BIAS+STACK_FRAME+16],%o2
ldx [%sp+STACK_BIAS+STACK_FRAME+24],%o3
srlx %o0,16,%o7
std %f24,[%sp+STACK_BIAS+STACK_FRAME+32]
add %o7,%o1,%o1
std %f26,[%sp+STACK_BIAS+STACK_FRAME+40]
srlx %o1,16,%o7
add %o7,%o2,%o2
srlx %o2,16,%o7
add %o7,%o3,%o3 ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
and %o0,%l7,%o0
and %o1,%l7,%o1
and %o2,%l7,%o2
sllx %o1,16,%o1
sllx %o2,32,%o2
sllx %o3,48,%o7
or %o1,%o0,%o0
or %o2,%o0,%o0
ldx [%sp+STACK_BIAS+STACK_FRAME+32],%o4
or %o7,%o0,%o0 ! 64-bit result
ldx [%sp+STACK_BIAS+STACK_FRAME+40],%o5
addcc %g1,%o0,%o0
ldx [%l0+8],%o7 ! tp[j]
srlx %o3,16,%g1 ! 34-bit carry
bcs,a %xcc,.+8
add %g1,1,%g1
addcc %o7,%o0,%o0
bcs,a %xcc,.+8
add %g1,1,%g1
stx %o0,[%l0] ! tp[j-1]
add %l0,8,%l0
srlx %o4,16,%o7
add %o7,%o5,%o5
and %o4,%l7,%o4
sllx %o5,16,%o7
or %o7,%o4,%o4
addcc %g1,%o4,%o4
srlx %o5,48,%g1
bcs,a %xcc,.+8
add %g1,1,%g1
addcc %i4,%o4,%o4
stx %o4,[%l0] ! tp[num-1]
mov %g1,%i4
bcs,a %xcc,.+8
add %i4,1,%i4
addcc %l5,8,%l5
bnz %icc,.Louter
nop
add %l0,8,%l0 ! adjust tp to point at the end
orn %g0,%g0,%g4
sub %g0,%i5,%o7 ! n=-num
ba .Lsub
subcc %g0,%g0,%g0 ! clear %icc.c
.align 32
.Lsub:
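! tp-n -> rp, 32 bits at a time (BN_ULONG is 32-bit here), keeping
! the borrow; whether rp or tp is the final result is decided below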
ldx [%l0+%o7],%o0
add %i3,%o7,%g1
ld [%g1+0],%o2
ld [%g1+4],%o3
srlx %o0,32,%o1
subccc %o0,%o2,%o2
add %i0,%o7,%g1
subccc %o1,%o3,%o3
st %o2,[%g1+0]
add %o7,8,%o7
brnz,pt %o7,.Lsub
st %o3,[%g1+4]
subc %i4,0,%g4 ! mask: -1 if tp<n (keep tp), 0 if the difference wins
sub %g0,%i5,%o7 ! n=-num
ba .Lcopy
nop
.align 32
.Lcopy:
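! branchless select: %g4 masks between tp[j] (kept when tp<n) and
! the difference already sitting in rp; tp[] is zapped on the fly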
ldx [%l0+%o7],%o0
add %i0,%o7,%g1
ld [%g1+0],%o2
ld [%g1+4],%o3
stx %g0,[%l0+%o7]
and %o0,%g4,%o0
srlx %o0,32,%o1
andn %o2,%g4,%o2
andn %o3,%g4,%o3
or %o2,%o0,%o0
or %o3,%o1,%o1
st %o0,[%g1+0]
add %o7,8,%o7
brnz,pt %o7,.Lcopy
st %o1,[%g1+4]
sub %g0,%i5,%o7 ! n=-num
.Lzap:
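! wipe the double-format copies of a[] and n[] off the stack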
stx %g0,[%l1+%o7]
stx %g0,[%l2+%o7]
stx %g0,[%l3+%o7]
stx %g0,[%l4+%o7]
add %o7,8,%o7
brnz,pt %o7,.Lzap
nop
ldx [%sp+STACK_BIAS+STACK_FRAME+48],%o7
wr %g0,%o7,%asi ! restore %asi
mov 1,%i0
.Lret:
ret
restore
.type bn_mul_mont_fpu,#function
.size bn_mul_mont_fpu,(.-bn_mul_mont_fpu)
.asciz "Montgomery Multiplication for UltraSPARC, CRYPTOGAMS by <appro@openssl.org>"
.align 32