#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif
/*
 * bn_mul_mont -- word-serial Montgomery multiplication for Alpha
 * (purpose taken from the identification string at the end of this file).
 *
 * NOTE(review): presumably this implements the OpenSSL prototype
 *   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 *                   const BN_ULONG *np, const BN_ULONG *n0, int num);
 * -- confirm against the C caller.  The names rp/ap/bp/np/tp/num below
 * follow the comments already present in this code.
 *
 * Arguments as used by the code (64-bit words throughout):
 *   a0  rp   result vector, num words are written
 *   a1  ap   first operand, num words are read
 *   a2  bp   second operand, num words are read (one per outer pass)
 *   a3  np   modulus, num words are read
 *   a4       pointer to one word (n0); replaced by that word early on
 *   a5  num  word count, treated as a signed 32-bit value
 *
 * Return value (v0): 0 if num < 4 (nothing is read or written),
 *                    1 otherwise.
 *
 * Scratch vector tp: carved out of the stack below the 48-byte frame,
 * 8*num+16 bytes, then sp is rounded down to a multiple of 4096.
 * tp[0..num] are used; fp keeps the frame address so the epilogue can
 * undo the variable-size allocation.
 *
 * Register roles inside the loops:
 *   t0/t1    running low word / carry word of the ap[j]*bp[i] column
 *   t2/t3    running low word / carry word of the np[j]*m column
 *   t4       &ap[j], then ap[j]
 *   t5       &bp[i], then bp[i]
 *   t6       &np[j], then np[j]
 *   t7       tp cursor
 *   t8/t9    low/high half of the ap[j]*bp[i] product in flight
 *   t10/t11  low/high half of the np[j]*m product in flight
 *   t12      tp word being added in, also reused as loop condition
 *   s3       i, outer index over bp
 *   s4       j, inner index over ap/np (reused as a temporary in the
 *            outer-loop tail)
 *   s5       m = (low word of the column 0 sum) * n0, per outer pass
 *   AT, v0   carry/borrow and loop-condition scratch
 *
 * Carries are derived with cmpult after each addq, as the code has no
 * other carry mechanism: after x = a + b, (x < b) is the carry out.
 */
.text
.set noat
.set noreorder
.globl bn_mul_mont
.align 5
.ent bn_mul_mont
bn_mul_mont:
/* Prologue: 48-byte frame holding ra, s3, s4, s5 and fp. */
lda sp,-48(sp)
stq ra,0(sp)
stq s3,8(sp)
stq s4,16(sp)
stq s5,24(sp)
stq fp,32(sp)
mov sp,fp # fp = frame base, sp is moved again below
.mask 0x0400f000,-48
.frame fp,48,ra
.prologue 0
.align 4
.set reorder
sextl a5,a5 # sign-extend the 32-bit num to 64 bits
mov 0,v0 # return value for the early exit
cmplt a5,4,AT
bne AT,.Lexit # num < 4: return 0 without touching memory
/*
 * Set-up and column 0 of the first pass (i = 0).  The loads and the
 * stack allocation are interleaved.
 */
ldq t1,0(a1) # ap[0]
s8addq a5,16,AT # AT = 8*num + 16 bytes for tp
ldq t4,8(a1) # ap[1]
subq sp,AT,sp # allocate tp below the frame
ldq t5,0(a2) # bp[0]
lda AT,-4096(zero) # mov -4096,AT
ldq a4,0(a4) # a4 = n0 word (pointer no longer needed)
and sp,AT,sp # round tp down to a 4096-byte boundary
mulq t1,t5,t0 # t0 = lo(ap[0]*bp[0])
ldq t3,0(a3) # np[0]
umulh t1,t5,t1 # t1 = hi(ap[0]*bp[0])
ldq t6,8(a3) # np[1]
mulq t0,a4,s5 # m = t0 * n0 (low 64 bits)
mulq t3,s5,t2 # t2 = lo(np[0]*m)
umulh t3,s5,t3 # t3 = hi(np[0]*m)
/*
 * Only the carry out of lo(np[0]*m) + t0 is kept; the sum in t2 is
 * overwritten in the loop before anything is stored.
 */
addq t2,t0,t2
cmpult t2,t0,AT
addq t3,AT,t3
/* Start the products for column 1 and point t4/t6 at column 2. */
mulq t4,t5,t8 # t8 = lo(ap[1]*bp[0])
mov 2,s4 # j = 2
umulh t4,t5,t9 # t9 = hi(ap[1]*bp[0])
mov sp,t7 # tp cursor = &tp[0]
mulq t6,s5,t10 # t10 = lo(np[1]*m)
s8addq s4,a1,t4 # t4 = &ap[j]
umulh t6,s5,t11 # t11 = hi(np[1]*m)
s8addq s4,a3,t6 # t6 = &np[j]
.align 4
/*
 * First-pass loop.  Each iteration loads ap[j]/np[j] and issues their
 * multiplies while finishing the sums for the previous column, whose
 * products are already in t8..t11.  The finished low word is stored
 * one slot behind the advanced cursor.  Instruction order is fixed
 * (noreorder); the nop/unop before the branch are padding.
 */
.L1st:
.set noreorder
ldq t4,0(t4) # t4 = ap[j]
addl s4,1,s4 # j++
ldq t6,0(t6) # t6 = np[j]
lda t7,8(t7) # advance tp cursor
addq t8,t1,t0 # t0 = pending ap low half + carry word
mulq t4,t5,t8 # next t8 = lo(ap[j]*bp[0])
cmpult t0,t1,AT # AT = carry out of t0
addq t10,t3,t2 # t2 = pending np low half + carry word
mulq t6,s5,t10 # next t10 = lo(np[j]*m)
addq t9,AT,t1 # t1 = pending ap high half + carry
cmpult t2,t3,v0 # v0 = carry out of t2
cmplt s4,a5,t12 # t12 = (j < num), loop condition
umulh t4,t5,t9 # next t9 = hi(ap[j]*bp[0])
addq t11,v0,t3 # t3 = pending np high half + carry
addq t2,t0,t2 # t2 = np column + ap column
s8addq s4,a1,t4 # t4 = &ap[j] for the next iteration
umulh t6,s5,t11 # next t11 = hi(np[j]*m)
cmpult t2,t0,v0 # v0 = carry out of the column sum
addq t3,v0,t3 # fold it into the np carry word
s8addq s4,a3,t6 # t6 = &np[j] for the next iteration
stq t2,-8(t7) # store finished word behind the cursor
nop
unop
bne t12,.L1st
.set reorder
/* First-pass tail: finish the last column still held in t8..t11. */
addq t8,t1,t0
addq t10,t3,t2
cmpult t0,t1,AT
cmpult t2,t3,v0
addq t9,AT,t1
addq t11,v0,t3
addq t2,t0,t2
cmpult t2,t0,v0
addq t3,v0,t3
stq t2,0(t7) # last low word
addq t3,t1,t3 # sum of the two carry words
cmpult t3,t1,AT # AT = carry out of that sum
stq t3,8(t7) # next-to-top word
stq AT,16(t7) # top word: 0 or 1
mov 1,s3 # i = 1
.align 4
/*
 * Outer loop, one pass per bp[i] for i = 1 .. num-1.  Same shape as the
 * first pass, except that the previous contents of tp are added in.
 */
.Louter:
s8addq s3,a2,t5 # t5 = &bp[i]
ldq t1,0(a1) # ap[0]
ldq t4,8(a1) # ap[1]
ldq t5,0(t5) # t5 = bp[i]
ldq t3,0(a3) # np[0]
ldq t6,8(a3) # np[1]
ldq t12,0(sp) # tp[0]
mulq t1,t5,t0 # t0 = lo(ap[0]*bp[i])
umulh t1,t5,t1 # t1 = hi(ap[0]*bp[i])
addq t0,t12,t0 # t0 += tp[0]
cmpult t0,t12,AT
addq t1,AT,t1 # carry into the high half
mulq t0,a4,s5 # m = t0 * n0 (low 64 bits)
mulq t3,s5,t2 # t2 = lo(np[0]*m)
umulh t3,s5,t3 # t3 = hi(np[0]*m)
addq t2,t0,t2 # only the carry of this sum is kept
cmpult t2,t0,AT
mov 2,s4 # j = 2
addq t3,AT,t3
mulq t4,t5,t8 # t8 = lo(ap[1]*bp[i])
mov sp,t7 # tp cursor = &tp[0]
umulh t4,t5,t9 # t9 = hi(ap[1]*bp[i])
mulq t6,s5,t10 # t10 = lo(np[1]*m)
s8addq s4,a1,t4 # t4 = &ap[j]
umulh t6,s5,t11 # t11 = hi(np[1]*m)
.align 4
/*
 * Inner loop.  Reads the tp word one slot ahead of the cursor, adds it
 * to the ap column, and stores the finished word one slot behind the
 * advanced cursor, so tp moves down by one word per outer pass.
 * NOTE(review): the L0/L1/U0/U1 tags look like issue-slot annotations
 * for the hand scheduling -- confirm against the target CPU manual.
 * Instruction order is fixed (noreorder).
 */
.Linner:
.set noreorder
ldq t12,8(t7) #L0 # t12 = tp word one past the cursor
nop #U1
ldq t4,0(t4) #L1 # t4 = ap[j]
s8addq s4,a3,t6 #U0 # t6 = &np[j]
ldq t6,0(t6) #L0 # t6 = np[j]
nop #U1
addq t8,t1,t0 #L1 # t0 = pending ap low half + carry word
lda t7,8(t7) # advance tp cursor
mulq t4,t5,t8 #U1 # next t8 = lo(ap[j]*bp[i])
cmpult t0,t1,AT #L0 # AT = carry out of t0
addq t10,t3,t2 #L1 # t2 = pending np low half + carry word
addl s4,1,s4 # j++
mulq t6,s5,t10 #U1 # next t10 = lo(np[j]*m)
addq t9,AT,t1 #L0 # t1 = pending ap high half + carry
addq t0,t12,t0 #L1 # add the old tp word
cmpult t2,t3,v0 #U0 # v0 = carry out of t2
umulh t4,t5,t9 #U1 # next t9 = hi(ap[j]*bp[i])
cmpult t0,t12,AT #L0 # AT = carry from adding the tp word
addq t2,t0,t2 #L1 # t2 = np column + ap column
addq t11,v0,t3 #U0 # t3 = pending np high half + carry
umulh t6,s5,t11 #U1 # next t11 = hi(np[j]*m)
s8addq s4,a1,t4 #L0 # t4 = &ap[j] for the next iteration
cmpult t2,t0,v0 #L1 # v0 = carry out of the column sum
cmplt s4,a5,t12 #U0 # borrow t12
addq t1,AT,t1 #L0 # fold tp carry into the ap carry word
addq t3,v0,t3 #U1 # fold column carry into the np carry word
stq t2,-8(t7) #L1 # store finished word behind the cursor
bne t12,.Linner #U0
.set reorder
/*
 * Outer-loop tail: finish the last column, then add the two carry
 * words and the old top word of tp to form the new two top words.
 */
ldq t12,8(t7) # old tp word for the last column
addq t8,t1,t0
addq t10,t3,t2
cmpult t0,t1,AT
cmpult t2,t3,v0
addq t9,AT,t1
addq t11,v0,t3
addq t0,t12,t0 # add the old tp word
cmpult t0,t12,AT
addq t1,AT,t1
ldq t12,16(t7) # old top word of tp
addq t2,t0,s4 # s4 is free here; j is reset per pass
cmpult s4,t0,v0
addq t3,v0,t3
addq t3,t1,t2 # sum of the two carry words
stq s4,0(t7) # last low word
cmpult t2,t1,t3 # t3 = carry out of that sum
addq t2,t12,t2 # add the old top word
cmpult t2,t12,AT
addl s3,1,s3 # i++
addq t3,AT,t3 # t3 = new top word
stq t2,8(t7) # next-to-top word
cmplt s3,a5,t12 # borrow t12
stq t3,16(t7) # top word; t3 stays live for the select below
bne t12,.Louter
/*
 * Final reduction, step 1: rp[] = tp[] - np[] over num words with
 * borrow in t1.  rp is saved in a2 because a0 is advanced.
 */
s8addq a5,sp,t12 # &tp[num]
mov a0,a2 # put rp aside
mov sp,t7
mov sp,a1 # a1 is not read again in this function
mov 0,t1 # clear borrow bit
.align 4
.Lsub: ldq t0,0(t7)
ldq t2,0(a3)
lda t7,8(t7)
lda a3,8(a3)
subq t0,t2,t2 # tp[i]-np[i]
cmpult t0,t2,AT # AT = borrow out of that subtraction
subq t2,t1,t0 # subtract the incoming borrow
cmpult t2,t0,t1 # borrow out of the second subtraction
or t1,AT,t1 # combined borrow for the next word
stq t0,0(a0) # rp[i] = difference
cmpult t7,t12,v0 # more words left?
lda a0,8(a0)
bne v0,.Lsub
/*
 * Step 2: t1 = top word - final borrow.  When t1 is zero the
 * difference already in rp is kept; otherwise tp is copied over it.
 * The choice is made per word with cmoveq, not with a branch.
 * The loop also zeroes tp[0..num-1]; the word at tp[num] is beyond
 * the loop bound and is left as written.
 */
subq t3,t1,t1 # handle upmost overflow bit
mov sp,t7
mov a2,a0 # restore rp
.align 4
.Lcopy: ldq t4,0(t7) # conditional copy
ldq t6,0(a0) # t6 = rp[i], the difference stored above
lda t7,8(t7)
lda a0,8(a0)
cmoveq t1,t6,t4 # t1 == 0: take the difference instead of tp[i]
stq zero,-8(t7) # zap tp
cmpult t7,t12,AT # more words left?
stq t4,-8(a0) # rp[i] = selected word
bne AT,.Lcopy
mov 1,v0 # return 1
/*
 * Epilogue, shared with the early exit.  sp is restored from fp, which
 * also releases tp.  ra is not reloaded: nothing in this function
 * writes it after the prologue saved it.
 */
.Lexit:
.set noreorder
mov fp,sp
/*ldq ra,0(sp)*/
ldq s3,8(sp)
ldq s4,16(sp)
ldq s5,24(sp)
ldq fp,32(sp)
lda sp,48(sp)
ret (ra)
.end bn_mul_mont
/*
 * Identification string, emitted after the function in whatever section
 * is current at this point.  It is data, not code, and nothing in this
 * file references it.  .ascii does not append a terminating NUL; the
 * .align that follows pads the location counter after the string.
 */
.ascii "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro@openssl.org>"
.align 2