#include <machine/asm.h>
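/*
 * Poly1305 message authentication, x86_64.
 * Perlasm-style generated code (apparently from OpenSSL's
 * poly1305-x86_64.pl).  It provides a base 2^64 scalar path plus
 * base 2^26 AVX/AVX2/AVX-512 paths and a base 2^44 AVX-512 IFMA
 * (vpmadd52) path; poly1305_init() picks the implementation via
 * OPENSSL_ia32cap_P and returns the blocks/emit entry points.
 */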
.text
.globl poly1305_init
.hidden poly1305_init
.globl poly1305_blocks
.hidden poly1305_blocks
.globl poly1305_emit
.hidden poly1305_emit
.type poly1305_init,@function
.align 32
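/*
 * int poly1305_init(void *ctx = %rdi, const uint8_t key[32] = %rsi,
 *                   void *func[2] = %rdx)
 * Zeroes the accumulator h at 0/8/16(ctx).  If key is NULL, returns 0.
 * Otherwise clamps the first 16 key bytes into r at 24/32(ctx), picks
 * the blocks/emit implementation from OPENSSL_ia32cap_P (AVX, AVX2, or
 * the base 2^44 IFMA init when the AVX512F/IFMA/VL bits are all set),
 * stores the two function pointers through func[], and returns 1.
 */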
poly1305_init:
.cfi_startproc
xorq %rax,%rax
movq %rax,0(%rdi)
movq %rax,8(%rdi)
movq %rax,16(%rdi)
cmpq $0,%rsi
je .Lno_key
leaq poly1305_blocks(%rip),%r10
leaq poly1305_emit(%rip),%r11
movq OPENSSL_ia32cap_P+4(%rip),%r9
leaq poly1305_blocks_avx(%rip),%rax
leaq poly1305_emit_avx(%rip),%rcx
btq $28,%r9
cmovcq %rax,%r10
cmovcq %rcx,%r11
leaq poly1305_blocks_avx2(%rip),%rax
btq $37,%r9
cmovcq %rax,%r10
movq $2149646336,%rax
shrq $32,%r9
andq %rax,%r9
cmpq %rax,%r9
je .Linit_base2_44
movq $0x0ffffffc0fffffff,%rax
movq $0x0ffffffc0ffffffc,%rcx
andq 0(%rsi),%rax
andq 8(%rsi),%rcx
movq %rax,24(%rdi)
movq %rcx,32(%rdi)
movq %r10,0(%rdx)
movq %r11,8(%rdx)
movl $1,%eax
.Lno_key:
.byte 0xf3,0xc3 /* repz ret */
.cfi_endproc
.size poly1305_init,.-poly1305_init
.type poly1305_blocks,@function
.align 32
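/*
 * void poly1305_blocks(void *ctx = %rdi, const uint8_t *inp = %rsi,
 *                      size_t len = %rdx, uint32_t padbit = %rcx)
 * Scalar base 2^64 path.  Register use in the loop:
 *   h = %r14:%rbx:%rbp (130-bit accumulator)
 *   r0 = %r11, r1 = %r12 (also kept in %rax), s1 = r1 + (r1>>2) = %r13
 */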
poly1305_blocks:
.cfi_startproc
.Lblocks:
shrq $4,%rdx
jz .Lno_data
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lblocks_body:
movq %rdx,%r15
movq 24(%rdi),%r11
movq 32(%rdi),%r13
movq 0(%rdi),%r14
movq 8(%rdi),%rbx
movq 16(%rdi),%rbp
movq %r13,%r12
shrq $2,%r13
movq %r12,%rax
addq %r12,%r13
jmp .Loop
.align 32
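/*
 * Per-block step: h += block (padbit supplies bit 128), then
 * h = h*r mod 2^130-5.  s1 = r1 + (r1>>2) folds the 2^128 overflow term
 * (valid because clamping makes r1 a multiple of 4), and the top limb is
 * reduced as h0 += 5*(top>>2), h2 = top & 3.
 */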
.Loop:
addq 0(%rsi),%r14
adcq 8(%rsi),%rbx
leaq 16(%rsi),%rsi
adcq %rcx,%rbp
mulq %r14
movq %rax,%r9
movq %r11,%rax
movq %rdx,%r10
mulq %r14
movq %rax,%r14
movq %r11,%rax
movq %rdx,%r8
mulq %rbx
addq %rax,%r9
movq %r13,%rax
adcq %rdx,%r10
mulq %rbx
movq %rbp,%rbx
addq %rax,%r14
adcq %rdx,%r8
imulq %r13,%rbx
addq %rbx,%r9
movq %r8,%rbx
adcq $0,%r10
imulq %r11,%rbp
addq %r9,%rbx
movq $-4,%rax
adcq %rbp,%r10
andq %r10,%rax
movq %r10,%rbp
shrq $2,%r10
andq $3,%rbp
addq %r10,%rax
addq %rax,%r14
adcq $0,%rbx
adcq $0,%rbp
movq %r12,%rax
decq %r15
jnz .Loop
movq %r14,0(%rdi)
movq %rbx,8(%rdi)
movq %rbp,16(%rdi)
movq 0(%rsp),%r15
.cfi_restore %r15
movq 8(%rsp),%r14
.cfi_restore %r14
movq 16(%rsp),%r13
.cfi_restore %r13
movq 24(%rsp),%r12
.cfi_restore %r12
movq 32(%rsp),%rbp
.cfi_restore %rbp
movq 40(%rsp),%rbx
.cfi_restore %rbx
leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lno_data:
.Lblocks_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_blocks,.-poly1305_blocks
.type poly1305_emit,@function
.align 32
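/*
 * void poly1305_emit(void *ctx = %rdi, uint8_t mac[16] = %rsi,
 *                    const uint32_t nonce[4] = %rdx)
 * Final reduction: compute h+5 and keep it iff it carries past 2^130
 * (bit 2 of the top limb after the add), then add the 128-bit nonce and
 * store the 16-byte tag.
 */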
poly1305_emit:
.cfi_startproc
.Lemit:
movq 0(%rdi),%r8
movq 8(%rdi),%r9
movq 16(%rdi),%r10
movq %r8,%rax
addq $5,%r8
movq %r9,%rcx
adcq $0,%r9
adcq $0,%r10
shrq $2,%r10
cmovnzq %r8,%rax
cmovnzq %r9,%rcx
addq 0(%rdx),%rax
adcq 8(%rdx),%rcx
movq %rax,0(%rsi)
movq %rcx,8(%rsi)
.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_emit,.-poly1305_emit
.type __poly1305_block,@function
.align 32
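/*
 * One h = h*r mod 2^130-5 step, shared by the scalar/vector glue code.
 * Same register conventions as poly1305_blocks; expects %rax = r1 on
 * entry and clobbers %rax, %rdx, %r8-%r10.
 */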
__poly1305_block:
.cfi_startproc
mulq %r14
movq %rax,%r9
movq %r11,%rax
movq %rdx,%r10
mulq %r14
movq %rax,%r14
movq %r11,%rax
movq %rdx,%r8
mulq %rbx
addq %rax,%r9
movq %r13,%rax
adcq %rdx,%r10
mulq %rbx
movq %rbp,%rbx
addq %rax,%r14
adcq %rdx,%r8
imulq %r13,%rbx
addq %rbx,%r9
movq %r8,%rbx
adcq $0,%r10
imulq %r11,%rbp
addq %r9,%rbx
movq $-4,%rax
adcq %rbp,%r10
andq %r10,%rax
movq %r10,%rbp
shrq $2,%r10
andq $3,%rbp
addq %r10,%rax
addq %rax,%r14
adcq $0,%rbx
adcq $0,%rbp
.byte 0xf3,0xc3
.cfi_endproc
.size __poly1305_block,.-__poly1305_block
.type __poly1305_init_avx,@function
.align 32
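/*
 * Computes r^2, r^3 and r^4 with __poly1305_block and stores r..r^4
 * (together with the 5*limb multiples) as base 2^26 limbs in the
 * context, for use by the vector code paths.  On entry %r11 = r0,
 * %r12 = r1, %r13 = s1.
 */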
__poly1305_init_avx:
.cfi_startproc
movq %r11,%r14
movq %r12,%rbx
xorq %rbp,%rbp
leaq 48+64(%rdi),%rdi
movq %r12,%rax
call __poly1305_block
movl $0x3ffffff,%eax
movl $0x3ffffff,%edx
movq %r14,%r8
andl %r14d,%eax
movq %r11,%r9
andl %r11d,%edx
movl %eax,-64(%rdi)
shrq $26,%r8
movl %edx,-60(%rdi)
shrq $26,%r9
movl $0x3ffffff,%eax
movl $0x3ffffff,%edx
andl %r8d,%eax
andl %r9d,%edx
movl %eax,-48(%rdi)
leal (%rax,%rax,4),%eax
movl %edx,-44(%rdi)
leal (%rdx,%rdx,4),%edx
movl %eax,-32(%rdi)
shrq $26,%r8
movl %edx,-28(%rdi)
shrq $26,%r9
movq %rbx,%rax
movq %r12,%rdx
shlq $12,%rax
shlq $12,%rdx
orq %r8,%rax
orq %r9,%rdx
andl $0x3ffffff,%eax
andl $0x3ffffff,%edx
movl %eax,-16(%rdi)
leal (%rax,%rax,4),%eax
movl %edx,-12(%rdi)
leal (%rdx,%rdx,4),%edx
movl %eax,0(%rdi)
movq %rbx,%r8
movl %edx,4(%rdi)
movq %r12,%r9
movl $0x3ffffff,%eax
movl $0x3ffffff,%edx
shrq $14,%r8
shrq $14,%r9
andl %r8d,%eax
andl %r9d,%edx
movl %eax,16(%rdi)
leal (%rax,%rax,4),%eax
movl %edx,20(%rdi)
leal (%rdx,%rdx,4),%edx
movl %eax,32(%rdi)
shrq $26,%r8
movl %edx,36(%rdi)
shrq $26,%r9
movq %rbp,%rax
shlq $24,%rax
orq %rax,%r8
movl %r8d,48(%rdi)
leaq (%r8,%r8,4),%r8
movl %r9d,52(%rdi)
leaq (%r9,%r9,4),%r9
movl %r8d,64(%rdi)
movl %r9d,68(%rdi)
movq %r12,%rax
call __poly1305_block
movl $0x3ffffff,%eax
movq %r14,%r8
andl %r14d,%eax
shrq $26,%r8
movl %eax,-52(%rdi)
movl $0x3ffffff,%edx
andl %r8d,%edx
movl %edx,-36(%rdi)
leal (%rdx,%rdx,4),%edx
shrq $26,%r8
movl %edx,-20(%rdi)
movq %rbx,%rax
shlq $12,%rax
orq %r8,%rax
andl $0x3ffffff,%eax
movl %eax,-4(%rdi)
leal (%rax,%rax,4),%eax
movq %rbx,%r8
movl %eax,12(%rdi)
movl $0x3ffffff,%edx
shrq $14,%r8
andl %r8d,%edx
movl %edx,28(%rdi)
leal (%rdx,%rdx,4),%edx
shrq $26,%r8
movl %edx,44(%rdi)
movq %rbp,%rax
shlq $24,%rax
orq %rax,%r8
movl %r8d,60(%rdi)
leaq (%r8,%r8,4),%r8
movl %r8d,76(%rdi)
movq %r12,%rax
call __poly1305_block
movl $0x3ffffff,%eax
movq %r14,%r8
andl %r14d,%eax
shrq $26,%r8
movl %eax,-56(%rdi)
movl $0x3ffffff,%edx
andl %r8d,%edx
movl %edx,-40(%rdi)
leal (%rdx,%rdx,4),%edx
shrq $26,%r8
movl %edx,-24(%rdi)
movq %rbx,%rax
shlq $12,%rax
orq %r8,%rax
andl $0x3ffffff,%eax
movl %eax,-8(%rdi)
leal (%rax,%rax,4),%eax
movq %rbx,%r8
movl %eax,8(%rdi)
movl $0x3ffffff,%edx
shrq $14,%r8
andl %r8d,%edx
movl %edx,24(%rdi)
leal (%rdx,%rdx,4),%edx
shrq $26,%r8
movl %edx,40(%rdi)
movq %rbp,%rax
shlq $24,%rax
orq %rax,%r8
movl %r8d,56(%rdi)
leaq (%r8,%r8,4),%r8
movl %r8d,72(%rdi)
leaq -48-64(%rdi),%rdi
.byte 0xf3,0xc3
.cfi_endproc
.size __poly1305_init_avx,.-__poly1305_init_avx
.type poly1305_blocks_avx,@function
.align 32
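/*
 * AVX (128-bit) path, base 2^26 limbs.  Short inputs that are still in
 * base 2^64 fall back to the scalar .Lblocks.  Otherwise any odd leading
 * block is handled in base 2^64, the accumulator is split into five
 * 26-bit limbs (20(%rdi) flags base 2^26 mode), and .Loop_avx consumes
 * 64 bytes (four blocks) per iteration using the precomputed powers of r.
 */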
poly1305_blocks_avx:
.cfi_startproc
movl 20(%rdi),%r8d
cmpq $128,%rdx
jae .Lblocks_avx
testl %r8d,%r8d
jz .Lblocks
.Lblocks_avx:
andq $-16,%rdx
jz .Lno_data_avx
vzeroupper
testl %r8d,%r8d
jz .Lbase2_64_avx
testq $31,%rdx
jz .Leven_avx
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lblocks_avx_body:
movq %rdx,%r15
movq 0(%rdi),%r8
movq 8(%rdi),%r9
movl 16(%rdi),%ebp
movq 24(%rdi),%r11
movq 32(%rdi),%r13
movl %r8d,%r14d
andq $-2147483648,%r8
movq %r9,%r12
movl %r9d,%ebx
andq $-2147483648,%r9
shrq $6,%r8
shlq $52,%r12
addq %r8,%r14
shrq $12,%rbx
shrq $18,%r9
addq %r12,%r14
adcq %r9,%rbx
movq %rbp,%r8
shlq $40,%r8
shrq $24,%rbp
addq %r8,%rbx
adcq $0,%rbp
movq $-4,%r9
movq %rbp,%r8
andq %rbp,%r9
shrq $2,%r8
andq $3,%rbp
addq %r9,%r8
addq %r8,%r14
adcq $0,%rbx
adcq $0,%rbp
movq %r13,%r12
movq %r13,%rax
shrq $2,%r13
addq %r12,%r13
addq 0(%rsi),%r14
adcq 8(%rsi),%rbx
leaq 16(%rsi),%rsi
adcq %rcx,%rbp
call __poly1305_block
testq %rcx,%rcx
jz .Lstore_base2_64_avx
movq %r14,%rax
movq %r14,%rdx
shrq $52,%r14
movq %rbx,%r11
movq %rbx,%r12
shrq $26,%rdx
andq $0x3ffffff,%rax
shlq $12,%r11
andq $0x3ffffff,%rdx
shrq $14,%rbx
orq %r11,%r14
shlq $24,%rbp
andq $0x3ffffff,%r14
shrq $40,%r12
andq $0x3ffffff,%rbx
orq %r12,%rbp
subq $16,%r15
jz .Lstore_base2_26_avx
vmovd %eax,%xmm0
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
vmovd %ebp,%xmm4
jmp .Lproceed_avx
.align 32
.Lstore_base2_64_avx:
movq %r14,0(%rdi)
movq %rbx,8(%rdi)
movq %rbp,16(%rdi)
jmp .Ldone_avx
.align 16
.Lstore_base2_26_avx:
movl %eax,0(%rdi)
movl %edx,4(%rdi)
movl %r14d,8(%rdi)
movl %ebx,12(%rdi)
movl %ebp,16(%rdi)
.align 16
.Ldone_avx:
movq 0(%rsp),%r15
.cfi_restore %r15
movq 8(%rsp),%r14
.cfi_restore %r14
movq 16(%rsp),%r13
.cfi_restore %r13
movq 24(%rsp),%r12
.cfi_restore %r12
movq 32(%rsp),%rbp
.cfi_restore %rbp
movq 40(%rsp),%rbx
.cfi_restore %rbx
leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lno_data_avx:
.Lblocks_avx_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
.align 32
.Lbase2_64_avx:
.cfi_startproc
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lbase2_64_avx_body:
movq %rdx,%r15
movq 24(%rdi),%r11
movq 32(%rdi),%r13
movq 0(%rdi),%r14
movq 8(%rdi),%rbx
movl 16(%rdi),%ebp
movq %r13,%r12
movq %r13,%rax
shrq $2,%r13
addq %r12,%r13
testq $31,%rdx
jz .Linit_avx
addq 0(%rsi),%r14
adcq 8(%rsi),%rbx
leaq 16(%rsi),%rsi
adcq %rcx,%rbp
subq $16,%r15
call __poly1305_block
.Linit_avx:
movq %r14,%rax
movq %r14,%rdx
shrq $52,%r14
movq %rbx,%r8
movq %rbx,%r9
shrq $26,%rdx
andq $0x3ffffff,%rax
shlq $12,%r8
andq $0x3ffffff,%rdx
shrq $14,%rbx
orq %r8,%r14
shlq $24,%rbp
andq $0x3ffffff,%r14
shrq $40,%r9
andq $0x3ffffff,%rbx
orq %r9,%rbp
vmovd %eax,%xmm0
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
vmovd %ebp,%xmm4
movl $1,20(%rdi)
call __poly1305_init_avx
.Lproceed_avx:
movq %r15,%rdx
movq 0(%rsp),%r15
.cfi_restore %r15
movq 8(%rsp),%r14
.cfi_restore %r14
movq 16(%rsp),%r13
.cfi_restore %r13
movq 24(%rsp),%r12
.cfi_restore %r12
movq 32(%rsp),%rbp
.cfi_restore %rbp
movq 40(%rsp),%rbx
.cfi_restore %rbx
leaq 48(%rsp),%rax
leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lbase2_64_avx_epilogue:
jmp .Ldo_avx
.cfi_endproc
.align 32
.Leven_avx:
.cfi_startproc
vmovd 0(%rdi),%xmm0
vmovd 4(%rdi),%xmm1
vmovd 8(%rdi),%xmm2
vmovd 12(%rdi),%xmm3
vmovd 16(%rdi),%xmm4
.Ldo_avx:
leaq -88(%rsp),%r11
.cfi_def_cfa %r11,0x60
subq $0x178,%rsp
subq $64,%rdx
leaq -32(%rsi),%rax
cmovcq %rax,%rsi
vmovdqu 48(%rdi),%xmm14
leaq 112(%rdi),%rdi
leaq .Lconst(%rip),%rcx
vmovdqu 32(%rsi),%xmm5
vmovdqu 48(%rsi),%xmm6
vmovdqa 64(%rcx),%xmm15
vpsrldq $6,%xmm5,%xmm7
vpsrldq $6,%xmm6,%xmm8
vpunpckhqdq %xmm6,%xmm5,%xmm9
vpunpcklqdq %xmm6,%xmm5,%xmm5
vpunpcklqdq %xmm8,%xmm7,%xmm8
vpsrlq $40,%xmm9,%xmm9
vpsrlq $26,%xmm5,%xmm6
vpand %xmm15,%xmm5,%xmm5
vpsrlq $4,%xmm8,%xmm7
vpand %xmm15,%xmm6,%xmm6
vpsrlq $30,%xmm8,%xmm8
vpand %xmm15,%xmm7,%xmm7
vpand %xmm15,%xmm8,%xmm8
vpor 32(%rcx),%xmm9,%xmm9
jbe .Lskip_loop_avx
vmovdqu -48(%rdi),%xmm11
vmovdqu -32(%rdi),%xmm12
vpshufd $0xEE,%xmm14,%xmm13
vpshufd $0x44,%xmm14,%xmm10
vmovdqa %xmm13,-144(%r11)
vmovdqa %xmm10,0(%rsp)
vpshufd $0xEE,%xmm11,%xmm14
vmovdqu -16(%rdi),%xmm10
vpshufd $0x44,%xmm11,%xmm11
vmovdqa %xmm14,-128(%r11)
vmovdqa %xmm11,16(%rsp)
vpshufd $0xEE,%xmm12,%xmm13
vmovdqu 0(%rdi),%xmm11
vpshufd $0x44,%xmm12,%xmm12
vmovdqa %xmm13,-112(%r11)
vmovdqa %xmm12,32(%rsp)
vpshufd $0xEE,%xmm10,%xmm14
vmovdqu 16(%rdi),%xmm12
vpshufd $0x44,%xmm10,%xmm10
vmovdqa %xmm14,-96(%r11)
vmovdqa %xmm10,48(%rsp)
vpshufd $0xEE,%xmm11,%xmm13
vmovdqu 32(%rdi),%xmm10
vpshufd $0x44,%xmm11,%xmm11
vmovdqa %xmm13,-80(%r11)
vmovdqa %xmm11,64(%rsp)
vpshufd $0xEE,%xmm12,%xmm14
vmovdqu 48(%rdi),%xmm11
vpshufd $0x44,%xmm12,%xmm12
vmovdqa %xmm14,-64(%r11)
vmovdqa %xmm12,80(%rsp)
vpshufd $0xEE,%xmm10,%xmm13
vmovdqu 64(%rdi),%xmm12
vpshufd $0x44,%xmm10,%xmm10
vmovdqa %xmm13,-48(%r11)
vmovdqa %xmm10,96(%rsp)
vpshufd $0xEE,%xmm11,%xmm14
vpshufd $0x44,%xmm11,%xmm11
vmovdqa %xmm14,-32(%r11)
vmovdqa %xmm11,112(%rsp)
vpshufd $0xEE,%xmm12,%xmm13
vmovdqa 0(%rsp),%xmm14
vpshufd $0x44,%xmm12,%xmm12
vmovdqa %xmm13,-16(%r11)
vmovdqa %xmm12,128(%rsp)
jmp .Loop_avx
.align 32
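/*
 * Inner AVX loop: each iteration folds 64 bytes of input into the limb
 * vectors, multiplies by the powers of r copied to the stack during
 * setup, and finishes with the lazy carry propagation across the five
 * 26-bit limbs.
 */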
.Loop_avx:
vpmuludq %xmm5,%xmm14,%xmm10
vpmuludq %xmm6,%xmm14,%xmm11
vmovdqa %xmm2,32(%r11)
vpmuludq %xmm7,%xmm14,%xmm12
vmovdqa 16(%rsp),%xmm2
vpmuludq %xmm8,%xmm14,%xmm13
vpmuludq %xmm9,%xmm14,%xmm14
vmovdqa %xmm0,0(%r11)
vpmuludq 32(%rsp),%xmm9,%xmm0
vmovdqa %xmm1,16(%r11)
vpmuludq %xmm8,%xmm2,%xmm1
vpaddq %xmm0,%xmm10,%xmm10
vpaddq %xmm1,%xmm14,%xmm14
vmovdqa %xmm3,48(%r11)
vpmuludq %xmm7,%xmm2,%xmm0
vpmuludq %xmm6,%xmm2,%xmm1
vpaddq %xmm0,%xmm13,%xmm13
vmovdqa 48(%rsp),%xmm3
vpaddq %xmm1,%xmm12,%xmm12
vmovdqa %xmm4,64(%r11)
vpmuludq %xmm5,%xmm2,%xmm2
vpmuludq %xmm7,%xmm3,%xmm0
vpaddq %xmm2,%xmm11,%xmm11
vmovdqa 64(%rsp),%xmm4
vpaddq %xmm0,%xmm14,%xmm14
vpmuludq %xmm6,%xmm3,%xmm1
vpmuludq %xmm5,%xmm3,%xmm3
vpaddq %xmm1,%xmm13,%xmm13
vmovdqa 80(%rsp),%xmm2
vpaddq %xmm3,%xmm12,%xmm12
vpmuludq %xmm9,%xmm4,%xmm0
vpmuludq %xmm8,%xmm4,%xmm4
vpaddq %xmm0,%xmm11,%xmm11
vmovdqa 96(%rsp),%xmm3
vpaddq %xmm4,%xmm10,%xmm10
vmovdqa 128(%rsp),%xmm4
vpmuludq %xmm6,%xmm2,%xmm1
vpmuludq %xmm5,%xmm2,%xmm2
vpaddq %xmm1,%xmm14,%xmm14
vpaddq %xmm2,%xmm13,%xmm13
vpmuludq %xmm9,%xmm3,%xmm0
vpmuludq %xmm8,%xmm3,%xmm1
vpaddq %xmm0,%xmm12,%xmm12
vmovdqu 0(%rsi),%xmm0
vpaddq %xmm1,%xmm11,%xmm11
vpmuludq %xmm7,%xmm3,%xmm3
vpmuludq %xmm7,%xmm4,%xmm7
vpaddq %xmm3,%xmm10,%xmm10
vmovdqu 16(%rsi),%xmm1
vpaddq %xmm7,%xmm11,%xmm11
vpmuludq %xmm8,%xmm4,%xmm8
vpmuludq %xmm9,%xmm4,%xmm9
vpsrldq $6,%xmm0,%xmm2
vpaddq %xmm8,%xmm12,%xmm12
vpaddq %xmm9,%xmm13,%xmm13
vpsrldq $6,%xmm1,%xmm3
vpmuludq 112(%rsp),%xmm5,%xmm9
vpmuludq %xmm6,%xmm4,%xmm5
vpunpckhqdq %xmm1,%xmm0,%xmm4
vpaddq %xmm9,%xmm14,%xmm14
vmovdqa -144(%r11),%xmm9
vpaddq %xmm5,%xmm10,%xmm10
vpunpcklqdq %xmm1,%xmm0,%xmm0
vpunpcklqdq %xmm3,%xmm2,%xmm3
vpsrldq $5,%xmm4,%xmm4
vpsrlq $26,%xmm0,%xmm1
vpand %xmm15,%xmm0,%xmm0
vpsrlq $4,%xmm3,%xmm2
vpand %xmm15,%xmm1,%xmm1
vpand 0(%rcx),%xmm4,%xmm4
vpsrlq $30,%xmm3,%xmm3
vpand %xmm15,%xmm2,%xmm2
vpand %xmm15,%xmm3,%xmm3
vpor 32(%rcx),%xmm4,%xmm4
vpaddq 0(%r11),%xmm0,%xmm0
vpaddq 16(%r11),%xmm1,%xmm1
vpaddq 32(%r11),%xmm2,%xmm2
vpaddq 48(%r11),%xmm3,%xmm3
vpaddq 64(%r11),%xmm4,%xmm4
leaq 32(%rsi),%rax
leaq 64(%rsi),%rsi
subq $64,%rdx
cmovcq %rax,%rsi
vpmuludq %xmm0,%xmm9,%xmm5
vpmuludq %xmm1,%xmm9,%xmm6
vpaddq %xmm5,%xmm10,%xmm10
vpaddq %xmm6,%xmm11,%xmm11
vmovdqa -128(%r11),%xmm7
vpmuludq %xmm2,%xmm9,%xmm5
vpmuludq %xmm3,%xmm9,%xmm6
vpaddq %xmm5,%xmm12,%xmm12
vpaddq %xmm6,%xmm13,%xmm13
vpmuludq %xmm4,%xmm9,%xmm9
vpmuludq -112(%r11),%xmm4,%xmm5
vpaddq %xmm9,%xmm14,%xmm14
vpaddq %xmm5,%xmm10,%xmm10
vpmuludq %xmm2,%xmm7,%xmm6
vpmuludq %xmm3,%xmm7,%xmm5
vpaddq %xmm6,%xmm13,%xmm13
vmovdqa -96(%r11),%xmm8
vpaddq %xmm5,%xmm14,%xmm14
vpmuludq %xmm1,%xmm7,%xmm6
vpmuludq %xmm0,%xmm7,%xmm7
vpaddq %xmm6,%xmm12,%xmm12
vpaddq %xmm7,%xmm11,%xmm11
vmovdqa -80(%r11),%xmm9
vpmuludq %xmm2,%xmm8,%xmm5
vpmuludq %xmm1,%xmm8,%xmm6
vpaddq %xmm5,%xmm14,%xmm14
vpaddq %xmm6,%xmm13,%xmm13
vmovdqa -64(%r11),%xmm7
vpmuludq %xmm0,%xmm8,%xmm8
vpmuludq %xmm4,%xmm9,%xmm5
vpaddq %xmm8,%xmm12,%xmm12
vpaddq %xmm5,%xmm11,%xmm11
vmovdqa -48(%r11),%xmm8
vpmuludq %xmm3,%xmm9,%xmm9
vpmuludq %xmm1,%xmm7,%xmm6
vpaddq %xmm9,%xmm10,%xmm10
vmovdqa -16(%r11),%xmm9
vpaddq %xmm6,%xmm14,%xmm14
vpmuludq %xmm0,%xmm7,%xmm7
vpmuludq %xmm4,%xmm8,%xmm5
vpaddq %xmm7,%xmm13,%xmm13
vpaddq %xmm5,%xmm12,%xmm12
vmovdqu 32(%rsi),%xmm5
vpmuludq %xmm3,%xmm8,%xmm7
vpmuludq %xmm2,%xmm8,%xmm8
vpaddq %xmm7,%xmm11,%xmm11
vmovdqu 48(%rsi),%xmm6
vpaddq %xmm8,%xmm10,%xmm10
vpmuludq %xmm2,%xmm9,%xmm2
vpmuludq %xmm3,%xmm9,%xmm3
vpsrldq $6,%xmm5,%xmm7
vpaddq %xmm2,%xmm11,%xmm11
vpmuludq %xmm4,%xmm9,%xmm4
vpsrldq $6,%xmm6,%xmm8
vpaddq %xmm3,%xmm12,%xmm2
vpaddq %xmm4,%xmm13,%xmm3
vpmuludq -32(%r11),%xmm0,%xmm4
vpmuludq %xmm1,%xmm9,%xmm0
vpunpckhqdq %xmm6,%xmm5,%xmm9
vpaddq %xmm4,%xmm14,%xmm4
vpaddq %xmm0,%xmm10,%xmm0
vpunpcklqdq %xmm6,%xmm5,%xmm5
vpunpcklqdq %xmm8,%xmm7,%xmm8
vpsrldq $5,%xmm9,%xmm9
vpsrlq $26,%xmm5,%xmm6
vmovdqa 0(%rsp),%xmm14
vpand %xmm15,%xmm5,%xmm5
vpsrlq $4,%xmm8,%xmm7
vpand %xmm15,%xmm6,%xmm6
vpand 0(%rcx),%xmm9,%xmm9
vpsrlq $30,%xmm8,%xmm8
vpand %xmm15,%xmm7,%xmm7
vpand %xmm15,%xmm8,%xmm8
vpor 32(%rcx),%xmm9,%xmm9
vpsrlq $26,%xmm3,%xmm13
vpand %xmm15,%xmm3,%xmm3
vpaddq %xmm13,%xmm4,%xmm4
vpsrlq $26,%xmm0,%xmm10
vpand %xmm15,%xmm0,%xmm0
vpaddq %xmm10,%xmm11,%xmm1
vpsrlq $26,%xmm4,%xmm10
vpand %xmm15,%xmm4,%xmm4
vpsrlq $26,%xmm1,%xmm11
vpand %xmm15,%xmm1,%xmm1
vpaddq %xmm11,%xmm2,%xmm2
vpaddq %xmm10,%xmm0,%xmm0
vpsllq $2,%xmm10,%xmm10
vpaddq %xmm10,%xmm0,%xmm0
vpsrlq $26,%xmm2,%xmm12
vpand %xmm15,%xmm2,%xmm2
vpaddq %xmm12,%xmm3,%xmm3
vpsrlq $26,%xmm0,%xmm10
vpand %xmm15,%xmm0,%xmm0
vpaddq %xmm10,%xmm1,%xmm1
vpsrlq $26,%xmm3,%xmm13
vpand %xmm15,%xmm3,%xmm3
vpaddq %xmm13,%xmm4,%xmm4
ja .Loop_avx
.Lskip_loop_avx:
vpshufd $0x10,%xmm14,%xmm14
addq $32,%rdx
jnz .Long_tail_avx
vpaddq %xmm2,%xmm7,%xmm7
vpaddq %xmm0,%xmm5,%xmm5
vpaddq %xmm1,%xmm6,%xmm6
vpaddq %xmm3,%xmm8,%xmm8
vpaddq %xmm4,%xmm9,%xmm9
.Long_tail_avx:
vmovdqa %xmm2,32(%r11)
vmovdqa %xmm0,0(%r11)
vmovdqa %xmm1,16(%r11)
vmovdqa %xmm3,48(%r11)
vmovdqa %xmm4,64(%r11)
vpmuludq %xmm7,%xmm14,%xmm12
vpmuludq %xmm5,%xmm14,%xmm10
vpshufd $0x10,-48(%rdi),%xmm2
vpmuludq %xmm6,%xmm14,%xmm11
vpmuludq %xmm8,%xmm14,%xmm13
vpmuludq %xmm9,%xmm14,%xmm14
vpmuludq %xmm8,%xmm2,%xmm0
vpaddq %xmm0,%xmm14,%xmm14
vpshufd $0x10,-32(%rdi),%xmm3
vpmuludq %xmm7,%xmm2,%xmm1
vpaddq %xmm1,%xmm13,%xmm13
vpshufd $0x10,-16(%rdi),%xmm4
vpmuludq %xmm6,%xmm2,%xmm0
vpaddq %xmm0,%xmm12,%xmm12
vpmuludq %xmm5,%xmm2,%xmm2
vpaddq %xmm2,%xmm11,%xmm11
vpmuludq %xmm9,%xmm3,%xmm3
vpaddq %xmm3,%xmm10,%xmm10
vpshufd $0x10,0(%rdi),%xmm2
vpmuludq %xmm7,%xmm4,%xmm1
vpaddq %xmm1,%xmm14,%xmm14
vpmuludq %xmm6,%xmm4,%xmm0
vpaddq %xmm0,%xmm13,%xmm13
vpshufd $0x10,16(%rdi),%xmm3
vpmuludq %xmm5,%xmm4,%xmm4
vpaddq %xmm4,%xmm12,%xmm12
vpmuludq %xmm9,%xmm2,%xmm1
vpaddq %xmm1,%xmm11,%xmm11
vpshufd $0x10,32(%rdi),%xmm4
vpmuludq %xmm8,%xmm2,%xmm2
vpaddq %xmm2,%xmm10,%xmm10
vpmuludq %xmm6,%xmm3,%xmm0
vpaddq %xmm0,%xmm14,%xmm14
vpmuludq %xmm5,%xmm3,%xmm3
vpaddq %xmm3,%xmm13,%xmm13
vpshufd $0x10,48(%rdi),%xmm2
vpmuludq %xmm9,%xmm4,%xmm1
vpaddq %xmm1,%xmm12,%xmm12
vpshufd $0x10,64(%rdi),%xmm3
vpmuludq %xmm8,%xmm4,%xmm0
vpaddq %xmm0,%xmm11,%xmm11
vpmuludq %xmm7,%xmm4,%xmm4
vpaddq %xmm4,%xmm10,%xmm10
vpmuludq %xmm5,%xmm2,%xmm2
vpaddq %xmm2,%xmm14,%xmm14
vpmuludq %xmm9,%xmm3,%xmm1
vpaddq %xmm1,%xmm13,%xmm13
vpmuludq %xmm8,%xmm3,%xmm0
vpaddq %xmm0,%xmm12,%xmm12
vpmuludq %xmm7,%xmm3,%xmm1
vpaddq %xmm1,%xmm11,%xmm11
vpmuludq %xmm6,%xmm3,%xmm3
vpaddq %xmm3,%xmm10,%xmm10
jz .Lshort_tail_avx
vmovdqu 0(%rsi),%xmm0
vmovdqu 16(%rsi),%xmm1
vpsrldq $6,%xmm0,%xmm2
vpsrldq $6,%xmm1,%xmm3
vpunpckhqdq %xmm1,%xmm0,%xmm4
vpunpcklqdq %xmm1,%xmm0,%xmm0
vpunpcklqdq %xmm3,%xmm2,%xmm3
vpsrlq $40,%xmm4,%xmm4
vpsrlq $26,%xmm0,%xmm1
vpand %xmm15,%xmm0,%xmm0
vpsrlq $4,%xmm3,%xmm2
vpand %xmm15,%xmm1,%xmm1
vpsrlq $30,%xmm3,%xmm3
vpand %xmm15,%xmm2,%xmm2
vpand %xmm15,%xmm3,%xmm3
vpor 32(%rcx),%xmm4,%xmm4
vpshufd $0x32,-64(%rdi),%xmm9
vpaddq 0(%r11),%xmm0,%xmm0
vpaddq 16(%r11),%xmm1,%xmm1
vpaddq 32(%r11),%xmm2,%xmm2
vpaddq 48(%r11),%xmm3,%xmm3
vpaddq 64(%r11),%xmm4,%xmm4
vpmuludq %xmm0,%xmm9,%xmm5
vpaddq %xmm5,%xmm10,%xmm10
vpmuludq %xmm1,%xmm9,%xmm6
vpaddq %xmm6,%xmm11,%xmm11
vpmuludq %xmm2,%xmm9,%xmm5
vpaddq %xmm5,%xmm12,%xmm12
vpshufd $0x32,-48(%rdi),%xmm7
vpmuludq %xmm3,%xmm9,%xmm6
vpaddq %xmm6,%xmm13,%xmm13
vpmuludq %xmm4,%xmm9,%xmm9
vpaddq %xmm9,%xmm14,%xmm14
vpmuludq %xmm3,%xmm7,%xmm5
vpaddq %xmm5,%xmm14,%xmm14
vpshufd $0x32,-32(%rdi),%xmm8
vpmuludq %xmm2,%xmm7,%xmm6
vpaddq %xmm6,%xmm13,%xmm13
vpshufd $0x32,-16(%rdi),%xmm9
vpmuludq %xmm1,%xmm7,%xmm5
vpaddq %xmm5,%xmm12,%xmm12
vpmuludq %xmm0,%xmm7,%xmm7
vpaddq %xmm7,%xmm11,%xmm11
vpmuludq %xmm4,%xmm8,%xmm8
vpaddq %xmm8,%xmm10,%xmm10
vpshufd $0x32,0(%rdi),%xmm7
vpmuludq %xmm2,%xmm9,%xmm6
vpaddq %xmm6,%xmm14,%xmm14
vpmuludq %xmm1,%xmm9,%xmm5
vpaddq %xmm5,%xmm13,%xmm13
vpshufd $0x32,16(%rdi),%xmm8
vpmuludq %xmm0,%xmm9,%xmm9
vpaddq %xmm9,%xmm12,%xmm12
vpmuludq %xmm4,%xmm7,%xmm6
vpaddq %xmm6,%xmm11,%xmm11
vpshufd $0x32,32(%rdi),%xmm9
vpmuludq %xmm3,%xmm7,%xmm7
vpaddq %xmm7,%xmm10,%xmm10
vpmuludq %xmm1,%xmm8,%xmm5
vpaddq %xmm5,%xmm14,%xmm14
vpmuludq %xmm0,%xmm8,%xmm8
vpaddq %xmm8,%xmm13,%xmm13
vpshufd $0x32,48(%rdi),%xmm7
vpmuludq %xmm4,%xmm9,%xmm6
vpaddq %xmm6,%xmm12,%xmm12
vpshufd $0x32,64(%rdi),%xmm8
vpmuludq %xmm3,%xmm9,%xmm5
vpaddq %xmm5,%xmm11,%xmm11
vpmuludq %xmm2,%xmm9,%xmm9
vpaddq %xmm9,%xmm10,%xmm10
vpmuludq %xmm0,%xmm7,%xmm7
vpaddq %xmm7,%xmm14,%xmm14
vpmuludq %xmm4,%xmm8,%xmm6
vpaddq %xmm6,%xmm13,%xmm13
vpmuludq %xmm3,%xmm8,%xmm5
vpaddq %xmm5,%xmm12,%xmm12
vpmuludq %xmm2,%xmm8,%xmm6
vpaddq %xmm6,%xmm11,%xmm11
vpmuludq %xmm1,%xmm8,%xmm8
vpaddq %xmm8,%xmm10,%xmm10
.Lshort_tail_avx:
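/*
 * Horizontal step: add the upper and lower 64-bit halves of each limb
 * vector, run a final carry chain, and store the reduced base 2^26
 * accumulator back into the context.
 */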
vpsrldq $8,%xmm14,%xmm9
vpsrldq $8,%xmm13,%xmm8
vpsrldq $8,%xmm11,%xmm6
vpsrldq $8,%xmm10,%xmm5
vpsrldq $8,%xmm12,%xmm7
vpaddq %xmm8,%xmm13,%xmm13
vpaddq %xmm9,%xmm14,%xmm14
vpaddq %xmm5,%xmm10,%xmm10
vpaddq %xmm6,%xmm11,%xmm11
vpaddq %xmm7,%xmm12,%xmm12
vpsrlq $26,%xmm13,%xmm3
vpand %xmm15,%xmm13,%xmm13
vpaddq %xmm3,%xmm14,%xmm14
vpsrlq $26,%xmm10,%xmm0
vpand %xmm15,%xmm10,%xmm10
vpaddq %xmm0,%xmm11,%xmm11
vpsrlq $26,%xmm14,%xmm4
vpand %xmm15,%xmm14,%xmm14
vpsrlq $26,%xmm11,%xmm1
vpand %xmm15,%xmm11,%xmm11
vpaddq %xmm1,%xmm12,%xmm12
vpaddq %xmm4,%xmm10,%xmm10
vpsllq $2,%xmm4,%xmm4
vpaddq %xmm4,%xmm10,%xmm10
vpsrlq $26,%xmm12,%xmm2
vpand %xmm15,%xmm12,%xmm12
vpaddq %xmm2,%xmm13,%xmm13
vpsrlq $26,%xmm10,%xmm0
vpand %xmm15,%xmm10,%xmm10
vpaddq %xmm0,%xmm11,%xmm11
vpsrlq $26,%xmm13,%xmm3
vpand %xmm15,%xmm13,%xmm13
vpaddq %xmm3,%xmm14,%xmm14
vmovd %xmm10,-112(%rdi)
vmovd %xmm11,-108(%rdi)
vmovd %xmm12,-104(%rdi)
vmovd %xmm13,-100(%rdi)
vmovd %xmm14,-96(%rdi)
leaq 88(%r11),%rsp
.cfi_def_cfa %rsp,8
vzeroupper
.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_blocks_avx,.-poly1305_blocks_avx
.type poly1305_emit_avx,@function
.align 32
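/*
 * Emit for the vector paths: if 20(%rdi) says the accumulator is in
 * base 2^26, convert the five limbs back to three 64-bit words and fully
 * reduce before adding the nonce; otherwise jump to the scalar .Lemit.
 */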
poly1305_emit_avx:
.cfi_startproc
cmpl $0,20(%rdi)
je .Lemit
movl 0(%rdi),%eax
movl 4(%rdi),%ecx
movl 8(%rdi),%r8d
movl 12(%rdi),%r11d
movl 16(%rdi),%r10d
shlq $26,%rcx
movq %r8,%r9
shlq $52,%r8
addq %rcx,%rax
shrq $12,%r9
addq %rax,%r8
adcq $0,%r9
shlq $14,%r11
movq %r10,%rax
shrq $24,%r10
addq %r11,%r9
shlq $40,%rax
addq %rax,%r9
adcq $0,%r10
movq %r10,%rax
movq %r10,%rcx
andq $3,%r10
shrq $2,%rax
andq $-4,%rcx
addq %rcx,%rax
addq %rax,%r8
adcq $0,%r9
adcq $0,%r10
movq %r8,%rax
addq $5,%r8
movq %r9,%rcx
adcq $0,%r9
adcq $0,%r10
shrq $2,%r10
cmovnzq %r8,%rax
cmovnzq %r9,%rcx
addq 0(%rdx),%rax
adcq 8(%rdx),%rcx
movq %rax,0(%rsi)
movq %rcx,8(%rsi)
.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_emit_avx,.-poly1305_emit_avx
.type poly1305_blocks_avx2,@function
.align 32
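/*
 * AVX2 (256-bit) path: same base 2^26 scheme as the AVX code, four
 * blocks per vector and 64 bytes per .Loop_avx2 iteration.  Inputs of
 * 512 bytes or more may be handed to the AVX-512 code, gated on the
 * AVX-512 capability bits in OPENSSL_ia32cap_P.
 */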
poly1305_blocks_avx2:
.cfi_startproc
movl 20(%rdi),%r8d
cmpq $128,%rdx
jae .Lblocks_avx2
testl %r8d,%r8d
jz .Lblocks
.Lblocks_avx2:
andq $-16,%rdx
jz .Lno_data_avx2
vzeroupper
testl %r8d,%r8d
jz .Lbase2_64_avx2
testq $63,%rdx
jz .Leven_avx2
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lblocks_avx2_body:
movq %rdx,%r15
movq 0(%rdi),%r8
movq 8(%rdi),%r9
movl 16(%rdi),%ebp
movq 24(%rdi),%r11
movq 32(%rdi),%r13
movl %r8d,%r14d
andq $-2147483648,%r8
movq %r9,%r12
movl %r9d,%ebx
andq $-2147483648,%r9
shrq $6,%r8
shlq $52,%r12
addq %r8,%r14
shrq $12,%rbx
shrq $18,%r9
addq %r12,%r14
adcq %r9,%rbx
movq %rbp,%r8
shlq $40,%r8
shrq $24,%rbp
addq %r8,%rbx
adcq $0,%rbp
movq $-4,%r9
movq %rbp,%r8
andq %rbp,%r9
shrq $2,%r8
andq $3,%rbp
addq %r9,%r8
addq %r8,%r14
adcq $0,%rbx
adcq $0,%rbp
movq %r13,%r12
movq %r13,%rax
shrq $2,%r13
addq %r12,%r13
.Lbase2_26_pre_avx2:
addq 0(%rsi),%r14
adcq 8(%rsi),%rbx
leaq 16(%rsi),%rsi
adcq %rcx,%rbp
subq $16,%r15
call __poly1305_block
movq %r12,%rax
testq $63,%r15
jnz .Lbase2_26_pre_avx2
testq %rcx,%rcx
jz .Lstore_base2_64_avx2
movq %r14,%rax
movq %r14,%rdx
shrq $52,%r14
movq %rbx,%r11
movq %rbx,%r12
shrq $26,%rdx
andq $0x3ffffff,%rax
shlq $12,%r11
andq $0x3ffffff,%rdx
shrq $14,%rbx
orq %r11,%r14
shlq $24,%rbp
andq $0x3ffffff,%r14
shrq $40,%r12
andq $0x3ffffff,%rbx
orq %r12,%rbp
testq %r15,%r15
jz .Lstore_base2_26_avx2
vmovd %eax,%xmm0
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
vmovd %ebp,%xmm4
jmp .Lproceed_avx2
.align 32
.Lstore_base2_64_avx2:
movq %r14,0(%rdi)
movq %rbx,8(%rdi)
movq %rbp,16(%rdi)
jmp .Ldone_avx2
.align 16
.Lstore_base2_26_avx2:
movl %eax,0(%rdi)
movl %edx,4(%rdi)
movl %r14d,8(%rdi)
movl %ebx,12(%rdi)
movl %ebp,16(%rdi)
.align 16
.Ldone_avx2:
movq 0(%rsp),%r15
.cfi_restore %r15
movq 8(%rsp),%r14
.cfi_restore %r14
movq 16(%rsp),%r13
.cfi_restore %r13
movq 24(%rsp),%r12
.cfi_restore %r12
movq 32(%rsp),%rbp
.cfi_restore %rbp
movq 40(%rsp),%rbx
.cfi_restore %rbx
leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lno_data_avx2:
.Lblocks_avx2_epilogue:
.byte 0xf3,0xc3
.cfi_endproc
.align 32
.Lbase2_64_avx2:
.cfi_startproc
pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lbase2_64_avx2_body:
movq %rdx,%r15
movq 24(%rdi),%r11
movq 32(%rdi),%r13
movq 0(%rdi),%r14
movq 8(%rdi),%rbx
movl 16(%rdi),%ebp
movq %r13,%r12
movq %r13,%rax
shrq $2,%r13
addq %r12,%r13
testq $63,%rdx
jz .Linit_avx2
.Lbase2_64_pre_avx2:
addq 0(%rsi),%r14
adcq 8(%rsi),%rbx
leaq 16(%rsi),%rsi
adcq %rcx,%rbp
subq $16,%r15
call __poly1305_block
movq %r12,%rax
testq $63,%r15
jnz .Lbase2_64_pre_avx2
.Linit_avx2:
movq %r14,%rax
movq %r14,%rdx
shrq $52,%r14
movq %rbx,%r8
movq %rbx,%r9
shrq $26,%rdx
andq $0x3ffffff,%rax
shlq $12,%r8
andq $0x3ffffff,%rdx
shrq $14,%rbx
orq %r8,%r14
shlq $24,%rbp
andq $0x3ffffff,%r14
shrq $40,%r9
andq $0x3ffffff,%rbx
orq %r9,%rbp
vmovd %eax,%xmm0
vmovd %edx,%xmm1
vmovd %r14d,%xmm2
vmovd %ebx,%xmm3
vmovd %ebp,%xmm4
movl $1,20(%rdi)
call __poly1305_init_avx
.Lproceed_avx2:
movq %r15,%rdx
movl OPENSSL_ia32cap_P+8(%rip),%r10d
movl $3221291008,%r11d
movq 0(%rsp),%r15
.cfi_restore %r15
movq 8(%rsp),%r14
.cfi_restore %r14
movq 16(%rsp),%r13
.cfi_restore %r13
movq 24(%rsp),%r12
.cfi_restore %r12
movq 32(%rsp),%rbp
.cfi_restore %rbp
movq 40(%rsp),%rbx
.cfi_restore %rbx
leaq 48(%rsp),%rax
leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lbase2_64_avx2_epilogue:
jmp .Ldo_avx2
.cfi_endproc
.align 32
.Leven_avx2:
.cfi_startproc
movl OPENSSL_ia32cap_P+8(%rip),%r10d
vmovd 0(%rdi),%xmm0
vmovd 4(%rdi),%xmm1
vmovd 8(%rdi),%xmm2
vmovd 12(%rdi),%xmm3
vmovd 16(%rdi),%xmm4
.Ldo_avx2:
cmpq $512,%rdx
jb .Lskip_avx512
andl %r11d,%r10d
testl $65536,%r10d
jnz .Lblocks_avx512
.Lskip_avx512:
leaq -8(%rsp),%r11
.cfi_def_cfa %r11,16
subq $0x128,%rsp
leaq .Lconst(%rip),%rcx
leaq 48+64(%rdi),%rdi
vmovdqa 96(%rcx),%ymm7
vmovdqu -64(%rdi),%xmm9
andq $-512,%rsp
vmovdqu -48(%rdi),%xmm10
vmovdqu -32(%rdi),%xmm6
vmovdqu -16(%rdi),%xmm11
vmovdqu 0(%rdi),%xmm12
vmovdqu 16(%rdi),%xmm13
leaq 144(%rsp),%rax
vmovdqu 32(%rdi),%xmm14
vpermd %ymm9,%ymm7,%ymm9
vmovdqu 48(%rdi),%xmm15
vpermd %ymm10,%ymm7,%ymm10
vmovdqu 64(%rdi),%xmm5
vpermd %ymm6,%ymm7,%ymm6
vmovdqa %ymm9,0(%rsp)
vpermd %ymm11,%ymm7,%ymm11
vmovdqa %ymm10,32-144(%rax)
vpermd %ymm12,%ymm7,%ymm12
vmovdqa %ymm6,64-144(%rax)
vpermd %ymm13,%ymm7,%ymm13
vmovdqa %ymm11,96-144(%rax)
vpermd %ymm14,%ymm7,%ymm14
vmovdqa %ymm12,128-144(%rax)
vpermd %ymm15,%ymm7,%ymm15
vmovdqa %ymm13,160-144(%rax)
vpermd %ymm5,%ymm7,%ymm5
vmovdqa %ymm14,192-144(%rax)
vmovdqa %ymm15,224-144(%rax)
vmovdqa %ymm5,256-144(%rax)
vmovdqa 64(%rcx),%ymm5
vmovdqu 0(%rsi),%xmm7
vmovdqu 16(%rsi),%xmm8
vinserti128 $1,32(%rsi),%ymm7,%ymm7
vinserti128 $1,48(%rsi),%ymm8,%ymm8
leaq 64(%rsi),%rsi
vpsrldq $6,%ymm7,%ymm9
vpsrldq $6,%ymm8,%ymm10
vpunpckhqdq %ymm8,%ymm7,%ymm6
vpunpcklqdq %ymm10,%ymm9,%ymm9
vpunpcklqdq %ymm8,%ymm7,%ymm7
vpsrlq $30,%ymm9,%ymm10
vpsrlq $4,%ymm9,%ymm9
vpsrlq $26,%ymm7,%ymm8
vpsrlq $40,%ymm6,%ymm6
vpand %ymm5,%ymm9,%ymm9
vpand %ymm5,%ymm7,%ymm7
vpand %ymm5,%ymm8,%ymm8
vpand %ymm5,%ymm10,%ymm10
vpor 32(%rcx),%ymm6,%ymm6
vpaddq %ymm2,%ymm9,%ymm2
subq $64,%rdx
jz .Ltail_avx2
jmp .Loop_avx2
.align 32
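/*
 * Inner AVX2 loop: add the previously prepared 64 bytes of input into h,
 * multiply the five limb vectors by the power-of-r table kept on the
 * stack while the next 64 bytes are loaded and split into 26-bit limbs,
 * then lazily carry-propagate.
 */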
.Loop_avx2:
vpaddq %ymm0,%ymm7,%ymm0
vmovdqa 0(%rsp),%ymm7
vpaddq %ymm1,%ymm8,%ymm1
vmovdqa 32(%rsp),%ymm8
vpaddq %ymm3,%ymm10,%ymm3
vmovdqa 96(%rsp),%ymm9
vpaddq %ymm4,%ymm6,%ymm4
vmovdqa 48(%rax),%ymm10
vmovdqa 112(%rax),%ymm5
vpmuludq %ymm2,%ymm7,%ymm13
vpmuludq %ymm2,%ymm8,%ymm14
vpmuludq %ymm2,%ymm9,%ymm15
vpmuludq %ymm2,%ymm10,%ymm11
vpmuludq %ymm2,%ymm5,%ymm12
vpmuludq %ymm0,%ymm8,%ymm6
vpmuludq %ymm1,%ymm8,%ymm2
vpaddq %ymm6,%ymm12,%ymm12
vpaddq %ymm2,%ymm13,%ymm13
vpmuludq %ymm3,%ymm8,%ymm6
vpmuludq 64(%rsp),%ymm4,%ymm2
vpaddq %ymm6,%ymm15,%ymm15
vpaddq %ymm2,%ymm11,%ymm11
vmovdqa -16(%rax),%ymm8
vpmuludq %ymm0,%ymm7,%ymm6
vpmuludq %ymm1,%ymm7,%ymm2
vpaddq %ymm6,%ymm11,%ymm11
vpaddq %ymm2,%ymm12,%ymm12
vpmuludq %ymm3,%ymm7,%ymm6
vpmuludq %ymm4,%ymm7,%ymm2
vmovdqu 0(%rsi),%xmm7
vpaddq %ymm6,%ymm14,%ymm14
vpaddq %ymm2,%ymm15,%ymm15
vinserti128 $1,32(%rsi),%ymm7,%ymm7
vpmuludq %ymm3,%ymm8,%ymm6
vpmuludq %ymm4,%ymm8,%ymm2
vmovdqu 16(%rsi),%xmm8
vpaddq %ymm6,%ymm11,%ymm11
vpaddq %ymm2,%ymm12,%ymm12
vmovdqa 16(%rax),%ymm2
vpmuludq %ymm1,%ymm9,%ymm6
vpmuludq %ymm0,%ymm9,%ymm9
vpaddq %ymm6,%ymm14,%ymm14
vpaddq %ymm9,%ymm13,%ymm13
vinserti128 $1,48(%rsi),%ymm8,%ymm8
leaq 64(%rsi),%rsi
vpmuludq %ymm1,%ymm2,%ymm6
vpmuludq %ymm0,%ymm2,%ymm2
vpsrldq $6,%ymm7,%ymm9
vpaddq %ymm6,%ymm15,%ymm15
vpaddq %ymm2,%ymm14,%ymm14
vpmuludq %ymm3,%ymm10,%ymm6
vpmuludq %ymm4,%ymm10,%ymm2
vpsrldq $6,%ymm8,%ymm10
vpaddq %ymm6,%ymm12,%ymm12
vpaddq %ymm2,%ymm13,%ymm13
vpunpckhqdq %ymm8,%ymm7,%ymm6
vpmuludq %ymm3,%ymm5,%ymm3
vpmuludq %ymm4,%ymm5,%ymm4
vpunpcklqdq %ymm8,%ymm7,%ymm7
vpaddq %ymm3,%ymm13,%ymm2
vpaddq %ymm4,%ymm14,%ymm3
vpunpcklqdq %ymm10,%ymm9,%ymm10
vpmuludq 80(%rax),%ymm0,%ymm4
vpmuludq %ymm1,%ymm5,%ymm0
vmovdqa 64(%rcx),%ymm5
vpaddq %ymm4,%ymm15,%ymm4
vpaddq %ymm0,%ymm11,%ymm0
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpaddq %ymm14,%ymm4,%ymm4
vpsrlq $26,%ymm0,%ymm11
vpand %ymm5,%ymm0,%ymm0
vpaddq %ymm11,%ymm12,%ymm1
vpsrlq $26,%ymm4,%ymm15
vpand %ymm5,%ymm4,%ymm4
vpsrlq $4,%ymm10,%ymm9
vpsrlq $26,%ymm1,%ymm12
vpand %ymm5,%ymm1,%ymm1
vpaddq %ymm12,%ymm2,%ymm2
vpaddq %ymm15,%ymm0,%ymm0
vpsllq $2,%ymm15,%ymm15
vpaddq %ymm15,%ymm0,%ymm0
vpand %ymm5,%ymm9,%ymm9
vpsrlq $26,%ymm7,%ymm8
vpsrlq $26,%ymm2,%ymm13
vpand %ymm5,%ymm2,%ymm2
vpaddq %ymm13,%ymm3,%ymm3
vpaddq %ymm9,%ymm2,%ymm2
vpsrlq $30,%ymm10,%ymm10
vpsrlq $26,%ymm0,%ymm11
vpand %ymm5,%ymm0,%ymm0
vpaddq %ymm11,%ymm1,%ymm1
vpsrlq $40,%ymm6,%ymm6
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpaddq %ymm14,%ymm4,%ymm4
vpand %ymm5,%ymm7,%ymm7
vpand %ymm5,%ymm8,%ymm8
vpand %ymm5,%ymm10,%ymm10
vpor 32(%rcx),%ymm6,%ymm6
subq $64,%rdx
jnz .Loop_avx2
.byte 0x66,0x90 /* 2-byte nop, alignment padding */
.Ltail_avx2:
vpaddq %ymm0,%ymm7,%ymm0
vmovdqu 4(%rsp),%ymm7
vpaddq %ymm1,%ymm8,%ymm1
vmovdqu 36(%rsp),%ymm8
vpaddq %ymm3,%ymm10,%ymm3
vmovdqu 100(%rsp),%ymm9
vpaddq %ymm4,%ymm6,%ymm4
vmovdqu 52(%rax),%ymm10
vmovdqu 116(%rax),%ymm5
vpmuludq %ymm2,%ymm7,%ymm13
vpmuludq %ymm2,%ymm8,%ymm14
vpmuludq %ymm2,%ymm9,%ymm15
vpmuludq %ymm2,%ymm10,%ymm11
vpmuludq %ymm2,%ymm5,%ymm12
vpmuludq %ymm0,%ymm8,%ymm6
vpmuludq %ymm1,%ymm8,%ymm2
vpaddq %ymm6,%ymm12,%ymm12
vpaddq %ymm2,%ymm13,%ymm13
vpmuludq %ymm3,%ymm8,%ymm6
vpmuludq 68(%rsp),%ymm4,%ymm2
vpaddq %ymm6,%ymm15,%ymm15
vpaddq %ymm2,%ymm11,%ymm11
vpmuludq %ymm0,%ymm7,%ymm6
vpmuludq %ymm1,%ymm7,%ymm2
vpaddq %ymm6,%ymm11,%ymm11
vmovdqu -12(%rax),%ymm8
vpaddq %ymm2,%ymm12,%ymm12
vpmuludq %ymm3,%ymm7,%ymm6
vpmuludq %ymm4,%ymm7,%ymm2
vpaddq %ymm6,%ymm14,%ymm14
vpaddq %ymm2,%ymm15,%ymm15
vpmuludq %ymm3,%ymm8,%ymm6
vpmuludq %ymm4,%ymm8,%ymm2
vpaddq %ymm6,%ymm11,%ymm11
vpaddq %ymm2,%ymm12,%ymm12
vmovdqu 20(%rax),%ymm2
vpmuludq %ymm1,%ymm9,%ymm6
vpmuludq %ymm0,%ymm9,%ymm9
vpaddq %ymm6,%ymm14,%ymm14
vpaddq %ymm9,%ymm13,%ymm13
vpmuludq %ymm1,%ymm2,%ymm6
vpmuludq %ymm0,%ymm2,%ymm2
vpaddq %ymm6,%ymm15,%ymm15
vpaddq %ymm2,%ymm14,%ymm14
vpmuludq %ymm3,%ymm10,%ymm6
vpmuludq %ymm4,%ymm10,%ymm2
vpaddq %ymm6,%ymm12,%ymm12
vpaddq %ymm2,%ymm13,%ymm13
vpmuludq %ymm3,%ymm5,%ymm3
vpmuludq %ymm4,%ymm5,%ymm4
vpaddq %ymm3,%ymm13,%ymm2
vpaddq %ymm4,%ymm14,%ymm3
vpmuludq 84(%rax),%ymm0,%ymm4
vpmuludq %ymm1,%ymm5,%ymm0
vmovdqa 64(%rcx),%ymm5
vpaddq %ymm4,%ymm15,%ymm4
vpaddq %ymm0,%ymm11,%ymm0
vpsrldq $8,%ymm12,%ymm8
vpsrldq $8,%ymm2,%ymm9
vpsrldq $8,%ymm3,%ymm10
vpsrldq $8,%ymm4,%ymm6
vpsrldq $8,%ymm0,%ymm7
vpaddq %ymm8,%ymm12,%ymm12
vpaddq %ymm9,%ymm2,%ymm2
vpaddq %ymm10,%ymm3,%ymm3
vpaddq %ymm6,%ymm4,%ymm4
vpaddq %ymm7,%ymm0,%ymm0
vpermq $0x2,%ymm3,%ymm10
vpermq $0x2,%ymm4,%ymm6
vpermq $0x2,%ymm0,%ymm7
vpermq $0x2,%ymm12,%ymm8
vpermq $0x2,%ymm2,%ymm9
vpaddq %ymm10,%ymm3,%ymm3
vpaddq %ymm6,%ymm4,%ymm4
vpaddq %ymm7,%ymm0,%ymm0
vpaddq %ymm8,%ymm12,%ymm12
vpaddq %ymm9,%ymm2,%ymm2
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpaddq %ymm14,%ymm4,%ymm4
vpsrlq $26,%ymm0,%ymm11
vpand %ymm5,%ymm0,%ymm0
vpaddq %ymm11,%ymm12,%ymm1
vpsrlq $26,%ymm4,%ymm15
vpand %ymm5,%ymm4,%ymm4
vpsrlq $26,%ymm1,%ymm12
vpand %ymm5,%ymm1,%ymm1
vpaddq %ymm12,%ymm2,%ymm2
vpaddq %ymm15,%ymm0,%ymm0
vpsllq $2,%ymm15,%ymm15
vpaddq %ymm15,%ymm0,%ymm0
vpsrlq $26,%ymm2,%ymm13
vpand %ymm5,%ymm2,%ymm2
vpaddq %ymm13,%ymm3,%ymm3
vpsrlq $26,%ymm0,%ymm11
vpand %ymm5,%ymm0,%ymm0
vpaddq %ymm11,%ymm1,%ymm1
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpaddq %ymm14,%ymm4,%ymm4
vmovd %xmm0,-112(%rdi)
vmovd %xmm1,-108(%rdi)
vmovd %xmm2,-104(%rdi)
vmovd %xmm3,-100(%rdi)
vmovd %xmm4,-96(%rdi)
leaq 8(%r11),%rsp
.cfi_def_cfa %rsp,8
vzeroupper
.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
.type poly1305_blocks_avx512,@function
.align 32
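/*
 * AVX-512 path: extends the precomputed power table (apparently up to
 * r^8) in zmm registers, then .Loop_avx512 processes 128 bytes (eight
 * blocks) per iteration; the final partial batch is handed back to the
 * AVX2 tail code at .Ltail_avx2.
 */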
poly1305_blocks_avx512:
.cfi_startproc
.Lblocks_avx512:
movl $15,%eax
kmovw %eax,%k2
leaq -8(%rsp),%r11
.cfi_def_cfa %r11,16
subq $0x128,%rsp
leaq .Lconst(%rip),%rcx
leaq 48+64(%rdi),%rdi
vmovdqa 96(%rcx),%ymm9
vmovdqu -64(%rdi),%xmm11
andq $-512,%rsp
vmovdqu -48(%rdi),%xmm12
movq $0x20,%rax
vmovdqu -32(%rdi),%xmm7
vmovdqu -16(%rdi),%xmm13
vmovdqu 0(%rdi),%xmm8
vmovdqu 16(%rdi),%xmm14
vmovdqu 32(%rdi),%xmm10
vmovdqu 48(%rdi),%xmm15
vmovdqu 64(%rdi),%xmm6
vpermd %zmm11,%zmm9,%zmm16
vpbroadcastq 64(%rcx),%zmm5
vpermd %zmm12,%zmm9,%zmm17
vpermd %zmm7,%zmm9,%zmm21
vpermd %zmm13,%zmm9,%zmm18
vmovdqa64 %zmm16,0(%rsp){%k2}
vpsrlq $32,%zmm16,%zmm7
vpermd %zmm8,%zmm9,%zmm22
vmovdqu64 %zmm17,0(%rsp,%rax,1){%k2}
vpsrlq $32,%zmm17,%zmm8
vpermd %zmm14,%zmm9,%zmm19
vmovdqa64 %zmm21,64(%rsp){%k2}
vpermd %zmm10,%zmm9,%zmm23
vpermd %zmm15,%zmm9,%zmm20
vmovdqu64 %zmm18,64(%rsp,%rax,1){%k2}
vpermd %zmm6,%zmm9,%zmm24
vmovdqa64 %zmm22,128(%rsp){%k2}
vmovdqu64 %zmm19,128(%rsp,%rax,1){%k2}
vmovdqa64 %zmm23,192(%rsp){%k2}
vmovdqu64 %zmm20,192(%rsp,%rax,1){%k2}
vmovdqa64 %zmm24,256(%rsp){%k2}
vpmuludq %zmm7,%zmm16,%zmm11
vpmuludq %zmm7,%zmm17,%zmm12
vpmuludq %zmm7,%zmm18,%zmm13
vpmuludq %zmm7,%zmm19,%zmm14
vpmuludq %zmm7,%zmm20,%zmm15
vpsrlq $32,%zmm18,%zmm9
vpmuludq %zmm8,%zmm24,%zmm25
vpmuludq %zmm8,%zmm16,%zmm26
vpmuludq %zmm8,%zmm17,%zmm27
vpmuludq %zmm8,%zmm18,%zmm28
vpmuludq %zmm8,%zmm19,%zmm29
vpsrlq $32,%zmm19,%zmm10
vpaddq %zmm25,%zmm11,%zmm11
vpaddq %zmm26,%zmm12,%zmm12
vpaddq %zmm27,%zmm13,%zmm13
vpaddq %zmm28,%zmm14,%zmm14
vpaddq %zmm29,%zmm15,%zmm15
vpmuludq %zmm9,%zmm23,%zmm25
vpmuludq %zmm9,%zmm24,%zmm26
vpmuludq %zmm9,%zmm17,%zmm28
vpmuludq %zmm9,%zmm18,%zmm29
vpmuludq %zmm9,%zmm16,%zmm27
vpsrlq $32,%zmm20,%zmm6
vpaddq %zmm25,%zmm11,%zmm11
vpaddq %zmm26,%zmm12,%zmm12
vpaddq %zmm28,%zmm14,%zmm14
vpaddq %zmm29,%zmm15,%zmm15
vpaddq %zmm27,%zmm13,%zmm13
vpmuludq %zmm10,%zmm22,%zmm25
vpmuludq %zmm10,%zmm16,%zmm28
vpmuludq %zmm10,%zmm17,%zmm29
vpmuludq %zmm10,%zmm23,%zmm26
vpmuludq %zmm10,%zmm24,%zmm27
vpaddq %zmm25,%zmm11,%zmm11
vpaddq %zmm28,%zmm14,%zmm14
vpaddq %zmm29,%zmm15,%zmm15
vpaddq %zmm26,%zmm12,%zmm12
vpaddq %zmm27,%zmm13,%zmm13
vpmuludq %zmm6,%zmm24,%zmm28
vpmuludq %zmm6,%zmm16,%zmm29
vpmuludq %zmm6,%zmm21,%zmm25
vpmuludq %zmm6,%zmm22,%zmm26
vpmuludq %zmm6,%zmm23,%zmm27
vpaddq %zmm28,%zmm14,%zmm14
vpaddq %zmm29,%zmm15,%zmm15
vpaddq %zmm25,%zmm11,%zmm11
vpaddq %zmm26,%zmm12,%zmm12
vpaddq %zmm27,%zmm13,%zmm13
vmovdqu64 0(%rsi),%zmm10
vmovdqu64 64(%rsi),%zmm6
leaq 128(%rsi),%rsi
vpsrlq $26,%zmm14,%zmm28
vpandq %zmm5,%zmm14,%zmm14
vpaddq %zmm28,%zmm15,%zmm15
vpsrlq $26,%zmm11,%zmm25
vpandq %zmm5,%zmm11,%zmm11
vpaddq %zmm25,%zmm12,%zmm12
vpsrlq $26,%zmm15,%zmm29
vpandq %zmm5,%zmm15,%zmm15
vpsrlq $26,%zmm12,%zmm26
vpandq %zmm5,%zmm12,%zmm12
vpaddq %zmm26,%zmm13,%zmm13
vpaddq %zmm29,%zmm11,%zmm11
vpsllq $2,%zmm29,%zmm29
vpaddq %zmm29,%zmm11,%zmm11
vpsrlq $26,%zmm13,%zmm27
vpandq %zmm5,%zmm13,%zmm13
vpaddq %zmm27,%zmm14,%zmm14
vpsrlq $26,%zmm11,%zmm25
vpandq %zmm5,%zmm11,%zmm11
vpaddq %zmm25,%zmm12,%zmm12
vpsrlq $26,%zmm14,%zmm28
vpandq %zmm5,%zmm14,%zmm14
vpaddq %zmm28,%zmm15,%zmm15
vpunpcklqdq %zmm6,%zmm10,%zmm7
vpunpckhqdq %zmm6,%zmm10,%zmm6
vmovdqa32 128(%rcx),%zmm25
movl $0x7777,%eax
kmovw %eax,%k1
vpermd %zmm16,%zmm25,%zmm16
vpermd %zmm17,%zmm25,%zmm17
vpermd %zmm18,%zmm25,%zmm18
vpermd %zmm19,%zmm25,%zmm19
vpermd %zmm20,%zmm25,%zmm20
vpermd %zmm11,%zmm25,%zmm16{%k1}
vpermd %zmm12,%zmm25,%zmm17{%k1}
vpermd %zmm13,%zmm25,%zmm18{%k1}
vpermd %zmm14,%zmm25,%zmm19{%k1}
vpermd %zmm15,%zmm25,%zmm20{%k1}
vpslld $2,%zmm17,%zmm21
vpslld $2,%zmm18,%zmm22
vpslld $2,%zmm19,%zmm23
vpslld $2,%zmm20,%zmm24
vpaddd %zmm17,%zmm21,%zmm21
vpaddd %zmm18,%zmm22,%zmm22
vpaddd %zmm19,%zmm23,%zmm23
vpaddd %zmm20,%zmm24,%zmm24
vpbroadcastq 32(%rcx),%zmm30
vpsrlq $52,%zmm7,%zmm9
vpsllq $12,%zmm6,%zmm10
vporq %zmm10,%zmm9,%zmm9
vpsrlq $26,%zmm7,%zmm8
vpsrlq $14,%zmm6,%zmm10
vpsrlq $40,%zmm6,%zmm6
vpandq %zmm5,%zmm9,%zmm9
vpandq %zmm5,%zmm7,%zmm7
vpaddq %zmm2,%zmm9,%zmm2
subq $192,%rdx
jbe .Ltail_avx512
jmp .Loop_avx512
.align 32
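/*
 * Inner AVX-512 loop: same multiply/carry structure as the AVX2 loop,
 * spread across eight 64-bit lanes, with the 128-byte input load and
 * radix split interleaved into the multiplications.
 */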
.Loop_avx512:
vpmuludq %zmm2,%zmm17,%zmm14
vpaddq %zmm0,%zmm7,%zmm0
vpmuludq %zmm2,%zmm18,%zmm15
vpandq %zmm5,%zmm8,%zmm8
vpmuludq %zmm2,%zmm23,%zmm11
vpandq %zmm5,%zmm10,%zmm10
vpmuludq %zmm2,%zmm24,%zmm12
vporq %zmm30,%zmm6,%zmm6
vpmuludq %zmm2,%zmm16,%zmm13
vpaddq %zmm1,%zmm8,%zmm1
vpaddq %zmm3,%zmm10,%zmm3
vpaddq %zmm4,%zmm6,%zmm4
vmovdqu64 0(%rsi),%zmm10
vmovdqu64 64(%rsi),%zmm6
leaq 128(%rsi),%rsi
vpmuludq %zmm0,%zmm19,%zmm28
vpmuludq %zmm0,%zmm20,%zmm29
vpmuludq %zmm0,%zmm16,%zmm25
vpmuludq %zmm0,%zmm17,%zmm26
vpaddq %zmm28,%zmm14,%zmm14
vpaddq %zmm29,%zmm15,%zmm15
vpaddq %zmm25,%zmm11,%zmm11
vpaddq %zmm26,%zmm12,%zmm12
vpmuludq %zmm1,%zmm18,%zmm28
vpmuludq %zmm1,%zmm19,%zmm29
vpmuludq %zmm1,%zmm24,%zmm25
vpmuludq %zmm0,%zmm18,%zmm27
vpaddq %zmm28,%zmm14,%zmm14
vpaddq %zmm29,%zmm15,%zmm15
vpaddq %zmm25,%zmm11,%zmm11
vpaddq %zmm27,%zmm13,%zmm13
vpunpcklqdq %zmm6,%zmm10,%zmm7
vpunpckhqdq %zmm6,%zmm10,%zmm6
vpmuludq %zmm3,%zmm16,%zmm28
vpmuludq %zmm3,%zmm17,%zmm29
vpmuludq %zmm1,%zmm16,%zmm26
vpmuludq %zmm1,%zmm17,%zmm27
vpaddq %zmm28,%zmm14,%zmm14
vpaddq %zmm29,%zmm15,%zmm15
vpaddq %zmm26,%zmm12,%zmm12
vpaddq %zmm27,%zmm13,%zmm13
vpmuludq %zmm4,%zmm24,%zmm28
vpmuludq %zmm4,%zmm16,%zmm29
vpmuludq %zmm3,%zmm22,%zmm25
vpmuludq %zmm3,%zmm23,%zmm26
vpaddq %zmm28,%zmm14,%zmm14
vpmuludq %zmm3,%zmm24,%zmm27
vpaddq %zmm29,%zmm15,%zmm15
vpaddq %zmm25,%zmm11,%zmm11
vpaddq %zmm26,%zmm12,%zmm12
vpaddq %zmm27,%zmm13,%zmm13
vpmuludq %zmm4,%zmm21,%zmm25
vpmuludq %zmm4,%zmm22,%zmm26
vpmuludq %zmm4,%zmm23,%zmm27
vpaddq %zmm25,%zmm11,%zmm0
vpaddq %zmm26,%zmm12,%zmm1
vpaddq %zmm27,%zmm13,%zmm2
vpsrlq $52,%zmm7,%zmm9
vpsllq $12,%zmm6,%zmm10
vpsrlq $26,%zmm14,%zmm3
vpandq %zmm5,%zmm14,%zmm14
vpaddq %zmm3,%zmm15,%zmm4
vporq %zmm10,%zmm9,%zmm9
vpsrlq $26,%zmm0,%zmm11
vpandq %zmm5,%zmm0,%zmm0
vpaddq %zmm11,%zmm1,%zmm1
vpandq %zmm5,%zmm9,%zmm9
vpsrlq $26,%zmm4,%zmm15
vpandq %zmm5,%zmm4,%zmm4
vpsrlq $26,%zmm1,%zmm12
vpandq %zmm5,%zmm1,%zmm1
vpaddq %zmm12,%zmm2,%zmm2
vpaddq %zmm15,%zmm0,%zmm0
vpsllq $2,%zmm15,%zmm15
vpaddq %zmm15,%zmm0,%zmm0
vpaddq %zmm9,%zmm2,%zmm2
vpsrlq $26,%zmm7,%zmm8
vpsrlq $26,%zmm2,%zmm13
vpandq %zmm5,%zmm2,%zmm2
vpaddq %zmm13,%zmm14,%zmm3
vpsrlq $14,%zmm6,%zmm10
vpsrlq $26,%zmm0,%zmm11
vpandq %zmm5,%zmm0,%zmm0
vpaddq %zmm11,%zmm1,%zmm1
vpsrlq $40,%zmm6,%zmm6
vpsrlq $26,%zmm3,%zmm14
vpandq %zmm5,%zmm3,%zmm3
vpaddq %zmm14,%zmm4,%zmm4
vpandq %zmm5,%zmm7,%zmm7
subq $128,%rdx
ja .Loop_avx512
.Ltail_avx512:
vpsrlq $32,%zmm16,%zmm16
vpsrlq $32,%zmm17,%zmm17
vpsrlq $32,%zmm18,%zmm18
vpsrlq $32,%zmm23,%zmm23
vpsrlq $32,%zmm24,%zmm24
vpsrlq $32,%zmm19,%zmm19
vpsrlq $32,%zmm20,%zmm20
vpsrlq $32,%zmm21,%zmm21
vpsrlq $32,%zmm22,%zmm22
leaq (%rsi,%rdx,1),%rsi
vpaddq %zmm0,%zmm7,%zmm0
vpmuludq %zmm2,%zmm17,%zmm14
vpmuludq %zmm2,%zmm18,%zmm15
vpmuludq %zmm2,%zmm23,%zmm11
vpandq %zmm5,%zmm8,%zmm8
vpmuludq %zmm2,%zmm24,%zmm12
vpandq %zmm5,%zmm10,%zmm10
vpmuludq %zmm2,%zmm16,%zmm13
vporq %zmm30,%zmm6,%zmm6
vpaddq %zmm1,%zmm8,%zmm1
vpaddq %zmm3,%zmm10,%zmm3
vpaddq %zmm4,%zmm6,%zmm4
vmovdqu 0(%rsi),%xmm7
vpmuludq %zmm0,%zmm19,%zmm28
vpmuludq %zmm0,%zmm20,%zmm29
vpmuludq %zmm0,%zmm16,%zmm25
vpmuludq %zmm0,%zmm17,%zmm26
vpaddq %zmm28,%zmm14,%zmm14
vpaddq %zmm29,%zmm15,%zmm15
vpaddq %zmm25,%zmm11,%zmm11
vpaddq %zmm26,%zmm12,%zmm12
vmovdqu 16(%rsi),%xmm8
vpmuludq %zmm1,%zmm18,%zmm28
vpmuludq %zmm1,%zmm19,%zmm29
vpmuludq %zmm1,%zmm24,%zmm25
vpmuludq %zmm0,%zmm18,%zmm27
vpaddq %zmm28,%zmm14,%zmm14
vpaddq %zmm29,%zmm15,%zmm15
vpaddq %zmm25,%zmm11,%zmm11
vpaddq %zmm27,%zmm13,%zmm13
vinserti128 $1,32(%rsi),%ymm7,%ymm7
vpmuludq %zmm3,%zmm16,%zmm28
vpmuludq %zmm3,%zmm17,%zmm29
vpmuludq %zmm1,%zmm16,%zmm26
vpmuludq %zmm1,%zmm17,%zmm27
vpaddq %zmm28,%zmm14,%zmm14
vpaddq %zmm29,%zmm15,%zmm15
vpaddq %zmm26,%zmm12,%zmm12
vpaddq %zmm27,%zmm13,%zmm13
vinserti128 $1,48(%rsi),%ymm8,%ymm8
vpmuludq %zmm4,%zmm24,%zmm28
vpmuludq %zmm4,%zmm16,%zmm29
vpmuludq %zmm3,%zmm22,%zmm25
vpmuludq %zmm3,%zmm23,%zmm26
vpmuludq %zmm3,%zmm24,%zmm27
vpaddq %zmm28,%zmm14,%zmm3
vpaddq %zmm29,%zmm15,%zmm15
vpaddq %zmm25,%zmm11,%zmm11
vpaddq %zmm26,%zmm12,%zmm12
vpaddq %zmm27,%zmm13,%zmm13
vpmuludq %zmm4,%zmm21,%zmm25
vpmuludq %zmm4,%zmm22,%zmm26
vpmuludq %zmm4,%zmm23,%zmm27
vpaddq %zmm25,%zmm11,%zmm0
vpaddq %zmm26,%zmm12,%zmm1
vpaddq %zmm27,%zmm13,%zmm2
movl $1,%eax
vpermq $0xb1,%zmm3,%zmm14
vpermq $0xb1,%zmm15,%zmm4
vpermq $0xb1,%zmm0,%zmm11
vpermq $0xb1,%zmm1,%zmm12
vpermq $0xb1,%zmm2,%zmm13
vpaddq %zmm14,%zmm3,%zmm3
vpaddq %zmm15,%zmm4,%zmm4
vpaddq %zmm11,%zmm0,%zmm0
vpaddq %zmm12,%zmm1,%zmm1
vpaddq %zmm13,%zmm2,%zmm2
kmovw %eax,%k3
vpermq $0x2,%zmm3,%zmm14
vpermq $0x2,%zmm4,%zmm15
vpermq $0x2,%zmm0,%zmm11
vpermq $0x2,%zmm1,%zmm12
vpermq $0x2,%zmm2,%zmm13
vpaddq %zmm14,%zmm3,%zmm3
vpaddq %zmm15,%zmm4,%zmm4
vpaddq %zmm11,%zmm0,%zmm0
vpaddq %zmm12,%zmm1,%zmm1
vpaddq %zmm13,%zmm2,%zmm2
vextracti64x4 $0x1,%zmm3,%ymm14
vextracti64x4 $0x1,%zmm4,%ymm15
vextracti64x4 $0x1,%zmm0,%ymm11
vextracti64x4 $0x1,%zmm1,%ymm12
vextracti64x4 $0x1,%zmm2,%ymm13
vpaddq %zmm14,%zmm3,%zmm3{%k3}{z}
vpaddq %zmm15,%zmm4,%zmm4{%k3}{z}
vpaddq %zmm11,%zmm0,%zmm0{%k3}{z}
vpaddq %zmm12,%zmm1,%zmm1{%k3}{z}
vpaddq %zmm13,%zmm2,%zmm2{%k3}{z}
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpsrldq $6,%ymm7,%ymm9
vpsrldq $6,%ymm8,%ymm10
vpunpckhqdq %ymm8,%ymm7,%ymm6
vpaddq %ymm14,%ymm4,%ymm4
vpsrlq $26,%ymm0,%ymm11
vpand %ymm5,%ymm0,%ymm0
vpunpcklqdq %ymm10,%ymm9,%ymm9
vpunpcklqdq %ymm8,%ymm7,%ymm7
vpaddq %ymm11,%ymm1,%ymm1
vpsrlq $26,%ymm4,%ymm15
vpand %ymm5,%ymm4,%ymm4
vpsrlq $26,%ymm1,%ymm12
vpand %ymm5,%ymm1,%ymm1
vpsrlq $30,%ymm9,%ymm10
vpsrlq $4,%ymm9,%ymm9
vpaddq %ymm12,%ymm2,%ymm2
vpaddq %ymm15,%ymm0,%ymm0
vpsllq $2,%ymm15,%ymm15
vpsrlq $26,%ymm7,%ymm8
vpsrlq $40,%ymm6,%ymm6
vpaddq %ymm15,%ymm0,%ymm0
vpsrlq $26,%ymm2,%ymm13
vpand %ymm5,%ymm2,%ymm2
vpand %ymm5,%ymm9,%ymm9
vpand %ymm5,%ymm7,%ymm7
vpaddq %ymm13,%ymm3,%ymm3
vpsrlq $26,%ymm0,%ymm11
vpand %ymm5,%ymm0,%ymm0
vpaddq %ymm2,%ymm9,%ymm2
vpand %ymm5,%ymm8,%ymm8
vpaddq %ymm11,%ymm1,%ymm1
vpsrlq $26,%ymm3,%ymm14
vpand %ymm5,%ymm3,%ymm3
vpand %ymm5,%ymm10,%ymm10
vpor 32(%rcx),%ymm6,%ymm6
vpaddq %ymm14,%ymm4,%ymm4
leaq 144(%rsp),%rax
addq $64,%rdx
jnz .Ltail_avx2
vpsubq %ymm9,%ymm2,%ymm2
vmovd %xmm0,-112(%rdi)
vmovd %xmm1,-108(%rdi)
vmovd %xmm2,-104(%rdi)
vmovd %xmm3,-100(%rdi)
vmovd %xmm4,-96(%rdi)
vzeroall
leaq 8(%r11),%rsp
.cfi_def_cfa %rsp,8
.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_blocks_avx512,.-poly1305_blocks_avx512
.type poly1305_init_base2_44,@function
.align 32
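/*
 * Base 2^44 initialization for the AVX-512 IFMA (vpmadd52) path: clamps
 * r, splits it into 44/44/42-bit limbs at 40/48/56(%rdi), precomputes
 * 20*r1 and 20*r2 at 24/32(%rdi), marks the power table uninitialized
 * with 64(%rdi) = -1, and returns the vpmadd52 blocks/emit entry points
 * through %rdx.
 */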
poly1305_init_base2_44:
.cfi_startproc
xorq %rax,%rax
movq %rax,0(%rdi)
movq %rax,8(%rdi)
movq %rax,16(%rdi)
.Linit_base2_44:
leaq poly1305_blocks_vpmadd52(%rip),%r10
leaq poly1305_emit_base2_44(%rip),%r11
movq $0x0ffffffc0fffffff,%rax
movq $0x0ffffffc0ffffffc,%rcx
andq 0(%rsi),%rax
movq $0x00000fffffffffff,%r8
andq 8(%rsi),%rcx
movq $0x00000fffffffffff,%r9
andq %rax,%r8
shrdq $44,%rcx,%rax
movq %r8,40(%rdi)
andq %r9,%rax
shrq $24,%rcx
movq %rax,48(%rdi)
leaq (%rax,%rax,4),%rax
movq %rcx,56(%rdi)
shlq $2,%rax
leaq (%rcx,%rcx,4),%rcx
shlq $2,%rcx
movq %rax,24(%rdi)
movq %rcx,32(%rdi)
movq $-1,64(%rdi)
movq %r10,0(%rdx)
movq %r11,8(%rdx)
movl $1,%eax
.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_init_base2_44,.-poly1305_init_base2_44
.type poly1305_blocks_vpmadd52,@function
.align 32
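/*
 * IFMA straggler handler: processes up to three leading 16-byte blocks
 * one at a time with 256-bit vpmadd52 arithmetic on base 2^44 limbs so
 * that the remaining length is suitable for the wider code, then hands
 * the rest to .Lblocks_vpmadd52_4x.
 */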
poly1305_blocks_vpmadd52:
.cfi_startproc
.byte 243,15,30,250 /* endbr64 */
shrq $4,%rdx
jz .Lno_data_vpmadd52
shlq $40,%rcx
movq 64(%rdi),%r8
movq $3,%rax
movq $1,%r10
cmpq $4,%rdx
cmovaeq %r10,%rax
testq %r8,%r8
cmovnsq %r10,%rax
andq %rdx,%rax
jz .Lblocks_vpmadd52_4x
subq %rax,%rdx
movl $7,%r10d
movl $1,%r11d
kmovw %r10d,%k7
leaq .L2_44_inp_permd(%rip),%r10
kmovw %r11d,%k1
vmovq %rcx,%xmm21
vmovdqa64 0(%r10),%ymm19
vmovdqa64 32(%r10),%ymm20
vpermq $0xcf,%ymm21,%ymm21
vmovdqa64 64(%r10),%ymm22
vmovdqu64 0(%rdi),%ymm16{%k7}{z}
vmovdqu64 40(%rdi),%ymm3{%k7}{z}
vmovdqu64 32(%rdi),%ymm4{%k7}{z}
vmovdqu64 24(%rdi),%ymm5{%k7}{z}
vmovdqa64 96(%r10),%ymm23
vmovdqa64 128(%r10),%ymm24
jmp .Loop_vpmadd52
.align 32
.Loop_vpmadd52:
vmovdqu32 0(%rsi),%xmm18
leaq 16(%rsi),%rsi
vpermd %ymm18,%ymm19,%ymm18
vpsrlvq %ymm20,%ymm18,%ymm18
vpandq %ymm22,%ymm18,%ymm18
vporq %ymm21,%ymm18,%ymm18
vpaddq %ymm18,%ymm16,%ymm16
vpermq $0,%ymm16,%ymm0{%k7}{z}
vpermq $85,%ymm16,%ymm1{%k7}{z}
vpermq $170,%ymm16,%ymm2{%k7}{z}
vpxord %ymm16,%ymm16,%ymm16
vpxord %ymm17,%ymm17,%ymm17
vpmadd52luq %ymm3,%ymm0,%ymm16
vpmadd52huq %ymm3,%ymm0,%ymm17
vpmadd52luq %ymm4,%ymm1,%ymm16
vpmadd52huq %ymm4,%ymm1,%ymm17
vpmadd52luq %ymm5,%ymm2,%ymm16
vpmadd52huq %ymm5,%ymm2,%ymm17
vpsrlvq %ymm23,%ymm16,%ymm18
vpsllvq %ymm24,%ymm17,%ymm17
vpandq %ymm22,%ymm16,%ymm16
vpaddq %ymm18,%ymm17,%ymm17
vpermq $147,%ymm17,%ymm17
vpaddq %ymm17,%ymm16,%ymm16
vpsrlvq %ymm23,%ymm16,%ymm18
vpandq %ymm22,%ymm16,%ymm16
vpermq $147,%ymm18,%ymm18
vpaddq %ymm18,%ymm16,%ymm16
vpermq $147,%ymm16,%ymm18{%k1}{z}
vpaddq %ymm18,%ymm16,%ymm16
vpsllq $2,%ymm18,%ymm18
vpaddq %ymm18,%ymm16,%ymm16
decq %rax
jnz .Loop_vpmadd52
vmovdqu64 %ymm16,0(%rdi){%k7}
testq %rdx,%rdx
jnz .Lblocks_vpmadd52_4x
.Lno_data_vpmadd52:
.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
.type poly1305_blocks_vpmadd52_4x,@function
.align 32
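/*
 * IFMA 4x path: on first use (.Linit_vpmadd52) computes and caches the
 * base 2^44 power table at 64..191(%rdi), then processes four blocks per
 * .Loop_vpmadd52_4x iteration (two in the 2x variant) using
 * vpmadd52luq/vpmadd52huq accumulation.
 */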
poly1305_blocks_vpmadd52_4x:
.cfi_startproc
shrq $4,%rdx
jz .Lno_data_vpmadd52_4x
shlq $40,%rcx
movq 64(%rdi),%r8
.Lblocks_vpmadd52_4x:
vpbroadcastq %rcx,%ymm31
vmovdqa64 .Lx_mask44(%rip),%ymm28
movl $5,%eax
vmovdqa64 .Lx_mask42(%rip),%ymm29
kmovw %eax,%k1
testq %r8,%r8
js .Linit_vpmadd52
vmovq 0(%rdi),%xmm0
vmovq 8(%rdi),%xmm1
vmovq 16(%rdi),%xmm2
testq $3,%rdx
jnz .Lblocks_vpmadd52_2x_do
.Lblocks_vpmadd52_4x_do:
vpbroadcastq 64(%rdi),%ymm3
vpbroadcastq 96(%rdi),%ymm4
vpbroadcastq 128(%rdi),%ymm5
vpbroadcastq 160(%rdi),%ymm16
.Lblocks_vpmadd52_4x_key_loaded:
vpsllq $2,%ymm5,%ymm17
vpaddq %ymm5,%ymm17,%ymm17
vpsllq $2,%ymm17,%ymm17
testq $7,%rdx
jz .Lblocks_vpmadd52_8x
vmovdqu64 0(%rsi),%ymm26
vmovdqu64 32(%rsi),%ymm27
leaq 64(%rsi),%rsi
vpunpcklqdq %ymm27,%ymm26,%ymm25
vpunpckhqdq %ymm27,%ymm26,%ymm27
vpsrlq $24,%ymm27,%ymm26
vporq %ymm31,%ymm26,%ymm26
vpaddq %ymm26,%ymm2,%ymm2
vpandq %ymm28,%ymm25,%ymm24
vpsrlq $44,%ymm25,%ymm25
vpsllq $20,%ymm27,%ymm27
vporq %ymm27,%ymm25,%ymm25
vpandq %ymm28,%ymm25,%ymm25
subq $4,%rdx
jz .Ltail_vpmadd52_4x
jmp .Loop_vpmadd52_4x
ud2
.align 32
.Linit_vpmadd52:
vmovq 24(%rdi),%xmm16
vmovq 56(%rdi),%xmm2
vmovq 32(%rdi),%xmm17
vmovq 40(%rdi),%xmm3
vmovq 48(%rdi),%xmm4
vmovdqa %ymm3,%ymm0
vmovdqa %ymm4,%ymm1
vmovdqa %ymm2,%ymm5
movl $2,%eax
.Lmul_init_vpmadd52:
vpxorq %ymm18,%ymm18,%ymm18
vpmadd52luq %ymm2,%ymm16,%ymm18
vpxorq %ymm19,%ymm19,%ymm19
vpmadd52huq %ymm2,%ymm16,%ymm19
vpxorq %ymm20,%ymm20,%ymm20
vpmadd52luq %ymm2,%ymm17,%ymm20
vpxorq %ymm21,%ymm21,%ymm21
vpmadd52huq %ymm2,%ymm17,%ymm21
vpxorq %ymm22,%ymm22,%ymm22
vpmadd52luq %ymm2,%ymm3,%ymm22
vpxorq %ymm23,%ymm23,%ymm23
vpmadd52huq %ymm2,%ymm3,%ymm23
vpmadd52luq %ymm0,%ymm3,%ymm18
vpmadd52huq %ymm0,%ymm3,%ymm19
vpmadd52luq %ymm0,%ymm4,%ymm20
vpmadd52huq %ymm0,%ymm4,%ymm21
vpmadd52luq %ymm0,%ymm5,%ymm22
vpmadd52huq %ymm0,%ymm5,%ymm23
vpmadd52luq %ymm1,%ymm17,%ymm18
vpmadd52huq %ymm1,%ymm17,%ymm19
vpmadd52luq %ymm1,%ymm3,%ymm20
vpmadd52huq %ymm1,%ymm3,%ymm21
vpmadd52luq %ymm1,%ymm4,%ymm22
vpmadd52huq %ymm1,%ymm4,%ymm23
vpsrlq $44,%ymm18,%ymm30
vpsllq $8,%ymm19,%ymm19
vpandq %ymm28,%ymm18,%ymm0
vpaddq %ymm30,%ymm19,%ymm19
vpaddq %ymm19,%ymm20,%ymm20
vpsrlq $44,%ymm20,%ymm30
vpsllq $8,%ymm21,%ymm21
vpandq %ymm28,%ymm20,%ymm1
vpaddq %ymm30,%ymm21,%ymm21
vpaddq %ymm21,%ymm22,%ymm22
vpsrlq $42,%ymm22,%ymm30
vpsllq $10,%ymm23,%ymm23
vpandq %ymm29,%ymm22,%ymm2
vpaddq %ymm30,%ymm23,%ymm23
vpaddq %ymm23,%ymm0,%ymm0
vpsllq $2,%ymm23,%ymm23
vpaddq %ymm23,%ymm0,%ymm0
vpsrlq $44,%ymm0,%ymm30
vpandq %ymm28,%ymm0,%ymm0
vpaddq %ymm30,%ymm1,%ymm1
decl %eax
jz .Ldone_init_vpmadd52
vpunpcklqdq %ymm4,%ymm1,%ymm4
vpbroadcastq %xmm1,%xmm1
vpunpcklqdq %ymm5,%ymm2,%ymm5
vpbroadcastq %xmm2,%xmm2
vpunpcklqdq %ymm3,%ymm0,%ymm3
vpbroadcastq %xmm0,%xmm0
vpsllq $2,%ymm4,%ymm16
vpsllq $2,%ymm5,%ymm17
vpaddq %ymm4,%ymm16,%ymm16
vpaddq %ymm5,%ymm17,%ymm17
vpsllq $2,%ymm16,%ymm16
vpsllq $2,%ymm17,%ymm17
jmp .Lmul_init_vpmadd52
ud2
.align 32
.Ldone_init_vpmadd52:
vinserti128 $1,%xmm4,%ymm1,%ymm4
vinserti128 $1,%xmm5,%ymm2,%ymm5
vinserti128 $1,%xmm3,%ymm0,%ymm3
vpermq $216,%ymm4,%ymm4
vpermq $216,%ymm5,%ymm5
vpermq $216,%ymm3,%ymm3
vpsllq $2,%ymm4,%ymm16
vpaddq %ymm4,%ymm16,%ymm16
vpsllq $2,%ymm16,%ymm16
vmovq 0(%rdi),%xmm0
vmovq 8(%rdi),%xmm1
vmovq 16(%rdi),%xmm2
testq $3,%rdx
jnz .Ldone_init_vpmadd52_2x
vmovdqu64 %ymm3,64(%rdi)
vpbroadcastq %xmm3,%ymm3
vmovdqu64 %ymm4,96(%rdi)
vpbroadcastq %xmm4,%ymm4
vmovdqu64 %ymm5,128(%rdi)
vpbroadcastq %xmm5,%ymm5
vmovdqu64 %ymm16,160(%rdi)
vpbroadcastq %xmm16,%ymm16
jmp .Lblocks_vpmadd52_4x_key_loaded
ud2
.align 32
.Ldone_init_vpmadd52_2x:
vmovdqu64 %ymm3,64(%rdi)
vpsrldq $8,%ymm3,%ymm3
vmovdqu64 %ymm4,96(%rdi)
vpsrldq $8,%ymm4,%ymm4
vmovdqu64 %ymm5,128(%rdi)
vpsrldq $8,%ymm5,%ymm5
vmovdqu64 %ymm16,160(%rdi)
vpsrldq $8,%ymm16,%ymm16
jmp .Lblocks_vpmadd52_2x_key_loaded
ud2
.align 32
.Lblocks_vpmadd52_2x_do:
vmovdqu64 128+8(%rdi),%ymm5{%k1}{z}
vmovdqu64 160+8(%rdi),%ymm16{%k1}{z}
vmovdqu64 64+8(%rdi),%ymm3{%k1}{z}
vmovdqu64 96+8(%rdi),%ymm4{%k1}{z}
.Lblocks_vpmadd52_2x_key_loaded:
vmovdqu64 0(%rsi),%ymm26
vpxorq %ymm27,%ymm27,%ymm27
leaq 32(%rsi),%rsi
vpunpcklqdq %ymm27,%ymm26,%ymm25
vpunpckhqdq %ymm27,%ymm26,%ymm27
vpsrlq $24,%ymm27,%ymm26
vporq %ymm31,%ymm26,%ymm26
vpaddq %ymm26,%ymm2,%ymm2
vpandq %ymm28,%ymm25,%ymm24
vpsrlq $44,%ymm25,%ymm25
vpsllq $20,%ymm27,%ymm27
vporq %ymm27,%ymm25,%ymm25
vpandq %ymm28,%ymm25,%ymm25
jmp .Ltail_vpmadd52_2x
ud2
.align 32
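/*
 * IFMA multiply step: accumulate the low and high 52-bit product halves
 * with vpmadd52luq/vpmadd52huq, then normalize with the 44/44/42-bit
 * carry chain, folding the 2^130 overflow back in as *5 (shift left by
 * 2 plus add).
 */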
.Loop_vpmadd52_4x:
vpaddq %ymm24,%ymm0,%ymm0
vpaddq %ymm25,%ymm1,%ymm1
vpxorq %ymm18,%ymm18,%ymm18
vpmadd52luq %ymm2,%ymm16,%ymm18
vpxorq %ymm19,%ymm19,%ymm19
vpmadd52huq %ymm2,%ymm16,%ymm19
vpxorq %ymm20,%ymm20,%ymm20
vpmadd52luq %ymm2,%ymm17,%ymm20
vpxorq %ymm21,%ymm21,%ymm21
vpmadd52huq %ymm2,%ymm17,%ymm21
vpxorq %ymm22,%ymm22,%ymm22
vpmadd52luq %ymm2,%ymm3,%ymm22
vpxorq %ymm23,%ymm23,%ymm23
vpmadd52huq %ymm2,%ymm3,%ymm23
vmovdqu64 0(%rsi),%ymm26
vmovdqu64 32(%rsi),%ymm27
leaq 64(%rsi),%rsi
vpmadd52luq %ymm0,%ymm3,%ymm18
vpmadd52huq %ymm0,%ymm3,%ymm19
vpmadd52luq %ymm0,%ymm4,%ymm20
vpmadd52huq %ymm0,%ymm4,%ymm21
vpmadd52luq %ymm0,%ymm5,%ymm22
vpmadd52huq %ymm0,%ymm5,%ymm23
vpunpcklqdq %ymm27,%ymm26,%ymm25
vpunpckhqdq %ymm27,%ymm26,%ymm27
vpmadd52luq %ymm1,%ymm17,%ymm18
vpmadd52huq %ymm1,%ymm17,%ymm19
vpmadd52luq %ymm1,%ymm3,%ymm20
vpmadd52huq %ymm1,%ymm3,%ymm21
vpmadd52luq %ymm1,%ymm4,%ymm22
vpmadd52huq %ymm1,%ymm4,%ymm23
vpsrlq $44,%ymm18,%ymm30
vpsllq $8,%ymm19,%ymm19
vpandq %ymm28,%ymm18,%ymm0
vpaddq %ymm30,%ymm19,%ymm19
vpsrlq $24,%ymm27,%ymm26
vporq %ymm31,%ymm26,%ymm26
vpaddq %ymm19,%ymm20,%ymm20
vpsrlq $44,%ymm20,%ymm30
vpsllq $8,%ymm21,%ymm21
vpandq %ymm28,%ymm20,%ymm1
vpaddq %ymm30,%ymm21,%ymm21
vpandq %ymm28,%ymm25,%ymm24
vpsrlq $44,%ymm25,%ymm25
vpsllq $20,%ymm27,%ymm27
vpaddq %ymm21,%ymm22,%ymm22
vpsrlq $42,%ymm22,%ymm30
vpsllq $10,%ymm23,%ymm23
vpandq %ymm29,%ymm22,%ymm2
vpaddq %ymm30,%ymm23,%ymm23
vpaddq %ymm26,%ymm2,%ymm2
vpaddq %ymm23,%ymm0,%ymm0
vpsllq $2,%ymm23,%ymm23
vpaddq %ymm23,%ymm0,%ymm0
vporq %ymm27,%ymm25,%ymm25
vpandq %ymm28,%ymm25,%ymm25
vpsrlq $44,%ymm0,%ymm30
vpandq %ymm28,%ymm0,%ymm0
vpaddq %ymm30,%ymm1,%ymm1
subq $4,%rdx
jnz .Loop_vpmadd52_4x
.Ltail_vpmadd52_4x:
vmovdqu64 128(%rdi),%ymm5
vmovdqu64 160(%rdi),%ymm16
vmovdqu64 64(%rdi),%ymm3
vmovdqu64 96(%rdi),%ymm4
.Ltail_vpmadd52_2x:
vpsllq $2,%ymm5,%ymm17
vpaddq %ymm5,%ymm17,%ymm17
vpsllq $2,%ymm17,%ymm17
vpaddq %ymm24,%ymm0,%ymm0
vpaddq %ymm25,%ymm1,%ymm1
vpxorq %ymm18,%ymm18,%ymm18
vpmadd52luq %ymm2,%ymm16,%ymm18
vpxorq %ymm19,%ymm19,%ymm19
vpmadd52huq %ymm2,%ymm16,%ymm19
vpxorq %ymm20,%ymm20,%ymm20
vpmadd52luq %ymm2,%ymm17,%ymm20
vpxorq %ymm21,%ymm21,%ymm21
vpmadd52huq %ymm2,%ymm17,%ymm21
vpxorq %ymm22,%ymm22,%ymm22
vpmadd52luq %ymm2,%ymm3,%ymm22
vpxorq %ymm23,%ymm23,%ymm23
vpmadd52huq %ymm2,%ymm3,%ymm23
vpmadd52luq %ymm0,%ymm3,%ymm18
vpmadd52huq %ymm0,%ymm3,%ymm19
vpmadd52luq %ymm0,%ymm4,%ymm20
vpmadd52huq %ymm0,%ymm4,%ymm21
vpmadd52luq %ymm0,%ymm5,%ymm22
vpmadd52huq %ymm0,%ymm5,%ymm23
vpmadd52luq %ymm1,%ymm17,%ymm18
vpmadd52huq %ymm1,%ymm17,%ymm19
vpmadd52luq %ymm1,%ymm3,%ymm20
vpmadd52huq %ymm1,%ymm3,%ymm21
vpmadd52luq %ymm1,%ymm4,%ymm22
vpmadd52huq %ymm1,%ymm4,%ymm23
movl $1,%eax
kmovw %eax,%k1
vpsrldq $8,%ymm18,%ymm24
vpsrldq $8,%ymm19,%ymm0
vpsrldq $8,%ymm20,%ymm25
vpsrldq $8,%ymm21,%ymm1
vpaddq %ymm24,%ymm18,%ymm18
vpaddq %ymm0,%ymm19,%ymm19
vpsrldq $8,%ymm22,%ymm26
vpsrldq $8,%ymm23,%ymm2
vpaddq %ymm25,%ymm20,%ymm20
vpaddq %ymm1,%ymm21,%ymm21
vpermq $0x2,%ymm18,%ymm24
vpermq $0x2,%ymm19,%ymm0
vpaddq %ymm26,%ymm22,%ymm22
vpaddq %ymm2,%ymm23,%ymm23
vpermq $0x2,%ymm20,%ymm25
vpermq $0x2,%ymm21,%ymm1
vpaddq %ymm24,%ymm18,%ymm18{%k1}{z}
vpaddq %ymm0,%ymm19,%ymm19{%k1}{z}
vpermq $0x2,%ymm22,%ymm26
vpermq $0x2,%ymm23,%ymm2
vpaddq %ymm25,%ymm20,%ymm20{%k1}{z}
vpaddq %ymm1,%ymm21,%ymm21{%k1}{z}
vpaddq %ymm26,%ymm22,%ymm22{%k1}{z}
vpaddq %ymm2,%ymm23,%ymm23{%k1}{z}
vpsrlq $44,%ymm18,%ymm30
vpsllq $8,%ymm19,%ymm19
vpandq %ymm28,%ymm18,%ymm0
vpaddq %ymm30,%ymm19,%ymm19
vpaddq %ymm19,%ymm20,%ymm20
vpsrlq $44,%ymm20,%ymm30
vpsllq $8,%ymm21,%ymm21
vpandq %ymm28,%ymm20,%ymm1
vpaddq %ymm30,%ymm21,%ymm21
vpaddq %ymm21,%ymm22,%ymm22
vpsrlq $42,%ymm22,%ymm30
vpsllq $10,%ymm23,%ymm23
vpandq %ymm29,%ymm22,%ymm2
vpaddq %ymm30,%ymm23,%ymm23
vpaddq %ymm23,%ymm0,%ymm0
vpsllq $2,%ymm23,%ymm23
vpaddq %ymm23,%ymm0,%ymm0
vpsrlq $44,%ymm0,%ymm30
vpandq %ymm28,%ymm0,%ymm0
vpaddq %ymm30,%ymm1,%ymm1
subq $2,%rdx
ja .Lblocks_vpmadd52_4x_do
vmovq %xmm0,0(%rdi)
vmovq %xmm1,8(%rdi)
vmovq %xmm2,16(%rdi)
vzeroall
.Lno_data_vpmadd52_4x:
.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
.type poly1305_blocks_vpmadd52_8x,@function
.align 32
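/*
 * IFMA 8x path: multiplies the cached r^1..r^4 table by r^4 to extend it
 * to r^5..r^8, broadcasts the needed powers into zmm registers, and
 * processes eight blocks (128 bytes) per .Loop_vpmadd52_8x iteration.
 */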
poly1305_blocks_vpmadd52_8x:
.cfi_startproc
shrq $4,%rdx
jz .Lno_data_vpmadd52_8x
shlq $40,%rcx
movq 64(%rdi),%r8
vmovdqa64 .Lx_mask44(%rip),%ymm28
vmovdqa64 .Lx_mask42(%rip),%ymm29
testq %r8,%r8
js .Linit_vpmadd52
vmovq 0(%rdi),%xmm0
vmovq 8(%rdi),%xmm1
vmovq 16(%rdi),%xmm2
.Lblocks_vpmadd52_8x:
vmovdqu64 128(%rdi),%ymm5
vmovdqu64 160(%rdi),%ymm16
vmovdqu64 64(%rdi),%ymm3
vmovdqu64 96(%rdi),%ymm4
vpsllq $2,%ymm5,%ymm17
vpaddq %ymm5,%ymm17,%ymm17
vpsllq $2,%ymm17,%ymm17
vpbroadcastq %xmm5,%ymm8
vpbroadcastq %xmm3,%ymm6
vpbroadcastq %xmm4,%ymm7
vpxorq %ymm18,%ymm18,%ymm18
vpmadd52luq %ymm8,%ymm16,%ymm18
vpxorq %ymm19,%ymm19,%ymm19
vpmadd52huq %ymm8,%ymm16,%ymm19
vpxorq %ymm20,%ymm20,%ymm20
vpmadd52luq %ymm8,%ymm17,%ymm20
vpxorq %ymm21,%ymm21,%ymm21
vpmadd52huq %ymm8,%ymm17,%ymm21
vpxorq %ymm22,%ymm22,%ymm22
vpmadd52luq %ymm8,%ymm3,%ymm22
vpxorq %ymm23,%ymm23,%ymm23
vpmadd52huq %ymm8,%ymm3,%ymm23
vpmadd52luq %ymm6,%ymm3,%ymm18
vpmadd52huq %ymm6,%ymm3,%ymm19
vpmadd52luq %ymm6,%ymm4,%ymm20
vpmadd52huq %ymm6,%ymm4,%ymm21
vpmadd52luq %ymm6,%ymm5,%ymm22
vpmadd52huq %ymm6,%ymm5,%ymm23
vpmadd52luq %ymm7,%ymm17,%ymm18
vpmadd52huq %ymm7,%ymm17,%ymm19
vpmadd52luq %ymm7,%ymm3,%ymm20
vpmadd52huq %ymm7,%ymm3,%ymm21
vpmadd52luq %ymm7,%ymm4,%ymm22
vpmadd52huq %ymm7,%ymm4,%ymm23
vpsrlq $44,%ymm18,%ymm30
vpsllq $8,%ymm19,%ymm19
vpandq %ymm28,%ymm18,%ymm6
vpaddq %ymm30,%ymm19,%ymm19
vpaddq %ymm19,%ymm20,%ymm20
vpsrlq $44,%ymm20,%ymm30
vpsllq $8,%ymm21,%ymm21
vpandq %ymm28,%ymm20,%ymm7
vpaddq %ymm30,%ymm21,%ymm21
vpaddq %ymm21,%ymm22,%ymm22
vpsrlq $42,%ymm22,%ymm30
vpsllq $10,%ymm23,%ymm23
vpandq %ymm29,%ymm22,%ymm8
vpaddq %ymm30,%ymm23,%ymm23
vpaddq %ymm23,%ymm6,%ymm6
vpsllq $2,%ymm23,%ymm23
vpaddq %ymm23,%ymm6,%ymm6
vpsrlq $44,%ymm6,%ymm30
vpandq %ymm28,%ymm6,%ymm6
vpaddq %ymm30,%ymm7,%ymm7
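# Interleave the two sets of powers and widen to zmm: zmm6, zmm7 and zmm8
# end up holding limb 0, 1 and 2 of all eight powers, one power per lane.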
vpunpcklqdq %ymm5,%ymm8,%ymm26
vpunpckhqdq %ymm5,%ymm8,%ymm5
vpunpcklqdq %ymm3,%ymm6,%ymm24
vpunpckhqdq %ymm3,%ymm6,%ymm3
vpunpcklqdq %ymm4,%ymm7,%ymm25
vpunpckhqdq %ymm4,%ymm7,%ymm4
vshufi64x2 $0x44,%zmm5,%zmm26,%zmm8
vshufi64x2 $0x44,%zmm3,%zmm24,%zmm6
vshufi64x2 $0x44,%zmm4,%zmm25,%zmm7
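# Load the first eight blocks, precompute 20*r1 and 20*r2 for the full
# power table (used in the tail), widen the pad bit and the masks to zmm,
# and broadcast the limbs of the highest power r^8 for the main loop, in
# which every lane advances by eight blocks per iteration.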
vmovdqu64 0(%rsi),%zmm26
vmovdqu64 64(%rsi),%zmm27
leaq 128(%rsi),%rsi
vpsllq $2,%zmm8,%zmm10
vpsllq $2,%zmm7,%zmm9
vpaddq %zmm8,%zmm10,%zmm10
vpaddq %zmm7,%zmm9,%zmm9
vpsllq $2,%zmm10,%zmm10
vpsllq $2,%zmm9,%zmm9
vpbroadcastq %rcx,%zmm31
vpbroadcastq %xmm28,%zmm28
vpbroadcastq %xmm29,%zmm29
vpbroadcastq %xmm9,%zmm16
vpbroadcastq %xmm10,%zmm17
vpbroadcastq %xmm6,%zmm3
vpbroadcastq %xmm7,%zmm4
vpbroadcastq %xmm8,%zmm5
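# Split the eight 16-byte blocks into base 2^44 limbs: bits 0-43, 44-87 and
# 88-127 plus the pad bit on top.  The high limbs are added into h2 right
# away; the two low limbs are added at the top of the loop.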
vpunpcklqdq %zmm27,%zmm26,%zmm25
vpunpckhqdq %zmm27,%zmm26,%zmm27
vpsrlq $24,%zmm27,%zmm26
vporq %zmm31,%zmm26,%zmm26
vpaddq %zmm26,%zmm2,%zmm2
vpandq %zmm28,%zmm25,%zmm24
vpsrlq $44,%zmm25,%zmm25
vpsllq $20,%zmm27,%zmm27
vporq %zmm27,%zmm25,%zmm25
vpandq %zmm28,%zmm25,%zmm25
subq $8,%rdx
jz .Ltail_vpmadd52_8x
jmp .Loop_vpmadd52_8x
.align 32
.Loop_vpmadd52_8x:
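# Main loop: accumulate the message limbs into h, multiply all eight lanes
# by r^8 with VPMADD52, and interleave the load/split of the next eight
# blocks with the partial 44/44/42-bit reduction.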
vpaddq %zmm24,%zmm0,%zmm0
vpaddq %zmm25,%zmm1,%zmm1
vpxorq %zmm18,%zmm18,%zmm18
vpmadd52luq %zmm2,%zmm16,%zmm18
vpxorq %zmm19,%zmm19,%zmm19
vpmadd52huq %zmm2,%zmm16,%zmm19
vpxorq %zmm20,%zmm20,%zmm20
vpmadd52luq %zmm2,%zmm17,%zmm20
vpxorq %zmm21,%zmm21,%zmm21
vpmadd52huq %zmm2,%zmm17,%zmm21
vpxorq %zmm22,%zmm22,%zmm22
vpmadd52luq %zmm2,%zmm3,%zmm22
vpxorq %zmm23,%zmm23,%zmm23
vpmadd52huq %zmm2,%zmm3,%zmm23
vmovdqu64 0(%rsi),%zmm26
vmovdqu64 64(%rsi),%zmm27
leaq 128(%rsi),%rsi
vpmadd52luq %zmm0,%zmm3,%zmm18
vpmadd52huq %zmm0,%zmm3,%zmm19
vpmadd52luq %zmm0,%zmm4,%zmm20
vpmadd52huq %zmm0,%zmm4,%zmm21
vpmadd52luq %zmm0,%zmm5,%zmm22
vpmadd52huq %zmm0,%zmm5,%zmm23
vpunpcklqdq %zmm27,%zmm26,%zmm25
vpunpckhqdq %zmm27,%zmm26,%zmm27
vpmadd52luq %zmm1,%zmm17,%zmm18
vpmadd52huq %zmm1,%zmm17,%zmm19
vpmadd52luq %zmm1,%zmm3,%zmm20
vpmadd52huq %zmm1,%zmm3,%zmm21
vpmadd52luq %zmm1,%zmm4,%zmm22
vpmadd52huq %zmm1,%zmm4,%zmm23
vpsrlq $44,%zmm18,%zmm30
vpsllq $8,%zmm19,%zmm19
vpandq %zmm28,%zmm18,%zmm0
vpaddq %zmm30,%zmm19,%zmm19
vpsrlq $24,%zmm27,%zmm26
vporq %zmm31,%zmm26,%zmm26
vpaddq %zmm19,%zmm20,%zmm20
vpsrlq $44,%zmm20,%zmm30
vpsllq $8,%zmm21,%zmm21
vpandq %zmm28,%zmm20,%zmm1
vpaddq %zmm30,%zmm21,%zmm21
vpandq %zmm28,%zmm25,%zmm24
vpsrlq $44,%zmm25,%zmm25
vpsllq $20,%zmm27,%zmm27
vpaddq %zmm21,%zmm22,%zmm22
vpsrlq $42,%zmm22,%zmm30
vpsllq $10,%zmm23,%zmm23
vpandq %zmm29,%zmm22,%zmm2
vpaddq %zmm30,%zmm23,%zmm23
vpaddq %zmm26,%zmm2,%zmm2
vpaddq %zmm23,%zmm0,%zmm0
vpsllq $2,%zmm23,%zmm23
vpaddq %zmm23,%zmm0,%zmm0
vporq %zmm27,%zmm25,%zmm25
vpandq %zmm28,%zmm25,%zmm25
vpsrlq $44,%zmm0,%zmm30
vpandq %zmm28,%zmm0,%zmm0
vpaddq %zmm30,%zmm1,%zmm1
subq $8,%rdx
jnz .Loop_vpmadd52_8x
.Ltail_vpmadd52_8x:
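# Tail: h already contains the last batch of high limbs; add the remaining
# low limbs and multiply each lane by its own power of r (r^1 through r^8)
# so that summing the lanes yields the scalar hash.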
vpaddq %zmm24,%zmm0,%zmm0
vpaddq %zmm25,%zmm1,%zmm1
vpxorq %zmm18,%zmm18,%zmm18
vpmadd52luq %zmm2,%zmm9,%zmm18
vpxorq %zmm19,%zmm19,%zmm19
vpmadd52huq %zmm2,%zmm9,%zmm19
vpxorq %zmm20,%zmm20,%zmm20
vpmadd52luq %zmm2,%zmm10,%zmm20
vpxorq %zmm21,%zmm21,%zmm21
vpmadd52huq %zmm2,%zmm10,%zmm21
vpxorq %zmm22,%zmm22,%zmm22
vpmadd52luq %zmm2,%zmm6,%zmm22
vpxorq %zmm23,%zmm23,%zmm23
vpmadd52huq %zmm2,%zmm6,%zmm23
vpmadd52luq %zmm0,%zmm6,%zmm18
vpmadd52huq %zmm0,%zmm6,%zmm19
vpmadd52luq %zmm0,%zmm7,%zmm20
vpmadd52huq %zmm0,%zmm7,%zmm21
vpmadd52luq %zmm0,%zmm8,%zmm22
vpmadd52huq %zmm0,%zmm8,%zmm23
vpmadd52luq %zmm1,%zmm10,%zmm18
vpmadd52huq %zmm1,%zmm10,%zmm19
vpmadd52luq %zmm1,%zmm6,%zmm20
vpmadd52huq %zmm1,%zmm6,%zmm21
vpmadd52luq %zmm1,%zmm7,%zmm22
vpmadd52huq %zmm1,%zmm7,%zmm23
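# Horizontal reduction: fold the eight 64-bit lanes of every accumulator
# into lane 0 (k1 keeps only the bottom qword), then run the usual
# 44/44/42-bit carry chain and store the three hash limbs.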
movl $1,%eax
kmovw %eax,%k1
vpsrldq $8,%zmm18,%zmm24
vpsrldq $8,%zmm19,%zmm0
vpsrldq $8,%zmm20,%zmm25
vpsrldq $8,%zmm21,%zmm1
vpaddq %zmm24,%zmm18,%zmm18
vpaddq %zmm0,%zmm19,%zmm19
vpsrldq $8,%zmm22,%zmm26
vpsrldq $8,%zmm23,%zmm2
vpaddq %zmm25,%zmm20,%zmm20
vpaddq %zmm1,%zmm21,%zmm21
vpermq $0x2,%zmm18,%zmm24
vpermq $0x2,%zmm19,%zmm0
vpaddq %zmm26,%zmm22,%zmm22
vpaddq %zmm2,%zmm23,%zmm23
vpermq $0x2,%zmm20,%zmm25
vpermq $0x2,%zmm21,%zmm1
vpaddq %zmm24,%zmm18,%zmm18
vpaddq %zmm0,%zmm19,%zmm19
vpermq $0x2,%zmm22,%zmm26
vpermq $0x2,%zmm23,%zmm2
vpaddq %zmm25,%zmm20,%zmm20
vpaddq %zmm1,%zmm21,%zmm21
vextracti64x4 $1,%zmm18,%ymm24
vextracti64x4 $1,%zmm19,%ymm0
vpaddq %zmm26,%zmm22,%zmm22
vpaddq %zmm2,%zmm23,%zmm23
vextracti64x4 $1,%zmm20,%ymm25
vextracti64x4 $1,%zmm21,%ymm1
vextracti64x4 $1,%zmm22,%ymm26
vextracti64x4 $1,%zmm23,%ymm2
vpaddq %ymm24,%ymm18,%ymm18{%k1}{z}
vpaddq %ymm0,%ymm19,%ymm19{%k1}{z}
vpaddq %ymm25,%ymm20,%ymm20{%k1}{z}
vpaddq %ymm1,%ymm21,%ymm21{%k1}{z}
vpaddq %ymm26,%ymm22,%ymm22{%k1}{z}
vpaddq %ymm2,%ymm23,%ymm23{%k1}{z}
vpsrlq $44,%ymm18,%ymm30
vpsllq $8,%ymm19,%ymm19
vpandq %ymm28,%ymm18,%ymm0
vpaddq %ymm30,%ymm19,%ymm19
vpaddq %ymm19,%ymm20,%ymm20
vpsrlq $44,%ymm20,%ymm30
vpsllq $8,%ymm21,%ymm21
vpandq %ymm28,%ymm20,%ymm1
vpaddq %ymm30,%ymm21,%ymm21
vpaddq %ymm21,%ymm22,%ymm22
vpsrlq $42,%ymm22,%ymm30
vpsllq $10,%ymm23,%ymm23
vpandq %ymm29,%ymm22,%ymm2
vpaddq %ymm30,%ymm23,%ymm23
vpaddq %ymm23,%ymm0,%ymm0
vpsllq $2,%ymm23,%ymm23
vpaddq %ymm23,%ymm0,%ymm0
vpsrlq $44,%ymm0,%ymm30
vpandq %ymm28,%ymm0,%ymm0
vpaddq %ymm30,%ymm1,%ymm1
vmovq %xmm0,0(%rdi)
vmovq %xmm1,8(%rdi)
vmovq %xmm2,16(%rdi)
vzeroall
.Lno_data_vpmadd52_8x:
.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
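# poly1305_emit_base2_44(ctx=%rdi, mac=%rsi, nonce=%rdx): convert the hash
# from base 2^44 back to two 64-bit words plus a 2-bit top, do the final
# reduction modulo 2^130-5, add the 128-bit nonce and store the 16-byte tag.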
.type poly1305_emit_base2_44,@function
.align 32
poly1305_emit_base2_44:
.cfi_startproc
.byte 243,15,30,250 # endbr64
movq 0(%rdi),%r8
movq 8(%rdi),%r9
movq 16(%rdi),%r10
movq %r9,%rax
shrq $20,%r9
shlq $44,%rax
movq %r10,%rcx
shrq $40,%r10
shlq $24,%rcx
addq %rax,%r8
adcq %rcx,%r9
adcq $0,%r10
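# Final reduction: compute h+5 and keep it (i.e. h-(2^130-5)) only if the
# sum reaches 2^130; only the low 128 bits are kept for the tag.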
movq %r8,%rax
addq $5,%r8
movq %r9,%rcx
adcq $0,%r9
adcq $0,%r10
shrq $2,%r10
cmovnzq %r8,%rax
cmovnzq %r9,%rcx
addq 0(%rdx),%rax
adcq 8(%rdx),%rcx
movq %rax,0(%rsi)
movq %rcx,8(%rsi)
.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_emit_base2_44,.-poly1305_emit_base2_44
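# Constant pool: masks and vpermd indices for the base 2^26 AVX/AVX2/AVX-512
# code, followed by input-splitting tables and the 44- and 42-bit masks used
# by the base 2^44 (VPMADD52) code, which keeps h = h0 + h1*2^44 + h2*2^88.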
.align 64
.Lconst:
.Lmask24:
.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long 16777216,0,16777216,0,16777216,0,16777216,0
.Lmask26:
.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long 2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
.L2_44_inp_permd:
.long 0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad 0,12,24,64
.L2_44_mask:
.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad 44,44,42,64
.L2_44_shift_lft:
.quad 8,8,10,64
.align 64
.Lx_mask44:
.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
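# CRYPTOGAMS attribution string: "Poly1305 for x86_64, CRYPTOGAMS by <appro@openssl.org>"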
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 16
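# xor128_encrypt_n_pad(out=%rdi, inp=%rsi, otp=%rdx, len=%rcx): XOR the
# input with the one-time pad, write the result to out, store the resulting
# ciphertext back into the otp buffer and zero-pad it to a 16-byte boundary;
# returns a pointer just past the padded data.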
.globl xor128_encrypt_n_pad
.type xor128_encrypt_n_pad,@function
.align 16
xor128_encrypt_n_pad:
.cfi_startproc
subq %rdx,%rsi
subq %rdx,%rdi
movq %rcx,%r10
shrq $4,%rcx
jz .Ltail_enc
nop
.Loop_enc_xmm:
movdqu (%rsi,%rdx,1),%xmm0
pxor (%rdx),%xmm0
movdqu %xmm0,(%rdi,%rdx,1)
movdqa %xmm0,(%rdx)
leaq 16(%rdx),%rdx
decq %rcx
jnz .Loop_enc_xmm
andq $15,%r10
jz .Ldone_enc
.Ltail_enc:
movq $16,%rcx
subq %r10,%rcx
xorl %eax,%eax
.Loop_enc_byte:
movb (%rsi,%rdx,1),%al
xorb (%rdx),%al
movb %al,(%rdi,%rdx,1)
movb %al,(%rdx)
leaq 1(%rdx),%rdx
decq %r10
jnz .Loop_enc_byte
xorl %eax,%eax
.Loop_enc_pad:
movb %al,(%rdx)
leaq 1(%rdx),%rdx
decq %rcx
jnz .Loop_enc_pad
.Ldone_enc:
movq %rdx,%rax
.byte 0xf3,0xc3
.cfi_endproc
.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
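# xor128_decrypt_n_pad(out=%rdi, inp=%rsi, otp=%rdx, len=%rcx): XOR the
# ciphertext with the pad to recover the plaintext in out, copy the
# ciphertext itself into the otp buffer and zero-pad it to a 16-byte
# boundary; returns a pointer just past the padded data.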
.globl xor128_decrypt_n_pad
.type xor128_decrypt_n_pad,@function
.align 16
xor128_decrypt_n_pad:
.cfi_startproc
subq %rdx,%rsi
subq %rdx,%rdi
movq %rcx,%r10
shrq $4,%rcx
jz .Ltail_dec
nop
.Loop_dec_xmm:
movdqu (%rsi,%rdx,1),%xmm0
movdqa (%rdx),%xmm1
pxor %xmm0,%xmm1
movdqu %xmm1,(%rdi,%rdx,1)
movdqa %xmm0,(%rdx)
leaq 16(%rdx),%rdx
decq %rcx
jnz .Loop_dec_xmm
pxor %xmm1,%xmm1
andq $15,%r10
jz .Ldone_dec
.Ltail_dec:
movq $16,%rcx
subq %r10,%rcx
xorl %eax,%eax
xorq %r11,%r11
.Loop_dec_byte:
movb (%rsi,%rdx,1),%r11b
movb (%rdx),%al
xorb %r11b,%al
movb %al,(%rdi,%rdx,1)
movb %r11b,(%rdx)
leaq 1(%rdx),%rdx
decq %r10
jnz .Loop_dec_byte
xorl %eax,%eax
.Loop_dec_pad:
movb %al,(%rdx)
leaq 1(%rdx),%rdx
decq %rcx
jnz .Loop_dec_pad
.Ldone_dec:
movq %rdx,%rax
.byte 0xf3,0xc3
.cfi_endproc
.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
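# GNU property note advertising Intel CET compatibility
# (GNU_PROPERTY_X86_FEATURE_1_AND: IBT and SHSTK).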
.section ".note.gnu.property", "a"
.p2align 3
.long 1f - 0f
.long 4f - 1f
.long 5 # NT_GNU_PROPERTY_TYPE_0
0:
# "GNU" encoded with .byte, since .asciz isn't supported
# on Solaris.
.byte 0x47
.byte 0x4e
.byte 0x55
.byte 0
1:
.p2align 3
.long 0xc0000002 # GNU_PROPERTY_X86_FEATURE_1_AND
.long 3f - 2f
2:
.long 3 # GNU_PROPERTY_X86_FEATURE_1_IBT | GNU_PROPERTY_X86_FEATURE_1_SHSTK
3:
.p2align 3
4: