#include "arm_asm.h"
#include "arm_arch.h"
#if __ARM_MAX_ARCH__>=8
.fpu neon
#ifdef __thumb2__
.syntax unified
.thumb
# define INST(a,b,c,d) c,0xef,a,b
#else
.code 32
# define INST(a,b,c,d) a,b,c,0xf2
#endif
.text
@-----------------------------------------------------------------------
@ aes_gcm_enc_128_kernel - AES-128-GCM encryption kernel (CTR + GHASH).
@ Register usage as established by the code below:
@   r0 = plaintext input pointer
@   r1 = input length in bits (function returns immediately when zero)
@   r2 = ciphertext output pointer
@   r3 = GHASH state: current tag at [r3]; hash-key powers loaded from
@        offsets #32 (h1), #64 (h2), #80 (h3), #112 (h4)
@   r4 = counter/IV block pointer (saved in r16; the 32-bit counter word
@        is stored back at [r16, #12] on exit)
@   r5 = AES-128 key schedule pointer (saved in r8; rk0..rk9 are loaded
@        sequentially, the final round key rk10 sits at offset #160)
@ Returns the byte length processed in r0 (saved in r15 at entry).
@ Callee-saved r19-r24 and d8-d15 are preserved in a 112-byte frame.
@ NOTE(review): this file mixes ARM32 directives (.code 32 / .thumb)
@ with AArch64-only registers and instructions (r16-r24, xzr, stp/ldp,
@ csel, pmull2, v-register SIMD). It looks like a mechanical x->r
@ rename of AArch64 source and cannot assemble as 32-bit ARM as
@ written - confirm the intended target before relying on it.
@-----------------------------------------------------------------------
.globl aes_gcm_enc_128_kernel
.type aes_gcm_enc_128_kernel,%function
.align 4
aes_gcm_enc_128_kernel:
cbz r1, .L128_enc_ret
stp r19, r20, [sp, #-112]!
mov r16, r4
mov r8, r5
stp r21, r22, [sp, #16]
stp r23, r24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32
#ifdef __ARMEB__
rev r10, r10
rev r11, r11
#endif
ldp r13, r14, [r8, #160] @ load rk10
#ifdef __ARMEB__
ror r13, r13, #32
ror r14, r14, #32
#endif
@ Load the current tag and put it in bit-reflected GHASH order.
ld1 {v11.16b}, [r3]
ext v11.16b, v11.16b, v11.16b, #8
rev64 v11.16b, v11.16b
lsr r5, r1, #3 @ byte_len
mov r15, r5
ld1 {v18.4s}, [r8], #16 @ load rk0
add r4, r0, r1, lsr #3 @ end_input_ptr
sub r5, r5, #1 @ byte_len - 1
lsr r12, r11, #32
ldr q15, [r3, #112] @ load h4l | h4h
#ifndef __ARMEB__
ext v15.16b, v15.16b, v15.16b, #8
#endif
fmov d1, r10 @ CTR block 1
rev r12, r12 @ rev_ctr32
add r12, r12, #1 @ increment rev_ctr32
@ NOTE(review): no-op as written; presumably was a 32-bit
@ zero-extension (orr w11, w11, w11) in the original - confirm.
orr r11, r11, r11
ld1 {v19.4s}, [r8], #16 @ load rk1
rev r9, r12 @ CTR block 1
add r12, r12, #1 @ CTR block 1
fmov d3, r10 @ CTR block 3
orr r9, r11, r9, lsl #32 @ CTR block 1
ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible
fmov v1.d[1], r9 @ CTR block 1
rev r9, r12 @ CTR block 2
fmov d2, r10 @ CTR block 2
orr r9, r11, r9, lsl #32 @ CTR block 2
add r12, r12, #1 @ CTR block 2
fmov v2.d[1], r9 @ CTR block 2
rev r9, r12 @ CTR block 3
orr r9, r11, r9, lsl #32 @ CTR block 3
ld1 {v20.4s}, [r8], #16 @ load rk2
add r12, r12, #1 @ CTR block 3
fmov v3.d[1], r9 @ CTR block 3
ldr q14, [r3, #80] @ load h3l | h3h
#ifndef __ARMEB__
ext v14.16b, v14.16b, v14.16b, #8
#endif
@ Rounds 0-9 for the first four counter blocks are interleaved with
@ the remaining round-key / hash-key loads to hide load latency.
aese q1, v18.16b
aesmc q1, q1 @ AES block 1 - round 0
ld1 {v21.4s}, [r8], #16 @ load rk3
aese q2, v18.16b
aesmc q2, q2 @ AES block 2 - round 0
ldr q12, [r3, #32] @ load h1l | h1h
#ifndef __ARMEB__
ext v12.16b, v12.16b, v12.16b, #8
#endif
aese q0, v18.16b
aesmc q0, q0 @ AES block 0 - round 0
ld1 {v22.4s}, [r8], #16 @ load rk4
aese q3, v18.16b
aesmc q3, q3 @ AES block 3 - round 0
ld1 {v23.4s}, [r8], #16 @ load rk5
aese q2, v19.16b
aesmc q2, q2 @ AES block 2 - round 1
trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l
aese q0, v19.16b
aesmc q0, q0 @ AES block 0 - round 1
ld1 {v24.4s}, [r8], #16 @ load rk6
aese q1, v19.16b
aesmc q1, q1 @ AES block 1 - round 1
ld1 {v25.4s}, [r8], #16 @ load rk7
aese q3, v19.16b
aesmc q3, q3 @ AES block 3 - round 1
trn1 q9, v14.2d, v15.2d @ h4h | h3h
aese q0, v20.16b
aesmc q0, q0 @ AES block 0 - round 2
ld1 {v26.4s}, [r8], #16 @ load rk8
aese q1, v20.16b
aesmc q1, q1 @ AES block 1 - round 2
ldr q13, [r3, #64] @ load h2l | h2h
#ifndef __ARMEB__
ext v13.16b, v13.16b, v13.16b, #8
#endif
aese q3, v20.16b
aesmc q3, q3 @ AES block 3 - round 2
aese q2, v20.16b
aesmc q2, q2 @ AES block 2 - round 2
eor v17.16b, v17.16b, q9 @ h4k | h3k
aese q0, v21.16b
aesmc q0, q0 @ AES block 0 - round 3
aese q1, v21.16b
aesmc q1, q1 @ AES block 1 - round 3
aese q2, v21.16b
aesmc q2, q2 @ AES block 2 - round 3
ld1 {v27.4s}, [r8], #16 @ load rk9
aese q3, v21.16b
aesmc q3, q3 @ AES block 3 - round 3
and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l
aese q3, v22.16b
aesmc q3, q3 @ AES block 3 - round 4
add r5, r5, r0
aese q2, v22.16b
aesmc q2, q2 @ AES block 2 - round 4
cmp r0, r5 @ check if we have <= 4 blocks
aese q0, v22.16b
aesmc q0, q0 @ AES block 0 - round 4
aese q3, v23.16b
aesmc q3, q3 @ AES block 3 - round 5
aese q2, v23.16b
aesmc q2, q2 @ AES block 2 - round 5
aese q0, v23.16b
aesmc q0, q0 @ AES block 0 - round 5
aese q3, v24.16b
aesmc q3, q3 @ AES block 3 - round 6
aese q1, v22.16b
aesmc q1, q1 @ AES block 1 - round 4
aese q2, v24.16b
aesmc q2, q2 @ AES block 2 - round 6
trn1 q8, v12.2d, v13.2d @ h2h | h1h
aese q0, v24.16b
aesmc q0, q0 @ AES block 0 - round 6
aese q1, v23.16b
aesmc q1, q1 @ AES block 1 - round 5
aese q3, v25.16b
aesmc q3, q3 @ AES block 3 - round 7
aese q0, v25.16b
aesmc q0, q0 @ AES block 0 - round 7
aese q1, v24.16b
aesmc q1, q1 @ AES block 1 - round 6
aese q2, v25.16b
aesmc q2, q2 @ AES block 2 - round 7
aese q0, v26.16b
aesmc q0, q0 @ AES block 0 - round 8
aese q1, v25.16b
aesmc q1, q1 @ AES block 1 - round 7
aese q2, v26.16b
aesmc q2, q2 @ AES block 2 - round 8
aese q3, v26.16b
aesmc q3, q3 @ AES block 3 - round 8
aese q1, v26.16b
aesmc q1, q1 @ AES block 1 - round 8
aese q2, v27.16b @ AES block 2 - round 9
aese q0, v27.16b @ AES block 0 - round 9
eor v16.16b, v16.16b, q8 @ h2k | h1k
aese q1, v27.16b @ AES block 1 - round 9
aese q3, v27.16b @ AES block 3 - round 9
bge .L128_enc_tail @ handle tail
@ First four full blocks: XOR plaintext with the final round key in
@ GPRs, then XOR with the AES-CTR keystream and store the ciphertext.
ldp r6, r7, [r0, #0] @ AES block 0 - load plaintext
#ifdef __ARMEB__
rev r6, r6
rev r7, r7
#endif
ldp r21, r22, [r0, #32] @ AES block 2 - load plaintext
#ifdef __ARMEB__
rev r21, r21
rev r22, r22
#endif
ldp r19, r20, [r0, #16] @ AES block 1 - load plaintext
#ifdef __ARMEB__
rev r19, r19
rev r20, r20
#endif
ldp r23, r24, [r0, #48] @ AES block 3 - load plaintext
#ifdef __ARMEB__
rev r23, r23
rev r24, r24
#endif
eor r6, r6, r13 @ AES block 0 - round 10 low
eor r7, r7, r14 @ AES block 0 - round 10 high
eor r21, r21, r13 @ AES block 2 - round 10 low
fmov d4, r6 @ AES block 0 - mov low
eor r19, r19, r13 @ AES block 1 - round 10 low
eor r22, r22, r14 @ AES block 2 - round 10 high
fmov v4.d[1], r7 @ AES block 0 - mov high
fmov d5, r19 @ AES block 1 - mov low
eor r20, r20, r14 @ AES block 1 - round 10 high
eor r23, r23, r13 @ AES block 3 - round 10 low
fmov v5.d[1], r20 @ AES block 1 - mov high
fmov d6, r21 @ AES block 2 - mov low
eor r24, r24, r14 @ AES block 3 - round 10 high
rev r9, r12 @ CTR block 4
fmov v6.d[1], r22 @ AES block 2 - mov high
orr r9, r11, r9, lsl #32 @ CTR block 4
eor q4, q4, q0 @ AES block 0 - result
fmov d0, r10 @ CTR block 4
add r12, r12, #1 @ CTR block 4
fmov v0.d[1], r9 @ CTR block 4
rev r9, r12 @ CTR block 5
eor q5, q5, q1 @ AES block 1 - result
fmov d1, r10 @ CTR block 5
orr r9, r11, r9, lsl #32 @ CTR block 5
add r12, r12, #1 @ CTR block 5
add r0, r0, #64 @ AES input_ptr update
fmov v1.d[1], r9 @ CTR block 5
fmov d7, r23 @ AES block 3 - mov low
rev r9, r12 @ CTR block 6
st1 { q4}, [r2], #16 @ AES block 0 - store result
fmov v7.d[1], r24 @ AES block 3 - mov high
orr r9, r11, r9, lsl #32 @ CTR block 6
add r12, r12, #1 @ CTR block 6
eor q6, q6, q2 @ AES block 2 - result
st1 { q5}, [r2], #16 @ AES block 1 - store result
fmov d2, r10 @ CTR block 6
cmp r0, r5 @ check if we have <= 8 blocks
fmov v2.d[1], r9 @ CTR block 6
rev r9, r12 @ CTR block 7
st1 { q6}, [r2], #16 @ AES block 2 - store result
orr r9, r11, r9, lsl #32 @ CTR block 7
eor q7, q7, q3 @ AES block 3 - result
st1 { q7}, [r2], #16 @ AES block 3 - store result
bge .L128_enc_prepretail @ do prepretail
@ Main loop: each iteration encrypts 4 counter blocks (AES rounds for
@ blocks 4k+4..4k+7 interleaved instruction-by-instruction) while
@ GHASH-ing the previous 4 ciphertext blocks held in q4-q7: Karatsuba
@ multiplies against the hash-key powers (v12-v15 low/high, v16/v17
@ "karatsuba" mid keys) accumulate into v11 (low), v10 (mid), q9
@ (high), then a two-step reduction by the GCM polynomial constant
@ (0xc2 shifted to the top byte) folds everything back into v11.
.L128_enc_main_loop:@ main loop start
ldp r23, r24, [r0, #48] @ AES block 4k+3 - load plaintext
#ifdef __ARMEB__
rev r23, r23
rev r24, r24
#endif
rev64 q4, q4 @ GHASH block 4k (only t0 is free)
rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free)
aese q2, v18.16b
aesmc q2, q2 @ AES block 4k+6 - round 0
fmov d3, r10 @ CTR block 4k+3
ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0
rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free)
aese q1, v18.16b
aesmc q1, q1 @ AES block 4k+5 - round 0
add r12, r12, #1 @ CTR block 4k+3
fmov v3.d[1], r9 @ CTR block 4k+3
aese q0, v18.16b
aesmc q0, q0 @ AES block 4k+4 - round 0
mov d31, v6.d[1] @ GHASH block 4k+2 - mid
aese q2, v19.16b
aesmc q2, q2 @ AES block 4k+6 - round 1
mov d30, v5.d[1] @ GHASH block 4k+1 - mid
aese q1, v19.16b
aesmc q1, q1 @ AES block 4k+5 - round 1
eor q4, q4, v11.16b @ PRE 1
aese q3, v18.16b
aesmc q3, q3 @ AES block 4k+7 - round 0
eor r24, r24, r14 @ AES block 4k+3 - round 10 high
pmull2 v28.1q, q5, v14.2d @ GHASH block 4k+1 - high
eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid
ldp r6, r7, [r0, #0] @ AES block 4k+4 - load plaintext
#ifdef __ARMEB__
rev r6, r6
rev r7, r7
#endif
aese q0, v19.16b
aesmc q0, q0 @ AES block 4k+4 - round 1
rev r9, r12 @ CTR block 4k+8
eor v30.8b, v30.8b, q5 @ GHASH block 4k+1 - mid
mov d8, v4.d[1] @ GHASH block 4k - mid
orr r9, r11, r9, lsl #32 @ CTR block 4k+8
pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high
add r12, r12, #1 @ CTR block 4k+8
mov d10, v17.d[1] @ GHASH block 4k - mid
aese q0, v20.16b
aesmc q0, q0 @ AES block 4k+4 - round 2
pmull v11.1q, q4, v15.1d @ GHASH block 4k - low
eor q8, q8, q4 @ GHASH block 4k - mid
aese q1, v20.16b
aesmc q1, q1 @ AES block 4k+5 - round 2
aese q0, v21.16b
aesmc q0, q0 @ AES block 4k+4 - round 3
eor q9, q9, v28.16b @ GHASH block 4k+1 - high
pmull v28.1q, q6, v13.1d @ GHASH block 4k+2 - low
pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid
rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
pmull v30.1q, v30.1d, v17.1d @ GHASH block 4k+1 - mid
pmull v29.1q, q5, v14.1d @ GHASH block 4k+1 - low
ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid
pmull2 v8.1q, q6, v13.2d @ GHASH block 4k+2 - high
eor r7, r7, r14 @ AES block 4k+4 - round 10 high
eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+1 - mid
mov d30, v7.d[1] @ GHASH block 4k+3 - mid
aese q3, v19.16b
aesmc q3, q3 @ AES block 4k+7 - round 1
eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+1 - low
aese q2, v20.16b
aesmc q2, q2 @ AES block 4k+6 - round 2
eor r6, r6, r13 @ AES block 4k+4 - round 10 low
aese q1, v21.16b
aesmc q1, q1 @ AES block 4k+5 - round 3
eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid
pmull2 v4.1q, q7, v12.2d @ GHASH block 4k+3 - high
aese q2, v21.16b
aesmc q2, q2 @ AES block 4k+6 - round 3
eor q9, q9, q8 @ GHASH block 4k+2 - high
pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid
pmull v29.1q, q7, v12.1d @ GHASH block 4k+3 - low
movi q8, #0xc2
pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid
eor v11.16b, v11.16b, v28.16b @ GHASH block 4k+2 - low
aese q1, v22.16b
aesmc q1, q1 @ AES block 4k+5 - round 4
aese q3, v20.16b
aesmc q3, q3 @ AES block 4k+7 - round 2
shl d8, d8, #56 @ mod_constant
aese q0, v22.16b
aesmc q0, q0 @ AES block 4k+4 - round 4
eor q9, q9, q4 @ GHASH block 4k+3 - high
aese q1, v23.16b
aesmc q1, q1 @ AES block 4k+5 - round 5
ldp r19, r20, [r0, #16] @ AES block 4k+5 - load plaintext
#ifdef __ARMEB__
rev r19, r19
rev r20, r20
#endif
aese q3, v21.16b
aesmc q3, q3 @ AES block 4k+7 - round 3
eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid
aese q0, v23.16b
aesmc q0, q0 @ AES block 4k+4 - round 5
ldp r21, r22, [r0, #32] @ AES block 4k+6 - load plaintext
#ifdef __ARMEB__
rev r21, r21
rev r22, r22
#endif
pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid
eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low
aese q2, v22.16b
aesmc q2, q2 @ AES block 4k+6 - round 4
eor r19, r19, r13 @ AES block 4k+5 - round 10 low
aese q3, v22.16b
aesmc q3, q3 @ AES block 4k+7 - round 4
eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid
aese q1, v24.16b
aesmc q1, q1 @ AES block 4k+5 - round 6
eor r23, r23, r13 @ AES block 4k+3 - round 10 low
aese q2, v23.16b
aesmc q2, q2 @ AES block 4k+6 - round 5
eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up
fmov d4, r6 @ AES block 4k+4 - mov low
aese q0, v24.16b
aesmc q0, q0 @ AES block 4k+4 - round 6
fmov v4.d[1], r7 @ AES block 4k+4 - mov high
add r0, r0, #64 @ AES input_ptr update
fmov d7, r23 @ AES block 4k+3 - mov low
ext q9, q9, q9, #8 @ MODULO - other top alignment
aese q3, v23.16b
aesmc q3, q3 @ AES block 4k+7 - round 5
fmov d5, r19 @ AES block 4k+5 - mov low
aese q0, v25.16b
aesmc q0, q0 @ AES block 4k+4 - round 7
eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up
aese q2, v24.16b
aesmc q2, q2 @ AES block 4k+6 - round 6
eor r20, r20, r14 @ AES block 4k+5 - round 10 high
aese q1, v25.16b
aesmc q1, q1 @ AES block 4k+5 - round 7
fmov v5.d[1], r20 @ AES block 4k+5 - mov high
aese q0, v26.16b
aesmc q0, q0 @ AES block 4k+4 - round 8
fmov v7.d[1], r24 @ AES block 4k+3 - mov high
aese q3, v24.16b
aesmc q3, q3 @ AES block 4k+7 - round 6
cmp r0, r5 @ .LOOP CONTROL
aese q1, v26.16b
aesmc q1, q1 @ AES block 4k+5 - round 8
eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid
aese q0, v27.16b @ AES block 4k+4 - round 9
eor r21, r21, r13 @ AES block 4k+6 - round 10 low
eor r22, r22, r14 @ AES block 4k+6 - round 10 high
aese q3, v25.16b
aesmc q3, q3 @ AES block 4k+7 - round 7
fmov d6, r21 @ AES block 4k+6 - mov low
aese q1, v27.16b @ AES block 4k+5 - round 9
fmov v6.d[1], r22 @ AES block 4k+6 - mov high
aese q2, v25.16b
aesmc q2, q2 @ AES block 4k+6 - round 7
eor q4, q4, q0 @ AES block 4k+4 - result
fmov d0, r10 @ CTR block 4k+8
aese q3, v26.16b
aesmc q3, q3 @ AES block 4k+7 - round 8
fmov v0.d[1], r9 @ CTR block 4k+8
rev r9, r12 @ CTR block 4k+9
eor v10.16b, v10.16b, q9 @ MODULO - fold into mid
aese q2, v26.16b
aesmc q2, q2 @ AES block 4k+6 - round 8
eor q5, q5, q1 @ AES block 4k+5 - result
add r12, r12, #1 @ CTR block 4k+9
orr r9, r11, r9, lsl #32 @ CTR block 4k+9
fmov d1, r10 @ CTR block 4k+9
pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low
fmov v1.d[1], r9 @ CTR block 4k+9
rev r9, r12 @ CTR block 4k+10
aese q2, v27.16b @ AES block 4k+6 - round 9
st1 { q4}, [r2], #16 @ AES block 4k+4 - store result
eor q6, q6, q2 @ AES block 4k+6 - result
orr r9, r11, r9, lsl #32 @ CTR block 4k+10
aese q3, v27.16b @ AES block 4k+7 - round 9
add r12, r12, #1 @ CTR block 4k+10
ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment
fmov d2, r10 @ CTR block 4k+10
eor v11.16b, v11.16b, q9 @ MODULO - fold into low
st1 { q5}, [r2], #16 @ AES block 4k+5 - store result
fmov v2.d[1], r9 @ CTR block 4k+10
st1 { q6}, [r2], #16 @ AES block 4k+6 - store result
rev r9, r12 @ CTR block 4k+11
orr r9, r11, r9, lsl #32 @ CTR block 4k+11
eor q7, q7, q3 @ AES block 4k+3 - result
eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low
st1 { q7}, [r2], #16 @ AES block 4k+3 - store result
blt .L128_enc_main_loop
@ PREPRETAIL: fold the last 4 full ciphertext blocks (q4-q7) into the
@ GHASH accumulator while running all 10 AES rounds on the next 4
@ counter blocks. No new input is loaded here - the tail section
@ consumes the prepared keystream in q0-q3. Same Karatsuba + modulo
@ reduction scheme as the main loop (low=v11, mid=v10, high=q9,
@ reduction constant 0xc2 << 56 in d8).
.L128_enc_prepretail:@ PREPRETAIL
rev64 q4, q4 @ GHASH block 4k (only t0 is free)
fmov d3, r10 @ CTR block 4k+3
rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free)
ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0
add r12, r12, #1 @ CTR block 4k+3
fmov v3.d[1], r9 @ CTR block 4k+3
aese q1, v18.16b
aesmc q1, q1 @ AES block 4k+5 - round 0
rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free)
pmull v29.1q, q5, v14.1d @ GHASH block 4k+1 - low
rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
eor q4, q4, v11.16b @ PRE 1
pmull2 v28.1q, q5, v14.2d @ GHASH block 4k+1 - high
aese q3, v18.16b
aesmc q3, q3 @ AES block 4k+7 - round 0
mov d30, v5.d[1] @ GHASH block 4k+1 - mid
pmull v11.1q, q4, v15.1d @ GHASH block 4k - low
mov d8, v4.d[1] @ GHASH block 4k - mid
mov d31, v6.d[1] @ GHASH block 4k+2 - mid
mov d10, v17.d[1] @ GHASH block 4k - mid
aese q1, v19.16b
aesmc q1, q1 @ AES block 4k+5 - round 1
eor v30.8b, v30.8b, q5 @ GHASH block 4k+1 - mid
eor q8, q8, q4 @ GHASH block 4k - mid
pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high
eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid
aese q3, v19.16b
aesmc q3, q3 @ AES block 4k+7 - round 1
pmull v30.1q, v30.1d, v17.1d @ GHASH block 4k+1 - mid
eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+1 - low
pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid
aese q0, v18.16b
aesmc q0, q0 @ AES block 4k+4 - round 0
ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid
aese q2, v18.16b
aesmc q2, q2 @ AES block 4k+6 - round 0
eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+1 - mid
mov d30, v7.d[1] @ GHASH block 4k+3 - mid
aese q0, v19.16b
aesmc q0, q0 @ AES block 4k+4 - round 1
eor q9, q9, v28.16b @ GHASH block 4k+1 - high
pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid
pmull2 v8.1q, q6, v13.2d @ GHASH block 4k+2 - high
eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid
pmull2 v4.1q, q7, v12.2d @ GHASH block 4k+3 - high
pmull v28.1q, q6, v13.1d @ GHASH block 4k+2 - low
aese q2, v19.16b
aesmc q2, q2 @ AES block 4k+6 - round 1
eor q9, q9, q8 @ GHASH block 4k+2 - high
aese q0, v20.16b
aesmc q0, q0 @ AES block 4k+4 - round 2
pmull v29.1q, q7, v12.1d @ GHASH block 4k+3 - low
movi q8, #0xc2
aese q2, v20.16b
aesmc q2, q2 @ AES block 4k+6 - round 2
eor v11.16b, v11.16b, v28.16b @ GHASH block 4k+2 - low
aese q3, v20.16b
aesmc q3, q3 @ AES block 4k+7 - round 2
pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid
eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid
aese q2, v21.16b
aesmc q2, q2 @ AES block 4k+6 - round 3
aese q1, v20.16b
aesmc q1, q1 @ AES block 4k+5 - round 2
eor q9, q9, q4 @ GHASH block 4k+3 - high
aese q0, v21.16b
aesmc q0, q0 @ AES block 4k+4 - round 3
eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid
shl d8, d8, #56 @ mod_constant
aese q1, v21.16b
aesmc q1, q1 @ AES block 4k+5 - round 3
eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low
aese q0, v22.16b
aesmc q0, q0 @ AES block 4k+4 - round 4
pmull v28.1q, q9, q8
eor v10.16b, v10.16b, q9 @ karatsuba tidy up
aese q1, v22.16b
aesmc q1, q1 @ AES block 4k+5 - round 4
aese q0, v23.16b
aesmc q0, q0 @ AES block 4k+4 - round 5
ext q9, q9, q9, #8
aese q3, v21.16b
aesmc q3, q3 @ AES block 4k+7 - round 3
aese q2, v22.16b
aesmc q2, q2 @ AES block 4k+6 - round 4
eor v10.16b, v10.16b, v11.16b
aese q0, v24.16b
aesmc q0, q0 @ AES block 4k+4 - round 6
aese q3, v22.16b
aesmc q3, q3 @ AES block 4k+7 - round 4
aese q1, v23.16b
aesmc q1, q1 @ AES block 4k+5 - round 5
aese q2, v23.16b
aesmc q2, q2 @ AES block 4k+6 - round 5
eor v10.16b, v10.16b, v28.16b
aese q3, v23.16b
aesmc q3, q3 @ AES block 4k+7 - round 5
aese q1, v24.16b
aesmc q1, q1 @ AES block 4k+5 - round 6
aese q2, v24.16b
aesmc q2, q2 @ AES block 4k+6 - round 6
aese q3, v24.16b
aesmc q3, q3 @ AES block 4k+7 - round 6
eor v10.16b, v10.16b, q9
aese q0, v25.16b
aesmc q0, q0 @ AES block 4k+4 - round 7
aese q2, v25.16b
aesmc q2, q2 @ AES block 4k+6 - round 7
aese q3, v25.16b
aesmc q3, q3 @ AES block 4k+7 - round 7
pmull v28.1q, v10.1d, q8
aese q1, v25.16b
aesmc q1, q1 @ AES block 4k+5 - round 7
ext v10.16b, v10.16b, v10.16b, #8
aese q3, v26.16b
aesmc q3, q3 @ AES block 4k+7 - round 8
aese q0, v26.16b
aesmc q0, q0 @ AES block 4k+4 - round 8
eor v11.16b, v11.16b, v28.16b
aese q1, v26.16b
aesmc q1, q1 @ AES block 4k+5 - round 8
aese q3, v27.16b @ AES block 4k+7 - round 9
aese q2, v26.16b
aesmc q2, q2 @ AES block 4k+6 - round 8
aese q0, v27.16b @ AES block 4k+4 - round 9
aese q1, v27.16b @ AES block 4k+5 - round 9
eor v11.16b, v11.16b, v10.16b
aese q2, v27.16b @ AES block 4k+6 - round 9
@ TAIL: encrypt the remaining 1-4 blocks using the keystream already in
@ q0-q3. r5 = bytes left; branches select how many full blocks precede
@ the final (possibly partial) block. Each full block is stored and
@ GHASH-ed; the final block is masked to its valid bits, merged with
@ the existing output bytes via bif, GHASH-ed after masking, then the
@ modulo reduction completes. On exit: updated counter stored at
@ [r16, #12], byte-reversed tag stored at [r3], callee-saved registers
@ restored, and r0 = processed byte length (from r15).
.L128_enc_tail:@ TAIL
sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process
ldp r6, r7, [r0], #16 @ AES block 4k+4 - load plaintext
#ifdef __ARMEB__
rev r6, r6
rev r7, r7
#endif
cmp r5, #48
ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag
eor r6, r6, r13 @ AES block 4k+4 - round 10 low
eor r7, r7, r14 @ AES block 4k+4 - round 10 high
fmov d4, r6 @ AES block 4k+4 - mov low
fmov v4.d[1], r7 @ AES block 4k+4 - mov high
eor q5, q4, q0 @ AES block 4k+4 - result
bgt .L128_enc_blocks_more_than_3
@ <= 3 blocks left: shift the unused keystream registers down and
@ rewind the counter for each block that will not be produced.
sub r12, r12, #1
movi v11.8b, #0
mov q3, q2
cmp r5, #32
mov q2, q1
movi q9, #0
movi v10.8b, #0
bgt .L128_enc_blocks_more_than_2
mov q3, q1
cmp r5, #16
sub r12, r12, #1
bgt .L128_enc_blocks_more_than_1
sub r12, r12, #1
b .L128_enc_blocks_less_than_1
.L128_enc_blocks_more_than_3:@ blocks left > 3
st1 { q5}, [r2], #16 @ AES final-3 block - store result
ldp r6, r7, [r0], #16 @ AES final-2 block - load input low & high
#ifdef __ARMEB__
rev r6, r6
rev r7, r7
#endif
rev64 q4, q5 @ GHASH final-3 block
eor q4, q4, q8 @ feed in partial tag
eor r7, r7, r14 @ AES final-2 block - round 10 high
eor r6, r6, r13 @ AES final-2 block - round 10 low
fmov d5, r6 @ AES final-2 block - mov low
movi q8, #0 @ suppress further partial tag feed in
fmov v5.d[1], r7 @ AES final-2 block - mov high
pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low
mov d22, v4.d[1] @ GHASH final-3 block - mid
pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high
mov d10, v17.d[1] @ GHASH final-3 block - mid
eor q5, q5, q1 @ AES final-2 block - result
eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid
pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid
.L128_enc_blocks_more_than_2:@ blocks left > 2
st1 { q5}, [r2], #16 @ AES final-2 block - store result
rev64 q4, q5 @ GHASH final-2 block
ldp r6, r7, [r0], #16 @ AES final-1 block - load input low & high
#ifdef __ARMEB__
rev r6, r6
rev r7, r7
#endif
eor q4, q4, q8 @ feed in partial tag
eor r6, r6, r13 @ AES final-1 block - round 10 low
fmov d5, r6 @ AES final-1 block - mov low
eor r7, r7, r14 @ AES final-1 block - round 10 high
pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high
fmov v5.d[1], r7 @ AES final-1 block - mov high
mov d22, v4.d[1] @ GHASH final-2 block - mid
pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low
eor q9, q9, v20.16b @ GHASH final-2 block - high
eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid
eor q5, q5, q2 @ AES final-1 block - result
eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low
pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid
movi q8, #0 @ suppress further partial tag feed in
eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid
.L128_enc_blocks_more_than_1:@ blocks left > 1
st1 { q5}, [r2], #16 @ AES final-1 block - store result
rev64 q4, q5 @ GHASH final-1 block
ldp r6, r7, [r0], #16 @ AES final block - load input low & high
#ifdef __ARMEB__
rev r6, r6
rev r7, r7
#endif
eor q4, q4, q8 @ feed in partial tag
eor r7, r7, r14 @ AES final block - round 10 high
eor r6, r6, r13 @ AES final block - round 10 low
fmov d5, r6 @ AES final block - mov low
pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high
fmov v5.d[1], r7 @ AES final block - mov high
mov d22, v4.d[1] @ GHASH final-1 block - mid
pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low
eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid
eor q5, q5, q3 @ AES final block - result
ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid
pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid
eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low
eor q9, q9, v20.16b @ GHASH final-1 block - high
eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid
movi q8, #0 @ suppress further partial tag feed in
.L128_enc_blocks_less_than_1:@ blocks left <= 1
@ Build a byte mask for the valid bits of the (possibly partial) last
@ block, mask the ciphertext, GHASH the masked block, and merge with
@ the pre-existing bytes at the output so only valid bytes change.
and r1, r1, #127 @ bit_length %= 128
mvn r13, xzr @ rk10_l = 0xffffffffffffffff
mvn r14, xzr @ rk10_h = 0xffffffffffffffff
sub r1, r1, #128 @ bit_length -= 128
neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128])
and r1, r1, #127 @ bit_length %= 128
lsr r14, r14, r1 @ rk10_h is mask for top 64b of last block
cmp r1, #64
csel r6, r13, r14, lt
csel r7, r14, xzr, lt
fmov d0, r6 @ ctr0b is mask for last block
fmov v0.d[1], r7
and q5, q5, q0 @ possibly partial last block has zeroes in highest bits
rev64 q4, q5 @ GHASH final block
eor q4, q4, q8 @ feed in partial tag
mov d8, v4.d[1] @ GHASH final block - mid
pmull v21.1q, q4, v12.1d @ GHASH final block - low
ld1 { v18.16b}, [r2] @ load existing bytes where the possibly partial last block is to be stored
eor q8, q8, q4 @ GHASH final block - mid
#ifndef __ARMEB__
rev r9, r12
#else
mov r9, r12
#endif
pmull2 v20.1q, q4, v12.2d @ GHASH final block - high
pmull v8.1q, q8, v16.1d @ GHASH final block - mid
eor v11.16b, v11.16b, v21.16b @ GHASH final block - low
eor q9, q9, v20.16b @ GHASH final block - high
eor v10.16b, v10.16b, q8 @ GHASH final block - mid
@ Final modulo reduction of the low/mid/high accumulators into v11.
movi q8, #0xc2
eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up
shl d8, d8, #56 @ mod_constant
eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up
pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid
ext q9, q9, q9, #8 @ MODULO - other top alignment
eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid
eor v10.16b, v10.16b, q9 @ MODULO - fold into mid
pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low
ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment
bif q5, v18.16b, q0 @ insert existing bytes in top end of result before storing
eor v11.16b, v11.16b, q9 @ MODULO - fold into low
st1 { q5}, [r2] @ store all 16B
str r9, [r16, #12] @ store the updated counter
eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low
@ Undo the bit-reflected GHASH ordering before storing the tag.
ext v11.16b, v11.16b, v11.16b, #8
rev64 v11.16b, v11.16b
mov r0, r15
st1 { v11.16b }, [r3]
ldp r21, r22, [sp, #16]
ldp r23, r24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp r19, r20, [sp], #112
RET
.L128_enc_ret:
mov r0, #0x0
RET
.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
.globl aes_gcm_dec_128_kernel
.type aes_gcm_dec_128_kernel,%function
.align 4
aes_gcm_dec_128_kernel:
cbz r1, .L128_dec_ret
stp r19, r20, [sp, #-112]!
mov r16, r4
mov r8, r5
stp r21, r22, [sp, #16]
stp r23, r24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
lsr r5, r1, #3 @ byte_len
mov r15, r5
ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32
#ifdef __ARMEB__
rev r10, r10
rev r11, r11
#endif
ldp r13, r14, [r8, #160] @ load rk10
#ifdef __ARMEB__
ror r14, r14, 32
ror r13, r13, 32
#endif
sub r5, r5, #1 @ byte_len - 1
ld1 {v18.4s}, [r8], #16 @ load rk0
and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible
ldr q13, [r3, #64] @ load h2l | h2h
#ifndef __ARMEB__
ext v13.16b, v13.16b, v13.16b, #8
#endif
lsr r12, r11, #32
fmov d2, r10 @ CTR block 2
ld1 {v19.4s}, [r8], #16 @ load rk1
orr r11, r11, r11
rev r12, r12 @ rev_ctr32
fmov d1, r10 @ CTR block 1
add r12, r12, #1 @ increment rev_ctr32
aese q0, v18.16b
aesmc q0, q0 @ AES block 0 - round 0
rev r9, r12 @ CTR block 1
orr r9, r11, r9, lsl #32 @ CTR block 1
ld1 {v20.4s}, [r8], #16 @ load rk2
add r12, r12, #1 @ CTR block 1
fmov v1.d[1], r9 @ CTR block 1
rev r9, r12 @ CTR block 2
add r12, r12, #1 @ CTR block 2
aese q0, v19.16b
aesmc q0, q0 @ AES block 0 - round 1
orr r9, r11, r9, lsl #32 @ CTR block 2
fmov v2.d[1], r9 @ CTR block 2
rev r9, r12 @ CTR block 3
fmov d3, r10 @ CTR block 3
orr r9, r11, r9, lsl #32 @ CTR block 3
add r12, r12, #1 @ CTR block 3
fmov v3.d[1], r9 @ CTR block 3
add r4, r0, r1, lsr #3 @ end_input_ptr
aese q1, v18.16b
aesmc q1, q1 @ AES block 1 - round 0
ld1 {v21.4s}, [r8], #16 @ load rk3
aese q0, v20.16b
aesmc q0, q0 @ AES block 0 - round 2
ld1 {v22.4s}, [r8], #16 @ load rk4
aese q2, v18.16b
aesmc q2, q2 @ AES block 2 - round 0
ld1 {v23.4s}, [r8], #16 @ load rk5
aese q1, v19.16b
aesmc q1, q1 @ AES block 1 - round 1
ld1 {v24.4s}, [r8], #16 @ load rk6
aese q3, v18.16b
aesmc q3, q3 @ AES block 3 - round 0
aese q2, v19.16b
aesmc q2, q2 @ AES block 2 - round 1
aese q1, v20.16b
aesmc q1, q1 @ AES block 1 - round 2
aese q3, v19.16b
aesmc q3, q3 @ AES block 3 - round 1
ld1 { v11.16b}, [r3]
ext v11.16b, v11.16b, v11.16b, #8
rev64 v11.16b, v11.16b
aese q0, v21.16b
aesmc q0, q0 @ AES block 0 - round 3
ld1 {v25.4s}, [r8], #16 @ load rk7
aese q1, v21.16b
aesmc q1, q1 @ AES block 1 - round 3
aese q3, v20.16b
aesmc q3, q3 @ AES block 3 - round 2
aese q2, v20.16b
aesmc q2, q2 @ AES block 2 - round 2
ld1 {v26.4s}, [r8], #16 @ load rk8
aese q1, v22.16b
aesmc q1, q1 @ AES block 1 - round 4
aese q3, v21.16b
aesmc q3, q3 @ AES block 3 - round 3
aese q2, v21.16b
aesmc q2, q2 @ AES block 2 - round 3
ldr q14, [r3, #80] @ load h3l | h3h
#ifndef __ARMEB__
ext v14.16b, v14.16b, v14.16b, #8
#endif
aese q0, v22.16b
aesmc q0, q0 @ AES block 0 - round 4
ld1 {v27.4s}, [r8], #16 @ load rk9
aese q1, v23.16b
aesmc q1, q1 @ AES block 1 - round 5
aese q2, v22.16b
aesmc q2, q2 @ AES block 2 - round 4
aese q3, v22.16b
aesmc q3, q3 @ AES block 3 - round 4
aese q0, v23.16b
aesmc q0, q0 @ AES block 0 - round 5
aese q2, v23.16b
aesmc q2, q2 @ AES block 2 - round 5
ldr q12, [r3, #32] @ load h1l | h1h
#ifndef __ARMEB__
ext v12.16b, v12.16b, v12.16b, #8
#endif
aese q3, v23.16b
aesmc q3, q3 @ AES block 3 - round 5
aese q0, v24.16b
aesmc q0, q0 @ AES block 0 - round 6
aese q1, v24.16b
aesmc q1, q1 @ AES block 1 - round 6
aese q3, v24.16b
aesmc q3, q3 @ AES block 3 - round 6
aese q2, v24.16b
aesmc q2, q2 @ AES block 2 - round 6
trn1 q8, v12.2d, v13.2d @ h2h | h1h
ldr q15, [r3, #112] @ load h4l | h4h
#ifndef __ARMEB__
ext v15.16b, v15.16b, v15.16b, #8
#endif
trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l
add r5, r5, r0
aese q1, v25.16b
aesmc q1, q1 @ AES block 1 - round 7
aese q2, v25.16b
aesmc q2, q2 @ AES block 2 - round 7
aese q0, v25.16b
aesmc q0, q0 @ AES block 0 - round 7
eor v16.16b, v16.16b, q8 @ h2k | h1k
aese q3, v25.16b
aesmc q3, q3 @ AES block 3 - round 7
aese q1, v26.16b
aesmc q1, q1 @ AES block 1 - round 8
trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l
aese q2, v26.16b
aesmc q2, q2 @ AES block 2 - round 8
aese q3, v26.16b
aesmc q3, q3 @ AES block 3 - round 8
aese q0, v26.16b
aesmc q0, q0 @ AES block 0 - round 8
trn1 q9, v14.2d, v15.2d @ h4h | h3h
aese q2, v27.16b @ AES block 2 - round 9
aese q3, v27.16b @ AES block 3 - round 9
aese q0, v27.16b @ AES block 0 - round 9
cmp r0, r5 @ check if we have <= 4 blocks
aese q1, v27.16b @ AES block 1 - round 9
eor v17.16b, v17.16b, q9 @ h4k | h3k
bge .L128_dec_tail @ handle tail
ld1 {q4, q5}, [r0], #32 @ AES block 0 - load ciphertext; AES block 1 - load ciphertext
eor q1, q5, q1 @ AES block 1 - result
ld1 {q6}, [r0], #16 @ AES block 2 - load ciphertext
eor q0, q4, q0 @ AES block 0 - result
rev64 q4, q4 @ GHASH block 0
rev r9, r12 @ CTR block 4
orr r9, r11, r9, lsl #32 @ CTR block 4
add r12, r12, #1 @ CTR block 4
ld1 {q7}, [r0], #16 @ AES block 3 - load ciphertext
rev64 q5, q5 @ GHASH block 1
mov r19, v1.d[0] @ AES block 1 - mov low
mov r20, v1.d[1] @ AES block 1 - mov high
mov r6, v0.d[0] @ AES block 0 - mov low
cmp r0, r5 @ check if we have <= 8 blocks
mov r7, v0.d[1] @ AES block 0 - mov high
fmov d0, r10 @ CTR block 4
fmov v0.d[1], r9 @ CTR block 4
rev r9, r12 @ CTR block 5
eor r19, r19, r13 @ AES block 1 - round 10 low
#ifdef __ARMEB__
rev r19, r19
#endif
fmov d1, r10 @ CTR block 5
add r12, r12, #1 @ CTR block 5
orr r9, r11, r9, lsl #32 @ CTR block 5
fmov v1.d[1], r9 @ CTR block 5
rev r9, r12 @ CTR block 6
add r12, r12, #1 @ CTR block 6
orr r9, r11, r9, lsl #32 @ CTR block 6
eor r20, r20, r14 @ AES block 1 - round 10 high
#ifdef __ARMEB__
rev r20, r20
#endif
eor r6, r6, r13 @ AES block 0 - round 10 low
#ifdef __ARMEB__
rev r6, r6
#endif
eor q2, q6, q2 @ AES block 2 - result
eor r7, r7, r14 @ AES block 0 - round 10 high
#ifdef __ARMEB__
rev r7, r7
#endif
stp r6, r7, [r2], #16 @ AES block 0 - store result
stp r19, r20, [r2], #16 @ AES block 1 - store result
bge .L128_dec_prepretail @ do prepretail
@ -----------------------------------------------------------------------
@ .L128_dec_main_loop: decrypt-side main loop, 4 blocks per iteration.
@ AES-128 CTR keystream generation for counter blocks 4k+4..4k+7
@ (rounds 0-9 via round keys v18-v27; the final round key is applied as
@ a 64-bit GP-register XOR using r13/r14) is software-pipelined with the
@ GHASH accumulation of the PREVIOUS four ciphertext blocks held in
@ q4-q7: per-block karatsuba products gather into v11 (low), v10 (mid),
@ q9/v9 (high), then a modulo reduction folds them with the 0xc2
@ polynomial constant.  New ciphertext is loaded into q4-q7 as the old
@ registers are consumed.  Loops while r0 (input ptr) < r5 (main end).
@ NOTE(review): instruction set is AArch64-style but registers use
@ r / mixed q-v spellings - looks like a mechanical rename of AArch64
@ source; confirm the target assembler accepts these forms.
@ -----------------------------------------------------------------------
.L128_dec_main_loop:@ main loop start
eor q3, q7, q3 @ AES block 4k+3 - result
ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0
mov r21, v2.d[0] @ AES block 4k+2 - mov low
pmull2 v28.1q, q5, v14.2d @ GHASH block 4k+1 - high
mov r22, v2.d[1] @ AES block 4k+2 - mov high
aese q1, v18.16b
aesmc q1, q1 @ AES block 4k+5 - round 0
fmov d2, r10 @ CTR block 4k+6
rev64 q6, q6 @ GHASH block 4k+2
fmov v2.d[1], r9 @ CTR block 4k+6
rev r9, r12 @ CTR block 4k+7
mov r23, v3.d[0] @ AES block 4k+3 - mov low
eor q4, q4, v11.16b @ PRE 1
mov d30, v5.d[1] @ GHASH block 4k+1 - mid
aese q1, v19.16b
aesmc q1, q1 @ AES block 4k+5 - round 1
rev64 q7, q7 @ GHASH block 4k+3
pmull v29.1q, q5, v14.1d @ GHASH block 4k+1 - low
mov r24, v3.d[1] @ AES block 4k+3 - mov high
orr r9, r11, r9, lsl #32 @ CTR block 4k+7
pmull v11.1q, q4, v15.1d @ GHASH block 4k - low
fmov d3, r10 @ CTR block 4k+7
eor v30.8b, v30.8b, q5 @ GHASH block 4k+1 - mid
aese q1, v20.16b
aesmc q1, q1 @ AES block 4k+5 - round 2
fmov v3.d[1], r9 @ CTR block 4k+7
aese q2, v18.16b
aesmc q2, q2 @ AES block 4k+6 - round 0
mov d10, v17.d[1] @ GHASH block 4k - mid
pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high
eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+1 - low
pmull v29.1q, q7, v12.1d @ GHASH block 4k+3 - low
aese q1, v21.16b
aesmc q1, q1 @ AES block 4k+5 - round 3
mov d8, v4.d[1] @ GHASH block 4k - mid
aese q3, v18.16b
aesmc q3, q3 @ AES block 4k+7 - round 0
eor q9, q9, v28.16b @ GHASH block 4k+1 - high
aese q0, v18.16b
aesmc q0, q0 @ AES block 4k+4 - round 0
pmull v28.1q, q6, v13.1d @ GHASH block 4k+2 - low
eor q8, q8, q4 @ GHASH block 4k - mid
aese q3, v19.16b
aesmc q3, q3 @ AES block 4k+7 - round 1
@ Apply final round key (held in r13/r14) to the plaintext halves that
@ were extracted from the previous iteration's cipher states.
eor r23, r23, r13 @ AES block 4k+3 - round 10 low
#ifdef __ARMEB__
rev r23, r23
#endif
pmull v30.1q, v30.1d, v17.1d @ GHASH block 4k+1 - mid
eor r22, r22, r14 @ AES block 4k+2 - round 10 high
#ifdef __ARMEB__
rev r22, r22
#endif
mov d31, v6.d[1] @ GHASH block 4k+2 - mid
aese q0, v19.16b
aesmc q0, q0 @ AES block 4k+4 - round 1
eor v11.16b, v11.16b, v28.16b @ GHASH block 4k+2 - low
pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid
aese q3, v20.16b
aesmc q3, q3 @ AES block 4k+7 - round 2
eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid
aese q0, v20.16b
aesmc q0, q0 @ AES block 4k+4 - round 2
aese q1, v22.16b
aesmc q1, q1 @ AES block 4k+5 - round 4
eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+1 - mid
pmull2 v8.1q, q6, v13.2d @ GHASH block 4k+2 - high
aese q0, v21.16b
aesmc q0, q0 @ AES block 4k+4 - round 3
ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid
pmull2 v4.1q, q7, v12.2d @ GHASH block 4k+3 - high
aese q2, v19.16b
aesmc q2, q2 @ AES block 4k+6 - round 1
mov d30, v7.d[1] @ GHASH block 4k+3 - mid
aese q0, v22.16b
aesmc q0, q0 @ AES block 4k+4 - round 4
eor q9, q9, q8 @ GHASH block 4k+2 - high
pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid
eor r24, r24, r14 @ AES block 4k+3 - round 10 high
#ifdef __ARMEB__
rev r24, r24
#endif
aese q2, v20.16b
aesmc q2, q2 @ AES block 4k+6 - round 2
eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid
aese q1, v23.16b
aesmc q1, q1 @ AES block 4k+5 - round 5
eor r21, r21, r13 @ AES block 4k+2 - round 10 low
#ifdef __ARMEB__
rev r21, r21
#endif
aese q0, v23.16b
aesmc q0, q0 @ AES block 4k+4 - round 5
@ 0xc2 bytes, shifted into the top lane below, form the GHASH
@ reduction polynomial constant for the modulo fold.
movi q8, #0xc2
aese q2, v21.16b
aesmc q2, q2 @ AES block 4k+6 - round 3
eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low
aese q1, v24.16b
aesmc q1, q1 @ AES block 4k+5 - round 6
aese q0, v24.16b
aesmc q0, q0 @ AES block 4k+4 - round 6
eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid
aese q2, v22.16b
aesmc q2, q2 @ AES block 4k+6 - round 4
stp r21, r22, [r2], #16 @ AES block 4k+2 - store result
pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid
eor q9, q9, q4 @ GHASH block 4k+3 - high
ld1 {q4}, [r0], #16 @ AES block 4k+3 - load ciphertext
aese q1, v25.16b
aesmc q1, q1 @ AES block 4k+5 - round 7
add r12, r12, #1 @ CTR block 4k+7
aese q0, v25.16b
aesmc q0, q0 @ AES block 4k+4 - round 7
shl d8, d8, #56 @ mod_constant
aese q2, v23.16b
aesmc q2, q2 @ AES block 4k+6 - round 5
eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid
aese q1, v26.16b
aesmc q1, q1 @ AES block 4k+5 - round 8
stp r23, r24, [r2], #16 @ AES block 4k+3 - store result
aese q0, v26.16b
aesmc q0, q0 @ AES block 4k+4 - round 8
@ Begin the modulo reduction: fold high into mid, then mid into low,
@ interleaved with the remaining AES rounds and ciphertext loads.
eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up
aese q3, v21.16b
aesmc q3, q3 @ AES block 4k+7 - round 3
rev r9, r12 @ CTR block 4k+8
pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid
ld1 {q5}, [r0], #16 @ AES block 4k+4 - load ciphertext
ext q9, q9, q9, #8 @ MODULO - other top alignment
aese q0, v27.16b @ AES block 4k+4 - round 9
orr r9, r11, r9, lsl #32 @ CTR block 4k+8
aese q3, v22.16b
aesmc q3, q3 @ AES block 4k+7 - round 4
eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up
aese q1, v27.16b @ AES block 4k+5 - round 9
aese q2, v24.16b
aesmc q2, q2 @ AES block 4k+6 - round 6
eor q0, q4, q0 @ AES block 4k+4 - result
aese q3, v23.16b
aesmc q3, q3 @ AES block 4k+7 - round 5
ld1 {q6}, [r0], #16 @ AES block 4k+5 - load ciphertext
add r12, r12, #1 @ CTR block 4k+8
eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid
eor q1, q5, q1 @ AES block 4k+5 - result
aese q2, v25.16b
aesmc q2, q2 @ AES block 4k+6 - round 7
ld1 {q7}, [r0], #16 @ AES block 4k+6 - load ciphertext
aese q3, v24.16b
aesmc q3, q3 @ AES block 4k+7 - round 6
rev64 q5, q5 @ GHASH block 4k+5
eor v10.16b, v10.16b, q9 @ MODULO - fold into mid
mov r7, v0.d[1] @ AES block 4k+4 - mov high
aese q2, v26.16b
aesmc q2, q2 @ AES block 4k+6 - round 8
mov r6, v0.d[0] @ AES block 4k+4 - mov low
aese q3, v25.16b
aesmc q3, q3 @ AES block 4k+7 - round 7
fmov d0, r10 @ CTR block 4k+8
pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low
fmov v0.d[1], r9 @ CTR block 4k+8
rev r9, r12 @ CTR block 4k+9
aese q2, v27.16b @ AES block 4k+6 - round 9
orr r9, r11, r9, lsl #32 @ CTR block 4k+9
ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment
aese q3, v26.16b
aesmc q3, q3 @ AES block 4k+7 - round 8
eor r7, r7, r14 @ AES block 4k+4 - round 10 high
#ifdef __ARMEB__
rev r7, r7
#endif
eor v11.16b, v11.16b, q8 @ MODULO - fold into low
mov r20, v1.d[1] @ AES block 4k+5 - mov high
eor r6, r6, r13 @ AES block 4k+4 - round 10 low
#ifdef __ARMEB__
rev r6, r6
#endif
eor q2, q6, q2 @ AES block 4k+6 - result
mov r19, v1.d[0] @ AES block 4k+5 - mov low
add r12, r12, #1 @ CTR block 4k+9
aese q3, v27.16b @ AES block 4k+7 - round 9
fmov d1, r10 @ CTR block 4k+9
cmp r0, r5 @ .LOOP CONTROL
rev64 q4, q4 @ GHASH block 4k+4
eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low
fmov v1.d[1], r9 @ CTR block 4k+9
rev r9, r12 @ CTR block 4k+10
add r12, r12, #1 @ CTR block 4k+10
eor r20, r20, r14 @ AES block 4k+5 - round 10 high
#ifdef __ARMEB__
rev r20, r20
#endif
stp r6, r7, [r2], #16 @ AES block 4k+4 - store result
eor r19, r19, r13 @ AES block 4k+5 - round 10 low
#ifdef __ARMEB__
rev r19, r19
#endif
stp r19, r20, [r2], #16 @ AES block 4k+5 - store result
orr r9, r11, r9, lsl #32 @ CTR block 4k+10
blt .L128_dec_main_loop
@ -----------------------------------------------------------------------
@ .L128_dec_prepretail: last full pass before the tail.  Performs the
@ same GHASH accumulation + modulo reduction over the four ciphertext
@ blocks already held in q4-q7 and finishes the AES rounds for counter
@ blocks 4k+4..4k+7 (leaving the keystream in q0-q3 for the tail), but
@ unlike the main loop it loads NO further ciphertext.  The last two
@ completed plaintext blocks (4k+2, 4k+3) are stored on the way out.
@ -----------------------------------------------------------------------
.L128_dec_prepretail:@ PREPRETAIL
ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0
mov r21, v2.d[0] @ AES block 4k+2 - mov low
mov d30, v5.d[1] @ GHASH block 4k+1 - mid
aese q0, v18.16b
aesmc q0, q0 @ AES block 4k+4 - round 0
eor q3, q7, q3 @ AES block 4k+3 - result
aese q1, v18.16b
aesmc q1, q1 @ AES block 4k+5 - round 0
mov r22, v2.d[1] @ AES block 4k+2 - mov high
eor q4, q4, v11.16b @ PRE 1
fmov d2, r10 @ CTR block 4k+6
rev64 q6, q6 @ GHASH block 4k+2
aese q0, v19.16b
aesmc q0, q0 @ AES block 4k+4 - round 1
fmov v2.d[1], r9 @ CTR block 4k+6
rev r9, r12 @ CTR block 4k+7
mov r23, v3.d[0] @ AES block 4k+3 - mov low
eor v30.8b, v30.8b, q5 @ GHASH block 4k+1 - mid
pmull v11.1q, q4, v15.1d @ GHASH block 4k - low
mov d10, v17.d[1] @ GHASH block 4k - mid
mov r24, v3.d[1] @ AES block 4k+3 - mov high
aese q1, v19.16b
aesmc q1, q1 @ AES block 4k+5 - round 1
mov d31, v6.d[1] @ GHASH block 4k+2 - mid
aese q0, v20.16b
aesmc q0, q0 @ AES block 4k+4 - round 2
orr r9, r11, r9, lsl #32 @ CTR block 4k+7
pmull v29.1q, q5, v14.1d @ GHASH block 4k+1 - low
mov d8, v4.d[1] @ GHASH block 4k - mid
fmov d3, r10 @ CTR block 4k+7
aese q2, v18.16b
aesmc q2, q2 @ AES block 4k+6 - round 0
fmov v3.d[1], r9 @ CTR block 4k+7
pmull v30.1q, v30.1d, v17.1d @ GHASH block 4k+1 - mid
eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid
rev64 q7, q7 @ GHASH block 4k+3
aese q2, v19.16b
aesmc q2, q2 @ AES block 4k+6 - round 1
eor q8, q8, q4 @ GHASH block 4k - mid
pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high
aese q3, v18.16b
aesmc q3, q3 @ AES block 4k+7 - round 0
ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid
pmull2 v28.1q, q5, v14.2d @ GHASH block 4k+1 - high
pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid
eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+1 - low
pmull v29.1q, q7, v12.1d @ GHASH block 4k+3 - low
pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid
eor q9, q9, v28.16b @ GHASH block 4k+1 - high
eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+1 - mid
pmull2 v4.1q, q7, v12.2d @ GHASH block 4k+3 - high
pmull2 v8.1q, q6, v13.2d @ GHASH block 4k+2 - high
mov d30, v7.d[1] @ GHASH block 4k+3 - mid
aese q1, v20.16b
aesmc q1, q1 @ AES block 4k+5 - round 2
eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid
pmull v28.1q, q6, v13.1d @ GHASH block 4k+2 - low
eor q9, q9, q8 @ GHASH block 4k+2 - high
@ Load the 0xc2 reduction constant for the final modulo fold.
movi q8, #0xc2
aese q3, v19.16b
aesmc q3, q3 @ AES block 4k+7 - round 1
eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid
eor v11.16b, v11.16b, v28.16b @ GHASH block 4k+2 - low
aese q2, v20.16b
aesmc q2, q2 @ AES block 4k+6 - round 2
eor q9, q9, q4 @ GHASH block 4k+3 - high
aese q3, v20.16b
aesmc q3, q3 @ AES block 4k+7 - round 2
eor r23, r23, r13 @ AES block 4k+3 - round 10 low
#ifdef __ARMEB__
rev r23, r23
#endif
pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid
eor r21, r21, r13 @ AES block 4k+2 - round 10 low
#ifdef __ARMEB__
rev r21, r21
#endif
eor v11.16b, v11.16b, v29.16b @ GHASH block 4k+3 - low
aese q2, v21.16b
aesmc q2, q2 @ AES block 4k+6 - round 3
aese q1, v21.16b
aesmc q1, q1 @ AES block 4k+5 - round 3
shl d8, d8, #56 @ mod_constant
aese q0, v21.16b
aesmc q0, q0 @ AES block 4k+4 - round 3
aese q2, v22.16b
aesmc q2, q2 @ AES block 4k+6 - round 4
eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid
aese q1, v22.16b
aesmc q1, q1 @ AES block 4k+5 - round 4
aese q3, v21.16b
aesmc q3, q3 @ AES block 4k+7 - round 3
eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up
aese q2, v23.16b
aesmc q2, q2 @ AES block 4k+6 - round 5
aese q1, v23.16b
aesmc q1, q1 @ AES block 4k+5 - round 5
aese q3, v22.16b
aesmc q3, q3 @ AES block 4k+7 - round 4
aese q0, v22.16b
aesmc q0, q0 @ AES block 4k+4 - round 4
eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up
pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid
aese q1, v24.16b
aesmc q1, q1 @ AES block 4k+5 - round 6
ext q9, q9, q9, #8 @ MODULO - other top alignment
aese q3, v23.16b
aesmc q3, q3 @ AES block 4k+7 - round 5
aese q0, v23.16b
aesmc q0, q0 @ AES block 4k+4 - round 5
eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid
aese q1, v25.16b
aesmc q1, q1 @ AES block 4k+5 - round 7
aese q2, v24.16b
aesmc q2, q2 @ AES block 4k+6 - round 6
aese q0, v24.16b
aesmc q0, q0 @ AES block 4k+4 - round 6
aese q1, v26.16b
aesmc q1, q1 @ AES block 4k+5 - round 8
eor v10.16b, v10.16b, q9 @ MODULO - fold into mid
aese q3, v24.16b
aesmc q3, q3 @ AES block 4k+7 - round 6
aese q0, v25.16b
aesmc q0, q0 @ AES block 4k+4 - round 7
aese q1, v27.16b @ AES block 4k+5 - round 9
pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low
eor r24, r24, r14 @ AES block 4k+3 - round 10 high
#ifdef __ARMEB__
rev r24, r24
#endif
aese q2, v25.16b
aesmc q2, q2 @ AES block 4k+6 - round 7
ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment
aese q3, v25.16b
aesmc q3, q3 @ AES block 4k+7 - round 7
aese q0, v26.16b
aesmc q0, q0 @ AES block 4k+4 - round 8
eor v11.16b, v11.16b, q8 @ MODULO - fold into low
aese q2, v26.16b
aesmc q2, q2 @ AES block 4k+6 - round 8
aese q3, v26.16b
aesmc q3, q3 @ AES block 4k+7 - round 8
eor r22, r22, r14 @ AES block 4k+2 - round 10 high
#ifdef __ARMEB__
rev r22, r22
#endif
aese q0, v27.16b @ AES block 4k+4 - round 9
stp r21, r22, [r2], #16 @ AES block 4k+2 - store result
aese q2, v27.16b @ AES block 4k+6 - round 9
add r12, r12, #1 @ CTR block 4k+7
stp r23, r24, [r2], #16 @ AES block 4k+3 - store result
aese q3, v27.16b @ AES block 4k+7 - round 9
eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low
@ -----------------------------------------------------------------------
@ .L128_dec_tail: 1-4 remaining blocks (last may be partial).  r5 is
@ recomputed as bytes left (r4 = end ptr, r0 = current input ptr).  The
@ first remaining block is decrypted with keystream q0; the current
@ GHASH accumulator v11 is rotated into q8 as the "partial tag" to feed
@ into the first tail GHASH block.  Dispatch on r5: >48 -> 4 blocks,
@ >32 -> 3, >16 -> 2, else 1.  The shorter paths shuffle the unused
@ keystream registers (q1/q2 -> q2/q3) and zero the GHASH accumulators
@ v9/v10/v11 so the common per-block code below works for all counts;
@ r12 (counter) is decremented for each keystream block not consumed.
@ -----------------------------------------------------------------------
.L128_dec_tail:@ TAIL
sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process
ld1 { q5}, [r0], #16 @ AES block 4k+4 - load ciphertext
eor q0, q5, q0 @ AES block 4k+4 - result
mov r7, v0.d[1] @ AES block 4k+4 - mov high
mov r6, v0.d[0] @ AES block 4k+4 - mov low
cmp r5, #48
eor r7, r7, r14 @ AES block 4k+4 - round 10 high
#ifdef __ARMEB__
rev r7, r7
#endif
ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag
eor r6, r6, r13 @ AES block 4k+4 - round 10 low
#ifdef __ARMEB__
rev r6, r6
#endif
bgt .L128_dec_blocks_more_than_3
mov q3, q2
sub r12, r12, #1
movi v11.8b, #0
movi q9, #0
mov q2, q1
movi v10.8b, #0
cmp r5, #32
bgt .L128_dec_blocks_more_than_2
cmp r5, #16
mov q3, q1
sub r12, r12, #1
bgt .L128_dec_blocks_more_than_1
sub r12, r12, #1
b .L128_dec_blocks_less_than_1
@ -----------------------------------------------------------------------
@ .L128_dec_blocks_more_than_3: handle the final-3 block.  GHASH the
@ just-decrypted ciphertext (still in q5, byte-reversed into q4, with
@ the partial tag q8 folded in exactly once) against h4 (v15), store the
@ finished plaintext, and decrypt the next ciphertext block with
@ keystream q1.  q8 is then zeroed so later tail blocks do not re-feed
@ the partial tag.  Falls through to the 2-blocks-left path.
@ -----------------------------------------------------------------------
.L128_dec_blocks_more_than_3:@ blocks left > 3
rev64 q4, q5 @ GHASH final-3 block
ld1 { q5}, [r0], #16 @ AES final-2 block - load ciphertext
eor q4, q4, q8 @ feed in partial tag
mov d10, v17.d[1] @ GHASH final-3 block - mid
stp r6, r7, [r2], #16 @ AES final-3 block - store result
eor q0, q5, q1 @ AES final-2 block - result
mov d22, v4.d[1] @ GHASH final-3 block - mid
mov r7, v0.d[1] @ AES final-2 block - mov high
pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low
mov r6, v0.d[0] @ AES final-2 block - mov low
pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high
eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid
movi q8, #0 @ suppress further partial tag feed in
eor r7, r7, r14 @ AES final-2 block - round 10 high
#ifdef __ARMEB__
rev r7, r7
#endif
pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid
eor r6, r6, r13 @ AES final-2 block - round 10 low
#ifdef __ARMEB__
rev r6, r6
#endif
@ -----------------------------------------------------------------------
@ .L128_dec_blocks_more_than_2: handle the final-2 block.  GHASH the
@ current block against h3 (v14) and XOR the products into the v11/v9/
@ v10 accumulators, store the finished plaintext, and decrypt the next
@ ciphertext block with keystream q2.  Falls through to the
@ 1-block-left path.
@ -----------------------------------------------------------------------
.L128_dec_blocks_more_than_2:@ blocks left > 2
rev64 q4, q5 @ GHASH final-2 block
ld1 { q5}, [r0], #16 @ AES final-1 block - load ciphertext
eor q4, q4, q8 @ feed in partial tag
eor q0, q5, q2 @ AES final-1 block - result
stp r6, r7, [r2], #16 @ AES final-2 block - store result
mov d22, v4.d[1] @ GHASH final-2 block - mid
pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low
pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high
mov r6, v0.d[0] @ AES final-1 block - mov low
mov r7, v0.d[1] @ AES final-1 block - mov high
eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid
movi q8, #0 @ suppress further partial tag feed in
pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid
eor r6, r6, r13 @ AES final-1 block - round 10 low
#ifdef __ARMEB__
rev r6, r6
#endif
eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low
eor q9, q9, v20.16b @ GHASH final-2 block - high
eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid
eor r7, r7, r14 @ AES final-1 block - round 10 high
#ifdef __ARMEB__
rev r7, r7
#endif
@ -----------------------------------------------------------------------
@ .L128_dec_blocks_more_than_1: handle the final-1 block.  GHASH the
@ current block against h2 (v13), accumulate into v11/v9/v10, store the
@ finished plaintext, and decrypt the last (possibly partial)
@ ciphertext block with keystream q3.  Falls through to the
@ partial-block finisher.
@ -----------------------------------------------------------------------
.L128_dec_blocks_more_than_1:@ blocks left > 1
rev64 q4, q5 @ GHASH final-1 block
ld1 { q5}, [r0], #16 @ AES final block - load ciphertext
eor q4, q4, q8 @ feed in partial tag
mov d22, v4.d[1] @ GHASH final-1 block - mid
eor q0, q5, q3 @ AES final block - result
eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid
stp r6, r7, [r2], #16 @ AES final-1 block - store result
mov r6, v0.d[0] @ AES final block - mov low
mov r7, v0.d[1] @ AES final block - mov high
ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid
pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low
pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high
pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid
movi q8, #0 @ suppress further partial tag feed in
eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low
eor q9, q9, v20.16b @ GHASH final-1 block - high
eor r7, r7, r14 @ AES final block - round 10 high
#ifdef __ARMEB__
rev r7, r7
#endif
eor r6, r6, r13 @ AES final block - round 10 low
#ifdef __ARMEB__
rev r6, r6
#endif
eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid
@ -----------------------------------------------------------------------
@ .L128_dec_blocks_less_than_1: finish the last, possibly partial,
@ block.  Builds a byte mask in r9/r10 from bit_length % 128 and masks
@ the ciphertext (q5) so GHASH only sees valid bytes; bytes of the
@ output beyond the message end are preserved by merging with the
@ existing destination contents (ldp/bic/orr).  GHASH the masked block
@ against h1 (v12), run the final karatsuba + 0xc2 modulo reduction,
@ store the updated 32-bit counter back into the IV block at [r16,#12],
@ byte-swap the tag back to storage order into [r3], restore callee-
@ saved registers and return (r0 = processed byte count saved in r15).
@ -----------------------------------------------------------------------
.L128_dec_blocks_less_than_1:@ blocks left <= 1
mvn r14, xzr @ rk10_h = 0xffffffffffffffff
and r1, r1, #127 @ bit_length %= 128
mvn r13, xzr @ rk10_l = 0xffffffffffffffff
sub r1, r1, #128 @ bit_length -= 128
neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128])
and r1, r1, #127 @ bit_length %= 128
lsr r14, r14, r1 @ rk10_h is mask for top 64b of last block
cmp r1, #64
csel r10, r14, xzr, lt
csel r9, r13, r14, lt
fmov d0, r9 @ ctr0b is mask for last block
mov v0.d[1], r10
and q5, q5, q0 @ possibly partial last block has zeroes in highest bits
rev64 q4, q5 @ GHASH final block
eor q4, q4, q8 @ feed in partial tag
ldp r4, r5, [r2] @ load existing bytes we need to not overwrite
and r7, r7, r10
pmull2 v20.1q, q4, v12.2d @ GHASH final block - high
mov d8, v4.d[1] @ GHASH final block - mid
eor q8, q8, q4 @ GHASH final block - mid
eor q9, q9, v20.16b @ GHASH final block - high
pmull v8.1q, q8, v16.1d @ GHASH final block - mid
pmull v21.1q, q4, v12.1d @ GHASH final block - low
bic r4, r4, r9 @ mask out low existing bytes
and r6, r6, r9
#ifndef __ARMEB__
rev r9, r12
#else
mov r9, r12
#endif
eor v10.16b, v10.16b, q8 @ GHASH final block - mid
movi q8, #0xc2
eor v11.16b, v11.16b, v21.16b @ GHASH final block - low
bic r5, r5, r10 @ mask out high existing bytes
shl d8, d8, #56 @ mod_constant
eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up
pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid
eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up
orr r6, r6, r4
str r9, [r16, #12] @ store the updated counter
orr r7, r7, r5
stp r6, r7, [r2]
ext q9, q9, q9, #8 @ MODULO - other top alignment
eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid
eor v10.16b, v10.16b, q9 @ MODULO - fold into mid
pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low
ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment
eor v11.16b, v11.16b, q8 @ MODULO - fold into low
eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low
@ Convert the GHASH accumulator back to stored-tag byte order.
ext v11.16b, v11.16b, v11.16b, #8
rev64 v11.16b, v11.16b
mov r0, r15
st1 { v11.16b }, [r3]
@ Restore callee-saved GP and FP/SIMD registers saved in the prologue.
ldp r21, r22, [sp, #16]
ldp r23, r24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp r19, r20, [sp], #112
RET
.L128_dec_ret:
mov r0, #0x0
RET
.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
@ -----------------------------------------------------------------------
@ aes_gcm_enc_192_kernel: AES-192-GCM encryption kernel entry.
@ In (by register use below): r0 = input ptr, r1 = bit length, r2 =
@ output ptr, r3 = GHASH state/H-table base, r4 -> r16 = IV/counter
@ block, r5 -> r8 = round-key schedule.  Returns 0 early if r1 == 0.
@ Prologue: saves callee-saved r19-r24 and d8-d15 (112-byte frame),
@ loads the counter (r10/r11), the 12 round keys (rk0-rk11 in v18-v29,
@ final rk12 kept in r13/r14 for GP-register application), the hash
@ powers h1..h4 (v12-v15, byte-order corrected on little-endian), and
@ prepares counter blocks 0-3 in q0-q3, running them through AES rounds
@ 0-11 interleaved with the loads.  Also packs h-values into trn1/trn2
@ pairs (q8/q9) and derives the karatsuba "k" constants v16/v17.
@ r5 becomes the main-loop end pointer (64-byte groups, at least one
@ byte left for the tail); branches to .L192_enc_tail when <= 4 blocks.
@ NOTE(review): AArch64-style instructions with renamed r/q registers,
@ as elsewhere in this file - confirm assembler compatibility.
@ -----------------------------------------------------------------------
.globl aes_gcm_enc_192_kernel
.type aes_gcm_enc_192_kernel,%function
.align 4
aes_gcm_enc_192_kernel:
cbz r1, .L192_enc_ret
stp r19, r20, [sp, #-112]!
mov r16, r4
mov r8, r5
stp r21, r22, [sp, #16]
stp r23, r24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32
#ifdef __ARMEB__
rev r10, r10
rev r11, r11
#endif
ldp r13, r14, [r8, #192] @ load rk12
#ifdef __ARMEB__
ror r13, r13, #32
ror r14, r14, #32
#endif
ld1 {v18.4s}, [r8], #16 @ load rk0
ld1 {v19.4s}, [r8], #16 @ load rk1
ld1 {v20.4s}, [r8], #16 @ load rk2
lsr r12, r11, #32
ld1 {v21.4s}, [r8], #16 @ load rk3
orr r11, r11, r11
ld1 {v22.4s}, [r8], #16 @ load rk4
rev r12, r12 @ rev_ctr32
add r12, r12, #1 @ increment rev_ctr32
fmov d3, r10 @ CTR block 3
rev r9, r12 @ CTR block 1
add r12, r12, #1 @ CTR block 1
fmov d1, r10 @ CTR block 1
orr r9, r11, r9, lsl #32 @ CTR block 1
ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible
fmov v1.d[1], r9 @ CTR block 1
rev r9, r12 @ CTR block 2
add r12, r12, #1 @ CTR block 2
fmov d2, r10 @ CTR block 2
orr r9, r11, r9, lsl #32 @ CTR block 2
fmov v2.d[1], r9 @ CTR block 2
rev r9, r12 @ CTR block 3
orr r9, r11, r9, lsl #32 @ CTR block 3
ld1 {v23.4s}, [r8], #16 @ load rk5
fmov v3.d[1], r9 @ CTR block 3
ld1 {v24.4s}, [r8], #16 @ load rk6
ld1 {v25.4s}, [r8], #16 @ load rk7
aese q0, v18.16b
aesmc q0, q0 @ AES block 0 - round 0
@ Load current GHASH accumulator and rotate it to computation order.
ld1 { v11.16b}, [r3]
ext v11.16b, v11.16b, v11.16b, #8
rev64 v11.16b, v11.16b
aese q3, v18.16b
aesmc q3, q3 @ AES block 3 - round 0
ld1 {v26.4s}, [r8], #16 @ load rk8
aese q1, v18.16b
aesmc q1, q1 @ AES block 1 - round 0
ldr q15, [r3, #112] @ load h4l | h4h
#ifndef __ARMEB__
ext v15.16b, v15.16b, v15.16b, #8
#endif
aese q2, v18.16b
aesmc q2, q2 @ AES block 2 - round 0
ld1 {v27.4s}, [r8], #16 @ load rk9
aese q0, v19.16b
aesmc q0, q0 @ AES block 0 - round 1
ld1 {v28.4s}, [r8], #16 @ load rk10
aese q1, v19.16b
aesmc q1, q1 @ AES block 1 - round 1
ldr q12, [r3, #32] @ load h1l | h1h
#ifndef __ARMEB__
ext v12.16b, v12.16b, v12.16b, #8
#endif
aese q2, v19.16b
aesmc q2, q2 @ AES block 2 - round 1
ld1 {v29.4s}, [r8], #16 @ load rk11
aese q3, v19.16b
aesmc q3, q3 @ AES block 3 - round 1
ldr q14, [r3, #80] @ load h3l | h3h
#ifndef __ARMEB__
ext v14.16b, v14.16b, v14.16b, #8
#endif
aese q0, v20.16b
aesmc q0, q0 @ AES block 0 - round 2
aese q2, v20.16b
aesmc q2, q2 @ AES block 2 - round 2
aese q3, v20.16b
aesmc q3, q3 @ AES block 3 - round 2
aese q0, v21.16b
aesmc q0, q0 @ AES block 0 - round 3
trn1 q9, v14.2d, v15.2d @ h4h | h3h
aese q2, v21.16b
aesmc q2, q2 @ AES block 2 - round 3
aese q1, v20.16b
aesmc q1, q1 @ AES block 1 - round 2
trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l
aese q0, v22.16b
aesmc q0, q0 @ AES block 0 - round 4
aese q3, v21.16b
aesmc q3, q3 @ AES block 3 - round 3
aese q1, v21.16b
aesmc q1, q1 @ AES block 1 - round 3
aese q0, v23.16b
aesmc q0, q0 @ AES block 0 - round 5
aese q2, v22.16b
aesmc q2, q2 @ AES block 2 - round 4
aese q1, v22.16b
aesmc q1, q1 @ AES block 1 - round 4
aese q0, v24.16b
aesmc q0, q0 @ AES block 0 - round 6
aese q3, v22.16b
aesmc q3, q3 @ AES block 3 - round 4
aese q2, v23.16b
aesmc q2, q2 @ AES block 2 - round 5
aese q1, v23.16b
aesmc q1, q1 @ AES block 1 - round 5
aese q3, v23.16b
aesmc q3, q3 @ AES block 3 - round 5
aese q2, v24.16b
aesmc q2, q2 @ AES block 2 - round 6
ldr q13, [r3, #64] @ load h2l | h2h
#ifndef __ARMEB__
ext v13.16b, v13.16b, v13.16b, #8
#endif
aese q1, v24.16b
aesmc q1, q1 @ AES block 1 - round 6
aese q3, v24.16b
aesmc q3, q3 @ AES block 3 - round 6
aese q0, v25.16b
aesmc q0, q0 @ AES block 0 - round 7
aese q1, v25.16b
aesmc q1, q1 @ AES block 1 - round 7
trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l
aese q3, v25.16b
aesmc q3, q3 @ AES block 3 - round 7
aese q0, v26.16b
aesmc q0, q0 @ AES block 0 - round 8
aese q2, v25.16b
aesmc q2, q2 @ AES block 2 - round 7
trn1 q8, v12.2d, v13.2d @ h2h | h1h
aese q1, v26.16b
aesmc q1, q1 @ AES block 1 - round 8
aese q3, v26.16b
aesmc q3, q3 @ AES block 3 - round 8
aese q2, v26.16b
aesmc q2, q2 @ AES block 2 - round 8
aese q0, v27.16b
aesmc q0, q0 @ AES block 0 - round 9
aese q3, v27.16b
aesmc q3, q3 @ AES block 3 - round 9
aese q2, v27.16b
aesmc q2, q2 @ AES block 2 - round 9
aese q1, v27.16b
aesmc q1, q1 @ AES block 1 - round 9
aese q0, v28.16b
aesmc q0, q0 @ AES block 0 - round 10
aese q2, v28.16b
aesmc q2, q2 @ AES block 2 - round 10
aese q1, v28.16b
aesmc q1, q1 @ AES block 1 - round 10
lsr r5, r1, #3 @ byte_len
mov r15, r5
aese q3, v28.16b
aesmc q3, q3 @ AES block 3 - round 10
sub r5, r5, #1 @ byte_len - 1
@ Derive the karatsuba mid-term constants from the packed h-values.
eor v16.16b, v16.16b, q8 @ h2k | h1k
and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
eor v17.16b, v17.16b, q9 @ h4k | h3k
aese q2, v29.16b @ AES block 2 - round 11
add r4, r0, r1, lsr #3 @ end_input_ptr
add r5, r5, r0
aese q1, v29.16b @ AES block 1 - round 11
cmp r0, r5 @ check if we have <= 4 blocks
aese q0, v29.16b @ AES block 0 - round 11
add r12, r12, #1 @ CTR block 3
aese q3, v29.16b @ AES block 3 - round 11
bge .L192_enc_tail @ handle tail
@ -----------------------------------------------------------------------
@ First pass before the enc_192 main loop: load plaintext blocks 0-3,
@ apply the final round key (r13/r14) in GP registers, XOR with the
@ keystream q0-q3 (states after round 11 above), store the ciphertext,
@ and set up counter blocks 4-7 in q0-q2/r9 for the next pass.  Falls
@ into the main loop, or branches to prepretail when <= 8 blocks total.
@ -----------------------------------------------------------------------
rev r9, r12 @ CTR block 4
ldp r6, r7, [r0, #0] @ AES block 0 - load plaintext
#ifdef __ARMEB__
rev r6, r6
rev r7, r7
#endif
orr r9, r11, r9, lsl #32 @ CTR block 4
ldp r21, r22, [r0, #32] @ AES block 2 - load plaintext
#ifdef __ARMEB__
rev r21, r21
rev r22, r22
#endif
ldp r23, r24, [r0, #48] @ AES block 3 - load plaintext
#ifdef __ARMEB__
rev r23, r23
rev r24, r24
#endif
ldp r19, r20, [r0, #16] @ AES block 1 - load plaintext
#ifdef __ARMEB__
rev r19, r19
rev r20, r20
#endif
add r0, r0, #64 @ AES input_ptr update
cmp r0, r5 @ check if we have <= 8 blocks
eor r6, r6, r13 @ AES block 0 - round 12 low
eor r7, r7, r14 @ AES block 0 - round 12 high
eor r22, r22, r14 @ AES block 2 - round 12 high
fmov d4, r6 @ AES block 0 - mov low
eor r24, r24, r14 @ AES block 3 - round 12 high
fmov v4.d[1], r7 @ AES block 0 - mov high
eor r21, r21, r13 @ AES block 2 - round 12 low
eor r19, r19, r13 @ AES block 1 - round 12 low
fmov d5, r19 @ AES block 1 - mov low
eor r20, r20, r14 @ AES block 1 - round 12 high
fmov v5.d[1], r20 @ AES block 1 - mov high
eor r23, r23, r13 @ AES block 3 - round 12 low
fmov d6, r21 @ AES block 2 - mov low
add r12, r12, #1 @ CTR block 4
eor q4, q4, q0 @ AES block 0 - result
fmov d0, r10 @ CTR block 4
fmov v0.d[1], r9 @ CTR block 4
rev r9, r12 @ CTR block 5
orr r9, r11, r9, lsl #32 @ CTR block 5
add r12, r12, #1 @ CTR block 5
fmov d7, r23 @ AES block 3 - mov low
st1 { q4}, [r2], #16 @ AES block 0 - store result
fmov v6.d[1], r22 @ AES block 2 - mov high
eor q5, q5, q1 @ AES block 1 - result
fmov d1, r10 @ CTR block 5
st1 { q5}, [r2], #16 @ AES block 1 - store result
fmov v7.d[1], r24 @ AES block 3 - mov high
fmov v1.d[1], r9 @ CTR block 5
rev r9, r12 @ CTR block 6
orr r9, r11, r9, lsl #32 @ CTR block 6
add r12, r12, #1 @ CTR block 6
eor q6, q6, q2 @ AES block 2 - result
fmov d2, r10 @ CTR block 6
fmov v2.d[1], r9 @ CTR block 6
rev r9, r12 @ CTR block 7
orr r9, r11, r9, lsl #32 @ CTR block 7
st1 { q6}, [r2], #16 @ AES block 2 - store result
eor q7, q7, q3 @ AES block 3 - result
st1 { q7}, [r2], #16 @ AES block 3 - store result
bge .L192_enc_prepretail @ do prepretail
.L192_enc_main_loop:@ main loop start
aese q2, v18.16b
aesmc q2, q2 @ AES block 4k+6 - round 0
rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free)
aese q1, v18.16b
aesmc q1, q1 @ AES block 4k+5 - round 0
ldp r19, r20, [r0, #16] @ AES block 4k+5 - load plaintext
#ifdef __ARMEB__
rev r19, r19
rev r20, r20
#endif
ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0
fmov d3, r10 @ CTR block 4k+3
rev64 q4, q4 @ GHASH block 4k (only t0 is free)
aese q2, v19.16b
aesmc q2, q2 @ AES block 4k+6 - round 1
fmov v3.d[1], r9 @ CTR block 4k+3
pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high
rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
ldp r21, r22, [r0, #32] @ AES block 4k+6 - load plaintext
#ifdef __ARMEB__
rev r21, r21
rev r22, r22
#endif
aese q0, v18.16b
aesmc q0, q0 @ AES block 4k+4 - round 0
ldp r23, r24, [r0, #48] @ AES block 4k+3 - load plaintext
#ifdef __ARMEB__
rev r23, r23
rev r24, r24
#endif
pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low
eor q4, q4, v11.16b @ PRE 1
aese q1, v19.16b
aesmc q1, q1 @ AES block 4k+5 - round 1
aese q0, v19.16b
aesmc q0, q0 @ AES block 4k+4 - round 1
rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free)
aese q3, v18.16b
aesmc q3, q3 @ AES block 4k+7 - round 0
eor r24, r24, r14 @ AES block 4k+3 - round 12 high
pmull v11.1q, q4, v15.1d @ GHASH block 4k - low
mov d8, v4.d[1] @ GHASH block 4k - mid
aese q0, v20.16b
aesmc q0, q0 @ AES block 4k+4 - round 2
aese q3, v19.16b
aesmc q3, q3 @ AES block 4k+7 - round 1
eor r21, r21, r13 @ AES block 4k+6 - round 12 low
eor q8, q8, q4 @ GHASH block 4k - mid
eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low
aese q0, v21.16b
aesmc q0, q0 @ AES block 4k+4 - round 3
eor r19, r19, r13 @ AES block 4k+5 - round 12 low
aese q1, v20.16b
aesmc q1, q1 @ AES block 4k+5 - round 2
mov d31, v6.d[1] @ GHASH block 4k+2 - mid
pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high
mov d4, v5.d[1] @ GHASH block 4k+1 - mid
aese q2, v20.16b
aesmc q2, q2 @ AES block 4k+6 - round 2
aese q1, v21.16b
aesmc q1, q1 @ AES block 4k+5 - round 3
mov d10, v17.d[1] @ GHASH block 4k - mid
eor q9, q9, v30.16b @ GHASH block 4k+1 - high
aese q3, v20.16b
aesmc q3, q3 @ AES block 4k+7 - round 2
eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid
pmull2 v30.1q, q6, v13.2d @ GHASH block 4k+2 - high
aese q0, v22.16b
aesmc q0, q0 @ AES block 4k+4 - round 4
eor q4, q4, q5 @ GHASH block 4k+1 - mid
aese q3, v21.16b
aesmc q3, q3 @ AES block 4k+7 - round 3
pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high
eor r20, r20, r14 @ AES block 4k+5 - round 12 high
ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid
aese q0, v23.16b
aesmc q0, q0 @ AES block 4k+4 - round 5
add r12, r12, #1 @ CTR block 4k+3
aese q3, v22.16b
aesmc q3, q3 @ AES block 4k+7 - round 4
eor q9, q9, v30.16b @ GHASH block 4k+2 - high
pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid
eor r22, r22, r14 @ AES block 4k+6 - round 12 high
pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid
eor r23, r23, r13 @ AES block 4k+3 - round 12 low
mov d30, v7.d[1] @ GHASH block 4k+3 - mid
pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid
rev r9, r12 @ CTR block 4k+8
pmull v8.1q, q6, v13.1d @ GHASH block 4k+2 - low
orr r9, r11, r9, lsl #32 @ CTR block 4k+8
aese q2, v21.16b
aesmc q2, q2 @ AES block 4k+6 - round 3
eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid
aese q1, v22.16b
aesmc q1, q1 @ AES block 4k+5 - round 4
ldp r6, r7, [r0, #0] @ AES block 4k+4 - load plaintext
#ifdef __ARMEB__
rev r6, r6
rev r7, r7
#endif
aese q0, v24.16b
aesmc q0, q0 @ AES block 4k+4 - round 6
eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low
aese q2, v22.16b
aesmc q2, q2 @ AES block 4k+6 - round 4
add r0, r0, #64 @ AES input_ptr update
aese q1, v23.16b
aesmc q1, q1 @ AES block 4k+5 - round 5
movi q8, #0xc2
pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low
eor r7, r7, r14 @ AES block 4k+4 - round 12 high
eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid
aese q2, v23.16b
aesmc q2, q2 @ AES block 4k+6 - round 5
eor r6, r6, r13 @ AES block 4k+4 - round 12 low
aese q1, v24.16b
aesmc q1, q1 @ AES block 4k+5 - round 6
shl d8, d8, #56 @ mod_constant
aese q3, v23.16b
aesmc q3, q3 @ AES block 4k+7 - round 5
eor q9, q9, q5 @ GHASH block 4k+3 - high
aese q0, v25.16b
aesmc q0, q0 @ AES block 4k+4 - round 7
fmov d5, r19 @ AES block 4k+5 - mov low
aese q1, v25.16b
aesmc q1, q1 @ AES block 4k+5 - round 7
eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid
aese q3, v24.16b
aesmc q3, q3 @ AES block 4k+7 - round 6
fmov v5.d[1], r20 @ AES block 4k+5 - mov high
aese q0, v26.16b
aesmc q0, q0 @ AES block 4k+4 - round 8
eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low
pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid
cmp r0, r5 @ .LOOP CONTROL
fmov d4, r6 @ AES block 4k+4 - mov low
aese q2, v24.16b
aesmc q2, q2 @ AES block 4k+6 - round 6
fmov v4.d[1], r7 @ AES block 4k+4 - mov high
aese q1, v26.16b
aesmc q1, q1 @ AES block 4k+5 - round 8
fmov d7, r23 @ AES block 4k+3 - mov low
eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid
eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up
add r12, r12, #1 @ CTR block 4k+8
aese q2, v25.16b
aesmc q2, q2 @ AES block 4k+6 - round 7
fmov v7.d[1], r24 @ AES block 4k+3 - mov high
pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid
ext q9, q9, q9, #8 @ MODULO - other top alignment
fmov d6, r21 @ AES block 4k+6 - mov low
aese q3, v25.16b
aesmc q3, q3 @ AES block 4k+7 - round 7
aese q0, v27.16b
aesmc q0, q0 @ AES block 4k+4 - round 9
eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up
aese q2, v26.16b
aesmc q2, q2 @ AES block 4k+6 - round 8
aese q3, v26.16b
aesmc q3, q3 @ AES block 4k+7 - round 8
aese q1, v27.16b
aesmc q1, q1 @ AES block 4k+5 - round 9
aese q0, v28.16b
aesmc q0, q0 @ AES block 4k+4 - round 10
eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid
aese q3, v27.16b
aesmc q3, q3 @ AES block 4k+7 - round 9
aese q2, v27.16b
aesmc q2, q2 @ AES block 4k+6 - round 9
aese q0, v29.16b @ AES block 4k+4 - round 11
aese q1, v28.16b
aesmc q1, q1 @ AES block 4k+5 - round 10
eor v10.16b, v10.16b, q9 @ MODULO - fold into mid
aese q2, v28.16b
aesmc q2, q2 @ AES block 4k+6 - round 10
eor q4, q4, q0 @ AES block 4k+4 - result
fmov d0, r10 @ CTR block 4k+8
aese q1, v29.16b @ AES block 4k+5 - round 11
fmov v0.d[1], r9 @ CTR block 4k+8
rev r9, r12 @ CTR block 4k+9
pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low
fmov v6.d[1], r22 @ AES block 4k+6 - mov high
st1 { q4}, [r2], #16 @ AES block 4k+4 - store result
aese q3, v28.16b
aesmc q3, q3 @ AES block 4k+7 - round 10
orr r9, r11, r9, lsl #32 @ CTR block 4k+9
eor q5, q5, q1 @ AES block 4k+5 - result
add r12, r12, #1 @ CTR block 4k+9
fmov d1, r10 @ CTR block 4k+9
aese q2, v29.16b @ AES block 4k+6 - round 11
fmov v1.d[1], r9 @ CTR block 4k+9
rev r9, r12 @ CTR block 4k+10
add r12, r12, #1 @ CTR block 4k+10
ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment
orr r9, r11, r9, lsl #32 @ CTR block 4k+10
st1 { q5}, [r2], #16 @ AES block 4k+5 - store result
eor v11.16b, v11.16b, q9 @ MODULO - fold into low
aese q3, v29.16b @ AES block 4k+7 - round 11
eor q6, q6, q2 @ AES block 4k+6 - result
fmov d2, r10 @ CTR block 4k+10
st1 { q6}, [r2], #16 @ AES block 4k+6 - store result
fmov v2.d[1], r9 @ CTR block 4k+10
rev r9, r12 @ CTR block 4k+11
eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low
orr r9, r11, r9, lsl #32 @ CTR block 4k+11
eor q7, q7, q3 @ AES block 4k+3 - result
st1 { q7}, [r2], #16 @ AES block 4k+3 - store result
blt .L192_enc_main_loop
.L192_enc_prepretail:@ PREPRETAIL
aese q0, v18.16b
aesmc q0, q0 @ AES block 4k+4 - round 0
rev64 q4, q4 @ GHASH block 4k (only t0 is free)
fmov d3, r10 @ CTR block 4k+3
ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0
add r12, r12, #1 @ CTR block 4k+3
aese q1, v18.16b
aesmc q1, q1 @ AES block 4k+5 - round 0
rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free)
aese q2, v18.16b
aesmc q2, q2 @ AES block 4k+6 - round 0
fmov v3.d[1], r9 @ CTR block 4k+3
eor q4, q4, v11.16b @ PRE 1
mov d10, v17.d[1] @ GHASH block 4k - mid
aese q1, v19.16b
aesmc q1, q1 @ AES block 4k+5 - round 1
rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free)
pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high
pmull v11.1q, q4, v15.1d @ GHASH block 4k - low
mov d8, v4.d[1] @ GHASH block 4k - mid
pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low
rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high
eor q8, q8, q4 @ GHASH block 4k - mid
mov d4, v5.d[1] @ GHASH block 4k+1 - mid
eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low
mov d31, v6.d[1] @ GHASH block 4k+2 - mid
aese q3, v18.16b
aesmc q3, q3 @ AES block 4k+7 - round 0
eor q9, q9, v30.16b @ GHASH block 4k+1 - high
pmull2 v30.1q, q6, v13.2d @ GHASH block 4k+2 - high
eor q4, q4, q5 @ GHASH block 4k+1 - mid
eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid
aese q3, v19.16b
aesmc q3, q3 @ AES block 4k+7 - round 1
aese q2, v19.16b
aesmc q2, q2 @ AES block 4k+6 - round 1
eor q9, q9, v30.16b @ GHASH block 4k+2 - high
aese q0, v19.16b
aesmc q0, q0 @ AES block 4k+4 - round 1
aese q1, v20.16b
aesmc q1, q1 @ AES block 4k+5 - round 2
mov d30, v7.d[1] @ GHASH block 4k+3 - mid
pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high
ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid
aese q0, v20.16b
aesmc q0, q0 @ AES block 4k+4 - round 2
pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid
eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid
aese q1, v21.16b
aesmc q1, q1 @ AES block 4k+5 - round 3
pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid
pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid
pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid
eor q9, q9, q5 @ GHASH block 4k+3 - high
pmull v8.1q, q6, v13.1d @ GHASH block 4k+2 - low
aese q0, v21.16b
aesmc q0, q0 @ AES block 4k+4 - round 3
eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid
aese q3, v20.16b
aesmc q3, q3 @ AES block 4k+7 - round 2
aese q2, v20.16b
aesmc q2, q2 @ AES block 4k+6 - round 2
eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low
aese q0, v22.16b
aesmc q0, q0 @ AES block 4k+4 - round 4
aese q3, v21.16b
aesmc q3, q3 @ AES block 4k+7 - round 3
eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid
aese q2, v21.16b
aesmc q2, q2 @ AES block 4k+6 - round 3
pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low
movi q8, #0xc2
aese q3, v22.16b
aesmc q3, q3 @ AES block 4k+7 - round 4
aese q2, v22.16b
aesmc q2, q2 @ AES block 4k+6 - round 4
aese q1, v22.16b
aesmc q1, q1 @ AES block 4k+5 - round 4
eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid
aese q3, v23.16b
aesmc q3, q3 @ AES block 4k+7 - round 5
aese q2, v23.16b
aesmc q2, q2 @ AES block 4k+6 - round 5
aese q1, v23.16b
aesmc q1, q1 @ AES block 4k+5 - round 5
eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low
aese q0, v23.16b
aesmc q0, q0 @ AES block 4k+4 - round 5
aese q3, v24.16b
aesmc q3, q3 @ AES block 4k+7 - round 6
eor v10.16b, v10.16b, q9 @ karatsuba tidy up
aese q1, v24.16b
aesmc q1, q1 @ AES block 4k+5 - round 6
aese q0, v24.16b
aesmc q0, q0 @ AES block 4k+4 - round 6
shl d8, d8, #56 @ mod_constant
aese q3, v25.16b
aesmc q3, q3 @ AES block 4k+7 - round 7
aese q1, v25.16b
aesmc q1, q1 @ AES block 4k+5 - round 7
eor v10.16b, v10.16b, v11.16b
aese q0, v25.16b
aesmc q0, q0 @ AES block 4k+4 - round 7
pmull v30.1q, q9, q8
aese q2, v24.16b
aesmc q2, q2 @ AES block 4k+6 - round 6
ext q9, q9, q9, #8
aese q0, v26.16b
aesmc q0, q0 @ AES block 4k+4 - round 8
aese q1, v26.16b
aesmc q1, q1 @ AES block 4k+5 - round 8
eor v10.16b, v10.16b, v30.16b
aese q2, v25.16b
aesmc q2, q2 @ AES block 4k+6 - round 7
aese q3, v26.16b
aesmc q3, q3 @ AES block 4k+7 - round 8
aese q0, v27.16b
aesmc q0, q0 @ AES block 4k+4 - round 9
aese q2, v26.16b
aesmc q2, q2 @ AES block 4k+6 - round 8
eor v10.16b, v10.16b, q9
aese q3, v27.16b
aesmc q3, q3 @ AES block 4k+7 - round 9
aese q1, v27.16b
aesmc q1, q1 @ AES block 4k+5 - round 9
aese q2, v27.16b
aesmc q2, q2 @ AES block 4k+6 - round 9
pmull v30.1q, v10.1d, q8
ext v10.16b, v10.16b, v10.16b, #8
aese q3, v28.16b
aesmc q3, q3 @ AES block 4k+7 - round 10
aese q0, v28.16b
aesmc q0, q0 @ AES block 4k+4 - round 10
aese q2, v28.16b
aesmc q2, q2 @ AES block 4k+6 - round 10
aese q1, v28.16b
aesmc q1, q1 @ AES block 4k+5 - round 10
eor v11.16b, v11.16b, v30.16b
aese q0, v29.16b @ AES block 4k+4 - round 11
aese q3, v29.16b @ AES block 4k+7 - round 11
aese q2, v29.16b @ AES block 4k+6 - round 11
aese q1, v29.16b @ AES block 4k+5 - round 11
eor v11.16b, v11.16b, v10.16b
.L192_enc_tail:@ TAIL
sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process
ldp r6, r7, [r0], #16 @ AES block 4k+4 - load plaintext
#ifdef __ARMEB__
rev r6, r6
rev r7, r7
#endif
eor r6, r6, r13 @ AES block 4k+4 - round 12 low
eor r7, r7, r14 @ AES block 4k+4 - round 12 high
fmov d4, r6 @ AES block 4k+4 - mov low
fmov v4.d[1], r7 @ AES block 4k+4 - mov high
cmp r5, #48
eor q5, q4, q0 @ AES block 4k+4 - result
ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag
bgt .L192_enc_blocks_more_than_3
sub r12, r12, #1
movi v10.8b, #0
mov q3, q2
movi q9, #0
cmp r5, #32
mov q2, q1
movi v11.8b, #0
bgt .L192_enc_blocks_more_than_2
sub r12, r12, #1
mov q3, q1
cmp r5, #16
bgt .L192_enc_blocks_more_than_1
sub r12, r12, #1
b .L192_enc_blocks_less_than_1
.L192_enc_blocks_more_than_3:@ blocks left > 3
st1 { q5}, [r2], #16 @ AES final-3 block - store result
ldp r6, r7, [r0], #16 @ AES final-2 block - load input low & high
#ifdef __ARMEB__
rev r6, r6
rev r7, r7
#endif
rev64 q4, q5 @ GHASH final-3 block
eor r6, r6, r13 @ AES final-2 block - round 12 low
eor q4, q4, q8 @ feed in partial tag
eor r7, r7, r14 @ AES final-2 block - round 12 high
fmov d5, r6 @ AES final-2 block - mov low
fmov v5.d[1], r7 @ AES final-2 block - mov high
mov d22, v4.d[1] @ GHASH final-3 block - mid
pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low
mov d10, v17.d[1] @ GHASH final-3 block - mid
eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid
movi q8, #0 @ suppress further partial tag feed in
pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high
pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid
eor q5, q5, q1 @ AES final-2 block - result
.L192_enc_blocks_more_than_2:@ blocks left > 2
st1 { q5}, [r2], #16 @ AES final-2 block - store result
rev64 q4, q5 @ GHASH final-2 block
ldp r6, r7, [r0], #16 @ AES final-1 block - load input low & high
#ifdef __ARMEB__
rev r6, r6
rev r7, r7
#endif
eor q4, q4, q8 @ feed in partial tag
eor r7, r7, r14 @ AES final-1 block - round 12 high
pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high
mov d22, v4.d[1] @ GHASH final-2 block - mid
pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low
eor r6, r6, r13 @ AES final-1 block - round 12 low
fmov d5, r6 @ AES final-1 block - mov low
fmov v5.d[1], r7 @ AES final-1 block - mov high
eor q9, q9, v20.16b @ GHASH final-2 block - high
eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid
eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low
pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid
movi q8, #0 @ suppress further partial tag feed in
eor q5, q5, q2 @ AES final-1 block - result
eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid
.L192_enc_blocks_more_than_1:@ blocks left > 1
st1 { q5}, [r2], #16 @ AES final-1 block - store result
ldp r6, r7, [r0], #16 @ AES final block - load input low & high
#ifdef __ARMEB__
rev r6, r6
rev r7, r7
#endif
rev64 q4, q5 @ GHASH final-1 block
eor r6, r6, r13 @ AES final block - round 12 low
eor q4, q4, q8 @ feed in partial tag
movi q8, #0 @ suppress further partial tag feed in
mov d22, v4.d[1] @ GHASH final-1 block - mid
eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid
eor r7, r7, r14 @ AES final block - round 12 high
fmov d5, r6 @ AES final block - mov low
pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high
fmov v5.d[1], r7 @ AES final block - mov high
ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid
eor q9, q9, v20.16b @ GHASH final-1 block - high
pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low
pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid
eor q5, q5, q3 @ AES final block - result
eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low
eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid
.L192_enc_blocks_less_than_1:@ blocks left <= 1
ld1 { v18.16b}, [r2] @ load existing bytes where the possibly partial last block is to be stored
#ifndef __ARMEB__
rev r9, r12
#else
mov r9, r12
#endif
and r1, r1, #127 @ bit_length %= 128
sub r1, r1, #128 @ bit_length -= 128
mvn r14, xzr @ rk12_h = 0xffffffffffffffff
neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128])
mvn r13, xzr @ rk12_l = 0xffffffffffffffff
and r1, r1, #127 @ bit_length %= 128
lsr r14, r14, r1 @ rk12_h is mask for top 64b of last block
cmp r1, #64
csel r6, r13, r14, lt
csel r7, r14, xzr, lt
fmov d0, r6 @ ctr0b is mask for last block
fmov v0.d[1], r7
and q5, q5, q0 @ possibly partial last block has zeroes in highest bits
rev64 q4, q5 @ GHASH final block
eor q4, q4, q8 @ feed in partial tag
mov d8, v4.d[1] @ GHASH final block - mid
pmull v21.1q, q4, v12.1d @ GHASH final block - low
pmull2 v20.1q, q4, v12.2d @ GHASH final block - high
eor q8, q8, q4 @ GHASH final block - mid
eor v11.16b, v11.16b, v21.16b @ GHASH final block - low
eor q9, q9, v20.16b @ GHASH final block - high
pmull v8.1q, q8, v16.1d @ GHASH final block - mid
eor v10.16b, v10.16b, q8 @ GHASH final block - mid
movi q8, #0xc2
eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up
shl d8, d8, #56 @ mod_constant
bif q5, v18.16b, q0 @ insert existing bytes in top end of result before storing
eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up
pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid
ext q9, q9, q9, #8 @ MODULO - other top alignment
eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid
eor v10.16b, v10.16b, q9 @ MODULO - fold into mid
pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low
ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment
eor v11.16b, v11.16b, q9 @ MODULO - fold into low
str r9, [r16, #12] @ store the updated counter
st1 { q5}, [r2] @ store all 16B
eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low
ext v11.16b, v11.16b, v11.16b, #8
rev64 v11.16b, v11.16b
mov r0, r15
st1 { v11.16b }, [r3]
ldp r21, r22, [sp, #16]
ldp r23, r24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp r19, r20, [sp], #112
RET
.L192_enc_ret:
mov r0, #0x0
RET
.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
.globl aes_gcm_dec_192_kernel
.type aes_gcm_dec_192_kernel,%function
.align 4
aes_gcm_dec_192_kernel:
cbz r1, .L192_dec_ret
stp r19, r20, [sp, #-112]!
mov r16, r4
mov r8, r5
stp r21, r22, [sp, #16]
stp r23, r24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
add r4, r0, r1, lsr #3 @ end_input_ptr
ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32
#ifdef __ARMEB__
rev r10, r10
rev r11, r11
#endif
ldp r13, r14, [r8, #192] @ load rk12
#ifdef __ARMEB__
ror r13, r13, #32
ror r14, r14, #32
#endif
ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible
ld1 {v18.4s}, [r8], #16 @ load rk0
lsr r5, r1, #3 @ byte_len
mov r15, r5
ld1 {v19.4s}, [r8], #16 @ load rk1
lsr r12, r11, #32
orr r11, r11, r11
fmov d3, r10 @ CTR block 3
rev r12, r12 @ rev_ctr32
fmov d1, r10 @ CTR block 1
add r12, r12, #1 @ increment rev_ctr32
ld1 {v20.4s}, [r8], #16 @ load rk2
aese q0, v18.16b
aesmc q0, q0 @ AES block 0 - round 0
rev r9, r12 @ CTR block 1
add r12, r12, #1 @ CTR block 1
orr r9, r11, r9, lsl #32 @ CTR block 1
ld1 {v21.4s}, [r8], #16 @ load rk3
fmov v1.d[1], r9 @ CTR block 1
rev r9, r12 @ CTR block 2
add r12, r12, #1 @ CTR block 2
fmov d2, r10 @ CTR block 2
orr r9, r11, r9, lsl #32 @ CTR block 2
fmov v2.d[1], r9 @ CTR block 2
rev r9, r12 @ CTR block 3
aese q0, v19.16b
aesmc q0, q0 @ AES block 0 - round 1
orr r9, r11, r9, lsl #32 @ CTR block 3
fmov v3.d[1], r9 @ CTR block 3
ld1 {v22.4s}, [r8], #16 @ load rk4
aese q0, v20.16b
aesmc q0, q0 @ AES block 0 - round 2
aese q2, v18.16b
aesmc q2, q2 @ AES block 2 - round 0
ld1 {v23.4s}, [r8], #16 @ load rk5
aese q1, v18.16b
aesmc q1, q1 @ AES block 1 - round 0
ldr q15, [r3, #112] @ load h4l | h4h
#ifndef __ARMEB__
ext v15.16b, v15.16b, v15.16b, #8
#endif
aese q3, v18.16b
aesmc q3, q3 @ AES block 3 - round 0
ldr q13, [r3, #64] @ load h2l | h2h
#ifndef __ARMEB__
ext v13.16b, v13.16b, v13.16b, #8
#endif
aese q2, v19.16b
aesmc q2, q2 @ AES block 2 - round 1
ldr q14, [r3, #80] @ load h3l | h3h
#ifndef __ARMEB__
ext v14.16b, v14.16b, v14.16b, #8
#endif
aese q1, v19.16b
aesmc q1, q1 @ AES block 1 - round 1
aese q3, v19.16b
aesmc q3, q3 @ AES block 3 - round 1
ldr q12, [r3, #32] @ load h1l | h1h
#ifndef __ARMEB__
ext v12.16b, v12.16b, v12.16b, #8
#endif
aese q2, v20.16b
aesmc q2, q2 @ AES block 2 - round 2
ld1 {v24.4s}, [r8], #16 @ load rk6
aese q0, v21.16b
aesmc q0, q0 @ AES block 0 - round 3
ld1 {v25.4s}, [r8], #16 @ load rk7
aese q1, v20.16b
aesmc q1, q1 @ AES block 1 - round 2
ld1 {v26.4s}, [r8], #16 @ load rk8
aese q3, v20.16b
aesmc q3, q3 @ AES block 3 - round 2
ld1 {v27.4s}, [r8], #16 @ load rk9
aese q2, v21.16b
aesmc q2, q2 @ AES block 2 - round 3
ld1 { v11.16b}, [r3]
ext v11.16b, v11.16b, v11.16b, #8
rev64 v11.16b, v11.16b
aese q1, v21.16b
aesmc q1, q1 @ AES block 1 - round 3
add r12, r12, #1 @ CTR block 3
aese q3, v21.16b
aesmc q3, q3 @ AES block 3 - round 3
trn1 q9, v14.2d, v15.2d @ h4h | h3h
aese q0, v22.16b
aesmc q0, q0 @ AES block 0 - round 4
ld1 {v28.4s}, [r8], #16 @ load rk10
aese q1, v22.16b
aesmc q1, q1 @ AES block 1 - round 4
trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l
aese q2, v22.16b
aesmc q2, q2 @ AES block 2 - round 4
aese q3, v22.16b
aesmc q3, q3 @ AES block 3 - round 4
trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l
aese q0, v23.16b
aesmc q0, q0 @ AES block 0 - round 5
ld1 {v29.4s}, [r8], #16 @ load rk11
aese q1, v23.16b
aesmc q1, q1 @ AES block 1 - round 5
aese q2, v23.16b
aesmc q2, q2 @ AES block 2 - round 5
aese q3, v23.16b
aesmc q3, q3 @ AES block 3 - round 5
aese q0, v24.16b
aesmc q0, q0 @ AES block 0 - round 6
aese q2, v24.16b
aesmc q2, q2 @ AES block 2 - round 6
aese q3, v24.16b
aesmc q3, q3 @ AES block 3 - round 6
aese q0, v25.16b
aesmc q0, q0 @ AES block 0 - round 7
aese q2, v25.16b
aesmc q2, q2 @ AES block 2 - round 7
aese q3, v25.16b
aesmc q3, q3 @ AES block 3 - round 7
aese q1, v24.16b
aesmc q1, q1 @ AES block 1 - round 6
aese q2, v26.16b
aesmc q2, q2 @ AES block 2 - round 8
aese q3, v26.16b
aesmc q3, q3 @ AES block 3 - round 8
aese q1, v25.16b
aesmc q1, q1 @ AES block 1 - round 7
aese q2, v27.16b
aesmc q2, q2 @ AES block 2 - round 9
aese q3, v27.16b
aesmc q3, q3 @ AES block 3 - round 9
aese q1, v26.16b
aesmc q1, q1 @ AES block 1 - round 8
sub r5, r5, #1 @ byte_len - 1
aese q0, v26.16b
aesmc q0, q0 @ AES block 0 - round 8
and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
aese q3, v28.16b
aesmc q3, q3 @ AES block 3 - round 10
add r5, r5, r0
aese q1, v27.16b
aesmc q1, q1 @ AES block 1 - round 9
cmp r0, r5 @ check if we have <= 4 blocks
aese q0, v27.16b
aesmc q0, q0 @ AES block 0 - round 9
trn1 q8, v12.2d, v13.2d @ h2h | h1h
aese q3, v29.16b @ AES block 3 - round 11
aese q2, v28.16b
aesmc q2, q2 @ AES block 2 - round 10
aese q1, v28.16b
aesmc q1, q1 @ AES block 1 - round 10
aese q0, v28.16b
aesmc q0, q0 @ AES block 0 - round 10
eor v16.16b, v16.16b, q8 @ h2k | h1k
aese q2, v29.16b @ AES block 2 - round 11
aese q1, v29.16b @ AES block 1 - round 11
eor v17.16b, v17.16b, q9 @ h4k | h3k
aese q0, v29.16b @ AES block 0 - round 11
bge .L192_dec_tail @ handle tail
ld1 {q4, q5}, [r0], #32 @ AES block 0,1 - load ciphertext
eor q1, q5, q1 @ AES block 1 - result
eor q0, q4, q0 @ AES block 0 - result
rev r9, r12 @ CTR block 4
ld1 {q6, q7}, [r0], #32 @ AES block 2,3 - load ciphertext
mov r19, v1.d[0] @ AES block 1 - mov low
mov r20, v1.d[1] @ AES block 1 - mov high
mov r6, v0.d[0] @ AES block 0 - mov low
orr r9, r11, r9, lsl #32 @ CTR block 4
add r12, r12, #1 @ CTR block 4
mov r7, v0.d[1] @ AES block 0 - mov high
rev64 q4, q4 @ GHASH block 0
fmov d0, r10 @ CTR block 4
rev64 q5, q5 @ GHASH block 1
cmp r0, r5 @ check if we have <= 8 blocks
eor r19, r19, r13 @ AES block 1 - round 12 low
#ifdef __ARMEB__
rev r19, r19
#endif
fmov v0.d[1], r9 @ CTR block 4
rev r9, r12 @ CTR block 5
orr r9, r11, r9, lsl #32 @ CTR block 5
fmov d1, r10 @ CTR block 5
eor r20, r20, r14 @ AES block 1 - round 12 high
#ifdef __ARMEB__
rev r20, r20
#endif
add r12, r12, #1 @ CTR block 5
fmov v1.d[1], r9 @ CTR block 5
eor r6, r6, r13 @ AES block 0 - round 12 low
#ifdef __ARMEB__
rev r6, r6
#endif
rev r9, r12 @ CTR block 6
eor r7, r7, r14 @ AES block 0 - round 12 high
#ifdef __ARMEB__
rev r7, r7
#endif
stp r6, r7, [r2], #16 @ AES block 0 - store result
orr r9, r11, r9, lsl #32 @ CTR block 6
stp r19, r20, [r2], #16 @ AES block 1 - store result
add r12, r12, #1 @ CTR block 6
eor q2, q6, q2 @ AES block 2 - result
bge .L192_dec_prepretail @ do prepretail
.L192_dec_main_loop:@ main loop start
aese q1, v18.16b
aesmc q1, q1 @ AES block 4k+5 - round 0
ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0
pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low
mov r21, v2.d[0] @ AES block 4k+2 - mov low
mov r22, v2.d[1] @ AES block 4k+2 - mov high
eor q3, q7, q3 @ AES block 4k+3 - result
rev64 q7, q7 @ GHASH block 4k+3
aese q1, v19.16b
aesmc q1, q1 @ AES block 4k+5 - round 1
fmov d2, r10 @ CTR block 4k+6
aese q0, v18.16b
aesmc q0, q0 @ AES block 4k+4 - round 0
eor q4, q4, v11.16b @ PRE 1
pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high
fmov v2.d[1], r9 @ CTR block 4k+6
aese q1, v20.16b
aesmc q1, q1 @ AES block 4k+5 - round 2
mov r24, v3.d[1] @ AES block 4k+3 - mov high
aese q0, v19.16b
aesmc q0, q0 @ AES block 4k+4 - round 1
mov r23, v3.d[0] @ AES block 4k+3 - mov low
pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high
fmov d3, r10 @ CTR block 4k+7
mov d8, v4.d[1] @ GHASH block 4k - mid
pmull v11.1q, q4, v15.1d @ GHASH block 4k - low
mov d10, v17.d[1] @ GHASH block 4k - mid
rev r9, r12 @ CTR block 4k+7
aese q2, v18.16b
aesmc q2, q2 @ AES block 4k+6 - round 0
orr r9, r11, r9, lsl #32 @ CTR block 4k+7
fmov v3.d[1], r9 @ CTR block 4k+7
eor q8, q8, q4 @ GHASH block 4k - mid
mov d4, v5.d[1] @ GHASH block 4k+1 - mid
aese q1, v21.16b
aesmc q1, q1 @ AES block 4k+5 - round 3
aese q0, v20.16b
aesmc q0, q0 @ AES block 4k+4 - round 2
eor r22, r22, r14 @ AES block 4k+2 - round 12 high
#ifdef __ARMEB__
rev r22, r22
#endif
aese q2, v19.16b
aesmc q2, q2 @ AES block 4k+6 - round 1
eor q4, q4, q5 @ GHASH block 4k+1 - mid
pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid
aese q3, v18.16b
aesmc q3, q3 @ AES block 4k+7 - round 0
rev64 q6, q6 @ GHASH block 4k+2
aese q2, v20.16b
aesmc q2, q2 @ AES block 4k+6 - round 2
pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid
eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low
eor r21, r21, r13 @ AES block 4k+2 - round 12 low
#ifdef __ARMEB__
rev r21, r21
#endif
aese q1, v22.16b
aesmc q1, q1 @ AES block 4k+5 - round 4
aese q0, v21.16b
aesmc q0, q0 @ AES block 4k+4 - round 3
eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid
mov d31, v6.d[1] @ GHASH block 4k+2 - mid
aese q3, v19.16b
aesmc q3, q3 @ AES block 4k+7 - round 1
eor q9, q9, v30.16b @ GHASH block 4k+1 - high
aese q0, v22.16b
aesmc q0, q0 @ AES block 4k+4 - round 4
pmull2 v30.1q, q6, v13.2d @ GHASH block 4k+2 - high
eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid
pmull v8.1q, q6, v13.1d @ GHASH block 4k+2 - low
aese q0, v23.16b
aesmc q0, q0 @ AES block 4k+4 - round 5
eor q9, q9, v30.16b @ GHASH block 4k+2 - high
mov d30, v7.d[1] @ GHASH block 4k+3 - mid
aese q1, v23.16b
aesmc q1, q1 @ AES block 4k+5 - round 5
pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high
aese q3, v20.16b
aesmc q3, q3 @ AES block 4k+7 - round 2
eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid
aese q1, v24.16b
aesmc q1, q1 @ AES block 4k+5 - round 6
aese q0, v24.16b
aesmc q0, q0 @ AES block 4k+4 - round 6
ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid
aese q3, v21.16b
aesmc q3, q3 @ AES block 4k+7 - round 3
pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid
eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low
aese q0, v25.16b
aesmc q0, q0 @ AES block 4k+4 - round 7
pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid
eor q9, q9, q5 @ GHASH block 4k+3 - high
aese q1, v25.16b
aesmc q1, q1 @ AES block 4k+5 - round 7
aese q0, v26.16b
aesmc q0, q0 @ AES block 4k+4 - round 8
movi q8, #0xc2
pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low
aese q1, v26.16b
aesmc q1, q1 @ AES block 4k+5 - round 8
eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid
aese q2, v21.16b
aesmc q2, q2 @ AES block 4k+6 - round 3
aese q0, v27.16b
aesmc q0, q0 @ AES block 4k+4 - round 9
eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low
aese q3, v22.16b
aesmc q3, q3 @ AES block 4k+7 - round 4
aese q2, v22.16b
aesmc q2, q2 @ AES block 4k+6 - round 4
eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid
aese q0, v28.16b
aesmc q0, q0 @ AES block 4k+4 - round 10
aese q1, v27.16b
aesmc q1, q1 @ AES block 4k+5 - round 9
eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up
aese q2, v23.16b
aesmc q2, q2 @ AES block 4k+6 - round 5
aese q3, v23.16b
aesmc q3, q3 @ AES block 4k+7 - round 5
shl d8, d8, #56 @ mod_constant
aese q1, v28.16b
aesmc q1, q1 @ AES block 4k+5 - round 10
aese q2, v24.16b
aesmc q2, q2 @ AES block 4k+6 - round 6
ld1 {q4}, [r0], #16 @ AES block 4k+4 - load ciphertext
aese q3, v24.16b
aesmc q3, q3 @ AES block 4k+7 - round 6
eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up
pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid
ld1 {q5}, [r0], #16 @ AES block 4k+5 - load ciphertext
eor r23, r23, r13 @ AES block 4k+3 - round 12 low
#ifdef __ARMEB__
rev r23, r23
#endif
aese q2, v25.16b
aesmc q2, q2 @ AES block 4k+6 - round 7
ext q9, q9, q9, #8 @ MODULO - other top alignment
aese q0, v29.16b @ AES block 4k+4 - round 11
add r12, r12, #1 @ CTR block 4k+7
aese q3, v25.16b
aesmc q3, q3 @ AES block 4k+7 - round 7
eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid
aese q2, v26.16b
aesmc q2, q2 @ AES block 4k+6 - round 8
ld1 {q6}, [r0], #16 @ AES block 4k+6 - load ciphertext
aese q1, v29.16b @ AES block 4k+5 - round 11
ld1 {q7}, [r0], #16 @ AES block 4k+7 - load ciphertext
rev r9, r12 @ CTR block 4k+8
aese q3, v26.16b
aesmc q3, q3 @ AES block 4k+7 - round 8
stp r21, r22, [r2], #16 @ AES block 4k+2 - store result
aese q2, v27.16b
aesmc q2, q2 @ AES block 4k+6 - round 9
eor v10.16b, v10.16b, q9 @ MODULO - fold into mid
cmp r0, r5 @ .LOOP CONTROL
eor q0, q4, q0 @ AES block 4k+4 - result
eor r24, r24, r14 @ AES block 4k+3 - round 12 high
#ifdef __ARMEB__
rev r24, r24
#endif
eor q1, q5, q1 @ AES block 4k+5 - result
aese q2, v28.16b
aesmc q2, q2 @ AES block 4k+6 - round 10
orr r9, r11, r9, lsl #32 @ CTR block 4k+8
aese q3, v27.16b
aesmc q3, q3 @ AES block 4k+7 - round 9
pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low
mov r19, v1.d[0] @ AES block 4k+5 - mov low
mov r6, v0.d[0] @ AES block 4k+4 - mov low
stp r23, r24, [r2], #16 @ AES block 4k+3 - store result
rev64 q5, q5 @ GHASH block 4k+5
aese q2, v29.16b @ AES block 4k+6 - round 11
mov r7, v0.d[1] @ AES block 4k+4 - mov high
aese q3, v28.16b
aesmc q3, q3 @ AES block 4k+7 - round 10
mov r20, v1.d[1] @ AES block 4k+5 - mov high
fmov d0, r10 @ CTR block 4k+8
add r12, r12, #1 @ CTR block 4k+8
ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment
eor q2, q6, q2 @ AES block 4k+6 - result
fmov v0.d[1], r9 @ CTR block 4k+8
rev r9, r12 @ CTR block 4k+9
eor r6, r6, r13 @ AES block 4k+4 - round 12 low
#ifdef __ARMEB__
rev r6, r6
#endif
orr r9, r11, r9, lsl #32 @ CTR block 4k+9
eor v11.16b, v11.16b, q8 @ MODULO - fold into low
fmov d1, r10 @ CTR block 4k+9
add r12, r12, #1 @ CTR block 4k+9
eor r19, r19, r13 @ AES block 4k+5 - round 12 low
#ifdef __ARMEB__
rev r19, r19
#endif
fmov v1.d[1], r9 @ CTR block 4k+9
rev r9, r12 @ CTR block 4k+10
eor r20, r20, r14 @ AES block 4k+5 - round 12 high
#ifdef __ARMEB__
rev r20, r20
#endif
eor r7, r7, r14 @ AES block 4k+4 - round 12 high
#ifdef __ARMEB__
rev r7, r7
#endif
stp r6, r7, [r2], #16 @ AES block 4k+4 - store result
eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low
add r12, r12, #1 @ CTR block 4k+10
rev64 q4, q4 @ GHASH block 4k+4
orr r9, r11, r9, lsl #32 @ CTR block 4k+10
aese q3, v29.16b @ AES block 4k+7 - round 11
stp r19, r20, [r2], #16 @ AES block 4k+5 - store result
blt .L192_dec_main_loop
.L192_dec_prepretail:@ PREPRETAIL
mov r22, v2.d[1] @ AES block 4k+2 - mov high
ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0
eor q3, q7, q3 @ AES block 4k+3 - result
aese q1, v18.16b
aesmc q1, q1 @ AES block 4k+5 - round 0
mov r21, v2.d[0] @ AES block 4k+2 - mov low
aese q0, v18.16b
aesmc q0, q0 @ AES block 4k+4 - round 0
mov d10, v17.d[1] @ GHASH block 4k - mid
eor q4, q4, v11.16b @ PRE 1
fmov d2, r10 @ CTR block 4k+6
aese q1, v19.16b
aesmc q1, q1 @ AES block 4k+5 - round 1
mov r23, v3.d[0] @ AES block 4k+3 - mov low
aese q0, v19.16b
aesmc q0, q0 @ AES block 4k+4 - round 1
mov r24, v3.d[1] @ AES block 4k+3 - mov high
pmull v11.1q, q4, v15.1d @ GHASH block 4k - low
mov d8, v4.d[1] @ GHASH block 4k - mid
fmov d3, r10 @ CTR block 4k+7
aese q1, v20.16b
aesmc q1, q1 @ AES block 4k+5 - round 2
rev64 q6, q6 @ GHASH block 4k+2
pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high
fmov v2.d[1], r9 @ CTR block 4k+6
rev r9, r12 @ CTR block 4k+7
orr r9, r11, r9, lsl #32 @ CTR block 4k+7
eor q8, q8, q4 @ GHASH block 4k - mid
mov d4, v5.d[1] @ GHASH block 4k+1 - mid
pmull v31.1q, q5, v14.1d @ GHASH block 4k+1 - low
eor r24, r24, r14 @ AES block 4k+3 - round 12 high
#ifdef __ARMEB__
rev r24, r24
#endif
fmov v3.d[1], r9 @ CTR block 4k+7
aese q0, v20.16b
aesmc q0, q0 @ AES block 4k+4 - round 2
eor r21, r21, r13 @ AES block 4k+2 - round 12 low
#ifdef __ARMEB__
rev r21, r21
#endif
pmull2 v30.1q, q5, v14.2d @ GHASH block 4k+1 - high
eor r22, r22, r14 @ AES block 4k+2 - round 12 high
#ifdef __ARMEB__
rev r22, r22
#endif
eor q4, q4, q5 @ GHASH block 4k+1 - mid
pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid
eor r23, r23, r13 @ AES block 4k+3 - round 12 low
#ifdef __ARMEB__
rev r23, r23
#endif
stp r21, r22, [r2], #16 @ AES block 4k+2 - store result
rev64 q7, q7 @ GHASH block 4k+3
stp r23, r24, [r2], #16 @ AES block 4k+3 - store result
aese q3, v18.16b
aesmc q3, q3 @ AES block 4k+7 - round 0
eor q9, q9, v30.16b @ GHASH block 4k+1 - high
pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid
add r12, r12, #1 @ CTR block 4k+7
pmull2 v30.1q, q6, v13.2d @ GHASH block 4k+2 - high
eor v11.16b, v11.16b, v31.16b @ GHASH block 4k+1 - low
aese q2, v18.16b
aesmc q2, q2 @ AES block 4k+6 - round 0
eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid
mov d31, v6.d[1] @ GHASH block 4k+2 - mid
aese q3, v19.16b
aesmc q3, q3 @ AES block 4k+7 - round 1
aese q2, v19.16b
aesmc q2, q2 @ AES block 4k+6 - round 1
eor q9, q9, v30.16b @ GHASH block 4k+2 - high
eor v31.8b, v31.8b, q6 @ GHASH block 4k+2 - mid
pmull v8.1q, q6, v13.1d @ GHASH block 4k+2 - low
aese q2, v20.16b
aesmc q2, q2 @ AES block 4k+6 - round 2
mov d30, v7.d[1] @ GHASH block 4k+3 - mid
aese q3, v20.16b
aesmc q3, q3 @ AES block 4k+7 - round 2
ins v31.d[1], v31.d[0] @ GHASH block 4k+2 - mid
pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low
aese q0, v21.16b
aesmc q0, q0 @ AES block 4k+4 - round 3
eor v30.8b, v30.8b, q7 @ GHASH block 4k+3 - mid
aese q1, v21.16b
aesmc q1, q1 @ AES block 4k+5 - round 3
pmull2 v31.1q, v31.2d, v16.2d @ GHASH block 4k+2 - mid
eor v11.16b, v11.16b, q8 @ GHASH block 4k+2 - low
aese q0, v22.16b
aesmc q0, q0 @ AES block 4k+4 - round 4
pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high
movi q8, #0xc2
pmull v30.1q, v30.1d, v16.1d @ GHASH block 4k+3 - mid
aese q2, v21.16b
aesmc q2, q2 @ AES block 4k+6 - round 3
shl d8, d8, #56 @ mod_constant
eor q9, q9, q5 @ GHASH block 4k+3 - high
aese q0, v23.16b
aesmc q0, q0 @ AES block 4k+4 - round 5
eor v10.16b, v10.16b, v31.16b @ GHASH block 4k+2 - mid
aese q2, v22.16b
aesmc q2, q2 @ AES block 4k+6 - round 4
pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid
eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low
aese q0, v24.16b
aesmc q0, q0 @ AES block 4k+4 - round 6
aese q3, v21.16b
aesmc q3, q3 @ AES block 4k+7 - round 3
eor v10.16b, v10.16b, v30.16b @ GHASH block 4k+3 - mid
aese q2, v23.16b
aesmc q2, q2 @ AES block 4k+6 - round 5
aese q0, v25.16b
aesmc q0, q0 @ AES block 4k+4 - round 7
eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up
aese q3, v22.16b
aesmc q3, q3 @ AES block 4k+7 - round 4
aese q2, v24.16b
aesmc q2, q2 @ AES block 4k+6 - round 6
ext q9, q9, q9, #8 @ MODULO - other top alignment
aese q0, v26.16b
aesmc q0, q0 @ AES block 4k+4 - round 8
aese q3, v23.16b
aesmc q3, q3 @ AES block 4k+7 - round 5
eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up
aese q1, v22.16b
aesmc q1, q1 @ AES block 4k+5 - round 4
aese q2, v25.16b
aesmc q2, q2 @ AES block 4k+6 - round 7
aese q0, v27.16b
aesmc q0, q0 @ AES block 4k+4 - round 9
aese q1, v23.16b
aesmc q1, q1 @ AES block 4k+5 - round 5
aese q3, v24.16b
aesmc q3, q3 @ AES block 4k+7 - round 6
eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid
aese q0, v28.16b
aesmc q0, q0 @ AES block 4k+4 - round 10
aese q1, v24.16b
aesmc q1, q1 @ AES block 4k+5 - round 6
aese q3, v25.16b
aesmc q3, q3 @ AES block 4k+7 - round 7
aese q2, v26.16b
aesmc q2, q2 @ AES block 4k+6 - round 8
eor v10.16b, v10.16b, q9 @ MODULO - fold into mid
aese q1, v25.16b
aesmc q1, q1 @ AES block 4k+5 - round 7
aese q3, v26.16b
aesmc q3, q3 @ AES block 4k+7 - round 8
aese q2, v27.16b
aesmc q2, q2 @ AES block 4k+6 - round 9
aese q1, v26.16b
aesmc q1, q1 @ AES block 4k+5 - round 8
aese q3, v27.16b
aesmc q3, q3 @ AES block 4k+7 - round 9
pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low
aese q1, v27.16b
aesmc q1, q1 @ AES block 4k+5 - round 9
aese q2, v28.16b
aesmc q2, q2 @ AES block 4k+6 - round 10
aese q3, v28.16b
aesmc q3, q3 @ AES block 4k+7 - round 10
ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment
aese q1, v28.16b
aesmc q1, q1 @ AES block 4k+5 - round 10
aese q0, v29.16b
eor v11.16b, v11.16b, q8 @ MODULO - fold into low
aese q2, v29.16b
aese q1, v29.16b
aese q3, v29.16b
eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low
.L192_dec_tail:@ TAIL
sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process
ld1 { q5}, [r0], #16 @ AES block 4k+4 - load ciphertext
eor q0, q5, q0 @ AES block 4k+4 - result
mov r7, v0.d[1] @ AES block 4k+4 - mov high
mov r6, v0.d[0] @ AES block 4k+4 - mov low
ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag
cmp r5, #48
eor r7, r7, r14 @ AES block 4k+4 - round 12 high
#ifdef __ARMEB__
rev r7, r7
#endif
eor r6, r6, r13 @ AES block 4k+4 - round 12 low
#ifdef __ARMEB__
rev r6, r6
#endif
bgt .L192_dec_blocks_more_than_3
movi v11.8b, #0
movi q9, #0
mov q3, q2
mov q2, q1
sub r12, r12, #1
movi v10.8b, #0
cmp r5, #32
bgt .L192_dec_blocks_more_than_2
mov q3, q1
cmp r5, #16
sub r12, r12, #1
bgt .L192_dec_blocks_more_than_1
sub r12, r12, #1
b .L192_dec_blocks_less_than_1
.L192_dec_blocks_more_than_3:@ blocks left > 3
rev64 q4, q5 @ GHASH final-3 block
ld1 { q5}, [r0], #16 @ AES final-2 block - load ciphertext
stp r6, r7, [r2], #16 @ AES final-3 block - store result
eor q4, q4, q8 @ feed in partial tag
eor q0, q5, q1 @ AES final-2 block - result
pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low
mov r6, v0.d[0] @ AES final-2 block - mov low
mov d22, v4.d[1] @ GHASH final-3 block - mid
mov r7, v0.d[1] @ AES final-2 block - mov high
mov d10, v17.d[1] @ GHASH final-3 block - mid
eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid
pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high
eor r6, r6, r13 @ AES final-2 block - round 12 low
#ifdef __ARMEB__
rev r6, r6
#endif
movi q8, #0 @ suppress further partial tag feed in
pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid
eor r7, r7, r14 @ AES final-2 block - round 12 high
#ifdef __ARMEB__
rev r7, r7
#endif
.L192_dec_blocks_more_than_2:@ blocks left > 2
rev64 q4, q5 @ GHASH final-2 block
ld1 { q5}, [r0], #16 @ AES final-1 block - load ciphertext
eor q4, q4, q8 @ feed in partial tag
movi q8, #0 @ suppress further partial tag feed in
eor q0, q5, q2 @ AES final-1 block - result
mov d22, v4.d[1] @ GHASH final-2 block - mid
pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low
stp r6, r7, [r2], #16 @ AES final-2 block - store result
eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid
mov r7, v0.d[1] @ AES final-1 block - mov high
eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low
mov r6, v0.d[0] @ AES final-1 block - mov low
pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high
pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid
eor q9, q9, v20.16b @ GHASH final-2 block - high
eor r7, r7, r14 @ AES final-1 block - round 12 high
#ifdef __ARMEB__
rev r7, r7
#endif
eor r6, r6, r13 @ AES final-1 block - round 12 low
#ifdef __ARMEB__
rev r6, r6
#endif
eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid
.L192_dec_blocks_more_than_1:@ blocks left > 1
rev64 q4, q5 @ GHASH final-1 block
eor q4, q4, q8 @ feed in partial tag
ld1 { q5}, [r0], #16 @ AES final block - load ciphertext
mov d22, v4.d[1] @ GHASH final-1 block - mid
pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high
eor q0, q5, q3 @ AES final block - result
stp r6, r7, [r2], #16 @ AES final-1 block - store result
eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid
eor q9, q9, v20.16b @ GHASH final-1 block - high
pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low
mov r7, v0.d[1] @ AES final block - mov high
ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid
mov r6, v0.d[0] @ AES final block - mov low
pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid
movi q8, #0 @ suppress further partial tag feed in
eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low
eor r7, r7, r14 @ AES final block - round 12 high
#ifdef __ARMEB__
rev r7, r7
#endif
eor r6, r6, r13 @ AES final block - round 12 low
#ifdef __ARMEB__
rev r6, r6
#endif
eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid
.L192_dec_blocks_less_than_1:@ blocks left <= 1
mvn r13, xzr @ rk12_l = 0xffffffffffffffff
ldp r4, r5, [r2] @ load existing bytes we need to not overwrite
and r1, r1, #127 @ bit_length %= 128
sub r1, r1, #128 @ bit_length -= 128
neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128])
and r1, r1, #127 @ bit_length %= 128
mvn r14, xzr @ rk12_h = 0xffffffffffffffff
lsr r14, r14, r1 @ rk12_h is mask for top 64b of last block
cmp r1, #64
csel r9, r13, r14, lt
csel r10, r14, xzr, lt
fmov d0, r9 @ ctr0b is mask for last block
and r6, r6, r9
bic r4, r4, r9 @ mask out low existing bytes
orr r6, r6, r4
mov v0.d[1], r10
#ifndef __ARMEB__
rev r9, r12
#else
mov r9, r12
#endif
and q5, q5, q0 @ possibly partial last block has zeroes in highest bits
str r9, [r16, #12] @ store the updated counter
rev64 q4, q5 @ GHASH final block
eor q4, q4, q8 @ feed in partial tag
bic r5, r5, r10 @ mask out high existing bytes
and r7, r7, r10
pmull2 v20.1q, q4, v12.2d @ GHASH final block - high
mov d8, v4.d[1] @ GHASH final block - mid
pmull v21.1q, q4, v12.1d @ GHASH final block - low
eor q8, q8, q4 @ GHASH final block - mid
eor q9, q9, v20.16b @ GHASH final block - high
pmull v8.1q, q8, v16.1d @ GHASH final block - mid
eor v11.16b, v11.16b, v21.16b @ GHASH final block - low
eor v10.16b, v10.16b, q8 @ GHASH final block - mid
movi q8, #0xc2
eor v30.16b, v11.16b, q9 @ MODULO - karatsuba tidy up
shl d8, d8, #56 @ mod_constant
eor v10.16b, v10.16b, v30.16b @ MODULO - karatsuba tidy up
pmull v31.1q, q9, q8 @ MODULO - top 64b align with mid
orr r7, r7, r5
stp r6, r7, [r2]
ext q9, q9, q9, #8 @ MODULO - other top alignment
eor v10.16b, v10.16b, v31.16b @ MODULO - fold into mid
eor v10.16b, v10.16b, q9 @ MODULO - fold into mid
pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low
eor v11.16b, v11.16b, q8 @ MODULO - fold into low
ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment
eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low
ext v11.16b, v11.16b, v11.16b, #8
rev64 v11.16b, v11.16b
mov r0, r15
st1 { v11.16b }, [r3]
ldp r21, r22, [sp, #16]
ldp r23, r24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp r19, r20, [sp], #112
RET
.L192_dec_ret:
mov r0, #0x0
RET
.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
.globl aes_gcm_enc_256_kernel
.type aes_gcm_enc_256_kernel,%function
.align 4
aes_gcm_enc_256_kernel:
cbz r1, .L256_enc_ret
stp r19, r20, [sp, #-112]!
mov r16, r4
mov r8, r5
stp r21, r22, [sp, #16]
stp r23, r24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
add r4, r0, r1, lsr #3 @ end_input_ptr
lsr r5, r1, #3 @ byte_len
mov r15, r5
ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32
#ifdef __ARMEB__
rev r10, r10
rev r11, r11
#endif
ldp r13, r14, [r8, #224] @ load rk14
#ifdef __ARMEB__
ror r13, r13, #32
ror r14, r14, #32
#endif
ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible
sub r5, r5, #1 @ byte_len - 1
ld1 {v18.4s}, [r8], #16 @ load rk0
and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
ld1 {v19.4s}, [r8], #16 @ load rk1
add r5, r5, r0
lsr r12, r11, #32
fmov d2, r10 @ CTR block 2
orr r11, r11, r11
rev r12, r12 @ rev_ctr32
cmp r0, r5 @ check if we have <= 4 blocks
fmov d1, r10 @ CTR block 1
aese q0, v18.16b
aesmc q0, q0 @ AES block 0 - round 0
add r12, r12, #1 @ increment rev_ctr32
rev r9, r12 @ CTR block 1
fmov d3, r10 @ CTR block 3
orr r9, r11, r9, lsl #32 @ CTR block 1
add r12, r12, #1 @ CTR block 1
ld1 {v20.4s}, [r8], #16 @ load rk2
fmov v1.d[1], r9 @ CTR block 1
rev r9, r12 @ CTR block 2
add r12, r12, #1 @ CTR block 2
orr r9, r11, r9, lsl #32 @ CTR block 2
ld1 {v21.4s}, [r8], #16 @ load rk3
fmov v2.d[1], r9 @ CTR block 2
rev r9, r12 @ CTR block 3
aese q0, v19.16b
aesmc q0, q0 @ AES block 0 - round 1
orr r9, r11, r9, lsl #32 @ CTR block 3
fmov v3.d[1], r9 @ CTR block 3
aese q1, v18.16b
aesmc q1, q1 @ AES block 1 - round 0
ld1 {v22.4s}, [r8], #16 @ load rk4
aese q0, v20.16b
aesmc q0, q0 @ AES block 0 - round 2
ld1 {v23.4s}, [r8], #16 @ load rk5
aese q2, v18.16b
aesmc q2, q2 @ AES block 2 - round 0
ld1 {v24.4s}, [r8], #16 @ load rk6
aese q1, v19.16b
aesmc q1, q1 @ AES block 1 - round 1
ldr q14, [r3, #80] @ load h3l | h3h
#ifndef __ARMEB__
ext v14.16b, v14.16b, v14.16b, #8
#endif
aese q3, v18.16b
aesmc q3, q3 @ AES block 3 - round 0
ld1 {v25.4s}, [r8], #16 @ load rk7
aese q2, v19.16b
aesmc q2, q2 @ AES block 2 - round 1
ld1 {v26.4s}, [r8], #16 @ load rk8
aese q1, v20.16b
aesmc q1, q1 @ AES block 1 - round 2
ldr q13, [r3, #64] @ load h2l | h2h
#ifndef __ARMEB__
ext v13.16b, v13.16b, v13.16b, #8
#endif
aese q3, v19.16b
aesmc q3, q3 @ AES block 3 - round 1
ld1 {v27.4s}, [r8], #16 @ load rk9
aese q2, v20.16b
aesmc q2, q2 @ AES block 2 - round 2
ldr q15, [r3, #112] @ load h4l | h4h
#ifndef __ARMEB__
ext v15.16b, v15.16b, v15.16b, #8
#endif
aese q1, v21.16b
aesmc q1, q1 @ AES block 1 - round 3
ld1 {v28.4s}, [r8], #16 @ load rk10
aese q3, v20.16b
aesmc q3, q3 @ AES block 3 - round 2
ld1 {v29.4s}, [r8], #16 @ load rk11
aese q2, v21.16b
aesmc q2, q2 @ AES block 2 - round 3
add r12, r12, #1 @ CTR block 3
aese q0, v21.16b
aesmc q0, q0 @ AES block 0 - round 3
aese q3, v21.16b
aesmc q3, q3 @ AES block 3 - round 3
ld1 { v11.16b}, [r3]
ext v11.16b, v11.16b, v11.16b, #8
rev64 v11.16b, v11.16b
aese q2, v22.16b
aesmc q2, q2 @ AES block 2 - round 4
aese q0, v22.16b
aesmc q0, q0 @ AES block 0 - round 4
aese q1, v22.16b
aesmc q1, q1 @ AES block 1 - round 4
aese q3, v22.16b
aesmc q3, q3 @ AES block 3 - round 4
aese q0, v23.16b
aesmc q0, q0 @ AES block 0 - round 5
aese q1, v23.16b
aesmc q1, q1 @ AES block 1 - round 5
aese q3, v23.16b
aesmc q3, q3 @ AES block 3 - round 5
aese q2, v23.16b
aesmc q2, q2 @ AES block 2 - round 5
aese q1, v24.16b
aesmc q1, q1 @ AES block 1 - round 6
trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l
aese q3, v24.16b
aesmc q3, q3 @ AES block 3 - round 6
ld1 {v30.4s}, [r8], #16 @ load rk12
aese q0, v24.16b
aesmc q0, q0 @ AES block 0 - round 6
ldr q12, [r3, #32] @ load h1l | h1h
#ifndef __ARMEB__
ext v12.16b, v12.16b, v12.16b, #8
#endif
aese q2, v24.16b
aesmc q2, q2 @ AES block 2 - round 6
ld1 {v31.4s}, [r8], #16 @ load rk13
aese q1, v25.16b
aesmc q1, q1 @ AES block 1 - round 7
trn1 q9, v14.2d, v15.2d @ h4h | h3h
aese q0, v25.16b
aesmc q0, q0 @ AES block 0 - round 7
aese q2, v25.16b
aesmc q2, q2 @ AES block 2 - round 7
aese q3, v25.16b
aesmc q3, q3 @ AES block 3 - round 7
trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l
aese q1, v26.16b
aesmc q1, q1 @ AES block 1 - round 8
aese q2, v26.16b
aesmc q2, q2 @ AES block 2 - round 8
aese q3, v26.16b
aesmc q3, q3 @ AES block 3 - round 8
aese q1, v27.16b
aesmc q1, q1 @ AES block 1 - round 9
aese q2, v27.16b
aesmc q2, q2 @ AES block 2 - round 9
aese q0, v26.16b
aesmc q0, q0 @ AES block 0 - round 8
aese q1, v28.16b
aesmc q1, q1 @ AES block 1 - round 10
aese q3, v27.16b
aesmc q3, q3 @ AES block 3 - round 9
aese q0, v27.16b
aesmc q0, q0 @ AES block 0 - round 9
aese q2, v28.16b
aesmc q2, q2 @ AES block 2 - round 10
aese q3, v28.16b
aesmc q3, q3 @ AES block 3 - round 10
aese q1, v29.16b
aesmc q1, q1 @ AES block 1 - round 11
aese q2, v29.16b
aesmc q2, q2 @ AES block 2 - round 11
aese q0, v28.16b
aesmc q0, q0 @ AES block 0 - round 10
aese q1, v30.16b
aesmc q1, q1 @ AES block 1 - round 12
aese q2, v30.16b
aesmc q2, q2 @ AES block 2 - round 12
aese q0, v29.16b
aesmc q0, q0 @ AES block 0 - round 11
eor v17.16b, v17.16b, q9 @ h4k | h3k
aese q3, v29.16b
aesmc q3, q3 @ AES block 3 - round 11
aese q2, v31.16b @ AES block 2 - round 13
trn1 q8, v12.2d, v13.2d @ h2h | h1h
aese q0, v30.16b
aesmc q0, q0 @ AES block 0 - round 12
aese q3, v30.16b
aesmc q3, q3 @ AES block 3 - round 12
aese q1, v31.16b @ AES block 1 - round 13
aese q0, v31.16b @ AES block 0 - round 13
aese q3, v31.16b @ AES block 3 - round 13
eor v16.16b, v16.16b, q8 @ h2k | h1k
bge .L256_enc_tail @ handle tail
ldp r19, r20, [r0, #16] @ AES block 1 - load plaintext
#ifdef __ARMEB__
rev r19, r19
rev r20, r20
#endif
rev r9, r12 @ CTR block 4
ldp r6, r7, [r0, #0] @ AES block 0 - load plaintext
#ifdef __ARMEB__
rev r6, r6
rev r7, r7
#endif
ldp r23, r24, [r0, #48] @ AES block 3 - load plaintext
#ifdef __ARMEB__
rev r23, r23
rev r24, r24
#endif
ldp r21, r22, [r0, #32] @ AES block 2 - load plaintext
#ifdef __ARMEB__
rev r21, r21
rev r22, r22
#endif
add r0, r0, #64 @ AES input_ptr update
eor r19, r19, r13 @ AES block 1 - round 14 low
eor r20, r20, r14 @ AES block 1 - round 14 high
fmov d5, r19 @ AES block 1 - mov low
eor r6, r6, r13 @ AES block 0 - round 14 low
eor r7, r7, r14 @ AES block 0 - round 14 high
eor r24, r24, r14 @ AES block 3 - round 14 high
fmov d4, r6 @ AES block 0 - mov low
cmp r0, r5 @ check if we have <= 8 blocks
fmov v4.d[1], r7 @ AES block 0 - mov high
eor r23, r23, r13 @ AES block 3 - round 14 low
eor r21, r21, r13 @ AES block 2 - round 14 low
fmov v5.d[1], r20 @ AES block 1 - mov high
fmov d6, r21 @ AES block 2 - mov low
add r12, r12, #1 @ CTR block 4
orr r9, r11, r9, lsl #32 @ CTR block 4
fmov d7, r23 @ AES block 3 - mov low
eor r22, r22, r14 @ AES block 2 - round 14 high
fmov v6.d[1], r22 @ AES block 2 - mov high
eor q4, q4, q0 @ AES block 0 - result
fmov d0, r10 @ CTR block 4
fmov v0.d[1], r9 @ CTR block 4
rev r9, r12 @ CTR block 5
add r12, r12, #1 @ CTR block 5
eor q5, q5, q1 @ AES block 1 - result
fmov d1, r10 @ CTR block 5
orr r9, r11, r9, lsl #32 @ CTR block 5
fmov v1.d[1], r9 @ CTR block 5
rev r9, r12 @ CTR block 6
st1 { q4}, [r2], #16 @ AES block 0 - store result
fmov v7.d[1], r24 @ AES block 3 - mov high
orr r9, r11, r9, lsl #32 @ CTR block 6
eor q6, q6, q2 @ AES block 2 - result
st1 { q5}, [r2], #16 @ AES block 1 - store result
add r12, r12, #1 @ CTR block 6
fmov d2, r10 @ CTR block 6
fmov v2.d[1], r9 @ CTR block 6
st1 { q6}, [r2], #16 @ AES block 2 - store result
rev r9, r12 @ CTR block 7
orr r9, r11, r9, lsl #32 @ CTR block 7
eor q7, q7, q3 @ AES block 3 - result
st1 { q7}, [r2], #16 @ AES block 3 - store result
bge .L256_enc_prepretail @ do prepretail
.L256_enc_main_loop:@ main loop start
aese q0, v18.16b
aesmc q0, q0 @ AES block 4k+4 - round 0
rev64 q4, q4 @ GHASH block 4k (only t0 is free)
aese q1, v18.16b
aesmc q1, q1 @ AES block 4k+5 - round 0
fmov d3, r10 @ CTR block 4k+3
aese q2, v18.16b
aesmc q2, q2 @ AES block 4k+6 - round 0
ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0
aese q0, v19.16b
aesmc q0, q0 @ AES block 4k+4 - round 1
fmov v3.d[1], r9 @ CTR block 4k+3
aese q1, v19.16b
aesmc q1, q1 @ AES block 4k+5 - round 1
ldp r23, r24, [r0, #48] @ AES block 4k+7 - load plaintext
#ifdef __ARMEB__
rev r23, r23
rev r24, r24
#endif
aese q2, v19.16b
aesmc q2, q2 @ AES block 4k+6 - round 1
ldp r21, r22, [r0, #32] @ AES block 4k+6 - load plaintext
#ifdef __ARMEB__
rev r21, r21
rev r22, r22
#endif
aese q0, v20.16b
aesmc q0, q0 @ AES block 4k+4 - round 2
eor q4, q4, v11.16b @ PRE 1
aese q1, v20.16b
aesmc q1, q1 @ AES block 4k+5 - round 2
aese q3, v18.16b
aesmc q3, q3 @ AES block 4k+7 - round 0
eor r23, r23, r13 @ AES block 4k+7 - round 14 low
aese q0, v21.16b
aesmc q0, q0 @ AES block 4k+4 - round 3
mov d10, v17.d[1] @ GHASH block 4k - mid
pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high
eor r22, r22, r14 @ AES block 4k+6 - round 14 high
mov d8, v4.d[1] @ GHASH block 4k - mid
aese q3, v19.16b
aesmc q3, q3 @ AES block 4k+7 - round 1
rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free)
aese q0, v22.16b
aesmc q0, q0 @ AES block 4k+4 - round 4
pmull v11.1q, q4, v15.1d @ GHASH block 4k - low
eor q8, q8, q4 @ GHASH block 4k - mid
aese q2, v20.16b
aesmc q2, q2 @ AES block 4k+6 - round 2
aese q0, v23.16b
aesmc q0, q0 @ AES block 4k+4 - round 5
rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
pmull2 v4.1q, q5, v14.2d @ GHASH block 4k+1 - high
pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid
rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free)
pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low
eor q9, q9, q4 @ GHASH block 4k+1 - high
mov d4, v5.d[1] @ GHASH block 4k+1 - mid
aese q1, v21.16b
aesmc q1, q1 @ AES block 4k+5 - round 3
aese q3, v20.16b
aesmc q3, q3 @ AES block 4k+7 - round 2
eor v11.16b, v11.16b, q8 @ GHASH block 4k+1 - low
aese q2, v21.16b
aesmc q2, q2 @ AES block 4k+6 - round 3
aese q1, v22.16b
aesmc q1, q1 @ AES block 4k+5 - round 4
mov d8, v6.d[1] @ GHASH block 4k+2 - mid
aese q3, v21.16b
aesmc q3, q3 @ AES block 4k+7 - round 3
eor q4, q4, q5 @ GHASH block 4k+1 - mid
aese q2, v22.16b
aesmc q2, q2 @ AES block 4k+6 - round 4
aese q0, v24.16b
aesmc q0, q0 @ AES block 4k+4 - round 6
eor q8, q8, q6 @ GHASH block 4k+2 - mid
aese q3, v22.16b
aesmc q3, q3 @ AES block 4k+7 - round 4
pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid
aese q0, v25.16b
aesmc q0, q0 @ AES block 4k+4 - round 7
aese q3, v23.16b
aesmc q3, q3 @ AES block 4k+7 - round 5
ins v8.d[1], v8.d[0] @ GHASH block 4k+2 - mid
aese q1, v23.16b
aesmc q1, q1 @ AES block 4k+5 - round 5
aese q0, v26.16b
aesmc q0, q0 @ AES block 4k+4 - round 8
aese q2, v23.16b
aesmc q2, q2 @ AES block 4k+6 - round 5
aese q1, v24.16b
aesmc q1, q1 @ AES block 4k+5 - round 6
eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid
pmull2 v4.1q, q6, v13.2d @ GHASH block 4k+2 - high
pmull v5.1q, q6, v13.1d @ GHASH block 4k+2 - low
aese q1, v25.16b
aesmc q1, q1 @ AES block 4k+5 - round 7
pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low
eor q9, q9, q4 @ GHASH block 4k+2 - high
aese q3, v24.16b
aesmc q3, q3 @ AES block 4k+7 - round 6
ldp r19, r20, [r0, #16] @ AES block 4k+5 - load plaintext
#ifdef __ARMEB__
rev r19, r19
rev r20, r20
#endif
aese q1, v26.16b
aesmc q1, q1 @ AES block 4k+5 - round 8
mov d4, v7.d[1] @ GHASH block 4k+3 - mid
aese q2, v24.16b
aesmc q2, q2 @ AES block 4k+6 - round 6
eor v11.16b, v11.16b, q5 @ GHASH block 4k+2 - low
pmull2 v8.1q, q8, v16.2d @ GHASH block 4k+2 - mid
pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high
eor q4, q4, q7 @ GHASH block 4k+3 - mid
aese q2, v25.16b
aesmc q2, q2 @ AES block 4k+6 - round 7
eor r19, r19, r13 @ AES block 4k+5 - round 14 low
aese q1, v27.16b
aesmc q1, q1 @ AES block 4k+5 - round 9
eor v10.16b, v10.16b, q8 @ GHASH block 4k+2 - mid
aese q3, v25.16b
aesmc q3, q3 @ AES block 4k+7 - round 7
eor r21, r21, r13 @ AES block 4k+6 - round 14 low
aese q0, v27.16b
aesmc q0, q0 @ AES block 4k+4 - round 9
movi q8, #0xc2
pmull v4.1q, q4, v16.1d @ GHASH block 4k+3 - mid
eor q9, q9, q5 @ GHASH block 4k+3 - high
fmov d5, r19 @ AES block 4k+5 - mov low
aese q2, v26.16b
aesmc q2, q2 @ AES block 4k+6 - round 8
ldp r6, r7, [r0, #0] @ AES block 4k+4 - load plaintext
#ifdef __ARMEB__
rev r6, r6
rev r7, r7
#endif
aese q0, v28.16b
aesmc q0, q0 @ AES block 4k+4 - round 10
shl d8, d8, #56 @ mod_constant
aese q3, v26.16b
aesmc q3, q3 @ AES block 4k+7 - round 8
eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low
aese q2, v27.16b
aesmc q2, q2 @ AES block 4k+6 - round 9
aese q1, v28.16b
aesmc q1, q1 @ AES block 4k+5 - round 10
eor v10.16b, v10.16b, q4 @ GHASH block 4k+3 - mid
aese q3, v27.16b
aesmc q3, q3 @ AES block 4k+7 - round 9
add r12, r12, #1 @ CTR block 4k+3
aese q0, v29.16b
aesmc q0, q0 @ AES block 4k+4 - round 11
eor q4, v11.16b, q9 @ MODULO - karatsuba tidy up
aese q1, v29.16b
aesmc q1, q1 @ AES block 4k+5 - round 11
add r0, r0, #64 @ AES input_ptr update
pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid
rev r9, r12 @ CTR block 4k+8
ext q9, q9, q9, #8 @ MODULO - other top alignment
aese q2, v28.16b
aesmc q2, q2 @ AES block 4k+6 - round 10
eor r6, r6, r13 @ AES block 4k+4 - round 14 low
aese q1, v30.16b
aesmc q1, q1 @ AES block 4k+5 - round 12
eor v10.16b, v10.16b, q4 @ MODULO - karatsuba tidy up
aese q3, v28.16b
aesmc q3, q3 @ AES block 4k+7 - round 10
eor r7, r7, r14 @ AES block 4k+4 - round 14 high
fmov d4, r6 @ AES block 4k+4 - mov low
orr r9, r11, r9, lsl #32 @ CTR block 4k+8
eor q7, q9, q7 @ MODULO - fold into mid
aese q0, v30.16b
aesmc q0, q0 @ AES block 4k+4 - round 12
eor r20, r20, r14 @ AES block 4k+5 - round 14 high
aese q2, v29.16b
aesmc q2, q2 @ AES block 4k+6 - round 11
eor r24, r24, r14 @ AES block 4k+7 - round 14 high
aese q3, v29.16b
aesmc q3, q3 @ AES block 4k+7 - round 11
add r12, r12, #1 @ CTR block 4k+8
aese q0, v31.16b @ AES block 4k+4 - round 13
fmov v4.d[1], r7 @ AES block 4k+4 - mov high
eor v10.16b, v10.16b, q7 @ MODULO - fold into mid
aese q2, v30.16b
aesmc q2, q2 @ AES block 4k+6 - round 12
fmov d7, r23 @ AES block 4k+7 - mov low
aese q1, v31.16b @ AES block 4k+5 - round 13
fmov v5.d[1], r20 @ AES block 4k+5 - mov high
fmov d6, r21 @ AES block 4k+6 - mov low
cmp r0, r5 @ .LOOP CONTROL
fmov v6.d[1], r22 @ AES block 4k+6 - mov high
pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low
eor q4, q4, q0 @ AES block 4k+4 - result
fmov d0, r10 @ CTR block 4k+8
fmov v0.d[1], r9 @ CTR block 4k+8
rev r9, r12 @ CTR block 4k+9
add r12, r12, #1 @ CTR block 4k+9
eor q5, q5, q1 @ AES block 4k+5 - result
fmov d1, r10 @ CTR block 4k+9
orr r9, r11, r9, lsl #32 @ CTR block 4k+9
aese q3, v30.16b
aesmc q3, q3 @ AES block 4k+7 - round 12
fmov v1.d[1], r9 @ CTR block 4k+9
aese q2, v31.16b @ AES block 4k+6 - round 13
rev r9, r12 @ CTR block 4k+10
st1 { q4}, [r2], #16 @ AES block 4k+4 - store result
orr r9, r11, r9, lsl #32 @ CTR block 4k+10
eor v11.16b, v11.16b, q9 @ MODULO - fold into low
fmov v7.d[1], r24 @ AES block 4k+7 - mov high
ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment
st1 { q5}, [r2], #16 @ AES block 4k+5 - store result
add r12, r12, #1 @ CTR block 4k+10
aese q3, v31.16b @ AES block 4k+7 - round 13
eor q6, q6, q2 @ AES block 4k+6 - result
fmov d2, r10 @ CTR block 4k+10
st1 { q6}, [r2], #16 @ AES block 4k+6 - store result
fmov v2.d[1], r9 @ CTR block 4k+10
rev r9, r12 @ CTR block 4k+11
eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low
orr r9, r11, r9, lsl #32 @ CTR block 4k+11
eor q7, q7, q3 @ AES block 4k+7 - result
st1 { q7}, [r2], #16 @ AES block 4k+7 - store result
blt .L256_enc_main_loop
.L256_enc_prepretail:@ PREPRETAIL
aese q1, v18.16b
aesmc q1, q1 @ AES block 4k+5 - round 0
rev64 q6, q6 @ GHASH block 4k+2 (t0, t1, and t2 free)
aese q2, v18.16b
aesmc q2, q2 @ AES block 4k+6 - round 0
fmov d3, r10 @ CTR block 4k+3
aese q0, v18.16b
aesmc q0, q0 @ AES block 4k+4 - round 0
rev64 q4, q4 @ GHASH block 4k (only t0 is free)
fmov v3.d[1], r9 @ CTR block 4k+3
ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0
aese q2, v19.16b
aesmc q2, q2 @ AES block 4k+6 - round 1
aese q0, v19.16b
aesmc q0, q0 @ AES block 4k+4 - round 1
eor q4, q4, v11.16b @ PRE 1
rev64 q5, q5 @ GHASH block 4k+1 (t0 and t1 free)
aese q2, v20.16b
aesmc q2, q2 @ AES block 4k+6 - round 2
aese q3, v18.16b
aesmc q3, q3 @ AES block 4k+7 - round 0
mov d10, v17.d[1] @ GHASH block 4k - mid
aese q1, v19.16b
aesmc q1, q1 @ AES block 4k+5 - round 1
pmull v11.1q, q4, v15.1d @ GHASH block 4k - low
mov d8, v4.d[1] @ GHASH block 4k - mid
pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high
aese q2, v21.16b
aesmc q2, q2 @ AES block 4k+6 - round 3
aese q1, v20.16b
aesmc q1, q1 @ AES block 4k+5 - round 2
eor q8, q8, q4 @ GHASH block 4k - mid
aese q0, v20.16b
aesmc q0, q0 @ AES block 4k+4 - round 2
aese q3, v19.16b
aesmc q3, q3 @ AES block 4k+7 - round 1
aese q1, v21.16b
aesmc q1, q1 @ AES block 4k+5 - round 3
pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid
pmull2 v4.1q, q5, v14.2d @ GHASH block 4k+1 - high
pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low
aese q3, v20.16b
aesmc q3, q3 @ AES block 4k+7 - round 2
eor q9, q9, q4 @ GHASH block 4k+1 - high
mov d4, v5.d[1] @ GHASH block 4k+1 - mid
aese q0, v21.16b
aesmc q0, q0 @ AES block 4k+4 - round 3
eor v11.16b, v11.16b, q8 @ GHASH block 4k+1 - low
aese q3, v21.16b
aesmc q3, q3 @ AES block 4k+7 - round 3
eor q4, q4, q5 @ GHASH block 4k+1 - mid
mov d8, v6.d[1] @ GHASH block 4k+2 - mid
aese q0, v22.16b
aesmc q0, q0 @ AES block 4k+4 - round 4
rev64 q7, q7 @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
aese q3, v22.16b
aesmc q3, q3 @ AES block 4k+7 - round 4
pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid
eor q8, q8, q6 @ GHASH block 4k+2 - mid
add r12, r12, #1 @ CTR block 4k+3
pmull v5.1q, q6, v13.1d @ GHASH block 4k+2 - low
aese q3, v23.16b
aesmc q3, q3 @ AES block 4k+7 - round 5
aese q2, v22.16b
aesmc q2, q2 @ AES block 4k+6 - round 4
eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid
pmull2 v4.1q, q6, v13.2d @ GHASH block 4k+2 - high
eor v11.16b, v11.16b, q5 @ GHASH block 4k+2 - low
ins v8.d[1], v8.d[0] @ GHASH block 4k+2 - mid
aese q2, v23.16b
aesmc q2, q2 @ AES block 4k+6 - round 5
eor q9, q9, q4 @ GHASH block 4k+2 - high
mov d4, v7.d[1] @ GHASH block 4k+3 - mid
aese q1, v22.16b
aesmc q1, q1 @ AES block 4k+5 - round 4
pmull2 v8.1q, q8, v16.2d @ GHASH block 4k+2 - mid
eor q4, q4, q7 @ GHASH block 4k+3 - mid
pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high
aese q1, v23.16b
aesmc q1, q1 @ AES block 4k+5 - round 5
pmull v4.1q, q4, v16.1d @ GHASH block 4k+3 - mid
eor v10.16b, v10.16b, q8 @ GHASH block 4k+2 - mid
aese q0, v23.16b
aesmc q0, q0 @ AES block 4k+4 - round 5
aese q1, v24.16b
aesmc q1, q1 @ AES block 4k+5 - round 6
aese q2, v24.16b
aesmc q2, q2 @ AES block 4k+6 - round 6
aese q0, v24.16b
aesmc q0, q0 @ AES block 4k+4 - round 6
movi q8, #0xc2
aese q3, v24.16b
aesmc q3, q3 @ AES block 4k+7 - round 6
aese q1, v25.16b
aesmc q1, q1 @ AES block 4k+5 - round 7
eor q9, q9, q5 @ GHASH block 4k+3 - high
aese q0, v25.16b
aesmc q0, q0 @ AES block 4k+4 - round 7
aese q3, v25.16b
aesmc q3, q3 @ AES block 4k+7 - round 7
shl d8, d8, #56 @ mod_constant
aese q1, v26.16b
aesmc q1, q1 @ AES block 4k+5 - round 8
eor v10.16b, v10.16b, q4 @ GHASH block 4k+3 - mid
pmull v6.1q, q7, v12.1d @ GHASH block 4k+3 - low
aese q3, v26.16b
aesmc q3, q3 @ AES block 4k+7 - round 8
aese q1, v27.16b
aesmc q1, q1 @ AES block 4k+5 - round 9
aese q0, v26.16b
aesmc q0, q0 @ AES block 4k+4 - round 8
eor v11.16b, v11.16b, q6 @ GHASH block 4k+3 - low
aese q3, v27.16b
aesmc q3, q3 @ AES block 4k+7 - round 9
eor v10.16b, v10.16b, q9 @ karatsuba tidy up
pmull v4.1q, q9, q8
ext q9, q9, q9, #8
aese q3, v28.16b
aesmc q3, q3 @ AES block 4k+7 - round 10
aese q2, v25.16b
aesmc q2, q2 @ AES block 4k+6 - round 7
eor v10.16b, v10.16b, v11.16b
aese q1, v28.16b
aesmc q1, q1 @ AES block 4k+5 - round 10
aese q0, v27.16b
aesmc q0, q0 @ AES block 4k+4 - round 9
aese q2, v26.16b
aesmc q2, q2 @ AES block 4k+6 - round 8
aese q1, v29.16b
aesmc q1, q1 @ AES block 4k+5 - round 11
eor v10.16b, v10.16b, q4
aese q0, v28.16b
aesmc q0, q0 @ AES block 4k+4 - round 10
aese q2, v27.16b
aesmc q2, q2 @ AES block 4k+6 - round 9
aese q1, v30.16b
aesmc q1, q1 @ AES block 4k+5 - round 12
aese q0, v29.16b
aesmc q0, q0 @ AES block 4k+4 - round 11
eor v10.16b, v10.16b, q9
aese q3, v29.16b
aesmc q3, q3 @ AES block 4k+7 - round 11
aese q2, v28.16b
aesmc q2, q2 @ AES block 4k+6 - round 10
aese q0, v30.16b
aesmc q0, q0 @ AES block 4k+4 - round 12
pmull v4.1q, v10.1d, q8
aese q2, v29.16b
aesmc q2, q2 @ AES block 4k+6 - round 11
ext v10.16b, v10.16b, v10.16b, #8
aese q3, v30.16b
aesmc q3, q3 @ AES block 4k+7 - round 12
aese q1, v31.16b @ AES block 4k+5 - round 13
eor v11.16b, v11.16b, q4
aese q2, v30.16b
aesmc q2, q2 @ AES block 4k+6 - round 12
aese q3, v31.16b @ AES block 4k+7 - round 13
aese q0, v31.16b @ AES block 4k+4 - round 13
aese q2, v31.16b @ AES block 4k+6 - round 13
eor v11.16b, v11.16b, v10.16b
.L256_enc_tail:@ TAIL
ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag
sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process
ldp r6, r7, [r0], #16 @ AES block 4k+4 - load plaintext
#ifdef __ARMEB__
rev r6, r6
rev r7, r7
#endif
eor r6, r6, r13 @ AES block 4k+4 - round 14 low
eor r7, r7, r14 @ AES block 4k+4 - round 14 high
cmp r5, #48
fmov d4, r6 @ AES block 4k+4 - mov low
fmov v4.d[1], r7 @ AES block 4k+4 - mov high
eor q5, q4, q0 @ AES block 4k+4 - result
bgt .L256_enc_blocks_more_than_3
cmp r5, #32
mov q3, q2
movi v11.8b, #0
movi q9, #0
sub r12, r12, #1
mov q2, q1
movi v10.8b, #0
bgt .L256_enc_blocks_more_than_2
mov q3, q1
sub r12, r12, #1
cmp r5, #16
bgt .L256_enc_blocks_more_than_1
sub r12, r12, #1
b .L256_enc_blocks_less_than_1
.L256_enc_blocks_more_than_3:@ blocks left > 3
st1 { q5}, [r2], #16 @ AES final-3 block - store result
ldp r6, r7, [r0], #16 @ AES final-2 block - load input low & high
#ifdef __ARMEB__
rev r6, r6
rev r7, r7
#endif
rev64 q4, q5 @ GHASH final-3 block
eor r6, r6, r13 @ AES final-2 block - round 14 low
eor q4, q4, q8 @ feed in partial tag
eor r7, r7, r14 @ AES final-2 block - round 14 high
mov d22, v4.d[1] @ GHASH final-3 block - mid
fmov d5, r6 @ AES final-2 block - mov low
fmov v5.d[1], r7 @ AES final-2 block - mov high
eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid
movi q8, #0 @ suppress further partial tag feed in
mov d10, v17.d[1] @ GHASH final-3 block - mid
pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low
pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high
pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid
eor q5, q5, q1 @ AES final-2 block - result
.L256_enc_blocks_more_than_2:@ blocks left > 2
st1 { q5}, [r2], #16 @ AES final-2 block - store result
ldp r6, r7, [r0], #16 @ AES final-1 block - load input low & high
#ifdef __ARMEB__
rev r6, r6
rev r7, r7
#endif
rev64 q4, q5 @ GHASH final-2 block
eor r6, r6, r13 @ AES final-1 block - round 14 low
eor q4, q4, q8 @ feed in partial tag
fmov d5, r6 @ AES final-1 block - mov low
eor r7, r7, r14 @ AES final-1 block - round 14 high
fmov v5.d[1], r7 @ AES final-1 block - mov high
movi q8, #0 @ suppress further partial tag feed in
pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high
mov d22, v4.d[1] @ GHASH final-2 block - mid
pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low
eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid
eor q5, q5, q2 @ AES final-1 block - result
eor q9, q9, v20.16b @ GHASH final-2 block - high
pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid
eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low
eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid
.L256_enc_blocks_more_than_1:@ blocks left > 1
st1 { q5}, [r2], #16 @ AES final-1 block - store result
rev64 q4, q5 @ GHASH final-1 block
ldp r6, r7, [r0], #16 @ AES final block - load input low & high
#ifdef __ARMEB__
rev r6, r6
rev r7, r7
#endif
eor q4, q4, q8 @ feed in partial tag
movi q8, #0 @ suppress further partial tag feed in
eor r6, r6, r13 @ AES final block - round 14 low
mov d22, v4.d[1] @ GHASH final-1 block - mid
pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high
eor r7, r7, r14 @ AES final block - round 14 high
eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid
eor q9, q9, v20.16b @ GHASH final-1 block - high
ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid
fmov d5, r6 @ AES final block - mov low
fmov v5.d[1], r7 @ AES final block - mov high
pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid
pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low
eor q5, q5, q3 @ AES final block - result
eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid
eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low
.L256_enc_blocks_less_than_1:@ blocks left <= 1
and r1, r1, #127 @ bit_length %= 128
mvn r13, xzr @ rk14_l = 0xffffffffffffffff
sub r1, r1, #128 @ bit_length -= 128
neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128])
ld1 { v18.16b}, [r2] @ load existing bytes where the possibly partial last block is to be stored
mvn r14, xzr @ rk14_h = 0xffffffffffffffff
and r1, r1, #127 @ bit_length %= 128
lsr r14, r14, r1 @ rk14_h is mask for top 64b of last block
cmp r1, #64
csel r6, r13, r14, lt
csel r7, r14, xzr, lt
fmov d0, r6 @ ctr0b is mask for last block
fmov v0.d[1], r7
and q5, q5, q0 @ possibly partial last block has zeroes in highest bits
rev64 q4, q5 @ GHASH final block
eor q4, q4, q8 @ feed in partial tag
bif q5, v18.16b, q0 @ insert existing bytes in top end of result before storing
pmull2 v20.1q, q4, v12.2d @ GHASH final block - high
mov d8, v4.d[1] @ GHASH final block - mid
#ifndef __ARMEB__
rev r9, r12
#else
mov r9, r12
#endif
pmull v21.1q, q4, v12.1d @ GHASH final block - low
eor q9, q9, v20.16b @ GHASH final block - high
eor q8, q8, q4 @ GHASH final block - mid
pmull v8.1q, q8, v16.1d @ GHASH final block - mid
eor v11.16b, v11.16b, v21.16b @ GHASH final block - low
eor v10.16b, v10.16b, q8 @ GHASH final block - mid
movi q8, #0xc2
eor q4, v11.16b, q9 @ MODULO - karatsuba tidy up
shl d8, d8, #56 @ mod_constant
eor v10.16b, v10.16b, q4 @ MODULO - karatsuba tidy up
pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid
ext q9, q9, q9, #8 @ MODULO - other top alignment
eor v10.16b, v10.16b, q7 @ MODULO - fold into mid
eor v10.16b, v10.16b, q9 @ MODULO - fold into mid
pmull v9.1q, v10.1d, q8 @ MODULO - mid 64b align with low
ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment
str r9, [r16, #12] @ store the updated counter
st1 { q5}, [r2] @ store all 16B
eor v11.16b, v11.16b, q9 @ MODULO - fold into low
eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low
ext v11.16b, v11.16b, v11.16b, #8
rev64 v11.16b, v11.16b
mov r0, r15
st1 { v11.16b }, [r3]
ldp r21, r22, [sp, #16]
ldp r23, r24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp r19, r20, [sp], #112
RET
.L256_enc_ret:
mov r0, #0x0
RET
.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
.globl aes_gcm_dec_256_kernel
.type aes_gcm_dec_256_kernel,%function
.align 4
@ -----------------------------------------------------------------------------
@ aes_gcm_dec_256_kernel: AES-256-GCM decryption kernel, 4-block unrolled.
@
@ In:   r0 = ciphertext input pointer
@       r1 = input length in BITS (function returns immediately if zero)
@       r2 = plaintext output pointer
@       r3 = GHASH state: current Xi at [r3], hash powers h1/h2/h3/h4 at
@            [r3,#32] / [r3,#64] / [r3,#80] / [r3,#112]
@       r4 = counter block (IV || counter); the updated 32-bit big-endian
@            counter is stored back at [r4,#12] before returning
@       r5 = AES-256 key schedule: round keys rk0..rk14 (15 x 16 bytes)
@ Out:  r0 = number of bytes processed (input bit length >> 3)
@
@ Callee-saved r19-r24 and d8-d15 are spilled in the prologue and restored in
@ the epilogue; all other NEON registers are used as scratch.
@
@ NOTE(review): register names follow this file's arm_asm.h aliasing layer
@ (q/v and r/x views of the same registers). The kernel is hand-scheduled:
@ AES round instructions are interleaved with GHASH PMULL/karatsuba work, so
@ instruction order is load-bearing and must not be "tidied".
@ -----------------------------------------------------------------------------
aes_gcm_dec_256_kernel:
cbz r1, .L256_dec_ret
stp r19, r20, [sp, #-112]!
mov r16, r4
mov r8, r5
stp r21, r22, [sp, #16]
stp r23, r24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
lsr r5, r1, #3 @ byte_len
mov r15, r5
ldp r10, r11, [r16] @ ctr96_b64, ctr96_t32
#ifdef __ARMEB__
rev r10, r10
rev r11, r11
#endif
ldp r13, r14, [r8, #224] @ load rk14
#ifdef __ARMEB__
ror r14, r14, #32
ror r13, r13, #32
#endif
ld1 {v18.4s}, [r8], #16 @ load rk0
sub r5, r5, #1 @ byte_len - 1
ld1 {v19.4s}, [r8], #16 @ load rk1
and r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
add r4, r0, r1, lsr #3 @ end_input_ptr
ld1 {v20.4s}, [r8], #16 @ load rk2
lsr r12, r11, #32
ld1 {v21.4s}, [r8], #16 @ load rk3
orr r11, r11, r11
ld1 {v22.4s}, [r8], #16 @ load rk4
add r5, r5, r0
rev r12, r12 @ rev_ctr32
add r12, r12, #1 @ increment rev_ctr32
fmov d3, r10 @ CTR block 3
rev r9, r12 @ CTR block 1
add r12, r12, #1 @ CTR block 1
fmov d1, r10 @ CTR block 1
orr r9, r11, r9, lsl #32 @ CTR block 1
ld1 { q0}, [r16] @ special case vector load initial counter so we can start first AES block as quickly as possible
fmov v1.d[1], r9 @ CTR block 1
rev r9, r12 @ CTR block 2
add r12, r12, #1 @ CTR block 2
fmov d2, r10 @ CTR block 2
orr r9, r11, r9, lsl #32 @ CTR block 2
fmov v2.d[1], r9 @ CTR block 2
rev r9, r12 @ CTR block 3
orr r9, r11, r9, lsl #32 @ CTR block 3
ld1 {v23.4s}, [r8], #16 @ load rk5
fmov v3.d[1], r9 @ CTR block 3
add r12, r12, #1 @ CTR block 3
ld1 {v24.4s}, [r8], #16 @ load rk6
ld1 {v25.4s}, [r8], #16 @ load rk7
ld1 {v26.4s}, [r8], #16 @ load rk8
aese q0, v18.16b
aesmc q0, q0 @ AES block 0 - round 0
ldr q14, [r3, #80] @ load h3l | h3h
#ifndef __ARMEB__
ext v14.16b, v14.16b, v14.16b, #8
#endif
aese q3, v18.16b
aesmc q3, q3 @ AES block 3 - round 0
ldr q15, [r3, #112] @ load h4l | h4h
#ifndef __ARMEB__
ext v15.16b, v15.16b, v15.16b, #8
#endif
aese q1, v18.16b
aesmc q1, q1 @ AES block 1 - round 0
ldr q13, [r3, #64] @ load h2l | h2h
#ifndef __ARMEB__
ext v13.16b, v13.16b, v13.16b, #8
#endif
aese q2, v18.16b
aesmc q2, q2 @ AES block 2 - round 0
ld1 {v27.4s}, [r8], #16 @ load rk9
aese q0, v19.16b
aesmc q0, q0 @ AES block 0 - round 1
aese q1, v19.16b
aesmc q1, q1 @ AES block 1 - round 1
ld1 { v11.16b}, [r3]
ext v11.16b, v11.16b, v11.16b, #8
rev64 v11.16b, v11.16b
aese q2, v19.16b
aesmc q2, q2 @ AES block 2 - round 1
ld1 {v28.4s}, [r8], #16 @ load rk10
aese q3, v19.16b
aesmc q3, q3 @ AES block 3 - round 1
ld1 {v29.4s}, [r8], #16 @ load rk11
aese q0, v20.16b
aesmc q0, q0 @ AES block 0 - round 2
ldr q12, [r3, #32] @ load h1l | h1h
#ifndef __ARMEB__
ext v12.16b, v12.16b, v12.16b, #8
#endif
aese q2, v20.16b
aesmc q2, q2 @ AES block 2 - round 2
ld1 {v30.4s}, [r8], #16 @ load rk12
aese q3, v20.16b
aesmc q3, q3 @ AES block 3 - round 2
aese q0, v21.16b
aesmc q0, q0 @ AES block 0 - round 3
aese q1, v20.16b
aesmc q1, q1 @ AES block 1 - round 2
aese q3, v21.16b
aesmc q3, q3 @ AES block 3 - round 3
aese q0, v22.16b
aesmc q0, q0 @ AES block 0 - round 4
cmp r0, r5 @ check if we have <= 4 blocks
aese q2, v21.16b
aesmc q2, q2 @ AES block 2 - round 3
aese q1, v21.16b
aesmc q1, q1 @ AES block 1 - round 3
aese q3, v22.16b
aesmc q3, q3 @ AES block 3 - round 4
aese q2, v22.16b
aesmc q2, q2 @ AES block 2 - round 4
aese q1, v22.16b
aesmc q1, q1 @ AES block 1 - round 4
aese q3, v23.16b
aesmc q3, q3 @ AES block 3 - round 5
aese q0, v23.16b
aesmc q0, q0 @ AES block 0 - round 5
aese q1, v23.16b
aesmc q1, q1 @ AES block 1 - round 5
aese q2, v23.16b
aesmc q2, q2 @ AES block 2 - round 5
aese q0, v24.16b
aesmc q0, q0 @ AES block 0 - round 6
aese q3, v24.16b
aesmc q3, q3 @ AES block 3 - round 6
aese q1, v24.16b
aesmc q1, q1 @ AES block 1 - round 6
aese q2, v24.16b
aesmc q2, q2 @ AES block 2 - round 6
aese q0, v25.16b
aesmc q0, q0 @ AES block 0 - round 7
aese q1, v25.16b
aesmc q1, q1 @ AES block 1 - round 7
aese q3, v25.16b
aesmc q3, q3 @ AES block 3 - round 7
aese q0, v26.16b
aesmc q0, q0 @ AES block 0 - round 8
aese q2, v25.16b
aesmc q2, q2 @ AES block 2 - round 7
aese q3, v26.16b
aesmc q3, q3 @ AES block 3 - round 8
aese q1, v26.16b
aesmc q1, q1 @ AES block 1 - round 8
aese q0, v27.16b
aesmc q0, q0 @ AES block 0 - round 9
aese q2, v26.16b
aesmc q2, q2 @ AES block 2 - round 8
ld1 {v31.4s}, [r8], #16 @ load rk13
aese q1, v27.16b
aesmc q1, q1 @ AES block 1 - round 9
aese q0, v28.16b
aesmc q0, q0 @ AES block 0 - round 10
aese q3, v27.16b
aesmc q3, q3 @ AES block 3 - round 9
aese q1, v28.16b
aesmc q1, q1 @ AES block 1 - round 10
aese q2, v27.16b
aesmc q2, q2 @ AES block 2 - round 9
aese q3, v28.16b
aesmc q3, q3 @ AES block 3 - round 10
aese q0, v29.16b
aesmc q0, q0 @ AES block 0 - round 11
aese q2, v28.16b
aesmc q2, q2 @ AES block 2 - round 10
aese q3, v29.16b
aesmc q3, q3 @ AES block 3 - round 11
aese q1, v29.16b
aesmc q1, q1 @ AES block 1 - round 11
aese q2, v29.16b
aesmc q2, q2 @ AES block 2 - round 11
@ Precompute karatsuba "k" values from the hash powers: (h_even ^ h_odd)
@ packed pairwise, used by the mid-term PMULLs below.
trn1 q9, v14.2d, v15.2d @ h4h | h3h
trn2 v17.2d, v14.2d, v15.2d @ h4l | h3l
trn1 q8, v12.2d, v13.2d @ h2h | h1h
trn2 v16.2d, v12.2d, v13.2d @ h2l | h1l
aese q1, v30.16b
aesmc q1, q1 @ AES block 1 - round 12
aese q0, v30.16b
aesmc q0, q0 @ AES block 0 - round 12
aese q2, v30.16b
aesmc q2, q2 @ AES block 2 - round 12
aese q3, v30.16b
aesmc q3, q3 @ AES block 3 - round 12
eor v17.16b, v17.16b, q9 @ h4k | h3k
aese q1, v31.16b @ AES block 1 - round 13
aese q2, v31.16b @ AES block 2 - round 13
eor v16.16b, v16.16b, q8 @ h2k | h1k
aese q3, v31.16b @ AES block 3 - round 13
aese q0, v31.16b @ AES block 0 - round 13
bge .L256_dec_tail @ handle tail
ld1 {q4, q5}, [r0], #32 @ AES block 0,1 - load ciphertext
rev r9, r12 @ CTR block 4
eor q0, q4, q0 @ AES block 0 - result
eor q1, q5, q1 @ AES block 1 - result
rev64 q5, q5 @ GHASH block 1
ld1 {q6}, [r0], #16 @ AES block 2 - load ciphertext
mov r7, v0.d[1] @ AES block 0 - mov high
mov r6, v0.d[0] @ AES block 0 - mov low
rev64 q4, q4 @ GHASH block 0
add r12, r12, #1 @ CTR block 4
fmov d0, r10 @ CTR block 4
orr r9, r11, r9, lsl #32 @ CTR block 4
fmov v0.d[1], r9 @ CTR block 4
rev r9, r12 @ CTR block 5
add r12, r12, #1 @ CTR block 5
mov r19, v1.d[0] @ AES block 1 - mov low
orr r9, r11, r9, lsl #32 @ CTR block 5
mov r20, v1.d[1] @ AES block 1 - mov high
eor r7, r7, r14 @ AES block 0 - round 14 high
#ifdef __ARMEB__
rev r7, r7
#endif
eor r6, r6, r13 @ AES block 0 - round 14 low
#ifdef __ARMEB__
rev r6, r6
#endif
stp r6, r7, [r2], #16 @ AES block 0 - store result
fmov d1, r10 @ CTR block 5
ld1 {q7}, [r0], #16 @ AES block 3 - load ciphertext
fmov v1.d[1], r9 @ CTR block 5
rev r9, r12 @ CTR block 6
add r12, r12, #1 @ CTR block 6
eor r19, r19, r13 @ AES block 1 - round 14 low
#ifdef __ARMEB__
rev r19, r19
#endif
orr r9, r11, r9, lsl #32 @ CTR block 6
eor r20, r20, r14 @ AES block 1 - round 14 high
#ifdef __ARMEB__
rev r20, r20
#endif
stp r19, r20, [r2], #16 @ AES block 1 - store result
eor q2, q6, q2 @ AES block 2 - result
cmp r0, r5 @ check if we have <= 8 blocks
bge .L256_dec_prepretail @ do prepretail
@ Main loop: each iteration decrypts 4 blocks with fully interleaved AES
@ rounds while folding the previous 4 ciphertext blocks into GHASH and
@ performing the modular reduction.
.L256_dec_main_loop:@ main loop start
mov r21, v2.d[0] @ AES block 4k+2 - mov low
ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0
eor q3, q7, q3 @ AES block 4k+3 - result
aese q0, v18.16b
aesmc q0, q0 @ AES block 4k+4 - round 0
mov r22, v2.d[1] @ AES block 4k+2 - mov high
aese q1, v18.16b
aesmc q1, q1 @ AES block 4k+5 - round 0
fmov d2, r10 @ CTR block 4k+6
fmov v2.d[1], r9 @ CTR block 4k+6
eor q4, q4, v11.16b @ PRE 1
rev r9, r12 @ CTR block 4k+7
aese q0, v19.16b
aesmc q0, q0 @ AES block 4k+4 - round 1
mov r24, v3.d[1] @ AES block 4k+3 - mov high
aese q1, v19.16b
aesmc q1, q1 @ AES block 4k+5 - round 1
mov r23, v3.d[0] @ AES block 4k+3 - mov low
pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high
mov d8, v4.d[1] @ GHASH block 4k - mid
fmov d3, r10 @ CTR block 4k+7
aese q0, v20.16b
aesmc q0, q0 @ AES block 4k+4 - round 2
orr r9, r11, r9, lsl #32 @ CTR block 4k+7
aese q2, v18.16b
aesmc q2, q2 @ AES block 4k+6 - round 0
fmov v3.d[1], r9 @ CTR block 4k+7
aese q1, v20.16b
aesmc q1, q1 @ AES block 4k+5 - round 2
eor q8, q8, q4 @ GHASH block 4k - mid
aese q0, v21.16b
aesmc q0, q0 @ AES block 4k+4 - round 3
eor r22, r22, r14 @ AES block 4k+2 - round 14 high
#ifdef __ARMEB__
rev r22, r22
#endif
aese q2, v19.16b
aesmc q2, q2 @ AES block 4k+6 - round 1
mov d10, v17.d[1] @ GHASH block 4k - mid
aese q1, v21.16b
aesmc q1, q1 @ AES block 4k+5 - round 3
rev64 q6, q6 @ GHASH block 4k+2
aese q3, v18.16b
aesmc q3, q3 @ AES block 4k+7 - round 0
eor r21, r21, r13 @ AES block 4k+2 - round 14 low
#ifdef __ARMEB__
rev r21, r21
#endif
aese q2, v20.16b
aesmc q2, q2 @ AES block 4k+6 - round 2
stp r21, r22, [r2], #16 @ AES block 4k+2 - store result
pmull v11.1q, q4, v15.1d @ GHASH block 4k - low
pmull2 v4.1q, q5, v14.2d @ GHASH block 4k+1 - high
aese q2, v21.16b
aesmc q2, q2 @ AES block 4k+6 - round 3
rev64 q7, q7 @ GHASH block 4k+3
pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid
eor r23, r23, r13 @ AES block 4k+3 - round 14 low
#ifdef __ARMEB__
rev r23, r23
#endif
pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low
eor r24, r24, r14 @ AES block 4k+3 - round 14 high
#ifdef __ARMEB__
rev r24, r24
#endif
eor q9, q9, q4 @ GHASH block 4k+1 - high
aese q2, v22.16b
aesmc q2, q2 @ AES block 4k+6 - round 4
aese q3, v19.16b
aesmc q3, q3 @ AES block 4k+7 - round 1
mov d4, v5.d[1] @ GHASH block 4k+1 - mid
aese q0, v22.16b
aesmc q0, q0 @ AES block 4k+4 - round 4
eor v11.16b, v11.16b, q8 @ GHASH block 4k+1 - low
aese q2, v23.16b
aesmc q2, q2 @ AES block 4k+6 - round 5
add r12, r12, #1 @ CTR block 4k+7
aese q3, v20.16b
aesmc q3, q3 @ AES block 4k+7 - round 2
mov d8, v6.d[1] @ GHASH block 4k+2 - mid
aese q1, v22.16b
aesmc q1, q1 @ AES block 4k+5 - round 4
eor q4, q4, q5 @ GHASH block 4k+1 - mid
pmull v5.1q, q6, v13.1d @ GHASH block 4k+2 - low
aese q3, v21.16b
aesmc q3, q3 @ AES block 4k+7 - round 3
eor q8, q8, q6 @ GHASH block 4k+2 - mid
aese q1, v23.16b
aesmc q1, q1 @ AES block 4k+5 - round 5
aese q0, v23.16b
aesmc q0, q0 @ AES block 4k+4 - round 5
eor v11.16b, v11.16b, q5 @ GHASH block 4k+2 - low
pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid
rev r9, r12 @ CTR block 4k+8
aese q1, v24.16b
aesmc q1, q1 @ AES block 4k+5 - round 6
ins v8.d[1], v8.d[0] @ GHASH block 4k+2 - mid
aese q0, v24.16b
aesmc q0, q0 @ AES block 4k+4 - round 6
add r12, r12, #1 @ CTR block 4k+8
aese q3, v22.16b
aesmc q3, q3 @ AES block 4k+7 - round 4
aese q1, v25.16b
aesmc q1, q1 @ AES block 4k+5 - round 7
eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid
aese q0, v25.16b
aesmc q0, q0 @ AES block 4k+4 - round 7
pmull2 v4.1q, q6, v13.2d @ GHASH block 4k+2 - high
mov d6, v7.d[1] @ GHASH block 4k+3 - mid
aese q3, v23.16b
aesmc q3, q3 @ AES block 4k+7 - round 5
pmull2 v8.1q, q8, v16.2d @ GHASH block 4k+2 - mid
aese q0, v26.16b
aesmc q0, q0 @ AES block 4k+4 - round 8
eor q9, q9, q4 @ GHASH block 4k+2 - high
aese q3, v24.16b
aesmc q3, q3 @ AES block 4k+7 - round 6
pmull v4.1q, q7, v12.1d @ GHASH block 4k+3 - low
orr r9, r11, r9, lsl #32 @ CTR block 4k+8
eor v10.16b, v10.16b, q8 @ GHASH block 4k+2 - mid
pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high
aese q0, v27.16b
aesmc q0, q0 @ AES block 4k+4 - round 9
eor q6, q6, q7 @ GHASH block 4k+3 - mid
aese q1, v26.16b
aesmc q1, q1 @ AES block 4k+5 - round 8
aese q2, v24.16b
aesmc q2, q2 @ AES block 4k+6 - round 6
eor q9, q9, q5 @ GHASH block 4k+3 - high
aese q0, v28.16b
aesmc q0, q0 @ AES block 4k+4 - round 10
pmull v6.1q, q6, v16.1d @ GHASH block 4k+3 - mid
movi q8, #0xc2
aese q2, v25.16b
aesmc q2, q2 @ AES block 4k+6 - round 7
eor v11.16b, v11.16b, q4 @ GHASH block 4k+3 - low
aese q0, v29.16b
aesmc q0, q0 @ AES block 4k+4 - round 11
aese q3, v25.16b
aesmc q3, q3 @ AES block 4k+7 - round 7
shl d8, d8, #56 @ mod_constant
aese q2, v26.16b
aesmc q2, q2 @ AES block 4k+6 - round 8
eor v10.16b, v10.16b, q6 @ GHASH block 4k+3 - mid
aese q0, v30.16b
aesmc q0, q0 @ AES block 4k+4 - round 12
pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid
eor q6, v11.16b, q9 @ MODULO - karatsuba tidy up
aese q1, v27.16b
aesmc q1, q1 @ AES block 4k+5 - round 9
ld1 {q4}, [r0], #16 @ AES block 4k+4 - load ciphertext
aese q0, v31.16b @ AES block 4k+4 - round 13
ext q9, q9, q9, #8 @ MODULO - other top alignment
aese q1, v28.16b
aesmc q1, q1 @ AES block 4k+5 - round 10
eor v10.16b, v10.16b, q6 @ MODULO - karatsuba tidy up
aese q2, v27.16b
aesmc q2, q2 @ AES block 4k+6 - round 9
ld1 {q5}, [r0], #16 @ AES block 4k+5 - load ciphertext
aese q3, v26.16b
aesmc q3, q3 @ AES block 4k+7 - round 8
eor q0, q4, q0 @ AES block 4k+4 - result
aese q1, v29.16b
aesmc q1, q1 @ AES block 4k+5 - round 11
stp r23, r24, [r2], #16 @ AES block 4k+3 - store result
aese q2, v28.16b
aesmc q2, q2 @ AES block 4k+6 - round 10
eor v10.16b, v10.16b, q7 @ MODULO - fold into mid
aese q3, v27.16b
aesmc q3, q3 @ AES block 4k+7 - round 9
ld1 {q6}, [r0], #16 @ AES block 4k+6 - load ciphertext
aese q1, v30.16b
aesmc q1, q1 @ AES block 4k+5 - round 12
ld1 {q7}, [r0], #16 @ AES block 4k+7 - load ciphertext
aese q2, v29.16b
aesmc q2, q2 @ AES block 4k+6 - round 11
mov r7, v0.d[1] @ AES block 4k+4 - mov high
aese q3, v28.16b
aesmc q3, q3 @ AES block 4k+7 - round 10
eor v10.16b, v10.16b, q9 @ MODULO - fold into mid
aese q1, v31.16b @ AES block 4k+5 - round 13
mov r6, v0.d[0] @ AES block 4k+4 - mov low
aese q2, v30.16b
aesmc q2, q2 @ AES block 4k+6 - round 12
fmov d0, r10 @ CTR block 4k+8
aese q3, v29.16b
aesmc q3, q3 @ AES block 4k+7 - round 11
fmov v0.d[1], r9 @ CTR block 4k+8
pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low
eor q1, q5, q1 @ AES block 4k+5 - result
rev r9, r12 @ CTR block 4k+9
aese q2, v31.16b @ AES block 4k+6 - round 13
orr r9, r11, r9, lsl #32 @ CTR block 4k+9
cmp r0, r5 @ .LOOP CONTROL
add r12, r12, #1 @ CTR block 4k+9
eor r6, r6, r13 @ AES block 4k+4 - round 14 low
#ifdef __ARMEB__
rev r6, r6
#endif
eor r7, r7, r14 @ AES block 4k+4 - round 14 high
#ifdef __ARMEB__
rev r7, r7
#endif
mov r20, v1.d[1] @ AES block 4k+5 - mov high
eor q2, q6, q2 @ AES block 4k+6 - result
eor v11.16b, v11.16b, q8 @ MODULO - fold into low
aese q3, v30.16b
aesmc q3, q3 @ AES block 4k+7 - round 12
mov r19, v1.d[0] @ AES block 4k+5 - mov low
fmov d1, r10 @ CTR block 4k+9
ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment
fmov v1.d[1], r9 @ CTR block 4k+9
rev r9, r12 @ CTR block 4k+10
add r12, r12, #1 @ CTR block 4k+10
aese q3, v31.16b @ AES block 4k+7 - round 13
orr r9, r11, r9, lsl #32 @ CTR block 4k+10
rev64 q5, q5 @ GHASH block 4k+5
eor r20, r20, r14 @ AES block 4k+5 - round 14 high
#ifdef __ARMEB__
rev r20, r20
#endif
stp r6, r7, [r2], #16 @ AES block 4k+4 - store result
eor r19, r19, r13 @ AES block 4k+5 - round 14 low
#ifdef __ARMEB__
rev r19, r19
#endif
stp r19, r20, [r2], #16 @ AES block 4k+5 - store result
rev64 q4, q4 @ GHASH block 4k+4
eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low
blt .L256_dec_main_loop
@ Pre-pre-tail: fold the last 4 in-flight ciphertext blocks into GHASH and
@ run the final keystream blocks' AES rounds, without loading new input.
.L256_dec_prepretail:@ PREPRETAIL
ext v11.16b, v11.16b, v11.16b, #8 @ PRE 0
mov r21, v2.d[0] @ AES block 4k+2 - mov low
eor q3, q7, q3 @ AES block 4k+3 - result
aese q0, v18.16b
aesmc q0, q0 @ AES block 4k+4 - round 0
mov r22, v2.d[1] @ AES block 4k+2 - mov high
aese q1, v18.16b
aesmc q1, q1 @ AES block 4k+5 - round 0
fmov d2, r10 @ CTR block 4k+6
fmov v2.d[1], r9 @ CTR block 4k+6
rev r9, r12 @ CTR block 4k+7
eor q4, q4, v11.16b @ PRE 1
rev64 q6, q6 @ GHASH block 4k+2
orr r9, r11, r9, lsl #32 @ CTR block 4k+7
mov r23, v3.d[0] @ AES block 4k+3 - mov low
aese q1, v19.16b
aesmc q1, q1 @ AES block 4k+5 - round 1
mov r24, v3.d[1] @ AES block 4k+3 - mov high
pmull v11.1q, q4, v15.1d @ GHASH block 4k - low
mov d8, v4.d[1] @ GHASH block 4k - mid
fmov d3, r10 @ CTR block 4k+7
pmull2 v9.1q, q4, v15.2d @ GHASH block 4k - high
fmov v3.d[1], r9 @ CTR block 4k+7
aese q2, v18.16b
aesmc q2, q2 @ AES block 4k+6 - round 0
mov d10, v17.d[1] @ GHASH block 4k - mid
aese q0, v19.16b
aesmc q0, q0 @ AES block 4k+4 - round 1
eor q8, q8, q4 @ GHASH block 4k - mid
pmull2 v4.1q, q5, v14.2d @ GHASH block 4k+1 - high
aese q2, v19.16b
aesmc q2, q2 @ AES block 4k+6 - round 1
rev64 q7, q7 @ GHASH block 4k+3
aese q3, v18.16b
aesmc q3, q3 @ AES block 4k+7 - round 0
pmull v10.1q, q8, v10.1d @ GHASH block 4k - mid
eor q9, q9, q4 @ GHASH block 4k+1 - high
pmull v8.1q, q5, v14.1d @ GHASH block 4k+1 - low
aese q3, v19.16b
aesmc q3, q3 @ AES block 4k+7 - round 1
mov d4, v5.d[1] @ GHASH block 4k+1 - mid
aese q0, v20.16b
aesmc q0, q0 @ AES block 4k+4 - round 2
aese q1, v20.16b
aesmc q1, q1 @ AES block 4k+5 - round 2
eor v11.16b, v11.16b, q8 @ GHASH block 4k+1 - low
aese q2, v20.16b
aesmc q2, q2 @ AES block 4k+6 - round 2
aese q0, v21.16b
aesmc q0, q0 @ AES block 4k+4 - round 3
mov d8, v6.d[1] @ GHASH block 4k+2 - mid
aese q3, v20.16b
aesmc q3, q3 @ AES block 4k+7 - round 2
eor q4, q4, q5 @ GHASH block 4k+1 - mid
pmull v5.1q, q6, v13.1d @ GHASH block 4k+2 - low
aese q0, v22.16b
aesmc q0, q0 @ AES block 4k+4 - round 4
aese q3, v21.16b
aesmc q3, q3 @ AES block 4k+7 - round 3
eor q8, q8, q6 @ GHASH block 4k+2 - mid
pmull v4.1q, q4, v17.1d @ GHASH block 4k+1 - mid
aese q0, v23.16b
aesmc q0, q0 @ AES block 4k+4 - round 5
eor v11.16b, v11.16b, q5 @ GHASH block 4k+2 - low
aese q3, v22.16b
aesmc q3, q3 @ AES block 4k+7 - round 4
pmull2 v5.1q, q7, v12.2d @ GHASH block 4k+3 - high
eor v10.16b, v10.16b, q4 @ GHASH block 4k+1 - mid
pmull2 v4.1q, q6, v13.2d @ GHASH block 4k+2 - high
aese q3, v23.16b
aesmc q3, q3 @ AES block 4k+7 - round 5
ins v8.d[1], v8.d[0] @ GHASH block 4k+2 - mid
aese q2, v21.16b
aesmc q2, q2 @ AES block 4k+6 - round 3
aese q1, v21.16b
aesmc q1, q1 @ AES block 4k+5 - round 3
eor q9, q9, q4 @ GHASH block 4k+2 - high
pmull v4.1q, q7, v12.1d @ GHASH block 4k+3 - low
aese q2, v22.16b
aesmc q2, q2 @ AES block 4k+6 - round 4
mov d6, v7.d[1] @ GHASH block 4k+3 - mid
aese q1, v22.16b
aesmc q1, q1 @ AES block 4k+5 - round 4
pmull2 v8.1q, q8, v16.2d @ GHASH block 4k+2 - mid
aese q2, v23.16b
aesmc q2, q2 @ AES block 4k+6 - round 5
eor q6, q6, q7 @ GHASH block 4k+3 - mid
aese q1, v23.16b
aesmc q1, q1 @ AES block 4k+5 - round 5
aese q3, v24.16b
aesmc q3, q3 @ AES block 4k+7 - round 6
eor v10.16b, v10.16b, q8 @ GHASH block 4k+2 - mid
aese q2, v24.16b
aesmc q2, q2 @ AES block 4k+6 - round 6
aese q0, v24.16b
aesmc q0, q0 @ AES block 4k+4 - round 6
movi q8, #0xc2
aese q1, v24.16b
aesmc q1, q1 @ AES block 4k+5 - round 6
eor v11.16b, v11.16b, q4 @ GHASH block 4k+3 - low
pmull v6.1q, q6, v16.1d @ GHASH block 4k+3 - mid
aese q3, v25.16b
aesmc q3, q3 @ AES block 4k+7 - round 7
eor q9, q9, q5 @ GHASH block 4k+3 - high
aese q1, v25.16b
aesmc q1, q1 @ AES block 4k+5 - round 7
aese q0, v25.16b
aesmc q0, q0 @ AES block 4k+4 - round 7
eor v10.16b, v10.16b, q6 @ GHASH block 4k+3 - mid
aese q3, v26.16b
aesmc q3, q3 @ AES block 4k+7 - round 8
aese q2, v25.16b
aesmc q2, q2 @ AES block 4k+6 - round 7
eor q6, v11.16b, q9 @ MODULO - karatsuba tidy up
aese q1, v26.16b
aesmc q1, q1 @ AES block 4k+5 - round 8
aese q0, v26.16b
aesmc q0, q0 @ AES block 4k+4 - round 8
shl d8, d8, #56 @ mod_constant
aese q2, v26.16b
aesmc q2, q2 @ AES block 4k+6 - round 8
aese q1, v27.16b
aesmc q1, q1 @ AES block 4k+5 - round 9
eor v10.16b, v10.16b, q6 @ MODULO - karatsuba tidy up
pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid
aese q2, v27.16b
aesmc q2, q2 @ AES block 4k+6 - round 9
ext q9, q9, q9, #8 @ MODULO - other top alignment
aese q3, v27.16b
aesmc q3, q3 @ AES block 4k+7 - round 9
aese q0, v27.16b
aesmc q0, q0 @ AES block 4k+4 - round 9
eor v10.16b, v10.16b, q7 @ MODULO - fold into mid
aese q2, v28.16b
aesmc q2, q2 @ AES block 4k+6 - round 10
aese q3, v28.16b
aesmc q3, q3 @ AES block 4k+7 - round 10
aese q0, v28.16b
aesmc q0, q0 @ AES block 4k+4 - round 10
eor r22, r22, r14 @ AES block 4k+2 - round 14 high
#ifdef __ARMEB__
rev r22, r22
#endif
aese q1, v28.16b
aesmc q1, q1 @ AES block 4k+5 - round 10
eor r23, r23, r13 @ AES block 4k+3 - round 14 low
#ifdef __ARMEB__
rev r23, r23
#endif
aese q2, v29.16b
aesmc q2, q2 @ AES block 4k+6 - round 11
eor v10.16b, v10.16b, q9 @ MODULO - fold into mid
aese q0, v29.16b
aesmc q0, q0 @ AES block 4k+4 - round 11
add r12, r12, #1 @ CTR block 4k+7
aese q1, v29.16b
aesmc q1, q1 @ AES block 4k+5 - round 11
eor r21, r21, r13 @ AES block 4k+2 - round 14 low
#ifdef __ARMEB__
rev r21, r21
#endif
aese q2, v30.16b
aesmc q2, q2 @ AES block 4k+6 - round 12
pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low
eor r24, r24, r14 @ AES block 4k+3 - round 14 high
#ifdef __ARMEB__
rev r24, r24
#endif
aese q3, v29.16b
aesmc q3, q3 @ AES block 4k+7 - round 11
stp r21, r22, [r2], #16 @ AES block 4k+2 - store result
aese q1, v30.16b
aesmc q1, q1 @ AES block 4k+5 - round 12
ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment
aese q0, v30.16b
aesmc q0, q0 @ AES block 4k+4 - round 12
stp r23, r24, [r2], #16 @ AES block 4k+3 - store result
aese q3, v30.16b
aesmc q3, q3 @ AES block 4k+7 - round 12
eor v11.16b, v11.16b, q8 @ MODULO - fold into low
aese q1, v31.16b @ AES block 4k+5 - round 13
aese q0, v31.16b @ AES block 4k+4 - round 13
aese q3, v31.16b @ AES block 4k+7 - round 13
aese q2, v31.16b @ AES block 4k+6 - round 13
eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low
@ Tail: 1-4 remaining blocks; keystream blocks q0..q3 are already computed.
@ The final (possibly partial) block is handled by .L256_dec_blocks_less_than_1.
.L256_dec_tail:@ TAIL
sub r5, r4, r0 @ main_end_input_ptr is number of bytes left to process
ld1 { q5}, [r0], #16 @ AES block 4k+4 - load ciphertext
eor q0, q5, q0 @ AES block 4k+4 - result
mov r6, v0.d[0] @ AES block 4k+4 - mov low
mov r7, v0.d[1] @ AES block 4k+4 - mov high
ext q8, v11.16b, v11.16b, #8 @ prepare final partial tag
cmp r5, #48
eor r6, r6, r13 @ AES block 4k+4 - round 14 low
#ifdef __ARMEB__
rev r6, r6
#endif
eor r7, r7, r14 @ AES block 4k+4 - round 14 high
#ifdef __ARMEB__
rev r7, r7
#endif
bgt .L256_dec_blocks_more_than_3
sub r12, r12, #1
mov q3, q2
movi v10.8b, #0
movi v11.8b, #0
cmp r5, #32
movi q9, #0
mov q2, q1
bgt .L256_dec_blocks_more_than_2
sub r12, r12, #1
mov q3, q1
cmp r5, #16
bgt .L256_dec_blocks_more_than_1
sub r12, r12, #1
b .L256_dec_blocks_less_than_1
.L256_dec_blocks_more_than_3:@ blocks left > 3
rev64 q4, q5 @ GHASH final-3 block
ld1 { q5}, [r0], #16 @ AES final-2 block - load ciphertext
stp r6, r7, [r2], #16 @ AES final-3 block - store result
mov d10, v17.d[1] @ GHASH final-3 block - mid
eor q4, q4, q8 @ feed in partial tag
eor q0, q5, q1 @ AES final-2 block - result
mov d22, v4.d[1] @ GHASH final-3 block - mid
mov r6, v0.d[0] @ AES final-2 block - mov low
mov r7, v0.d[1] @ AES final-2 block - mov high
eor v22.8b, v22.8b, q4 @ GHASH final-3 block - mid
movi q8, #0 @ suppress further partial tag feed in
pmull2 v9.1q, q4, v15.2d @ GHASH final-3 block - high
pmull v10.1q, v22.1d, v10.1d @ GHASH final-3 block - mid
eor r6, r6, r13 @ AES final-2 block - round 14 low
#ifdef __ARMEB__
rev r6, r6
#endif
pmull v11.1q, q4, v15.1d @ GHASH final-3 block - low
eor r7, r7, r14 @ AES final-2 block - round 14 high
#ifdef __ARMEB__
rev r7, r7
#endif
.L256_dec_blocks_more_than_2:@ blocks left > 2
rev64 q4, q5 @ GHASH final-2 block
ld1 { q5}, [r0], #16 @ AES final-1 block - load ciphertext
eor q4, q4, q8 @ feed in partial tag
stp r6, r7, [r2], #16 @ AES final-2 block - store result
eor q0, q5, q2 @ AES final-1 block - result
mov d22, v4.d[1] @ GHASH final-2 block - mid
pmull v21.1q, q4, v14.1d @ GHASH final-2 block - low
pmull2 v20.1q, q4, v14.2d @ GHASH final-2 block - high
eor v22.8b, v22.8b, q4 @ GHASH final-2 block - mid
mov r6, v0.d[0] @ AES final-1 block - mov low
mov r7, v0.d[1] @ AES final-1 block - mov high
eor v11.16b, v11.16b, v21.16b @ GHASH final-2 block - low
movi q8, #0 @ suppress further partial tag feed in
pmull v22.1q, v22.1d, v17.1d @ GHASH final-2 block - mid
eor q9, q9, v20.16b @ GHASH final-2 block - high
eor r6, r6, r13 @ AES final-1 block - round 14 low
#ifdef __ARMEB__
rev r6, r6
#endif
eor v10.16b, v10.16b, v22.16b @ GHASH final-2 block - mid
eor r7, r7, r14 @ AES final-1 block - round 14 high
#ifdef __ARMEB__
rev r7, r7
#endif
.L256_dec_blocks_more_than_1:@ blocks left > 1
stp r6, r7, [r2], #16 @ AES final-1 block - store result
rev64 q4, q5 @ GHASH final-1 block
ld1 { q5}, [r0], #16 @ AES final block - load ciphertext
eor q4, q4, q8 @ feed in partial tag
movi q8, #0 @ suppress further partial tag feed in
mov d22, v4.d[1] @ GHASH final-1 block - mid
eor q0, q5, q3 @ AES final block - result
pmull2 v20.1q, q4, v13.2d @ GHASH final-1 block - high
eor v22.8b, v22.8b, q4 @ GHASH final-1 block - mid
pmull v21.1q, q4, v13.1d @ GHASH final-1 block - low
mov r6, v0.d[0] @ AES final block - mov low
ins v22.d[1], v22.d[0] @ GHASH final-1 block - mid
mov r7, v0.d[1] @ AES final block - mov high
pmull2 v22.1q, v22.2d, v16.2d @ GHASH final-1 block - mid
eor r6, r6, r13 @ AES final block - round 14 low
#ifdef __ARMEB__
rev r6, r6
#endif
eor v11.16b, v11.16b, v21.16b @ GHASH final-1 block - low
eor q9, q9, v20.16b @ GHASH final-1 block - high
eor v10.16b, v10.16b, v22.16b @ GHASH final-1 block - mid
eor r7, r7, r14 @ AES final block - round 14 high
#ifdef __ARMEB__
rev r7, r7
#endif
@ Final (possibly partial) block: build a byte mask from the remaining bit
@ length, merge the decrypted bytes with the existing output bytes, fold the
@ masked block into GHASH, reduce, and write back Xi and the counter.
.L256_dec_blocks_less_than_1:@ blocks left <= 1
and r1, r1, #127 @ bit_length %= 128
mvn r14, xzr @ rk14_h = 0xffffffffffffffff
sub r1, r1, #128 @ bit_length -= 128
mvn r13, xzr @ rk14_l = 0xffffffffffffffff
ldp r4, r5, [r2] @ load existing bytes we need to not overwrite
neg r1, r1 @ bit_length = 128 - #bits in input (in range [1,128])
and r1, r1, #127 @ bit_length %= 128
lsr r14, r14, r1 @ rk14_h is mask for top 64b of last block
cmp r1, #64
csel r9, r13, r14, lt
csel r10, r14, xzr, lt
fmov d0, r9 @ ctr0b is mask for last block
and r6, r6, r9
mov v0.d[1], r10
bic r4, r4, r9 @ mask out low existing bytes
#ifndef __ARMEB__
rev r9, r12
#else
mov r9, r12
#endif
bic r5, r5, r10 @ mask out high existing bytes
orr r6, r6, r4
and r7, r7, r10
orr r7, r7, r5
and q5, q5, q0 @ possibly partial last block has zeroes in highest bits
rev64 q4, q5 @ GHASH final block
eor q4, q4, q8 @ feed in partial tag
pmull v21.1q, q4, v12.1d @ GHASH final block - low
mov d8, v4.d[1] @ GHASH final block - mid
eor q8, q8, q4 @ GHASH final block - mid
pmull2 v20.1q, q4, v12.2d @ GHASH final block - high
pmull v8.1q, q8, v16.1d @ GHASH final block - mid
eor q9, q9, v20.16b @ GHASH final block - high
eor v11.16b, v11.16b, v21.16b @ GHASH final block - low
eor v10.16b, v10.16b, q8 @ GHASH final block - mid
movi q8, #0xc2
eor q6, v11.16b, q9 @ MODULO - karatsuba tidy up
shl d8, d8, #56 @ mod_constant
eor v10.16b, v10.16b, q6 @ MODULO - karatsuba tidy up
pmull v7.1q, q9, q8 @ MODULO - top 64b align with mid
ext q9, q9, q9, #8 @ MODULO - other top alignment
eor v10.16b, v10.16b, q7 @ MODULO - fold into mid
eor v10.16b, v10.16b, q9 @ MODULO - fold into mid
pmull v8.1q, v10.1d, q8 @ MODULO - mid 64b align with low
ext v10.16b, v10.16b, v10.16b, #8 @ MODULO - other mid alignment
eor v11.16b, v11.16b, q8 @ MODULO - fold into low
stp r6, r7, [r2]
str r9, [r16, #12] @ store the updated counter
eor v11.16b, v11.16b, v10.16b @ MODULO - fold into low
ext v11.16b, v11.16b, v11.16b, #8
rev64 v11.16b, v11.16b
mov r0, r15
st1 { v11.16b }, [r3]
ldp r21, r22, [sp, #16]
ldp r23, r24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp r19, r20, [sp], #112
RET
.L256_dec_ret:
mov r0, #0x0
RET
.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#endif