/*
* Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#define BR_ENABLE_INTRINSICS 1
#include "inner.h"
#if BR_SSE2
/*
* This file contains a ChaCha20 implementation that leverages SSE2
* opcodes for better performance.
*/
/*
 * see bearssl_block.h
 *
 * Obtain the SSE2 ChaCha20 implementation, if it can be used on the
 * current CPU. Returned value is a pointer to br_chacha20_sse2_run(),
 * or 0 when SSE2 was not detected.
 */
br_chacha20_run
br_chacha20_sse2_get(void)
{
#if BR_amd64
	/*
	 * 64-bit mode: SSE2 opcodes are part of the ABI, hence always
	 * available; no runtime detection is needed.
	 */
	return &br_chacha20_sse2_run;
#else
	/*
	 * 32-bit mode: the CPUID results are queried. The mask
	 * 0x04000000 selects bit 26 of EDX, which is the SSE2 feature
	 * flag.
	 */
	return br_cpuid(0, 0, 0, 0x04000000) ? &br_chacha20_sse2_run : 0;
#endif
}
BR_TARGETS_X86_UP
/* see bearssl_block.h */
BR_TARGET("sse2")
/*
 * SSE2 ChaCha20: XOR the key stream into data[], in place.
 *
 *   key    32 bytes are read (two unaligned 16-byte loads)
 *   iv     12 bytes are read
 *   cc     initial value of the 32-bit block counter
 *   data   buffer to process; it is modified in place
 *   len    buffer length (in bytes); the last block may be partial
 *
 * Returned value is the block counter after processing: it has been
 * incremented once for each key stream block that was computed (a
 * trailing partial block counts as one), with 32-bit wraparound.
 */
uint32_t
br_chacha20_sse2_run(const void *key,
	const void *iv, uint32_t cc, void *data, size_t len)
{
	static const uint32_t SIGMA[4] = {
		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
	};

	const unsigned char *kbytes;
	unsigned char *dst;
	uint32_t ctr_nonce[4];
	__m128i in0, in1, in2, in3;
	__m128i bump;
	uint32_t lo, hi;

	/*
	 * Rotate each of the four 32-bit lanes of v to the left by n bits
	 * (SSE2 has no rotation opcode, hence the two shifts and the OR).
	 */
#define ROTL_X4(v, n)   _mm_or_si128( \
		_mm_slli_epi32((v), (n)), \
		_mm_srli_epi32((v), 32 - (n)))

	/*
	 * One round over the four rows: four quarter-rounds are computed
	 * in parallel, one per lane.
	 */
#define QROUND_X4(a, b, c, d)   do { \
		a = _mm_add_epi32(a, b); \
		d = _mm_xor_si128(d, a); \
		d = ROTL_X4(d, 16); \
		c = _mm_add_epi32(c, d); \
		b = _mm_xor_si128(b, c); \
		b = ROTL_X4(b, 12); \
		a = _mm_add_epi32(a, b); \
		d = _mm_xor_si128(d, a); \
		d = ROTL_X4(d, 8); \
		c = _mm_add_epi32(c, d); \
		b = _mm_xor_si128(b, c); \
		b = ROTL_X4(b, 7); \
	} while (0)

	/*
	 * Lane rotations applied before the odd round (rows b, c and d
	 * are rotated by one, two and three lanes, respectively), and
	 * the inverse rotations applied after it.
	 */
#define LANES_SHIFT(b, c, d)   do { \
		b = _mm_shuffle_epi32(b, 0x39); \
		c = _mm_shuffle_epi32(c, 0x4E); \
		d = _mm_shuffle_epi32(d, 0x93); \
	} while (0)

#define LANES_UNSHIFT(b, c, d)   do { \
		b = _mm_shuffle_epi32(b, 0x93); \
		c = _mm_shuffle_epi32(c, 0x4E); \
		d = _mm_shuffle_epi32(d, 0x39); \
	} while (0)

	kbytes = key;
	dst = data;

	/*
	 * Fourth row of the initial state: block counter, then the
	 * 12 bytes of IV.
	 */
	ctr_nonce[0] = cc;
	memcpy(&ctr_nonce[1], iv, 12);

	/*
	 * Row j of the initial state holds state words 4*j to 4*j+3.
	 */
	in0 = _mm_loadu_si128((const void *)SIGMA);
	in1 = _mm_loadu_si128((const void *)kbytes);
	in2 = _mm_loadu_si128((const void *)(kbytes + 16));
	in3 = _mm_loadu_si128((const void *)ctr_nonce);

	/*
	 * Adding this value increments lane 0 only (the block counter).
	 */
	bump = _mm_set_epi32(0, 0, 0, 1);

	while (len != 0) {
		__m128i x0, x1, x2, x3;
		unsigned char stream[64];
		size_t k;
		int remaining;

		x0 = in0;
		x1 = in1;
		x2 = in2;
		x3 = in3;

		/*
		 * Ten iterations, each of them being an even round (rows
		 * used as they are) followed by an odd round (rows rotated
		 * so that the same code works on the right combinations of
		 * words, then rotated back).
		 */
		for (remaining = 10; remaining != 0; remaining --) {
			QROUND_X4(x0, x1, x2, x3);
			LANES_SHIFT(x1, x2, x3);
			QROUND_X4(x0, x1, x2, x3);
			LANES_UNSHIFT(x1, x2, x3);
		}

		/*
		 * Feed-forward: the initial state is added to the result.
		 */
		x0 = _mm_add_epi32(x0, in0);
		x1 = _mm_add_epi32(x1, in1);
		x2 = _mm_add_epi32(x2, in2);
		x3 = _mm_add_epi32(x3, in3);

		/*
		 * The counter is incremented for the next block. This is
		 * done even if the current block is the last one, so that
		 * the returned value accounts for it.
		 */
		in3 = _mm_add_epi32(in3, bump);

		if (len >= 64) {
			/*
			 * Full block: XOR is done 16 bytes at a time, with
			 * unaligned accesses.
			 */
			_mm_storeu_si128((void *)(dst + 0), _mm_xor_si128(
				_mm_loadu_si128((const void *)(dst + 0)), x0));
			_mm_storeu_si128((void *)(dst + 16), _mm_xor_si128(
				_mm_loadu_si128((const void *)(dst + 16)), x1));
			_mm_storeu_si128((void *)(dst + 32), _mm_xor_si128(
				_mm_loadu_si128((const void *)(dst + 32)), x2));
			_mm_storeu_si128((void *)(dst + 48), _mm_xor_si128(
				_mm_loadu_si128((const void *)(dst + 48)), x3));
			dst += 64;
			len -= 64;
			continue;
		}

		/*
		 * Final partial block: the key stream block is written into
		 * a temporary buffer, and only the first len bytes are
		 * applied on the data.
		 */
		_mm_storeu_si128((void *)(stream + 0), x0);
		_mm_storeu_si128((void *)(stream + 16), x1);
		_mm_storeu_si128((void *)(stream + 32), x2);
		_mm_storeu_si128((void *)(stream + 48), x3);
		for (k = 0; k < len; k ++) {
			dst[k] ^= stream[k];
		}
		break;
	}

#undef LANES_UNSHIFT
#undef LANES_SHIFT
#undef QROUND_X4
#undef ROTL_X4

	/*
	 * The updated counter is in lane 0. _mm_extract_epi32() requires
	 * SSE4.1; to stick to raw SSE2, the value is rebuilt from its
	 * two 16-bit halves with _mm_extract_epi16().
	 */
	lo = (uint32_t)_mm_extract_epi16(in3, 0);
	hi = (uint32_t)_mm_extract_epi16(in3, 1);
	return lo | (hi << 16);
}
BR_TARGETS_X86_DOWN
#else
/*
 * see bearssl_block.h
 *
 * Fallback definition, compiled when BR_SSE2 is not enabled: the SSE2
 * implementation is then not part of the build, and this function
 * always returns 0 (no function pointer).
 */
br_chacha20_run
br_chacha20_sse2_get(void)
{
/* No SSE2 code was compiled in; there is nothing to hand out. */
return 0;
}
#endif