commit e84403331d99 Author: Daiki Ueno Date: Fri Sep 22 11:27:34 2017 +0200 Bug 1400603 - freebl: Reorganize AES-GCM source code based on hw/sw implementation, r=franziskus Reviewers: franziskus Reviewed By: franziskus Bug #: 1400603 Differential Revision: https://phabricator.services.mozilla.com/D65 --- lib/freebl/Makefile | 4 +- lib/freebl/aes-x86.c | 157 +++++++++++++++++++++++++++++++++++++++++ lib/freebl/freebl.gyp | 65 ++++++++++------- lib/freebl/gcm-x86.c | 127 ++++++++++++++++++++++++++++++++++ lib/freebl/gcm.c | 162 +++++++++++++------------------------------ lib/freebl/gcm.h | 14 ++++ lib/freebl/rijndael.c | 188 ++++++++------------------------------------------ lib/freebl/rijndael.h | 18 ++++- 8 files changed, 436 insertions(+), 299 deletions(-) diff --git lib/freebl/Makefile lib/freebl/Makefile index d50e18696b..bc1ea86a5e 100644 --- lib/freebl/Makefile +++ lib/freebl/Makefile @@ -110,7 +110,9 @@ endif # NSS_X86_OR_X64 means the target is either x86 or x64 ifeq (,$(filter-out i386 x386 x86 x86_64,$(CPU_ARCH))) DEFINES += -DNSS_X86_OR_X64 - CFLAGS += -mpclmul -maes + EXTRA_SRCS += gcm-x86.c aes-x86.c +$(OBJDIR)/gcm-x86.o: CFLAGS += -mpclmul -maes +$(OBJDIR)/aes-x86.o: CFLAGS += -mpclmul -maes ifneq (,$(USE_64)$(USE_X32)) DEFINES += -DNSS_X64 else diff --git lib/freebl/aes-x86.c lib/freebl/aes-x86.c new file mode 100644 index 0000000000..830b4782fe --- /dev/null +++ lib/freebl/aes-x86.c @@ -0,0 +1,157 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifdef FREEBL_NO_DEPEND +#include "stubs.h" +#endif +#include "rijndael.h" +#include "secerr.h" + +#include /* aes-ni */ + +#define EXPAND_KEY128(k, rcon, res) \ + tmp_key = _mm_aeskeygenassist_si128(k, rcon); \ + tmp_key = _mm_shuffle_epi32(tmp_key, 0xFF); \ + tmp = _mm_xor_si128(k, _mm_slli_si128(k, 4)); \ + tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \ + tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \ + res = _mm_xor_si128(tmp, tmp_key) + +static void +native_key_expansion128(AESContext *cx, const unsigned char *key) +{ + __m128i *keySchedule = cx->keySchedule; + pre_align __m128i tmp_key post_align; + pre_align __m128i tmp post_align; + keySchedule[0] = _mm_loadu_si128((__m128i *)key); + EXPAND_KEY128(keySchedule[0], 0x01, keySchedule[1]); + EXPAND_KEY128(keySchedule[1], 0x02, keySchedule[2]); + EXPAND_KEY128(keySchedule[2], 0x04, keySchedule[3]); + EXPAND_KEY128(keySchedule[3], 0x08, keySchedule[4]); + EXPAND_KEY128(keySchedule[4], 0x10, keySchedule[5]); + EXPAND_KEY128(keySchedule[5], 0x20, keySchedule[6]); + EXPAND_KEY128(keySchedule[6], 0x40, keySchedule[7]); + EXPAND_KEY128(keySchedule[7], 0x80, keySchedule[8]); + EXPAND_KEY128(keySchedule[8], 0x1B, keySchedule[9]); + EXPAND_KEY128(keySchedule[9], 0x36, keySchedule[10]); +} + +#define EXPAND_KEY192_PART1(res, k0, kt, rcon) \ + tmp2 = _mm_slli_si128(k0, 4); \ + tmp1 = _mm_xor_si128(k0, tmp2); \ + tmp2 = _mm_slli_si128(tmp2, 4); \ + tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \ + tmp2 = _mm_aeskeygenassist_si128(kt, rcon); \ + res = _mm_xor_si128(tmp1, _mm_shuffle_epi32(tmp2, 0x55)) + +#define EXPAND_KEY192_PART2(res, k1, k2) \ + tmp2 = _mm_xor_si128(k1, _mm_slli_si128(k1, 4)); \ + res = _mm_xor_si128(tmp2, _mm_shuffle_epi32(k2, 0xFF)) + +#define EXPAND_KEY192(k0, res1, res2, res3, carry, rcon1, rcon2) \ + EXPAND_KEY192_PART1(tmp3, k0, res1, rcon1); \ + EXPAND_KEY192_PART2(carry, res1, tmp3); \ + res1 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(res1), \ + _mm_castsi128_pd(tmp3), 0)); \ + res2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp3), \ + _mm_castsi128_pd(carry), 1)); \ + EXPAND_KEY192_PART1(res3, tmp3, carry, rcon2) + +static void +native_key_expansion192(AESContext *cx, const unsigned char *key) +{ + __m128i *keySchedule = cx->keySchedule; + pre_align __m128i tmp1 post_align; + pre_align __m128i tmp2 post_align; + pre_align __m128i tmp3 post_align; + pre_align __m128i carry post_align; + keySchedule[0] = _mm_loadu_si128((__m128i *)key); + keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16)); + EXPAND_KEY192(keySchedule[0], keySchedule[1], keySchedule[2], + keySchedule[3], carry, 0x1, 0x2); + EXPAND_KEY192_PART2(keySchedule[4], carry, keySchedule[3]); + EXPAND_KEY192(keySchedule[3], keySchedule[4], keySchedule[5], + keySchedule[6], carry, 0x4, 0x8); + EXPAND_KEY192_PART2(keySchedule[7], carry, keySchedule[6]); + EXPAND_KEY192(keySchedule[6], keySchedule[7], keySchedule[8], + keySchedule[9], carry, 0x10, 0x20); + EXPAND_KEY192_PART2(keySchedule[10], carry, keySchedule[9]); + EXPAND_KEY192(keySchedule[9], keySchedule[10], keySchedule[11], + keySchedule[12], carry, 0x40, 0x80); +} + +#define EXPAND_KEY256_PART(res, rconx, k1x, k2x, X) \ + tmp_key = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(k2x, rconx), X); \ + tmp2 = _mm_slli_si128(k1x, 4); \ + tmp1 = _mm_xor_si128(k1x, tmp2); \ + tmp2 = _mm_slli_si128(tmp2, 4); \ + tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \ + res = _mm_xor_si128(tmp1, tmp_key); + +#define EXPAND_KEY256(res1, res2, k1, k2, rcon) \ + EXPAND_KEY256_PART(res1, rcon, k1, k2, 0xFF); \ + EXPAND_KEY256_PART(res2, 0x00, k2, res1, 0xAA) + +static void +native_key_expansion256(AESContext *cx, const unsigned char *key) +{ + __m128i *keySchedule = cx->keySchedule; + pre_align __m128i tmp_key post_align; + pre_align __m128i tmp1 post_align; + pre_align __m128i tmp2 post_align; + keySchedule[0] = _mm_loadu_si128((__m128i *)key); + keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16)); + EXPAND_KEY256(keySchedule[2], keySchedule[3], keySchedule[0], + keySchedule[1], 0x01); + EXPAND_KEY256(keySchedule[4], keySchedule[5], keySchedule[2], + keySchedule[3], 0x02); + EXPAND_KEY256(keySchedule[6], keySchedule[7], keySchedule[4], + keySchedule[5], 0x04); + EXPAND_KEY256(keySchedule[8], keySchedule[9], keySchedule[6], + keySchedule[7], 0x08); + EXPAND_KEY256(keySchedule[10], keySchedule[11], keySchedule[8], + keySchedule[9], 0x10); + EXPAND_KEY256(keySchedule[12], keySchedule[13], keySchedule[10], + keySchedule[11], 0x20); + EXPAND_KEY256_PART(keySchedule[14], 0x40, keySchedule[12], + keySchedule[13], 0xFF); +} + +/* + * AES key expansion using aes-ni instructions. + */ +void +rijndael_native_key_expansion(AESContext *cx, const unsigned char *key, + unsigned int Nk) +{ + switch (Nk) { + case 4: + native_key_expansion128(cx, key); + return; + case 6: + native_key_expansion192(cx, key); + return; + case 8: + native_key_expansion256(cx, key); + return; + default: + /* This shouldn't happen (checked by the caller). */ + return; + } +} + +void +rijndael_native_encryptBlock(AESContext *cx, + unsigned char *output, + const unsigned char *input) +{ + int i; + pre_align __m128i m post_align = _mm_loadu_si128((__m128i *)input); + m = _mm_xor_si128(m, cx->keySchedule[0]); + for (i = 1; i < cx->Nr; ++i) { + m = _mm_aesenc_si128(m, cx->keySchedule[i]); + } + m = _mm_aesenclast_si128(m, cx->keySchedule[cx->Nr]); + _mm_storeu_si128((__m128i *)output, m); +} diff --git lib/freebl/freebl.gyp lib/freebl/freebl.gyp index 1e93475004..5f59eef29c 100644 --- lib/freebl/freebl.gyp +++ lib/freebl/freebl.gyp @@ -22,6 +22,37 @@ '-mssse3' ] }, + { + 'target_name': 'gcm-aes-x86_c_lib', + 'type': 'static_library', + 'sources': [ + 'gcm-x86.c', 'aes-x86.c' + ], + 'dependencies': [ + '<(DEPTH)/exports.gyp:nss_exports' + ], + # Enable isa option for pclmul and aes-ni; supported since gcc 4.4. + # This is only supported by x84/x64. It's not needed for Windows, + # unless clang-cl is used. + 'cflags_mozilla': [ + '-mpclmul', '-maes' + ], + 'conditions': [ + [ 'OS=="linux" or OS=="android" or OS=="dragonfly" or OS=="freebsd" or OS=="netbsd" or OS=="openbsd"', { + 'cflags': [ + '-mpclmul', '-maes' + ], + }], + # macOS build doesn't use cflags. + [ 'OS=="mac"', { + 'xcode_settings': { + 'OTHER_CFLAGS': [ + '-mpclmul', '-maes' + ], + }, + }] + ] + }, { 'target_name': 'freebl', 'type': 'static_library', @@ -45,6 +76,11 @@ '<(DEPTH)/exports.gyp:nss_exports', ], 'conditions': [ + [ 'target_arch=="ia32" or target_arch=="x64"', { + 'dependencies': [ + 'gcm-aes-x86_c_lib' + ], + }], [ 'OS=="linux"', { 'defines!': [ 'FREEBL_NO_DEPEND', @@ -76,6 +112,11 @@ '<(DEPTH)/exports.gyp:nss_exports', ], 'conditions': [ + [ 'target_arch=="ia32" or target_arch=="x64"', { + 'dependencies': [ + 'gcm-aes-x86_c_lib' + ] + }], [ 'OS!="linux" and OS!="android"', { 'conditions': [ [ 'moz_fold_libs==0', { @@ -154,27 +195,11 @@ 'MP_API_COMPATIBLE' ], 'conditions': [ - [ 'target_arch=="ia32" or target_arch=="x64"', { - 'cflags_mozilla': [ - '-mpclmul', - '-maes', - ], - 'conditions': [ - [ 'OS=="dragonfly" or OS=="freebsd" or OS=="netbsd" or OS=="openbsd"', { - 'cflags': [ - '-mpclmul', - '-maes', - ], - }], - ], - }], [ 'OS=="mac"', { 'xcode_settings': { # I'm not sure since when this is supported. # But I hope that doesn't matter. We also assume this is x86/x64. 'OTHER_CFLAGS': [ - '-mpclmul', - '-maes', '-std=gnu99', ], }, @@ -268,14 +293,6 @@ 'MP_USE_UINT_DIGIT', ], }], - [ 'target_arch=="ia32" or target_arch=="x64"', { - 'cflags': [ - # enable isa option for pclmul am aes-ni; supported since gcc 4.4 - # This is only support by x84/x64. It's not needed for Windows. - '-mpclmul', - '-maes', - ], - }], [ 'target_arch=="arm"', { 'defines': [ 'MP_ASSEMBLY_MULTIPLY', diff --git lib/freebl/gcm-x86.c lib/freebl/gcm-x86.c new file mode 100644 index 0000000000..e34d633943 --- /dev/null +++ lib/freebl/gcm-x86.c @@ -0,0 +1,127 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifdef FREEBL_NO_DEPEND +#include "stubs.h" +#endif +#include "gcm.h" +#include "secerr.h" + +#include /* clmul */ + +#define WRITE64(x, bytes) \ + (bytes)[0] = (x) >> 56; \ + (bytes)[1] = (x) >> 48; \ + (bytes)[2] = (x) >> 40; \ + (bytes)[3] = (x) >> 32; \ + (bytes)[4] = (x) >> 24; \ + (bytes)[5] = (x) >> 16; \ + (bytes)[6] = (x) >> 8; \ + (bytes)[7] = (x); + +SECStatus +gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf) +{ + uint64_t tmp_out[2]; + _mm_storeu_si128((__m128i *)tmp_out, ghash->x); + /* maxout must be larger than 16 byte (checked by the caller). */ + WRITE64(tmp_out[0], outbuf + 8); + WRITE64(tmp_out[1], outbuf); + return SECSuccess; +} + +SECStatus +gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf, + unsigned int count) +{ + size_t i; + pre_align __m128i z_high post_align; + pre_align __m128i z_low post_align; + pre_align __m128i C post_align; + pre_align __m128i D post_align; + pre_align __m128i E post_align; + pre_align __m128i F post_align; + pre_align __m128i bin post_align; + pre_align __m128i Ci post_align; + pre_align __m128i tmp post_align; + + for (i = 0; i < count; i++, buf += 16) { + bin = _mm_set_epi16(((uint16_t)buf[0] << 8) | buf[1], + ((uint16_t)buf[2] << 8) | buf[3], + ((uint16_t)buf[4] << 8) | buf[5], + ((uint16_t)buf[6] << 8) | buf[7], + ((uint16_t)buf[8] << 8) | buf[9], + ((uint16_t)buf[10] << 8) | buf[11], + ((uint16_t)buf[12] << 8) | buf[13], + ((uint16_t)buf[14] << 8) | buf[15]); + Ci = _mm_xor_si128(bin, ghash->x); + + /* Do binary mult ghash->X = Ci * ghash->H. */ + C = _mm_clmulepi64_si128(Ci, ghash->h, 0x00); + D = _mm_clmulepi64_si128(Ci, ghash->h, 0x11); + E = _mm_clmulepi64_si128(Ci, ghash->h, 0x01); + F = _mm_clmulepi64_si128(Ci, ghash->h, 0x10); + tmp = _mm_xor_si128(E, F); + z_high = _mm_xor_si128(tmp, _mm_slli_si128(D, 8)); + z_high = _mm_unpackhi_epi64(z_high, D); + z_low = _mm_xor_si128(_mm_slli_si128(tmp, 8), C); + z_low = _mm_unpackhi_epi64(_mm_slli_si128(C, 8), z_low); + + /* Shift one to the left (multiply by x) as gcm spec is stupid. */ + C = _mm_slli_si128(z_low, 8); + E = _mm_srli_epi64(C, 63); + D = _mm_slli_si128(z_high, 8); + F = _mm_srli_epi64(D, 63); + /* Carry over */ + C = _mm_srli_si128(z_low, 8); + D = _mm_srli_epi64(C, 63); + z_low = _mm_or_si128(_mm_slli_epi64(z_low, 1), E); + z_high = _mm_or_si128(_mm_or_si128(_mm_slli_epi64(z_high, 1), F), D); + + /* Reduce */ + C = _mm_slli_si128(z_low, 8); + /* D = z_low << 127 */ + D = _mm_slli_epi64(C, 63); + /* E = z_low << 126 */ + E = _mm_slli_epi64(C, 62); + /* F = z_low << 121 */ + F = _mm_slli_epi64(C, 57); + /* z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121); */ + z_low = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(z_low, D), E), F); + C = _mm_srli_si128(z_low, 8); + /* D = z_low >> 1 */ + D = _mm_slli_epi64(C, 63); + D = _mm_or_si128(_mm_srli_epi64(z_low, 1), D); + /* E = z_low >> 2 */ + E = _mm_slli_epi64(C, 62); + E = _mm_or_si128(_mm_srli_epi64(z_low, 2), E); + /* F = z_low >> 7 */ + F = _mm_slli_epi64(C, 57); + F = _mm_or_si128(_mm_srli_epi64(z_low, 7), F); + /* ghash->x ^= z_low ^ (z_low >> 1) ^ (z_low >> 2) ^ (z_low >> 7); */ + ghash->x = _mm_xor_si128(_mm_xor_si128( + _mm_xor_si128(_mm_xor_si128(z_high, z_low), D), E), + F); + } + return SECSuccess; +} + +SECStatus +gcm_HashInit_hw(gcmHashContext *ghash) +{ + ghash->ghash_mul = gcm_HashMult_hw; + ghash->x = _mm_setzero_si128(); + /* MSVC requires __m64 to load epi64. */ + ghash->h = _mm_set_epi32(ghash->h_high >> 32, (uint32_t)ghash->h_high, + ghash->h_low >> 32, (uint32_t)ghash->h_low); + ghash->hw = PR_TRUE; + return SECSuccess; +} + +SECStatus +gcm_HashZeroX_hw(gcmHashContext *ghash) +{ + ghash->x = _mm_setzero_si128(); + return SECSuccess; +} diff --git lib/freebl/gcm.c lib/freebl/gcm.c index 780b7a6322..f1e16da78e 100644 --- lib/freebl/gcm.c +++ lib/freebl/gcm.c @@ -17,18 +17,50 @@ #include -#ifdef NSS_X86_OR_X64 -#include /* clmul */ -#endif - /* Forward declarations */ +SECStatus gcm_HashInit_hw(gcmHashContext *ghash); +SECStatus gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf); SECStatus gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf, unsigned int count); +SECStatus gcm_HashZeroX_hw(gcmHashContext *ghash); SECStatus gcm_HashMult_sftw(gcmHashContext *ghash, const unsigned char *buf, unsigned int count); SECStatus gcm_HashMult_sftw32(gcmHashContext *ghash, const unsigned char *buf, unsigned int count); +/* Stub definitions for the above *_hw functions, which shouldn't be + * used unless NSS_X86_OR_X64 is defined */ +#ifndef NSS_X86_OR_X64 +SECStatus +gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf) +{ + PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); + return SECFailure; +} + +SECStatus +gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf, + unsigned int count) +{ + PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); + return SECFailure; +} + +SECStatus +gcm_HashInit_hw(gcmHashContext *ghash) +{ + PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); + return SECFailure; +} + +SECStatus +gcm_HashZeroX_hw(gcmHashContext *ghash) +{ + PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); + return SECFailure; +} +#endif /* NSS_X86_OR_X64 */ + uint64_t get64(const unsigned char *bytes) { @@ -46,6 +78,8 @@ get64(const unsigned char *bytes) SECStatus gcmHash_InitContext(gcmHashContext *ghash, const unsigned char *H, PRBool sw) { + SECStatus rv = SECSuccess; + ghash->cLen = 0; ghash->bufLen = 0; PORT_Memset(ghash->counterBuf, 0, sizeof(ghash->counterBuf)); @@ -53,17 +87,7 @@ gcmHash_InitContext(gcmHashContext *ghash, const unsigned char *H, PRBool sw) ghash->h_low = get64(H + 8); ghash->h_high = get64(H); if (clmul_support() && !sw) { -#ifdef NSS_X86_OR_X64 - ghash->ghash_mul = gcm_HashMult_hw; - ghash->x = _mm_setzero_si128(); - /* MSVC requires __m64 to load epi64. */ - ghash->h = _mm_set_epi32(ghash->h_high >> 32, (uint32_t)ghash->h_high, - ghash->h_low >> 32, (uint32_t)ghash->h_low); - ghash->hw = PR_TRUE; -#else - PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); - return SECFailure; -#endif /* NSS_X86_OR_X64 */ + rv = gcm_HashInit_hw(ghash); } else { /* We fall back to the software implementation if we can't use / don't * want to use pclmul. */ @@ -75,7 +99,7 @@ gcmHash_InitContext(gcmHashContext *ghash, const unsigned char *H, PRBool sw) ghash->x_high = ghash->x_low = 0; ghash->hw = PR_FALSE; } - return SECSuccess; + return rv; } #ifdef HAVE_INT128_SUPPORT @@ -283,102 +307,17 @@ gcm_HashMult_sftw32(gcmHashContext *ghash, const unsigned char *buf, } #endif /* HAVE_INT128_SUPPORT */ -SECStatus -gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf, - unsigned int count) -{ -#ifdef NSS_X86_OR_X64 - size_t i; - pre_align __m128i z_high post_align; - pre_align __m128i z_low post_align; - pre_align __m128i C post_align; - pre_align __m128i D post_align; - pre_align __m128i E post_align; - pre_align __m128i F post_align; - pre_align __m128i bin post_align; - pre_align __m128i Ci post_align; - pre_align __m128i tmp post_align; - - for (i = 0; i < count; i++, buf += 16) { - bin = _mm_set_epi16(((uint16_t)buf[0] << 8) | buf[1], - ((uint16_t)buf[2] << 8) | buf[3], - ((uint16_t)buf[4] << 8) | buf[5], - ((uint16_t)buf[6] << 8) | buf[7], - ((uint16_t)buf[8] << 8) | buf[9], - ((uint16_t)buf[10] << 8) | buf[11], - ((uint16_t)buf[12] << 8) | buf[13], - ((uint16_t)buf[14] << 8) | buf[15]); - Ci = _mm_xor_si128(bin, ghash->x); - - /* Do binary mult ghash->X = Ci * ghash->H. */ - C = _mm_clmulepi64_si128(Ci, ghash->h, 0x00); - D = _mm_clmulepi64_si128(Ci, ghash->h, 0x11); - E = _mm_clmulepi64_si128(Ci, ghash->h, 0x01); - F = _mm_clmulepi64_si128(Ci, ghash->h, 0x10); - tmp = _mm_xor_si128(E, F); - z_high = _mm_xor_si128(tmp, _mm_slli_si128(D, 8)); - z_high = _mm_unpackhi_epi64(z_high, D); - z_low = _mm_xor_si128(_mm_slli_si128(tmp, 8), C); - z_low = _mm_unpackhi_epi64(_mm_slli_si128(C, 8), z_low); - - /* Shift one to the left (multiply by x) as gcm spec is stupid. */ - C = _mm_slli_si128(z_low, 8); - E = _mm_srli_epi64(C, 63); - D = _mm_slli_si128(z_high, 8); - F = _mm_srli_epi64(D, 63); - /* Carry over */ - C = _mm_srli_si128(z_low, 8); - D = _mm_srli_epi64(C, 63); - z_low = _mm_or_si128(_mm_slli_epi64(z_low, 1), E); - z_high = _mm_or_si128(_mm_or_si128(_mm_slli_epi64(z_high, 1), F), D); - - /* Reduce */ - C = _mm_slli_si128(z_low, 8); - /* D = z_low << 127 */ - D = _mm_slli_epi64(C, 63); - /* E = z_low << 126 */ - E = _mm_slli_epi64(C, 62); - /* F = z_low << 121 */ - F = _mm_slli_epi64(C, 57); - /* z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121); */ - z_low = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(z_low, D), E), F); - C = _mm_srli_si128(z_low, 8); - /* D = z_low >> 1 */ - D = _mm_slli_epi64(C, 63); - D = _mm_or_si128(_mm_srli_epi64(z_low, 1), D); - /* E = z_low >> 2 */ - E = _mm_slli_epi64(C, 62); - E = _mm_or_si128(_mm_srli_epi64(z_low, 2), E); - /* F = z_low >> 7 */ - F = _mm_slli_epi64(C, 57); - F = _mm_or_si128(_mm_srli_epi64(z_low, 7), F); - /* ghash->x ^= z_low ^ (z_low >> 1) ^ (z_low >> 2) ^ (z_low >> 7); */ - ghash->x = _mm_xor_si128(_mm_xor_si128( - _mm_xor_si128(_mm_xor_si128(z_high, z_low), D), E), - F); - } - return SECSuccess; -#else - PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); - return SECFailure; -#endif /* NSS_X86_OR_X64 */ -} - static SECStatus gcm_zeroX(gcmHashContext *ghash) { + SECStatus rv = SECSuccess; + if (ghash->hw) { -#ifdef NSS_X86_OR_X64 - ghash->x = _mm_setzero_si128(); - return SECSuccess; -#else - PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); - return SECFailure; -#endif /* NSS_X86_OR_X64 */ + rv = gcm_HashZeroX_hw(ghash); } ghash->x_high = ghash->x_low = 0; - return SECSuccess; + return rv; } /* @@ -503,15 +442,10 @@ gcmHash_Final(gcmHashContext *ghash, unsigned char *outbuf, } if (ghash->hw) { -#ifdef NSS_X86_OR_X64 - uint64_t tmp_out[2]; - _mm_storeu_si128((__m128i *)tmp_out, ghash->x); - WRITE64(tmp_out[0], T + 8); - WRITE64(tmp_out[1], T); -#else - PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); - return SECFailure; -#endif /* NSS_X86_OR_X64 */ + rv = gcm_HashWrite_hw(ghash, T); + if (rv != SECSuccess) { + goto cleanup; + } } else { WRITE64(ghash->x_low, T + 8); WRITE64(ghash->x_high, T); diff --git lib/freebl/gcm.h lib/freebl/gcm.h index 0c707a0811..42ef0f7179 100644 --- lib/freebl/gcm.h +++ lib/freebl/gcm.h @@ -9,7 +9,21 @@ #include #ifdef NSS_X86_OR_X64 +/* GCC <= 4.8 doesn't support including emmintrin.h without enabling SSE2 */ +#if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \ + (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 8)) +#pragma GCC push_options +#pragma GCC target("sse2") +#undef NSS_DISABLE_SSE2 +#define NSS_DISABLE_SSE2 1 +#endif /* GCC <= 4.8 */ + #include /* __m128i */ + +#ifdef NSS_DISABLE_SSE2 +#undef NSS_DISABLE_SSE2 +#pragma GCC pop_options +#endif /* NSS_DISABLE_SSE2 */ #endif SEC_BEGIN_PROTOS diff --git lib/freebl/rijndael.c lib/freebl/rijndael.c index a09f13098e..5de27de9ce 100644 --- lib/freebl/rijndael.c +++ lib/freebl/rijndael.c @@ -27,6 +27,34 @@ #include "intel-gcm.h" #endif /* INTEL_GCM */ +/* Forward declarations */ +void rijndael_native_key_expansion(AESContext *cx, const unsigned char *key, + unsigned int Nk); +void rijndael_native_encryptBlock(AESContext *cx, + unsigned char *output, + const unsigned char *input); + +/* Stub definitions for the above rijndael_native_* functions, which + * shouldn't be used unless NSS_X86_OR_X64 is defined */ +#ifndef NSS_X86_OR_X64 +void +rijndael_native_key_expansion(AESContext *cx, const unsigned char *key, + unsigned int Nk) +{ + PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); + PORT_Assert(0); +} + +void +rijndael_native_encryptBlock(AESContext *cx, + unsigned char *output, + const unsigned char *input) +{ + PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); + PORT_Assert(0); +} +#endif /* NSS_X86_OR_X64 */ + /* * There are currently three ways to build this code, varying in performance * and code size. @@ -309,162 +337,6 @@ rijndael_key_expansion7(AESContext *cx, const unsigned char *key, unsigned int N } } -#if defined(NSS_X86_OR_X64) -#define EXPAND_KEY128(k, rcon, res) \ - tmp_key = _mm_aeskeygenassist_si128(k, rcon); \ - tmp_key = _mm_shuffle_epi32(tmp_key, 0xFF); \ - tmp = _mm_xor_si128(k, _mm_slli_si128(k, 4)); \ - tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \ - tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \ - res = _mm_xor_si128(tmp, tmp_key) - -static void -native_key_expansion128(AESContext *cx, const unsigned char *key) -{ - __m128i *keySchedule = cx->keySchedule; - pre_align __m128i tmp_key post_align; - pre_align __m128i tmp post_align; - keySchedule[0] = _mm_loadu_si128((__m128i *)key); - EXPAND_KEY128(keySchedule[0], 0x01, keySchedule[1]); - EXPAND_KEY128(keySchedule[1], 0x02, keySchedule[2]); - EXPAND_KEY128(keySchedule[2], 0x04, keySchedule[3]); - EXPAND_KEY128(keySchedule[3], 0x08, keySchedule[4]); - EXPAND_KEY128(keySchedule[4], 0x10, keySchedule[5]); - EXPAND_KEY128(keySchedule[5], 0x20, keySchedule[6]); - EXPAND_KEY128(keySchedule[6], 0x40, keySchedule[7]); - EXPAND_KEY128(keySchedule[7], 0x80, keySchedule[8]); - EXPAND_KEY128(keySchedule[8], 0x1B, keySchedule[9]); - EXPAND_KEY128(keySchedule[9], 0x36, keySchedule[10]); -} - -#define EXPAND_KEY192_PART1(res, k0, kt, rcon) \ - tmp2 = _mm_slli_si128(k0, 4); \ - tmp1 = _mm_xor_si128(k0, tmp2); \ - tmp2 = _mm_slli_si128(tmp2, 4); \ - tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \ - tmp2 = _mm_aeskeygenassist_si128(kt, rcon); \ - res = _mm_xor_si128(tmp1, _mm_shuffle_epi32(tmp2, 0x55)) - -#define EXPAND_KEY192_PART2(res, k1, k2) \ - tmp2 = _mm_xor_si128(k1, _mm_slli_si128(k1, 4)); \ - res = _mm_xor_si128(tmp2, _mm_shuffle_epi32(k2, 0xFF)) - -#define EXPAND_KEY192(k0, res1, res2, res3, carry, rcon1, rcon2) \ - EXPAND_KEY192_PART1(tmp3, k0, res1, rcon1); \ - EXPAND_KEY192_PART2(carry, res1, tmp3); \ - res1 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(res1), \ - _mm_castsi128_pd(tmp3), 0)); \ - res2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp3), \ - _mm_castsi128_pd(carry), 1)); \ - EXPAND_KEY192_PART1(res3, tmp3, carry, rcon2) - -static void -native_key_expansion192(AESContext *cx, const unsigned char *key) -{ - __m128i *keySchedule = cx->keySchedule; - pre_align __m128i tmp1 post_align; - pre_align __m128i tmp2 post_align; - pre_align __m128i tmp3 post_align; - pre_align __m128i carry post_align; - keySchedule[0] = _mm_loadu_si128((__m128i *)key); - keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16)); - EXPAND_KEY192(keySchedule[0], keySchedule[1], keySchedule[2], - keySchedule[3], carry, 0x1, 0x2); - EXPAND_KEY192_PART2(keySchedule[4], carry, keySchedule[3]); - EXPAND_KEY192(keySchedule[3], keySchedule[4], keySchedule[5], - keySchedule[6], carry, 0x4, 0x8); - EXPAND_KEY192_PART2(keySchedule[7], carry, keySchedule[6]); - EXPAND_KEY192(keySchedule[6], keySchedule[7], keySchedule[8], - keySchedule[9], carry, 0x10, 0x20); - EXPAND_KEY192_PART2(keySchedule[10], carry, keySchedule[9]); - EXPAND_KEY192(keySchedule[9], keySchedule[10], keySchedule[11], - keySchedule[12], carry, 0x40, 0x80); -} - -#define EXPAND_KEY256_PART(res, rconx, k1x, k2x, X) \ - tmp_key = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(k2x, rconx), X); \ - tmp2 = _mm_slli_si128(k1x, 4); \ - tmp1 = _mm_xor_si128(k1x, tmp2); \ - tmp2 = _mm_slli_si128(tmp2, 4); \ - tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \ - res = _mm_xor_si128(tmp1, tmp_key); - -#define EXPAND_KEY256(res1, res2, k1, k2, rcon) \ - EXPAND_KEY256_PART(res1, rcon, k1, k2, 0xFF); \ - EXPAND_KEY256_PART(res2, 0x00, k2, res1, 0xAA) - -static void -native_key_expansion256(AESContext *cx, const unsigned char *key) -{ - __m128i *keySchedule = cx->keySchedule; - pre_align __m128i tmp_key post_align; - pre_align __m128i tmp1 post_align; - pre_align __m128i tmp2 post_align; - keySchedule[0] = _mm_loadu_si128((__m128i *)key); - keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16)); - EXPAND_KEY256(keySchedule[2], keySchedule[3], keySchedule[0], - keySchedule[1], 0x01); - EXPAND_KEY256(keySchedule[4], keySchedule[5], keySchedule[2], - keySchedule[3], 0x02); - EXPAND_KEY256(keySchedule[6], keySchedule[7], keySchedule[4], - keySchedule[5], 0x04); - EXPAND_KEY256(keySchedule[8], keySchedule[9], keySchedule[6], - keySchedule[7], 0x08); - EXPAND_KEY256(keySchedule[10], keySchedule[11], keySchedule[8], - keySchedule[9], 0x10); - EXPAND_KEY256(keySchedule[12], keySchedule[13], keySchedule[10], - keySchedule[11], 0x20); - EXPAND_KEY256_PART(keySchedule[14], 0x40, keySchedule[12], - keySchedule[13], 0xFF); -} - -#endif /* NSS_X86_OR_X64 */ - -/* - * AES key expansion using aes-ni instructions. - */ -static void -native_key_expansion(AESContext *cx, const unsigned char *key, unsigned int Nk) -{ -#ifdef NSS_X86_OR_X64 - switch (Nk) { - case 4: - native_key_expansion128(cx, key); - return; - case 6: - native_key_expansion192(cx, key); - return; - case 8: - native_key_expansion256(cx, key); - return; - default: - /* This shouldn't happen. */ - PORT_Assert(0); - } -#else - PORT_Assert(0); -#endif /* NSS_X86_OR_X64 */ -} - -static void -native_encryptBlock(AESContext *cx, - unsigned char *output, - const unsigned char *input) -{ -#ifdef NSS_X86_OR_X64 - int i; - pre_align __m128i m post_align = _mm_loadu_si128((__m128i *)input); - m = _mm_xor_si128(m, cx->keySchedule[0]); - for (i = 1; i < cx->Nr; ++i) { - m = _mm_aesenc_si128(m, cx->keySchedule[i]); - } - m = _mm_aesenclast_si128(m, cx->keySchedule[cx->Nr]); - _mm_storeu_si128((__m128i *)output, m); -#else - PORT_Assert(0); -#endif /* NSS_X86_OR_X64 */ -} - /* rijndael_key_expansion * * Generate the expanded key from the key input by the user. @@ -830,7 +702,7 @@ rijndael_encryptECB(AESContext *cx, unsigned char *output, if (aesni_support()) { /* Use hardware acceleration for normal AES parameters. */ - encryptor = &native_encryptBlock; + encryptor = &rijndael_native_encryptBlock; } else { encryptor = &rijndael_encryptBlock128; } @@ -1026,7 +898,7 @@ aes_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize, cx->mode == NSS_AES_CTR)) { PORT_Assert(keysize == 16 || keysize == 24 || keysize == 32); /* Prepare hardware key for normal AES parameters. */ - native_key_expansion(cx, key, Nk); + rijndael_native_key_expansion(cx, key, Nk); } else { rijndael_key_expansion(cx, key, Nk); } diff --git lib/freebl/rijndael.h lib/freebl/rijndael.h index 1f4a8a9f73..1b63a323da 100644 --- lib/freebl/rijndael.h +++ lib/freebl/rijndael.h @@ -8,8 +8,22 @@ #include "blapii.h" #include -#ifdef NSS_X86_OR_X64 -#include /* aes-ni */ +#if defined(NSS_X86_OR_X64) +/* GCC <= 4.8 doesn't support including emmintrin.h without enabling SSE2 */ +#if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \ + (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 8)) +#pragma GCC push_options +#pragma GCC target("sse2") +#undef NSS_DISABLE_SSE2 +#define NSS_DISABLE_SSE2 1 +#endif /* GCC <= 4.8 */ + +#include /* __m128i */ + +#ifdef NSS_DISABLE_SSE2 +#undef NSS_DISABLE_SSE2 +#pragma GCC pop_options +#endif /* NSS_DISABLE_SSE2 */ #endif typedef void AESBlockFunc(AESContext *cx,