diff options
Diffstat (limited to 'abs/extra/nss/bug1400603.patch')
-rw-r--r-- | abs/extra/nss/bug1400603.patch | 942 |
1 files changed, 942 insertions, 0 deletions
diff --git a/abs/extra/nss/bug1400603.patch b/abs/extra/nss/bug1400603.patch new file mode 100644 index 0000000..321f50d --- /dev/null +++ b/abs/extra/nss/bug1400603.patch @@ -0,0 +1,942 @@ +commit e84403331d99 +Author: Daiki Ueno <dueno@redhat.com> +Date: Fri Sep 22 11:27:34 2017 +0200 + + Bug 1400603 - freebl: Reorganize AES-GCM source code based on hw/sw implementation, r=franziskus + + Reviewers: franziskus + + Reviewed By: franziskus + + Bug #: 1400603 + + Differential Revision: https://phabricator.services.mozilla.com/D65 +--- + lib/freebl/Makefile | 4 +- + lib/freebl/aes-x86.c | 157 +++++++++++++++++++++++++++++++++++++++++ + lib/freebl/freebl.gyp | 65 ++++++++++------- + lib/freebl/gcm-x86.c | 127 ++++++++++++++++++++++++++++++++++ + lib/freebl/gcm.c | 162 +++++++++++++------------------------------ + lib/freebl/gcm.h | 14 ++++ + lib/freebl/rijndael.c | 188 ++++++++------------------------------------------ + lib/freebl/rijndael.h | 18 ++++- + 8 files changed, 436 insertions(+), 299 deletions(-) + +diff --git lib/freebl/Makefile lib/freebl/Makefile +index d50e18696b..bc1ea86a5e 100644 +--- lib/freebl/Makefile ++++ lib/freebl/Makefile +@@ -110,7 +110,9 @@ endif + # NSS_X86_OR_X64 means the target is either x86 or x64 + ifeq (,$(filter-out i386 x386 x86 x86_64,$(CPU_ARCH))) + DEFINES += -DNSS_X86_OR_X64 +- CFLAGS += -mpclmul -maes ++ EXTRA_SRCS += gcm-x86.c aes-x86.c ++$(OBJDIR)/gcm-x86.o: CFLAGS += -mpclmul -maes ++$(OBJDIR)/aes-x86.o: CFLAGS += -mpclmul -maes + ifneq (,$(USE_64)$(USE_X32)) + DEFINES += -DNSS_X64 + else +diff --git lib/freebl/aes-x86.c lib/freebl/aes-x86.c +new file mode 100644 +index 0000000000..830b4782fe +--- /dev/null ++++ lib/freebl/aes-x86.c +@@ -0,0 +1,157 @@ ++/* This Source Code Form is subject to the terms of the Mozilla Public ++ * License, v. 2.0. If a copy of the MPL was not distributed with this ++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ ++ ++#ifdef FREEBL_NO_DEPEND ++#include "stubs.h" ++#endif ++#include "rijndael.h" ++#include "secerr.h" ++ ++#include <wmmintrin.h> /* aes-ni */ ++ ++#define EXPAND_KEY128(k, rcon, res) \ ++ tmp_key = _mm_aeskeygenassist_si128(k, rcon); \ ++ tmp_key = _mm_shuffle_epi32(tmp_key, 0xFF); \ ++ tmp = _mm_xor_si128(k, _mm_slli_si128(k, 4)); \ ++ tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \ ++ tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \ ++ res = _mm_xor_si128(tmp, tmp_key) ++ ++static void ++native_key_expansion128(AESContext *cx, const unsigned char *key) ++{ ++ __m128i *keySchedule = cx->keySchedule; ++ pre_align __m128i tmp_key post_align; ++ pre_align __m128i tmp post_align; ++ keySchedule[0] = _mm_loadu_si128((__m128i *)key); ++ EXPAND_KEY128(keySchedule[0], 0x01, keySchedule[1]); ++ EXPAND_KEY128(keySchedule[1], 0x02, keySchedule[2]); ++ EXPAND_KEY128(keySchedule[2], 0x04, keySchedule[3]); ++ EXPAND_KEY128(keySchedule[3], 0x08, keySchedule[4]); ++ EXPAND_KEY128(keySchedule[4], 0x10, keySchedule[5]); ++ EXPAND_KEY128(keySchedule[5], 0x20, keySchedule[6]); ++ EXPAND_KEY128(keySchedule[6], 0x40, keySchedule[7]); ++ EXPAND_KEY128(keySchedule[7], 0x80, keySchedule[8]); ++ EXPAND_KEY128(keySchedule[8], 0x1B, keySchedule[9]); ++ EXPAND_KEY128(keySchedule[9], 0x36, keySchedule[10]); ++} ++ ++#define EXPAND_KEY192_PART1(res, k0, kt, rcon) \ ++ tmp2 = _mm_slli_si128(k0, 4); \ ++ tmp1 = _mm_xor_si128(k0, tmp2); \ ++ tmp2 = _mm_slli_si128(tmp2, 4); \ ++ tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \ ++ tmp2 = _mm_aeskeygenassist_si128(kt, rcon); \ ++ res = _mm_xor_si128(tmp1, _mm_shuffle_epi32(tmp2, 0x55)) ++ ++#define EXPAND_KEY192_PART2(res, k1, k2) \ ++ tmp2 = _mm_xor_si128(k1, _mm_slli_si128(k1, 4)); \ ++ res = _mm_xor_si128(tmp2, _mm_shuffle_epi32(k2, 0xFF)) ++ ++#define EXPAND_KEY192(k0, res1, res2, res3, carry, rcon1, rcon2) \ ++ EXPAND_KEY192_PART1(tmp3, k0, res1, rcon1); \ ++ EXPAND_KEY192_PART2(carry, res1, tmp3); \ ++ res1 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(res1), \ ++ _mm_castsi128_pd(tmp3), 0)); \ ++ res2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp3), \ ++ _mm_castsi128_pd(carry), 1)); \ ++ EXPAND_KEY192_PART1(res3, tmp3, carry, rcon2) ++ ++static void ++native_key_expansion192(AESContext *cx, const unsigned char *key) ++{ ++ __m128i *keySchedule = cx->keySchedule; ++ pre_align __m128i tmp1 post_align; ++ pre_align __m128i tmp2 post_align; ++ pre_align __m128i tmp3 post_align; ++ pre_align __m128i carry post_align; ++ keySchedule[0] = _mm_loadu_si128((__m128i *)key); ++ keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16)); ++ EXPAND_KEY192(keySchedule[0], keySchedule[1], keySchedule[2], ++ keySchedule[3], carry, 0x1, 0x2); ++ EXPAND_KEY192_PART2(keySchedule[4], carry, keySchedule[3]); ++ EXPAND_KEY192(keySchedule[3], keySchedule[4], keySchedule[5], ++ keySchedule[6], carry, 0x4, 0x8); ++ EXPAND_KEY192_PART2(keySchedule[7], carry, keySchedule[6]); ++ EXPAND_KEY192(keySchedule[6], keySchedule[7], keySchedule[8], ++ keySchedule[9], carry, 0x10, 0x20); ++ EXPAND_KEY192_PART2(keySchedule[10], carry, keySchedule[9]); ++ EXPAND_KEY192(keySchedule[9], keySchedule[10], keySchedule[11], ++ keySchedule[12], carry, 0x40, 0x80); ++} ++ ++#define EXPAND_KEY256_PART(res, rconx, k1x, k2x, X) \ ++ tmp_key = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(k2x, rconx), X); \ ++ tmp2 = _mm_slli_si128(k1x, 4); \ ++ tmp1 = _mm_xor_si128(k1x, tmp2); \ ++ tmp2 = _mm_slli_si128(tmp2, 4); \ ++ tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \ ++ res = _mm_xor_si128(tmp1, tmp_key); ++ ++#define EXPAND_KEY256(res1, res2, k1, k2, rcon) \ ++ EXPAND_KEY256_PART(res1, rcon, k1, k2, 0xFF); \ ++ EXPAND_KEY256_PART(res2, 0x00, k2, res1, 0xAA) ++ ++static void ++native_key_expansion256(AESContext *cx, const unsigned char *key) ++{ ++ __m128i *keySchedule = cx->keySchedule; ++ pre_align __m128i tmp_key post_align; ++ pre_align __m128i tmp1 post_align; ++ pre_align __m128i tmp2 post_align; ++ keySchedule[0] = _mm_loadu_si128((__m128i *)key); ++ keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16)); ++ EXPAND_KEY256(keySchedule[2], keySchedule[3], keySchedule[0], ++ keySchedule[1], 0x01); ++ EXPAND_KEY256(keySchedule[4], keySchedule[5], keySchedule[2], ++ keySchedule[3], 0x02); ++ EXPAND_KEY256(keySchedule[6], keySchedule[7], keySchedule[4], ++ keySchedule[5], 0x04); ++ EXPAND_KEY256(keySchedule[8], keySchedule[9], keySchedule[6], ++ keySchedule[7], 0x08); ++ EXPAND_KEY256(keySchedule[10], keySchedule[11], keySchedule[8], ++ keySchedule[9], 0x10); ++ EXPAND_KEY256(keySchedule[12], keySchedule[13], keySchedule[10], ++ keySchedule[11], 0x20); ++ EXPAND_KEY256_PART(keySchedule[14], 0x40, keySchedule[12], ++ keySchedule[13], 0xFF); ++} ++ ++/* ++ * AES key expansion using aes-ni instructions. ++ */ ++void ++rijndael_native_key_expansion(AESContext *cx, const unsigned char *key, ++ unsigned int Nk) ++{ ++ switch (Nk) { ++ case 4: ++ native_key_expansion128(cx, key); ++ return; ++ case 6: ++ native_key_expansion192(cx, key); ++ return; ++ case 8: ++ native_key_expansion256(cx, key); ++ return; ++ default: ++ /* This shouldn't happen (checked by the caller). */ ++ return; ++ } ++} ++ ++void ++rijndael_native_encryptBlock(AESContext *cx, ++ unsigned char *output, ++ const unsigned char *input) ++{ ++ int i; ++ pre_align __m128i m post_align = _mm_loadu_si128((__m128i *)input); ++ m = _mm_xor_si128(m, cx->keySchedule[0]); ++ for (i = 1; i < cx->Nr; ++i) { ++ m = _mm_aesenc_si128(m, cx->keySchedule[i]); ++ } ++ m = _mm_aesenclast_si128(m, cx->keySchedule[cx->Nr]); ++ _mm_storeu_si128((__m128i *)output, m); ++} +diff --git lib/freebl/freebl.gyp lib/freebl/freebl.gyp +index 1e93475004..5f59eef29c 100644 +--- lib/freebl/freebl.gyp ++++ lib/freebl/freebl.gyp +@@ -22,6 +22,37 @@ + '-mssse3' + ] + }, ++ { ++ 'target_name': 'gcm-aes-x86_c_lib', ++ 'type': 'static_library', ++ 'sources': [ ++ 'gcm-x86.c', 'aes-x86.c' ++ ], ++ 'dependencies': [ ++ '<(DEPTH)/exports.gyp:nss_exports' ++ ], ++ # Enable isa option for pclmul and aes-ni; supported since gcc 4.4. ++ # This is only supported by x84/x64. It's not needed for Windows, ++ # unless clang-cl is used. ++ 'cflags_mozilla': [ ++ '-mpclmul', '-maes' ++ ], ++ 'conditions': [ ++ [ 'OS=="linux" or OS=="android" or OS=="dragonfly" or OS=="freebsd" or OS=="netbsd" or OS=="openbsd"', { ++ 'cflags': [ ++ '-mpclmul', '-maes' ++ ], ++ }], ++ # macOS build doesn't use cflags. ++ [ 'OS=="mac"', { ++ 'xcode_settings': { ++ 'OTHER_CFLAGS': [ ++ '-mpclmul', '-maes' ++ ], ++ }, ++ }] ++ ] ++ }, + { + 'target_name': 'freebl', + 'type': 'static_library', +@@ -45,6 +76,11 @@ + '<(DEPTH)/exports.gyp:nss_exports', + ], + 'conditions': [ ++ [ 'target_arch=="ia32" or target_arch=="x64"', { ++ 'dependencies': [ ++ 'gcm-aes-x86_c_lib' ++ ], ++ }], + [ 'OS=="linux"', { + 'defines!': [ + 'FREEBL_NO_DEPEND', +@@ -76,6 +112,11 @@ + '<(DEPTH)/exports.gyp:nss_exports', + ], + 'conditions': [ ++ [ 'target_arch=="ia32" or target_arch=="x64"', { ++ 'dependencies': [ ++ 'gcm-aes-x86_c_lib' ++ ] ++ }], + [ 'OS!="linux" and OS!="android"', { + 'conditions': [ + [ 'moz_fold_libs==0', { +@@ -154,27 +195,11 @@ + 'MP_API_COMPATIBLE' + ], + 'conditions': [ +- [ 'target_arch=="ia32" or target_arch=="x64"', { +- 'cflags_mozilla': [ +- '-mpclmul', +- '-maes', +- ], +- 'conditions': [ +- [ 'OS=="dragonfly" or OS=="freebsd" or OS=="netbsd" or OS=="openbsd"', { +- 'cflags': [ +- '-mpclmul', +- '-maes', +- ], +- }], +- ], +- }], + [ 'OS=="mac"', { + 'xcode_settings': { + # I'm not sure since when this is supported. + # But I hope that doesn't matter. We also assume this is x86/x64. + 'OTHER_CFLAGS': [ +- '-mpclmul', +- '-maes', + '-std=gnu99', + ], + }, +@@ -268,14 +293,6 @@ + 'MP_USE_UINT_DIGIT', + ], + }], +- [ 'target_arch=="ia32" or target_arch=="x64"', { +- 'cflags': [ +- # enable isa option for pclmul am aes-ni; supported since gcc 4.4 +- # This is only support by x84/x64. It's not needed for Windows. +- '-mpclmul', +- '-maes', +- ], +- }], + [ 'target_arch=="arm"', { + 'defines': [ + 'MP_ASSEMBLY_MULTIPLY', +diff --git lib/freebl/gcm-x86.c lib/freebl/gcm-x86.c +new file mode 100644 +index 0000000000..e34d633943 +--- /dev/null ++++ lib/freebl/gcm-x86.c +@@ -0,0 +1,127 @@ ++/* This Source Code Form is subject to the terms of the Mozilla Public ++ * License, v. 2.0. If a copy of the MPL was not distributed with this ++ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ ++ ++#ifdef FREEBL_NO_DEPEND ++#include "stubs.h" ++#endif ++#include "gcm.h" ++#include "secerr.h" ++ ++#include <wmmintrin.h> /* clmul */ ++ ++#define WRITE64(x, bytes) \ ++ (bytes)[0] = (x) >> 56; \ ++ (bytes)[1] = (x) >> 48; \ ++ (bytes)[2] = (x) >> 40; \ ++ (bytes)[3] = (x) >> 32; \ ++ (bytes)[4] = (x) >> 24; \ ++ (bytes)[5] = (x) >> 16; \ ++ (bytes)[6] = (x) >> 8; \ ++ (bytes)[7] = (x); ++ ++SECStatus ++gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf) ++{ ++ uint64_t tmp_out[2]; ++ _mm_storeu_si128((__m128i *)tmp_out, ghash->x); ++ /* maxout must be larger than 16 byte (checked by the caller). */ ++ WRITE64(tmp_out[0], outbuf + 8); ++ WRITE64(tmp_out[1], outbuf); ++ return SECSuccess; ++} ++ ++SECStatus ++gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf, ++ unsigned int count) ++{ ++ size_t i; ++ pre_align __m128i z_high post_align; ++ pre_align __m128i z_low post_align; ++ pre_align __m128i C post_align; ++ pre_align __m128i D post_align; ++ pre_align __m128i E post_align; ++ pre_align __m128i F post_align; ++ pre_align __m128i bin post_align; ++ pre_align __m128i Ci post_align; ++ pre_align __m128i tmp post_align; ++ ++ for (i = 0; i < count; i++, buf += 16) { ++ bin = _mm_set_epi16(((uint16_t)buf[0] << 8) | buf[1], ++ ((uint16_t)buf[2] << 8) | buf[3], ++ ((uint16_t)buf[4] << 8) | buf[5], ++ ((uint16_t)buf[6] << 8) | buf[7], ++ ((uint16_t)buf[8] << 8) | buf[9], ++ ((uint16_t)buf[10] << 8) | buf[11], ++ ((uint16_t)buf[12] << 8) | buf[13], ++ ((uint16_t)buf[14] << 8) | buf[15]); ++ Ci = _mm_xor_si128(bin, ghash->x); ++ ++ /* Do binary mult ghash->X = Ci * ghash->H. */ ++ C = _mm_clmulepi64_si128(Ci, ghash->h, 0x00); ++ D = _mm_clmulepi64_si128(Ci, ghash->h, 0x11); ++ E = _mm_clmulepi64_si128(Ci, ghash->h, 0x01); ++ F = _mm_clmulepi64_si128(Ci, ghash->h, 0x10); ++ tmp = _mm_xor_si128(E, F); ++ z_high = _mm_xor_si128(tmp, _mm_slli_si128(D, 8)); ++ z_high = _mm_unpackhi_epi64(z_high, D); ++ z_low = _mm_xor_si128(_mm_slli_si128(tmp, 8), C); ++ z_low = _mm_unpackhi_epi64(_mm_slli_si128(C, 8), z_low); ++ ++ /* Shift one to the left (multiply by x) as gcm spec is stupid. */ ++ C = _mm_slli_si128(z_low, 8); ++ E = _mm_srli_epi64(C, 63); ++ D = _mm_slli_si128(z_high, 8); ++ F = _mm_srli_epi64(D, 63); ++ /* Carry over */ ++ C = _mm_srli_si128(z_low, 8); ++ D = _mm_srli_epi64(C, 63); ++ z_low = _mm_or_si128(_mm_slli_epi64(z_low, 1), E); ++ z_high = _mm_or_si128(_mm_or_si128(_mm_slli_epi64(z_high, 1), F), D); ++ ++ /* Reduce */ ++ C = _mm_slli_si128(z_low, 8); ++ /* D = z_low << 127 */ ++ D = _mm_slli_epi64(C, 63); ++ /* E = z_low << 126 */ ++ E = _mm_slli_epi64(C, 62); ++ /* F = z_low << 121 */ ++ F = _mm_slli_epi64(C, 57); ++ /* z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121); */ ++ z_low = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(z_low, D), E), F); ++ C = _mm_srli_si128(z_low, 8); ++ /* D = z_low >> 1 */ ++ D = _mm_slli_epi64(C, 63); ++ D = _mm_or_si128(_mm_srli_epi64(z_low, 1), D); ++ /* E = z_low >> 2 */ ++ E = _mm_slli_epi64(C, 62); ++ E = _mm_or_si128(_mm_srli_epi64(z_low, 2), E); ++ /* F = z_low >> 7 */ ++ F = _mm_slli_epi64(C, 57); ++ F = _mm_or_si128(_mm_srli_epi64(z_low, 7), F); ++ /* ghash->x ^= z_low ^ (z_low >> 1) ^ (z_low >> 2) ^ (z_low >> 7); */ ++ ghash->x = _mm_xor_si128(_mm_xor_si128( ++ _mm_xor_si128(_mm_xor_si128(z_high, z_low), D), E), ++ F); ++ } ++ return SECSuccess; ++} ++ ++SECStatus ++gcm_HashInit_hw(gcmHashContext *ghash) ++{ ++ ghash->ghash_mul = gcm_HashMult_hw; ++ ghash->x = _mm_setzero_si128(); ++ /* MSVC requires __m64 to load epi64. */ ++ ghash->h = _mm_set_epi32(ghash->h_high >> 32, (uint32_t)ghash->h_high, ++ ghash->h_low >> 32, (uint32_t)ghash->h_low); ++ ghash->hw = PR_TRUE; ++ return SECSuccess; ++} ++ ++SECStatus ++gcm_HashZeroX_hw(gcmHashContext *ghash) ++{ ++ ghash->x = _mm_setzero_si128(); ++ return SECSuccess; ++} +diff --git lib/freebl/gcm.c lib/freebl/gcm.c +index 780b7a6322..f1e16da78e 100644 +--- lib/freebl/gcm.c ++++ lib/freebl/gcm.c +@@ -17,18 +17,50 @@ + + #include <limits.h> + +-#ifdef NSS_X86_OR_X64 +-#include <wmmintrin.h> /* clmul */ +-#endif +- + /* Forward declarations */ ++SECStatus gcm_HashInit_hw(gcmHashContext *ghash); ++SECStatus gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf); + SECStatus gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf, + unsigned int count); ++SECStatus gcm_HashZeroX_hw(gcmHashContext *ghash); + SECStatus gcm_HashMult_sftw(gcmHashContext *ghash, const unsigned char *buf, + unsigned int count); + SECStatus gcm_HashMult_sftw32(gcmHashContext *ghash, const unsigned char *buf, + unsigned int count); + ++/* Stub definitions for the above *_hw functions, which shouldn't be ++ * used unless NSS_X86_OR_X64 is defined */ ++#ifndef NSS_X86_OR_X64 ++SECStatus ++gcm_HashWrite_hw(gcmHashContext *ghash, unsigned char *outbuf) ++{ ++ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); ++ return SECFailure; ++} ++ ++SECStatus ++gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf, ++ unsigned int count) ++{ ++ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); ++ return SECFailure; ++} ++ ++SECStatus ++gcm_HashInit_hw(gcmHashContext *ghash) ++{ ++ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); ++ return SECFailure; ++} ++ ++SECStatus ++gcm_HashZeroX_hw(gcmHashContext *ghash) ++{ ++ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); ++ return SECFailure; ++} ++#endif /* NSS_X86_OR_X64 */ ++ + uint64_t + get64(const unsigned char *bytes) + { +@@ -46,6 +78,8 @@ get64(const unsigned char *bytes) + SECStatus + gcmHash_InitContext(gcmHashContext *ghash, const unsigned char *H, PRBool sw) + { ++ SECStatus rv = SECSuccess; ++ + ghash->cLen = 0; + ghash->bufLen = 0; + PORT_Memset(ghash->counterBuf, 0, sizeof(ghash->counterBuf)); +@@ -53,17 +87,7 @@ gcmHash_InitContext(gcmHashContext *ghash, const unsigned char *H, PRBool sw) + ghash->h_low = get64(H + 8); + ghash->h_high = get64(H); + if (clmul_support() && !sw) { +-#ifdef NSS_X86_OR_X64 +- ghash->ghash_mul = gcm_HashMult_hw; +- ghash->x = _mm_setzero_si128(); +- /* MSVC requires __m64 to load epi64. */ +- ghash->h = _mm_set_epi32(ghash->h_high >> 32, (uint32_t)ghash->h_high, +- ghash->h_low >> 32, (uint32_t)ghash->h_low); +- ghash->hw = PR_TRUE; +-#else +- PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); +- return SECFailure; +-#endif /* NSS_X86_OR_X64 */ ++ rv = gcm_HashInit_hw(ghash); + } else { + /* We fall back to the software implementation if we can't use / don't + * want to use pclmul. */ +@@ -75,7 +99,7 @@ gcmHash_InitContext(gcmHashContext *ghash, const unsigned char *H, PRBool sw) + ghash->x_high = ghash->x_low = 0; + ghash->hw = PR_FALSE; + } +- return SECSuccess; ++ return rv; + } + + #ifdef HAVE_INT128_SUPPORT +@@ -283,102 +307,17 @@ gcm_HashMult_sftw32(gcmHashContext *ghash, const unsigned char *buf, + } + #endif /* HAVE_INT128_SUPPORT */ + +-SECStatus +-gcm_HashMult_hw(gcmHashContext *ghash, const unsigned char *buf, +- unsigned int count) +-{ +-#ifdef NSS_X86_OR_X64 +- size_t i; +- pre_align __m128i z_high post_align; +- pre_align __m128i z_low post_align; +- pre_align __m128i C post_align; +- pre_align __m128i D post_align; +- pre_align __m128i E post_align; +- pre_align __m128i F post_align; +- pre_align __m128i bin post_align; +- pre_align __m128i Ci post_align; +- pre_align __m128i tmp post_align; +- +- for (i = 0; i < count; i++, buf += 16) { +- bin = _mm_set_epi16(((uint16_t)buf[0] << 8) | buf[1], +- ((uint16_t)buf[2] << 8) | buf[3], +- ((uint16_t)buf[4] << 8) | buf[5], +- ((uint16_t)buf[6] << 8) | buf[7], +- ((uint16_t)buf[8] << 8) | buf[9], +- ((uint16_t)buf[10] << 8) | buf[11], +- ((uint16_t)buf[12] << 8) | buf[13], +- ((uint16_t)buf[14] << 8) | buf[15]); +- Ci = _mm_xor_si128(bin, ghash->x); +- +- /* Do binary mult ghash->X = Ci * ghash->H. */ +- C = _mm_clmulepi64_si128(Ci, ghash->h, 0x00); +- D = _mm_clmulepi64_si128(Ci, ghash->h, 0x11); +- E = _mm_clmulepi64_si128(Ci, ghash->h, 0x01); +- F = _mm_clmulepi64_si128(Ci, ghash->h, 0x10); +- tmp = _mm_xor_si128(E, F); +- z_high = _mm_xor_si128(tmp, _mm_slli_si128(D, 8)); +- z_high = _mm_unpackhi_epi64(z_high, D); +- z_low = _mm_xor_si128(_mm_slli_si128(tmp, 8), C); +- z_low = _mm_unpackhi_epi64(_mm_slli_si128(C, 8), z_low); +- +- /* Shift one to the left (multiply by x) as gcm spec is stupid. */ +- C = _mm_slli_si128(z_low, 8); +- E = _mm_srli_epi64(C, 63); +- D = _mm_slli_si128(z_high, 8); +- F = _mm_srli_epi64(D, 63); +- /* Carry over */ +- C = _mm_srli_si128(z_low, 8); +- D = _mm_srli_epi64(C, 63); +- z_low = _mm_or_si128(_mm_slli_epi64(z_low, 1), E); +- z_high = _mm_or_si128(_mm_or_si128(_mm_slli_epi64(z_high, 1), F), D); +- +- /* Reduce */ +- C = _mm_slli_si128(z_low, 8); +- /* D = z_low << 127 */ +- D = _mm_slli_epi64(C, 63); +- /* E = z_low << 126 */ +- E = _mm_slli_epi64(C, 62); +- /* F = z_low << 121 */ +- F = _mm_slli_epi64(C, 57); +- /* z_low ^= (z_low << 127) ^ (z_low << 126) ^ (z_low << 121); */ +- z_low = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(z_low, D), E), F); +- C = _mm_srli_si128(z_low, 8); +- /* D = z_low >> 1 */ +- D = _mm_slli_epi64(C, 63); +- D = _mm_or_si128(_mm_srli_epi64(z_low, 1), D); +- /* E = z_low >> 2 */ +- E = _mm_slli_epi64(C, 62); +- E = _mm_or_si128(_mm_srli_epi64(z_low, 2), E); +- /* F = z_low >> 7 */ +- F = _mm_slli_epi64(C, 57); +- F = _mm_or_si128(_mm_srli_epi64(z_low, 7), F); +- /* ghash->x ^= z_low ^ (z_low >> 1) ^ (z_low >> 2) ^ (z_low >> 7); */ +- ghash->x = _mm_xor_si128(_mm_xor_si128( +- _mm_xor_si128(_mm_xor_si128(z_high, z_low), D), E), +- F); +- } +- return SECSuccess; +-#else +- PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); +- return SECFailure; +-#endif /* NSS_X86_OR_X64 */ +-} +- + static SECStatus + gcm_zeroX(gcmHashContext *ghash) + { ++ SECStatus rv = SECSuccess; ++ + if (ghash->hw) { +-#ifdef NSS_X86_OR_X64 +- ghash->x = _mm_setzero_si128(); +- return SECSuccess; +-#else +- PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); +- return SECFailure; +-#endif /* NSS_X86_OR_X64 */ ++ rv = gcm_HashZeroX_hw(ghash); + } + + ghash->x_high = ghash->x_low = 0; +- return SECSuccess; ++ return rv; + } + + /* +@@ -503,15 +442,10 @@ gcmHash_Final(gcmHashContext *ghash, unsigned char *outbuf, + } + + if (ghash->hw) { +-#ifdef NSS_X86_OR_X64 +- uint64_t tmp_out[2]; +- _mm_storeu_si128((__m128i *)tmp_out, ghash->x); +- WRITE64(tmp_out[0], T + 8); +- WRITE64(tmp_out[1], T); +-#else +- PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); +- return SECFailure; +-#endif /* NSS_X86_OR_X64 */ ++ rv = gcm_HashWrite_hw(ghash, T); ++ if (rv != SECSuccess) { ++ goto cleanup; ++ } + } else { + WRITE64(ghash->x_low, T + 8); + WRITE64(ghash->x_high, T); +diff --git lib/freebl/gcm.h lib/freebl/gcm.h +index 0c707a0811..42ef0f7179 100644 +--- lib/freebl/gcm.h ++++ lib/freebl/gcm.h +@@ -9,7 +9,21 @@ + #include <stdint.h> + + #ifdef NSS_X86_OR_X64 ++/* GCC <= 4.8 doesn't support including emmintrin.h without enabling SSE2 */ ++#if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \ ++ (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 8)) ++#pragma GCC push_options ++#pragma GCC target("sse2") ++#undef NSS_DISABLE_SSE2 ++#define NSS_DISABLE_SSE2 1 ++#endif /* GCC <= 4.8 */ ++ + #include <emmintrin.h> /* __m128i */ ++ ++#ifdef NSS_DISABLE_SSE2 ++#undef NSS_DISABLE_SSE2 ++#pragma GCC pop_options ++#endif /* NSS_DISABLE_SSE2 */ + #endif + + SEC_BEGIN_PROTOS +diff --git lib/freebl/rijndael.c lib/freebl/rijndael.c +index a09f13098e..5de27de9ce 100644 +--- lib/freebl/rijndael.c ++++ lib/freebl/rijndael.c +@@ -27,6 +27,34 @@ + #include "intel-gcm.h" + #endif /* INTEL_GCM */ + ++/* Forward declarations */ ++void rijndael_native_key_expansion(AESContext *cx, const unsigned char *key, ++ unsigned int Nk); ++void rijndael_native_encryptBlock(AESContext *cx, ++ unsigned char *output, ++ const unsigned char *input); ++ ++/* Stub definitions for the above rijndael_native_* functions, which ++ * shouldn't be used unless NSS_X86_OR_X64 is defined */ ++#ifndef NSS_X86_OR_X64 ++void ++rijndael_native_key_expansion(AESContext *cx, const unsigned char *key, ++ unsigned int Nk) ++{ ++ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); ++ PORT_Assert(0); ++} ++ ++void ++rijndael_native_encryptBlock(AESContext *cx, ++ unsigned char *output, ++ const unsigned char *input) ++{ ++ PORT_SetError(SEC_ERROR_LIBRARY_FAILURE); ++ PORT_Assert(0); ++} ++#endif /* NSS_X86_OR_X64 */ ++ + /* + * There are currently three ways to build this code, varying in performance + * and code size. +@@ -309,162 +337,6 @@ rijndael_key_expansion7(AESContext *cx, const unsigned char *key, unsigned int N + } + } + +-#if defined(NSS_X86_OR_X64) +-#define EXPAND_KEY128(k, rcon, res) \ +- tmp_key = _mm_aeskeygenassist_si128(k, rcon); \ +- tmp_key = _mm_shuffle_epi32(tmp_key, 0xFF); \ +- tmp = _mm_xor_si128(k, _mm_slli_si128(k, 4)); \ +- tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \ +- tmp = _mm_xor_si128(tmp, _mm_slli_si128(tmp, 4)); \ +- res = _mm_xor_si128(tmp, tmp_key) +- +-static void +-native_key_expansion128(AESContext *cx, const unsigned char *key) +-{ +- __m128i *keySchedule = cx->keySchedule; +- pre_align __m128i tmp_key post_align; +- pre_align __m128i tmp post_align; +- keySchedule[0] = _mm_loadu_si128((__m128i *)key); +- EXPAND_KEY128(keySchedule[0], 0x01, keySchedule[1]); +- EXPAND_KEY128(keySchedule[1], 0x02, keySchedule[2]); +- EXPAND_KEY128(keySchedule[2], 0x04, keySchedule[3]); +- EXPAND_KEY128(keySchedule[3], 0x08, keySchedule[4]); +- EXPAND_KEY128(keySchedule[4], 0x10, keySchedule[5]); +- EXPAND_KEY128(keySchedule[5], 0x20, keySchedule[6]); +- EXPAND_KEY128(keySchedule[6], 0x40, keySchedule[7]); +- EXPAND_KEY128(keySchedule[7], 0x80, keySchedule[8]); +- EXPAND_KEY128(keySchedule[8], 0x1B, keySchedule[9]); +- EXPAND_KEY128(keySchedule[9], 0x36, keySchedule[10]); +-} +- +-#define EXPAND_KEY192_PART1(res, k0, kt, rcon) \ +- tmp2 = _mm_slli_si128(k0, 4); \ +- tmp1 = _mm_xor_si128(k0, tmp2); \ +- tmp2 = _mm_slli_si128(tmp2, 4); \ +- tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \ +- tmp2 = _mm_aeskeygenassist_si128(kt, rcon); \ +- res = _mm_xor_si128(tmp1, _mm_shuffle_epi32(tmp2, 0x55)) +- +-#define EXPAND_KEY192_PART2(res, k1, k2) \ +- tmp2 = _mm_xor_si128(k1, _mm_slli_si128(k1, 4)); \ +- res = _mm_xor_si128(tmp2, _mm_shuffle_epi32(k2, 0xFF)) +- +-#define EXPAND_KEY192(k0, res1, res2, res3, carry, rcon1, rcon2) \ +- EXPAND_KEY192_PART1(tmp3, k0, res1, rcon1); \ +- EXPAND_KEY192_PART2(carry, res1, tmp3); \ +- res1 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(res1), \ +- _mm_castsi128_pd(tmp3), 0)); \ +- res2 = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp3), \ +- _mm_castsi128_pd(carry), 1)); \ +- EXPAND_KEY192_PART1(res3, tmp3, carry, rcon2) +- +-static void +-native_key_expansion192(AESContext *cx, const unsigned char *key) +-{ +- __m128i *keySchedule = cx->keySchedule; +- pre_align __m128i tmp1 post_align; +- pre_align __m128i tmp2 post_align; +- pre_align __m128i tmp3 post_align; +- pre_align __m128i carry post_align; +- keySchedule[0] = _mm_loadu_si128((__m128i *)key); +- keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16)); +- EXPAND_KEY192(keySchedule[0], keySchedule[1], keySchedule[2], +- keySchedule[3], carry, 0x1, 0x2); +- EXPAND_KEY192_PART2(keySchedule[4], carry, keySchedule[3]); +- EXPAND_KEY192(keySchedule[3], keySchedule[4], keySchedule[5], +- keySchedule[6], carry, 0x4, 0x8); +- EXPAND_KEY192_PART2(keySchedule[7], carry, keySchedule[6]); +- EXPAND_KEY192(keySchedule[6], keySchedule[7], keySchedule[8], +- keySchedule[9], carry, 0x10, 0x20); +- EXPAND_KEY192_PART2(keySchedule[10], carry, keySchedule[9]); +- EXPAND_KEY192(keySchedule[9], keySchedule[10], keySchedule[11], +- keySchedule[12], carry, 0x40, 0x80); +-} +- +-#define EXPAND_KEY256_PART(res, rconx, k1x, k2x, X) \ +- tmp_key = _mm_shuffle_epi32(_mm_aeskeygenassist_si128(k2x, rconx), X); \ +- tmp2 = _mm_slli_si128(k1x, 4); \ +- tmp1 = _mm_xor_si128(k1x, tmp2); \ +- tmp2 = _mm_slli_si128(tmp2, 4); \ +- tmp1 = _mm_xor_si128(_mm_xor_si128(tmp1, tmp2), _mm_slli_si128(tmp2, 4)); \ +- res = _mm_xor_si128(tmp1, tmp_key); +- +-#define EXPAND_KEY256(res1, res2, k1, k2, rcon) \ +- EXPAND_KEY256_PART(res1, rcon, k1, k2, 0xFF); \ +- EXPAND_KEY256_PART(res2, 0x00, k2, res1, 0xAA) +- +-static void +-native_key_expansion256(AESContext *cx, const unsigned char *key) +-{ +- __m128i *keySchedule = cx->keySchedule; +- pre_align __m128i tmp_key post_align; +- pre_align __m128i tmp1 post_align; +- pre_align __m128i tmp2 post_align; +- keySchedule[0] = _mm_loadu_si128((__m128i *)key); +- keySchedule[1] = _mm_loadu_si128((__m128i *)(key + 16)); +- EXPAND_KEY256(keySchedule[2], keySchedule[3], keySchedule[0], +- keySchedule[1], 0x01); +- EXPAND_KEY256(keySchedule[4], keySchedule[5], keySchedule[2], +- keySchedule[3], 0x02); +- EXPAND_KEY256(keySchedule[6], keySchedule[7], keySchedule[4], +- keySchedule[5], 0x04); +- EXPAND_KEY256(keySchedule[8], keySchedule[9], keySchedule[6], +- keySchedule[7], 0x08); +- EXPAND_KEY256(keySchedule[10], keySchedule[11], keySchedule[8], +- keySchedule[9], 0x10); +- EXPAND_KEY256(keySchedule[12], keySchedule[13], keySchedule[10], +- keySchedule[11], 0x20); +- EXPAND_KEY256_PART(keySchedule[14], 0x40, keySchedule[12], +- keySchedule[13], 0xFF); +-} +- +-#endif /* NSS_X86_OR_X64 */ +- +-/* +- * AES key expansion using aes-ni instructions. +- */ +-static void +-native_key_expansion(AESContext *cx, const unsigned char *key, unsigned int Nk) +-{ +-#ifdef NSS_X86_OR_X64 +- switch (Nk) { +- case 4: +- native_key_expansion128(cx, key); +- return; +- case 6: +- native_key_expansion192(cx, key); +- return; +- case 8: +- native_key_expansion256(cx, key); +- return; +- default: +- /* This shouldn't happen. */ +- PORT_Assert(0); +- } +-#else +- PORT_Assert(0); +-#endif /* NSS_X86_OR_X64 */ +-} +- +-static void +-native_encryptBlock(AESContext *cx, +- unsigned char *output, +- const unsigned char *input) +-{ +-#ifdef NSS_X86_OR_X64 +- int i; +- pre_align __m128i m post_align = _mm_loadu_si128((__m128i *)input); +- m = _mm_xor_si128(m, cx->keySchedule[0]); +- for (i = 1; i < cx->Nr; ++i) { +- m = _mm_aesenc_si128(m, cx->keySchedule[i]); +- } +- m = _mm_aesenclast_si128(m, cx->keySchedule[cx->Nr]); +- _mm_storeu_si128((__m128i *)output, m); +-#else +- PORT_Assert(0); +-#endif /* NSS_X86_OR_X64 */ +-} +- + /* rijndael_key_expansion + * + * Generate the expanded key from the key input by the user. +@@ -830,7 +702,7 @@ rijndael_encryptECB(AESContext *cx, unsigned char *output, + + if (aesni_support()) { + /* Use hardware acceleration for normal AES parameters. */ +- encryptor = &native_encryptBlock; ++ encryptor = &rijndael_native_encryptBlock; + } else { + encryptor = &rijndael_encryptBlock128; + } +@@ -1026,7 +898,7 @@ aes_InitContext(AESContext *cx, const unsigned char *key, unsigned int keysize, + cx->mode == NSS_AES_CTR)) { + PORT_Assert(keysize == 16 || keysize == 24 || keysize == 32); + /* Prepare hardware key for normal AES parameters. */ +- native_key_expansion(cx, key, Nk); ++ rijndael_native_key_expansion(cx, key, Nk); + } else { + rijndael_key_expansion(cx, key, Nk); + } +diff --git lib/freebl/rijndael.h lib/freebl/rijndael.h +index 1f4a8a9f73..1b63a323da 100644 +--- lib/freebl/rijndael.h ++++ lib/freebl/rijndael.h +@@ -8,8 +8,22 @@ + #include "blapii.h" + #include <stdint.h> + +-#ifdef NSS_X86_OR_X64 +-#include <wmmintrin.h> /* aes-ni */ ++#if defined(NSS_X86_OR_X64) ++/* GCC <= 4.8 doesn't support including emmintrin.h without enabling SSE2 */ ++#if !defined(__clang__) && defined(__GNUC__) && defined(__GNUC_MINOR__) && \ ++ (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 8)) ++#pragma GCC push_options ++#pragma GCC target("sse2") ++#undef NSS_DISABLE_SSE2 ++#define NSS_DISABLE_SSE2 1 ++#endif /* GCC <= 4.8 */ ++ ++#include <emmintrin.h> /* __m128i */ ++ ++#ifdef NSS_DISABLE_SSE2 ++#undef NSS_DISABLE_SSE2 ++#pragma GCC pop_options ++#endif /* NSS_DISABLE_SSE2 */ + #endif + + typedef void AESBlockFunc(AESContext *cx, |