// rijndael.cpp - modified by Chris Morgan <cmorgan@wpi.edu>
// and Wei Dai from Paulo Barreto's Rijndael implementation
// The original code and all modifications are in the public domain.

// use "cl /EP /P /DCRYPTOPP_GENERATE_X64_MASM rijndael.cpp" to generate MASM code

/*
Feb 2009: The x86/x64 assembly code was rewritten by Wei Dai to do counter mode
caching, which was invented by Hongjun Wu and popularized by Daniel J. Bernstein
and Peter Schwabe in their paper "New AES software speed records". The round
function was also modified to include a trick similar to one in Brian Gladman's
x86 assembly code, doing an 8-bit register move to minimize the number of
register spills. Also switched to compressed tables and copying round keys to
the stack.

The C++ implementation now uses compressed tables if
CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined.
*/

/*
July 2006: Defense against timing attacks was added by Wei Dai.

The code now uses smaller tables in the first and last rounds,
and preloads them into L1 cache before usage (by loading at least
one element in each cache line).

We try to delay subsequent accesses to each table (used in the first
and last rounds) until all of the table has been preloaded. Hopefully
the compiler isn't smart enough to optimize that code away.

After preloading the table, we also try not to access any memory location
other than the table and the stack, in order to prevent table entries from
being unloaded from L1 cache, until that round is finished.
(Some popular CPUs have 2-way associative caches.)
*/

// This is the original introductory comment:

/**
 * version 3.0 (December 2000)
 *
 * Optimised ANSI C code for the Rijndael cipher (now AES)
 *
 * author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
 * author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
 * author Paulo Barreto <paulo.barreto@terra.com.br>
 *
 * This code is hereby placed in the public domain.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "pch.h"

#ifndef CRYPTOPP_IMPORTS
#ifndef CRYPTOPP_GENERATE_X64_MASM

#include "rijndael.h"
#include "misc.h"
#include "cpu.h"

#ifdef __sun
#include <alloca.h>
#endif

NAMESPACE_BEGIN(CryptoPP)

#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
using namespace rdtable;
#else
static word64 Te[256];
#endif
static word64 Td[256];
#else
static word32 Te[256*4], Td[256*4];
#endif
static bool s_TeFilled = false, s_TdFilled = false;

// ************************* Portable Code ************************************

#define QUARTER_ROUND(L, T, t, a, b, c, d)	\
	a ^= L(T, 3, byte(t)); t >>= 8;\
	b ^= L(T, 2, byte(t)); t >>= 8;\
	c ^= L(T, 1, byte(t)); t >>= 8;\
	d ^= L(T, 0, t);

#define QUARTER_ROUND_LE(t, a, b, c, d)	\
	tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
	tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
	tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
	tempBlock[d] = ((byte *)(Te+t))[1];

#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
	#define QUARTER_ROUND_LD(t, a, b, c, d)	\
		tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
		tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
		tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
		tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
#else
	#define QUARTER_ROUND_LD(t, a, b, c, d)	\
		tempBlock[a] = Sd[byte(t)]; t >>= 8;\
		tempBlock[b] = Sd[byte(t)]; t >>= 8;\
		tempBlock[c] = Sd[byte(t)]; t >>= 8;\
		tempBlock[d] = Sd[t];
#endif

#define QUARTER_ROUND_E(t, a, b, c, d)	QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
#define QUARTER_ROUND_D(t, a, b, c, d)	QUARTER_ROUND(TL_M, Td, t, a, b, c, d)

#ifdef IS_LITTLE_ENDIAN
	#define QUARTER_ROUND_FE(t, a, b, c, d)	QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
	#define QUARTER_ROUND_FD(t, a, b, c, d)	QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
	#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
		#define TL_F(T, i, x)	(*(word32 *)((byte *)T + x*8 + (6-i)%4+1))
		#define TL_M(T, i, x)	(*(word32 *)((byte *)T + x*8 + (i+3)%4+1))
	#else
		#define TL_F(T, i, x)	rotrFixed(T[x], (3-i)*8)
		#define TL_M(T, i, x)	T[i*256 + x]
	#endif
#else
	#define QUARTER_ROUND_FE(t, a, b, c, d)	QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
	#define QUARTER_ROUND_FD(t, a, b, c, d)	QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
	#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
		#define TL_F(T, i, x)	(*(word32 *)((byte *)T + x*8 + (4-i)%4))
		#define TL_M			TL_F
	#else
		#define TL_F(T, i, x)	rotrFixed(T[x], i*8)
		#define TL_M(T, i, x)	T[i*256 + x]
	#endif
#endif

#define f2(x)	((x<<1)^(((x>>7)&1)*0x11b))
#define f4(x)	((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
#define f8(x)	((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

#define f3(x)	(f2(x) ^ x)
#define f9(x)	(f8(x) ^ x)
#define fb(x)	(f8(x) ^ f2(x) ^ x)
#define fd(x)	(f8(x) ^ f4(x) ^ x)
#define fe(x)	(f8(x) ^ f4(x) ^ f2(x))

void Rijndael::Base::FillEncTable()
{
	for (int i=0; i<256; i++)
	{
		byte x = Se[i];
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
		word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
		Te[i] = word64(y | f3(x))<<32 | y;
#else
		word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
		for (int j=0; j<4; j++)
		{
			Te[i+j*256] = y;
			y = rotrFixed(y, 8);
		}
#endif
	}
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
	Te[256] = Te[257] = 0;
#endif
	s_TeFilled = true;
}

void Rijndael::Base::FillDecTable()
{
	for (int i=0; i<256; i++)
	{
		byte x = Sd[i];
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
		word32 y = word32(fd(x))<<8 |
			word32(f9(x))<<16 | word32(fe(x))<<24;
		Td[i] = word64(y | fb(x))<<32 | y | x;
#else
		word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
		for (int j=0; j<4; j++)
		{
			Td[i+j*256] = y;
			y = rotrFixed(y, 8);
		}
#endif
	}
	s_TdFilled = true;
}

void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
{
	AssertValidKeyLength(keylen);

	m_rounds = keylen/4 + 6;
	m_key.New(4*(m_rounds+1));

	word32 temp, *rk = m_key;
	const word32 *rc = rcon;

	GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);

	while (true)
	{
		temp = rk[keylen/4-1];
		rk[keylen/4] = rk[0] ^
			(word32(Se[GETBYTE(temp, 2)]) << 24) ^
			(word32(Se[GETBYTE(temp, 1)]) << 16) ^
			(word32(Se[GETBYTE(temp, 0)]) << 8) ^
			Se[GETBYTE(temp, 3)] ^
			*(rc++);
		rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
		rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
		rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];

		if (rk + keylen/4 + 4 == m_key.end())
			break;

		if (keylen == 24)
		{
			rk[10] = rk[ 4] ^ rk[ 9];
			rk[11] = rk[ 5] ^ rk[10];
		}
		else if (keylen == 32)
		{
			temp = rk[11];
			rk[12] = rk[ 4] ^
				(word32(Se[GETBYTE(temp, 3)]) << 24) ^
				(word32(Se[GETBYTE(temp, 2)]) << 16) ^
				(word32(Se[GETBYTE(temp, 1)]) << 8) ^
				Se[GETBYTE(temp, 0)];
			rk[13] = rk[ 5] ^ rk[12];
			rk[14] = rk[ 6] ^ rk[13];
			rk[15] = rk[ 7] ^ rk[14];
		}
		rk += keylen/4;
	}

	if (IsForwardTransformation())
	{
		if (!s_TeFilled)
			FillEncTable();
	}
	else
	{
		if (!s_TdFilled)
			FillDecTable();

		unsigned int i, j;
		rk = m_key;

		/* invert the order of the round keys: */
		for (i = 0, j = 4*m_rounds; i < j; i += 4, j -= 4)
		{
			temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
			temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
			temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
			temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
		}

#define InverseMixColumn(x)	x = TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])

		/* apply the inverse MixColumn transform to all round keys but the first and the last: */
		for (i = 1; i < m_rounds; i++)
		{
			rk += 4;
			InverseMixColumn(rk[0]);
			InverseMixColumn(rk[1]);
			InverseMixColumn(rk[2]);
			InverseMixColumn(rk[3]);
		}
	}

	ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key.begin(), m_key.begin(), 16);
	ConditionalByteReverse(BIG_ENDIAN_ORDER, m_key + m_rounds*4, m_key + m_rounds*4, 16);
}

void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
	if (HasSSE2())
	{
		Rijndael::Enc::AdvancedProcessBlocks(inBlock, xorBlock, outBlock, 16, 0);
		return;
	}
#endif

	typedef BlockGetAndPut<word32, NativeByteOrder> Block;

	word32 s0, s1, s2, s3, t0, t1, t2, t3;
	Block::Get(inBlock)(s0)(s1)(s2)(s3);

	const word32 *rk = m_key;
	s0 ^= rk[0];
	s1 ^= rk[1];
	s2 ^= rk[2];
	s3 ^= rk[3];
	t0 = rk[4];
	t1 = rk[5];
	t2 = rk[6];
	t3 = rk[7];
	rk += 8;

	// timing attack countermeasure.
	// see comments at top for more details
	const int cacheLineSize = GetCacheLineSize();
	unsigned int i;
	word32 u = 0;
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
	for (i=0; i<2048; i+=cacheLineSize)
#else
	for (i=0; i<1024; i+=cacheLineSize)
#endif
		u &= *(const word32 *)(((const byte *)Te)+i);
	u &= Te[255];
	s0 |= u; s1 |= u; s2 |= u; s3 |= u;

	QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
	QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
	QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
	QUARTER_ROUND_FE(s0, t1, t2, t3, t0)

	// Nr - 2 full rounds:
	unsigned int r = m_rounds/2 - 1;
	do
	{
		s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

		QUARTER_ROUND_E(t3, s0, s1, s2, s3)
		QUARTER_ROUND_E(t2, s3, s0, s1, s2)
		QUARTER_ROUND_E(t1, s2, s3, s0, s1)
		QUARTER_ROUND_E(t0, s1, s2, s3, s0)

		t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

		QUARTER_ROUND_E(s3, t0, t1, t2, t3)
		QUARTER_ROUND_E(s2, t3, t0, t1, t2)
		QUARTER_ROUND_E(s1, t2, t3, t0, t1)
		QUARTER_ROUND_E(s0, t1, t2, t3, t0)

		rk += 8;
	} while (--r);

	word32 tbw[4];
	byte *const tempBlock = (byte *)tbw;

	QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
	QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
	QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
	QUARTER_ROUND_LE(t3, 3, 6, 9, 12)

	Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}

void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
	typedef BlockGetAndPut<word32, NativeByteOrder> Block;

	word32 s0, s1, s2, s3, t0, t1, t2, t3;
	Block::Get(inBlock)(s0)(s1)(s2)(s3);

	const word32 *rk = m_key;
	s0 ^= rk[0];
	s1 ^= rk[1];
	s2 ^= rk[2];
	s3 ^= rk[3];
	t0 = rk[4];
	t1 = rk[5];
	t2 = rk[6];
	t3 = rk[7];
	rk += 8;

	// timing attack countermeasure. see comments at top for more details
	const int cacheLineSize = GetCacheLineSize();
	unsigned int i;
	word32 u = 0;
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
	for (i=0; i<2048; i+=cacheLineSize)
#else
	for (i=0; i<1024; i+=cacheLineSize)
#endif
		u &= *(const word32 *)(((const byte *)Td)+i);
	u &= Td[255];
	s0 |= u; s1 |= u; s2 |= u; s3 |= u;

	QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
	QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
	QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
	QUARTER_ROUND_FD(s0, t3, t2, t1, t0)

	// Nr - 2 full rounds:
	unsigned int r = m_rounds/2 - 1;
	do
	{
		s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

		QUARTER_ROUND_D(t3, s2, s1, s0, s3)
		QUARTER_ROUND_D(t2, s1, s0, s3, s2)
		QUARTER_ROUND_D(t1, s0, s3, s2, s1)
		QUARTER_ROUND_D(t0, s3, s2, s1, s0)

		t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

		QUARTER_ROUND_D(s3, t2, t1, t0, t3)
		QUARTER_ROUND_D(s2, t1, t0, t3, t2)
		QUARTER_ROUND_D(s1, t0, t3, t2, t1)
		QUARTER_ROUND_D(s0, t3, t2, t1, t0)

		rk += 8;
	} while (--r);

#ifndef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
	// timing attack countermeasure. see comments at top for more details
	// If CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS is defined,
	// QUARTER_ROUND_LD will use Td, which is already preloaded.
	u = 0;
	for (i=0; i<256; i+=cacheLineSize)
		u &= *(const word32 *)(Sd+i);
	u &= *(const word32 *)(Sd+252);
	t0 |= u; t1 |= u; t2 |= u; t3 |= u;
#endif

	word32 tbw[4];
	byte *const tempBlock = (byte *)tbw;

	QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
	QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
	QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
	QUARTER_ROUND_LD(t3, 11, 6, 1, 12)

	Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
}

// ************************* Assembly Code ************************************

#pragma warning(disable: 4731)	// frame pointer register 'ebp' modified by inline assembly code

#endif	// #ifndef CRYPTOPP_GENERATE_X64_MASM

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE

CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
{
#if CRYPTOPP_BOOL_X86

#define L_REG			esp
#define L_INDEX(i)		(L_REG+512+i)
#define L_INXORBLOCKS	L_INBLOCKS+4
#define L_OUTXORBLOCKS	L_INBLOCKS+8
#define L_OUTBLOCKS		L_INBLOCKS+12
#define L_INCREMENTS	L_INDEX(16*15)
#define L_SP			L_INDEX(16*16)
#define L_LENGTH		L_INDEX(16*16+4)
#define L_KEYS_BEGIN	L_INDEX(16*16+8)

#define MOVD			movd
#define MM(i)			mm##i

#define MXOR(a,b,c)	\
	AS2(	movzx	esi, b)\
	AS2(	movd	mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
	AS2(	pxor	MM(a), mm7)\

#define MMOV(a,b,c)	\
	AS2(	movzx	esi, b)\
	AS2(	movd	MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#else

#define L_REG			r8
#define L_INDEX(i)		(L_REG+i)
#define L_INXORBLOCKS	L_INBLOCKS+8
#define L_OUTXORBLOCKS	L_INBLOCKS+16
#define L_OUTBLOCKS		L_INBLOCKS+24
#define L_INCREMENTS	L_INDEX(16*16)
#define L_LENGTH		L_INDEX(16*18+8)
#define L_KEYS_BEGIN	L_INDEX(16*19)

#define MOVD			mov
#define MM_0			r9d
#define MM_1			r12d
#ifdef __GNUC__
#define MM_2			r11d
#else
#define MM_2			r10d
#endif
#define MM(i)			MM_##i

#define MXOR(a,b,c)	\
	AS2(	movzx	esi, b)\