#ifndef CRYPTOPP_IMPORTS
#ifndef CRYPTOPP_GENERATE_X64_MASM

NAMESPACE_BEGIN(CryptoPP)

#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
namespace rdtable {CRYPTOPP_ALIGN_DATA(16) word64 Te[256+2];}
using namespace rdtable;
static word64 Te[256];
static word64 Td[256];
static word32 Te[256*4], Td[256*4];

static volatile bool s_TeFilled = false, s_TdFilled = false;
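// One quarter of an AES round: one table lookup per byte of t, XORed into the
// four state words a..d.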
#define QUARTER_ROUND(L, T, t, a, b, c, d) \
	a ^= L(T, 3, byte(t)); t >>= 8;\
	b ^= L(T, 2, byte(t)); t >>= 8;\
	c ^= L(T, 1, byte(t)); t >>= 8;\
	d ^= L(T, 0, t);
#define QUARTER_ROUND_LE(t, a, b, c, d) \
	tempBlock[a] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
	tempBlock[b] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
	tempBlock[c] = ((byte *)(Te+byte(t)))[1]; t >>= 8;\
	tempBlock[d] = ((byte *)(Te+t))[1];

#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#define QUARTER_ROUND_LD(t, a, b, c, d) \
	tempBlock[a] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
	tempBlock[b] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
	tempBlock[c] = ((byte *)(Td+byte(t)))[GetNativeByteOrder()*7]; t >>= 8;\
	tempBlock[d] = ((byte *)(Td+t))[GetNativeByteOrder()*7];
#define QUARTER_ROUND_LD(t, a, b, c, d) \
	tempBlock[a] = Sd[byte(t)]; t >>= 8;\
	tempBlock[b] = Sd[byte(t)]; t >>= 8;\
	tempBlock[c] = Sd[byte(t)]; t >>= 8;\
	tempBlock[d] = Sd[t];
#define QUARTER_ROUND_E(t, a, b, c, d) QUARTER_ROUND(TL_M, Te, t, a, b, c, d)
#define QUARTER_ROUND_D(t, a, b, c, d) QUARTER_ROUND(TL_M, Td, t, a, b, c, d)

#ifdef IS_LITTLE_ENDIAN
#define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, d, c, b, a)
#define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, d, c, b, a)
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (6-i)%4+1))
#define TL_M(T, i, x) (*(word32 *)((byte *)T + x*8 + (i+3)%4+1))
#define TL_F(T, i, x) rotrFixed(T[x], (3-i)*8)
#define TL_M(T, i, x) T[i*256 + x]
#define QUARTER_ROUND_FE(t, a, b, c, d) QUARTER_ROUND(TL_F, Te, t, a, b, c, d)
#define QUARTER_ROUND_FD(t, a, b, c, d) QUARTER_ROUND(TL_F, Td, t, a, b, c, d)
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
#define TL_F(T, i, x) (*(word32 *)((byte *)T + x*8 + (4-i)%4))
#define TL_F(T, i, x) rotrFixed(T[x], i*8)
#define TL_M(T, i, x) T[i*256 + x]
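// Multiplication by small constants in GF(2^8), reduced modulo the AES
// polynomial x^8 + x^4 + x^3 + x + 1 (0x11b).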
#define f2(x) ((x<<1)^(((x>>7)&1)*0x11b))
#define f4(x) ((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
#define f8(x) ((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

#define f3(x) (f2(x) ^ x)
#define f9(x) (f8(x) ^ x)
#define fb(x) (f8(x) ^ f2(x) ^ x)
#define fd(x) (f8(x) ^ f4(x) ^ x)
#define fe(x) (f8(x) ^ f4(x) ^ f2(x))
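// Build the encryption table from the S-box Se: each entry combines SubBytes
// with the MixColumns multiplies by 1, 2 and 3, so an encryption round reduces
// to table lookups and XORs.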
void Rijndael::Base::FillEncTable()
	for (int i=0; i<256; i++)
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
		word32 y = word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
		Te[i] = word64(y | f3(x))<<32 | y;
		word32 y = f3(x) | word32(x)<<8 | word32(x)<<16 | word32(f2(x))<<24;
		for (int j=0; j<4; j++)
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)
	Te[256] = Te[257] = 0;
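// Build the decryption table from the inverse S-box Sd: entries combine
// InvSubBytes with the InvMixColumns coefficients 9, 11, 13 and 14
// (f9/fb/fd/fe above).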
void Rijndael::Base::FillDecTable()
	for (int i=0; i<256; i++)
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
		word32 y = word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
		Td[i] = word64(y | fb(x))<<32 | y | x;
		word32 y = fb(x) | word32(fd(x))<<8 | word32(f9(x))<<16 | word32(fe(x))<<24;
		for (int j=0; j<4; j++)
void Rijndael::Base::UncheckedSetKey(const byte *userKey, unsigned int keylen, const NameValuePairs &)
	AssertValidKeyLength(keylen);

	m_rounds = keylen/4 + 6;
	m_key.New(4*(m_rounds+1));
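// AES-NI key schedule: expand the user key with AESKEYGENASSIST and, for
// decryption, run AESIMC over the middle round keys so they can be used with
// the equivalent inverse cipher.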
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE && (!defined(_MSC_VER) || _MSC_VER >= 1600 || CRYPTOPP_BOOL_X86)
	static const word32 rcLE[] = {
		0x01, 0x02, 0x04, 0x08,
		0x10, 0x20, 0x40, 0x80,
	const word32 *rc = rcLE;

	__m128i temp = _mm_loadu_si128((__m128i *)(userKey+keylen-16));
	memcpy(rk, userKey, keylen);

		rk[keylen/4] = rk[0] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 3) ^ *(rc++);
		rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
		rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
		rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];

		if (rk + keylen/4 + 4 == m_key.end())

			rk[10] = rk[ 4] ^ rk[ 9];
			rk[11] = rk[ 5] ^ rk[10];
			temp = _mm_insert_epi32(temp, rk[11], 3);
		else if (keylen == 32)
			temp = _mm_insert_epi32(temp, rk[11], 3);
			rk[12] = rk[ 4] ^ _mm_extract_epi32(_mm_aeskeygenassist_si128(temp, 0), 2);
			rk[13] = rk[ 5] ^ rk[12];
			rk[14] = rk[ 6] ^ rk[13];
			rk[15] = rk[ 7] ^ rk[14];
			temp = _mm_insert_epi32(temp, rk[15], 3);

			temp = _mm_insert_epi32(temp, rk[7], 3);

	if (!IsForwardTransformation())
		std::swap(*(__m128i *)(rk), *(__m128i *)(rk+4*m_rounds));

		for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
			temp = _mm_aesimc_si128(*(__m128i *)(rk+i));
			*(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+j));
			*(__m128i *)(rk+j) = temp;

		*(__m128i *)(rk+i) = _mm_aesimc_si128(*(__m128i *)(rk+i));
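// Portable key schedule (no AES-NI): classic word-oriented expansion using the
// S-box Se and the round constants rcon.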
	GetUserKey(BIG_ENDIAN_ORDER, rk, keylen/4, userKey, keylen);
	const word32 *rc = rcon;

		temp = rk[keylen/4-1];
		word32 x = (word32(Se[GETBYTE(temp, 2)]) << 24) ^ (word32(Se[GETBYTE(temp, 1)]) << 16) ^ (word32(Se[GETBYTE(temp, 0)]) << 8) ^ Se[GETBYTE(temp, 3)];
		rk[keylen/4] = rk[0] ^ x ^ *(rc++);
		rk[keylen/4+1] = rk[1] ^ rk[keylen/4];
		rk[keylen/4+2] = rk[2] ^ rk[keylen/4+1];
		rk[keylen/4+3] = rk[3] ^ rk[keylen/4+2];

		if (rk + keylen/4 + 4 == m_key.end())

			rk[10] = rk[ 4] ^ rk[ 9];
			rk[11] = rk[ 5] ^ rk[10];
		else if (keylen == 32)
			rk[12] = rk[ 4] ^ (word32(Se[GETBYTE(temp, 3)]) << 24) ^ (word32(Se[GETBYTE(temp, 2)]) << 16) ^ (word32(Se[GETBYTE(temp, 1)]) << 8) ^ Se[GETBYTE(temp, 0)];
			rk[13] = rk[ 5] ^ rk[12];
			rk[14] = rk[ 6] ^ rk[13];
			rk[15] = rk[ 7] ^ rk[14];

	if (IsForwardTransformation())

		ConditionalByteReverse(BIG_ENDIAN_ORDER, rk, rk, 16);
		ConditionalByteReverse(BIG_ENDIAN_ORDER, rk + m_rounds*4, rk + m_rounds*4, 16);
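// For decryption the middle round keys get InvMixColumns applied, so the
// inverse cipher can reuse the same round structure as encryption.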
#define InverseMixColumn(x) TL_M(Td, 0, Se[GETBYTE(x, 3)]) ^ TL_M(Td, 1, Se[GETBYTE(x, 2)]) ^ TL_M(Td, 2, Se[GETBYTE(x, 1)]) ^ TL_M(Td, 3, Se[GETBYTE(x, 0)])

		for (i = 4, j = 4*m_rounds-4; i < j; i += 4, j -= 4)
			temp = InverseMixColumn(rk[i    ]); rk[i    ] = InverseMixColumn(rk[j    ]); rk[j    ] = temp;
			temp = InverseMixColumn(rk[i + 1]); rk[i + 1] = InverseMixColumn(rk[j + 1]); rk[j + 1] = temp;
			temp = InverseMixColumn(rk[i + 2]); rk[i + 2] = InverseMixColumn(rk[j + 2]); rk[j + 2] = temp;
			temp = InverseMixColumn(rk[i + 3]); rk[i + 3] = InverseMixColumn(rk[j + 3]); rk[j + 3] = temp;

		rk[i+0] = InverseMixColumn(rk[i+0]);
		rk[i+1] = InverseMixColumn(rk[i+1]);
		rk[i+2] = InverseMixColumn(rk[i+2]);
		rk[i+3] = InverseMixColumn(rk[i+3]);

		temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[0]); rk[0] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+0]); rk[4*m_rounds+0] = temp;
		temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[1]); rk[1] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+1]); rk[4*m_rounds+1] = temp;
		temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[2]); rk[2] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+2]); rk[4*m_rounds+2] = temp;
		temp = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[3]); rk[3] = ConditionalByteReverse(BIG_ENDIAN_ORDER, rk[4*m_rounds+3]); rk[4*m_rounds+3] = temp;

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
		ConditionalByteReverse(BIG_ENDIAN_ORDER, rk+4, rk+4, (m_rounds-1)*16);
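// Single-block encryption. The table-driven rounds below are the fallback; the
// #if that follows covers the SSE2/AES-NI dispatch.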
void Rijndael::Enc::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE) || CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

	word32 s0, s1, s2, s3, t0, t1, t2, t3;
	Block::Get(inBlock)(s0)(s1)(s2)(s3);

	const word32 *rk = m_key;
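	// Timing-attack countermeasure: read one word from every cache line of the
	// table so its access pattern does not depend on the key or data.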
	const int cacheLineSize = GetCacheLineSize();
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
	for (i=0; i<2048; i+=cacheLineSize)
	for (i=0; i<1024; i+=cacheLineSize)
		u &= *(const word32 *)(((const byte *)Te)+i);
	s0 |= u; s1 |= u; s2 |= u; s3 |= u;

	QUARTER_ROUND_FE(s3, t0, t1, t2, t3)
	QUARTER_ROUND_FE(s2, t3, t0, t1, t2)
	QUARTER_ROUND_FE(s1, t2, t3, t0, t1)
	QUARTER_ROUND_FE(s0, t1, t2, t3, t0)

	unsigned int r = m_rounds/2 - 1;
		s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

		QUARTER_ROUND_E(t3, s0, s1, s2, s3)
		QUARTER_ROUND_E(t2, s3, s0, s1, s2)
		QUARTER_ROUND_E(t1, s2, s3, s0, s1)
		QUARTER_ROUND_E(t0, s1, s2, s3, s0)

		t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

		QUARTER_ROUND_E(s3, t0, t1, t2, t3)
		QUARTER_ROUND_E(s2, t3, t0, t1, t2)
		QUARTER_ROUND_E(s1, t2, t3, t0, t1)
		QUARTER_ROUND_E(s0, t1, t2, t3, t0)

	byte *const tempBlock = (byte *)tbw;

	QUARTER_ROUND_LE(t2, 15, 2, 5, 8)
	QUARTER_ROUND_LE(t1, 11, 14, 1, 4)
	QUARTER_ROUND_LE(t0, 7, 10, 13, 0)
	QUARTER_ROUND_LE(t3, 3, 6, 9, 12)

	Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
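// Single-block decryption; mirrors the encryption path but uses Td/Sd and the
// inverse round ordering.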
void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE

	word32 s0, s1, s2, s3, t0, t1, t2, t3;
	Block::Get(inBlock)(s0)(s1)(s2)(s3);

	const word32 *rk = m_key;

	const int cacheLineSize = GetCacheLineSize();
#ifdef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
	for (i=0; i<2048; i+=cacheLineSize)
	for (i=0; i<1024; i+=cacheLineSize)
		u &= *(const word32 *)(((const byte *)Td)+i);
	s0 |= u; s1 |= u; s2 |= u; s3 |= u;

	QUARTER_ROUND_FD(s3, t2, t1, t0, t3)
	QUARTER_ROUND_FD(s2, t1, t0, t3, t2)
	QUARTER_ROUND_FD(s1, t0, t3, t2, t1)
	QUARTER_ROUND_FD(s0, t3, t2, t1, t0)

	unsigned int r = m_rounds/2 - 1;
		s0 = rk[0]; s1 = rk[1]; s2 = rk[2]; s3 = rk[3];

		QUARTER_ROUND_D(t3, s2, s1, s0, s3)
		QUARTER_ROUND_D(t2, s1, s0, s3, s2)
		QUARTER_ROUND_D(t1, s0, s3, s2, s1)
		QUARTER_ROUND_D(t0, s3, s2, s1, s0)

		t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];

		QUARTER_ROUND_D(s3, t2, t1, t0, t3)
		QUARTER_ROUND_D(s2, t1, t0, t3, t2)
		QUARTER_ROUND_D(s1, t0, t3, t2, t1)
		QUARTER_ROUND_D(s0, t3, t2, t1, t0)

#ifndef CRYPTOPP_ALLOW_UNALIGNED_DATA_ACCESS
	for (i=0; i<256; i+=cacheLineSize)
		u &= *(const word32 *)(Sd+i);
	u &= *(const word32 *)(Sd+252);
	t0 |= u; t1 |= u; t2 |= u; t3 |= u;

	byte *const tempBlock = (byte *)tbw;

	QUARTER_ROUND_LD(t2, 7, 2, 13, 8)
	QUARTER_ROUND_LD(t1, 3, 14, 9, 4)
	QUARTER_ROUND_LD(t0, 15, 10, 5, 0)
	QUARTER_ROUND_LD(t3, 11, 6, 1, 12)

	Block::Put(xorBlock, outBlock)(tbw[0]^rk[0])(tbw[1]^rk[1])(tbw[2]^rk[2])(tbw[3]^rk[3]);
#pragma warning(disable: 4731)	// frame pointer register 'ebp' modified by inline assembly code
#endif	// #ifndef CRYPTOPP_GENERATE_X64_MASM
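// Hand-written SSE2 assembly that encrypts a stream of blocks per call. It is
// emitted either as GNU inline assembly or as a separate X64 MASM module, and
// keeps its locals in a 256-byte-aligned scratch area addressed through L_REG.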
#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE

CRYPTOPP_NAKED void CRYPTOPP_FASTCALL Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k)
#if CRYPTOPP_BOOL_X86
	#define L_INDEX(i) (L_REG+512+i)
	#define L_INXORBLOCKS L_INBLOCKS+4
	#define L_OUTXORBLOCKS L_INBLOCKS+8
	#define L_OUTBLOCKS L_INBLOCKS+12
	#define L_INCREMENTS L_INDEX(16*15)
	#define L_SP L_INDEX(16*16)
	#define L_LENGTH L_INDEX(16*16+4)
	#define L_KEYS_BEGIN L_INDEX(16*16+8)

	#define MXOR(a,b,c) \
		AS2( movd mm7, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
		AS2( pxor MM(a), mm7)\

	#define MMOV(a,b,c) \
		AS2( movd MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

	#define L_INDEX(i) (L_REG+i)
	#define L_INXORBLOCKS L_INBLOCKS+8
	#define L_OUTXORBLOCKS L_INBLOCKS+16
	#define L_OUTBLOCKS L_INBLOCKS+24
	#define L_INCREMENTS L_INDEX(16*16)
	#define L_LENGTH L_INDEX(16*18+8)
	#define L_KEYS_BEGIN L_INDEX(16*19)

	#define MXOR(a,b,c) \
		AS2( xor MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

	#define MMOV(a,b,c) \
		AS2( mov MM(a), DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\

#define L_SUBKEYS L_INDEX(0)
#define L_SAVED_X L_SUBKEYS
#define L_KEY12 L_INDEX(16*12)
#define L_LASTROUND L_INDEX(16*13)
#define L_INBLOCKS L_INDEX(16*14)
#define MAP0TO4(i) (ASM_MOD(i+3,4)+1)

	AS2( xor a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
	AS2( mov a, DWORD PTR [AS_REG_7+8*WORD_REG(si)+MAP0TO4(c)])\
#ifdef CRYPTOPP_GENERATE_X64_MASM

Rijndael_Enc_AdvancedProcessBlocks PROC FRAME
	mov AS_REG_7, ?Te@rdtable@CryptoPP@@3PA_KA
	mov edi, DWORD PTR [?g_cacheLineSize@CryptoPP@@3IA]

#elif defined(__GNUC__)
	".intel_syntax noprefix;"
	#if CRYPTOPP_BOOL_X64
	AS2( mov AS_REG_7, WORD_REG(si))

	AS2( lea AS_REG_7, [Te])
	AS2( mov edi, [g_cacheLineSize])

#if CRYPTOPP_BOOL_X86
	AS2( mov [ecx+16*12+16*4], esp)
	AS2( lea esp, [ecx-512])

	AS2( mov WORD_REG(si), [L_KEYS_BEGIN])
	AS2( mov WORD_REG(ax), 16)
	AS2( and WORD_REG(ax), WORD_REG(si))
	AS2( movdqa xmm3, XMMWORD_PTR [WORD_REG(dx)+16+WORD_REG(ax)])
	AS2( movdqa [L_KEY12], xmm3)
	AS2( lea WORD_REG(ax), [WORD_REG(dx)+WORD_REG(ax)+2*16])
	AS2( sub WORD_REG(ax), WORD_REG(si))

	AS2( movdqa xmm0, [WORD_REG(ax)+WORD_REG(si)])
	AS2( movdqa XMMWORD_PTR [L_SUBKEYS+WORD_REG(si)], xmm0)
	AS2( add WORD_REG(si), 16)
	AS2( cmp WORD_REG(si), 16*12)

	AS2( movdqa xmm4, [WORD_REG(ax)+WORD_REG(si)])
	AS2( movdqa xmm1, [WORD_REG(dx)])
	AS2( MOVD MM(1), [WORD_REG(dx)+4*4])
	AS2( mov ebx, [WORD_REG(dx)+5*4])
	AS2( mov ecx, [WORD_REG(dx)+6*4])
	AS2( mov edx, [WORD_REG(dx)+7*4])
	AS2( xor WORD_REG(ax), WORD_REG(ax))

	// prime the Te cache lines before the first round
	AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
	AS2( add WORD_REG(ax), WORD_REG(di))
	AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
	AS2( add WORD_REG(ax), WORD_REG(di))
	AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
	AS2( add WORD_REG(ax), WORD_REG(di))
	AS2( mov esi, [AS_REG_7+WORD_REG(ax)])
	AS2( add WORD_REG(ax), WORD_REG(di))
	AS2( cmp WORD_REG(ax), 2048)

	AS2( test DWORD PTR [L_LENGTH], 1)

	AS2( mov WORD_REG(si), [L_INBLOCKS])
	AS2( movdqu xmm2, [WORD_REG(si)])
	AS2( pxor xmm2, xmm1)
	AS2( psrldq xmm1, 14)

	AS2( mov al, BYTE PTR [WORD_REG(si)+15])
	AS2( MOVD MM(2), eax)
#if CRYPTOPP_BOOL_X86

	AS2( mov eax, [L_KEY12+0*4])
	AS2( mov edi, [L_KEY12+2*4])
	AS2( MOVD MM(0), [L_KEY12+3*4])

	AS2( xor ebx, [L_KEY12+1*4])

	AS2( MOVD edx, MM(1))
	AS2( MOVD [L_SAVED_X+3*4], MM(0))
	AS2( mov [L_SAVED_X+0*4], eax)
	AS2( mov [L_SAVED_X+1*4], ebx)
	AS2( mov [L_SAVED_X+2*4], edi)

	AS2( MOVD MM(1), [L_KEY12+0*4])
	AS2( mov ebx, [L_KEY12+1*4])
	AS2( mov ecx, [L_KEY12+2*4])
	AS2( mov edx, [L_KEY12+3*4])

	AS2( mov WORD_REG(ax), [L_INBLOCKS])
	AS2( movdqu xmm2, [WORD_REG(ax)])
	AS2( mov WORD_REG(si), [L_INXORBLOCKS])
	AS2( movdqu xmm5, [WORD_REG(si)])
	AS2( pxor xmm2, xmm1)
	AS2( pxor xmm2, xmm5)

	AS2( MOVD eax, MM(1))

	AS2( add L_REG, [L_KEYS_BEGIN])
	AS2( add L_REG, 4*16)

	AS2( MOVD ecx, MM(2))
	AS2( MOVD edx, MM(1))
	AS2( mov eax, [L_SAVED_X+0*4])
	AS2( mov ebx, [L_SAVED_X+1*4])

	AS2( and WORD_REG(cx), 255)
#if CRYPTOPP_BOOL_X86
	AS2( paddb MM(2), mm3)

	AS2( xor edx, DWORD PTR [AS_REG_7+WORD_REG(cx)*8+3])

	AS2( xor ecx, [L_SAVED_X+2*4])

	AS2( xor edx, [L_SAVED_X+3*4])

	AS2( add L_REG, [L_KEYS_BEGIN])
	AS2( add L_REG, 3*16)

	AS2( MOVD MM(0), [L_SUBKEYS-4*16+3*4])
	AS2( mov edi, [L_SUBKEYS-4*16+2*4])

	AS2( xor eax, [L_SUBKEYS-4*16+0*4])
	AS2( xor ebx, [L_SUBKEYS-4*16+1*4])
	AS2( MOVD edx, MM(0))

	AS2( MOVD MM(0), [L_SUBKEYS-4*16+7*4])
	AS2( mov edi, [L_SUBKEYS-4*16+6*4])

	AS2( xor eax, [L_SUBKEYS-4*16+4*4])
	AS2( xor ebx, [L_SUBKEYS-4*16+5*4])
	AS2( MOVD edx, MM(0))

	AS2( test L_REG, 255)

	AS2( sub L_REG, 16*16)

#define LAST(a, b, c) \
	AS2( movzx edi, BYTE PTR [AS_REG_7+WORD_REG(si)*8+1] )\
	AS2( xor edi, DWORD PTR [AS_REG_7+WORD_REG(si)*8+0] )\
	AS2( mov WORD PTR [L_LASTROUND+c], di )\

	AS2( mov WORD_REG(ax), [L_OUTXORBLOCKS])
	AS2( mov WORD_REG(bx), [L_OUTBLOCKS])

	AS2( mov WORD_REG(cx), [L_LENGTH])
	AS2( sub WORD_REG(cx), 16)

	AS2( movdqu xmm2, [WORD_REG(ax)])
	AS2( pxor xmm2, xmm4)
#if CRYPTOPP_BOOL_X86
	AS2( movdqa xmm0, [L_INCREMENTS])
	AS2( paddd xmm0, [L_INBLOCKS])
	AS2( movdqa [L_INBLOCKS], xmm0)

	AS2( movdqa xmm0, [L_INCREMENTS+16])
	AS2( paddq xmm0, [L_INBLOCKS+16])
	AS2( movdqa [L_INBLOCKS+16], xmm0)

	AS2( pxor xmm2, [L_LASTROUND])
	AS2( movdqu [WORD_REG(bx)], xmm2)

	AS2( mov [L_LENGTH], WORD_REG(cx))
	AS2( test WORD_REG(cx), 1)

#if CRYPTOPP_BOOL_X64
	AS2( movdqa xmm0, [L_INCREMENTS])
	AS2( paddq xmm0, [L_INBLOCKS])
	AS2( movdqa [L_INBLOCKS], xmm0)

	// wipe the subkey copies from the scratch area
	AS2( xorps xmm0, xmm0)
	AS2( lea WORD_REG(ax), [L_SUBKEYS+7*16])
	AS2( movaps [WORD_REG(ax)-7*16], xmm0)
	AS2( movaps [WORD_REG(ax)-6*16], xmm0)
	AS2( movaps [WORD_REG(ax)-5*16], xmm0)
	AS2( movaps [WORD_REG(ax)-4*16], xmm0)
	AS2( movaps [WORD_REG(ax)-3*16], xmm0)
	AS2( movaps [WORD_REG(ax)-2*16], xmm0)
	AS2( movaps [WORD_REG(ax)-1*16], xmm0)
	AS2( movaps [WORD_REG(ax)+0*16], xmm0)
	AS2( movaps [WORD_REG(ax)+1*16], xmm0)
	AS2( movaps [WORD_REG(ax)+2*16], xmm0)
	AS2( movaps [WORD_REG(ax)+3*16], xmm0)
	AS2( movaps [WORD_REG(ax)+4*16], xmm0)
	AS2( movaps [WORD_REG(ax)+5*16], xmm0)
	AS2( movaps [WORD_REG(ax)+6*16], xmm0)

#if CRYPTOPP_BOOL_X86
	AS2( mov esp, [L_SP])
#if defined(_MSC_VER) && CRYPTOPP_BOOL_X86

#ifdef CRYPTOPP_GENERATE_X64_MASM

Rijndael_Enc_AdvancedProcessBlocks ENDP

	".att_syntax prefix;"
	: "c" (locals), "d" (k), "S" (Te), "D" (g_cacheLineSize)
	: "memory", "cc", "%eax"
	#if CRYPTOPP_BOOL_X64
	, "%rbx", "%r8", "%r9", "%r10", "%r11", "%r12"
#ifndef CRYPTOPP_GENERATE_X64_MASM

#ifdef CRYPTOPP_X64_MASM_AVAILABLE
void Rijndael_Enc_AdvancedProcessBlocks(void *locals, const word32 *k);

#if CRYPTOPP_BOOL_X64 || CRYPTOPP_BOOL_X86
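// True if [begin, end) can land on the same 4 KB page offsets as the Te table;
// used below to pick a scratch area that will not contend with the table's
// cache lines.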
static inline bool AliasedWithTable(const byte *begin, const byte *end)
	size_t s0 = size_t(begin)%4096, s1 = size_t(end)%4096;
	size_t t0 = size_t(Te)%4096, t1 = (size_t(Te)+sizeof(Te))%4096;
	if (t1 > t0)
		return (s0 >= t0 && s0 < t1) || (s1 > t0 && s1 <= t1);
	else	// Te's offset range wraps around the end of the 4 KB page
		return (s0 < t1 || s1 <= t1) || (s0 >= t0 || s1 > t0);
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
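// AES-NI primitives: encrypt/decrypt one block, or four blocks in parallel to
// keep the AES units' pipelines busy.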
inline void AESNI_Enc_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
	block = _mm_xor_si128(block, subkeys[0]);
	for (unsigned int i=1; i<rounds-1; i+=2)
		block = _mm_aesenc_si128(block, subkeys[i]);
		block = _mm_aesenc_si128(block, subkeys[i+1]);
	block = _mm_aesenc_si128(block, subkeys[rounds-1]);
	block = _mm_aesenclast_si128(block, subkeys[rounds]);

inline void AESNI_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
	__m128i rk = subkeys[0];
	block0 = _mm_xor_si128(block0, rk);
	block1 = _mm_xor_si128(block1, rk);
	block2 = _mm_xor_si128(block2, rk);
	block3 = _mm_xor_si128(block3, rk);
	for (unsigned int i=1; i<rounds; i++)
		block0 = _mm_aesenc_si128(block0, rk);
		block1 = _mm_aesenc_si128(block1, rk);
		block2 = _mm_aesenc_si128(block2, rk);
		block3 = _mm_aesenc_si128(block3, rk);
	rk = subkeys[rounds];
	block0 = _mm_aesenclast_si128(block0, rk);
	block1 = _mm_aesenclast_si128(block1, rk);
	block2 = _mm_aesenclast_si128(block2, rk);
	block3 = _mm_aesenclast_si128(block3, rk);
inline void AESNI_Dec_Block(__m128i &block, const __m128i *subkeys, unsigned int rounds)
	block = _mm_xor_si128(block, subkeys[0]);
	for (unsigned int i=1; i<rounds-1; i+=2)
		block = _mm_aesdec_si128(block, subkeys[i]);
		block = _mm_aesdec_si128(block, subkeys[i+1]);
	block = _mm_aesdec_si128(block, subkeys[rounds-1]);
	block = _mm_aesdeclast_si128(block, subkeys[rounds]);

inline void AESNI_Dec_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2, __m128i &block3, const __m128i *subkeys, unsigned int rounds)
	__m128i rk = subkeys[0];
	block0 = _mm_xor_si128(block0, rk);
	block1 = _mm_xor_si128(block1, rk);
	block2 = _mm_xor_si128(block2, rk);
	block3 = _mm_xor_si128(block3, rk);
	for (unsigned int i=1; i<rounds; i++)
		block0 = _mm_aesdec_si128(block0, rk);
		block1 = _mm_aesdec_si128(block1, rk);
		block2 = _mm_aesdec_si128(block2, rk);
		block3 = _mm_aesdec_si128(block3, rk);
	rk = subkeys[rounds];
	block0 = _mm_aesdeclast_si128(block0, rk);
	block1 = _mm_aesdeclast_si128(block1, rk);
	block2 = _mm_aesdeclast_si128(block2, rk);
	block3 = _mm_aesdeclast_si128(block3, rk);

static CRYPTOPP_ALIGN_DATA(16) const word32 s_one[] = {0, 0, 0, 1<<24};
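// Generic AES-NI driver: walks the input/output/xor arrays, honours the BT_*
// flags (counter-mode increment via s_one, XOR placement, reverse direction)
// and calls func1/func4 for the actual rounds.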
template <typename F1, typename F4>
inline size_t AESNI_AdvancedProcessBlocks(F1 func1, F4 func4, const __m128i *subkeys, unsigned int rounds, const byte *inBlocks, const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
	size_t blockSize = 16;
	size_t inIncrement = (flags & (BlockTransformation::BT_InBlockIsCounter|BlockTransformation::BT_DontIncrementInOutPointers)) ? 0 : blockSize;
	size_t xorIncrement = xorBlocks ? blockSize : 0;
	size_t outIncrement = (flags & BlockTransformation::BT_DontIncrementInOutPointers) ? 0 : blockSize;

	if (flags & BlockTransformation::BT_ReverseDirection)
		assert(length % blockSize == 0);
		inBlocks += length - blockSize;
		xorBlocks += length - blockSize;
		outBlocks += length - blockSize;
		inIncrement = 0-inIncrement;
		xorIncrement = 0-xorIncrement;
		outIncrement = 0-outIncrement;

	if (flags & BlockTransformation::BT_AllowParallel)
		while (length >= 4*blockSize)
			__m128i block0 = _mm_loadu_si128((const __m128i *)inBlocks), block1, block2, block3;
			if (flags & BlockTransformation::BT_InBlockIsCounter)
				const __m128i be1 = *(const __m128i *)s_one;
				block1 = _mm_add_epi32(block0, be1);
				block2 = _mm_add_epi32(block1, be1);
				block3 = _mm_add_epi32(block2, be1);
				_mm_storeu_si128((__m128i *)inBlocks, _mm_add_epi32(block3, be1));

				inBlocks += inIncrement;
				block1 = _mm_loadu_si128((const __m128i *)inBlocks);
				inBlocks += inIncrement;
				block2 = _mm_loadu_si128((const __m128i *)inBlocks);
				inBlocks += inIncrement;
				block3 = _mm_loadu_si128((const __m128i *)inBlocks);
				inBlocks += inIncrement;
			if (flags & BlockTransformation::BT_XorInput)
				block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
				block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
				block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
				block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;

			func4(block0, block1, block2, block3, subkeys, rounds);

			if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
				block0 = _mm_xor_si128(block0, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
				block1 = _mm_xor_si128(block1, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
				block2 = _mm_xor_si128(block2, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;
				block3 = _mm_xor_si128(block3, _mm_loadu_si128((const __m128i *)xorBlocks));
				xorBlocks += xorIncrement;

			_mm_storeu_si128((__m128i *)outBlocks, block0);
			outBlocks += outIncrement;
			_mm_storeu_si128((__m128i *)outBlocks, block1);
			outBlocks += outIncrement;
			_mm_storeu_si128((__m128i *)outBlocks, block2);
			outBlocks += outIncrement;
			_mm_storeu_si128((__m128i *)outBlocks, block3);
			outBlocks += outIncrement;

			length -= 4*blockSize;
	while (length >= blockSize)
		__m128i block = _mm_loadu_si128((const __m128i *)inBlocks);

		if (flags & BlockTransformation::BT_XorInput)
			block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));

		if (flags & BlockTransformation::BT_InBlockIsCounter)
			const_cast<byte *>(inBlocks)[15]++;

		func1(block, subkeys, rounds);

		if (xorBlocks && !(flags & BlockTransformation::BT_XorInput))
			block = _mm_xor_si128(block, _mm_loadu_si128((const __m128i *)xorBlocks));

		_mm_storeu_si128((__m128i *)outBlocks, block);

		inBlocks += inIncrement;
		outBlocks += outIncrement;
		xorBlocks += xorIncrement;
		length -= blockSize;
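// Bulk encryption (Enc::AdvancedProcessBlocks): try AES-NI first, then the
// SSE2/MASM assembly below; any tail shorter than a block is left for the
// generic fallback.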
#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
		return AESNI_AdvancedProcessBlocks(AESNI_Enc_Block, AESNI_Enc_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);

#if CRYPTOPP_BOOL_SSE2_ASM_AVAILABLE || defined(CRYPTOPP_X64_MASM_AVAILABLE)

		if (length < BLOCKSIZE)

			word32 subkeys[4*12], workspace[8];
			const byte *inBlocks, *inXorBlocks, *outXorBlocks;
			size_t inIncrement, inXorIncrement, outXorIncrement, outIncrement;
			size_t regSpill, lengthAndCounterFlag, keysBegin;

		size_t increment = BLOCKSIZE;
		const byte* zeros = (byte *)(Te+256);
		space = (byte *)alloca(255+sizeof(Locals));
		space += (256-(size_t)space%256)%256;
		while (AliasedWithTable(space, space+sizeof(Locals)));

		if (flags & BT_ReverseDirection)
			assert(length % BLOCKSIZE == 0);
			inBlocks += length - BLOCKSIZE;
			xorBlocks += length - BLOCKSIZE;
			outBlocks += length - BLOCKSIZE;
			increment = 0-increment;

		Locals &locals = *(Locals *)space;

		locals.inBlocks = inBlocks;
		locals.inXorBlocks = (flags & BT_XorInput) && xorBlocks ? xorBlocks : zeros;
		locals.outXorBlocks = (flags & BT_XorInput) || !xorBlocks ? zeros : xorBlocks;
		locals.outBlocks = outBlocks;

		locals.inIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;
		locals.inXorIncrement = (flags & BT_XorInput) && xorBlocks ? increment : 0;
		locals.outXorIncrement = (flags & BT_XorInput) || !xorBlocks ? 0 : increment;
		locals.outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : increment;

		locals.lengthAndCounterFlag = length - (length%16) - bool(flags & BT_InBlockIsCounter);
		int keysToCopy = m_rounds - (flags & BT_InBlockIsCounter ? 3 : 2);
		locals.keysBegin = (12-keysToCopy)*16;

		Rijndael_Enc_AdvancedProcessBlocks(&locals, m_key);
		return length % BLOCKSIZE;

#if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE
		return AESNI_AdvancedProcessBlocks(AESNI_Dec_Block, AESNI_Dec_4_Blocks, (const __m128i *)m_key.begin(), m_rounds, inBlocks, xorBlocks, outBlocks, length, flags);

#endif	// #if CRYPTOPP_BOOL_AESNI_INTRINSICS_AVAILABLE