Crypto++  8.4
Free C++ class library of cryptographic schemes
sha_simd.cpp
1 // sha_simd.cpp - written and placed in the public domain by
2 // Jeffrey Walton, Uri Blumenthal and Marcel Raad.
3 //
4 // This source file uses intrinsics to gain access to SHA-NI and
5 // ARMv8a SHA instructions. A separate source file is needed
6 // because additional CXXFLAGS are required to enable the
7 // appropriate instruction sets in some build configurations.
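// For example, with GCC or Clang this translation unit is typically built
// with extra ISA flags along these lines (illustrative, not authoritative;
// the exact flags vary by compiler and target):
//   x86/x64 (SHA-NI):  -msse4.2 -msha
//   ARMv8 (AArch64):   -march=armv8-a+crypto
//   POWER8:            -mcpu=power8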
8 
9 #include "pch.h"
10 #include "config.h"
11 #include "sha.h"
12 #include "misc.h"
13 
14 #if defined(CRYPTOPP_DISABLE_SHA_ASM)
15 # undef CRYPTOPP_X86_ASM_AVAILABLE
16 # undef CRYPTOPP_X32_ASM_AVAILABLE
17 # undef CRYPTOPP_X64_ASM_AVAILABLE
18 # undef CRYPTOPP_SSE2_ASM_AVAILABLE
19 #endif
20 
21 #if (CRYPTOPP_SHANI_AVAILABLE)
22 # include <nmmintrin.h>
23 # include <immintrin.h>
24 #endif
25 
26 #if (CRYPTOPP_ARM_NEON_HEADER)
27 # include <arm_neon.h>
28 #endif
29 
30 #if (CRYPTOPP_ARM_ACLE_HEADER)
31 # include <stdint.h>
32 # include <arm_acle.h>
33 #endif
34 
35 #if CRYPTOPP_POWER8_SHA_AVAILABLE
36 # include "ppc_simd.h"
37 #endif
38 
39 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
40 # include <signal.h>
41 # include <setjmp.h>
42 #endif
43 
44 #ifndef EXCEPTION_EXECUTE_HANDLER
45 # define EXCEPTION_EXECUTE_HANDLER 1
46 #endif
47 
48 // Clang intrinsic casts
49 #define M128_CAST(x) ((__m128i *)(void *)(x))
50 #define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
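// The intermediate void* cast works around Clang warnings when casting
// word32* to __m128i* (the same "Clang workaround" casts appear in adv_simd.h).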
51 
52 // Squash MS LNK4221 and libtool warnings
53 extern const char SHA_SIMD_FNAME[] = __FILE__;
54 
55 NAMESPACE_BEGIN(CryptoPP)
56 
57 // ***************** SHA key tables ********************
58 
59 extern const word32 SHA256_K[64];
60 extern const word64 SHA512_K[80];
61 
62 // ***************** SIGILL probes ********************
63 
64 #ifdef CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
65 extern "C" {
66  typedef void (*SigHandler)(int);
67 
68  static jmp_buf s_jmpSIGILL;
69  static void SigIllHandler(int)
70  {
71  longjmp(s_jmpSIGILL, 1);
72  }
73 }
74 #endif // CRYPTOPP_GNU_STYLE_INLINE_ASSEMBLY
75 
76 #if (CRYPTOPP_BOOL_ARM32 || CRYPTOPP_BOOL_ARMV8)
77 bool CPU_ProbeSHA1()
78 {
79 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
80  return false;
81 #elif (CRYPTOPP_ARM_SHA1_AVAILABLE)
82 # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
83  volatile bool result = true;
84  __try
85  {
86  unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
87  uint32x4_t data1 = vld1q_u32(w+0);
88  uint32x4_t data2 = vld1q_u32(w+4);
89  uint32x4_t data3 = vld1q_u32(w+8);
90 
91  uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
92  uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
93  uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
94  uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
95  uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
96 
97  result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
98  }
99  __except (EXCEPTION_EXECUTE_HANDLER)
100  {
101  return false;
102  }
103  return result;
104 # else
105 
106  // longjmp and clobber warnings. Volatile is required.
107  // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
108  volatile bool result = true;
109 
110  volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
111  if (oldHandler == SIG_ERR)
112  return false;
113 
114  volatile sigset_t oldMask;
115  if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
116  {
117  signal(SIGILL, oldHandler);
118  return false;
119  }
120 
121  if (setjmp(s_jmpSIGILL))
122  result = false;
123  else
124  {
125  unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
126  uint32x4_t data1 = vld1q_u32(w+0);
127  uint32x4_t data2 = vld1q_u32(w+4);
128  uint32x4_t data3 = vld1q_u32(w+8);
129 
130  uint32x4_t r1 = vsha1cq_u32 (data1, 0, data2);
131  uint32x4_t r2 = vsha1mq_u32 (data1, 0, data2);
132  uint32x4_t r3 = vsha1pq_u32 (data1, 0, data2);
133  uint32x4_t r4 = vsha1su0q_u32 (data1, data2, data3);
134  uint32x4_t r5 = vsha1su1q_u32 (data1, data2);
135 
136  result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3) | vgetq_lane_u32(r5,0));
137  }
138 
139  sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
140  signal(SIGILL, oldHandler);
141  return result;
142 # endif
143 #else
144  return false;
145 #endif // CRYPTOPP_ARM_SHA1_AVAILABLE
146 }
147 
148 bool CPU_ProbeSHA256()
149 {
150 #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES)
151  return false;
152 #elif (CRYPTOPP_ARM_SHA2_AVAILABLE)
153 # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY)
154  volatile bool result = true;
155  __try
156  {
157  unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
158  uint32x4_t data1 = vld1q_u32(w+0);
159  uint32x4_t data2 = vld1q_u32(w+4);
160  uint32x4_t data3 = vld1q_u32(w+8);
161 
162  uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
163  uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
164  uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
165  uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
166 
167  result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
168  }
169  __except (EXCEPTION_EXECUTE_HANDLER)
170  {
171  return false;
172  }
173  return result;
174 # else
175 
176  // longjmp and clobber warnings. Volatile is required.
177  // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854
178  volatile bool result = true;
179 
180  volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler);
181  if (oldHandler == SIG_ERR)
182  return false;
183 
184  volatile sigset_t oldMask;
185  if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask))
186  {
187  signal(SIGILL, oldHandler);
188  return false;
189  }
190 
191  if (setjmp(s_jmpSIGILL))
192  result = false;
193  else
194  {
195  unsigned int w[] = {1,2,3,4, 5,6,7,8, 9,10,11,12};
196  uint32x4_t data1 = vld1q_u32(w+0);
197  uint32x4_t data2 = vld1q_u32(w+4);
198  uint32x4_t data3 = vld1q_u32(w+8);
199 
200  uint32x4_t r1 = vsha256hq_u32 (data1, data2, data3);
201  uint32x4_t r2 = vsha256h2q_u32 (data1, data2, data3);
202  uint32x4_t r3 = vsha256su0q_u32 (data1, data2);
203  uint32x4_t r4 = vsha256su1q_u32 (data1, data2, data3);
204 
205  result = !!(vgetq_lane_u32(r1,0) | vgetq_lane_u32(r2,1) | vgetq_lane_u32(r3,2) | vgetq_lane_u32(r4,3));
206  }
207 
208  sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR);
209  signal(SIGILL, oldHandler);
210  return result;
211 # endif
212 #else
213  return false;
214 #endif // CRYPTOPP_ARM_SHA2_AVAILABLE
215 }
216 #endif // ARM32 or ARM64
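
// A minimal illustrative sketch of how a SIGILL probe such as CPU_ProbeSHA1()
// is typically consumed. The real caching lives in cpu.h/cpu.cpp; the names
// below are placeholders and not part of the library API:
//
//   static bool HasSHA1_Example()
//   {
//       // Probe once; taking a SIGILL is expensive, so cache the result.
//       static const bool s_hasSHA1 = CPU_ProbeSHA1();
//       return s_hasSHA1;
//   }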
217 
218 // ***************** Intel x86 SHA ********************
219 
220 /////////////////////////////////////
221 // start of Walton and Gulley code //
222 /////////////////////////////////////
223 
224 #if CRYPTOPP_SHANI_AVAILABLE
225 // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
226 void SHA1_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
227 {
228  CRYPTOPP_ASSERT(state);
229  CRYPTOPP_ASSERT(data);
230  CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
231 
232  __m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
233  __m128i MASK, MSG0, MSG1, MSG2, MSG3;
234 
235  // Load initial values
236  ABCD = _mm_loadu_si128(CONST_M128_CAST(state));
237  E0 = _mm_set_epi32(state[4], 0, 0, 0);
238  ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
239 
240  // IA-32 SHA is little endian, SHA::Transform is big endian,
241  // and SHA::HashMultipleBlocks can be either. ByteOrder
242  // allows us to avoid extra endian reversals. It saves 1.0 cpb.
243  MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
244  _mm_set_epi8(0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15) :
245  _mm_set_epi8(3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12) ;
246 
247  while (length >= SHA1::BLOCKSIZE)
248  {
249  // Save current hash
250  ABCD_SAVE = ABCD;
251  E0_SAVE = E0;
252 
253  // Rounds 0-3
254  MSG0 = _mm_loadu_si128(CONST_M128_CAST(data+0));
255  MSG0 = _mm_shuffle_epi8(MSG0, MASK);
256  E0 = _mm_add_epi32(E0, MSG0);
257  E1 = ABCD;
258  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
259 
260  // Rounds 4-7
261  MSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
262  MSG1 = _mm_shuffle_epi8(MSG1, MASK);
263  E1 = _mm_sha1nexte_epu32(E1, MSG1);
264  E0 = ABCD;
265  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
266  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
267 
268  // Rounds 8-11
269  MSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
270  MSG2 = _mm_shuffle_epi8(MSG2, MASK);
271  E0 = _mm_sha1nexte_epu32(E0, MSG2);
272  E1 = ABCD;
273  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
274  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
275  MSG0 = _mm_xor_si128(MSG0, MSG2);
276 
277  // Rounds 12-15
278  MSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
279  MSG3 = _mm_shuffle_epi8(MSG3, MASK);
280  E1 = _mm_sha1nexte_epu32(E1, MSG3);
281  E0 = ABCD;
282  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
283  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
284  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
285  MSG1 = _mm_xor_si128(MSG1, MSG3);
286 
287  // Rounds 16-19
288  E0 = _mm_sha1nexte_epu32(E0, MSG0);
289  E1 = ABCD;
290  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
291  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
292  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
293  MSG2 = _mm_xor_si128(MSG2, MSG0);
294 
295  // Rounds 20-23
296  E1 = _mm_sha1nexte_epu32(E1, MSG1);
297  E0 = ABCD;
298  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
299  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
300  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
301  MSG3 = _mm_xor_si128(MSG3, MSG1);
302 
303  // Rounds 24-27
304  E0 = _mm_sha1nexte_epu32(E0, MSG2);
305  E1 = ABCD;
306  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
307  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
308  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
309  MSG0 = _mm_xor_si128(MSG0, MSG2);
310 
311  // Rounds 28-31
312  E1 = _mm_sha1nexte_epu32(E1, MSG3);
313  E0 = ABCD;
314  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
315  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
316  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
317  MSG1 = _mm_xor_si128(MSG1, MSG3);
318 
319  // Rounds 32-35
320  E0 = _mm_sha1nexte_epu32(E0, MSG0);
321  E1 = ABCD;
322  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
323  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
324  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
325  MSG2 = _mm_xor_si128(MSG2, MSG0);
326 
327  // Rounds 36-39
328  E1 = _mm_sha1nexte_epu32(E1, MSG1);
329  E0 = ABCD;
330  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
331  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
332  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
333  MSG3 = _mm_xor_si128(MSG3, MSG1);
334 
335  // Rounds 40-43
336  E0 = _mm_sha1nexte_epu32(E0, MSG2);
337  E1 = ABCD;
338  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
339  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
340  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
341  MSG0 = _mm_xor_si128(MSG0, MSG2);
342 
343  // Rounds 44-47
344  E1 = _mm_sha1nexte_epu32(E1, MSG3);
345  E0 = ABCD;
346  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
347  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
348  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
349  MSG1 = _mm_xor_si128(MSG1, MSG3);
350 
351  // Rounds 48-51
352  E0 = _mm_sha1nexte_epu32(E0, MSG0);
353  E1 = ABCD;
354  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
355  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
356  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
357  MSG2 = _mm_xor_si128(MSG2, MSG0);
358 
359  // Rounds 52-55
360  E1 = _mm_sha1nexte_epu32(E1, MSG1);
361  E0 = ABCD;
362  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
363  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
364  MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
365  MSG3 = _mm_xor_si128(MSG3, MSG1);
366 
367  // Rounds 56-59
368  E0 = _mm_sha1nexte_epu32(E0, MSG2);
369  E1 = ABCD;
370  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
371  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
372  MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
373  MSG0 = _mm_xor_si128(MSG0, MSG2);
374 
375  // Rounds 60-63
376  E1 = _mm_sha1nexte_epu32(E1, MSG3);
377  E0 = ABCD;
378  MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
379  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
380  MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
381  MSG1 = _mm_xor_si128(MSG1, MSG3);
382 
383  // Rounds 64-67
384  E0 = _mm_sha1nexte_epu32(E0, MSG0);
385  E1 = ABCD;
386  MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
387  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
388  MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
389  MSG2 = _mm_xor_si128(MSG2, MSG0);
390 
391  // Rounds 68-71
392  E1 = _mm_sha1nexte_epu32(E1, MSG1);
393  E0 = ABCD;
394  MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
395  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
396  MSG3 = _mm_xor_si128(MSG3, MSG1);
397 
398  // Rounds 72-75
399  E0 = _mm_sha1nexte_epu32(E0, MSG2);
400  E1 = ABCD;
401  MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
402  ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
403 
404  // Rounds 76-79
405  E1 = _mm_sha1nexte_epu32(E1, MSG3);
406  E0 = ABCD;
407  ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
408 
409  // Add values back to state
410  E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
411  ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
412 
413  data += SHA1::BLOCKSIZE/sizeof(word32);
414  length -= SHA1::BLOCKSIZE;
415  }
416 
417  // Save state
418  ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
419  _mm_storeu_si128(M128_CAST(state), ABCD);
420  state[4] = _mm_extract_epi32(E0, 3);
421 }
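
// Usage sketch (illustrative, not part of the original file): compress one
// pre-padded 64-byte block for the empty message, starting from the SHA-1
// initial values. With BIG_ENDIAN_ORDER the block is interpreted as a raw
// big-endian message stream.
//
//   word32 state[5] = {0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0};
//   word32 block[16] = {};
//   reinterpret_cast<byte*>(block)[0] = 0x80;   // padding byte; message bit length is 0
//   SHA1_HashMultipleBlocks_SHANI(state, block, SHA1::BLOCKSIZE, BIG_ENDIAN_ORDER);
//   // state[] now holds DA39A3EE 5E6B4B0D 3255BFEF 95601890 AFD80709,
//   // the SHA-1 digest of the empty message.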
422 
423 // Based on http://software.intel.com/en-us/articles/intel-sha-extensions and code by Sean Gulley.
424 void SHA256_HashMultipleBlocks_SHANI(word32 *state, const word32 *data, size_t length, ByteOrder order)
425 {
426  CRYPTOPP_ASSERT(state);
427  CRYPTOPP_ASSERT(data);
428  CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
429 
430  __m128i STATE0, STATE1;
431  __m128i MSG, TMP, MASK;
432  __m128i TMSG0, TMSG1, TMSG2, TMSG3;
433  __m128i ABEF_SAVE, CDGH_SAVE;
434 
435  // Load initial values
436  TMP = _mm_loadu_si128(M128_CAST(&state[0]));
437  STATE1 = _mm_loadu_si128(M128_CAST(&state[4]));
438 
439  // IA-32 SHA is little endian, SHA::Transform is big endian,
440  // and SHA::HashMultipleBlocks can be either. ByteOrder
441  // allows us to avoid extra endian reversals. It saves 1.0 cpb.
442  MASK = order == BIG_ENDIAN_ORDER ? // Data arrangement
443  _mm_set_epi8(12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3) :
444  _mm_set_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0) ;
445 
446  TMP = _mm_shuffle_epi32(TMP, 0xB1); // CDAB
447  STATE1 = _mm_shuffle_epi32(STATE1, 0x1B); // EFGH
448  STATE0 = _mm_alignr_epi8(TMP, STATE1, 8); // ABEF
449  STATE1 = _mm_blend_epi16(STATE1, TMP, 0xF0); // CDGH
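
// The single-letter comments above list vector elements from the most
// significant dword down to element 0. SHA256RNDS2 expects the eight working
// variables split across two registers as ABEF and CDGH, so the linear A..H
// state loaded from memory is repacked here and unpacked again after the loop:
//
//   state[0..3] = {A,B,C,D}, state[4..7] = {E,F,G,H}
//     --> STATE0 = (A,B,E,F), STATE1 = (C,D,G,H)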
450 
451  while (length >= SHA256::BLOCKSIZE)
452  {
453  // Save current hash
454  ABEF_SAVE = STATE0;
455  CDGH_SAVE = STATE1;
456 
457  // Rounds 0-3
458  MSG = _mm_loadu_si128(CONST_M128_CAST(data+0));
459  TMSG0 = _mm_shuffle_epi8(MSG, MASK);
460  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0xE9B5DBA5B5C0FBCF), W64LIT(0x71374491428A2F98)));
461  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
462  MSG = _mm_shuffle_epi32(MSG, 0x0E);
463  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
464 
465  // Rounds 4-7
466  TMSG1 = _mm_loadu_si128(CONST_M128_CAST(data+4));
467  TMSG1 = _mm_shuffle_epi8(TMSG1, MASK);
468  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0xAB1C5ED5923F82A4), W64LIT(0x59F111F13956C25B)));
469  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
470  MSG = _mm_shuffle_epi32(MSG, 0x0E);
471  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
472  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
473 
474  // Rounds 8-11
475  TMSG2 = _mm_loadu_si128(CONST_M128_CAST(data+8));
476  TMSG2 = _mm_shuffle_epi8(TMSG2, MASK);
477  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x550C7DC3243185BE), W64LIT(0x12835B01D807AA98)));
478  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
479  MSG = _mm_shuffle_epi32(MSG, 0x0E);
480  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
481  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
482 
483  // Rounds 12-15
484  TMSG3 = _mm_loadu_si128(CONST_M128_CAST(data+12));
485  TMSG3 = _mm_shuffle_epi8(TMSG3, MASK);
486  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC19BF1749BDC06A7), W64LIT(0x80DEB1FE72BE5D74)));
487  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
488  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
489  TMSG0 = _mm_add_epi32(TMSG0, TMP);
490  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
491  MSG = _mm_shuffle_epi32(MSG, 0x0E);
492  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
493  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
494 
495  // Rounds 16-19
496  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x240CA1CC0FC19DC6), W64LIT(0xEFBE4786E49B69C1)));
497  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
498  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
499  TMSG1 = _mm_add_epi32(TMSG1, TMP);
500  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
501  MSG = _mm_shuffle_epi32(MSG, 0x0E);
502  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
503  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
504 
505  // Rounds 20-23
506  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x76F988DA5CB0A9DC), W64LIT(0x4A7484AA2DE92C6F)));
507  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
508  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
509  TMSG2 = _mm_add_epi32(TMSG2, TMP);
510  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
511  MSG = _mm_shuffle_epi32(MSG, 0x0E);
512  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
513  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
514 
515  // Rounds 24-27
516  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xBF597FC7B00327C8), W64LIT(0xA831C66D983E5152)));
517  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
518  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
519  TMSG3 = _mm_add_epi32(TMSG3, TMP);
520  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
521  MSG = _mm_shuffle_epi32(MSG, 0x0E);
522  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
523  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
524 
525  // Rounds 28-31
526  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x1429296706CA6351), W64LIT(0xD5A79147C6E00BF3)));
527  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
528  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
529  TMSG0 = _mm_add_epi32(TMSG0, TMP);
530  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
531  MSG = _mm_shuffle_epi32(MSG, 0x0E);
532  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
533  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
534 
535  // Rounds 32-35
536  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x53380D134D2C6DFC), W64LIT(0x2E1B213827B70A85)));
537  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
538  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
539  TMSG1 = _mm_add_epi32(TMSG1, TMP);
540  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
541  MSG = _mm_shuffle_epi32(MSG, 0x0E);
542  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
543  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
544 
545  // Rounds 36-39
546  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x92722C8581C2C92E), W64LIT(0x766A0ABB650A7354)));
547  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
548  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
549  TMSG2 = _mm_add_epi32(TMSG2, TMP);
550  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
551  MSG = _mm_shuffle_epi32(MSG, 0x0E);
552  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
553  TMSG0 = _mm_sha256msg1_epu32(TMSG0, TMSG1);
554 
555  // Rounds 40-43
556  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0xC76C51A3C24B8B70), W64LIT(0xA81A664BA2BFE8A1)));
557  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
558  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
559  TMSG3 = _mm_add_epi32(TMSG3, TMP);
560  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
561  MSG = _mm_shuffle_epi32(MSG, 0x0E);
562  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
563  TMSG1 = _mm_sha256msg1_epu32(TMSG1, TMSG2);
564 
565  // Rounds 44-47
566  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0x106AA070F40E3585), W64LIT(0xD6990624D192E819)));
567  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
568  TMP = _mm_alignr_epi8(TMSG3, TMSG2, 4);
569  TMSG0 = _mm_add_epi32(TMSG0, TMP);
570  TMSG0 = _mm_sha256msg2_epu32(TMSG0, TMSG3);
571  MSG = _mm_shuffle_epi32(MSG, 0x0E);
572  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
573  TMSG2 = _mm_sha256msg1_epu32(TMSG2, TMSG3);
574 
575  // Rounds 48-51
576  MSG = _mm_add_epi32(TMSG0, _mm_set_epi64x(W64LIT(0x34B0BCB52748774C), W64LIT(0x1E376C0819A4C116)));
577  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
578  TMP = _mm_alignr_epi8(TMSG0, TMSG3, 4);
579  TMSG1 = _mm_add_epi32(TMSG1, TMP);
580  TMSG1 = _mm_sha256msg2_epu32(TMSG1, TMSG0);
581  MSG = _mm_shuffle_epi32(MSG, 0x0E);
582  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
583  TMSG3 = _mm_sha256msg1_epu32(TMSG3, TMSG0);
584 
585  // Rounds 52-55
586  MSG = _mm_add_epi32(TMSG1, _mm_set_epi64x(W64LIT(0x682E6FF35B9CCA4F), W64LIT(0x4ED8AA4A391C0CB3)));
587  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
588  TMP = _mm_alignr_epi8(TMSG1, TMSG0, 4);
589  TMSG2 = _mm_add_epi32(TMSG2, TMP);
590  TMSG2 = _mm_sha256msg2_epu32(TMSG2, TMSG1);
591  MSG = _mm_shuffle_epi32(MSG, 0x0E);
592  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
593 
594  // Rounds 56-59
595  MSG = _mm_add_epi32(TMSG2, _mm_set_epi64x(W64LIT(0x8CC7020884C87814), W64LIT(0x78A5636F748F82EE)));
596  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
597  TMP = _mm_alignr_epi8(TMSG2, TMSG1, 4);
598  TMSG3 = _mm_add_epi32(TMSG3, TMP);
599  TMSG3 = _mm_sha256msg2_epu32(TMSG3, TMSG2);
600  MSG = _mm_shuffle_epi32(MSG, 0x0E);
601  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
602 
603  // Rounds 60-63
604  MSG = _mm_add_epi32(TMSG3, _mm_set_epi64x(W64LIT(0xC67178F2BEF9A3F7), W64LIT(0xA4506CEB90BEFFFA)));
605  STATE1 = _mm_sha256rnds2_epu32(STATE1, STATE0, MSG);
606  MSG = _mm_shuffle_epi32(MSG, 0x0E);
607  STATE0 = _mm_sha256rnds2_epu32(STATE0, STATE1, MSG);
608 
609  // Add values back to state
610  STATE0 = _mm_add_epi32(STATE0, ABEF_SAVE);
611  STATE1 = _mm_add_epi32(STATE1, CDGH_SAVE);
612 
613  data += SHA256::BLOCKSIZE/sizeof(word32);
614  length -= SHA256::BLOCKSIZE;
615  }
616 
617  TMP = _mm_shuffle_epi32(STATE0, 0x1B); // FEBA
618  STATE1 = _mm_shuffle_epi32(STATE1, 0xB1); // DCHG
619  STATE0 = _mm_blend_epi16(TMP, STATE1, 0xF0); // DCBA
620  STATE1 = _mm_alignr_epi8(STATE1, TMP, 8); // HGFE
621 
622  // Save state
623  _mm_storeu_si128(M128_CAST(&state[0]), STATE0);
624  _mm_storeu_si128(M128_CAST(&state[4]), STATE1);
625 }
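
// Usage sketch (illustrative, not part of the original file): one pre-padded
// block for the three-byte message "abc", starting from the SHA-256 initial
// values. The final 8 bytes of the block encode the 24-bit message length.
//
//   word32 state[8] = {0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
//                      0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19};
//   word32 block[16] = {};
//   byte* b = reinterpret_cast<byte*>(block);
//   b[0] = 'a'; b[1] = 'b'; b[2] = 'c'; b[3] = 0x80;   // message || 0x80 padding
//   b[63] = 0x18;                                      // bit length = 24
//   SHA256_HashMultipleBlocks_SHANI(state, block, SHA256::BLOCKSIZE, BIG_ENDIAN_ORDER);
//   // state[] now holds BA7816BF 8F01CFEA 414140DE 5DAE2223
//   //                   B00361A3 96177A9C B410FF61 F20015AD, SHA-256("abc").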
626 #endif // CRYPTOPP_SHANI_AVAILABLE
627 
628 ///////////////////////////////////
629 // end of Walton and Gulley code //
630 ///////////////////////////////////
631 
632 // ***************** ARMV8 SHA ********************
633 
634 /////////////////////////////////////////////////////////////
635 // start of Walton, Schneiders, O'Rourke and Hovsmith code //
636 /////////////////////////////////////////////////////////////
637 
638 #if CRYPTOPP_ARM_SHA1_AVAILABLE
639 void SHA1_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
640 {
641  CRYPTOPP_ASSERT(state);
642  CRYPTOPP_ASSERT(data);
643  CRYPTOPP_ASSERT(length >= SHA1::BLOCKSIZE);
644 
645  uint32x4_t C0, C1, C2, C3;
646  uint32x4_t ABCD, ABCD_SAVED;
647  uint32x4_t MSG0, MSG1, MSG2, MSG3;
648  uint32x4_t TMP0, TMP1;
649  uint32_t E0, E0_SAVED, E1;
650 
651  // Load initial values
652  C0 = vdupq_n_u32(0x5A827999);
653  C1 = vdupq_n_u32(0x6ED9EBA1);
654  C2 = vdupq_n_u32(0x8F1BBCDC);
655  C3 = vdupq_n_u32(0xCA62C1D6);
656 
657  ABCD = vld1q_u32(&state[0]);
658  E0 = state[4];
659 
660  while (length >= SHA1::BLOCKSIZE)
661  {
662  // Save current hash
663  ABCD_SAVED = ABCD;
664  E0_SAVED = E0;
665 
666  MSG0 = vld1q_u32(data + 0);
667  MSG1 = vld1q_u32(data + 4);
668  MSG2 = vld1q_u32(data + 8);
669  MSG3 = vld1q_u32(data + 12);
670 
671  if (order == BIG_ENDIAN_ORDER) // Data arrangement
672  {
673  MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
674  MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
675  MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
676  MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
677  }
678 
679  TMP0 = vaddq_u32(MSG0, C0);
680  TMP1 = vaddq_u32(MSG1, C0);
681 
682  // Rounds 0-3
683  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
684  ABCD = vsha1cq_u32(ABCD, E0, TMP0);
685  TMP0 = vaddq_u32(MSG2, C0);
686  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
687 
688  // Rounds 4-7
689  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
690  ABCD = vsha1cq_u32(ABCD, E1, TMP1);
691  TMP1 = vaddq_u32(MSG3, C0);
692  MSG0 = vsha1su1q_u32(MSG0, MSG3);
693  MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
694 
695  // Rounds 8-11
696  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
697  ABCD = vsha1cq_u32(ABCD, E0, TMP0);
698  TMP0 = vaddq_u32(MSG0, C0);
699  MSG1 = vsha1su1q_u32(MSG1, MSG0);
700  MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
701 
702  // Rounds 12-15
703  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
704  ABCD = vsha1cq_u32(ABCD, E1, TMP1);
705  TMP1 = vaddq_u32(MSG1, C1);
706  MSG2 = vsha1su1q_u32(MSG2, MSG1);
707  MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
708 
709  // Rounds 16-19
710  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
711  ABCD = vsha1cq_u32(ABCD, E0, TMP0);
712  TMP0 = vaddq_u32(MSG2, C1);
713  MSG3 = vsha1su1q_u32(MSG3, MSG2);
714  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
715 
716  // Rounds 20-23
717  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
718  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
719  TMP1 = vaddq_u32(MSG3, C1);
720  MSG0 = vsha1su1q_u32(MSG0, MSG3);
721  MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
722 
723  // Rounds 24-27
724  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
725  ABCD = vsha1pq_u32(ABCD, E0, TMP0);
726  TMP0 = vaddq_u32(MSG0, C1);
727  MSG1 = vsha1su1q_u32(MSG1, MSG0);
728  MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
729 
730  // Rounds 28-31
731  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
732  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
733  TMP1 = vaddq_u32(MSG1, C1);
734  MSG2 = vsha1su1q_u32(MSG2, MSG1);
735  MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
736 
737  // Rounds 32-35
738  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
739  ABCD = vsha1pq_u32(ABCD, E0, TMP0);
740  TMP0 = vaddq_u32(MSG2, C2);
741  MSG3 = vsha1su1q_u32(MSG3, MSG2);
742  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
743 
744  // Rounds 36-39
745  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
746  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
747  TMP1 = vaddq_u32(MSG3, C2);
748  MSG0 = vsha1su1q_u32(MSG0, MSG3);
749  MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
750 
751  // Rounds 40-43
752  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
753  ABCD = vsha1mq_u32(ABCD, E0, TMP0);
754  TMP0 = vaddq_u32(MSG0, C2);
755  MSG1 = vsha1su1q_u32(MSG1, MSG0);
756  MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
757 
758  // Rounds 44-47
759  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
760  ABCD = vsha1mq_u32(ABCD, E1, TMP1);
761  TMP1 = vaddq_u32(MSG1, C2);
762  MSG2 = vsha1su1q_u32(MSG2, MSG1);
763  MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
764 
765  // Rounds 48-51
766  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
767  ABCD = vsha1mq_u32(ABCD, E0, TMP0);
768  TMP0 = vaddq_u32(MSG2, C2);
769  MSG3 = vsha1su1q_u32(MSG3, MSG2);
770  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
771 
772  // Rounds 52-55
773  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
774  ABCD = vsha1mq_u32(ABCD, E1, TMP1);
775  TMP1 = vaddq_u32(MSG3, C3);
776  MSG0 = vsha1su1q_u32(MSG0, MSG3);
777  MSG1 = vsha1su0q_u32(MSG1, MSG2, MSG3);
778 
779  // Rounds 56-59
780  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
781  ABCD = vsha1mq_u32(ABCD, E0, TMP0);
782  TMP0 = vaddq_u32(MSG0, C3);
783  MSG1 = vsha1su1q_u32(MSG1, MSG0);
784  MSG2 = vsha1su0q_u32(MSG2, MSG3, MSG0);
785 
786  // Rounds 60-63
787  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
788  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
789  TMP1 = vaddq_u32(MSG1, C3);
790  MSG2 = vsha1su1q_u32(MSG2, MSG1);
791  MSG3 = vsha1su0q_u32(MSG3, MSG0, MSG1);
792 
793  // Rounds 64-67
794  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
795  ABCD = vsha1pq_u32(ABCD, E0, TMP0);
796  TMP0 = vaddq_u32(MSG2, C3);
797  MSG3 = vsha1su1q_u32(MSG3, MSG2);
798  MSG0 = vsha1su0q_u32(MSG0, MSG1, MSG2);
799 
800  // Rounds 68-71
801  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
802  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
803  TMP1 = vaddq_u32(MSG3, C3);
804  MSG0 = vsha1su1q_u32(MSG0, MSG3);
805 
806  // Rounds 72-75
807  E1 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
808  ABCD = vsha1pq_u32(ABCD, E0, TMP0);
809 
810  // Rounds 76-79
811  E0 = vsha1h_u32(vgetq_lane_u32(ABCD, 0));
812  ABCD = vsha1pq_u32(ABCD, E1, TMP1);
813 
814  E0 += E0_SAVED;
815  ABCD = vaddq_u32(ABCD_SAVED, ABCD);
816 
817  data += SHA1::BLOCKSIZE/sizeof(word32);
818  length -= SHA1::BLOCKSIZE;
819  }
820 
821  // Save state
822  vst1q_u32(&state[0], ABCD);
823  state[4] = E0;
824 }
825 #endif // CRYPTOPP_ARM_SHA1_AVAILABLE
826 
827 #if CRYPTOPP_ARM_SHA2_AVAILABLE
828 void SHA256_HashMultipleBlocks_ARMV8(word32 *state, const word32 *data, size_t length, ByteOrder order)
829 {
830  CRYPTOPP_ASSERT(state);
831  CRYPTOPP_ASSERT(data);
832  CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
833 
834  uint32x4_t STATE0, STATE1, ABEF_SAVE, CDGH_SAVE;
835  uint32x4_t MSG0, MSG1, MSG2, MSG3;
836  uint32x4_t TMP0, TMP1, TMP2;
837 
838  // Load initial values
839  STATE0 = vld1q_u32(&state[0]);
840  STATE1 = vld1q_u32(&state[4]);
841 
842  while (length >= SHA256::BLOCKSIZE)
843  {
844  // Save current hash
845  ABEF_SAVE = STATE0;
846  CDGH_SAVE = STATE1;
847 
848  // Load message
849  MSG0 = vld1q_u32(data + 0);
850  MSG1 = vld1q_u32(data + 4);
851  MSG2 = vld1q_u32(data + 8);
852  MSG3 = vld1q_u32(data + 12);
853 
854  if (order == BIG_ENDIAN_ORDER) // Data arrangement
855  {
856  MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG0)));
857  MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG1)));
858  MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG2)));
859  MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(MSG3)));
860  }
861 
862  TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x00]));
863 
864  // Rounds 0-3
865  MSG0 = vsha256su0q_u32(MSG0, MSG1);
866  TMP2 = STATE0;
867  TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x04]));
868  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
869  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
870  MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
871 
872  // Rounds 4-7
873  MSG1 = vsha256su0q_u32(MSG1, MSG2);
874  TMP2 = STATE0;
875  TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x08]));
876  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
877  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
878  MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
879 
880  // Rounds 8-11
881  MSG2 = vsha256su0q_u32(MSG2, MSG3);
882  TMP2 = STATE0;
883  TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x0c]));
884  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
885  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
886  MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
887 
888  // Rounds 12-15
889  MSG3 = vsha256su0q_u32(MSG3, MSG0);
890  TMP2 = STATE0;
891  TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x10]));
892  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
893  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
894  MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
895 
896  // Rounds 16-19
897  MSG0 = vsha256su0q_u32(MSG0, MSG1);
898  TMP2 = STATE0;
899  TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x14]));
900  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
901  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
902  MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
903 
904  // Rounds 20-23
905  MSG1 = vsha256su0q_u32(MSG1, MSG2);
906  TMP2 = STATE0;
907  TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x18]));
908  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
909  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
910  MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
911 
912  // Rounds 24-27
913  MSG2 = vsha256su0q_u32(MSG2, MSG3);
914  TMP2 = STATE0;
915  TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x1c]));
916  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
917  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
918  MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
919 
920  // Rounds 28-31
921  MSG3 = vsha256su0q_u32(MSG3, MSG0);
922  TMP2 = STATE0;
923  TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x20]));
924  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
925  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
926  MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
927 
928  // Rounds 32-35
929  MSG0 = vsha256su0q_u32(MSG0, MSG1);
930  TMP2 = STATE0;
931  TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x24]));
932  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
933  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
934  MSG0 = vsha256su1q_u32(MSG0, MSG2, MSG3);
935 
936  // Rounds 36-39
937  MSG1 = vsha256su0q_u32(MSG1, MSG2);
938  TMP2 = STATE0;
939  TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x28]));
940  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
941  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
942  MSG1 = vsha256su1q_u32(MSG1, MSG3, MSG0);
943 
944  // Rounds 40-43
945  MSG2 = vsha256su0q_u32(MSG2, MSG3);
946  TMP2 = STATE0;
947  TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x2c]));
948  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
949  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
950  MSG2 = vsha256su1q_u32(MSG2, MSG0, MSG1);
951 
952  // Rounds 44-47
953  MSG3 = vsha256su0q_u32(MSG3, MSG0);
954  TMP2 = STATE0;
955  TMP0 = vaddq_u32(MSG0, vld1q_u32(&SHA256_K[0x30]));
956  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
957  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
958  MSG3 = vsha256su1q_u32(MSG3, MSG1, MSG2);
959 
960  // Rounds 48-51
961  TMP2 = STATE0;
962  TMP1 = vaddq_u32(MSG1, vld1q_u32(&SHA256_K[0x34]));
963  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
964  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
965 
966  // Rounds 52-55
967  TMP2 = STATE0;
968  TMP0 = vaddq_u32(MSG2, vld1q_u32(&SHA256_K[0x38]));
969  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
970  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
971 
972  // Rounds 56-59
973  TMP2 = STATE0;
974  TMP1 = vaddq_u32(MSG3, vld1q_u32(&SHA256_K[0x3c]));
975  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP0);
976  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP0);
977 
978  // Rounds 60-63
979  TMP2 = STATE0;
980  STATE0 = vsha256hq_u32(STATE0, STATE1, TMP1);
981  STATE1 = vsha256h2q_u32(STATE1, TMP2, TMP1);
982 
983  // Add back to state
984  STATE0 = vaddq_u32(STATE0, ABEF_SAVE);
985  STATE1 = vaddq_u32(STATE1, CDGH_SAVE);
986 
987  data += SHA256::BLOCKSIZE/sizeof(word32);
988  length -= SHA256::BLOCKSIZE;
989  }
990 
991  // Save state
992  vst1q_u32(&state[0], STATE0);
993  vst1q_u32(&state[4], STATE1);
994 }
995 #endif // CRYPTOPP_ARM_SHA2_AVAILABLE
996 
997 ///////////////////////////////////////////////////////////
998 // end of Walton, Schneiders, O'Rourke and Hovsmith code //
999 ///////////////////////////////////////////////////////////
1000 
1001 // ***************** Power8 SHA ********************
1002 
1003 //////////////////////////////////////////////////
1004 // start Gustavo, Serra, Scalet and Walton code //
1005 //////////////////////////////////////////////////
1006 
1007 #if CRYPTOPP_POWER8_SHA_AVAILABLE
1008 
1009 // Indexes into the S[] array
1010 enum {A=0, B=1, C, D, E, F, G, H};
1011 
1012 inline
1013 uint32x4_p VecLoad32(const word32* data, int offset)
1014 {
1015 #if (CRYPTOPP_LITTLE_ENDIAN)
1016  const uint8x16_p mask = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
1017  const uint32x4_p val = VecLoad(offset, data);
1018  return (uint32x4_p)VecPermute(val, val, mask);
1019 #else
1020  return VecLoad(offset, data);
1021 #endif
1022 }
1023 
1024 template<class T> inline
1025 void VecStore32(const T data, word32 dest[4])
1026 {
1027  VecStore(data, dest);
1028 }
1029 
1030 inline
1031 uint32x4_p VectorCh(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
1032 {
1033  // The trick below is due to Andy Polyakov and Jack Lloyd
1034  return vec_sel(z,y,x);
1035 }
1036 
1037 inline
1038 uint32x4_p VectorMaj(const uint32x4_p x, const uint32x4_p y, const uint32x4_p z)
1039 {
1040  // The trick below is due to Andy Polyakov and Jack Lloyd
1041  return vec_sel(y, z, VecXor(x, y));
1042 }
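
// Why vec_sel computes these functions (the 64-bit overloads below use the
// same identities): vec_sel(a, b, m) returns (a & ~m) | (b & m), a bitwise
// select of b where m is 1 and a where m is 0.
//
//   Ch(x,y,z)  = (x & y) ^ (~x & z)              select y where x=1, else z
//              = vec_sel(z, y, x)
//   Maj(x,y,z) = (x & y) ^ (x & z) ^ (y & z)     if x == y the majority is x,
//                                                otherwise it is z
//              = vec_sel(y, z, VecXor(x, y))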
1043 
1044 inline
1045 uint32x4_p Vector_sigma0(const uint32x4_p val)
1046 {
1047  return VecSHA256<0,0>(val);
1048 }
1049 
1050 inline
1051 uint32x4_p Vector_sigma1(const uint32x4_p val)
1052 {
1053  return VecSHA256<0,0xf>(val);
1054 }
1055 
1056 inline
1057 uint32x4_p VectorSigma0(const uint32x4_p val)
1058 {
1059  return VecSHA256<1,0>(val);
1060 }
1061 
1062 inline
1063 uint32x4_p VectorSigma1(const uint32x4_p val)
1064 {
1065  return VecSHA256<1,0xf>(val);
1066 }
1067 
1068 inline
1069 uint32x4_p VectorPack(const uint32x4_p a, const uint32x4_p b,
1070  const uint32x4_p c, const uint32x4_p d)
1071 {
1072  const uint8x16_p m1 = {0,1,2,3, 16,17,18,19, 0,0,0,0, 0,0,0,0};
1073  const uint8x16_p m2 = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
1074  return VecPermute(VecPermute(a,b,m1), VecPermute(c,d,m1), m2);
1075 }
1076 
1077 template <unsigned int R> inline
1078 void SHA256_ROUND1(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K, const uint32x4_p M)
1079 {
1080  uint32x4_p T1, T2;
1081 
1082  W[R] = M;
1083  T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
1084  T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1085 
1086  S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1087  S[E] = S[D] + T1;
1088  S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1089  S[A] = T1 + T2;
1090 }
1091 
1092 template <unsigned int R> inline
1093 void SHA256_ROUND2(uint32x4_p W[16], uint32x4_p S[8], const uint32x4_p K)
1094 {
1095  // Indexes into the W[] array
1096  enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
1097 
1098  const uint32x4_p s0 = Vector_sigma0(W[IDX1]);
1099  const uint32x4_p s1 = Vector_sigma1(W[IDX14]);
1100 
1101  uint32x4_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
1102  T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
1103  uint32x4_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1104 
1105  S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1106  S[E] = S[D] + T1;
1107  S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1108  S[A] = T1 + T2;
1109 }
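
// How the circular 16-entry schedule above maps onto FIPS 180-4 for round t
// (t >= 16, t congruent to R mod 16). Only one 32-bit lane of each vector
// carries the live value; the other lanes are don't-cares until VectorPack
// gathers the packed result. SHA512_ROUND2 below uses the same indexing on
// 64-bit lanes.
//
//   W[IDX0]  holds W(t-16)     W[IDX1]  holds W(t-15)
//   W[IDX9]  holds W(t-7)      W[IDX14] holds W(t-2)
//   W(t) = sigma1(W(t-2)) + W(t-7) + sigma0(W(t-15)) + W(t-16)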
1110 
1111 void SHA256_HashMultipleBlocks_POWER8(word32 *state, const word32 *data, size_t length, ByteOrder order)
1112 {
1113  CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
1114  CRYPTOPP_ASSERT(length >= SHA256::BLOCKSIZE);
1115  CRYPTOPP_UNUSED(order);
1116 
1117  const uint32_t* k = reinterpret_cast<const uint32_t*>(SHA256_K);
1118  const uint32_t* m = reinterpret_cast<const uint32_t*>(data);
1119 
1120  uint32x4_p abcd = VecLoad(state+0);
1121  uint32x4_p efgh = VecLoad(state+4);
1122  uint32x4_p W[16], S[8], vm, vk;
1123 
1124  size_t blocks = length / SHA256::BLOCKSIZE;
1125  while (blocks--)
1126  {
1127  unsigned int offset=0;
1128 
1129  S[A] = abcd; S[E] = efgh;
1130  S[B] = VecShiftLeftOctet<4>(S[A]);
1131  S[F] = VecShiftLeftOctet<4>(S[E]);
1132  S[C] = VecShiftLeftOctet<4>(S[B]);
1133  S[G] = VecShiftLeftOctet<4>(S[F]);
1134  S[D] = VecShiftLeftOctet<4>(S[C]);
1135  S[H] = VecShiftLeftOctet<4>(S[G]);
1136 
1137  // Rounds 0-15
1138  vk = VecLoad(offset, k);
1139  vm = VecLoad32(m, offset);
1140  SHA256_ROUND1<0>(W,S, vk,vm);
1141  offset+=16;
1142 
1143  vk = VecShiftLeftOctet<4>(vk);
1144  vm = VecShiftLeftOctet<4>(vm);
1145  SHA256_ROUND1<1>(W,S, vk,vm);
1146 
1147  vk = VecShiftLeftOctet<4>(vk);
1148  vm = VecShiftLeftOctet<4>(vm);
1149  SHA256_ROUND1<2>(W,S, vk,vm);
1150 
1151  vk = VecShiftLeftOctet<4>(vk);
1152  vm = VecShiftLeftOctet<4>(vm);
1153  SHA256_ROUND1<3>(W,S, vk,vm);
1154 
1155  vk = VecLoad(offset, k);
1156  vm = VecLoad32(m, offset);
1157  SHA256_ROUND1<4>(W,S, vk,vm);
1158  offset+=16;
1159 
1160  vk = VecShiftLeftOctet<4>(vk);
1161  vm = VecShiftLeftOctet<4>(vm);
1162  SHA256_ROUND1<5>(W,S, vk,vm);
1163 
1164  vk = VecShiftLeftOctet<4>(vk);
1165  vm = VecShiftLeftOctet<4>(vm);
1166  SHA256_ROUND1<6>(W,S, vk,vm);
1167 
1168  vk = VecShiftLeftOctet<4>(vk);
1169  vm = VecShiftLeftOctet<4>(vm);
1170  SHA256_ROUND1<7>(W,S, vk,vm);
1171 
1172  vk = VecLoad(offset, k);
1173  vm = VecLoad32(m, offset);
1174  SHA256_ROUND1<8>(W,S, vk,vm);
1175  offset+=16;
1176 
1177  vk = VecShiftLeftOctet<4>(vk);
1178  vm = VecShiftLeftOctet<4>(vm);
1179  SHA256_ROUND1<9>(W,S, vk,vm);
1180 
1181  vk = VecShiftLeftOctet<4>(vk);
1182  vm = VecShiftLeftOctet<4>(vm);
1183  SHA256_ROUND1<10>(W,S, vk,vm);
1184 
1185  vk = VecShiftLeftOctet<4>(vk);
1186  vm = VecShiftLeftOctet<4>(vm);
1187  SHA256_ROUND1<11>(W,S, vk,vm);
1188 
1189  vk = VecLoad(offset, k);
1190  vm = VecLoad32(m, offset);
1191  SHA256_ROUND1<12>(W,S, vk,vm);
1192  offset+=16;
1193 
1194  vk = VecShiftLeftOctet<4>(vk);
1195  vm = VecShiftLeftOctet<4>(vm);
1196  SHA256_ROUND1<13>(W,S, vk,vm);
1197 
1198  vk = VecShiftLeftOctet<4>(vk);
1199  vm = VecShiftLeftOctet<4>(vm);
1200  SHA256_ROUND1<14>(W,S, vk,vm);
1201 
1202  vk = VecShiftLeftOctet<4>(vk);
1203  vm = VecShiftLeftOctet<4>(vm);
1204  SHA256_ROUND1<15>(W,S, vk,vm);
1205 
1206  m += 16; // 32-bit words, not bytes
1207 
1208  // Rounds 16-63
1209  for (unsigned int i=16; i<64; i+=16)
1210  {
1211  vk = VecLoad(offset, k);
1212  SHA256_ROUND2<0>(W,S, vk);
1213  SHA256_ROUND2<1>(W,S, VecShiftLeftOctet<4>(vk));
1214  SHA256_ROUND2<2>(W,S, VecShiftLeftOctet<8>(vk));
1215  SHA256_ROUND2<3>(W,S, VecShiftLeftOctet<12>(vk));
1216  offset+=16;
1217 
1218  vk = VecLoad(offset, k);
1219  SHA256_ROUND2<4>(W,S, vk);
1220  SHA256_ROUND2<5>(W,S, VecShiftLeftOctet<4>(vk));
1221  SHA256_ROUND2<6>(W,S, VecShiftLeftOctet<8>(vk));
1222  SHA256_ROUND2<7>(W,S, VecShiftLeftOctet<12>(vk));
1223  offset+=16;
1224 
1225  vk = VecLoad(offset, k);
1226  SHA256_ROUND2<8>(W,S, vk);
1227  SHA256_ROUND2<9>(W,S, VecShiftLeftOctet<4>(vk));
1228  SHA256_ROUND2<10>(W,S, VecShiftLeftOctet<8>(vk));
1229  SHA256_ROUND2<11>(W,S, VecShiftLeftOctet<12>(vk));
1230  offset+=16;
1231 
1232  vk = VecLoad(offset, k);
1233  SHA256_ROUND2<12>(W,S, vk);
1234  SHA256_ROUND2<13>(W,S, VecShiftLeftOctet<4>(vk));
1235  SHA256_ROUND2<14>(W,S, VecShiftLeftOctet<8>(vk));
1236  SHA256_ROUND2<15>(W,S, VecShiftLeftOctet<12>(vk));
1237  offset+=16;
1238  }
1239 
1240  abcd += VectorPack(S[A],S[B],S[C],S[D]);
1241  efgh += VectorPack(S[E],S[F],S[G],S[H]);
1242  }
1243 
1244  VecStore32(abcd, state+0);
1245  VecStore32(efgh, state+4);
1246 }
1247 
1248 inline
1249 void VecStore64(const uint64x2_p val, word64* data)
1250 {
1251  VecStore(val, data);
1252 }
1253 
1254 inline
1255 uint64x2_p VecLoad64(const word64* data, int offset)
1256 {
1257 #if (CRYPTOPP_LITTLE_ENDIAN)
1258  const uint8x16_p mask = {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15};
1259  return VecPermute(VecLoad(offset, data), mask);
1260 #else
1261  return VecLoad(offset, data);
1262 #endif
1263 }
1264 
1265 inline
1266 uint64x2_p VectorCh(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
1267 {
1268  // The trick below is due to Andy Polyakov and Jack Lloyd
1269  return vec_sel(z,y,x);
1270 }
1271 
1272 inline
1273 uint64x2_p VectorMaj(const uint64x2_p x, const uint64x2_p y, const uint64x2_p z)
1274 {
1275  // The trick below is due to Andy Polyakov and Jack Lloyd
1276  return vec_sel(y, z, VecXor(x, y));
1277 }
1278 
1279 inline
1280 uint64x2_p Vector_sigma0(const uint64x2_p val)
1281 {
1282  return VecSHA512<0,0>(val);
1283 }
1284 
1285 inline
1286 uint64x2_p Vector_sigma1(const uint64x2_p val)
1287 {
1288  return VecSHA512<0,0xf>(val);
1289 }
1290 
1291 inline
1292 uint64x2_p VectorSigma0(const uint64x2_p val)
1293 {
1294  return VecSHA512<1,0>(val);
1295 }
1296 
1297 inline
1298 uint64x2_p VectorSigma1(const uint64x2_p val)
1299 {
1300  return VecSHA512<1,0xf>(val);
1301 }
1302 
1303 inline
1304 uint64x2_p VectorPack(const uint64x2_p x, const uint64x2_p y)
1305 {
1306  const uint8x16_p m = {0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
1307  return VecPermute(x,y,m);
1308 }
1309 
1310 template <unsigned int R> inline
1311 void SHA512_ROUND1(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K, const uint64x2_p M)
1312 {
1313  uint64x2_p T1, T2;
1314 
1315  W[R] = M;
1316  T1 = S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K + M;
1317  T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1318 
1319  S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1320  S[E] = S[D] + T1;
1321  S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1322  S[A] = T1 + T2;
1323 }
1324 
1325 template <unsigned int R> inline
1326 void SHA512_ROUND2(uint64x2_p W[16], uint64x2_p S[8], const uint64x2_p K)
1327 {
1328  // Indexes into the W[] array
1329  enum {IDX0=(R+0)&0xf, IDX1=(R+1)&0xf, IDX9=(R+9)&0xf, IDX14=(R+14)&0xf};
1330 
1331  const uint64x2_p s0 = Vector_sigma0(W[IDX1]);
1332  const uint64x2_p s1 = Vector_sigma1(W[IDX14]);
1333 
1334  uint64x2_p T1 = (W[IDX0] += s0 + s1 + W[IDX9]);
1335  T1 += S[H] + VectorSigma1(S[E]) + VectorCh(S[E],S[F],S[G]) + K;
1336  uint64x2_p T2 = VectorSigma0(S[A]) + VectorMaj(S[A],S[B],S[C]);
1337 
1338  S[H] = S[G]; S[G] = S[F]; S[F] = S[E];
1339  S[E] = S[D] + T1;
1340  S[D] = S[C]; S[C] = S[B]; S[B] = S[A];
1341  S[A] = T1 + T2;
1342 }
1343 
1344 void SHA512_HashMultipleBlocks_POWER8(word64 *state, const word64 *data, size_t length, ByteOrder order)
1345 {
1346  CRYPTOPP_ASSERT(state); CRYPTOPP_ASSERT(data);
1347  CRYPTOPP_ASSERT(length >= SHA512::BLOCKSIZE);
1348  CRYPTOPP_UNUSED(order);
1349 
1350  const uint64_t* k = reinterpret_cast<const uint64_t*>(SHA512_K);
1351  const uint64_t* m = reinterpret_cast<const uint64_t*>(data);
1352 
1353  uint64x2_p ab = VecLoad(state+0);
1354  uint64x2_p cd = VecLoad(state+2);
1355  uint64x2_p ef = VecLoad(state+4);
1356  uint64x2_p gh = VecLoad(state+6);
1357  uint64x2_p W[16], S[8], vm, vk;
1358 
1359  size_t blocks = length / SHA512::BLOCKSIZE;
1360  while (blocks--)
1361  {
1362  unsigned int offset=0;
1363 
1364  S[A] = ab; S[C] = cd;
1365  S[E] = ef; S[G] = gh;
1366  S[B] = VecShiftLeftOctet<8>(S[A]);
1367  S[D] = VecShiftLeftOctet<8>(S[C]);
1368  S[F] = VecShiftLeftOctet<8>(S[E]);
1369  S[H] = VecShiftLeftOctet<8>(S[G]);
1370 
1371  // Rounds 0-15
1372  vk = VecLoad(offset, k);
1373  vm = VecLoad64(m, offset);
1374  SHA512_ROUND1<0>(W,S, vk,vm);
1375  offset+=16;
1376 
1377  vk = VecShiftLeftOctet<8>(vk);
1378  vm = VecShiftLeftOctet<8>(vm);
1379  SHA512_ROUND1<1>(W,S, vk,vm);
1380 
1381  vk = VecLoad(offset, k);
1382  vm = VecLoad64(m, offset);
1383  SHA512_ROUND1<2>(W,S, vk,vm);
1384  offset+=16;
1385 
1386  vk = VecShiftLeftOctet<8>(vk);
1387  vm = VecShiftLeftOctet<8>(vm);
1388  SHA512_ROUND1<3>(W,S, vk,vm);
1389 
1390  vk = VecLoad(offset, k);
1391  vm = VecLoad64(m, offset);
1392  SHA512_ROUND1<4>(W,S, vk,vm);
1393  offset+=16;
1394 
1395  vk = VecShiftLeftOctet<8>(vk);
1396  vm = VecShiftLeftOctet<8>(vm);
1397  SHA512_ROUND1<5>(W,S, vk,vm);
1398 
1399  vk = VecLoad(offset, k);
1400  vm = VecLoad64(m, offset);
1401  SHA512_ROUND1<6>(W,S, vk,vm);
1402  offset+=16;
1403 
1404  vk = VecShiftLeftOctet<8>(vk);
1405  vm = VecShiftLeftOctet<8>(vm);
1406  SHA512_ROUND1<7>(W,S, vk,vm);
1407 
1408  vk = VecLoad(offset, k);
1409  vm = VecLoad64(m, offset);
1410  SHA512_ROUND1<8>(W,S, vk,vm);
1411  offset+=16;
1412 
1413  vk = VecShiftLeftOctet<8>(vk);
1414  vm = VecShiftLeftOctet<8>(vm);
1415  SHA512_ROUND1<9>(W,S, vk,vm);
1416 
1417  vk = VecLoad(offset, k);
1418  vm = VecLoad64(m, offset);
1419  SHA512_ROUND1<10>(W,S, vk,vm);
1420  offset+=16;
1421 
1422  vk = VecShiftLeftOctet<8>(vk);
1423  vm = VecShiftLeftOctet<8>(vm);
1424  SHA512_ROUND1<11>(W,S, vk,vm);
1425 
1426  vk = VecLoad(offset, k);
1427  vm = VecLoad64(m, offset);
1428  SHA512_ROUND1<12>(W,S, vk,vm);
1429  offset+=16;
1430 
1431  vk = VecShiftLeftOctet<8>(vk);
1432  vm = VecShiftLeftOctet<8>(vm);
1433  SHA512_ROUND1<13>(W,S, vk,vm);
1434 
1435  vk = VecLoad(offset, k);
1436  vm = VecLoad64(m, offset);
1437  SHA512_ROUND1<14>(W,S, vk,vm);
1438  offset+=16;
1439 
1440  vk = VecShiftLeftOctet<8>(vk);
1441  vm = VecShiftLeftOctet<8>(vm);
1442  SHA512_ROUND1<15>(W,S, vk,vm);
1443 
1444  m += 16; // 64-bit words, not bytes
1445 
1446  // Rounds 16-79
1447  for (unsigned int i=16; i<80; i+=16)
1448  {
1449  vk = VecLoad(offset, k);
1450  SHA512_ROUND2<0>(W,S, vk);
1451  SHA512_ROUND2<1>(W,S, VecShiftLeftOctet<8>(vk));
1452  offset+=16;
1453 
1454  vk = VecLoad(offset, k);
1455  SHA512_ROUND2<2>(W,S, vk);
1456  SHA512_ROUND2<3>(W,S, VecShiftLeftOctet<8>(vk));
1457  offset+=16;
1458 
1459  vk = VecLoad(offset, k);
1460  SHA512_ROUND2<4>(W,S, vk);
1461  SHA512_ROUND2<5>(W,S, VecShiftLeftOctet<8>(vk));
1462  offset+=16;
1463 
1464  vk = VecLoad(offset, k);
1465  SHA512_ROUND2<6>(W,S, vk);
1466  SHA512_ROUND2<7>(W,S, VecShiftLeftOctet<8>(vk));
1467  offset+=16;
1468 
1469  vk = VecLoad(offset, k);
1470  SHA512_ROUND2<8>(W,S, vk);
1471  SHA512_ROUND2<9>(W,S, VecShiftLeftOctet<8>(vk));
1472  offset+=16;
1473 
1474  vk = VecLoad(offset, k);
1475  SHA512_ROUND2<10>(W,S, vk);
1476  SHA512_ROUND2<11>(W,S, VecShiftLeftOctet<8>(vk));
1477  offset+=16;
1478 
1479  vk = VecLoad(offset, k);
1480  SHA512_ROUND2<12>(W,S, vk);
1481  SHA512_ROUND2<13>(W,S, VecShiftLeftOctet<8>(vk));
1482  offset+=16;
1483 
1484  vk = VecLoad(offset, k);
1485  SHA512_ROUND2<14>(W,S, vk);
1486  SHA512_ROUND2<15>(W,S, VecShiftLeftOctet<8>(vk));
1487  offset+=16;
1488  }
1489 
1490  ab += VectorPack(S[A],S[B]);
1491  cd += VectorPack(S[C],S[D]);
1492  ef += VectorPack(S[E],S[F]);
1493  gh += VectorPack(S[G],S[H]);
1494  }
1495 
1496  VecStore64(ab, state+0);
1497  VecStore64(cd, state+2);
1498  VecStore64(ef, state+4);
1499  VecStore64(gh, state+6);
1500 }
1501 
1502 #endif // CRYPTOPP_POWER8_SHA_AVAILABLE
1503 
1504 ////////////////////////////////////////////////
1505 // end Gustavo, Serra, Scalet and Walton code //
1506 ////////////////////////////////////////////////
1507 
1508 NAMESPACE_END
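
// Applications do not call these kernels directly. The SHA1, SHA256 and
// SHA512 classes detect CPU features at runtime and route their block
// compression to the fastest implementation available. A typical caller
// simply uses the public hash API, for example:
//
//   #include "sha.h"
//   CryptoPP::SHA256 hash;
//   CryptoPP::byte digest[CryptoPP::SHA256::DIGESTSIZE];
//   hash.CalculateDigest(digest, reinterpret_cast<const CryptoPP::byte*>("abc"), 3);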