Crypto++  6.1
Free C++ class library of cryptographic schemes
adv-simd.h
1 // adv-simd.h - written and placed in the public domain by Jeffrey Walton
2 //
3 // The SIMD based implementations for ciphers that use SSE, NEON and Power7
4 // have a common pattern. Namely, they have a specialized implementation of
5 // AdvancedProcessBlocks which processes multiple blocks using hardware
6 // acceleration. After several implementations we noticed a lot of copy and
7 // paste occurring. adv-simd.h provides a template to avoid the copy and paste.
8 //
9 // There are 7 templates provided in this file. The number following the
10 // function name is the block size of the cipher in bits. The rest of the
11 // name gives the arrangement and acceleration. For example 4x1_SSE means
12 // Intel SSE acceleration using two encrypt (or decrypt) functions: one
13 // that operates on 4 blocks at a time, and one that operates on a single
14 // block.
15 //
16 // * AdvancedProcessBlocks64_6x2_NEON
17 // * AdvancedProcessBlocks128_NEON1x6
18 // * AdvancedProcessBlocks128_6x2_NEON
19 // * AdvancedProcessBlocks64_6x2_SSE
20 // * AdvancedProcessBlocks128_6x2_SSE
21 // * AdvancedProcessBlocks128_4x1_SSE
22 // * AdvancedProcessBlocks128_6x1_ALTIVEC
23 //
24 
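As a quick orientation, the sketch below shows how a cipher's SIMD translation unit typically hooks into one of these templates: it declares a pair of block kernels and forwards its AdvancedProcessBlocks work to the matching template. The MyCipher128_* names are hypothetical and only illustrate the expected shapes; the template signature is the one defined later in this file.

#include "adv-simd.h"

#if (CRYPTOPP_ARM_NEON_AVAILABLE)
NAMESPACE_BEGIN(CryptoPP)

// Hypothetical kernels supplied by the cipher. Each uint64x2_t holds one
// 128-bit block; the blocks are passed by reference and transformed in place.
void MyCipher128_Enc_2_Blocks(uint64x2_t &block0, uint64x2_t &block1,
        const word64 *subkeys, unsigned int rounds);
void MyCipher128_Enc_6_Blocks(uint64x2_t &block0, uint64x2_t &block1,
        uint64x2_t &block2, uint64x2_t &block3, uint64x2_t &block4,
        uint64x2_t &block5, const word64 *subkeys, unsigned int rounds);

// Forwarder called from the cipher's AdvancedProcessBlocks override when NEON
// is available at run time.
inline size_t MyCipher128_Enc_AdvancedProcessBlocks_NEON(const word64 *subKeys,
        size_t rounds, const byte *inBlocks, const byte *xorBlocks,
        byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks128_6x2_NEON(MyCipher128_Enc_2_Blocks,
        MyCipher128_Enc_6_Blocks, subKeys, rounds, inBlocks, xorBlocks,
        outBlocks, length, flags);
}

NAMESPACE_END
#endif  // CRYPTOPP_ARM_NEON_AVAILABLE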
25 #ifndef CRYPTOPP_ADVANCED_SIMD_TEMPLATES
26 #define CRYPTOPP_ADVANCED_SIMD_TEMPLATES
27 
28 #include "config.h"
29 #include "misc.h"
30 #include "stdcpp.h"
31 
32 #if (CRYPTOPP_ARM_NEON_AVAILABLE)
33 # include <arm_neon.h>
34 #endif
35 
36 #if (CRYPTOPP_SSSE3_AVAILABLE)
37 # include <emmintrin.h>
38 # include <pmmintrin.h>
39 # include <tmmintrin.h>
40 #endif
41 
42 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
43 # include "ppc-simd.h"
44 #endif
45 
46 // https://www.spinics.net/lists/gcchelp/msg47735.html and
47 // https://www.spinics.net/lists/gcchelp/msg47749.html
48 #if (CRYPTOPP_GCC_VERSION >= 40900)
49 # define GCC_NO_UBSAN __attribute__ ((no_sanitize_undefined))
50 #else
51 # define GCC_NO_UBSAN
52 #endif
53 
54 // ************************ All block ciphers *********************** //
55 
56 ANONYMOUS_NAMESPACE_BEGIN
57 
58 using CryptoPP::BlockTransformation;
59 
60 CRYPTOPP_CONSTANT(BT_XorInput = BlockTransformation::BT_XorInput)
61 CRYPTOPP_CONSTANT(BT_AllowParallel = BlockTransformation::BT_AllowParallel)
62 CRYPTOPP_CONSTANT(BT_InBlockIsCounter = BlockTransformation::BT_InBlockIsCounter)
63 CRYPTOPP_CONSTANT(BT_ReverseDirection = BlockTransformation::BT_ReverseDirection)
64 CRYPTOPP_CONSTANT(BT_DontIncrementInOutPointers = BlockTransformation::BT_DontIncrementInOutPointers)
65 
66 ANONYMOUS_NAMESPACE_END
67 
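These constants mirror BlockTransformation's flag enumeration in cryptlib.h. The helper below is a hypothetical, caller-side sketch (MakeCtrFlags is not a library function) showing how the flags are typically combined before work is handed to one of the templates.

#include "cryptlib.h"

// Hypothetical helper: build a flag word for a CTR-style pass in which
// inBlocks holds a counter block that the template increments in place.
inline CryptoPP::word32 MakeCtrFlags(bool xorIntoInput)
{
    using CryptoPP::BlockTransformation;
    CryptoPP::word32 flags = BlockTransformation::BT_AllowParallel |   // permit the wide 6x/4x loops
                             BlockTransformation::BT_InBlockIsCounter; // inBlocks is a counter block
    if (xorIntoInput)
        flags |= BlockTransformation::BT_XorInput;  // XOR xorBlocks before, rather than after, the transform
    return flags;
}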
68 // *************************** ARM NEON ************************** //
69 
70 #if defined(CRYPTOPP_ARM_NEON_AVAILABLE)
71 
72 NAMESPACE_BEGIN(CryptoPP)
73 
74 template <typename F2, typename F6>
75 inline size_t AdvancedProcessBlocks64_6x2_NEON(F2 func2, F6 func6,
76  const word32 *subKeys, size_t rounds, const byte *inBlocks,
77  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
78 {
79  CRYPTOPP_ASSERT(subKeys);
80  CRYPTOPP_ASSERT(inBlocks);
81  CRYPTOPP_ASSERT(outBlocks);
82  CRYPTOPP_ASSERT(length >= 8);
83 
84 #if defined(CRYPTOPP_LITTLE_ENDIAN)
85  const word32 s_zero32x4[] = {0, 0, 0, 0};
86  const word32 s_one32x4[] = {0, 0, 0, 1<<24};
87  const word32 s_one32x4_1b[] = {0, 0, 0, 1<<24};
88  const word32 s_one32x4_2b[] = {0, 2<<24, 0, 2<<24};
89 #else
90  const word32 s_zero32x4[] = {0, 0, 0, 0};
91  const word32 s_one32x4[] = {0, 0, 0, 1};
92  const word32 s_one32x4_1b[] = {0, 0, 0, 1};
93  const word32 s_one32x4_2b[] = {0, 2, 0, 2};
94 #endif
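 // Note on the constants above: the counter block is big-endian. On a
 // little-endian target, viewing the loaded bytes as 32-bit lanes places the
 // counter's low-order byte in the most significant byte of its lane, so
 // adding 1<<24 (0x01000000) to that lane increments the counter by one. On
 // big-endian targets the lanes already match the wire format and a plain 1
 // is added instead.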
95 
96  const ptrdiff_t blockSize = 8;
97  const ptrdiff_t neonBlockSize = 16;
98 
99  ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : neonBlockSize;
100  ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? neonBlockSize : 0;
101  ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : neonBlockSize;
102 
103  // Clang and Coverity generate findings when xorBlocks is used directly as a flag.
104  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
105  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
106 
107  if (flags & BT_ReverseDirection)
108  {
109  inBlocks += static_cast<ptrdiff_t>(length) - neonBlockSize;
110  xorBlocks += static_cast<ptrdiff_t>(length) - neonBlockSize;
111  outBlocks += static_cast<ptrdiff_t>(length) - neonBlockSize;
112  inIncrement = 0-inIncrement;
113  xorIncrement = 0-xorIncrement;
114  outIncrement = 0-outIncrement;
115  }
116 
117  if (flags & BT_AllowParallel)
118  {
119  while (length >= 6*neonBlockSize)
120  {
121  uint32x4_t block0, block1, block2, block3, block4, block5;
122  if (flags & BT_InBlockIsCounter)
123  {
124  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
125  // After the dup load we have two counters in the NEON word. Then we need
126  // to increment the low ctr by 0 and the high ctr by 1.
127  const uint8x8_t ctr = vld1_u8(inBlocks);
128  block0 = vaddq_u32(vld1q_u32(s_one32x4_1b),
129  vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));
130 
131  // After initial increment of {0,1} remaining counters increment by {2,2}.
132  const uint32x4_t be2 = vld1q_u32(s_one32x4_2b);
133  block1 = vaddq_u32(be2, block0);
134  block2 = vaddq_u32(be2, block1);
135  block3 = vaddq_u32(be2, block2);
136  block4 = vaddq_u32(be2, block3);
137  block5 = vaddq_u32(be2, block4);
138 
139  vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
140  vreinterpretq_u8_u32(vaddq_u32(be2, block5))));
141  }
142  else
143  {
144  block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
145  inBlocks += inIncrement;
146  block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
147  inBlocks += inIncrement;
148  block2 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
149  inBlocks += inIncrement;
150  block3 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
151  inBlocks += inIncrement;
152  block4 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
153  inBlocks += inIncrement;
154  block5 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
155  inBlocks += inIncrement;
156  }
157 
158  if (xorInput)
159  {
160  block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
161  xorBlocks += xorIncrement;
162  block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
163  xorBlocks += xorIncrement;
164  block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
165  xorBlocks += xorIncrement;
166  block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
167  xorBlocks += xorIncrement;
168  block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
169  xorBlocks += xorIncrement;
170  block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
171  xorBlocks += xorIncrement;
172  }
173 
174  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
175 
176  if (xorOutput)
177  {
178  block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
179  xorBlocks += xorIncrement;
180  block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
181  xorBlocks += xorIncrement;
182  block2 = veorq_u32(block2, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
183  xorBlocks += xorIncrement;
184  block3 = veorq_u32(block3, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
185  xorBlocks += xorIncrement;
186  block4 = veorq_u32(block4, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
187  xorBlocks += xorIncrement;
188  block5 = veorq_u32(block5, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
189  xorBlocks += xorIncrement;
190  }
191 
192  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
193  outBlocks += outIncrement;
194  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
195  outBlocks += outIncrement;
196  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block2));
197  outBlocks += outIncrement;
198  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block3));
199  outBlocks += outIncrement;
200  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block4));
201  outBlocks += outIncrement;
202  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block5));
203  outBlocks += outIncrement;
204 
205  length -= 6*neonBlockSize;
206  }
207 
208  while (length >= 2*neonBlockSize)
209  {
210  uint32x4_t block0, block1;
211  if (flags & BT_InBlockIsCounter)
212  {
213  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
214  // After the dup load we have two counters in the NEON word. Then we need
215  // to increment the low ctr by 0 and the high ctr by 1.
216  const uint8x8_t ctr = vld1_u8(inBlocks);
217  block0 = vaddq_u32(vld1q_u32(s_one32x4_1b),
218  vreinterpretq_u32_u8(vcombine_u8(ctr,ctr)));
219 
220  // After initial increment of {0,1} remaining counters increment by {2,2}.
221  const uint32x4_t be2 = vld1q_u32(s_one32x4_2b);
222  block1 = vaddq_u32(be2, block0);
223 
224  vst1_u8(const_cast<byte*>(inBlocks), vget_low_u8(
225  vreinterpretq_u8_u32(vaddq_u32(be2, block1))));
226  }
227  else
228  {
229  block0 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
230  inBlocks += inIncrement;
231  block1 = vreinterpretq_u32_u8(vld1q_u8(inBlocks));
232  inBlocks += inIncrement;
233  }
234 
235  if (xorInput)
236  {
237  block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
238  xorBlocks += xorIncrement;
239  block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
240  xorBlocks += xorIncrement;
241  }
242 
243  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
244 
245  if (xorOutput)
246  {
247  block0 = veorq_u32(block0, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
248  xorBlocks += xorIncrement;
249  block1 = veorq_u32(block1, vreinterpretq_u32_u8(vld1q_u8(xorBlocks)));
250  xorBlocks += xorIncrement;
251  }
252 
253  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block0));
254  outBlocks += outIncrement;
255  vst1q_u8(outBlocks, vreinterpretq_u8_u32(block1));
256  outBlocks += outIncrement;
257 
258  length -= 2*neonBlockSize;
259  }
260  }
261 
262  if (length)
263  {
264  // Adjust to real block size
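 // The wide loops above advanced the pointers in 16-byte NEON strides. The
 // tail below processes one 8-byte block per iteration, so the strides are
 // reduced to the cipher's real block size, and in the reverse direction the
 // pointers are also repositioned onto the next unprocessed 8-byte block.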
265  if (flags & BT_ReverseDirection)
266  {
267  inIncrement += inIncrement ? blockSize : 0;
268  xorIncrement += xorIncrement ? blockSize : 0;
269  outIncrement += outIncrement ? blockSize : 0;
270  inBlocks -= inIncrement;
271  xorBlocks -= xorIncrement;
272  outBlocks -= outIncrement;
273  }
274  else
275  {
276  inIncrement -= inIncrement ? blockSize : 0;
277  xorIncrement -= xorIncrement ? blockSize : 0;
278  outIncrement -= outIncrement ? blockSize : 0;
279  }
280 
281  while (length >= blockSize)
282  {
283  uint32x4_t block, zero = vld1q_u32(s_zero32x4);
284 
285  const uint8x8_t v = vld1_u8(inBlocks);
286  block = vreinterpretq_u32_u8(vcombine_u8(v,v));
287 
288  if (xorInput)
289  {
290  const uint8x8_t x = vld1_u8(xorBlocks);
291  block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
292  }
293 
294  if (flags & BT_InBlockIsCounter)
295  const_cast<byte *>(inBlocks)[7]++;
296 
297  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
298 
299  if (xorOutput)
300  {
301  const uint8x8_t x = vld1_u8(xorBlocks);
302  block = veorq_u32(block, vreinterpretq_u32_u8(vcombine_u8(x,x)));
303  }
304 
305  vst1_u8(const_cast<byte*>(outBlocks),
306  vget_low_u8(vreinterpretq_u8_u32(block)));
307 
308  inBlocks += inIncrement;
309  outBlocks += outIncrement;
310  xorBlocks += xorIncrement;
311  length -= blockSize;
312  }
313  }
314 
315  return length;
316 }
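For reference, the func2 and func6 arguments above are expected to look roughly like the hypothetical declarations below. Because the cipher's block is only 64 bits, each uint32x4_t vector packs two 8-byte blocks, which the kernels transform in place.

// Hypothetical kernel shapes for AdvancedProcessBlocks64_6x2_NEON
// (names are illustrative, not part of the library).
void MyCipher64_Enc_2_Blocks(uint32x4_t &block0, uint32x4_t &block1,
        const word32 *subkeys, unsigned int rounds);
void MyCipher64_Enc_6_Blocks(uint32x4_t &block0, uint32x4_t &block1,
        uint32x4_t &block2, uint32x4_t &block3, uint32x4_t &block4,
        uint32x4_t &block5, const word32 *subkeys, unsigned int rounds);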
317 
318 template <typename F1, typename F6>
319 inline size_t AdvancedProcessBlocks128_NEON1x6(F1 func1, F6 func6,
320  const word32 *subKeys, size_t rounds, const byte *inBlocks,
321  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
322 {
323  CRYPTOPP_ASSERT(subKeys);
324  CRYPTOPP_ASSERT(inBlocks);
325  CRYPTOPP_ASSERT(outBlocks);
326  CRYPTOPP_ASSERT(length >= 16);
327 
328 #if defined(CRYPTOPP_LITTLE_ENDIAN)
329  const word32 s_zero32x4[] = {0, 0, 0, 0};
330  const word32 s_one32x4[] = {0, 0, 0, 1<<24};
331  const word32 s_one32x4_1b[] = {0, 0, 0, 1<<24};
332  const word32 s_one32x4_2b[] = {0, 2<<24, 0, 2<<24};
333 #else
334  const word32 s_zero32x4[] = {0, 0, 0, 0};
335  const word32 s_one32x4[] = {0, 0, 0, 1};
336  const word32 s_one32x4_1b[] = {0, 0, 0, 1};
337  const word32 s_one32x4_2b[] = {0, 2, 0, 2};
338 #endif
339 
340  const ptrdiff_t blockSize = 16;
341  // const ptrdiff_t neonBlockSize = 16;
342 
343  ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
344  ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
345  ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
346 
347  // Clang and Coverity generate findings when xorBlocks is used directly as a flag.
348  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
349  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
350 
351  if (flags & BT_ReverseDirection)
352  {
353  inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
354  xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
355  outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
356  inIncrement = 0-inIncrement;
357  xorIncrement = 0-xorIncrement;
358  outIncrement = 0-outIncrement;
359  }
360 
361  if (flags & BT_AllowParallel)
362  {
363  while (length >= 6*blockSize)
364  {
365  uint64x2_t block0, block1, block2, block3, block4, block5;
366  if (flags & BT_InBlockIsCounter)
367  {
368  const uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one32x4));
369  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
370 
371  block1 = vaddq_u64(block0, be);
372  block2 = vaddq_u64(block1, be);
373  block3 = vaddq_u64(block2, be);
374  block4 = vaddq_u64(block3, be);
375  block5 = vaddq_u64(block4, be);
376  vst1q_u8(const_cast<byte*>(inBlocks),
377  vreinterpretq_u8_u64(vaddq_u64(block5, be)));
378  }
379  else
380  {
381  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
382  inBlocks += inIncrement;
383  block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
384  inBlocks += inIncrement;
385  block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
386  inBlocks += inIncrement;
387  block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
388  inBlocks += inIncrement;
389  block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
390  inBlocks += inIncrement;
391  block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
392  inBlocks += inIncrement;
393  }
394 
395  if (xorInput)
396  {
397  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
398  xorBlocks += xorIncrement;
399  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
400  xorBlocks += xorIncrement;
401  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
402  xorBlocks += xorIncrement;
403  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
404  xorBlocks += xorIncrement;
405  block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
406  xorBlocks += xorIncrement;
407  block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
408  xorBlocks += xorIncrement;
409  }
410 
411  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
412 
413  if (xorOutput)
414  {
415  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
416  xorBlocks += xorIncrement;
417  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
418  xorBlocks += xorIncrement;
419  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
420  xorBlocks += xorIncrement;
421  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
422  xorBlocks += xorIncrement;
423  block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
424  xorBlocks += xorIncrement;
425  block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
426  xorBlocks += xorIncrement;
427  }
428 
429  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
430  outBlocks += outIncrement;
431  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
432  outBlocks += outIncrement;
433  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
434  outBlocks += outIncrement;
435  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
436  outBlocks += outIncrement;
437  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
438  outBlocks += outIncrement;
439  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
440  outBlocks += outIncrement;
441 
442  length -= 6*blockSize;
443  }
444  }
445 
446  while (length >= blockSize)
447  {
448  uint64x2_t block;
449  block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
450 
451  if (xorInput)
452  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
453 
454  if (flags & BT_InBlockIsCounter)
455  const_cast<byte *>(inBlocks)[15]++;
456 
457  func1(block, subKeys, static_cast<unsigned int>(rounds));
458 
459  if (xorOutput)
460  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
461 
462  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
463 
464  inBlocks += inIncrement;
465  outBlocks += outIncrement;
466  xorBlocks += xorIncrement;
467  length -= blockSize;
468  }
469 
470  return length;
471 }
472 
473 template <typename F2, typename F6>
474 inline size_t AdvancedProcessBlocks128_6x2_NEON(F2 func2, F6 func6,
475  const word64 *subKeys, size_t rounds, const byte *inBlocks,
476  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
477 {
478  CRYPTOPP_ASSERT(subKeys);
479  CRYPTOPP_ASSERT(inBlocks);
480  CRYPTOPP_ASSERT(outBlocks);
481  CRYPTOPP_ASSERT(length >= 16);
482 
483 #if defined(CRYPTOPP_LITTLE_ENDIAN)
484  const word32 s_zero32x4[] = {0, 0, 0, 0};
485  const word32 s_one32x4[] = {0, 0, 0, 1<<24};
486  const word32 s_one32x4_1b[] = {0, 0, 0, 1<<24};
487  const word32 s_one32x4_2b[] = {0, 2<<24, 0, 2<<24};
488 #else
489  const word32 s_zero32x4[] = {0, 0, 0, 0};
490  const word32 s_one32x4[] = {0, 0, 0, 1};
491  const word32 s_one32x4_1b[] = {0, 0, 0, 1};
492  const word32 s_one32x4_2b[] = {0, 2, 0, 2};
493 #endif
494 
495  const ptrdiff_t blockSize = 16;
496  // const ptrdiff_t neonBlockSize = 16;
497 
498  ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
499  ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
500  ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
501 
502  // Clang and Coverity generate findings when xorBlocks is used directly as a flag.
503  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
504  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
505 
506  if (flags & BT_ReverseDirection)
507  {
508  inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
509  xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
510  outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
511  inIncrement = 0-inIncrement;
512  xorIncrement = 0-xorIncrement;
513  outIncrement = 0-outIncrement;
514  }
515 
516  if (flags & BT_AllowParallel)
517  {
518  while (length >= 6*blockSize)
519  {
520  uint64x2_t block0, block1, block2, block3, block4, block5;
521  if (flags & BT_InBlockIsCounter)
522  {
523  const uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one32x4));
524  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
525 
526  block1 = vaddq_u64(block0, be);
527  block2 = vaddq_u64(block1, be);
528  block3 = vaddq_u64(block2, be);
529  block4 = vaddq_u64(block3, be);
530  block5 = vaddq_u64(block4, be);
531  vst1q_u8(const_cast<byte*>(inBlocks),
532  vreinterpretq_u8_u64(vaddq_u64(block5, be)));
533  }
534  else
535  {
536  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
537  inBlocks += inIncrement;
538  block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
539  inBlocks += inIncrement;
540  block2 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
541  inBlocks += inIncrement;
542  block3 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
543  inBlocks += inIncrement;
544  block4 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
545  inBlocks += inIncrement;
546  block5 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
547  inBlocks += inIncrement;
548  }
549 
550  if (xorInput)
551  {
552  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
553  xorBlocks += xorIncrement;
554  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
555  xorBlocks += xorIncrement;
556  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
557  xorBlocks += xorIncrement;
558  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
559  xorBlocks += xorIncrement;
560  block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
561  xorBlocks += xorIncrement;
562  block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
563  xorBlocks += xorIncrement;
564  }
565 
566  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
567 
568  if (xorOutput)
569  {
570  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
571  xorBlocks += xorIncrement;
572  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
573  xorBlocks += xorIncrement;
574  block2 = veorq_u64(block2, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
575  xorBlocks += xorIncrement;
576  block3 = veorq_u64(block3, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
577  xorBlocks += xorIncrement;
578  block4 = veorq_u64(block4, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
579  xorBlocks += xorIncrement;
580  block5 = veorq_u64(block5, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
581  xorBlocks += xorIncrement;
582  }
583 
584  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
585  outBlocks += outIncrement;
586  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
587  outBlocks += outIncrement;
588  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block2));
589  outBlocks += outIncrement;
590  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block3));
591  outBlocks += outIncrement;
592  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block4));
593  outBlocks += outIncrement;
594  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block5));
595  outBlocks += outIncrement;
596 
597  length -= 6*blockSize;
598  }
599 
600  while (length >= 2*blockSize)
601  {
602  uint64x2_t block0, block1;
603  if (flags & BT_InBlockIsCounter)
604  {
605  const uint64x2_t be = vreinterpretq_u64_u32(vld1q_u32(s_one32x4));
606  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
607  block1 = vaddq_u64(block0, be);
608 
609  vst1q_u8(const_cast<byte*>(inBlocks),
610  vreinterpretq_u8_u64(vaddq_u64(block1, be)));
611  }
612  else
613  {
614  block0 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
615  inBlocks += inIncrement;
616  block1 = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
617  inBlocks += inIncrement;
618  }
619 
620  if (xorInput)
621  {
622  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
623  xorBlocks += xorIncrement;
624  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
625  xorBlocks += xorIncrement;
626  }
627 
628  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
629 
630  if (xorOutput)
631  {
632  block0 = veorq_u64(block0, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
633  xorBlocks += xorIncrement;
634  block1 = veorq_u64(block1, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
635  xorBlocks += xorIncrement;
636  }
637 
638  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block0));
639  outBlocks += outIncrement;
640  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block1));
641  outBlocks += outIncrement;
642 
643  length -= 2*blockSize;
644  }
645  }
646 
647  while (length >= blockSize)
648  {
649  uint64x2_t block, zero = {0,0};
650  block = vreinterpretq_u64_u8(vld1q_u8(inBlocks));
651 
652  if (xorInput)
653  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
654 
655  if (flags & BT_InBlockIsCounter)
656  const_cast<byte *>(inBlocks)[15]++;
657 
658  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
659 
660  if (xorOutput)
661  block = veorq_u64(block, vreinterpretq_u64_u8(vld1q_u8(xorBlocks)));
662 
663  vst1q_u8(outBlocks, vreinterpretq_u8_u64(block));
664 
665  inBlocks += inIncrement;
666  outBlocks += outIncrement;
667  xorBlocks += xorIncrement;
668  length -= blockSize;
669  }
670 
671  return length;
672 }
673 
674 NAMESPACE_END // CryptoPP
675 
676 #endif // CRYPTOPP_ARM_NEON_AVAILABLE
677 
678 // *************************** Intel SSE ************************** //
679 
680 #if defined(CRYPTOPP_SSSE3_AVAILABLE)
681 
682 // Hack for SunCC, http://github.com/weidai11/cryptopp/issues/224
683 #if (__SUNPRO_CC >= 0x5130)
684 # define MAYBE_CONST
685 # define MAYBE_UNCONST_CAST(T, x) const_cast<MAYBE_CONST T>(x)
686 #else
687 # define MAYBE_CONST const
688 # define MAYBE_UNCONST_CAST(T, x) (x)
689 #endif
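A minimal sketch of the work-around in use (the function and pointer names are invented for this illustration): under SunCC the macros strip the const qualifier from the key pointer, and on every other compiler they reduce to plain const and a no-op.

inline void UseRoundKeys(const CryptoPP::word32 *userKeys)
{
    // SunCC: MAYBE_CONST is empty and the cast removes const.
    // Everyone else: sk keeps its const qualifier and no cast is performed.
    MAYBE_CONST CryptoPP::word32 *sk = MAYBE_UNCONST_CAST(CryptoPP::word32 *, userKeys);
    (void)sk;
}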
690 
691 // Clang __m128i casts, http://bugs.llvm.org/show_bug.cgi?id=20670
692 #ifndef M128_CAST
693 # define M128_CAST(x) ((__m128i *)(void *)(x))
694 #endif
695 #ifndef CONST_M128_CAST
696 # define CONST_M128_CAST(x) ((const __m128i *)(const void *)(x))
697 #endif
698 
699 // GCC double casts, https://www.spinics.net/lists/gcchelp/msg47735.html
700 #ifndef DOUBLE_CAST
701 # define DOUBLE_CAST(x) ((double *)(void *)(x))
702 #endif
703 #ifndef CONST_DOUBLE_CAST
704 # define CONST_DOUBLE_CAST(x) ((const double *)(const void *)(x))
705 #endif
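A small, illustrative use of the casts (the CopyBlock16 helper is not part of the library): M128_CAST and CONST_M128_CAST keep Clang quiet when byte pointers are handed to the __m128i intrinsics, while the DOUBLE_CAST pair serves the double-based 8-byte loads and stores used further below.

#include <emmintrin.h>

// Copy one 16-byte block through an XMM register using the casting macros.
inline void CopyBlock16(CryptoPP::byte *dest, const CryptoPP::byte *src)
{
    const __m128i t = _mm_loadu_si128(CONST_M128_CAST(src));
    _mm_storeu_si128(M128_CAST(dest), t);
}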
706 
707 NAMESPACE_BEGIN(CryptoPP)
708 
709 template <typename F2, typename F6>
710 inline size_t GCC_NO_UBSAN AdvancedProcessBlocks64_6x2_SSE(F2 func2, F6 func6,
711  const word32 *subKeys, size_t rounds, const byte *inBlocks,
712  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
713 {
714  CRYPTOPP_ASSERT(subKeys);
715  CRYPTOPP_ASSERT(inBlocks);
716  CRYPTOPP_ASSERT(outBlocks);
717  CRYPTOPP_ASSERT(length >= 8);
718 
719  CRYPTOPP_ALIGN_DATA(16)
720  const word32 s_one32x4_1b[] = {0, 0, 0, 1<<24};
721  CRYPTOPP_ALIGN_DATA(16)
722  const word32 s_one32x4_2b[] = {0, 2<<24, 0, 2<<24};
723 
724  const ptrdiff_t blockSize = 8;
725  const ptrdiff_t xmmBlockSize = 16;
726 
727  ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : xmmBlockSize;
728  ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? xmmBlockSize : 0;
729  ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : xmmBlockSize;
730 
731  // Clang and Coverity generate findings when xorBlocks is used directly as a flag.
732  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
733  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
734 
735  if (flags & BT_ReverseDirection)
736  {
737  inBlocks += static_cast<ptrdiff_t>(length) - xmmBlockSize;
738  xorBlocks += static_cast<ptrdiff_t>(length) - xmmBlockSize;
739  outBlocks += static_cast<ptrdiff_t>(length) - xmmBlockSize;
740  inIncrement = 0-inIncrement;
741  xorIncrement = 0-xorIncrement;
742  outIncrement = 0-outIncrement;
743  }
744 
745  if (flags & BT_AllowParallel)
746  {
747  while (length >= 6*xmmBlockSize)
748  {
749  __m128i block0, block1, block2, block3, block4, block5;
750  if (flags & BT_InBlockIsCounter)
751  {
752  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
753  // After the dup load we have two counters in the XMM word. Then we need
754  // to increment the low ctr by 0 and the high ctr by 1.
755  block0 = _mm_add_epi32(*CONST_M128_CAST(s_one32x4_1b), _mm_castpd_si128(
756  _mm_loaddup_pd(CONST_DOUBLE_CAST(inBlocks))));
757 
758  // After initial increment of {0,1} remaining counters increment by {2,2}.
759  const __m128i be2 = *CONST_M128_CAST(s_one32x4_2b);
760  block1 = _mm_add_epi32(be2, block0);
761  block2 = _mm_add_epi32(be2, block1);
762  block3 = _mm_add_epi32(be2, block2);
763  block4 = _mm_add_epi32(be2, block3);
764  block5 = _mm_add_epi32(be2, block4);
765 
766  // Store the next counter. UBsan false positive; mem_addr can be unaligned.
767  _mm_store_sd(DOUBLE_CAST(inBlocks),
768  _mm_castsi128_pd(_mm_add_epi32(be2, block5)));
769  }
770  else
771  {
772  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
773  inBlocks += inIncrement;
774  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
775  inBlocks += inIncrement;
776  block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
777  inBlocks += inIncrement;
778  block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
779  inBlocks += inIncrement;
780  block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
781  inBlocks += inIncrement;
782  block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
783  inBlocks += inIncrement;
784  }
785 
786  if (xorInput)
787  {
788  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
789  xorBlocks += xorIncrement;
790  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
791  xorBlocks += xorIncrement;
792  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
793  xorBlocks += xorIncrement;
794  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
795  xorBlocks += xorIncrement;
796  block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
797  xorBlocks += xorIncrement;
798  block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
799  xorBlocks += xorIncrement;
800  }
801 
802  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
803 
804  if (xorOutput)
805  {
806  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
807  xorBlocks += xorIncrement;
808  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
809  xorBlocks += xorIncrement;
810  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
811  xorBlocks += xorIncrement;
812  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
813  xorBlocks += xorIncrement;
814  block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
815  xorBlocks += xorIncrement;
816  block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
817  xorBlocks += xorIncrement;
818  }
819 
820  _mm_storeu_si128(M128_CAST(outBlocks), block0);
821  outBlocks += outIncrement;
822  _mm_storeu_si128(M128_CAST(outBlocks), block1);
823  outBlocks += outIncrement;
824  _mm_storeu_si128(M128_CAST(outBlocks), block2);
825  outBlocks += outIncrement;
826  _mm_storeu_si128(M128_CAST(outBlocks), block3);
827  outBlocks += outIncrement;
828  _mm_storeu_si128(M128_CAST(outBlocks), block4);
829  outBlocks += outIncrement;
830  _mm_storeu_si128(M128_CAST(outBlocks), block5);
831  outBlocks += outIncrement;
832 
833  length -= 6*xmmBlockSize;
834  }
835 
836  while (length >= 2*xmmBlockSize)
837  {
838  __m128i block0, block1;
839  if (flags & BT_InBlockIsCounter)
840  {
841  // For 64-bit block ciphers we need to load the CTR block, which is 8 bytes.
842  // After the dup load we have two counters in the XMM word. Then we need
843  // to increment the low ctr by 0 and the high ctr by 1.
844  block0 = _mm_add_epi32(*CONST_M128_CAST(s_one32x4_1b), _mm_castpd_si128(
845  _mm_loaddup_pd(CONST_DOUBLE_CAST(inBlocks))));
846 
847  // After initial increment of {0,1} remaining counters increment by {2,2}.
848  const __m128i be2 = *CONST_M128_CAST(s_one32x4_2b);
849  block1 = _mm_add_epi32(be2, block0);
850 
851  // Store the next counter. UBsan false positive; mem_addr can be unaligned.
852  _mm_store_sd(DOUBLE_CAST(inBlocks),
853  _mm_castsi128_pd(_mm_add_epi64(be2, block1)));
854  }
855  else
856  {
857  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
858  inBlocks += inIncrement;
859  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
860  inBlocks += inIncrement;
861  }
862 
863  if (xorInput)
864  {
865  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
866  xorBlocks += xorIncrement;
867  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
868  xorBlocks += xorIncrement;
869  }
870 
871  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
872 
873  if (xorOutput)
874  {
875  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
876  xorBlocks += xorIncrement;
877  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
878  xorBlocks += xorIncrement;
879  }
880 
881  _mm_storeu_si128(M128_CAST(outBlocks), block0);
882  outBlocks += outIncrement;
883  _mm_storeu_si128(M128_CAST(outBlocks), block1);
884  outBlocks += outIncrement;
885 
886  length -= 2*xmmBlockSize;
887  }
888  }
889 
890  if (length)
891  {
892  // Adjust to real block size
893  if (flags & BT_ReverseDirection)
894  {
895  inIncrement += inIncrement ? blockSize : 0;
896  xorIncrement += xorIncrement ? blockSize : 0;
897  outIncrement += outIncrement ? blockSize : 0;
898  inBlocks -= inIncrement;
899  xorBlocks -= xorIncrement;
900  outBlocks -= outIncrement;
901  }
902  else
903  {
904  inIncrement -= inIncrement ? blockSize : 0;
905  xorIncrement -= xorIncrement ? blockSize : 0;
906  outIncrement -= outIncrement ? blockSize : 0;
907  }
908 
909  while (length >= blockSize)
910  {
911  __m128i block, zero = _mm_setzero_si128();
912  block = _mm_castpd_si128(
913  // UBsan false positive; mem_addr can be unaligned.
914  _mm_load_sd(CONST_DOUBLE_CAST(inBlocks)));
915 
916  if (xorInput)
917  {
918  block = _mm_xor_si128(block, _mm_castpd_si128(
919  // UBsan false positive; mem_addr can be unaligned.
920  _mm_load_sd(CONST_DOUBLE_CAST(xorBlocks))));
921  }
922 
923  if (flags & BT_InBlockIsCounter)
924  const_cast<byte *>(inBlocks)[7]++;
925 
926  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
927 
928  if (xorOutput)
929  {
930  block = _mm_xor_si128(block, _mm_castpd_si128(
931  // UBsan false positive; mem_addr can be unaligned.
932  _mm_load_sd(CONST_DOUBLE_CAST(xorBlocks))));
933  }
934 
935  // UBsan false positive; mem_addr can be unaligned.
936  _mm_store_sd(DOUBLE_CAST(outBlocks), _mm_castsi128_pd(block));
937 
938  inBlocks += inIncrement;
939  outBlocks += outIncrement;
940  xorBlocks += xorIncrement;
941  length -= blockSize;
942  }
943  }
944 
945  return length;
946 }
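Mirroring the NEON example earlier, a 64-bit block cipher would forward into this template roughly as sketched below; the MyCipher64_* names are hypothetical, and each __m128i again carries a pair of 8-byte blocks.

// Hypothetical SSE kernels for a 64-bit block cipher (two blocks per __m128i).
void MyCipher64_Enc_2_Blocks(__m128i &block0, __m128i &block1,
        const word32 *subkeys, unsigned int rounds);
void MyCipher64_Enc_6_Blocks(__m128i &block0, __m128i &block1, __m128i &block2,
        __m128i &block3, __m128i &block4, __m128i &block5,
        const word32 *subkeys, unsigned int rounds);

// Forwarder used when SSSE3 is available at run time.
inline size_t MyCipher64_Enc_AdvancedProcessBlocks_SSSE3(const word32 *subKeys,
        size_t rounds, const byte *inBlocks, const byte *xorBlocks,
        byte *outBlocks, size_t length, word32 flags)
{
    return AdvancedProcessBlocks64_6x2_SSE(MyCipher64_Enc_2_Blocks,
        MyCipher64_Enc_6_Blocks, subKeys, rounds, inBlocks, xorBlocks,
        outBlocks, length, flags);
}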
947 
948 template <typename F2, typename F6>
949 inline size_t AdvancedProcessBlocks128_6x2_SSE(F2 func2, F6 func6,
950  const word64 *subKeys, size_t rounds, const byte *inBlocks,
951  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
952 {
953  CRYPTOPP_ASSERT(subKeys);
954  CRYPTOPP_ASSERT(inBlocks);
955  CRYPTOPP_ASSERT(outBlocks);
956  CRYPTOPP_ASSERT(length >= 16);
957 
958  CRYPTOPP_ALIGN_DATA(16)
959  const word32 s_one32x4[] = {0, 0, 0, 1<<24};
960 
961  const ptrdiff_t blockSize = 16;
962  // const ptrdiff_t xmmBlockSize = 16;
963 
964  ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
965  ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
966  ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
967 
968  // Clang and Coverity generate findings when xorBlocks is used directly as a flag.
969  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
970  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
971 
972  if (flags & BT_ReverseDirection)
973  {
974  inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
975  xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
976  outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
977  inIncrement = 0-inIncrement;
978  xorIncrement = 0-xorIncrement;
979  outIncrement = 0-outIncrement;
980  }
981 
982  if (flags & BT_AllowParallel)
983  {
984  while (length >= 6*blockSize)
985  {
986  __m128i block0, block1, block2, block3, block4, block5;
987  if (flags & BT_InBlockIsCounter)
988  {
989  const __m128i be1 = *CONST_M128_CAST(s_one32x4);
990  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
991  block1 = _mm_add_epi32(block0, be1);
992  block2 = _mm_add_epi32(block1, be1);
993  block3 = _mm_add_epi32(block2, be1);
994  block4 = _mm_add_epi32(block3, be1);
995  block5 = _mm_add_epi32(block4, be1);
996  _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block5, be1));
997  }
998  else
999  {
1000  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1001  inBlocks += inIncrement;
1002  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1003  inBlocks += inIncrement;
1004  block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1005  inBlocks += inIncrement;
1006  block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1007  inBlocks += inIncrement;
1008  block4 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1009  inBlocks += inIncrement;
1010  block5 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1011  inBlocks += inIncrement;
1012  }
1013 
1014  if (xorInput)
1015  {
1016  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1017  xorBlocks += xorIncrement;
1018  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1019  xorBlocks += xorIncrement;
1020  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1021  xorBlocks += xorIncrement;
1022  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1023  xorBlocks += xorIncrement;
1024  block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1025  xorBlocks += xorIncrement;
1026  block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1027  xorBlocks += xorIncrement;
1028  }
1029 
1030  func6(block0, block1, block2, block3, block4, block5, subKeys, static_cast<unsigned int>(rounds));
1031 
1032  if (xorOutput)
1033  {
1034  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1035  xorBlocks += xorIncrement;
1036  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1037  xorBlocks += xorIncrement;
1038  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1039  xorBlocks += xorIncrement;
1040  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1041  xorBlocks += xorIncrement;
1042  block4 = _mm_xor_si128(block4, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1043  xorBlocks += xorIncrement;
1044  block5 = _mm_xor_si128(block5, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1045  xorBlocks += xorIncrement;
1046  }
1047 
1048  _mm_storeu_si128(M128_CAST(outBlocks), block0);
1049  outBlocks += outIncrement;
1050  _mm_storeu_si128(M128_CAST(outBlocks), block1);
1051  outBlocks += outIncrement;
1052  _mm_storeu_si128(M128_CAST(outBlocks), block2);
1053  outBlocks += outIncrement;
1054  _mm_storeu_si128(M128_CAST(outBlocks), block3);
1055  outBlocks += outIncrement;
1056  _mm_storeu_si128(M128_CAST(outBlocks), block4);
1057  outBlocks += outIncrement;
1058  _mm_storeu_si128(M128_CAST(outBlocks), block5);
1059  outBlocks += outIncrement;
1060 
1061  length -= 6*blockSize;
1062  }
1063 
1064  while (length >= 2*blockSize)
1065  {
1066  __m128i block0, block1;
1067  if (flags & BT_InBlockIsCounter)
1068  {
1069  const __m128i be1 = *CONST_M128_CAST(s_one32x4);
1070  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1071  block1 = _mm_add_epi32(block0, be1);
1072  _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block1, be1));
1073  }
1074  else
1075  {
1076  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1077  inBlocks += inIncrement;
1078  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1079  inBlocks += inIncrement;
1080  }
1081 
1082  if (xorInput)
1083  {
1084  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1085  xorBlocks += xorIncrement;
1086  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1087  xorBlocks += xorIncrement;
1088  }
1089 
1090  func2(block0, block1, subKeys, static_cast<unsigned int>(rounds));
1091 
1092  if (xorOutput)
1093  {
1094  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1095  xorBlocks += xorIncrement;
1096  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1097  xorBlocks += xorIncrement;
1098  }
1099 
1100  _mm_storeu_si128(M128_CAST(outBlocks), block0);
1101  outBlocks += outIncrement;
1102  _mm_storeu_si128(M128_CAST(outBlocks), block1);
1103  outBlocks += outIncrement;
1104 
1105  length -= 2*blockSize;
1106  }
1107  }
1108 
1109  while (length >= blockSize)
1110  {
1111  __m128i block, zero = _mm_setzero_si128();
1112  block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1113 
1114  if (xorInput)
1115  block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1116 
1117  if (flags & BT_InBlockIsCounter)
1118  const_cast<byte *>(inBlocks)[15]++;
1119 
1120  func2(block, zero, subKeys, static_cast<unsigned int>(rounds));
1121 
1122  if (xorOutput)
1123  block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1124 
1125  _mm_storeu_si128(M128_CAST(outBlocks), block);
1126 
1127  inBlocks += inIncrement;
1128  outBlocks += outIncrement;
1129  xorBlocks += xorIncrement;
1130  length -= blockSize;
1131  }
1132 
1133  return length;
1134 }
1135 
1136 template <typename F1, typename F4>
1137 inline size_t AdvancedProcessBlocks128_4x1_SSE(F1 func1, F4 func4,
1138  MAYBE_CONST word32 *subKeys, size_t rounds, const byte *inBlocks,
1139  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1140 {
1141  CRYPTOPP_ASSERT(subKeys);
1142  CRYPTOPP_ASSERT(inBlocks);
1143  CRYPTOPP_ASSERT(outBlocks);
1144  CRYPTOPP_ASSERT(length >= 16);
1145 
1146  CRYPTOPP_ALIGN_DATA(16)
1147  const word32 s_one32x4[] = {0, 0, 0, 1<<24};
1148 
1149  const ptrdiff_t blockSize = 16;
1150  // const ptrdiff_t xmmBlockSize = 16;
1151 
1152  ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1153  ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
1154  ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
1155 
1156  // Clang and Coverity generate findings when xorBlocks is used directly as a flag.
1157  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1158  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1159 
1160  if (flags & BT_ReverseDirection)
1161  {
1162  inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
1163  xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
1164  outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
1165  inIncrement = 0-inIncrement;
1166  xorIncrement = 0-xorIncrement;
1167  outIncrement = 0-outIncrement;
1168  }
1169 
1170  if (flags & BT_AllowParallel)
1171  {
1172  while (length >= 4*blockSize)
1173  {
1174  __m128i block0, block1, block2, block3;
1175  if (flags & BT_InBlockIsCounter)
1176  {
1177  const __m128i be1 = *CONST_M128_CAST(s_one32x4);
1178  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1179  block1 = _mm_add_epi32(block0, be1);
1180  block2 = _mm_add_epi32(block1, be1);
1181  block3 = _mm_add_epi32(block2, be1);
1182  _mm_storeu_si128(M128_CAST(inBlocks), _mm_add_epi32(block3, be1));
1183  }
1184  else
1185  {
1186  block0 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1187  inBlocks += inIncrement;
1188  block1 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1189  inBlocks += inIncrement;
1190  block2 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1191  inBlocks += inIncrement;
1192  block3 = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1193  inBlocks += inIncrement;
1194  }
1195 
1196  if (xorInput)
1197  {
1198  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1199  xorBlocks += xorIncrement;
1200  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1201  xorBlocks += xorIncrement;
1202  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1203  xorBlocks += xorIncrement;
1204  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1205  xorBlocks += xorIncrement;
1206  }
1207 
1208  func4(block0, block1, block2, block3, subKeys, static_cast<unsigned int>(rounds));
1209 
1210  if (xorOutput)
1211  {
1212  block0 = _mm_xor_si128(block0, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1213  xorBlocks += xorIncrement;
1214  block1 = _mm_xor_si128(block1, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1215  xorBlocks += xorIncrement;
1216  block2 = _mm_xor_si128(block2, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1217  xorBlocks += xorIncrement;
1218  block3 = _mm_xor_si128(block3, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1219  xorBlocks += xorIncrement;
1220  }
1221 
1222  _mm_storeu_si128(M128_CAST(outBlocks), block0);
1223  outBlocks += outIncrement;
1224  _mm_storeu_si128(M128_CAST(outBlocks), block1);
1225  outBlocks += outIncrement;
1226  _mm_storeu_si128(M128_CAST(outBlocks), block2);
1227  outBlocks += outIncrement;
1228  _mm_storeu_si128(M128_CAST(outBlocks), block3);
1229  outBlocks += outIncrement;
1230 
1231  length -= 4*blockSize;
1232  }
1233  }
1234 
1235  while (length >= blockSize)
1236  {
1237  __m128i block = _mm_loadu_si128(CONST_M128_CAST(inBlocks));
1238 
1239  if (xorInput)
1240  block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1241 
1242  if (flags & BT_InBlockIsCounter)
1243  const_cast<byte *>(inBlocks)[15]++;
1244 
1245  func1(block, subKeys, static_cast<unsigned int>(rounds));
1246 
1247  if (xorOutput)
1248  block = _mm_xor_si128(block, _mm_loadu_si128(CONST_M128_CAST(xorBlocks)));
1249 
1250  _mm_storeu_si128(M128_CAST(outBlocks), block);
1251 
1252  inBlocks += inIncrement;
1253  outBlocks += outIncrement;
1254  xorBlocks += xorIncrement;
1255  length -= blockSize;
1256  }
1257 
1258  return length;
1259 }
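Finally, a hedged sketch of the run-time dispatch a cipher typically performs (the MyCipher_* names are invented; HasSSSE3() is the CPU feature probe declared in cpu.h, which this header does not include). The templates return the number of bytes they did not process, so a dispatcher that handles nothing simply returns length and lets the caller fall back to its portable block-at-a-time code.

// Hypothetical single-block and 4-block SSE kernels.
void MyCipher_Enc_Block(__m128i &block, MAYBE_CONST word32 *subkeys, unsigned int rounds);
void MyCipher_Enc_4_Blocks(__m128i &block0, __m128i &block1, __m128i &block2,
        __m128i &block3, MAYBE_CONST word32 *subkeys, unsigned int rounds);

// Hypothetical dispatcher; HasSSSE3() comes from cpu.h.
inline size_t MyCipher_Enc_AdvancedProcessBlocks(MAYBE_CONST word32 *subKeys,
        size_t rounds, const byte *inBlocks, const byte *xorBlocks,
        byte *outBlocks, size_t length, word32 flags)
{
    if (HasSSSE3())
        return AdvancedProcessBlocks128_4x1_SSE(MyCipher_Enc_Block,
            MyCipher_Enc_4_Blocks, subKeys, rounds, inBlocks, xorBlocks,
            outBlocks, length, flags);
    return length;  // nothing handled here; the caller uses its generic path
}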
1260 
1261 NAMESPACE_END // CryptoPP
1262 
1263 #endif // CRYPTOPP_SSSE3_AVAILABLE
1264 
1265 // *********************** Altivec/Power 4 ********************** //
1266 
1267 #if defined(CRYPTOPP_ALTIVEC_AVAILABLE)
1268 
1269 NAMESPACE_BEGIN(CryptoPP)
1270 
1271 template <typename F1, typename F6>
1272 inline size_t AdvancedProcessBlocks128_6x1_ALTIVEC(F1 func1, F6 func6,
1273  const word32 *subKeys, size_t rounds, const byte *inBlocks,
1274  const byte *xorBlocks, byte *outBlocks, size_t length, word32 flags)
1275 {
1276  CRYPTOPP_ASSERT(subKeys);
1277  CRYPTOPP_ASSERT(inBlocks);
1278  CRYPTOPP_ASSERT(outBlocks);
1279  CRYPTOPP_ASSERT(length >= 16);
1280 
1281 #if defined(CRYPTOPP_LITTLE_ENDIAN)
1282  const uint32x4_p s_one = {1,0,0,0};
1283 #else
1284  const uint32x4_p s_one = {0,0,0,1};
1285 #endif
1286 
1287  const ptrdiff_t blockSize = 16;
1288  // const ptrdiff_t vexBlockSize = 16;
1289 
1290  ptrdiff_t inIncrement = (flags & (BT_InBlockIsCounter|BT_DontIncrementInOutPointers)) ? 0 : blockSize;
1291  ptrdiff_t xorIncrement = (xorBlocks != NULLPTR) ? blockSize : 0;
1292  ptrdiff_t outIncrement = (flags & BT_DontIncrementInOutPointers) ? 0 : blockSize;
1293 
1294  // Clang and Coverity generate findings when xorBlocks is used directly as a flag.
1295  const bool xorInput = (xorBlocks != NULLPTR) && (flags & BT_XorInput);
1296  const bool xorOutput = (xorBlocks != NULLPTR) && !(flags & BT_XorInput);
1297 
1298  if (flags & BT_ReverseDirection)
1299  {
1300  inBlocks += static_cast<ptrdiff_t>(length) - blockSize;
1301  xorBlocks += static_cast<ptrdiff_t>(length) - blockSize;
1302  outBlocks += static_cast<ptrdiff_t>(length) - blockSize;
1303  inIncrement = 0-inIncrement;
1304  xorIncrement = 0-xorIncrement;
1305  outIncrement = 0-outIncrement;
1306  }
1307 
1308  if (flags & BT_AllowParallel)
1309  {
1310  while (length >= 6*blockSize)
1311  {
1312  uint32x4_p block0, block1, block2, block3, block4, block5, temp;
1313 
1314  if (flags & BT_InBlockIsCounter)
1315  {
1316  block0 = VectorLoad(inBlocks);
1317  block1 = VectorAdd(block0, s_one);
1318  block2 = VectorAdd(block1, s_one);
1319  block3 = VectorAdd(block2, s_one);
1320  block4 = VectorAdd(block3, s_one);
1321  block5 = VectorAdd(block4, s_one);
1322  temp = VectorAdd(block5, s_one);
1323  VectorStore(temp, const_cast<byte*>(inBlocks));
1324  }
1325  else
1326  {
1327  block0 = VectorLoad(inBlocks);
1328  inBlocks += inIncrement;
1329  block1 = VectorLoad(inBlocks);
1330  inBlocks += inIncrement;
1331  block2 = VectorLoad(inBlocks);
1332  inBlocks += inIncrement;
1333  block3 = VectorLoad(inBlocks);
1334  inBlocks += inIncrement;
1335  block4 = VectorLoad(inBlocks);
1336  inBlocks += inIncrement;
1337  block5 = VectorLoad(inBlocks);
1338  inBlocks += inIncrement;
1339  }
1340 
1341  if (xorInput)
1342  {
1343  block0 = VectorXor(block0, VectorLoad(xorBlocks));
1344  xorBlocks += xorIncrement;
1345  block1 = VectorXor(block1, VectorLoad(xorBlocks));
1346  xorBlocks += xorIncrement;
1347  block2 = VectorXor(block2, VectorLoad(xorBlocks));
1348  xorBlocks += xorIncrement;
1349  block3 = VectorXor(block3, VectorLoad(xorBlocks));
1350  xorBlocks += xorIncrement;
1351  block4 = VectorXor(block4, VectorLoad(xorBlocks));
1352  xorBlocks += xorIncrement;
1353  block5 = VectorXor(block5, VectorLoad(xorBlocks));
1354  xorBlocks += xorIncrement;
1355  }
1356 
1357  func6(block0, block1, block2, block3, block4, block5, subKeys, rounds);
1358 
1359  if (xorOutput)
1360  {
1361  block0 = VectorXor(block0, VectorLoad(xorBlocks));
1362  xorBlocks += xorIncrement;
1363  block1 = VectorXor(block1, VectorLoad(xorBlocks));
1364  xorBlocks += xorIncrement;
1365  block2 = VectorXor(block2, VectorLoad(xorBlocks));
1366  xorBlocks += xorIncrement;
1367  block3 = VectorXor(block3, VectorLoad(xorBlocks));
1368  xorBlocks += xorIncrement;
1369  block4 = VectorXor(block4, VectorLoad(xorBlocks));
1370  xorBlocks += xorIncrement;
1371  block5 = VectorXor(block5, VectorLoad(xorBlocks));
1372  xorBlocks += xorIncrement;
1373  }
1374 
1375  VectorStore(block0, outBlocks);
1376  outBlocks += outIncrement;
1377  VectorStore(block1, outBlocks);
1378  outBlocks += outIncrement;
1379  VectorStore(block2, outBlocks);
1380  outBlocks += outIncrement;
1381  VectorStore(block3, outBlocks);
1382  outBlocks += outIncrement;
1383  VectorStore(block4, outBlocks);
1384  outBlocks += outIncrement;
1385  VectorStore(block5, outBlocks);
1386  outBlocks += outIncrement;
1387 
1388  length -= 6*blockSize;
1389  }
1390  }
1391 
1392  while (length >= blockSize)
1393  {
1394  uint32x4_p block = VectorLoad(inBlocks);
1395 
1396  if (xorInput)
1397  block = VectorXor(block, VectorLoad(xorBlocks));
1398 
1399  if (flags & BT_InBlockIsCounter)
1400  const_cast<byte *>(inBlocks)[15]++;
1401 
1402  func1(block, subKeys, rounds);
1403 
1404  if (xorOutput)
1405  block = VectorXor(block, VectorLoad(xorBlocks));
1406 
1407  VectorStore(block, outBlocks);
1408 
1409  inBlocks += inIncrement;
1410  outBlocks += outIncrement;
1411  xorBlocks += xorIncrement;
1412  length -= blockSize;
1413  }
1414 
1415  return length;
1416 }
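The kernels for the Altivec template would be shaped roughly as below (hypothetical names; uint32x4_p is the vector typedef from ppc-simd.h). Note this template hands rounds through as a size_t instead of casting it to unsigned int as the SSE and NEON templates do.

// Hypothetical Power/Altivec kernels; each uint32x4_p holds one 16-byte block.
void MyCipher128_Enc_Block(uint32x4_p &block, const word32 *subkeys, size_t rounds);
void MyCipher128_Enc_6_Blocks(uint32x4_p &block0, uint32x4_p &block1,
        uint32x4_p &block2, uint32x4_p &block3, uint32x4_p &block4,
        uint32x4_p &block5, const word32 *subkeys, size_t rounds);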
1417 
1418 NAMESPACE_END // CryptoPP
1419 
1420 #endif // CRYPTOPP_ALTIVEC_AVAILABLE
1421 
1422 #endif // CRYPTOPP_ADVANCED_SIMD_TEMPLATES