Crypto++  8.4
Free C++ class library of cryptographic schemes
arm_simd.h
Go to the documentation of this file.
1 // arm_simd.h - written and placed in public domain by Jeffrey Walton
2 
3 /// \file arm_simd.h
4 /// \brief Support functions for ARM and vector operations
5 
6 #ifndef CRYPTOPP_ARM_SIMD_H
7 #define CRYPTOPP_ARM_SIMD_H
8 
9 #include "config.h"
10 
11 #if (CRYPTOPP_ARM_NEON_HEADER)
12 # include <arm_neon.h>
13 #endif
14 
15 #if (CRYPTOPP_ARM_ACLE_HEADER)
16 # include <stdint.h>
17 # include <arm_acle.h>
18 #endif
19 
20 #if (CRYPTOPP_ARM_PMULL_AVAILABLE) || defined(CRYPTOPP_DOXYGEN_PROCESSING)
21 
22 /// \brief Polynomial multiplication
23 /// \param a the first term
24 /// \param b the second term
25 /// \return vector product
26 /// \details PMULL_00() performs polynomial multiplication and presents
27 /// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x00)</tt>.
28 /// The <tt>0x00</tt> indicates the low 64-bits of <tt>a</tt> and <tt>b</tt>
29 /// are multiplied.
30 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
31 /// is MSB and numbered 127, while the the rightmost bit is LSB and
32 /// numbered 0.
33 /// \since Crypto++ 8.0
34 inline uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
35 {
36 #if defined(_MSC_VER)
37  const __n64 x = { vgetq_lane_u64(a, 0) };
38  const __n64 y = { vgetq_lane_u64(b, 0) };
39  return vmull_p64(x, y);
40 #elif defined(__GNUC__)
41  uint64x2_t r;
42  __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
43  :"=w" (r) : "w" (a), "w" (b) );
44  return r;
45 #else
46  return (uint64x2_t)(vmull_p64(
47  vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
48  vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
49 #endif
50 }
51 
52 /// \brief Polynomial multiplication
53 /// \param a the first term
54 /// \param b the second term
55 /// \return vector product
56 /// \details PMULL_01 performs() polynomial multiplication and presents
57 /// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x01)</tt>.
58 /// The <tt>0x01</tt> indicates the low 64-bits of <tt>a</tt> and high
59 /// 64-bits of <tt>b</tt> are multiplied.
60 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
61 /// is MSB and numbered 127, while the the rightmost bit is LSB and
62 /// numbered 0.
63 /// \since Crypto++ 8.0
64 inline uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
65 {
66 #if defined(_MSC_VER)
67  const __n64 x = { vgetq_lane_u64(a, 0) };
68  const __n64 y = { vgetq_lane_u64(b, 1) };
69  return vmull_p64(x, y);
70 #elif defined(__GNUC__)
71  uint64x2_t r;
72  __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
73  :"=w" (r) : "w" (a), "w" (vget_high_u64(b)) );
74  return r;
75 #else
76  return (uint64x2_t)(vmull_p64(
77  vgetq_lane_u64(vreinterpretq_u64_u8(a),0),
78  vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
79 #endif
80 }
81 
82 /// \brief Polynomial multiplication
83 /// \param a the first term
84 /// \param b the second term
85 /// \return vector product
86 /// \details PMULL_10() performs polynomial multiplication and presents
87 /// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x10)</tt>.
88 /// The <tt>0x10</tt> indicates the high 64-bits of <tt>a</tt> and low
89 /// 64-bits of <tt>b</tt> are multiplied.
90 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
91 /// is MSB and numbered 127, while the the rightmost bit is LSB and
92 /// numbered 0.
93 /// \since Crypto++ 8.0
94 inline uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
95 {
96 #if defined(_MSC_VER)
97  const __n64 x = { vgetq_lane_u64(a, 1) };
98  const __n64 y = { vgetq_lane_u64(b, 0) };
99  return vmull_p64(x, y);
100 #elif defined(__GNUC__)
101  uint64x2_t r;
102  __asm __volatile("pmull %0.1q, %1.1d, %2.1d \n\t"
103  :"=w" (r) : "w" (vget_high_u64(a)), "w" (b) );
104  return r;
105 #else
106  return (uint64x2_t)(vmull_p64(
107  vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
108  vgetq_lane_u64(vreinterpretq_u64_u8(b),0)));
109 #endif
110 }
111 
112 /// \brief Polynomial multiplication
113 /// \param a the first term
114 /// \param b the second term
115 /// \return vector product
116 /// \details PMULL_11() performs polynomial multiplication and presents
117 /// the result like Intel's <tt>c = _mm_clmulepi64_si128(a, b, 0x11)</tt>.
118 /// The <tt>0x11</tt> indicates the high 64-bits of <tt>a</tt> and <tt>b</tt>
119 /// are multiplied.
120 /// \note An Intel XMM register is composed of 128-bits. The leftmost bit
121 /// is MSB and numbered 127, while the the rightmost bit is LSB and
122 /// numbered 0.
123 /// \since Crypto++ 8.0
124 inline uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
125 {
126 #if defined(_MSC_VER)
127  const __n64 x = { vgetq_lane_u64(a, 1) };
128  const __n64 y = { vgetq_lane_u64(b, 1) };
129  return vmull_p64(x, y);
130 #elif defined(__GNUC__)
131  uint64x2_t r;
132  __asm __volatile("pmull2 %0.1q, %1.2d, %2.2d \n\t"
133  :"=w" (r) : "w" (a), "w" (b) );
134  return r;
135 #else
136  return (uint64x2_t)(vmull_p64(
137  vgetq_lane_u64(vreinterpretq_u64_u8(a),1),
138  vgetq_lane_u64(vreinterpretq_u64_u8(b),1)));
139 #endif
140 }
141 
142 /// \brief Vector extraction
143 /// \param a the first term
144 /// \param b the second term
145 /// \param c the byte count
146 /// \return vector
147 /// \details VEXT_U8() extracts the first <tt>c</tt> bytes of vector
148 /// <tt>a</tt> and the remaining bytes in <tt>b</tt>.
149 /// \since Crypto++ 8.0
150 inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
151 {
152 #if defined(_MSC_VER)
153  return (uint64x2_t)vextq_u8(
154  vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), c);
155 #else
156  uint64x2_t r;
157  __asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t"
158  :"=w" (r) : "w" (a), "w" (b), "I" (c) );
159  return r;
160 #endif
161 }
162 
163 /// \brief Vector extraction
164 /// \tparam C the byte count
165 /// \param a the first term
166 /// \param b the second term
167 /// \return vector
168 /// \details VEXT_U8() extracts the first <tt>C</tt> bytes of vector
169 /// <tt>a</tt> and the remaining bytes in <tt>b</tt>.
170 /// \since Crypto++ 8.0
171 template <unsigned int C>
172 inline uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b)
173 {
174  // https://github.com/weidai11/cryptopp/issues/366
175 #if defined(_MSC_VER)
176  return (uint64x2_t)vextq_u8(
177  vreinterpretq_u8_u64(a), vreinterpretq_u8_u64(b), C);
178 #else
179  uint64x2_t r;
180  __asm __volatile("ext %0.16b, %1.16b, %2.16b, %3 \n\t"
181  :"=w" (r) : "w" (a), "w" (b), "I" (C) );
182  return r;
183 #endif
184 }
185 
186 #endif // CRYPTOPP_ARM_PMULL_AVAILABLE
187 
188 #endif // CRYPTOPP_ARM_SIMD_H
uint64x2_t PMULL_00(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:34
uint64x2_t PMULL_11(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:124
uint64x2_t PMULL_01(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:64
uint64x2_t PMULL_10(const uint64x2_t a, const uint64x2_t b)
Polynomial multiplication.
Definition: arm_simd.h:94
uint64x2_t VEXT_U8(uint64x2_t a, uint64x2_t b, unsigned int c)
Vector extraction.
Definition: arm_simd.h:150
Library configuration file.