#include <m4ri/m4ri_config.h>
#include <m4ri/misc.h> /* word (uint64_t packed bits), wi_t (word index), __M4RI_ALIGNMENT, __M4RI_UNLIKELY */
#include <assert.h>

#if __M4RI_HAVE_SSE2
#include <emmintrin.h> /* SSE2: __m128i, _mm_xor_si128 */
#endif

/* Compute m[i] ^= t[0][i] ^ ... ^ t[N-1][i] for 0 <= i < wide. N is a
 * compile-time constant in 1..8 supplied by the file that includes this
 * template; __M4RI_TEMPLATE_NAME derives the instantiated function name
 * from it. */
static inline void __M4RI_TEMPLATE_NAME(_mzd_combine)(word *m, word const *t[N], wi_t wide) {
  assert(1 <= N && N <= 8);

#if __M4RI_HAVE_SSE2
  assert((__M4RI_ALIGNMENT(m, 16) == 8) | (__M4RI_ALIGNMENT(m, 16) == 0));

  /* If m is only 8-byte aligned, fold in one scalar word first so the
   * vector loop below runs on 16-byte aligned chunks. */
  if (__M4RI_UNLIKELY(__M4RI_ALIGNMENT(m, 16) == 8)) {
    switch (N) {
    case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; break;
    case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
    case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
    case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
    case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
    case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
    case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
    case 1: *m++ ^= *t[0]++; break;
    };
    wide--;
  }
  __m128i *m__ = (__m128i *)m;
  __m128i *t__[N];

  switch (N) { /* no breaks: cases fall through to set t__[0] .. t__[N-1] */
  case 8: t__[N - 8] = (__m128i *)t[N - 8];
  case 7: t__[N - 7] = (__m128i *)t[N - 7];
  case 6: t__[N - 6] = (__m128i *)t[N - 6];
  case 5: t__[N - 5] = (__m128i *)t[N - 5];
  case 4: t__[N - 4] = (__m128i *)t[N - 4];
  case 3: t__[N - 3] = (__m128i *)t[N - 3];
  case 2: t__[N - 2] = (__m128i *)t[N - 2];
  case 1: t__[N - 1] = (__m128i *)t[N - 1];
  };

  __m128i xmm0, xmm1, xmm2, xmm3;
  for (wi_t i = 0; i < (wide >> 1); i++) {
    switch (N) { /* XOR the N streams as a balanced tree, then fold in *m__ */
    case 8:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm2 = _mm_xor_si128(*t__[4]++, *t__[5]++); xmm3 = _mm_xor_si128(*t__[6]++, *t__[7]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1); xmm2 = _mm_xor_si128(xmm2, xmm3);
      xmm0 = _mm_xor_si128(xmm0, xmm2); xmm0 = _mm_xor_si128(*m__, xmm0);
      break;
    case 7:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm0 = _mm_xor_si128(xmm0, *t__[4]++); xmm1 = _mm_xor_si128(xmm1, *t__[5]++);
      xmm0 = _mm_xor_si128(xmm0, *t__[6]++); xmm0 = _mm_xor_si128(xmm0, xmm1);
      xmm0 = _mm_xor_si128(*m__, xmm0);
      break;
    case 6:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm0 = _mm_xor_si128(xmm0, *t__[4]++); xmm1 = _mm_xor_si128(xmm1, *t__[5]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1); xmm0 = _mm_xor_si128(*m__, xmm0);
      break;
    case 5:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm0 = _mm_xor_si128(xmm0, *t__[4]++); xmm0 = _mm_xor_si128(xmm0, xmm1);
      xmm0 = _mm_xor_si128(*m__, xmm0);
      break;
    case 4:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*t__[2]++, *t__[3]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1); xmm0 = _mm_xor_si128(*m__, xmm0);
      break;
    case 3:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm1 = _mm_xor_si128(*m__, *t__[2]++);
      xmm0 = _mm_xor_si128(xmm0, xmm1);
      break;
    case 2:
      xmm0 = _mm_xor_si128(*t__[0]++, *t__[1]++); xmm0 = _mm_xor_si128(*m__, xmm0);
      break;
    case 1:
      xmm0 = _mm_xor_si128(*m__, *t__[0]++);
      break;
    };
    *m__++ = xmm0;
  }
  m = (word *)m__;
  switch (N) { /* no breaks: convert the vector pointers back to word pointers */
  case 8: t[N - 8] = (word *)t__[N - 8];
  case 7: t[N - 7] = (word *)t__[N - 7];
  case 6: t[N - 6] = (word *)t__[N - 6];
  case 5: t[N - 5] = (word *)t__[N - 5];
  case 4: t[N - 4] = (word *)t__[N - 4];
  case 3: t[N - 3] = (word *)t__[N - 3];
  case 2: t[N - 2] = (word *)t__[N - 2];
  case 1: t[N - 1] = (word *)t__[N - 1];
  };
  if (wide & 0x1) { /* odd word count: one scalar word remains */
    switch (N) {
    case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; break;
    case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
    case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
    case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
    case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
    case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
    case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
    case 1: *m++ ^= *t[0]++; break;
    };
  }
#else
  for (wi_t i = 0; i < wide; i++) {
    switch (N) {
    case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; break;
    case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
    case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
    case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
    case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
    case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
    case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
    case 1: *m++ ^= *t[0]++; break;
    };
  }
#endif // __M4RI_HAVE_SSE2
}
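
/*
 * Note on the loop shape above: for N = 8 the eight source streams are
 * combined pairwise into xmm0..xmm3 and then pairwise again, i.e. as a
 * balanced XOR tree of depth 3 rather than a serial chain of 7 dependent
 * XORs, which gives an out-of-order core independent instructions to
 * overlap. A scalar sketch of the same shape (illustrative only; the
 * helper name is not part of M4RI):
 *
 *   static inline word xor8_tree(word const s[8]) {
 *     word a = s[0] ^ s[1], b = s[2] ^ s[3];
 *     word c = s[4] ^ s[5], d = s[6] ^ s[7];
 *     return (a ^ b) ^ (c ^ d);  // depth 3 instead of 7 serial XORs
 *   }
 */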
/* Variant computing each 128-bit chunk with a single running accumulator
 * instead of the balanced tree used in _mzd_combine above (the _u suffix
 * presumably marks the variant for destinations not known to be 16-byte
 * aligned on entry). */
static inline void __M4RI_TEMPLATE_NAME(_mzd_combine_u)(word *m, word const *t[N], wi_t wide) {
  assert(1 <= N && N <= 8);

#if __M4RI_HAVE_SSE2
  /* If m is only 8-byte aligned, fold in one scalar word first so the
   * vector loop below runs on 16-byte aligned chunks. */
  if (__M4RI_UNLIKELY(__M4RI_ALIGNMENT(m, 16) == 8)) {
    switch (N) {
    case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; break;
    case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
    case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
    case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
    case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
    case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
    case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
    case 1: *m++ ^= *t[0]++; break;
    };
    wide--;
  }
  __m128i *m__ = (__m128i *)m;
  __m128i *t__[N];

  switch (N) { /* no breaks: cases fall through to set t__[0] .. t__[N-1] */
  case 8: t__[N - 8] = (__m128i *)t[N - 8];
  case 7: t__[N - 7] = (__m128i *)t[N - 7];
  case 6: t__[N - 6] = (__m128i *)t[N - 6];
  case 5: t__[N - 5] = (__m128i *)t[N - 5];
  case 4: t__[N - 4] = (__m128i *)t[N - 4];
  case 3: t__[N - 3] = (__m128i *)t[N - 3];
  case 2: t__[N - 2] = (__m128i *)t[N - 2];
  case 1: t__[N - 1] = (__m128i *)t[N - 1];
  };

  __m128i xmm1;
  for (wi_t i = 0; i < (wide >> 1); i++) {
    xmm1 = _mm_xor_si128(*m__, *t__[0]++);
    switch (N) { /* no breaks: entering at case N XORs t__[1] .. t__[N-1] */
    case 8: xmm1 = _mm_xor_si128(xmm1, *t__[N - 7]++);
    case 7: xmm1 = _mm_xor_si128(xmm1, *t__[N - 6]++);
    case 6: xmm1 = _mm_xor_si128(xmm1, *t__[N - 5]++);
    case 5: xmm1 = _mm_xor_si128(xmm1, *t__[N - 4]++);
    case 4: xmm1 = _mm_xor_si128(xmm1, *t__[N - 3]++);
    case 3: xmm1 = _mm_xor_si128(xmm1, *t__[N - 2]++);
    case 2: xmm1 = _mm_xor_si128(xmm1, *t__[N - 1]++);
    };
    *m__++ = xmm1;
  }
  m = (word *)m__;
  switch (N) { /* no breaks: convert the vector pointers back to word pointers */
  case 8: t[N - 8] = (word *)t__[N - 8];
  case 7: t[N - 7] = (word *)t__[N - 7];
  case 6: t[N - 6] = (word *)t__[N - 6];
  case 5: t[N - 5] = (word *)t__[N - 5];
  case 4: t[N - 4] = (word *)t__[N - 4];
  case 3: t[N - 3] = (word *)t__[N - 3];
  case 2: t[N - 2] = (word *)t__[N - 2];
  case 1: t[N - 1] = (word *)t__[N - 1];
  };
  if (wide & 0x1) { /* odd word count: one scalar word remains */
    switch (N) {
    case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; break;
    case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
    case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
    case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
    case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
    case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
    case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
    case 1: *m++ ^= *t[0]++; break;
    };
  }
#else
  for (wi_t i = 0; i < wide; i++) {
    switch (N) {
    case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; break;
    case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
    case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
    case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
    case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
    case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
    case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
    case 1: *m++ ^= *t[0]++; break;
    };
  }
#endif // __M4RI_HAVE_SSE2
}
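
/*
 * Note on the accumulator loop above: the switch relies on deliberate
 * fall-through. Entering at `case N:` executes exactly the N-1 statements
 * that XOR t__[1] .. t__[N-1] into xmm1 (e.g. for N = 8, case 8 starts at
 * t__[N - 7] = t__[1] and falls through to t__[N - 1] = t__[7]). Since N
 * is a compile-time constant in every instantiation of this template, the
 * compiler resolves the switch statically. A minimal scalar sketch of the
 * same idiom with a runtime n (illustrative only, not M4RI code):
 *
 *   word acc = m0 ^ s[0];
 *   switch (n) {          // n in 1..4; no breaks on purpose
 *   case 4: acc ^= s[n - 3];
 *   case 3: acc ^= s[n - 2];
 *   case 2: acc ^= s[n - 1];
 *   }
 */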
/* Aligned variant: same accumulator loop as _mzd_combine_u above, but no
 * leading scalar word is peeled off, i.e. m is expected to be 16-byte
 * aligned on entry. */
static inline void __M4RI_TEMPLATE_NAME(_mzd_combine_a)(word *m, word const *t[N], wi_t wide) {
  assert(1 <= N && N <= 8);

#if __M4RI_HAVE_SSE2
  __m128i *m__ = (__m128i *)m;
  __m128i *t__[N];

  switch (N) { /* no breaks: cases fall through to set t__[0] .. t__[N-1] */
  case 8: t__[N - 8] = (__m128i *)t[N - 8];
  case 7: t__[N - 7] = (__m128i *)t[N - 7];
  case 6: t__[N - 6] = (__m128i *)t[N - 6];
  case 5: t__[N - 5] = (__m128i *)t[N - 5];
  case 4: t__[N - 4] = (__m128i *)t[N - 4];
  case 3: t__[N - 3] = (__m128i *)t[N - 3];
  case 2: t__[N - 2] = (__m128i *)t[N - 2];
  case 1: t__[N - 1] = (__m128i *)t[N - 1];
  };

  __m128i xmm1;
  for (wi_t i = 0; i < (wide >> 1); i++) {
    xmm1 = _mm_xor_si128(*m__, *t__[0]++);
    switch (N) { /* no breaks: entering at case N XORs t__[1] .. t__[N-1] */
    case 8: xmm1 = _mm_xor_si128(xmm1, *t__[N - 7]++);
    case 7: xmm1 = _mm_xor_si128(xmm1, *t__[N - 6]++);
    case 6: xmm1 = _mm_xor_si128(xmm1, *t__[N - 5]++);
    case 5: xmm1 = _mm_xor_si128(xmm1, *t__[N - 4]++);
    case 4: xmm1 = _mm_xor_si128(xmm1, *t__[N - 3]++);
    case 3: xmm1 = _mm_xor_si128(xmm1, *t__[N - 2]++);
    case 2: xmm1 = _mm_xor_si128(xmm1, *t__[N - 1]++);
    };
    *m__++ = xmm1;
  }
  m = (word *)m__;
  switch (N) { /* no breaks: convert the vector pointers back to word pointers */
  case 8: t[N - 8] = (word *)t__[N - 8];
  case 7: t[N - 7] = (word *)t__[N - 7];
  case 6: t[N - 6] = (word *)t__[N - 6];
  case 5: t[N - 5] = (word *)t__[N - 5];
  case 4: t[N - 4] = (word *)t__[N - 4];
  case 3: t[N - 3] = (word *)t__[N - 3];
  case 2: t[N - 2] = (word *)t__[N - 2];
  case 1: t[N - 1] = (word *)t__[N - 1];
  };
  if (wide & 0x1) { /* odd word count: one scalar word remains */
    switch (N) {
    case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; break;
    case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
    case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
    case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
    case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
    case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
    case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
    case 1: *m++ ^= *t[0]++; break;
    };
  }
#else
  for (wi_t i = 0; i < wide; i++) {
    switch (N) {
    case 8: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++ ^ *t[7]++; break;
    case 7: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++ ^ *t[6]++; break;
    case 6: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++ ^ *t[5]++; break;
    case 5: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++ ^ *t[4]++; break;
    case 4: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++ ^ *t[3]++; break;
    case 3: *m++ ^= *t[0]++ ^ *t[1]++ ^ *t[2]++; break;
    case 2: *m++ ^= *t[0]++ ^ *t[1]++; break;
    case 1: *m++ ^= *t[0]++; break;
    };
  }
#endif // __M4RI_HAVE_SSE2
}
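
/*
 * Instantiation sketch (assumptions flagged): this file is an
 * include-template. The includer defines N (1..8) before including it, and
 * __M4RI_TEMPLATE_NAME(fn), defined in misc.h, derives the instantiated
 * names from N, presumably yielding e.g. _mzd_combine2, _mzd_combine_u2
 * and _mzd_combine_a2 for N == 2. The header name below and the exact
 * expansion are assumptions for illustration, not copied from the M4RI
 * sources:
 *
 *   #define N 2
 *   #include <m4ri/xor_template.h>  // assumed file name of this template
 *   #undef N
 *
 *   // later, with dst and two source rows of `wide` words each:
 *   //   word const *t[2] = {row0, row1};
 *   //   _mzd_combine2(dst, t, wide);  // dst ^= row0 ^ row1, word-wise
 */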