Vector Optimized Library of Kernels 3.0.0
Architecture-tuned implementations of math kernels
volk_8u_x2_encodeframepolar_8u.h
/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 *
 * This file is part of VOLK
 *
 * SPDX-License-Identifier: LGPL-3.0-or-later
 */

/*
 * for documentation see 'volk_8u_x3_encodepolar_8u_x2.h'
 */

#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#include <string.h>
static inline unsigned int log2_of_power_of_2(unsigned int val)
{
    // algorithm from: http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog
    static const unsigned int b[] = {
        0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
    };

    unsigned int res = (val & b[0]) != 0;
    res |= ((val & b[4]) != 0) << 4;
    res |= ((val & b[3]) != 0) << 3;
    res |= ((val & b[2]) != 0) << 2;
    res |= ((val & b[1]) != 0) << 1;
    return res;
}
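
/*
 * Illustrative note: this mask variant of the bithack is only valid when 'val'
 * is a power of two. Worked example for val = 16 (binary 10000): bit 4 is not
 * covered by b[0] (odd bits), b[4] (bits 16-31), b[3] (bits 8-15, 24-31) or
 * b[1] (bit pairs 2-3, 6-7, ...), but it is covered by b[2] (nibbles 4-7,
 * 12-15, ...), so only the '<< 2' term contributes and the result is 4,
 * i.e. log2(16).
 */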

static inline void encodepolar_single_stage(unsigned char* frame_ptr,
                                            const unsigned char* temp_ptr,
                                            const unsigned int num_branches,
                                            const unsigned int frame_half)
{
    unsigned int branch, bit;
    for (branch = 0; branch < num_branches; ++branch) {
        for (bit = 0; bit < frame_half; ++bit) {
            *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
            *(frame_ptr + frame_half) = *(temp_ptr + 1);
            ++frame_ptr;
            temp_ptr += 2;
        }
        frame_ptr += frame_half;
    }
}

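/*
 * In other words, every branch applies the polar butterfly
 *     frame[i]              = temp[2 * i] ^ temp[2 * i + 1];
 *     frame[i + frame_half] = temp[2 * i + 1];
 * for i = 0 ... frame_half - 1. The protokernels below repeat this stage
 * log2(frame_size) times, halving frame_half and doubling num_branches after
 * each pass; the SIMD versions vectorize exactly the same butterfly.
 */
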
#ifdef LV_HAVE_GENERIC

static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    unsigned int stage = log2_of_power_of_2(frame_size);
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;

    while (stage) {
        // encode stage
        encodepolar_single_stage(frame, temp, num_branches, frame_half);
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        // update all the parameters.
        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
        --stage;
    }
}
#endif /* LV_HAVE_GENERIC */
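
/*
 * Minimal usage sketch (illustrative only, not part of the original header).
 * Both buffers hold one bit per byte, frame_size must be a power of two, and
 * 'temp' is clobbered because it is reused as scratch space:
 *
 *     unsigned char temp[16];   // the 16 bits to encode, one per byte
 *     unsigned char frame[16];  // receives the encoded frame
 *     // ... fill temp ...
 *     volk_8u_x2_encodeframepolar_8u_generic(frame, temp, 16);
 *
 * In a VOLK-enabled application the same call is normally made through the
 * runtime dispatcher volk_8u_x2_encodeframepolar_8u(), which selects the
 * fastest protokernel available on the host CPU.
 */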

#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);

    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch;
    unsigned int bit;

    // prepare constants
    const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                                             0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

    // get some SIMD registers to play with.
    __m128i r_frame0, r_temp0, shifted;

    {
        __m128i r_frame1, r_temp1;
        const __m128i shuffle_separate =
            _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

        while (stage > 4) {
            frame_ptr = frame;
            temp_ptr = temp;

            // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
            for (branch = 0; branch < num_branches; ++branch) {
                for (bit = 0; bit < frame_half; bit += 16) {
                    r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
                    temp_ptr += 16;
                    r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr);
                    temp_ptr += 16;

                    shifted = _mm_srli_si128(r_temp0, 1);
                    shifted = _mm_and_si128(shifted, mask_stage1);
                    r_temp0 = _mm_xor_si128(shifted, r_temp0);
                    r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);

                    shifted = _mm_srli_si128(r_temp1, 1);
                    shifted = _mm_and_si128(shifted, mask_stage1);
                    r_temp1 = _mm_xor_si128(shifted, r_temp1);
                    r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);

                    r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
                    _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);

                    r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
                    _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
                    frame_ptr += 16;
                }

                frame_ptr += frame_half;
            }
            memcpy(temp, frame, sizeof(unsigned char) * frame_size);

            num_branches = num_branches << 1;
            frame_half = frame_half >> 1;
            stage--;
        }
    }

    // This last part requires frames of at least 16 bits (frame_size >= 16).
    // Smaller frames gain nothing from SIMD anyway - just use the GENERIC kernel.
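
    // What follows fuses the remaining four stages (stage 4 down to stage 1) for
    // one 16-byte branch per iteration: shuffle_stage4 performs the bit-reversal
    // permutation once, then each stage is a byte shift, a mask that keeps only
    // the partner bits, and an XOR - the same butterfly as above, kept entirely
    // in one SSE register.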

    // reset pointers to correct positions.
    frame_ptr = frame;
    temp_ptr = temp;

    // prefetch first chunk
    __VOLK_PREFETCH(temp_ptr);

    const __m128i shuffle_stage4 =
        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
                                             0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                                             0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                                             0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);

    for (branch = 0; branch < num_branches; ++branch) {
        r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);

        // prefetch next chunk
        temp_ptr += 16;
        __VOLK_PREFETCH(temp_ptr);

        // shuffle once for bit-reversal.
        r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm_srli_si128(r_temp0, 8);
        shifted = _mm_and_si128(shifted, mask_stage4);
        r_frame0 = _mm_xor_si128(shifted, r_temp0);

        shifted = _mm_srli_si128(r_frame0, 4);
        shifted = _mm_and_si128(shifted, mask_stage3);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 2);
        shifted = _mm_and_si128(shifted, mask_stage2);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 1);
        shifted = _mm_and_si128(shifted, mask_stage1);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        // store result of chunk.
        _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
        frame_ptr += 16;
    }
}

#endif /* LV_HAVE_SSSE3 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
                                                         unsigned char* temp,
                                                         unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);

    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch;
    unsigned int bit;

    // prepare constants
    const __m256i mask_stage1 =
        _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

    const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                                             0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
    // get some SIMD registers to play with.
    __m256i r_frame0, r_temp0, shifted;
    __m128i r_temp2, r_frame2, shifted2;
    {
        __m256i r_frame1, r_temp1;
        __m128i r_frame3, r_temp3;
        const __m256i shuffle_separate =
            _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                             0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
        const __m128i shuffle_separate128 =
            _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

        while (stage > 4) {
            frame_ptr = frame;
            temp_ptr = temp;

            // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
            for (branch = 0; branch < num_branches; ++branch) {
                for (bit = 0; bit < frame_half; bit += 32) {
                    if ((frame_half - bit) < 32) {
                        // only 16 elements remain in this branch half, not 32
                        r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr);
                        temp_ptr += 16;
                        r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr);
                        temp_ptr += 16;

                        shifted2 = _mm_srli_si128(r_temp2, 1);
                        shifted2 = _mm_and_si128(shifted2, mask_stage0);
                        r_temp2 = _mm_xor_si128(shifted2, r_temp2);
                        r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);

                        shifted2 = _mm_srli_si128(r_temp3, 1);
                        shifted2 = _mm_and_si128(shifted2, mask_stage0);
                        r_temp3 = _mm_xor_si128(shifted2, r_temp3);
                        r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);

                        r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
                        _mm_storeu_si128((__m128i*)frame_ptr, r_frame2);

                        r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
                        _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
                        frame_ptr += 16;
                        break;
                    }
                    r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
                    temp_ptr += 32;
                    r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
                    temp_ptr += 32;

                    shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
                    shifted = _mm256_and_si256(shifted, mask_stage1);
                    r_temp0 = _mm256_xor_si256(shifted, r_temp0);
                    r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

                    shifted = _mm256_srli_si256(r_temp1, 1);
                    shifted = _mm256_and_si256(shifted, mask_stage1);
                    r_temp1 = _mm256_xor_si256(shifted, r_temp1);
                    r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

                    r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
                    r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
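                    // The 256-bit unpack instructions interleave within 128-bit
                    // lanes, leaving the four 8-byte groups in the order 0,2,1,3;
                    // _mm256_permute4x64_epi64 with control 0xd8 swaps the middle
                    // two groups so each result is stored in sequential order.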
                    r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
                    r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

                    _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);

                    _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
                    frame_ptr += 32;
                }

                frame_ptr += frame_half;
            }
            memcpy(temp, frame, sizeof(unsigned char) * frame_size);

            num_branches = num_branches << 1;
            frame_half = frame_half >> 1;
            stage--;
        }
    }

    // This last part requires frames of at least 32 bits (frame_size >= 32).
    // Smaller frames gain nothing from SIMD anyway - just use the GENERIC kernel.
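
    // The remaining stages are fused as in the SSSE3 kernel, but each iteration
    // now carries two 16-byte branches in one 256-bit register (hence
    // num_branches / 2 iterations) and the byte shifts act per 128-bit lane, so
    // every mask pattern is simply duplicated in both lanes.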

    // reset pointers to correct positions.
    frame_ptr = frame;
    temp_ptr = temp;

    // prefetch first chunk
    __VOLK_PREFETCH(temp_ptr);

    const __m256i shuffle_stage4 =
        _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
                         0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m256i mask_stage4 =
        _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
                        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
                        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m256i mask_stage3 =
        _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m256i mask_stage2 =
        _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);

    for (branch = 0; branch < num_branches / 2; ++branch) {
        r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);

        // prefetch next chunk
        temp_ptr += 32;
        __VOLK_PREFETCH(temp_ptr);

        // shuffle once for bit-reversal.
        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
        shifted = _mm256_and_si256(shifted, mask_stage4);
        r_frame0 = _mm256_xor_si256(shifted, r_temp0);

        shifted = _mm256_srli_si256(r_frame0, 4);
        shifted = _mm256_and_si256(shifted, mask_stage3);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 2);
        shifted = _mm256_and_si256(shifted, mask_stage2);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        // store result of chunk.
        _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
        frame_ptr += 32;
    }
}
#endif /* LV_HAVE_AVX2 */

#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */

#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_

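/*
 * The _a_ protokernels below are line-for-line copies of the _u_ versions
 * above; the only difference is that they use the aligned load/store
 * intrinsics (_mm_load_si128 / _mm_store_si128 and _mm256_load_si256 /
 * _mm256_store_si256), so 'frame' and 'temp' must be 16-byte (SSSE3) or
 * 32-byte (AVX2) aligned.
 */
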
#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);

    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch;
    unsigned int bit;

    // prepare constants
    const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                                             0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

    // get some SIMD registers to play with.
    __m128i r_frame0, r_temp0, shifted;

    {
        __m128i r_frame1, r_temp1;
        const __m128i shuffle_separate =
            _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

        while (stage > 4) {
            frame_ptr = frame;
            temp_ptr = temp;

            // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
            for (branch = 0; branch < num_branches; ++branch) {
                for (bit = 0; bit < frame_half; bit += 16) {
                    r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
                    temp_ptr += 16;
                    r_temp1 = _mm_load_si128((__m128i*)temp_ptr);
                    temp_ptr += 16;

                    shifted = _mm_srli_si128(r_temp0, 1);
                    shifted = _mm_and_si128(shifted, mask_stage1);
                    r_temp0 = _mm_xor_si128(shifted, r_temp0);
                    r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);

                    shifted = _mm_srli_si128(r_temp1, 1);
                    shifted = _mm_and_si128(shifted, mask_stage1);
                    r_temp1 = _mm_xor_si128(shifted, r_temp1);
                    r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);

                    r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
                    _mm_store_si128((__m128i*)frame_ptr, r_frame0);

                    r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
                    _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
                    frame_ptr += 16;
                }

                frame_ptr += frame_half;
            }
            memcpy(temp, frame, sizeof(unsigned char) * frame_size);

            num_branches = num_branches << 1;
            frame_half = frame_half >> 1;
            stage--;
        }
    }

    // This last part requires frames of at least 16 bits (frame_size >= 16).
    // Smaller frames gain nothing from SIMD anyway - just use the GENERIC kernel.

    // reset pointers to correct positions.
    frame_ptr = frame;
    temp_ptr = temp;

    // prefetch first chunk
    __VOLK_PREFETCH(temp_ptr);

    const __m128i shuffle_stage4 =
        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
                                             0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                                             0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                                             0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);

    for (branch = 0; branch < num_branches; ++branch) {
        r_temp0 = _mm_load_si128((__m128i*)temp_ptr);

        // prefetch next chunk
        temp_ptr += 16;
        __VOLK_PREFETCH(temp_ptr);

        // shuffle once for bit-reversal.
        r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm_srli_si128(r_temp0, 8);
        shifted = _mm_and_si128(shifted, mask_stage4);
        r_frame0 = _mm_xor_si128(shifted, r_temp0);

        shifted = _mm_srli_si128(r_frame0, 4);
        shifted = _mm_and_si128(shifted, mask_stage3);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 2);
        shifted = _mm_and_si128(shifted, mask_stage2);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 1);
        shifted = _mm_and_si128(shifted, mask_stage1);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        // store result of chunk.
        _mm_store_si128((__m128i*)frame_ptr, r_frame0);
        frame_ptr += 16;
    }
}
#endif /* LV_HAVE_SSSE3 */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
                                                         unsigned char* temp,
                                                         unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);

    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch;
    unsigned int bit;

    // prepare constants
    const __m256i mask_stage1 =
        _mm256_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

    const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                                             0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
    // get some SIMD registers to play with.
    __m256i r_frame0, r_temp0, shifted;
    __m128i r_temp2, r_frame2, shifted2;
    {
        __m256i r_frame1, r_temp1;
        __m128i r_frame3, r_temp3;
        const __m256i shuffle_separate =
            _mm256_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
                             0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
        const __m128i shuffle_separate128 =
            _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

        while (stage > 4) {
            frame_ptr = frame;
            temp_ptr = temp;

            // for stage = 5 a branch has 32 elements. So upper stages are even bigger.
            for (branch = 0; branch < num_branches; ++branch) {
                for (bit = 0; bit < frame_half; bit += 32) {
                    if ((frame_half - bit) < 32) {
                        // only 16 elements remain in this branch half, not 32
                        r_temp2 = _mm_load_si128((__m128i*)temp_ptr);
                        temp_ptr += 16;
                        r_temp3 = _mm_load_si128((__m128i*)temp_ptr);
                        temp_ptr += 16;

                        shifted2 = _mm_srli_si128(r_temp2, 1);
                        shifted2 = _mm_and_si128(shifted2, mask_stage0);
                        r_temp2 = _mm_xor_si128(shifted2, r_temp2);
                        r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);

                        shifted2 = _mm_srli_si128(r_temp3, 1);
                        shifted2 = _mm_and_si128(shifted2, mask_stage0);
                        r_temp3 = _mm_xor_si128(shifted2, r_temp3);
                        r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);

                        r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
                        _mm_store_si128((__m128i*)frame_ptr, r_frame2);

                        r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
                        _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
                        frame_ptr += 16;
                        break;
                    }
                    r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
                    temp_ptr += 32;
                    r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
                    temp_ptr += 32;

                    shifted = _mm256_srli_si256(r_temp0, 1); // operate on 128 bit lanes
                    shifted = _mm256_and_si256(shifted, mask_stage1);
                    r_temp0 = _mm256_xor_si256(shifted, r_temp0);
                    r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

                    shifted = _mm256_srli_si256(r_temp1, 1);
                    shifted = _mm256_and_si256(shifted, mask_stage1);
                    r_temp1 = _mm256_xor_si256(shifted, r_temp1);
                    r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

                    r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
                    r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
                    r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
                    r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

                    _mm256_store_si256((__m256i*)frame_ptr, r_frame0);

                    _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
                    frame_ptr += 32;
                }

                frame_ptr += frame_half;
            }
            memcpy(temp, frame, sizeof(unsigned char) * frame_size);

            num_branches = num_branches << 1;
            frame_half = frame_half >> 1;
            stage--;
        }
    }

    // This last part requires frames of at least 32 bits (frame_size >= 32).
    // Smaller frames gain nothing from SIMD anyway - just use the GENERIC kernel.

    // reset pointers to correct positions.
    frame_ptr = frame;
    temp_ptr = temp;

    // prefetch first chunk.
    __VOLK_PREFETCH(temp_ptr);

    const __m256i shuffle_stage4 =
        _mm256_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
                         0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m256i mask_stage4 =
        _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
                        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
                        0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m256i mask_stage3 =
        _mm256_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m256i mask_stage2 =
        _mm256_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);

    for (branch = 0; branch < num_branches / 2; ++branch) {
        r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);

        // prefetch next chunk
        temp_ptr += 32;
        __VOLK_PREFETCH(temp_ptr);

        // shuffle once for bit-reversal.
        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm256_srli_si256(r_temp0, 8); // 128 bit lanes
        shifted = _mm256_and_si256(shifted, mask_stage4);
        r_frame0 = _mm256_xor_si256(shifted, r_temp0);

        shifted = _mm256_srli_si256(r_frame0, 4);
        shifted = _mm256_and_si256(shifted, mask_stage3);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 2);
        shifted = _mm256_and_si256(shifted, mask_stage2);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        // store result of chunk.
        _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
        frame_ptr += 32;
    }
}
#endif /* LV_HAVE_AVX2 */


#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */