79#ifndef INCLUDED_volk_32f_log2_32f_a_H
80#define INCLUDED_volk_32f_log2_32f_a_H
87#define LOG_POLY_DEGREE 6
94 float* bPtr = bVector;
95 const float* aPtr = aVector;
96 unsigned int number = 0;
98 for (number = 0; number < num_points; number++)
103#if LV_HAVE_AVX2 && LV_HAVE_FMA
104#include <immintrin.h>
/*
 * Horner-scheme polynomial evaluation on 8-lane __m256 vectors using fused
 * multiply-add.
 *
 * POLYn_FMAAVX2(x, c0, ..., cn) expands to
 *     c0 + x * (c1 + x * (c2 + ... + x * cn))
 * Each level is one _mm256_fmadd_ps whose multiplicand is the next-lower
 * macro applied to the remaining coefficients. Every coefficient is broadcast
 * with _mm256_set1_ps. POLY0_FMAAVX2 only broadcasts c0 and ignores x.
 *
 * x is pasted unparenthesized as an intrinsic argument and appears once per
 * level, so pass a plain variable rather than an expression with side effects.
 *
 * NOTE: this macro family is defined a second time further down in this file
 * with the same replacement text. Keep both copies token-identical, otherwise
 * the second definition becomes an incompatible macro redefinition.
 */
106#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
107#define POLY1_FMAAVX2(x, c0, c1) \
108 _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
109#define POLY2_FMAAVX2(x, c0, c1, c2) \
110 _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
111#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
112 _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
113#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
114 _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
115#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
116 _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
118static inline void volk_32f_log2_32f_a_avx2_fma(
float* bVector,
119 const float* aVector,
120 unsigned int num_points)
122 float* bPtr = bVector;
123 const float* aPtr = aVector;
125 unsigned int number = 0;
126 const unsigned int eighthPoints = num_points / 8;
128 __m256 aVal, bVal, mantissa, frac, leadingOne;
131 for (; number < eighthPoints; number++) {
133 aVal = _mm256_load_ps(aPtr);
134 bias = _mm256_set1_epi32(127);
135 leadingOne = _mm256_set1_ps(1.0f);
136 exp = _mm256_sub_epi32(
137 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
138 _mm256_set1_epi32(0x7f800000)),
141 bVal = _mm256_cvtepi32_ps(exp);
146 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
148#if LOG_POLY_DEGREE == 6
149 mantissa = POLY5_FMAAVX2(frac,
156#elif LOG_POLY_DEGREE == 5
157 mantissa = POLY4_FMAAVX2(frac,
158 2.8882704548164776201f,
159 -2.52074962577807006663f,
160 1.48116647521213171641f,
161 -0.465725644288844778798f,
162 0.0596515482674574969533f);
163#elif LOG_POLY_DEGREE == 4
164 mantissa = POLY3_FMAAVX2(frac,
165 2.61761038894603480148f,
166 -1.75647175389045657003f,
167 0.688243882994381274313f,
168 -0.107254423828329604454f);
169#elif LOG_POLY_DEGREE == 3
170 mantissa = POLY2_FMAAVX2(frac,
171 2.28330284476918490682f,
172 -1.04913055217340124191f,
173 0.204446009836232697516f);
178 bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
179 _mm256_store_ps(bPtr, bVal);
185 number = eighthPoints * 8;
192#include <immintrin.h>
/*
 * Horner-scheme polynomial evaluation on 8-lane __m256 vectors without FMA:
 * each level is a separate _mm256_mul_ps followed by _mm256_add_ps.
 *
 * POLYn_AVX2(x, c0, ..., cn) expands to
 *     c0 + x * (c1 + x * (c2 + ... + x * cn))
 * with every coefficient broadcast through _mm256_set1_ps. POLY0_AVX2 only
 * broadcasts c0 and ignores x.
 *
 * x is pasted unparenthesized as an intrinsic argument and appears once per
 * level, so pass a plain variable rather than an expression with side effects.
 *
 * NOTE: this macro family is defined a second time further down in this file
 * with the same replacement text. Keep both copies token-identical, otherwise
 * the second definition becomes an incompatible macro redefinition.
 */
194#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
195#define POLY1_AVX2(x, c0, c1) \
196 _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
197#define POLY2_AVX2(x, c0, c1, c2) \
198 _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
199#define POLY3_AVX2(x, c0, c1, c2, c3) \
200 _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
201#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
202 _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
203#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
204 _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
207volk_32f_log2_32f_a_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
209 float* bPtr = bVector;
210 const float* aPtr = aVector;
212 unsigned int number = 0;
213 const unsigned int eighthPoints = num_points / 8;
215 __m256 aVal, bVal, mantissa, frac, leadingOne;
218 for (; number < eighthPoints; number++) {
220 aVal = _mm256_load_ps(aPtr);
221 bias = _mm256_set1_epi32(127);
222 leadingOne = _mm256_set1_ps(1.0f);
223 exp = _mm256_sub_epi32(
224 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
225 _mm256_set1_epi32(0x7f800000)),
228 bVal = _mm256_cvtepi32_ps(exp);
233 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
235#if LOG_POLY_DEGREE == 6
236 mantissa = POLY5_AVX2(frac,
243#elif LOG_POLY_DEGREE == 5
244 mantissa = POLY4_AVX2(frac,
245 2.8882704548164776201f,
246 -2.52074962577807006663f,
247 1.48116647521213171641f,
248 -0.465725644288844778798f,
249 0.0596515482674574969533f);
250#elif LOG_POLY_DEGREE == 4
251 mantissa = POLY3_AVX2(frac,
252 2.61761038894603480148f,
253 -1.75647175389045657003f,
254 0.688243882994381274313f,
255 -0.107254423828329604454f);
256#elif LOG_POLY_DEGREE == 3
257 mantissa = POLY2_AVX2(frac,
258 2.28330284476918490682f,
259 -1.04913055217340124191f,
260 0.204446009836232697516f);
266 _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
267 _mm256_store_ps(bPtr, bVal);
273 number = eighthPoints * 8;
280#include <smmintrin.h>
/*
 * Horner-scheme polynomial evaluation on 4-lane __m128 vectors: each level is
 * an _mm_mul_ps followed by an _mm_add_ps.
 *
 * POLYn(x, c0, ..., cn) expands to
 *     c0 + x * (c1 + x * (c2 + ... + x * cn))
 * with every coefficient broadcast through _mm_set1_ps. POLY0 only broadcasts
 * c0 and ignores x.
 *
 * x is pasted unparenthesized as an intrinsic argument and appears once per
 * level, so pass a plain variable rather than an expression with side effects.
 *
 * NOTE: POLY0..POLY5 are defined a second time further down in this file with
 * the same replacement text. Keep both copies token-identical, otherwise the
 * second definition becomes an incompatible macro redefinition.
 */
282#define POLY0(x, c0) _mm_set1_ps(c0)
283#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
284#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
285#define POLY3(x, c0, c1, c2, c3) \
286 _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
287#define POLY4(x, c0, c1, c2, c3, c4) \
288 _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
289#define POLY5(x, c0, c1, c2, c3, c4, c5) \
290 _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
293volk_32f_log2_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
295 float* bPtr = bVector;
296 const float* aPtr = aVector;
298 unsigned int number = 0;
299 const unsigned int quarterPoints = num_points / 4;
301 __m128 aVal, bVal, mantissa, frac, leadingOne;
304 for (; number < quarterPoints; number++) {
319#if LOG_POLY_DEGREE == 6
320 mantissa = POLY5(frac,
327#elif LOG_POLY_DEGREE == 5
328 mantissa = POLY4(frac,
329 2.8882704548164776201f,
330 -2.52074962577807006663f,
331 1.48116647521213171641f,
332 -0.465725644288844778798f,
333 0.0596515482674574969533f);
334#elif LOG_POLY_DEGREE == 4
335 mantissa = POLY3(frac,
336 2.61761038894603480148f,
337 -1.75647175389045657003f,
338 0.688243882994381274313f,
339 -0.107254423828329604454f);
340#elif LOG_POLY_DEGREE == 3
341 mantissa = POLY2(frac,
342 2.28330284476918490682f,
343 -1.04913055217340124191f,
344 0.204446009836232697516f);
356 number = quarterPoints * 4;
/*
 * Declares, in the enclosing scope, the vector constants that
 * VLOG2Q_NEON_F32 reads by name:
 *   one       - 0x00800000 (bit 23) in every lane; OR-ed into the masked
 *               significand bits by VLOG2Q_NEON_F32
 *   p0..p6    - coefficients of the degree-6 polynomial in the significand
 *               (p0 is the constant term, p6 multiplies significand^6)
 *   exp_mask  - 0x7f800000, selects the float32 exponent field
 *   sig_mask  - 0x007fffff, selects the float32 fraction field
 *   exp_bias  - 127, subtracted from the raw exponent field
 *
 * Because it expands to plain declarations, expand it at most once per scope
 * and before any use of VLOG2Q_NEON_F32.
 */
366#define VLOG2Q_NEON_PREAMBLE() \
367 int32x4_t one = vdupq_n_s32(0x000800000); \
369 float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
370 float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
371 float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
372 float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
373 float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
374 float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
375 float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
376 int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
377 int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
378 int32x4_t exp_bias = vdupq_n_s32(127);
/*
 * Computes a 4-lane log2 approximation from the raw bits of four floats.
 *
 * log2_approx - float32x4_t lvalue that receives the result
 * aval        - int32x4_t holding the float32 bit patterns (it is used with
 *               vandq_s32, so it must already be an integer vector)
 *
 * Steps, in order:
 *   1. Mask out the exponent field and the fraction field of aval. The sign
 *      bit is covered by neither mask, so it does not influence the result.
 *   2. Shift the exponent field down by 23 and subtract exp_bias, then convert
 *      it to float (exponent_f).
 *   3. OR bit 23 (`one`) into the fraction and convert with
 *      vcvtq_n_f32_s32(..., 23), i.e. as a fixed-point number with 23
 *      fractional bits, giving significand_f in [1, 2).
 *   4. Accumulate exponent_f + p0 + p1*s + p2*s^2 + ... + p6*s^6, where s is
 *      significand_f; powers are built from earlier products
 *      (sig_4 = sig_2*sig_2, sig_5 = sig_3*sig_2, sig_6 = sig_3*sig_3).
 *
 * Requires the names declared by VLOG2Q_NEON_PREAMBLE() to be in scope. The
 * expansion is a bare statement sequence that declares its own temporaries
 * (exponent_i, significand_i, significand_f, exponent_f, tmp1, sig_2..sig_6),
 * so it can be expanded only once per block and must not be used as the
 * unbraced body of an if/for.
 */
381#define VLOG2Q_NEON_F32(log2_approx, aval) \
382 int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
383 int32x4_t significand_i = vandq_s32(aval, sig_mask); \
384 exponent_i = vshrq_n_s32(exponent_i, 23); \
389 significand_i = vorrq_s32(one, significand_i); \
390 float32x4_t significand_f = vcvtq_n_f32_s32(significand_i, 23); \
392 exponent_i = vsubq_s32(exponent_i, exp_bias); \
393 float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
397 log2_approx = vaddq_f32(exponent_f, p0); \
398 float32x4_t tmp1 = vmulq_f32(significand_f, p1); \
399 log2_approx = vaddq_f32(log2_approx, tmp1); \
400 float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); \
401 tmp1 = vmulq_f32(sig_2, p2); \
402 log2_approx = vaddq_f32(log2_approx, tmp1); \
404 float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); \
405 tmp1 = vmulq_f32(sig_3, p3); \
406 log2_approx = vaddq_f32(log2_approx, tmp1); \
407 float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); \
408 tmp1 = vmulq_f32(sig_4, p4); \
409 log2_approx = vaddq_f32(log2_approx, tmp1); \
410 float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); \
411 tmp1 = vmulq_f32(sig_5, p5); \
412 log2_approx = vaddq_f32(log2_approx, tmp1); \
413 float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); \
414 tmp1 = vmulq_f32(sig_6, p6); \
415 log2_approx = vaddq_f32(log2_approx, tmp1);
420 float* bPtr = bVector;
421 const float* aPtr = aVector;
423 const unsigned int quarterPoints = num_points / 4;
426 float32x4_t log2_approx;
437 for (number = 0; number < quarterPoints; ++number) {
439 aval = vld1q_s32((
int*)aPtr);
443 vst1q_f32(bPtr, log2_approx);
449 number = quarterPoints * 4;
458#ifndef INCLUDED_volk_32f_log2_32f_u_H
459#define INCLUDED_volk_32f_log2_32f_u_H
462#ifdef LV_HAVE_GENERIC
467 float* bPtr = bVector;
468 const float* aPtr = aVector;
469 unsigned int number = 0;
471 for (number = 0; number < num_points; number++) {
472 float const result = log2f(*aPtr++);
473 *bPtr++ = isinf(result) ? -127.0f : result;
481#include <smmintrin.h>
/*
 * Horner-scheme polynomial evaluation on 4-lane __m128 vectors: each level is
 * an _mm_mul_ps followed by an _mm_add_ps.
 *
 * POLYn(x, c0, ..., cn) expands to
 *     c0 + x * (c1 + x * (c2 + ... + x * cn))
 * with every coefficient broadcast through _mm_set1_ps. POLY0 only broadcasts
 * c0 and ignores x.
 *
 * NOTE: POLY0..POLY5 are also defined earlier in this file with the same
 * replacement text. This copy must stay token-identical to that one, otherwise
 * it becomes an incompatible macro redefinition.
 */
483#define POLY0(x, c0) _mm_set1_ps(c0)
484#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
485#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
486#define POLY3(x, c0, c1, c2, c3) \
487 _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
488#define POLY4(x, c0, c1, c2, c3, c4) \
489 _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
490#define POLY5(x, c0, c1, c2, c3, c4, c5) \
491 _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
494volk_32f_log2_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
496 float* bPtr = bVector;
497 const float* aPtr = aVector;
499 unsigned int number = 0;
500 const unsigned int quarterPoints = num_points / 4;
502 __m128 aVal, bVal, mantissa, frac, leadingOne;
505 for (; number < quarterPoints; number++) {
520#if LOG_POLY_DEGREE == 6
521 mantissa = POLY5(frac,
528#elif LOG_POLY_DEGREE == 5
529 mantissa = POLY4(frac,
530 2.8882704548164776201f,
531 -2.52074962577807006663f,
532 1.48116647521213171641f,
533 -0.465725644288844778798f,
534 0.0596515482674574969533f);
535#elif LOG_POLY_DEGREE == 4
536 mantissa = POLY3(frac,
537 2.61761038894603480148f,
538 -1.75647175389045657003f,
539 0.688243882994381274313f,
540 -0.107254423828329604454f);
541#elif LOG_POLY_DEGREE == 3
542 mantissa = POLY2(frac,
543 2.28330284476918490682f,
544 -1.04913055217340124191f,
545 0.204446009836232697516f);
557 number = quarterPoints * 4;
563#if LV_HAVE_AVX2 && LV_HAVE_FMA
564#include <immintrin.h>
/*
 * Horner-scheme polynomial evaluation on 8-lane __m256 vectors using fused
 * multiply-add.
 *
 * POLYn_FMAAVX2(x, c0, ..., cn) expands to
 *     c0 + x * (c1 + x * (c2 + ... + x * cn))
 * Each level is one _mm256_fmadd_ps; every coefficient is broadcast with
 * _mm256_set1_ps. POLY0_FMAAVX2 only broadcasts c0 and ignores x.
 *
 * NOTE: this macro family is also defined earlier in this file with the same
 * replacement text. This copy must stay token-identical to that one, otherwise
 * it becomes an incompatible macro redefinition.
 */
566#define POLY0_FMAAVX2(x, c0) _mm256_set1_ps(c0)
567#define POLY1_FMAAVX2(x, c0, c1) \
568 _mm256_fmadd_ps(POLY0_FMAAVX2(x, c1), x, _mm256_set1_ps(c0))
569#define POLY2_FMAAVX2(x, c0, c1, c2) \
570 _mm256_fmadd_ps(POLY1_FMAAVX2(x, c1, c2), x, _mm256_set1_ps(c0))
571#define POLY3_FMAAVX2(x, c0, c1, c2, c3) \
572 _mm256_fmadd_ps(POLY2_FMAAVX2(x, c1, c2, c3), x, _mm256_set1_ps(c0))
573#define POLY4_FMAAVX2(x, c0, c1, c2, c3, c4) \
574 _mm256_fmadd_ps(POLY3_FMAAVX2(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
575#define POLY5_FMAAVX2(x, c0, c1, c2, c3, c4, c5) \
576 _mm256_fmadd_ps(POLY4_FMAAVX2(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
578static inline void volk_32f_log2_32f_u_avx2_fma(
float* bVector,
579 const float* aVector,
580 unsigned int num_points)
582 float* bPtr = bVector;
583 const float* aPtr = aVector;
585 unsigned int number = 0;
586 const unsigned int eighthPoints = num_points / 8;
588 __m256 aVal, bVal, mantissa, frac, leadingOne;
591 for (; number < eighthPoints; number++) {
593 aVal = _mm256_loadu_ps(aPtr);
594 bias = _mm256_set1_epi32(127);
595 leadingOne = _mm256_set1_ps(1.0f);
596 exp = _mm256_sub_epi32(
597 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
598 _mm256_set1_epi32(0x7f800000)),
601 bVal = _mm256_cvtepi32_ps(exp);
606 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
608#if LOG_POLY_DEGREE == 6
609 mantissa = POLY5_FMAAVX2(frac,
616#elif LOG_POLY_DEGREE == 5
617 mantissa = POLY4_FMAAVX2(frac,
618 2.8882704548164776201f,
619 -2.52074962577807006663f,
620 1.48116647521213171641f,
621 -0.465725644288844778798f,
622 0.0596515482674574969533f);
623#elif LOG_POLY_DEGREE == 4
624 mantissa = POLY3_FMAAVX2(frac,
625 2.61761038894603480148f,
626 -1.75647175389045657003f,
627 0.688243882994381274313f,
628 -0.107254423828329604454f);
629#elif LOG_POLY_DEGREE == 3
630 mantissa = POLY2_FMAAVX2(frac,
631 2.28330284476918490682f,
632 -1.04913055217340124191f,
633 0.204446009836232697516f);
638 bVal = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), bVal);
639 _mm256_storeu_ps(bPtr, bVal);
645 number = eighthPoints * 8;
652#include <immintrin.h>
/*
 * Horner-scheme polynomial evaluation on 8-lane __m256 vectors without FMA:
 * each level is a separate _mm256_mul_ps followed by _mm256_add_ps.
 *
 * POLYn_AVX2(x, c0, ..., cn) expands to
 *     c0 + x * (c1 + x * (c2 + ... + x * cn))
 * with every coefficient broadcast through _mm256_set1_ps. POLY0_AVX2 only
 * broadcasts c0 and ignores x.
 *
 * NOTE: this macro family is also defined earlier in this file with the same
 * replacement text. This copy must stay token-identical to that one, otherwise
 * it becomes an incompatible macro redefinition.
 */
654#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
655#define POLY1_AVX2(x, c0, c1) \
656 _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
657#define POLY2_AVX2(x, c0, c1, c2) \
658 _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
659#define POLY3_AVX2(x, c0, c1, c2, c3) \
660 _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
661#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
662 _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
663#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
664 _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
667volk_32f_log2_32f_u_avx2(
float* bVector,
const float* aVector,
unsigned int num_points)
669 float* bPtr = bVector;
670 const float* aPtr = aVector;
672 unsigned int number = 0;
673 const unsigned int eighthPoints = num_points / 8;
675 __m256 aVal, bVal, mantissa, frac, leadingOne;
678 for (; number < eighthPoints; number++) {
680 aVal = _mm256_loadu_ps(aPtr);
681 bias = _mm256_set1_epi32(127);
682 leadingOne = _mm256_set1_ps(1.0f);
683 exp = _mm256_sub_epi32(
684 _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
685 _mm256_set1_epi32(0x7f800000)),
688 bVal = _mm256_cvtepi32_ps(exp);
693 _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
695#if LOG_POLY_DEGREE == 6
696 mantissa = POLY5_AVX2(frac,
703#elif LOG_POLY_DEGREE == 5
704 mantissa = POLY4_AVX2(frac,
705 2.8882704548164776201f,
706 -2.52074962577807006663f,
707 1.48116647521213171641f,
708 -0.465725644288844778798f,
709 0.0596515482674574969533f);
710#elif LOG_POLY_DEGREE == 4
711 mantissa = POLY3_AVX2(frac,
712 2.61761038894603480148f,
713 -1.75647175389045657003f,
714 0.688243882994381274313f,
715 -0.107254423828329604454f);
716#elif LOG_POLY_DEGREE == 3
717 mantissa = POLY2_AVX2(frac,
718 2.28330284476918490682f,
719 -1.04913055217340124191f,
720 0.204446009836232697516f);
726 _mm256_add_ps(_mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), bVal);
727 _mm256_storeu_ps(bPtr, bVal);
733 number = eighthPoints * 8;