Vector Optimized Library of Kernels
3.1.1
Architecture-tuned implementations of math kernels
Loading...
Searching...
No Matches
volk_32fc_conjugate_32fc.h
Go to the documentation of this file.
1
/* -*- c++ -*- */
2
/*
3
* Copyright 2012, 2014 Free Software Foundation, Inc.
4
*
5
* This file is part of VOLK
6
*
7
* SPDX-License-Identifier: LGPL-3.0-or-later
8
*/
9
55
#ifndef INCLUDED_volk_32fc_conjugate_32fc_u_H
56
#define INCLUDED_volk_32fc_conjugate_32fc_u_H
57
58
#include <float.h>
59
#include <inttypes.h>
60
#include <stdio.h>
61
#include <
volk/volk_complex.h
>
62
63
#ifdef LV_HAVE_AVX
64
#include <immintrin.h>
65
66
/*
 * Complex conjugate, AVX, unaligned buffers.
 *
 * Writes the conjugate of each of the num_points complex samples read from
 * aVector into the matching slot of cVector. Four samples (eight floats) are
 * handled per vector iteration through unaligned loads/stores; whatever is
 * left over (at most three samples) is conjugated one by one with lv_conj.
 */
static inline void volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector,
                                                  const lv_32fc_t* aVector,
                                                  unsigned int num_points)
{
    /* -0.f has only the sign bit set, so XOR-ing with this mask negates the
     * odd (imaginary) float lanes and leaves the even (real) lanes alone. */
    const __m256 signFlip =
        _mm256_setr_ps(0.f, -0.f, 0.f, -0.f, 0.f, -0.f, 0.f, -0.f);

    const lv_32fc_t* in = aVector;
    lv_32fc_t* out = cVector;
    unsigned int remaining = num_points;

    /* Vector body: ar,ai,br,bi,cr,ci,dr,di per register. */
    while (remaining >= 4) {
        const __m256 samples = _mm256_loadu_ps((const float*)in);

        _mm256_storeu_ps((float*)out, _mm256_xor_ps(samples, signFlip));

        in += 4;
        out += 4;
        remaining -= 4;
    }

    /* Scalar tail for the samples that did not fill a whole register. */
    for (; remaining != 0; remaining--) {
        *out++ = lv_conj(*in++);
    }
}
97
#endif
/* LV_HAVE_AVX */
98
99
#ifdef LV_HAVE_SSE3
100
#include <pmmintrin.h>
101
102
/*
 * Complex conjugate, 128-bit SSE registers, unaligned buffers.
 *
 * Writes the conjugate of each of the num_points complex samples read from
 * aVector into the matching slot of cVector. Two samples (four floats) are
 * handled per vector iteration through unaligned loads/stores; if num_points
 * is odd, the final sample is conjugated with lv_conj.
 */
static inline void volk_32fc_conjugate_32fc_u_sse3(lv_32fc_t* cVector,
                                                   const lv_32fc_t* aVector,
                                                   unsigned int num_points)
{
    /* -0.f has only the sign bit set, so XOR-ing with this mask negates the
     * odd (imaginary) float lanes and leaves the even (real) lanes alone. */
    const __m128 signFlip = _mm_setr_ps(0.f, -0.f, 0.f, -0.f);

    const lv_32fc_t* in = aVector;
    lv_32fc_t* out = cVector;
    unsigned int remaining = num_points;

    /* Vector body: ar,ai,br,bi per register. */
    while (remaining >= 2) {
        const __m128 samples = _mm_loadu_ps((const float*)in);

        _mm_storeu_ps((float*)out, _mm_xor_ps(samples, signFlip));

        in += 2;
        out += 2;
        remaining -= 2;
    }

    /* At most one sample can be left over. */
    if (remaining != 0) {
        *out = lv_conj(*in);
    }
}
131
#endif
/* LV_HAVE_SSE3 */
132
133
#ifdef LV_HAVE_GENERIC
134
135
/*
 * Complex conjugate, portable scalar implementation.
 *
 * For every index below num_points, stores lv_conj(aVector[i]) into
 * cVector[i], walking the buffers from the first element to the last.
 */
static inline void volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    unsigned int num_points)
{
    unsigned int i;

    for (i = 0; i < num_points; i++) {
        cVector[i] = lv_conj(aVector[i]);
    }
}
147
#endif
/* LV_HAVE_GENERIC */
148
149
150
#endif
/* INCLUDED_volk_32fc_conjugate_32fc_u_H */
151
#ifndef INCLUDED_volk_32fc_conjugate_32fc_a_H
152
#define INCLUDED_volk_32fc_conjugate_32fc_a_H
153
154
#include <float.h>
155
#include <inttypes.h>
156
#include <stdio.h>
157
#include <
volk/volk_complex.h
>
158
159
#ifdef LV_HAVE_AVX
160
#include <immintrin.h>
161
162
/*
 * Complex conjugate, AVX, aligned buffers.
 *
 * Writes the conjugate of each of the num_points complex samples read from
 * aVector into the matching slot of cVector. Four samples (eight floats) are
 * handled per vector iteration using _mm256_load_ps/_mm256_store_ps, which
 * require 32-byte aligned addresses; whatever is left over (at most three
 * samples) is conjugated one by one with lv_conj.
 */
static inline void volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector,
                                                  const lv_32fc_t* aVector,
                                                  unsigned int num_points)
{
    /* -0.f has only the sign bit set, so XOR-ing with this mask negates the
     * odd (imaginary) float lanes and leaves the even (real) lanes alone. */
    const __m256 signFlip =
        _mm256_setr_ps(0.f, -0.f, 0.f, -0.f, 0.f, -0.f, 0.f, -0.f);

    const lv_32fc_t* in = aVector;
    lv_32fc_t* out = cVector;
    unsigned int remaining = num_points;

    /* Vector body: ar,ai,br,bi,cr,ci,dr,di per register. */
    while (remaining >= 4) {
        const __m256 samples = _mm256_load_ps((const float*)in);

        _mm256_store_ps((float*)out, _mm256_xor_ps(samples, signFlip));

        in += 4;
        out += 4;
        remaining -= 4;
    }

    /* Scalar tail for the samples that did not fill a whole register. */
    for (; remaining != 0; remaining--) {
        *out++ = lv_conj(*in++);
    }
}
193
#endif
/* LV_HAVE_AVX */
194
195
#ifdef LV_HAVE_SSE3
196
#include <pmmintrin.h>
197
198
/*
 * Complex conjugate, 128-bit SSE registers, aligned buffers.
 *
 * Writes the conjugate of each of the num_points complex samples read from
 * aVector into the matching slot of cVector. Two samples (four floats) are
 * handled per vector iteration using _mm_load_ps/_mm_store_ps, which require
 * 16-byte aligned addresses; if num_points is odd, the final sample is
 * conjugated with lv_conj.
 */
static inline void volk_32fc_conjugate_32fc_a_sse3(lv_32fc_t* cVector,
                                                   const lv_32fc_t* aVector,
                                                   unsigned int num_points)
{
    /* -0.f has only the sign bit set, so XOR-ing with this mask negates the
     * odd (imaginary) float lanes and leaves the even (real) lanes alone. */
    const __m128 signFlip = _mm_setr_ps(0.f, -0.f, 0.f, -0.f);

    const lv_32fc_t* in = aVector;
    lv_32fc_t* out = cVector;
    unsigned int remaining = num_points;

    /* Vector body: ar,ai,br,bi per register. */
    while (remaining >= 2) {
        const __m128 samples = _mm_load_ps((const float*)in);

        _mm_store_ps((float*)out, _mm_xor_ps(samples, signFlip));

        in += 2;
        out += 2;
        remaining -= 2;
    }

    /* At most one sample can be left over. */
    if (remaining != 0) {
        *out = lv_conj(*in);
    }
}
227
#endif
/* LV_HAVE_SSE3 */
228
229
#ifdef LV_HAVE_NEON
230
#include <arm_neon.h>
231
232
/*
 * Complex conjugate, ARM NEON.
 *
 * Writes the conjugate of each of the num_points complex samples read from
 * aVector into the matching slot of cVector. Four samples are handled per
 * vector iteration with a de-interleaving load, a negation of the imaginary
 * lanes and a re-interleaving store; whatever is left over (at most three
 * samples) is conjugated one by one with lv_conj.
 */
static inline void volk_32fc_conjugate_32fc_a_neon(lv_32fc_t* cVector,
                                                   const lv_32fc_t* aVector,
                                                   unsigned int num_points)
{
    const lv_32fc_t* in = aVector;
    lv_32fc_t* out = cVector;
    unsigned int remaining = num_points;

    while (remaining >= 4) {
        float32x4x2_t planes;

        /* Hint the samples of the next iteration before touching this one. */
        __VOLK_PREFETCH(in + 4);

        /* De-interleave: val[0] = ar,br,cr,dr and val[1] = ai,bi,ci,di. */
        planes = vld2q_f32((const float*)in);

        /* Negate every imaginary part; the real parts stay untouched. */
        planes.val[1] = vnegq_f32(planes.val[1]);

        /* Re-interleave into ar,ai,br,bi,... while storing. */
        vst2q_f32((float*)out, planes);

        in += 4;
        out += 4;
        remaining -= 4;
    }

    /* Scalar tail for the samples that did not fill a whole iteration. */
    for (; remaining != 0; remaining--) {
        *out++ = lv_conj(*in++);
    }
}
260
#endif
/* LV_HAVE_NEON */
261
262
263
#endif
/* INCLUDED_volk_32fc_conjugate_32fc_a_H */
kernels
volk
volk_32fc_conjugate_32fc.h
Generated by
1.9.8