[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Commit-gnuradio] [gnuradio] 11/14: volk (gsoc): whitespace
From: git
Subject: [Commit-gnuradio] [gnuradio] 11/14: volk (gsoc): whitespace
Date: Wed, 15 Oct 2014 23:25:09 +0000 (UTC)
This is an automated email from the git hooks/post-receive script.
trondeau pushed a commit to branch master
in repository gnuradio.
commit 9230086cad0ea09087e38ce609ef920b24c7a606
Author: Tom Rondeau <address@hidden>
Date: Wed Oct 15 10:50:10 2014 -0400
volk (gsoc): whitespace
Conflicts:
volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
---
volk/kernels/volk/volk_32f_acos_32f.h | 234 ++++++------
volk/kernels/volk/volk_32f_asin_32f.h | 217 ++++++-----
volk/kernels/volk/volk_32f_atan_32f.h | 211 +++++------
volk/kernels/volk/volk_32f_cos_32f.h | 282 +++++++-------
volk/kernels/volk/volk_32f_expfast_32f.h | 221 +++++------
volk/kernels/volk/volk_32f_sin_32f.h | 274 +++++++-------
volk/kernels/volk/volk_32f_tan_32f.h | 293 +++++++--------
volk/kernels/volk/volk_32f_x2_pow_32f.h | 427 ++++++++++------------
volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h | 10 +-
volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h | 1 -
volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h | 44 +--
11 files changed, 1038 insertions(+), 1176 deletions(-)
diff --git a/volk/kernels/volk/volk_32f_acos_32f.h
b/volk/kernels/volk/volk_32f_acos_32f.h
index deba615..19444df 100644
--- a/volk/kernels/volk/volk_32f_acos_32f.h
+++ b/volk/kernels/volk/volk_32f_acos_32f.h
@@ -18,83 +18,67 @@
*/
static inline void volk_32f_acos_32f_a_sse4_1(float* bVector, const float*
aVector, unsigned int num_points){
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- int i, j;
-
- __m128 aVal, d, pi, pio2, x, y, z, arccosine;
- __m128 fzeroes, fones, ftwos, ffours, condition;
-
- pi = _mm_set1_ps(3.14159265358979323846);
- pio2 = _mm_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm_setzero_ps();
- fones = _mm_set1_ps(1.0);
- ftwos = _mm_set1_ps(2.0);
- ffours = _mm_set1_ps(4.0);
-
- for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
- d = aVal;
- aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones,
aVal), _mm_sub_ps(fones, aVal))), aVal);
- z = aVal;
- condition = _mm_cmplt_ps(z, fzeroes);
- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
- x = z;
- condition = _mm_cmplt_ps(z, fones);
- x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z),
z), condition));
-
- for(i = 0; i < 2; i++) x = _mm_add_ps(x,
_mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
- x = _mm_div_ps(fones, x);
- y = fzeroes;
- for(j = TERMS - 1; j >=0 ; j--) y = _mm_add_ps(_mm_mul_ps(y,
_mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
-
- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
- condition = _mm_cmpgt_ps(z, fones);
-
- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y,
ftwos)), condition));
- arccosine = y;
- condition = _mm_cmplt_ps(aVal, fzeroes);
- arccosine = _mm_sub_ps(arccosine,
_mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
- condition = _mm_cmplt_ps(d, fzeroes);
- arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
-
- _mm_store_ps(bPtr, arccosine);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = acos(*aPtr++);
- }
-}
-
-#endif /* LV_HAVE_SSE4_1 for aligned */
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Computes arccosine of input vector and stores results in output vector
- \param bVector The vector where results will be stored
- \param aVector The input vector of floats
- \param num_points Number of points for which arccosine is to be computed
-*/
-static inline void volk_32f_acos_32f_a_generic(float* bVector, const float*
aVector, unsigned int num_points){
float* bPtr = bVector;
const float* aPtr = aVector;
+
unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, d, pi, pio2, x, y, z, arccosine;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pi = _mm_set1_ps(3.14159265358979323846);
+ pio2 = _mm_set1_ps(3.14159265358979323846/2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+ d = aVal;
+ aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal),
_mm_sub_ps(fones, aVal))), aVal);
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ x = z;
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z),
condition));
+
+ for(i = 0; i < 2; i++){
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x,
x))));
+ }
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for(j = TERMS - 1; j >=0 ; j--){
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
_mm_set1_ps(pow(-1,j)/(2*j+1)));
+ }
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)),
condition));
+ arccosine = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine,
ftwos), condition));
+ condition = _mm_cmplt_ps(d, fzeroes);
+ arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
+
+ _mm_store_ps(bPtr, arccosine);
+ aPtr += 4;
+ bPtr += 4;
+ }
- for(number = 0; number < num_points; number++){
- *bPtr++ = acos(*aPtr++);
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = acos(*aPtr++);
}
-
}
-#endif /* LV_HAVE_GENERIC */
-#endif /* INCLUDED_volk_32f_acos_32f_a_H */
+#endif /* LV_HAVE_SSE4_1 for aligned */
+#endif /* INCLUDED_volk_32f_acos_32_f_H */
#ifndef INCLUDED_volk_32f_acos_32f_u_H
#define INCLUDED_volk_32f_acos_32f_u_H
@@ -108,58 +92,62 @@ static inline void volk_32f_acos_32f_a_generic(float*
bVector, const float* aVec
*/
static inline void volk_32f_acos_32f_u_sse4_1(float* bVector, const float*
aVector, unsigned int num_points){
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
unsigned int quarterPoints = num_points / 4;
- int i, j;
-
- __m128 aVal, d, pi, pio2, x, y, z, arccosine;
- __m128 fzeroes, fones, ftwos, ffours, condition;
-
- pi = _mm_set1_ps(3.14159265358979323846);
- pio2 = _mm_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm_setzero_ps();
- fones = _mm_set1_ps(1.0);
- ftwos = _mm_set1_ps(2.0);
- ffours = _mm_set1_ps(4.0);
-
- for(;number < quarterPoints; number++){
- aVal = _mm_loadu_ps(aPtr);
- d = aVal;
- aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones,
aVal), _mm_sub_ps(fones, aVal))), aVal);
- z = aVal;
- condition = _mm_cmplt_ps(z, fzeroes);
- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
- x = z;
- condition = _mm_cmplt_ps(z, fones);
- x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z),
z), condition));
-
- for(i = 0; i < 2; i++) x = _mm_add_ps(x,
_mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
- x = _mm_div_ps(fones, x);
- y = fzeroes;
- for(j = TERMS - 1; j >=0 ; j--) y = _mm_add_ps(_mm_mul_ps(y,
_mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
-
- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
- condition = _mm_cmpgt_ps(z, fones);
-
- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y,
ftwos)), condition));
- arccosine = y;
- condition = _mm_cmplt_ps(aVal, fzeroes);
- arccosine = _mm_sub_ps(arccosine,
_mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
- condition = _mm_cmplt_ps(d, fzeroes);
- arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
-
- _mm_storeu_ps(bPtr, arccosine);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = acos(*aPtr++);
- }
+ int i, j;
+
+ __m128 aVal, d, pi, pio2, x, y, z, arccosine;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pi = _mm_set1_ps(3.14159265358979323846);
+ pio2 = _mm_set1_ps(3.14159265358979323846/2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_loadu_ps(aPtr);
+ d = aVal;
+ aVal = _mm_div_ps(_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal),
_mm_sub_ps(fones, aVal))), aVal);
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ x = z;
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z),
condition));
+
+ for(i = 0; i < 2; i++){
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x,
x))));
+ }
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for(j = TERMS - 1; j >=0 ; j--){
+ x = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
_mm_set1_ps(pow(-1,j)/(2*j+1)));
+ }
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)),
condition));
+ arccosine = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arccosine = _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine,
ftwos), condition));
+ condition = _mm_cmplt_ps(d, fzeroes);
+ arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
+
+ _mm_storeu_ps(bPtr, arccosine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = acos(*aPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for aligned */
@@ -171,15 +159,15 @@ static inline void volk_32f_acos_32f_u_sse4_1(float*
bVector, const float* aVect
\param aVector The input vector of floats
\param num_points Number of points for which arccosine is to be computed
*/
-static inline void volk_32f_acos_32f_u_generic(float* bVector, const float*
aVector, unsigned int num_points){
+static inline void volk_32f_acos_32f_generic(float* bVector, const float*
aVector, unsigned int num_points){
float* bPtr = bVector;
const float* aPtr = aVector;
unsigned int number = 0;
for(number = 0; number < num_points; number++){
- *bPtr++ = acos(*aPtr++);
+ *bPtr++ = acos(*aPtr++);
}
-
+
}
#endif /* LV_HAVE_GENERIC */
diff --git a/volk/kernels/volk/volk_32f_asin_32f.h
b/volk/kernels/volk/volk_32f_asin_32f.h
index 976aabc..80a834b 100644
--- a/volk/kernels/volk/volk_32f_asin_32f.h
+++ b/volk/kernels/volk/volk_32f_asin_32f.h
@@ -18,76 +18,61 @@
*/
static inline void volk_32f_asin_32f_a_sse4_1(float* bVector, const float*
aVector, unsigned int num_points){
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- int i, j;
-
- __m128 aVal, pio2, x, y, z, arcsine;
- __m128 fzeroes, fones, ftwos, ffours, condition;
-
- pio2 = _mm_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm_setzero_ps();
- fones = _mm_set1_ps(1.0);
- ftwos = _mm_set1_ps(2.0);
- ffours = _mm_set1_ps(4.0);
-
- for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
- aVal = _mm_div_ps(aVal,
_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
- z = aVal;
- condition = _mm_cmplt_ps(z, fzeroes);
- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
- x = z;
- condition = _mm_cmplt_ps(z, fones);
- x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z),
z), condition));
-
- for(i = 0; i < 2; i++) x = _mm_add_ps(x,
_mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
- x = _mm_div_ps(fones, x);
- y = fzeroes;
- for(j = TERMS - 1; j >=0 ; j--) y = _mm_add_ps(_mm_mul_ps(y,
_mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
-
- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
- condition = _mm_cmpgt_ps(z, fones);
-
- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y,
ftwos)), condition));
- arcsine = y;
- condition = _mm_cmplt_ps(aVal, fzeroes);
- arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine,
ftwos), condition));
-
- _mm_store_ps(bPtr, arcsine);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = asin(*aPtr++);
- }
-}
-
-#endif /* LV_HAVE_SSE4_1 for aligned */
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Computes arcsine of input vector and stores results in output vector
- \param bVector The vector where results will be stored
- \param aVector The input vector of floats
- \param num_points Number of points for which arcsine is to be computed
-*/
-static inline void volk_32f_asin_32f_a_generic(float* bVector, const float*
aVector, unsigned int num_points){
float* bPtr = bVector;
const float* aPtr = aVector;
+
unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, pio2, x, y, z, arcsine;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm_set1_ps(3.14159265358979323846/2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+ aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones,
aVal), _mm_sub_ps(fones, aVal))));
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ x = z;
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z),
condition));
+
+ for(i = 0; i < 2; i++){
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x,
x))));
+ }
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for(j = TERMS - 1; j >=0 ; j--){
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
_mm_set1_ps(pow(-1,j)/(2*j+1)));
+ }
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)),
condition));
+ arcsine = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos),
condition));
+
+ _mm_store_ps(bPtr, arcsine);
+ aPtr += 4;
+ bPtr += 4;
+ }
- for(number = 0; number < num_points; number++){
- *bPtr++ = asin(*aPtr++);
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = asin(*aPtr++);
}
-
}
-#endif /* LV_HAVE_GENERIC */
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
#endif /* INCLUDED_volk_32f_asin_32f_a_H */
@@ -104,54 +89,58 @@ static inline void volk_32f_asin_32f_a_generic(float*
bVector, const float* aVec
*/
static inline void volk_32f_asin_32f_u_sse4_1(float* bVector, const float*
aVector, unsigned int num_points){
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
unsigned int quarterPoints = num_points / 4;
- int i, j;
-
- __m128 aVal, pio2, x, y, z, arcsine;
- __m128 fzeroes, fones, ftwos, ffours, condition;
-
- pio2 = _mm_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm_setzero_ps();
- fones = _mm_set1_ps(1.0);
- ftwos = _mm_set1_ps(2.0);
- ffours = _mm_set1_ps(4.0);
-
- for(;number < quarterPoints; number++){
- aVal = _mm_loadu_ps(aPtr);
- aVal = _mm_div_ps(aVal,
_mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))));
- z = aVal;
- condition = _mm_cmplt_ps(z, fzeroes);
- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
- x = z;
- condition = _mm_cmplt_ps(z, fones);
- x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z),
z), condition));
-
- for(i = 0; i < 2; i++) x = _mm_add_ps(x,
_mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
- x = _mm_div_ps(fones, x);
- y = fzeroes;
- for(j = TERMS - 1; j >=0 ; j--) y = _mm_add_ps(_mm_mul_ps(y,
_mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
-
- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
- condition = _mm_cmpgt_ps(z, fones);
-
- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y,
ftwos)), condition));
- arcsine = y;
- condition = _mm_cmplt_ps(aVal, fzeroes);
- arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine,
ftwos), condition));
-
- _mm_storeu_ps(bPtr, arcsine);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = asin(*aPtr++);
- }
+ int i, j;
+
+ __m128 aVal, pio2, x, y, z, arcsine;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm_set1_ps(3.14159265358979323846/2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_loadu_ps(aPtr);
+ aVal = _mm_div_ps(aVal, _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones,
aVal), _mm_sub_ps(fones, aVal))));
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ x = z;
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z),
condition));
+
+ for(i = 0; i < 2; i++){
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x,
x))));
+ }
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for(j = TERMS - 1; j >=0 ; j--){
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
_mm_set1_ps(pow(-1,j)/(2*j+1)));
+ }
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)),
condition));
+ arcsine = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arcsine = _mm_sub_ps(arcsine, _mm_and_ps(_mm_mul_ps(arcsine, ftwos),
condition));
+
+ _mm_storeu_ps(bPtr, arcsine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = asin(*aPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for unaligned */
@@ -163,15 +152,15 @@ static inline void volk_32f_asin_32f_u_sse4_1(float*
bVector, const float* aVect
\param aVector The input vector of floats
\param num_points Number of points for which arcsine is to be computed
*/
-static inline void volk_32f_asin_32f_u_generic(float* bVector, const float*
aVector, unsigned int num_points){
+static inline void volk_32f_asin_32f_u_generic(float* bVector, const float*
aVector, unsigned int num_points){
float* bPtr = bVector;
const float* aPtr = aVector;
unsigned int number = 0;
for(number = 0; number < num_points; number++){
- *bPtr++ = asin(*aPtr++);
+ *bPtr++ = asin(*aPtr++);
}
-
+
}
#endif /* LV_HAVE_GENERIC */
diff --git a/volk/kernels/volk/volk_32f_atan_32f.h
b/volk/kernels/volk/volk_32f_atan_32f.h
index a60e2b8..eaee7f3 100644
--- a/volk/kernels/volk/volk_32f_atan_32f.h
+++ b/volk/kernels/volk/volk_32f_atan_32f.h
@@ -18,75 +18,60 @@
*/
static inline void volk_32f_atan_32f_a_sse4_1(float* bVector, const float*
aVector, unsigned int num_points){
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- int i, j;
-
- __m128 aVal, pio2, x, y, z, arctangent;
- __m128 fzeroes, fones, ftwos, ffours, condition;
-
- pio2 = _mm_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm_setzero_ps();
- fones = _mm_set1_ps(1.0);
- ftwos = _mm_set1_ps(2.0);
- ffours = _mm_set1_ps(4.0);
-
- for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
- z = aVal;
- condition = _mm_cmplt_ps(z, fzeroes);
- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
- x = z;
- condition = _mm_cmplt_ps(z, fones);
- x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z),
z), condition));
-
- for(i = 0; i < 2; i++) x = _mm_add_ps(x,
_mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
- x = _mm_div_ps(fones, x);
- y = fzeroes;
- for(j = TERMS - 1; j >=0 ; j--) y = _mm_add_ps(_mm_mul_ps(y,
_mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
-
- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
- condition = _mm_cmpgt_ps(z, fones);
-
- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y,
ftwos)), condition));
- arctangent = y;
- condition = _mm_cmplt_ps(aVal, fzeroes);
- arctangent = _mm_sub_ps(arctangent,
_mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
-
- _mm_store_ps(bPtr, arctangent);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = atan(*aPtr++);
- }
-}
-
-#endif /* LV_HAVE_SSE4_1 for aligned */
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Computes arctangent of input vector and stores results in output
vector
- \param bVector The vector where results will be stored
- \param aVector The input vector of floats
- \param num_points Number of points for which arctangent is to be computed
-*/
-static inline void volk_32f_atan_32f_a_generic(float* bVector, const float*
aVector, unsigned int num_points){
float* bPtr = bVector;
const float* aPtr = aVector;
+
unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, pio2, x, y, z, arctangent;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm_set1_ps(3.14159265358979323846/2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ x = z;
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z),
condition));
+
+ for(i = 0; i < 2; i++){
+ x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x,
x))));
+ }
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for(j = TERMS - 1; j >=0 ; j--){
+ y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
_mm_set1_ps(pow(-1,j)/(2*j+1)));
+ }
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)),
condition));
+ arctangent = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent,
ftwos), condition));
+
+ _mm_store_ps(bPtr, arctangent);
+ aPtr += 4;
+ bPtr += 4;
+ }
- for(number = 0; number < num_points; number++){
- *bPtr++ = atan(*aPtr++);
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = atan(*aPtr++);
}
-
}
-#endif /* LV_HAVE_GENERIC */
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
#endif /* INCLUDED_volk_32f_atan_32f_a_H */
@@ -103,53 +88,53 @@ static inline void volk_32f_atan_32f_a_generic(float*
bVector, const float* aVec
*/
static inline void volk_32f_atan_32f_u_sse4_1(float* bVector, const float*
aVector, unsigned int num_points){
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- int i, j;
-
- __m128 aVal, pio2, x, y, z, arctangent;
- __m128 fzeroes, fones, ftwos, ffours, condition;
-
- pio2 = _mm_set1_ps(3.14159265358979323846/2);
- fzeroes = _mm_setzero_ps();
- fones = _mm_set1_ps(1.0);
- ftwos = _mm_set1_ps(2.0);
- ffours = _mm_set1_ps(4.0);
-
- for(;number < quarterPoints; number++){
- aVal = _mm_loadu_ps(aPtr);
- z = aVal;
- condition = _mm_cmplt_ps(z, fzeroes);
- z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
- x = z;
- condition = _mm_cmplt_ps(z, fones);
- x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z),
z), condition));
-
- for(i = 0; i < 2; i++) x = _mm_add_ps(x,
_mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
- x = _mm_div_ps(fones, x);
- y = fzeroes;
- for(j = TERMS - 1; j >= 0; j--) y = _mm_add_ps(_mm_mul_ps(y,
_mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
-
- y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
- condition = _mm_cmpgt_ps(z, fones);
-
- y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y,
ftwos)), condition));
- arctangent = y;
- condition = _mm_cmplt_ps(aVal, fzeroes);
- arctangent = _mm_sub_ps(arctangent,
_mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
-
- _mm_storeu_ps(bPtr, arctangent);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = atan(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ int i, j;
+
+ __m128 aVal, pio2, x, y, z, arctangent;
+ __m128 fzeroes, fones, ftwos, ffours, condition;
+
+ pio2 = _mm_set1_ps(3.14159265358979323846/2);
+ fzeroes = _mm_setzero_ps();
+ fones = _mm_set1_ps(1.0);
+ ftwos = _mm_set1_ps(2.0);
+ ffours = _mm_set1_ps(4.0);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_loadu_ps(aPtr);
+ z = aVal;
+ condition = _mm_cmplt_ps(z, fzeroes);
+ z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
+ x = z;
+ condition = _mm_cmplt_ps(z, fones);
+ x = _mm_add_ps(x, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z),
condition));
+
+ for(i = 0; i < 2; i++) x = _mm_add_ps(x,
_mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
+ x = _mm_div_ps(fones, x);
+ y = fzeroes;
+ for(j = TERMS - 1; j >= 0; j--) y = _mm_add_ps(_mm_mul_ps(y,
_mm_mul_ps(x, x)), _mm_set1_ps(pow(-1,j)/(2*j+1)));
+
+ y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
+ condition = _mm_cmpgt_ps(z, fones);
+
+ y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)),
condition));
+ arctangent = y;
+ condition = _mm_cmplt_ps(aVal, fzeroes);
+ arctangent = _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent,
ftwos), condition));
+
+ _mm_storeu_ps(bPtr, arctangent);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = atan(*aPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for unaligned */
@@ -161,15 +146,15 @@ static inline void volk_32f_atan_32f_u_sse4_1(float*
bVector, const float* aVect
\param aVector The input vector of floats
\param num_points Number of points for which arctangent is to be computed
*/
-static inline void volk_32f_atan_32f_u_generic(float* bVector, const float*
aVector, unsigned int num_points){
+static inline void volk_32f_atan_32f_generic(float* bVector, const float*
aVector, unsigned int num_points){
float* bPtr = bVector;
const float* aPtr = aVector;
unsigned int number = 0;
for(number = 0; number < num_points; number++){
- *bPtr++ = atan(*aPtr++);
+ *bPtr++ = atan(*aPtr++);
}
-
+
}
#endif /* LV_HAVE_GENERIC */
diff --git a/volk/kernels/volk/volk_32f_cos_32f.h
b/volk/kernels/volk/volk_32f_cos_32f.h
index cd72672..7aa575f 100644
--- a/volk/kernels/volk/volk_32f_cos_32f.h
+++ b/volk/kernels/volk/volk_32f_cos_32f.h
@@ -15,94 +15,75 @@
*/
static inline void volk_32f_cos_32f_a_sse4_1(float* bVector, const float*
aVector, unsigned int num_points){
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- unsigned int i = 0;
-
- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours,
ftwos, fones, fzeroes;
- __m128 sine, cosine, condition1, condition2, condition3;
- __m128i q, r, ones, twos, fours;
-
- m4pi = _mm_set1_ps(1.273239545);
- pio4A = _mm_set1_ps(0.78515625);
- pio4B = _mm_set1_ps(0.241876e-3);
- ffours = _mm_set1_ps(4.0);
- ftwos = _mm_set1_ps(2.0);
- fones = _mm_set1_ps(1.0);
- fzeroes = _mm_setzero_ps();
- ones = _mm_set1_epi32(1);
- twos = _mm_set1_epi32(2);
- fours = _mm_set1_epi32(4);
-
- cp1 = _mm_set1_ps(1.0);
- cp2 = _mm_set1_ps(0.83333333e-1);
- cp3 = _mm_set1_ps(0.2777778e-2);
- cp4 = _mm_set1_ps(0.49603e-4);
- cp5 = _mm_set1_ps(0.551e-6);
-
- for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos),
_mm_cmplt_ps(aVal, fzeroes)));
- q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
- r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3
times argument reduction
- s = _mm_mul_ps(s, s);
- // Evaluate Taylor series
- s =
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
- for(i = 0; i < 3; i++) s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
- s = _mm_div_ps(s, ftwos);
-
- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
- cosine = _mm_sub_ps(fones, s);
-
- condition1 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)),
fzeroes);
-
- // Need this condition only for sin
- //condition2 =
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
_mm_cmplt_ps(aVal, fzeroes));
- condition3 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)),
fzeroes);
-
- cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine),
condition1));
- cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine,
_mm_set1_ps(2.0f)), condition3));
- _mm_store_ps(bPtr, cosine);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = cos(*aPtr++);
- }
-}
-
-#endif /* LV_HAVE_SSE4_1 for aligned */
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Computes cosine of input vector and stores results in output vector
- \param bVector The vector where results will be stored
- \param aVector The input vector of floats
- \param num_points Number of points for which cosine is to be computed
-*/
-static inline void volk_32f_cos_32f_a_generic(float* bVector, const float*
aVector, unsigned int num_points){
float* bPtr = bVector;
const float* aPtr = aVector;
+
unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours,
ftwos, fones, fzeroes;
+ __m128 sine, cosine, condition1, condition2, condition3;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for(;number < quarterPoints; number++){
+
+ aVal = _mm_load_ps(aPtr);
+ s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos),
_mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3
times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s =
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+
+ for(i = 0; i < 3; i++) s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
ones), twos)), fzeroes);
+
+ // Need this condition only for sin
+ //condition2 =
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
_mm_cmplt_ps(aVal, fzeroes));
+ condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
twos), fours)), fzeroes);
+
+ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine),
condition1));
+ cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine,
_mm_set1_ps(2.0f)), condition3));
+ _mm_store_ps(bPtr, cosine);
+ aPtr += 4;
+ bPtr += 4;
+ }
- for(; number < num_points; number++){
- *bPtr++ = cos(*aPtr++);
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = cos(*aPtr++);
}
-
}
-#endif /* LV_HAVE_GENERIC */
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
#endif /* INCLUDED_volk_32f_cos_32f_a_H */
@@ -119,72 +100,73 @@ static inline void volk_32f_cos_32f_a_generic(float*
bVector, const float* aVect
*/
static inline void volk_32f_cos_32f_u_sse4_1(float* bVector, const float*
aVector, unsigned int num_points){
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
unsigned int quarterPoints = num_points / 4;
- unsigned int i = 0;
-
- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours,
ftwos, fones, fzeroes;
- __m128 sine, cosine, condition1, condition2, condition3;
- __m128i q, r, ones, twos, fours;
-
- m4pi = _mm_set1_ps(1.273239545);
- pio4A = _mm_set1_ps(0.78515625);
- pio4B = _mm_set1_ps(0.241876e-3);
- ffours = _mm_set1_ps(4.0);
- ftwos = _mm_set1_ps(2.0);
- fones = _mm_set1_ps(1.0);
- fzeroes = _mm_setzero_ps();
- ones = _mm_set1_epi32(1);
- twos = _mm_set1_epi32(2);
- fours = _mm_set1_epi32(4);
-
- cp1 = _mm_set1_ps(1.0);
- cp2 = _mm_set1_ps(0.83333333e-1);
- cp3 = _mm_set1_ps(0.2777778e-2);
- cp4 = _mm_set1_ps(0.49603e-4);
- cp5 = _mm_set1_ps(0.551e-6);
-
- for(;number < quarterPoints; number++){
-
- aVal = _mm_loadu_ps(aPtr);
- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos),
_mm_cmplt_ps(aVal, fzeroes)));
- q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
- r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3
times argument reduction
- s = _mm_mul_ps(s, s);
- // Evaluate Taylor series
- s =
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
- for(i = 0; i < 3; i++) s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
- s = _mm_div_ps(s, ftwos);
-
- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
- cosine = _mm_sub_ps(fones, s);
-
- condition1 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)),
fzeroes);
-
- // Need this condition only for sin
- //condition2 =
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
_mm_cmplt_ps(aVal, fzeroes));
- condition3 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)),
fzeroes);
-
- cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine),
condition1));
- cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine,
_mm_set1_ps(2.0f)), condition3));
- _mm_storeu_ps(bPtr, cosine);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = cos(*aPtr++);
- }
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours,
ftwos, fones, fzeroes;
+ __m128 sine, cosine, condition1, condition2, condition3;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_loadu_ps(aPtr);
+ s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos),
_mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3
times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s =
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+
+ for(i = 0; i < 3; i++){
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)),
fzeroes);
+
+ // Need this condition only for sin
+ //condition2 =
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
_mm_cmplt_ps(aVal, fzeroes));
+ condition3 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)),
fzeroes);
+
+ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine),
condition1));
+ cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine,
_mm_set1_ps(2.0f)), condition3));
+ _mm_storeu_ps(bPtr, cosine);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = cos(*aPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for unaligned */
@@ -196,15 +178,15 @@ static inline void volk_32f_cos_32f_u_sse4_1(float*
bVector, const float* aVecto
\param aVector The input vector of floats
\param num_points Number of points for which cosine is to be computed
*/
-static inline void volk_32f_cos_32f_u_generic(float* bVector, const float*
aVector, unsigned int num_points){
+static inline void volk_32f_cos_32f_generic(float* bVector, const float*
aVector, unsigned int num_points){
float* bPtr = bVector;
const float* aPtr = aVector;
unsigned int number = 0;
for(; number < num_points; number++){
- *bPtr++ = cos(*aPtr++);
+ *bPtr++ = cos(*aPtr++);
}
-
+
}
#endif /* LV_HAVE_GENERIC */
diff --git a/volk/kernels/volk/volk_32f_expfast_32f.h
b/volk/kernels/volk/volk_32f_expfast_32f.h
index 30f82d5..b8f6ea6 100644
--- a/volk/kernels/volk/volk_32f_expfast_32f.h
+++ b/volk/kernels/volk/volk_32f_expfast_32f.h
@@ -21,31 +21,31 @@
*/
static inline void volk_32f_expfast_32f_a_avx(float* bVector, const float*
aVector, unsigned int num_points){
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- __m256 aVal, bVal, a, b;
- __m256i exp;
- a = _mm256_set1_ps(A/Mln2);
- b = _mm256_set1_ps(B-C);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_load_ps(aPtr);
- exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b));
- bVal = _mm256_castsi256_ps(exp);
-
- _mm256_store_ps(bPtr, bVal);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = expf(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, a, b;
+ __m256i exp;
+ a = _mm256_set1_ps(A/Mln2);
+ b = _mm256_set1_ps(B-C);
+
+ for(;number < eighthPoints; number++){
+ aVal = _mm256_load_ps(aPtr);
+ exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b));
+ bVal = _mm256_castsi256_ps(exp);
+
+ _mm256_store_ps(bPtr, bVal);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(;number < num_points; number++){
+ *bPtr++ = expf(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX for aligned */
@@ -60,54 +60,34 @@ static inline void volk_32f_expfast_32f_a_avx(float*
bVector, const float* aVect
*/
static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector, const float*
aVector, unsigned int num_points){
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- __m128 aVal, bVal, a, b;
- __m128i exp;
- a = _mm_set1_ps(A/Mln2);
- b = _mm_set1_ps(B-C);
-
- for(;number < quarterPoints; number++){
- aVal = _mm_load_ps(aPtr);
- exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b));
- bVal = _mm_castsi128_ps(exp);
-
- _mm_store_ps(bPtr, bVal);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = expf(*aPtr++);
- }
-}
-
-#endif /* LV_HAVE_SSE4_1 for aligned */
-
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Computes fast exp (max 7% error) of input vector and stores results
in output vector
- \param bVector The vector where results will be stored
- \param aVector The input vector of floats
- \param num_points Number of points for which exp is to be computed
-*/
-static inline void volk_32f_expfast_32f_a_generic(float* bVector, const float*
aVector, unsigned int num_points){
float* bPtr = bVector;
const float* aPtr = aVector;
+
unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
- for(number = 0; number < num_points; number++){
- *bPtr++ = expf(*aPtr++);
+ __m128 aVal, bVal, a, b;
+ __m128i exp;
+ a = _mm_set1_ps(A/Mln2);
+ b = _mm_set1_ps(B-C);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+ exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b));
+ bVal = _mm_castsi128_ps(exp);
+
+ _mm_store_ps(bPtr, bVal);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = expf(*aPtr++);
}
-
}
-#endif /* LV_HAVE_GENERIC */
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
#endif /* INCLUDED_volk_32f_expfast_32f_a_H */
@@ -124,31 +104,31 @@ static inline void volk_32f_expfast_32f_a_generic(float*
bVector, const float* a
*/
static inline void volk_32f_expfast_32f_u_avx(float* bVector, const float*
aVector, unsigned int num_points){
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- const unsigned int eighthPoints = num_points / 8;
-
- __m256 aVal, bVal, a, b;
- __m256i exp;
- a = _mm256_set1_ps(A/Mln2);
- b = _mm256_set1_ps(B-C);
-
- for(;number < eighthPoints; number++){
- aVal = _mm256_loadu_ps(aPtr);
- exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b));
- bVal = _mm256_castsi256_ps(exp);
-
- _mm256_storeu_ps(bPtr, bVal);
- aPtr += 8;
- bPtr += 8;
- }
-
- number = eighthPoints * 8;
- for(;number < num_points; number++){
- *bPtr++ = expf(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ __m256 aVal, bVal, a, b;
+ __m256i exp;
+ a = _mm256_set1_ps(A/Mln2);
+ b = _mm256_set1_ps(B-C);
+
+ for(;number < eighthPoints; number++){
+ aVal = _mm256_loadu_ps(aPtr);
+ exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a,aVal), b));
+ bVal = _mm256_castsi256_ps(exp);
+
+ _mm256_storeu_ps(bPtr, bVal);
+ aPtr += 8;
+ bPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(;number < num_points; number++){
+ *bPtr++ = expf(*aPtr++);
+ }
}
#endif /* LV_HAVE_AVX for aligned */
@@ -163,36 +143,35 @@ static inline void volk_32f_expfast_32f_u_avx(float*
bVector, const float* aVect
*/
static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector, const float*
aVector, unsigned int num_points){
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- __m128 aVal, bVal, a, b;
- __m128i exp;
- a = _mm_set1_ps(A/Mln2);
- b = _mm_set1_ps(B-C);
-
- for(;number < quarterPoints; number++){
- aVal = _mm_loadu_ps(aPtr);
- exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b));
- bVal = _mm_castsi128_ps(exp);
-
- _mm_storeu_ps(bPtr, bVal);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = expf(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, a, b;
+ __m128i exp;
+ a = _mm_set1_ps(A/Mln2);
+ b = _mm_set1_ps(B-C);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_loadu_ps(aPtr);
+ exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a,aVal), b));
+ bVal = _mm_castsi128_ps(exp);
+
+ _mm_storeu_ps(bPtr, bVal);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = expf(*aPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for unaligned */
-
#ifdef LV_HAVE_GENERIC
/*!
\brief Computes fast exp (max 7% error) of input vector and stores results
in output vector
@@ -200,15 +179,15 @@ static inline void volk_32f_expfast_32f_u_sse4_1(float*
bVector, const float* aV
\param aVector The input vector of floats
\param num_points Number of points for which log is to be computed
*/
-static inline void volk_32f_expfast_32f_u_generic(float* bVector, const float*
aVector, unsigned int num_points){
+static inline void volk_32f_expfast_32f_generic(float* bVector, const float*
aVector, unsigned int num_points){
float* bPtr = bVector;
const float* aPtr = aVector;
unsigned int number = 0;
for(number = 0; number < num_points; number++){
- *bPtr++ = expf(*aPtr++);
+ *bPtr++ = expf(*aPtr++);
}
-
+
}
#endif /* LV_HAVE_GENERIC */
diff --git a/volk/kernels/volk/volk_32f_sin_32f.h
b/volk/kernels/volk/volk_32f_sin_32f.h
index 5147c54..96e021a 100644
--- a/volk/kernels/volk/volk_32f_sin_32f.h
+++ b/volk/kernels/volk/volk_32f_sin_32f.h
@@ -15,93 +15,74 @@
*/
static inline void volk_32f_sin_32f_a_sse4_1(float* bVector, const float*
aVector, unsigned int num_points){
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- unsigned int i = 0;
-
- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours,
ftwos, fones, fzeroes;
- __m128 sine, cosine, condition1, condition2, condition3;
- __m128i q, r, ones, twos, fours;
-
- m4pi = _mm_set1_ps(1.273239545);
- pio4A = _mm_set1_ps(0.78515625);
- pio4B = _mm_set1_ps(0.241876e-3);
- ffours = _mm_set1_ps(4.0);
- ftwos = _mm_set1_ps(2.0);
- fones = _mm_set1_ps(1.0);
- fzeroes = _mm_setzero_ps();
- ones = _mm_set1_epi32(1);
- twos = _mm_set1_epi32(2);
- fours = _mm_set1_epi32(4);
-
- cp1 = _mm_set1_ps(1.0);
- cp2 = _mm_set1_ps(0.83333333e-1);
- cp3 = _mm_set1_ps(0.2777778e-2);
- cp4 = _mm_set1_ps(0.49603e-4);
- cp5 = _mm_set1_ps(0.551e-6);
-
- for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos),
_mm_cmplt_ps(aVal, fzeroes)));
- q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
- r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3
times argument reduction
- s = _mm_mul_ps(s, s);
- // Evaluate Taylor series
- s =
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
- for(i = 0; i < 3; i++) s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
- s = _mm_div_ps(s, ftwos);
-
- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
- cosine = _mm_sub_ps(fones, s);
-
- condition1 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)),
fzeroes);
- condition2 =
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
_mm_cmplt_ps(aVal, fzeroes));
- // Need this condition only for cos
- //condition3 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)),
fzeroes);
-
- sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine),
condition1));
- sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)),
condition2));
- _mm_store_ps(bPtr, sine);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = sin(*aPtr++);
- }
-}
-
-#endif /* LV_HAVE_SSE4_1 for aligned */
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Computes sine of input vector and stores results in output vector
- \param bVector The vector where results will be stored
- \param aVector The input vector of floats
- \param num_points Number of points for which sine is to be computed
-*/
-static inline void volk_32f_sin_32f_a_generic(float* bVector, const float*
aVector, unsigned int num_points){
float* bPtr = bVector;
const float* aPtr = aVector;
+
unsigned int number = 0;
-
- for(; number < num_points; number++){
- *bPtr++ = sin(*aPtr++);
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours,
ftwos, fones, fzeroes;
+ __m128 sine, cosine, condition1, condition2, condition3;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+ s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos),
_mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3
times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s =
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+
+ for(i = 0; i < 3; i++) {
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)),
fzeroes);
+ condition2 =
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
_mm_cmplt_ps(aVal, fzeroes));
+ // Need this condition only for cos
+ //condition3 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)),
fzeroes);
+
+ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine),
condition1));
+ sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine,
_mm_set1_ps(2.0f)), condition2));
+ _mm_store_ps(bPtr, sine);
+ aPtr += 4;
+ bPtr += 4;
}
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = sin(*aPtr++);
+ }
}
-#endif /* LV_HAVE_GENERIC */
+#endif /* LV_HAVE_SSE4_1 for aligned */
#endif /* INCLUDED_volk_32f_sin_32f_a_H */
@@ -118,71 +99,72 @@ static inline void volk_32f_sin_32f_a_generic(float*
bVector, const float* aVect
*/
static inline void volk_32f_sin_32f_u_sse4_1(float* bVector, const float*
aVector, unsigned int num_points){
- float* bPtr = bVector;
- const float* aPtr = aVector;
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- unsigned int i = 0;
-
- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours,
ftwos, fones, fzeroes;
- __m128 sine, cosine, condition1, condition2, condition3;
- __m128i q, r, ones, twos, fours;
-
- m4pi = _mm_set1_ps(1.273239545);
- pio4A = _mm_set1_ps(0.78515625);
- pio4B = _mm_set1_ps(0.241876e-3);
- ffours = _mm_set1_ps(4.0);
- ftwos = _mm_set1_ps(2.0);
- fones = _mm_set1_ps(1.0);
- fzeroes = _mm_setzero_ps();
- ones = _mm_set1_epi32(1);
- twos = _mm_set1_epi32(2);
- fours = _mm_set1_epi32(4);
-
- cp1 = _mm_set1_ps(1.0);
- cp2 = _mm_set1_ps(0.83333333e-1);
- cp3 = _mm_set1_ps(0.2777778e-2);
- cp4 = _mm_set1_ps(0.49603e-4);
- cp5 = _mm_set1_ps(0.551e-6);
-
- for(;number < quarterPoints; number++){
-
- aVal = _mm_loadu_ps(aPtr);
- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos),
_mm_cmplt_ps(aVal, fzeroes)));
- q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
- r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3
times argument reduction
- s = _mm_mul_ps(s, s);
- // Evaluate Taylor series
- s =
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
- for(i = 0; i < 3; i++) s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
- s = _mm_div_ps(s, ftwos);
-
- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
- cosine = _mm_sub_ps(fones, s);
-
- condition1 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)),
fzeroes);
- condition2 =
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
_mm_cmplt_ps(aVal, fzeroes));
- // Need this condition only for cos
- //condition3 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)),
fzeroes);
-
- sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine),
condition1));
- sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)),
condition2));
- _mm_storeu_ps(bPtr, sine);
- aPtr += 4;
- bPtr += 4;
- }
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours,
ftwos, fones, fzeroes;
+ __m128 sine, cosine, condition1, condition2, condition3;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_loadu_ps(aPtr);
+ s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos),
_mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3
times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s =
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+
+ for(i = 0; i < 3; i++) {
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)),
fzeroes);
+ condition2 =
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
_mm_cmplt_ps(aVal, fzeroes));
+ // Need this condition only for cos
+ //condition3 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)),
fzeroes);
+
+ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine),
condition1));
+ sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine,
_mm_set1_ps(2.0f)), condition2));
+ _mm_storeu_ps(bPtr, sine);
+ aPtr += 4;
+ bPtr += 4;
+ }
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = sin(*aPtr++);
- }
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = sin(*aPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for unaligned */
@@ -194,13 +176,13 @@ static inline void volk_32f_sin_32f_u_sse4_1(float*
bVector, const float* aVecto
\param aVector The input vector of floats
\param num_points Number of points for which sine is to be computed
*/
-static inline void volk_32f_sin_32f_u_generic(float* bVector, const float*
aVector, unsigned int num_points){
+static inline void volk_32f_sin_32f_generic(float* bVector, const float*
aVector, unsigned int num_points){
float* bPtr = bVector;
const float* aPtr = aVector;
unsigned int number = 0;
for(number = 0; number < num_points; number++){
- *bPtr++ = sin(*aPtr++);
+ *bPtr++ = sin(*aPtr++);
}
}
diff --git a/volk/kernels/volk/volk_32f_tan_32f.h
b/volk/kernels/volk/volk_32f_tan_32f.h
index 48611b0..70eb5e3 100644
--- a/volk/kernels/volk/volk_32f_tan_32f.h
+++ b/volk/kernels/volk/volk_32f_tan_32f.h
@@ -15,96 +15,78 @@
*/
static inline void volk_32f_tan_32f_a_sse4_1(float* bVector, const float*
aVector, unsigned int num_points){
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- unsigned int i = 0;
-
- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours,
ftwos, fones, fzeroes;
- __m128 sine, cosine, tangent, condition1, condition2, condition3;
- __m128i q, r, ones, twos, fours;
-
- m4pi = _mm_set1_ps(1.273239545);
- pio4A = _mm_set1_ps(0.78515625);
- pio4B = _mm_set1_ps(0.241876e-3);
- ffours = _mm_set1_ps(4.0);
- ftwos = _mm_set1_ps(2.0);
- fones = _mm_set1_ps(1.0);
- fzeroes = _mm_setzero_ps();
- ones = _mm_set1_epi32(1);
- twos = _mm_set1_epi32(2);
- fours = _mm_set1_epi32(4);
-
- cp1 = _mm_set1_ps(1.0);
- cp2 = _mm_set1_ps(0.83333333e-1);
- cp3 = _mm_set1_ps(0.2777778e-2);
- cp4 = _mm_set1_ps(0.49603e-4);
- cp5 = _mm_set1_ps(0.551e-6);
-
- for(;number < quarterPoints; number++){
-
- aVal = _mm_load_ps(aPtr);
- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos),
_mm_cmplt_ps(aVal, fzeroes)));
- q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
- r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3
times argument reduction
- s = _mm_mul_ps(s, s);
- // Evaluate Taylor series
- s =
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
- for(i = 0; i < 3; i++) s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
- s = _mm_div_ps(s, ftwos);
-
- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
- cosine = _mm_sub_ps(fones, s);
-
- condition1 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)),
fzeroes);
- condition2 =
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
_mm_cmplt_ps(aVal, fzeroes));
- condition3 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)),
fzeroes);
-
- __m128 temp = cosine;
- cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine),
condition1));
- sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
- sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)),
condition2));
- cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine,
_mm_set1_ps(2.0f)), condition3));
- tangent = _mm_div_ps(sine, cosine);
- _mm_store_ps(bPtr, tangent);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = tan(*aPtr++);
- }
-}
-
-#endif /* LV_HAVE_SSE4_1 for aligned */
-
-#ifdef LV_HAVE_GENERIC
-/*!
- \brief Computes tangent of input vector and stores results in output vector
- \param bVector The vector where results will be stored
- \param aVector The input vector of floats
- \param num_points Number of points for which tangent is to be computed
-*/
-static inline void volk_32f_tan_32f_a_generic(float* bVector, const float*
aVector, unsigned int num_points){
float* bPtr = bVector;
const float* aPtr = aVector;
+
unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours,
ftwos, fones, fzeroes;
+ __m128 sine, cosine, tangent, condition1, condition2, condition3;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_load_ps(aPtr);
+ s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos),
_mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3
times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s =
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+
+ for(i = 0; i < 3; i++){
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)),
fzeroes);
+ condition2 =
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
_mm_cmplt_ps(aVal, fzeroes));
+ condition3 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)),
fzeroes);
+
+ __m128 temp = cosine;
+ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine),
condition1));
+ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine),
condition1));
+ sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine,
_mm_set1_ps(2.0f)), condition2));
+ cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine,
_mm_set1_ps(2.0f)), condition3));
+ tangent = _mm_div_ps(sine, cosine);
+ _mm_store_ps(bPtr, tangent);
+ aPtr += 4;
+ bPtr += 4;
+ }
- for(; number < num_points; number++){
- *bPtr++ = tan(*aPtr++);
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = tan(*aPtr++);
}
-
}
-#endif /* LV_HAVE_GENERIC */
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
#endif /* INCLUDED_volk_32f_tan_32f_a_H */
@@ -121,74 +103,75 @@ static inline void volk_32f_tan_32f_a_generic(float*
bVector, const float* aVect
*/
static inline void volk_32f_tan_32f_u_sse4_1(float* bVector, const float*
aVector, unsigned int num_points){
- float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- unsigned int quarterPoints = num_points / 4;
- unsigned int i = 0;
-
- __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours,
ftwos, fones, fzeroes;
- __m128 sine, cosine, tangent, condition1, condition2, condition3;
- __m128i q, r, ones, twos, fours;
-
- m4pi = _mm_set1_ps(1.273239545);
- pio4A = _mm_set1_ps(0.78515625);
- pio4B = _mm_set1_ps(0.241876e-3);
- ffours = _mm_set1_ps(4.0);
- ftwos = _mm_set1_ps(2.0);
- fones = _mm_set1_ps(1.0);
- fzeroes = _mm_setzero_ps();
- ones = _mm_set1_epi32(1);
- twos = _mm_set1_epi32(2);
- fours = _mm_set1_epi32(4);
-
- cp1 = _mm_set1_ps(1.0);
- cp2 = _mm_set1_ps(0.83333333e-1);
- cp3 = _mm_set1_ps(0.2777778e-2);
- cp4 = _mm_set1_ps(0.49603e-4);
- cp5 = _mm_set1_ps(0.551e-6);
-
- for(;number < quarterPoints; number++){
-
- aVal = _mm_loadu_ps(aPtr);
- s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos),
_mm_cmplt_ps(aVal, fzeroes)));
- q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
- r = _mm_add_epi32(q, _mm_and_si128(q, ones));
-
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
- s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
-
- s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3
times argument reduction
- s = _mm_mul_ps(s, s);
- // Evaluate Taylor series
- s =
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
cp5), cp4), s), cp3), s), cp2), s), cp1), s);
-
- for(i = 0; i < 3; i++) s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
- s = _mm_div_ps(s, ftwos);
-
- sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
- cosine = _mm_sub_ps(fones, s);
-
- condition1 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)),
fzeroes);
- condition2 =
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
_mm_cmplt_ps(aVal, fzeroes));
- condition3 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)),
fzeroes);
-
- __m128 temp = cosine;
- cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine),
condition1));
- sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine), condition1));
- sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)),
condition2));
- cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine,
_mm_set1_ps(2.0f)), condition3));
- tangent = _mm_div_ps(sine, cosine);
- _mm_storeu_ps(bPtr, tangent);
- aPtr += 4;
- bPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *bPtr++ = tan(*aPtr++);
- }
+ float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ unsigned int quarterPoints = num_points / 4;
+ unsigned int i = 0;
+
+ __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours,
ftwos, fones, fzeroes;
+ __m128 sine, cosine, tangent, condition1, condition2, condition3;
+ __m128i q, r, ones, twos, fours;
+
+ m4pi = _mm_set1_ps(1.273239545);
+ pio4A = _mm_set1_ps(0.78515625);
+ pio4B = _mm_set1_ps(0.241876e-3);
+ ffours = _mm_set1_ps(4.0);
+ ftwos = _mm_set1_ps(2.0);
+ fones = _mm_set1_ps(1.0);
+ fzeroes = _mm_setzero_ps();
+ ones = _mm_set1_epi32(1);
+ twos = _mm_set1_epi32(2);
+ fours = _mm_set1_epi32(4);
+
+ cp1 = _mm_set1_ps(1.0);
+ cp2 = _mm_set1_ps(0.83333333e-1);
+ cp3 = _mm_set1_ps(0.2777778e-2);
+ cp4 = _mm_set1_ps(0.49603e-4);
+ cp5 = _mm_set1_ps(0.551e-6);
+
+ for(;number < quarterPoints; number++){
+ aVal = _mm_loadu_ps(aPtr);
+ s = _mm_sub_ps(aVal, _mm_and_ps(_mm_mul_ps(aVal, ftwos),
_mm_cmplt_ps(aVal, fzeroes)));
+ q = _mm_cvtps_epi32(_mm_mul_ps(s, m4pi));
+ r = _mm_add_epi32(q, _mm_and_si128(q, ones));
+
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
+ s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
+
+ s = _mm_div_ps(s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3
times argument reduction
+ s = _mm_mul_ps(s, s);
+ // Evaluate Taylor series
+ s =
_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s,
cp5), cp4), s), cp3), s), cp2), s), cp1), s);
+
+ for(i = 0; i < 3; i++){
+ s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
+ }
+ s = _mm_div_ps(s, ftwos);
+
+ sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
+ cosine = _mm_sub_ps(fones, s);
+
+ condition1 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)),
fzeroes);
+ condition2 =
_mm_cmpneq_ps(_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
_mm_cmplt_ps(aVal, fzeroes));
+ condition3 =
_mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)),
fzeroes);
+
+ __m128 temp = cosine;
+ cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine),
condition1));
+ sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(temp, sine),
condition1));
+ sine = _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine,
_mm_set1_ps(2.0f)), condition2));
+ cosine = _mm_sub_ps(cosine, _mm_and_ps(_mm_mul_ps(cosine,
_mm_set1_ps(2.0f)), condition3));
+ tangent = _mm_div_ps(sine, cosine);
+ _mm_storeu_ps(bPtr, tangent);
+ aPtr += 4;
+ bPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *bPtr++ = tan(*aPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for unaligned */
@@ -200,15 +183,15 @@ static inline void volk_32f_tan_32f_u_sse4_1(float*
bVector, const float* aVecto
\param aVector The input vector of floats
\param num_points Number of points for which tangent is to be computed
*/
-static inline void volk_32f_tan_32f_u_generic(float* bVector, const float*
aVector, unsigned int num_points){
+static inline void volk_32f_tan_32f_generic(float* bVector, const float*
aVector, unsigned int num_points){
float* bPtr = bVector;
const float* aPtr = aVector;
unsigned int number = 0;
for(; number < num_points; number++){
- *bPtr++ = tan(*aPtr++);
+ *bPtr++ = tan(*aPtr++);
}
-
+
}
#endif /* LV_HAVE_GENERIC */
diff --git a/volk/kernels/volk/volk_32f_x2_pow_32f.h
b/volk/kernels/volk/volk_32f_x2_pow_32f.h
index cc11daf..43de06c 100755
--- a/volk/kernels/volk/volk_32f_x2_pow_32f.h
+++ b/volk/kernels/volk/volk_32f_x2_pow_32f.h
@@ -15,7 +15,8 @@
#ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
#define INCLUDED_volk_32f_x2_pow_32f_a_H
-#ifdef LV_HAVE_GENERIC
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
/*!
\brief Computes pow(x,y) by using exp and log
\param cVector The vector where results will be stored
@@ -23,132 +24,108 @@
\param bVector The input vector of indices
\param num_points Number of points for which pow is to be computed
*/
-static inline void volk_32f_x2_pow_32f_a_generic(float* cVector, const float*
bVector, const float* aVector, unsigned int num_points){
+static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float*
bVector, const float* aVector, unsigned int num_points){
+
float* cPtr = cVector;
const float* bPtr = bVector;
const float* aPtr = aVector;
- unsigned int number = 0;
- for(number = 0; number < num_points; number++){
- *cPtr++ = pow(*aPtr++, *bPtr++);
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+ __m128 tmp, fx, mask, pow2n, z, y;
+ __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+ __m128i bias, exp, emm0, pi32_0x7f;
+
+ one = _mm_set1_ps(1.0);
+ exp_hi = _mm_set1_ps(88.3762626647949);
+ exp_lo = _mm_set1_ps(-88.3762626647949);
+ ln2 = _mm_set1_ps(0.6931471805);
+ log2EF = _mm_set1_ps(1.44269504088896341);
+ half = _mm_set1_ps(0.5);
+ exp_C1 = _mm_set1_ps(0.693359375);
+ exp_C2 = _mm_set1_ps(-2.12194440e-4);
+ pi32_0x7f = _mm_set1_epi32(0x7f);
+
+ exp_p0 = _mm_set1_ps(1.9875691500e-4);
+ exp_p1 = _mm_set1_ps(1.3981999507e-3);
+ exp_p2 = _mm_set1_ps(8.3334519073e-3);
+ exp_p3 = _mm_set1_ps(4.1665795894e-2);
+ exp_p4 = _mm_set1_ps(1.6666665459e-1);
+ exp_p5 = _mm_set1_ps(5.0000001201e-1);
+
+ for(;number < quarterPoints; number++){
+ // First compute the logarithm
+ aVal = _mm_load_ps(aPtr);
+ bias = _mm_set1_epi32(127);
+ leadingOne = _mm_set1_ps(1.0f);
+ exp =
_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal),
_mm_set1_epi32(0x7f800000)), 23), bias);
+ logarithm = _mm_cvtepi32_ps(exp);
+
+ frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal,
_mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+
+ #if LOG_POLY_DEGREE == 6
+ mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f,
-1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+ #elif LOG_POLY_DEGREE == 5
+ mantissa = POLY4( frac, 2.8882704548164776201f,
-2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f,
0.0596515482674574969533f);
+ #elif LOG_POLY_DEGREE == 4
+ mantissa = POLY3( frac, 2.61761038894603480148f,
-1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+ #elif LOG_POLY_DEGREE == 3
+ mantissa = POLY2( frac, 2.28330284476918490682f,
-1.04913055217340124191f, 0.204446009836232697516f);
+ #else
+ #error
+ #endif
+
+ logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa,
_mm_sub_ps(frac, leadingOne)));
+ logarithm = _mm_mul_ps(logarithm, ln2);
+
+
+ // Now calculate b*lna
+ bVal = _mm_load_ps(bPtr);
+ bVal = _mm_mul_ps(bVal, logarithm);
+
+ // Now compute exp(b*lna)
+ tmp = _mm_setzero_ps();
+
+ bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
+
+ fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
+
+ emm0 = _mm_cvttps_epi32(fx);
+ tmp = _mm_cvtepi32_ps(emm0);
+
+ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
+ fx = _mm_sub_ps(tmp, mask);
+
+ tmp = _mm_mul_ps(fx, exp_C1);
+ z = _mm_mul_ps(fx, exp_C2);
+ bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
+ z = _mm_mul_ps(bVal, bVal);
+
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
+ y = _mm_add_ps(y, one);
+
+ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f),
23);
+
+ pow2n = _mm_castsi128_ps(emm0);
+ cVal = _mm_mul_ps(y, pow2n);
+
+ _mm_store_ps(cPtr, cVal);
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
}
-
-}
-#endif /* LV_HAVE_GENERIC */
-
-
-#ifdef LV_HAVE_SSE4_1
-#include <smmintrin.h>
-/*!
- \brief Computes pow(x,y) by using exp and log
- \param cVector The vector where results will be stored
- \param aVector The input vector of bases
- \param bVector The input vector of indices
- \param num_points Number of points for which pow is to be computed
-*/
-static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float*
bVector, const float* aVector, unsigned int num_points){
- float* cPtr = cVector;
- const float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
- __m128 tmp, fx, mask, pow2n, z, y;
- __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
- __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
- __m128i bias, exp, emm0, pi32_0x7f;
-
- one = _mm_set1_ps(1.0);
- exp_hi = _mm_set1_ps(88.3762626647949);
- exp_lo = _mm_set1_ps(-88.3762626647949);
- ln2 = _mm_set1_ps(0.6931471805);
- log2EF = _mm_set1_ps(1.44269504088896341);
- half = _mm_set1_ps(0.5);
- exp_C1 = _mm_set1_ps(0.693359375);
- exp_C2 = _mm_set1_ps(-2.12194440e-4);
- pi32_0x7f = _mm_set1_epi32(0x7f);
-
- exp_p0 = _mm_set1_ps(1.9875691500e-4);
- exp_p1 = _mm_set1_ps(1.3981999507e-3);
- exp_p2 = _mm_set1_ps(8.3334519073e-3);
- exp_p3 = _mm_set1_ps(4.1665795894e-2);
- exp_p4 = _mm_set1_ps(1.6666665459e-1);
- exp_p5 = _mm_set1_ps(5.0000001201e-1);
-
- for(;number < quarterPoints; number++){
-
- // First compute the logarithm
- aVal = _mm_load_ps(aPtr);
- bias = _mm_set1_epi32(127);
- leadingOne = _mm_set1_ps(1.0f);
- exp =
_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal),
_mm_set1_epi32(0x7f800000)), 23), bias);
- logarithm = _mm_cvtepi32_ps(exp);
-
- frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal,
_mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
-
- #if LOG_POLY_DEGREE == 6
- mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f,
-1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
- #elif LOG_POLY_DEGREE == 5
- mantissa = POLY4( frac, 2.8882704548164776201f,
-2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f,
0.0596515482674574969533f);
- #elif LOG_POLY_DEGREE == 4
- mantissa = POLY3( frac, 2.61761038894603480148f,
-1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
- #elif LOG_POLY_DEGREE == 3
- mantissa = POLY2( frac, 2.28330284476918490682f,
-1.04913055217340124191f, 0.204446009836232697516f);
- #else
- #error
- #endif
-
- logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac,
leadingOne)));
- logarithm = _mm_mul_ps(logarithm, ln2);
-
-
- // Now calculate b*lna
- bVal = _mm_load_ps(bPtr);
- bVal = _mm_mul_ps(bVal, logarithm);
-
- // Now compute exp(b*lna)
- tmp = _mm_setzero_ps();
-
- bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
-
- fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
-
- emm0 = _mm_cvttps_epi32(fx);
- tmp = _mm_cvtepi32_ps(emm0);
-
- mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
- fx = _mm_sub_ps(tmp, mask);
-
- tmp = _mm_mul_ps(fx, exp_C1);
- z = _mm_mul_ps(fx, exp_C2);
- bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
- z = _mm_mul_ps(bVal, bVal);
-
- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
- y = _mm_add_ps(y, one);
-
- emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f),
23);
-
- pow2n = _mm_castsi128_ps(emm0);
- cVal = _mm_mul_ps(y, pow2n);
-
- _mm_store_ps(cPtr, cVal);
-
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *cPtr++ = pow(*aPtr++, *bPtr++);
- }
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = pow(*aPtr++, *bPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for aligned */
@@ -166,16 +143,16 @@ static inline void volk_32f_x2_pow_32f_a_sse4_1(float*
cVector, const float* bVe
\param bVector The input vector of indices
\param num_points Number of points for which pow is to be computed
*/
-static inline void volk_32f_x2_pow_32f_u_generic(float* cVector, const float*
bVector, const float* aVector, unsigned int num_points){
+static inline void volk_32f_x2_pow_32f_generic(float* cVector, const float*
bVector, const float* aVector, unsigned int num_points){
float* cPtr = cVector;
const float* bPtr = bVector;
const float* aPtr = aVector;
unsigned int number = 0;
for(number = 0; number < num_points; number++){
- *cPtr++ = pow(*aPtr++, *bPtr++);
+ *cPtr++ = pow(*aPtr++, *bPtr++);
}
-
+
}
#endif /* LV_HAVE_GENERIC */
@@ -191,107 +168,107 @@ static inline void volk_32f_x2_pow_32f_u_generic(float*
cVector, const float* bV
*/
static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float*
bVector, const float* aVector, unsigned int num_points){
- float* cPtr = cVector;
- const float* bPtr = bVector;
- const float* aPtr = aVector;
-
- unsigned int number = 0;
- const unsigned int quarterPoints = num_points / 4;
-
- __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
- __m128 tmp, fx, mask, pow2n, z, y;
- __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
- __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
- __m128i bias, exp, emm0, pi32_0x7f;
-
- one = _mm_set1_ps(1.0);
- exp_hi = _mm_set1_ps(88.3762626647949);
- exp_lo = _mm_set1_ps(-88.3762626647949);
- ln2 = _mm_set1_ps(0.6931471805);
- log2EF = _mm_set1_ps(1.44269504088896341);
- half = _mm_set1_ps(0.5);
- exp_C1 = _mm_set1_ps(0.693359375);
- exp_C2 = _mm_set1_ps(-2.12194440e-4);
- pi32_0x7f = _mm_set1_epi32(0x7f);
-
- exp_p0 = _mm_set1_ps(1.9875691500e-4);
- exp_p1 = _mm_set1_ps(1.3981999507e-3);
- exp_p2 = _mm_set1_ps(8.3334519073e-3);
- exp_p3 = _mm_set1_ps(4.1665795894e-2);
- exp_p4 = _mm_set1_ps(1.6666665459e-1);
- exp_p5 = _mm_set1_ps(5.0000001201e-1);
-
- for(;number < quarterPoints; number++){
-
- // First compute the logarithm
- aVal = _mm_loadu_ps(aPtr);
- bias = _mm_set1_epi32(127);
- leadingOne = _mm_set1_ps(1.0f);
- exp =
_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal),
_mm_set1_epi32(0x7f800000)), 23), bias);
- logarithm = _mm_cvtepi32_ps(exp);
-
- frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal,
_mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
-
- #if LOG_POLY_DEGREE == 6
- mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f,
-1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
- #elif LOG_POLY_DEGREE == 5
- mantissa = POLY4( frac, 2.8882704548164776201f,
-2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f,
0.0596515482674574969533f);
- #elif LOG_POLY_DEGREE == 4
- mantissa = POLY3( frac, 2.61761038894603480148f,
-1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
- #elif LOG_POLY_DEGREE == 3
- mantissa = POLY2( frac, 2.28330284476918490682f,
-1.04913055217340124191f, 0.204446009836232697516f);
- #else
- #error
- #endif
-
- logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac,
leadingOne)));
- logarithm = _mm_mul_ps(logarithm, ln2);
-
-
- // Now calculate b*lna
- bVal = _mm_loadu_ps(bPtr);
- bVal = _mm_mul_ps(bVal, logarithm);
-
- // Now compute exp(b*lna)
- tmp = _mm_setzero_ps();
-
- bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
-
- fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
-
- emm0 = _mm_cvttps_epi32(fx);
- tmp = _mm_cvtepi32_ps(emm0);
-
- mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
- fx = _mm_sub_ps(tmp, mask);
-
- tmp = _mm_mul_ps(fx, exp_C1);
- z = _mm_mul_ps(fx, exp_C2);
- bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
- z = _mm_mul_ps(bVal, bVal);
-
- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
- y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
- y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
- y = _mm_add_ps(y, one);
-
- emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f),
23);
-
- pow2n = _mm_castsi128_ps(emm0);
- cVal = _mm_mul_ps(y, pow2n);
-
- _mm_storeu_ps(cPtr, cVal);
-
- aPtr += 4;
- bPtr += 4;
- cPtr += 4;
- }
-
- number = quarterPoints * 4;
- for(;number < num_points; number++){
- *cPtr++ = pow(*aPtr++, *bPtr++);
- }
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+ __m128 tmp, fx, mask, pow2n, z, y;
+ __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+ __m128i bias, exp, emm0, pi32_0x7f;
+
+ one = _mm_set1_ps(1.0);
+ exp_hi = _mm_set1_ps(88.3762626647949);
+ exp_lo = _mm_set1_ps(-88.3762626647949);
+ ln2 = _mm_set1_ps(0.6931471805);
+ log2EF = _mm_set1_ps(1.44269504088896341);
+ half = _mm_set1_ps(0.5);
+ exp_C1 = _mm_set1_ps(0.693359375);
+ exp_C2 = _mm_set1_ps(-2.12194440e-4);
+ pi32_0x7f = _mm_set1_epi32(0x7f);
+
+ exp_p0 = _mm_set1_ps(1.9875691500e-4);
+ exp_p1 = _mm_set1_ps(1.3981999507e-3);
+ exp_p2 = _mm_set1_ps(8.3334519073e-3);
+ exp_p3 = _mm_set1_ps(4.1665795894e-2);
+ exp_p4 = _mm_set1_ps(1.6666665459e-1);
+ exp_p5 = _mm_set1_ps(5.0000001201e-1);
+
+ for(;number < quarterPoints; number++){
+
+ // First compute the logarithm
+ aVal = _mm_loadu_ps(aPtr);
+ bias = _mm_set1_epi32(127);
+ leadingOne = _mm_set1_ps(1.0f);
+ exp = _mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal),
_mm_set1_epi32(0x7f800000)), 23), bias);
+ logarithm = _mm_cvtepi32_ps(exp);
+
+ frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal,
_mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+
+ #if LOG_POLY_DEGREE == 6
+ mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f,
-1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+ #elif LOG_POLY_DEGREE == 5
+ mantissa = POLY4( frac, 2.8882704548164776201f,
-2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f,
0.0596515482674574969533f);
+ #elif LOG_POLY_DEGREE == 4
+ mantissa = POLY3( frac, 2.61761038894603480148f,
-1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+ #elif LOG_POLY_DEGREE == 3
+ mantissa = POLY2( frac, 2.28330284476918490682f,
-1.04913055217340124191f, 0.204446009836232697516f);
+ #else
+ #error
+ #endif
+
+ logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac,
leadingOne)));
+ logarithm = _mm_mul_ps(logarithm, ln2);
+
+
+ // Now calculate b*lna
+ bVal = _mm_loadu_ps(bPtr);
+ bVal = _mm_mul_ps(bVal, logarithm);
+
+ // Now compute exp(b*lna)
+ tmp = _mm_setzero_ps();
+
+ bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
+
+ fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
+
+ emm0 = _mm_cvttps_epi32(fx);
+ tmp = _mm_cvtepi32_ps(emm0);
+
+ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
+ fx = _mm_sub_ps(tmp, mask);
+
+ tmp = _mm_mul_ps(fx, exp_C1);
+ z = _mm_mul_ps(fx, exp_C2);
+ bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
+ z = _mm_mul_ps(bVal, bVal);
+
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
+ y = _mm_add_ps(y, one);
+
+ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
+
+ pow2n = _mm_castsi128_ps(emm0);
+ cVal = _mm_mul_ps(y, pow2n);
+
+ _mm_storeu_ps(cPtr, cVal);
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = pow(*aPtr++, *bPtr++);
+ }
}
#endif /* LV_HAVE_SSE4_1 for unaligned */
diff --git a/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
index 3ae6f59..337ef18 100644
--- a/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
+++ b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
@@ -29,7 +29,6 @@ static inline void
volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, const
yh = _mm256_set1_ps(lv_cimag(scalar));
for(;number < quarterPoints; number++){
-
x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as
ar,ai,br,bi
tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -47,8 +46,8 @@ static inline void
volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector, const
}
for(i = num_points-isodd; i < num_points; i++) {
- *c++ = (*a++) * scalar;
- }
+ *c++ = (*a++) * scalar;
+ }
}
#endif /* LV_HAVE_AVX */
@@ -163,7 +162,6 @@ static inline void
volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, const
yh = _mm256_set1_ps(lv_cimag(scalar));
for(;number < quarterPoints; number++){
-
x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as
ar,ai,br,bi
tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
@@ -181,8 +179,8 @@ static inline void
volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector, const
}
for(i = num_points-isodd; i < num_points; i++) {
- *c++ = (*a++) * scalar;
- }
+ *c++ = (*a++) * scalar;
+ }
}
#endif /* LV_HAVE_AVX */
diff --git a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
index 7e8c91a..69eec0d 100644
--- a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
+++ b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
@@ -338,7 +338,6 @@ static inline void
volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, const lv_
dotProdVal = _mm256_setzero_ps();
for(;number < quarterPoints; number++){
-
x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
diff --git a/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h
b/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h
index 37f9d74..ee43518 100644
--- a/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h
+++ b/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h
@@ -19,25 +19,25 @@ static inline void
volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16
const int8_t* complexVectorPtr = (int8_t*)complexVector;
int16_t* iBufferPtr = iBuffer;
int16_t* qBufferPtr = qBuffer;
- __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values
+ __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values
__m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 15, 13, 11, 9, 7, 5, 3, 1);
__m128i complexVal, iOutputVal, qOutputVal;
unsigned int eighthPoints = num_points / 8;
for(number = 0; number < eighthPoints; number++){
- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr
+= 16; // aligned load
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr
+= 16; // aligned load
- iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask);
// shuffle 16 bytes of 128bit complexVal
+ iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask); // shuffle 16
bytes of 128bit complexVal
qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
- iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte
sign extended versions of lower 8 bytes of input to output
- iOutputVal = _mm_slli_epi16(iOutputVal, 8); // shift in
left by 8 bits, each of the 8 16-bit integers, shift in with zeros
+ iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign
extended versions of lower 8 bytes of input to output
+ iOutputVal = _mm_slli_epi16(iOutputVal, 8); // shift in left by 8
bits, each of the 8 16-bit integers, shift in with zeros
qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
qOutputVal = _mm_slli_epi16(qOutputVal, 8);
- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store
_mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
iBufferPtr += 8;
@@ -46,7 +46,7 @@ static inline void
volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16
number = eighthPoints * 8;
for(; number < num_points; number++){
- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit
Complexvector into 16 bit, shift left by 8 bits and store
+ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit
Complexvector into 16 bit, shift left by 8 bits and store
*qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
}
}
@@ -66,7 +66,7 @@ static inline void
volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, int16_t*
const int8_t* complexVectorPtr = (int8_t*)complexVector;
int16_t* iBufferPtr = iBuffer;
int16_t* qBufferPtr = qBuffer;
- __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values
+ __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values
__m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 15, 13, 11, 9, 7, 5, 3, 1);
__m256i complexVal, iOutputVal, qOutputVal;
__m128i complexVal1, complexVal0;
@@ -75,20 +75,20 @@ static inline void
volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, int16_t*
unsigned int sixteenthPoints = num_points / 16;
for(number = 0; number < sixteenthPoints; number++){
- complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
complexVectorPtr += 32; // aligned load
+ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
complexVectorPtr += 32; // aligned load
// Extract from complexVal to iOutputVal and qOutputVal
- complexVal1 = _mm256_extractf128_si256(complexVal, 1);
- complexVal0 = _mm256_extractf128_si256(complexVal, 0);
+ complexVal1 = _mm256_extractf128_si256(complexVal, 1);
+ complexVal0 = _mm256_extractf128_si256(complexVal, 0);
- iOutputVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask);
// shuffle 16 bytes of 128bit complexVal
- iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);
+ iOutputVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask); // shuffle 16
bytes of 128bit complexVal
+ iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);
qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask);
qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask);
- iOutputVal1 = _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte
sign extended versions of lower 8 bytes of input to output
- iOutputVal1 = _mm_slli_epi16(iOutputVal1, 8); // shift in
left by 8 bits, each of the 8 16-bit integers, shift in with zeros
- iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);
+ iOutputVal1 = _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign
extended versions of lower 8 bytes of input to output
+ iOutputVal1 = _mm_slli_epi16(iOutputVal1, 8); // shift in left by 8
bits, each of the 8 16-bit integers, shift in with zeros
+ iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);
iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8);
qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1);
@@ -98,12 +98,12 @@ static inline void
volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, int16_t*
// Pack iOutputVal0,1 to iOutputVal
__m256i dummy = _mm256_setzero_si256();
- iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);
- iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);
- qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);
- qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);
+ iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);
+ iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);
+ qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);
+ qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);
- _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store
+ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store
_mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
iBufferPtr += 16;
@@ -112,7 +112,7 @@ static inline void
volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer, int16_t*
number = sixteenthPoints * 16;
for(; number < num_points; number++){
- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit
Complexvector into 16 bit, shift left by 8 bits and store
+ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit
Complexvector into 16 bit, shift left by 8 bits and store
*qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
}
}
- [Commit-gnuradio] [gnuradio] 06/14: volk: expfast comments, (continued)
- [Commit-gnuradio] [gnuradio] 06/14: volk: expfast comments, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 08/14: volk: Added sin, cos kernels., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 12/14: volk: fixed some warnings, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 13/14: volk: fixed a problem with acos during some translation in the git history., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 09/14: volk: Added tan kernel., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 07/14: volk: added power kernel., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 10/14: volk: Added atan, asin, acos kernels., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 01/14: added new proto-kernels, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 05/14: volk: Added avx proto-kernel for fast exp., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 02/14: volk: Added proto-kernels for convert, multiply, conjugate, deinterleave, magnitude, mag-square, psd functions., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 11/14: volk (gsoc): whitespace,
git <=
- [Commit-gnuradio] [gnuradio] 14/14: volk: adding copyright notice to all volk kernels., git, 2014/10/15