[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Commit-gnuradio] [gnuradio] 02/14: volk: Added proto-kernels for conver
From: |
git |
Subject: |
[Commit-gnuradio] [gnuradio] 02/14: volk: Added proto-kernels for convert, multiply, conjugate, deinterleave, magnitude, mag-square, psd functions. |
Date: |
Wed, 15 Oct 2014 23:25:08 +0000 (UTC) |
This is an automated email from the git hooks/post-receive script.
trondeau pushed a commit to branch master
in repository gnuradio.
commit 655b2ca7d0ce014ecf5a2f4c8184c09c0fc5a946
Author: Abhishek Bhowmick <address@hidden>
Date: Tue Jun 3 16:15:13 2014 +0530
volk: Added proto-kernels for convert, multiply, conjugate, deinterleave,
magnitude, mag-square, psd functions.
---
volk/kernels/volk/volk_16i_s32f_convert_32f.h | 114 +++++++++++++++++++++
volk/kernels/volk/volk_32f_s32f_convert_16i.h | 114 +++++++++++++++++++++
volk/kernels/volk/volk_32fc_conjugate_32fc.h | 76 ++++++++++++++
volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h | 51 +++++++++
volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h | 102 ++++++++++++++++++
.../kernels/volk/volk_32fc_deinterleave_imag_32f.h | 44 ++++++++
volk/kernels/volk/volk_32fc_magnitude_32f.h | 92 +++++++++++++++++
.../kernels/volk/volk_32fc_magnitude_squared_32f.h | 88 ++++++++++++++++
.../volk_32fc_s32f_x2_power_spectral_density_32f.h | 92 +++++++++++++++++
volk/kernels/volk/volk_32fc_x2_multiply_32fc.h | 98 ++++++++++++++++++
.../volk/volk_32fc_x2_multiply_conjugate_32fc.h | 106 +++++++++++++++++++
11 files changed, 977 insertions(+)
diff --git a/volk/kernels/volk/volk_16i_s32f_convert_32f.h
b/volk/kernels/volk/volk_16i_s32f_convert_32f.h
index a810a60..e989084 100644
--- a/volk/kernels/volk/volk_16i_s32f_convert_32f.h
+++ b/volk/kernels/volk/volk_16i_s32f_convert_32f.h
@@ -4,6 +4,63 @@
#include <inttypes.h>
#include <stdio.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+ /*!
+ \brief Converts the input 16 bit integer data into floating point data,
and divides the each floating point output data point by the scalar value
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value divided against each point in the output buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_16i_s32f_convert_32f_u_avx(float* outputVector, const
int16_t* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal, inputVal2;
+ __m128 ret;
+ __m256 output;
+ __m256 dummy = _mm256_setzero_ps();
+
+ for(;number < eighthPoints; number++){
+
+ // Load the 8 values
+ //inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+ inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+
+ // Shift the input data to the right by 64 bits ( 8 bytes )
+ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+ // Convert the lower 4 values into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ output = _mm256_insertf128_ps(dummy, ret, 0);
+
+ ret = _mm_cvtepi32_ps(inputVal2);
+ ret = _mm_mul_ps(ret, invScalar);
+ output = _mm256_insertf128_ps(output, ret, 1);
+
+ _mm256_storeu_ps(outputVectorPtr, output);
+
+ outputVectorPtr += 8;
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ outputVector[number] =((float)(inputVector[number])) / scalar;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
@@ -126,6 +183,63 @@ static inline void
volk_16i_s32f_convert_32f_generic(float* outputVector, const
#include <inttypes.h>
#include <stdio.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+ /*!
+ \brief Converts the input 16 bit integer data into floating point data,
and divides the each floating point output data point by the scalar value
+ \param inputVector The 16 bit input data buffer
+ \param outputVector The floating point output data buffer
+ \param scalar The value divided against each point in the output buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_16i_s32f_convert_32f_a_avx(float* outputVector, const
int16_t* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ float* outputVectorPtr = outputVector;
+ __m128 invScalar = _mm_set_ps1(1.0/scalar);
+ int16_t* inputPtr = (int16_t*)inputVector;
+ __m128i inputVal, inputVal2;
+ __m128 ret;
+ __m256 output;
+ __m256 dummy = _mm256_setzero_ps();
+
+ for(;number < eighthPoints; number++){
+
+ // Load the 8 values
+ //inputVal = _mm_loadu_si128((__m128i*)inputPtr);
+ inputVal = _mm_load_si128((__m128i*)inputPtr);
+
+ // Shift the input data to the right by 64 bits ( 8 bytes )
+ inputVal2 = _mm_srli_si128(inputVal, 8);
+
+ // Convert the lower 4 values into 32 bit words
+ inputVal = _mm_cvtepi16_epi32(inputVal);
+ inputVal2 = _mm_cvtepi16_epi32(inputVal2);
+
+ ret = _mm_cvtepi32_ps(inputVal);
+ ret = _mm_mul_ps(ret, invScalar);
+ output = _mm256_insertf128_ps(dummy, ret, 0);
+
+ ret = _mm_cvtepi32_ps(inputVal2);
+ ret = _mm_mul_ps(ret, invScalar);
+ output = _mm256_insertf128_ps(output, ret, 1);
+
+ _mm256_store_ps(outputVectorPtr, output);
+
+ outputVectorPtr += 8;
+
+ inputPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ outputVector[number] =((float)(inputVector[number])) / scalar;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
diff --git a/volk/kernels/volk/volk_32f_s32f_convert_16i.h
b/volk/kernels/volk/volk_32f_s32f_convert_16i.h
index 9fd7586..7a53da6 100644
--- a/volk/kernels/volk/volk_32f_s32f_convert_16i.h
+++ b/volk/kernels/volk/volk_32f_s32f_convert_16i.h
@@ -5,6 +5,63 @@
#include <stdio.h>
#include <math.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then
converts the result into a 16 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 16 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector,
const float* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = -32768;
+ float max_val = 32767;
+ float r;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal, ret;
+ __m256i intInputVal;
+ __m128i intInputVal1, intInputVal2;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
+
+ for(;number < eighthPoints; number++){
+ inputVal = _mm256_loadu_ps(inputVectorPtr); inputVectorPtr += 8;
+
+ // Scale and clip
+ ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar),
vmax_val), vmin_val);
+
+ intInputVal = _mm256_cvtps_epi32(ret);
+
+ intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
+ intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+ _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!
@@ -158,6 +215,63 @@ static inline void
volk_32f_s32f_convert_16i_generic(int16_t* outputVector, cons
#include <stdio.h>
#include <math.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+ /*!
+ \brief Multiplies each point in the input buffer by the scalar value, then
converts the result into a 16 bit integer value
+ \param inputVector The floating point input data buffer
+ \param outputVector The 16 bit output data buffer
+ \param scalar The value multiplied against each point in the input buffer
+ \param num_points The number of data values to be converted
+ */
+static inline void volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector,
const float* inputVector, const float scalar, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ int16_t* outputVectorPtr = outputVector;
+
+ float min_val = -32768;
+ float max_val = 32767;
+ float r;
+
+ __m256 vScalar = _mm256_set1_ps(scalar);
+ __m256 inputVal, ret;
+ __m256i intInputVal;
+ __m128i intInputVal1, intInputVal2;
+ __m256 vmin_val = _mm256_set1_ps(min_val);
+ __m256 vmax_val = _mm256_set1_ps(max_val);
+
+ for(;number < eighthPoints; number++){
+ inputVal = _mm256_load_ps(inputVectorPtr); inputVectorPtr += 8;
+
+ // Scale and clip
+ ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar),
vmax_val), vmin_val);
+
+ intInputVal = _mm256_cvtps_epi32(ret);
+
+ intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
+ intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
+
+ intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
+
+ _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
+ outputVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ r = inputVector[number] * scalar;
+ if(r > max_val)
+ r = max_val;
+ else if(r < min_val)
+ r = min_val;
+ outputVector[number] = (int16_t)rintf(r);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!
diff --git a/volk/kernels/volk/volk_32fc_conjugate_32fc.h
b/volk/kernels/volk/volk_32fc_conjugate_32fc.h
index 0f74b01..80159d4 100644
--- a/volk/kernels/volk/volk_32fc_conjugate_32fc.h
+++ b/volk/kernels/volk/volk_32fc_conjugate_32fc.h
@@ -6,6 +6,44 @@
#include <volk/volk_complex.h>
#include <float.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+ /*!
+ \brief Takes the conjugate of a complex vector.
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of complex values in aVector to be conjugated
and stored into cVector
+ */
+static inline void volk_32fc_conjugate_32fc_u_avx(lv_32fc_t* cVector, const
lv_32fc_t* aVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+
+ for(;number < quarterPoints; number++){
+
+ x = _mm256_loadu_ps((float*)a); // Load the complex data as ar,ai,br,bi
+
+ x = _mm256_xor_ps(x, conjugator); // conjugate register
+
+ _mm256_storeu_ps((float*)c,x); // Store the results back into the C
container
+
+ a += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for(;number < num_points; number++) {
+ *c++ = lv_conj(*a++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
@@ -70,6 +108,44 @@ static inline void
volk_32fc_conjugate_32fc_generic(lv_32fc_t* cVector, const lv
#include <volk/volk_complex.h>
#include <float.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+ /*!
+ \brief Takes the conjugate of a complex vector.
+ \param cVector The vector where the results will be stored
+ \param aVector Vector to be conjugated
+ \param num_points The number of complex values in aVector to be conjugated
and stored into cVector
+ */
+static inline void volk_32fc_conjugate_32fc_a_avx(lv_32fc_t* cVector, const
lv_32fc_t* aVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+
+ for(;number < quarterPoints; number++){
+
+ x = _mm256_load_ps((float*)a); // Load the complex data as ar,ai,br,bi
+
+ x = _mm256_xor_ps(x, conjugator); // conjugate register
+
+ _mm256_store_ps((float*)c,x); // Store the results back into the C
container
+
+ a += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for(;number < num_points; number++) {
+ *c++ = lv_conj(*a++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
diff --git a/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h
b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h
index 0d33ed7..aad9fbe 100644
--- a/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h
+++ b/volk/kernels/volk/volk_32fc_deinterleave_32f_x2.h
@@ -4,6 +4,57 @@
#include <inttypes.h>
#include <stdio.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+ \brief Deinterleaves the complex vector into I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_32f_x2_a_avx(float* iBuffer, float*
qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+ const float* complexVectorPtr = (float*)complexVector;
+ float* iBufferPtr = iBuffer;
+ float* qBufferPtr = qBuffer;
+
+ unsigned int number = 0;
+ // Mask for real and imaginary parts
+ int realMask = 0x88;
+ int imagMask = 0xdd;
+ const unsigned int eighthPoints = num_points / 8;
+ __m256 cplxValue1, cplxValue2, complex1, complex2, iValue, qValue;
+ for(;number < eighthPoints; number++){
+
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+ // Arrange in i1i2i3i4 format
+ iValue = _mm256_shuffle_ps(complex1, complex2, realMask);
+ // Arrange in q1q2q3q4 format
+ qValue = _mm256_shuffle_ps(complex1, complex2, imagMask);
+
+ _mm256_store_ps(iBufferPtr, iValue);
+ _mm256_store_ps(qBufferPtr, qValue);
+
+ iBufferPtr += 8;
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
diff --git a/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h
b/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h
index 4a4c550..b0c93ff 100644
--- a/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h
+++ b/volk/kernels/volk/volk_32fc_deinterleave_64f_x2.h
@@ -4,6 +4,57 @@
#include <inttypes.h>
#include <stdio.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+ \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_x2_u_avx(double* iBuffer,
double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ const unsigned int quarterPoints = num_points / 4;
+ __m256 cplxValue;
+ __m128 complexH, complexL, fVal;
+ __m256d dVal;
+
+ for(;number < quarterPoints; number++){
+
+ cplxValue = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ complexH = _mm256_extractf128_ps(cplxValue, 1);
+ complexL = _mm256_extractf128_ps(cplxValue, 0);
+
+ // Arrange in i1i2i1i2 format
+ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2,0,2,0));
+ dVal = _mm256_cvtps_pd(fVal);
+ _mm256_storeu_pd(iBufferPtr, dVal);
+
+ // Arrange in q1q2q1q2 format
+ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3,1,3,1));
+ dVal = _mm256_cvtps_pd(fVal);
+ _mm256_storeu_pd(qBufferPtr, dVal);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!
@@ -82,6 +133,57 @@ static inline void
volk_32fc_deinterleave_64f_x2_generic(double* iBuffer, double
#include <inttypes.h>
#include <stdio.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+ \brief Deinterleaves the lv_32fc_t vector into double I & Q vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_64f_x2_a_avx(double* iBuffer,
double* qBuffer, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ double* iBufferPtr = iBuffer;
+ double* qBufferPtr = qBuffer;
+
+ const unsigned int quarterPoints = num_points / 4;
+ __m256 cplxValue;
+ __m128 complexH, complexL, fVal;
+ __m256d dVal;
+
+ for(;number < quarterPoints; number++){
+
+ cplxValue = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ complexH = _mm256_extractf128_ps(cplxValue, 1);
+ complexL = _mm256_extractf128_ps(cplxValue, 0);
+
+ // Arrange in i1i2i1i2 format
+ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(2,0,2,0));
+ dVal = _mm256_cvtps_pd(fVal);
+ _mm256_store_pd(iBufferPtr, dVal);
+
+ // Arrange in q1q2q1q2 format
+ fVal = _mm_shuffle_ps(complexL, complexH, _MM_SHUFFLE(3,1,3,1));
+ dVal = _mm256_cvtps_pd(fVal);
+ _mm256_store_pd(qBufferPtr, dVal);
+
+ iBufferPtr += 4;
+ qBufferPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!
diff --git a/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h
b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h
index b196829..9b5c57b 100644
--- a/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h
+++ b/volk/kernels/volk/volk_32fc_deinterleave_imag_32f.h
@@ -4,6 +4,50 @@
#include <inttypes.h>
#include <stdio.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+ \brief Deinterleaves the complex vector into Q vector data
+ \param complexVector The complex input vector
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_32fc_deinterleave_imag_32f_a_avx(float* qBuffer, const
lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+ int imagMask = 0xdd;
+ const float* complexVectorPtr = (const float*)complexVector;
+ float* qBufferPtr = qBuffer;
+
+ __m256 cplxValue1, cplxValue2, complex1, complex2, qValue;
+ for(;number < eighthPoints; number++){
+
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+ // Arrange in q1q2q3q4 format
+ qValue = _mm256_shuffle_ps(complex1, complex2, imagMask);
+ //iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3,1,3,1));
+
+ _mm256_store_ps(qBufferPtr, qValue);
+
+ qBufferPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ complexVectorPtr++;
+ *qBufferPtr++ = *complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
/*!
diff --git a/volk/kernels/volk/volk_32fc_magnitude_32f.h
b/volk/kernels/volk/volk_32fc_magnitude_32f.h
index b6da7f3..f611091 100644
--- a/volk/kernels/volk/volk_32fc_magnitude_32f.h
+++ b/volk/kernels/volk/volk_32fc_magnitude_32f.h
@@ -5,6 +5,52 @@
#include <stdio.h>
#include <math.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+ /*!
+ \brief Calculates the magnitude of the complexVector and stores the
results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be
calculated and stored into cVector
+ */
+static inline void volk_32fc_magnitude_32f_u_avx(float* magnitudeVector, const
lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 cplxValue1, cplxValue2, complex1, complex2, result;
+ for(;number < eighthPoints; number++){
+ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+ result = _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
+
+ result = _mm256_sqrt_ps(result);
+
+ _mm256_storeu_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag *
val1Imag));
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
@@ -123,6 +169,52 @@ static inline void volk_32fc_magnitude_32f_generic(float*
magnitudeVector, const
#include <stdio.h>
#include <math.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+ /*!
+ \brief Calculates the magnitude of the complexVector and stores the
results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be
calculated and stored into cVector
+ */
+static inline void volk_32fc_magnitude_32f_a_avx(float* magnitudeVector, const
lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 cplxValue1, cplxValue2, complex1, complex2, result;
+ for(;number < eighthPoints; number++){
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+ result = _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
+
+ result = _mm256_sqrt_ps(result);
+
+ _mm256_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = sqrtf((val1Real * val1Real) + (val1Imag *
val1Imag));
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
diff --git a/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h
b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h
index 878794b..be85bda 100644
--- a/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h
+++ b/volk/kernels/volk/volk_32fc_magnitude_squared_32f.h
@@ -5,6 +5,50 @@
#include <stdio.h>
#include <math.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+ /*!
+ \brief Calculates the magnitude squared of the complexVector and stores
the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be
calculated and stored into cVector
+ */
+static inline void volk_32fc_magnitude_squared_32f_u_avx(float*
magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 cplxValue1, cplxValue2, complex1, complex2, result;
+ for(;number < eighthPoints; number++){
+ cplxValue1 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_loadu_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+ result = _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
+
+ _mm256_storeu_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
@@ -119,6 +163,50 @@ static inline void
volk_32fc_magnitude_squared_32f_generic(float* magnitudeVecto
#include <stdio.h>
#include <math.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+ /*!
+ \brief Calculates the magnitude squared of the complexVector and stores
the results in the magnitudeVector
+ \param complexVector The vector containing the complex input values
+ \param magnitudeVector The vector containing the real output values
+ \param num_points The number of complex values in complexVector to be
calculated and stored into cVector
+ */
+static inline void volk_32fc_magnitude_squared_32f_a_avx(float*
magnitudeVector, const lv_32fc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int eighthPoints = num_points / 8;
+
+ const float* complexVectorPtr = (float*)complexVector;
+ float* magnitudeVectorPtr = magnitudeVector;
+
+ __m256 cplxValue1, cplxValue2, complex1, complex2, result;
+ for(;number < eighthPoints; number++){
+ cplxValue1 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue2 = _mm256_load_ps(complexVectorPtr);
+ complexVectorPtr += 8;
+
+ cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
+ cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
+
+ complex1 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x20);
+ complex2 = _mm256_permute2f128_ps(cplxValue1, cplxValue2, 0x31);
+
+ result = _mm256_hadd_ps(complex1, complex2); // Add the I2 and Q2 values
+
+ _mm256_store_ps(magnitudeVectorPtr, result);
+ magnitudeVectorPtr += 8;
+ }
+
+ number = eighthPoints * 8;
+ for(; number < num_points; number++){
+ float val1Real = *complexVectorPtr++;
+ float val1Imag = *complexVectorPtr++;
+ *magnitudeVectorPtr++ = (val1Real * val1Real) + (val1Imag * val1Imag);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
diff --git a/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h
b/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h
index e73eb09..9ec92e2 100644
--- a/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h
+++ b/volk/kernels/volk/volk_32fc_s32f_x2_power_spectral_density_32f.h
@@ -5,6 +5,98 @@
#include <stdio.h>
#include <math.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+#include <simdmath.h>
+#endif /* LV_HAVE_LIB_SIMDMATH */
+
+/*!
+ \brief Calculates the log10 power value divided by the RBW for each input
point
+ \param logPowerOutput The 10.0 * log10((r*r + i*i)/RBW) for each data point
+ \param complexFFTInput The complex data output from the FFT point
+ \param normalizationFactor This value is divided against all the input
values before the power is calculated
+ \param rbw The resolution bandwith of the fft spectrum
+ \param num_points The number of fft data points
+*/
+static inline void volk_32fc_s32f_x2_power_spectral_density_32f_a_avx(float*
logPowerOutput, const lv_32fc_t* complexFFTInput, const float
normalizationFactor, const float rbw, unsigned int num_points){
+ const float* inputPtr = (const float*)complexFFTInput;
+ float* destPtr = logPowerOutput;
+ uint64_t number = 0;
+ const float iRBW = 1.0 / rbw;
+ const float iNormalizationFactor = 1.0 / normalizationFactor;
+
+#ifdef LV_HAVE_LIB_SIMDMATH
+ __m256 magScalar = _mm256_set1_ps(10.0);
+ magScalar = _mm256_div_ps(magScalar, logf4(magScalar));
+
+ __m256 invRBW = _mm256_set1_ps(iRBW);
+
+ __m256 invNormalizationFactor = _mm256_set1_ps(iNormalizationFactor);
+
+ __m256 power;
+ __m256 input1, input2;
+ const uint64_t eighthPoints = num_points / 8;
+ for(;number < eighthPoints; number++){
+ // Load the complex values
+ input1 =_mm256_load_ps(inputPtr);
+ inputPtr += 8;
+ input2 =_mm256_load_ps(inputPtr);
+ inputPtr += 8;
+
+ // Apply the normalization factor
+ input1 = _mm256_mul_ps(input1, invNormalizationFactor);
+ input2 = _mm256_mul_ps(input2, invNormalizationFactor);
+
+ // Multiply each value by itself
+ // (r1*r1), (i1*i1), (r2*r2), (i2*i2)
+ input1 = _mm256_mul_ps(input1, input1);
+ // (r3*r3), (i3*i3), (r4*r4), (i4*i4)
+ input2 = _mm256_mul_ps(input2, input2);
+
+ // Horizontal add, to add (r*r) + (i*i) for each complex value
+ // (r1*r1)+(i1*i1), (r2*r2) + (i2*i2), (r3*r3)+(i3*i3), (r4*r4)+(i4*i4)
+ inputVal1 = _mm256_permute2f128_ps(input1, input2, 0x20);
+ inputVal2 = _mm256_permute2f128_ps(input1, input2, 0x31);
+
+ power = _mm256_hadd_ps(inputVal1, inputVal2);
+
+ // Divide by the rbw
+ power = _mm256_mul_ps(power, invRBW);
+
+ // Calculate the natural log power
+ power = logf4(power);
+
+ // Convert to log10 and multiply by 10.0
+ power = _mm256_mul_ps(power, magScalar);
+
+ // Store the floating point results
+ _mm256_store_ps(destPtr, power);
+
+ destPtr += 8;
+ }
+
+ number = eighthPoints*8;
+#endif /* LV_HAVE_LIB_SIMDMATH */
+ // Calculate the FFT for any remaining points
+ for(; number < num_points; number++){
+ // Calculate dBm
+ // 50 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 50.0 * .001)) = 10 * log10( v^2 * 10)
+ // 75 ohm load assumption
+ // 10 * log10 (v^2 / (2 * 75.0 * .001)) = 10 * log10( v^2 * 15)
+
+ const float real = *inputPtr++ * iNormalizationFactor;
+ const float imag = *inputPtr++ * iNormalizationFactor;
+
+ *destPtr = 10.0*log10f((((real * real) + (imag * imag)) + 1e-20) * iRBW);
+ destPtr++;
+ }
+
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
diff --git a/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
index 7c723bc..cf08e91 100644
--- a/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
+++ b/volk/kernels/volk/volk_32fc_x2_multiply_32fc.h
@@ -6,6 +6,55 @@
#include <volk/volk_complex.h>
#include <float.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results
in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to
be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_32fc_u_avx(lv_32fc_t* cVector, const
lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for(;number < quarterPoints; number++){
+
+ x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi ... as
ar,ai,br,bi ...
+ y = _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di ... as
cr,ci,dr,di ...
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
+
+ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+
+ x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ...
+
+ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci,
br*dr-bi*di, bi*dr+br*di
+
+ _mm256_storeu_ps((float*)c,z); // Store the results back into the C
container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for(; number < num_points; number++) {
+ *c++ = (*a++) * (*b++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
@@ -83,6 +132,55 @@ static inline void
volk_32fc_x2_multiply_32fc_generic(lv_32fc_t* cVector, const
#include <volk/volk_complex.h>
#include <float.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results
in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to
be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_32fc_a_avx(lv_32fc_t* cVector, const
lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ for(;number < quarterPoints; number++){
+
+ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as
ar,ai,br,bi ...
+ y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as
cr,ci,dr,di ...
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
+
+ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+
+ x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ...
+
+ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci,
br*dr-bi*di, bi*dr+br*di
+
+ _mm256_store_ps((float*)c,z); // Store the results back into the C
container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for(; number < num_points; number++) {
+ *c++ = (*a++) * (*b++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
diff --git a/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
index dbc123f..82070e0 100644
--- a/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
+++ b/volk/kernels/volk/volk_32fc_x2_multiply_conjugate_32fc.h
@@ -6,6 +6,59 @@
#include <volk/volk_complex.h>
#include <float.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+ /*!
+ \brief Multiplies vector a by the conjugate of vector b and stores the
results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector First vector to be multiplied
+ \param bVector Second vector that is conjugated before being multiplied
+ \param num_points The number of complex values in aVector and bVector to
be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_u_avx(lv_32fc_t*
cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int
num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+
+ for(;number < quarterPoints; number++){
+
+ x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi ... as
ar,ai,br,bi ...
+ y = _mm256_loadu_ps((float*)b); // Load the cr + ci, dr + di ... as
cr,ci,dr,di ...
+
+ y = _mm256_xor_ps(y, conjugator); // conjugate y
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
+
+ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+
+ x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ...
+
+ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci,
br*dr-bi*di, bi*dr+br*di ...
+
+ _mm256_storeu_ps((float*)c,z); // Store the results back into the C
container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for(; number < num_points; number++) {
+ *c++ = (*a++) * lv_conj(*b++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
@@ -87,6 +140,59 @@ static inline void
volk_32fc_x2_multiply_conjugate_32fc_generic(lv_32fc_t* cVect
#include <volk/volk_complex.h>
#include <float.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+ /*!
+ \brief Multiplies vector a by the conjugate of vector b and stores the
results in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector First vector to be multiplied
+ \param bVector Second vector that is conjugated before being multiplied
+ \param num_points The number of complex values in aVector and bVector to
be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_x2_multiply_conjugate_32fc_a_avx(lv_32fc_t*
cVector, const lv_32fc_t* aVector, const lv_32fc_t* bVector, unsigned int
num_points){
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+ const lv_32fc_t* b = bVector;
+
+ __m256 conjugator = _mm256_setr_ps(0, -0.f, 0, -0.f, 0, -0.f, 0, -0.f);
+
+ for(;number < quarterPoints; number++){
+
+ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi ... as
ar,ai,br,bi ...
+ y = _mm256_load_ps((float*)b); // Load the cr + ci, dr + di ... as
cr,ci,dr,di ...
+
+ y = _mm256_xor_ps(y, conjugator); // conjugate y
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr ...
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di ...
+
+ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+
+ x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br ...
+
+ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci,
br*dr-bi*di, bi*dr+br*di ...
+
+ _mm256_store_ps((float*)c,z); // Store the results back into the C
container
+
+ a += 4;
+ b += 4;
+ c += 4;
+ }
+
+ number = quarterPoints * 4;
+
+ for(; number < num_points; number++) {
+ *c++ = (*a++) * lv_conj(*b++);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
- [Commit-gnuradio] [gnuradio] 04/14: volk: Added log2, (continued)
- [Commit-gnuradio] [gnuradio] 04/14: volk: Added log2, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 06/14: volk: expfast comments, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 08/14: volk: Added sin, cos kernels., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 12/14: volk: fixed some warnings, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 13/14: volk: fixed a problem with acos during some translation in the git history., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 09/14: volk: Added tan kernel., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 07/14: volk: added power kernel., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 10/14: volk: Added atan, asin, acos kernels., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 01/14: added new proto-kernels, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 05/14: volk: Added avx proto-kernel for fast exp., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 02/14: volk: Added proto-kernels for convert, multiply, conjugate, deinterleave, magnitude, mag-square, psd functions.,
git <=
- [Commit-gnuradio] [gnuradio] 11/14: volk (gsoc): whitespace, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 14/14: volk: adding copyright notice to all volk kernels., git, 2014/10/15