[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Commit-gnuradio] [gnuradio] 01/14: added new proto-kernels
From: |
git |
Subject: |
[Commit-gnuradio] [gnuradio] 01/14: added new proto-kernels |
Date: |
Wed, 15 Oct 2014 23:25:08 +0000 (UTC) |
This is an automated email from the git hooks/post-receive script.
trondeau pushed a commit to branch master
in repository gnuradio.
commit bb79c5b07b6c7156a355d5d055f9efa5e10be974
Author: Abhishek Bhowmick <address@hidden>
Date: Mon May 12 21:18:42 2014 +0530
added new proto-kernels
Conflicts:
volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
---
volk/kernels/volk/volk_32f_convert_64f.h | 72 +++++++++++++
volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h | 94 +++++++++++++++++
volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h | 116 +++++++++++++++++++++
volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h | 81 ++++++++++++--
volk/kernels/volk/volk_8ic_deinterleave_real_16i.h | 48 +++++++++
volk/kernels/volk/volk_8ic_deinterleave_real_8i.h | 58 +++++++++++
6 files changed, 462 insertions(+), 7 deletions(-)
diff --git a/volk/kernels/volk/volk_32f_convert_64f.h
b/volk/kernels/volk/volk_32f_convert_64f.h
index 2f03695..12c2f22 100644
--- a/volk/kernels/volk/volk_32f_convert_64f.h
+++ b/volk/kernels/volk/volk_32f_convert_64f.h
@@ -4,6 +4,42 @@
#include <inttypes.h>
#include <stdio.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+ /*!
+ \brief Converts the float values into double values
+ \param dVector The converted double vector values
+ \param fVector The float vector values to be converted
+ \param num_points The number of points in the two vectors to be converted
+ */
+
+static inline void volk_32f_convert_64f_u_avx(double* outputVector, const
float* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m256d ret;
+ __m128 inputVal;
+
+ for(;number < quarterPoints; number++){
+ inputVal = _mm_loadu_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ ret = _mm256_cvtps_pd(inputVal);
+ _mm256_storeu_pd(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (double)(inputVector[number]);
+ }
+}
+
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!
@@ -68,12 +104,48 @@ static inline void volk_32f_convert_64f_generic(double*
outputVector, const floa
#endif /* INCLUDED_volk_32f_convert_64f_u_H */
+
+
#ifndef INCLUDED_volk_32f_convert_64f_a_H
#define INCLUDED_volk_32f_convert_64f_a_H
#include <inttypes.h>
#include <stdio.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+ /*!
+ \brief Converts the float values into double values
+ \param dVector The converted double vector values
+ \param fVector The float vector values to be converted
+ \param num_points The number of points in the two vectors to be converted
+ */
+static inline void volk_32f_convert_64f_a_avx(double* outputVector, const
float* inputVector, unsigned int num_points){
+ unsigned int number = 0;
+
+ const unsigned int quarterPoints = num_points / 4;
+
+ const float* inputVectorPtr = (const float*)inputVector;
+ double* outputVectorPtr = outputVector;
+ __m256d ret;
+ __m128 inputVal;
+
+ for(;number < quarterPoints; number++){
+ inputVal = _mm_load_ps(inputVectorPtr); inputVectorPtr += 4;
+
+ ret = _mm256_cvtps_pd(inputVal);
+ _mm256_store_pd(outputVectorPtr, ret);
+
+ outputVectorPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(; number < num_points; number++){
+ outputVector[number] = (double)(inputVector[number]);
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
/*!
diff --git a/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
index 668a047..3ae6f59 100644
--- a/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
+++ b/volk/kernels/volk/volk_32fc_s32fc_multiply_32fc.h
@@ -6,6 +6,53 @@
#include <volk/volk_complex.h>
#include <float.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results
in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to
be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t* cVector,
const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int isodd = num_points & 3;
+ __m256 x, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ // Set up constant scalar vector
+ yl = _mm256_set1_ps(lv_creal(scalar));
+ yh = _mm256_set1_ps(lv_cimag(scalar));
+
+ for(;number < quarterPoints; number++){
+
+ x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as
ar,ai,br,bi
+
+ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci,
br*dr-bi*di, bi*dr+br*di
+
+ _mm256_storeu_ps((float*)c,z); // Store the results back into the C
container
+
+ a += 4;
+ c += 4;
+ }
+
+ for(i = num_points-isodd; i < num_points; i++) {
+ *c++ = (*a++) * scalar;
+ }
+
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
@@ -93,6 +140,53 @@ static inline void
volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t* cVector, con
#include <volk/volk_complex.h>
#include <float.h>
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+ /*!
+ \brief Multiplies the two input complex vectors and stores their results
in the third vector
+ \param cVector The vector where the results will be stored
+ \param aVector One of the vectors to be multiplied
+ \param bVector One of the vectors to be multiplied
+ \param num_points The number of complex values in aVector and bVector to
be multiplied together and stored into cVector
+ */
+static inline void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t* cVector,
const lv_32fc_t* aVector, const lv_32fc_t scalar, unsigned int num_points){
+ unsigned int number = 0;
+ unsigned int i = 0;
+ const unsigned int quarterPoints = num_points / 4;
+ unsigned int isodd = num_points & 3;
+ __m256 x, yl, yh, z, tmp1, tmp2;
+ lv_32fc_t* c = cVector;
+ const lv_32fc_t* a = aVector;
+
+ // Set up constant scalar vector
+ yl = _mm256_set1_ps(lv_creal(scalar));
+ yh = _mm256_set1_ps(lv_cimag(scalar));
+
+ for(;number < quarterPoints; number++){
+
+ x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as
ar,ai,br,bi
+
+ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
+
+ x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be ai,ar,bi,br
+
+ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci,
br*dr-bi*di, bi*dr+br*di
+
+ _mm256_store_ps((float*)c,z); // Store the results back into the C
container
+
+ a += 4;
+ c += 4;
+ }
+
+ for(i = num_points-isodd; i < num_points; i++) {
+ *c++ = (*a++) * scalar;
+ }
+
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
/*!
diff --git a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
index 430b747..7e8c91a 100644
--- a/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
+++ b/volk/kernels/volk/volk_32fc_x2_dot_prod_32fc.h
@@ -316,10 +316,67 @@ static inline void
volk_32fc_x2_dot_prod_32fc_u_sse4_1(lv_32fc_t* result, const
#endif /*LV_HAVE_SSE4_1*/
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_u_avx(lv_32fc_t* result, const
lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ unsigned int isodd = num_points & 3;
+ unsigned int i = 0;
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2*sizeof(float));
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
+
+ dotProdVal = _mm256_setzero_ps();
+
+ for(;number < quarterPoints; number++){
+
+ x = _mm256_loadu_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+ y = _mm256_loadu_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+
+ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+
+ x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be
ai,ar,bi,br,ei,er,fi,fr
+
+ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di,
bi*dr+br*di
+
+ dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex
multiplication results together
+
+ a += 4;
+ b += 4;
+ }
+
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+
+ _mm256_storeu_ps((float*)dotProductVector,dotProdVal); // Store the results
back into the dot product vector
+
+ dotProduct += ( dotProductVector[0] + dotProductVector[1] +
dotProductVector[2] + dotProductVector[3]);
+
+ for(i = num_points-isodd; i < num_points; i++) {
+ dotProduct += input[i] * taps[i];
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_AVX*/
#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_u_H*/
+
#ifndef INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
#define INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H
@@ -680,6 +737,7 @@ static inline void
volk_32fc_x2_dot_prod_32fc_a_sse3(lv_32fc_t* result, const lv
#endif /*LV_HAVE_SSE3*/
+
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>
@@ -976,4 +1034,62 @@ static inline void
volk_32fc_x2_dot_prod_32fc_neon_optfmaunroll(lv_32fc_t* resul
#endif /*LV_HAVE_NEON*/
+#ifdef LV_HAVE_AVX
+
+#include <immintrin.h>
+
+static inline void volk_32fc_x2_dot_prod_32fc_a_avx(lv_32fc_t* result, const
lv_32fc_t* input, const lv_32fc_t* taps, unsigned int num_points) {
+
+ unsigned int isodd = num_points & 3;
+ unsigned int i = 0;
+ lv_32fc_t dotProduct;
+ memset(&dotProduct, 0x0, 2*sizeof(float));
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m256 x, y, yl, yh, z, tmp1, tmp2, dotProdVal;
+
+ const lv_32fc_t* a = input;
+ const lv_32fc_t* b = taps;
+
+ dotProdVal = _mm256_setzero_ps();
+
+ for(;number < quarterPoints; number++){
+
+ x = _mm256_load_ps((float*)a); // Load a,b,e,f as ar,ai,br,bi,er,ei,fr,fi
+ y = _mm256_load_ps((float*)b); // Load c,d,g,h as cr,ci,dr,di,gr,gi,hr,hi
+
+ yl = _mm256_moveldup_ps(y); // Load yl with cr,cr,dr,dr,gr,gr,hr,hr
+ yh = _mm256_movehdup_ps(y); // Load yh with ci,ci,di,di,gi,gi,hi,hi
+
+ tmp1 = _mm256_mul_ps(x,yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr ...
+
+ x = _mm256_shuffle_ps(x,x,0xB1); // Re-arrange x to be
ai,ar,bi,br,ei,er,fi,fr
+
+ tmp2 = _mm256_mul_ps(x,yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di ...
+
+ z = _mm256_addsub_ps(tmp1,tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di,
bi*dr+br*di
+
+ dotProdVal = _mm256_add_ps(dotProdVal, z); // Add the complex
multiplication results together
+
+ a += 4;
+ b += 4;
+ }
+
+ __VOLK_ATTR_ALIGNED(32) lv_32fc_t dotProductVector[4];
+
+ _mm256_store_ps((float*)dotProductVector,dotProdVal); // Store the results
back into the dot product vector
+
+ dotProduct += ( dotProductVector[0] + dotProductVector[1] +
dotProductVector[2] + dotProductVector[3]);
+
+ for(i = num_points-isodd; i < num_points; i++) {
+ dotProduct += input[i] * taps[i];
+ }
+
+ *result = dotProduct;
+}
+
+#endif /*LV_HAVE_AVX*/
+
#endif /*INCLUDED_volk_32fc_x2_dot_prod_32fc_a_H*/
diff --git a/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h
b/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h
index b59d22d..37f9d74 100644
--- a/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h
+++ b/volk/kernels/volk/volk_8ic_deinterleave_16i_x2.h
@@ -13,30 +13,31 @@
\param qBuffer The Q buffer output data
\param num_points The number of complex data values to be deinterleaved
*/
+
static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer,
int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
unsigned int number = 0;
const int8_t* complexVectorPtr = (int8_t*)complexVector;
int16_t* iBufferPtr = iBuffer;
int16_t* qBufferPtr = qBuffer;
- __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values
__m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 15, 13, 11, 9, 7, 5, 3, 1);
__m128i complexVal, iOutputVal, qOutputVal;
unsigned int eighthPoints = num_points / 8;
for(number = 0; number < eighthPoints; number++){
- complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr
+= 16;
+ complexVal = _mm_load_si128((__m128i*)complexVectorPtr); complexVectorPtr
+= 16; // aligned load
- iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask);
+ iOutputVal = _mm_shuffle_epi8(complexVal, iMoveMask);
// shuffle 16 bytes of 128bit complexVal
qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
- iOutputVal = _mm_cvtepi8_epi16(iOutputVal);
- iOutputVal = _mm_slli_epi16(iOutputVal, 8);
+ iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte
sign extended versions of lower 8 bytes of input to output
+ iOutputVal = _mm_slli_epi16(iOutputVal, 8); // shift in
left by 8 bits, each of the 8 16-bit integers, shift in with zeros
qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
qOutputVal = _mm_slli_epi16(qOutputVal, 8);
- _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
+ _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store
_mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
iBufferPtr += 8;
@@ -45,12 +46,78 @@ static inline void
volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer, int16
number = eighthPoints * 8;
for(; number < num_points; number++){
- *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit
Complexvector into 16 bit, shift left by 8 bits and store
*qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
}
}
#endif /* LV_HAVE_SSE4_1 */
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+ \brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param qBuffer The Q buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer,
int16_t* qBuffer, const lv_8sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ int16_t* qBufferPtr = qBuffer;
+ __m128i iMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 14, 12, 10, 8, 6, 4, 2, 0); // set 16 byte values
+ __m128i qMoveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 15, 13, 11, 9, 7, 5, 3, 1);
+ __m256i complexVal, iOutputVal, qOutputVal;
+ __m128i complexVal1, complexVal0;
+ __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for(number = 0; number < sixteenthPoints; number++){
+ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
complexVectorPtr += 32; // aligned load
+
+ // Extract from complexVal to iOutputVal and qOutputVal
+ complexVal1 = _mm256_extractf128_si256(complexVal, 1);
+ complexVal0 = _mm256_extractf128_si256(complexVal, 0);
+
+ iOutputVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask);
// shuffle 16 bytes of 128bit complexVal
+ iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);
+ qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask);
+ qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask);
+
+ iOutputVal1 = _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte
sign extended versions of lower 8 bytes of input to output
+ iOutputVal1 = _mm_slli_epi16(iOutputVal1, 8); // shift in
left by 8 bits, each of the 8 16-bit integers, shift in with zeros
+ iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);
+ iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8);
+
+ qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1);
+ qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8);
+ qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0);
+ qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8);
+
+ // Pack iOutputVal0,1 to iOutputVal
+ __m256i dummy = _mm256_setzero_si256();
+ iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);
+ iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);
+ qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);
+ qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);
+
+ _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store
+ _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
+
+ iBufferPtr += 16;
+ qBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256; // load 8 bit
Complexvector into 16 bit, shift left by 8 bits and store
+ *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_GENERIC
/*!
\brief Deinterleaves the complex 8 bit vector into I & Q 16 bit vector data
diff --git a/volk/kernels/volk/volk_8ic_deinterleave_real_16i.h
b/volk/kernels/volk/volk_8ic_deinterleave_real_16i.h
index 82cedb2..9adec93 100644
--- a/volk/kernels/volk/volk_8ic_deinterleave_real_16i.h
+++ b/volk/kernels/volk/volk_8ic_deinterleave_real_16i.h
@@ -41,6 +41,54 @@ static inline void
volk_8ic_deinterleave_real_16i_a_sse4_1(int16_t* iBuffer, con
}
#endif /* LV_HAVE_SSE4_1 */
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+ \brief Deinterleaves the complex 8 bit vector into I 16 bit vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+static inline void volk_8ic_deinterleave_real_16i_a_avx(int16_t* iBuffer,
const lv_8sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int16_t* iBufferPtr = iBuffer;
+ __m128i moveMask = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ __m256i complexVal, outputVal;
+ __m128i complexVal1, complexVal0, outputVal1, outputVal0;
+
+ unsigned int sixteenthPoints = num_points / 16;
+
+ for(number = 0; number < sixteenthPoints; number++){
+ complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
complexVectorPtr += 32;
+
+ complexVal1 = _mm256_extractf128_si256(complexVal, 1);
+ complexVal0 = _mm256_extractf128_si256(complexVal, 0);
+
+ outputVal1 = _mm_shuffle_epi8(complexVal1, moveMask);
+ outputVal0 = _mm_shuffle_epi8(complexVal0, moveMask);
+
+ outputVal1 = _mm_cvtepi8_epi16(outputVal1);
+ outputVal1 = _mm_slli_epi16(outputVal1, 7);
+ outputVal0 = _mm_cvtepi8_epi16(outputVal0);
+ outputVal0 = _mm_slli_epi16(outputVal0, 7);
+
+ __m256i dummy = _mm256_setzero_si256();
+ outputVal = _mm256_insertf128_si256(dummy, outputVal0, 0);
+ outputVal = _mm256_insertf128_si256(outputVal, outputVal1, 1);
+ _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+
+ iBufferPtr += 16;
+ }
+
+ number = sixteenthPoints * 16;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = ((int16_t)*complexVectorPtr++) * 128;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_GENERIC
/*!
diff --git a/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
b/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
index 427c9ab..0749422 100644
--- a/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
+++ b/volk/kernels/volk/volk_8ic_deinterleave_real_8i.h
@@ -43,6 +43,64 @@ static inline void
volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer, const
}
#endif /* LV_HAVE_SSSE3 */
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+ \brief Deinterleaves the complex 8 bit vector into I vector data
+ \param complexVector The complex input vector
+ \param iBuffer The I buffer output data
+ \param num_points The number of complex data values to be deinterleaved
+*/
+
+static inline void volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer, const
lv_8sc_t* complexVector, unsigned int num_points){
+ unsigned int number = 0;
+ const int8_t* complexVectorPtr = (int8_t*)complexVector;
+ int8_t* iBufferPtr = iBuffer;
+ __m128i moveMaskL = _mm_set_epi8(0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 14, 12, 10, 8, 6, 4, 2, 0);
+ __m128i moveMaskH = _mm_set_epi8(14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
+ __m256i complexVal1, complexVal2, outputVal;
+ __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1,
outputVal2;
+
+ unsigned int thirtysecondPoints = num_points / 32;
+
+ for(number = 0; number < thirtysecondPoints; number++){
+
+ complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+ complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
+ complexVectorPtr += 32;
+
+ complexVal1H = _mm256_extractf128_si256(complexVal1, 1);
+ complexVal1L = _mm256_extractf128_si256(complexVal1, 0);
+ complexVal2H = _mm256_extractf128_si256(complexVal2, 1);
+ complexVal2L = _mm256_extractf128_si256(complexVal2, 0);
+
+ complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH);
+ complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL);
+ outputVal1 = _mm_or_si128(complexVal1H, complexVal1L);
+
+
+ complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH);
+ complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL);
+ outputVal2 = _mm_or_si128(complexVal2H, complexVal2L);
+
+ __m256i dummy = _mm256_setzero_si256();
+ outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0);
+ outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1);
+
+
+ _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
+ iBufferPtr += 32;
+ }
+
+ number = thirtysecondPoints * 32;
+ for(; number < num_points; number++){
+ *iBufferPtr++ = *complexVectorPtr++;
+ complexVectorPtr++;
+ }
+}
+#endif /* LV_HAVE_AVX */
+
#ifdef LV_HAVE_GENERIC
/*!
\brief Deinterleaves the complex 8 bit vector into I vector data
- [Commit-gnuradio] [gnuradio] branch master updated (d50c57a -> 8ebe90f), git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 03/14: volk: temp log kernels., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 04/14: volk: Added log2, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 06/14: volk: expfast comments, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 08/14: volk: Added sin, cos kernels., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 12/14: volk: fixed some warnings, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 13/14: volk: fixed a problem with acos during some translation in the git history., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 09/14: volk: Added tan kernel., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 07/14: volk: added power kernel., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 10/14: volk: Added atan, asin, acos kernels., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 01/14: added new proto-kernels,
git <=
- [Commit-gnuradio] [gnuradio] 05/14: volk: Added avx proto-kernel for fast exp., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 02/14: volk: Added proto-kernels for convert, multiply, conjugate, deinterleave, magnitude, mag-square, psd functions., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 11/14: volk (gsoc): whitespace, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 14/14: volk: adding copyright notice to all volk kernels., git, 2014/10/15