[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Commit-gnuradio] [gnuradio] 07/14: volk: added power kernel.
From: |
git |
Subject: |
[Commit-gnuradio] [gnuradio] 07/14: volk: added power kernel. |
Date: |
Wed, 15 Oct 2014 23:25:09 +0000 (UTC) |
This is an automated email from the git hooks/post-receive script.
trondeau pushed a commit to branch master
in repository gnuradio.
commit b2632afd71f06ec6da1aef8ab8cc9a137d5134ba
Author: Tom Rondeau <address@hidden>
Date: Wed Oct 15 10:49:45 2014 -0400
volk: added power kernel.
---
volk/apps/volk_profile.cc | 1 +
volk/kernels/volk/volk_32f_x2_pow_32f.h | 299 ++++++++++++++++++++++++++++++++
volk/lib/testqa.cc | 1 +
3 files changed, 301 insertions(+)
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index 08c31cb..f470157 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -164,6 +164,7 @@ int main(int argc, char *argv[]) {
VOLK_PROFILE(volk_32fc_32f_multiply_32fc, 1e-4, 0, 204602, 1000, &results,
benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32f_log2_32f, 1e-3, 0, 204602, 1000, &results,
benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32f_expfast_32f, 1e-1, 0, 204602, 1000, &results,
benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_32f_x2_pow_32f, 1e-2, 0, 204602, 1000, &results,
benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32fc_s32f_power_32fc, 1e-4, 0, 204602, 50, &results,
benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0,
204602, 1000, &results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 204602, 100, &results,
benchmark_mode, kernel_regex);
diff --git a/volk/kernels/volk/volk_32f_x2_pow_32f.h
b/volk/kernels/volk/volk_32f_x2_pow_32f.h
new file mode 100755
index 0000000..cc11daf
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_x2_pow_32f.h
@@ -0,0 +1,299 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <math.h>
+
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x),
_mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x),
_mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3),
x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2,
c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1,
c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+#define LOG_POLY_DEGREE 3
+
+#ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
+#define INCLUDED_volk_32f_x2_pow_32f_a_H
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Computes pow(x,y) by using exp and log
+ \param cVector The vector where results will be stored
+ \param aVector The input vector of bases
+ \param bVector The input vector of indices
+ \param num_points Number of points for which pow is to be computed
+*/
+static inline void volk_32f_x2_pow_32f_a_generic(float* cVector, const float*
bVector, const float* aVector, unsigned int num_points){
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = pow(*aPtr++, *bPtr++);
+ }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+ \brief Computes pow(x,y) by using exp and log
+ \param cVector The vector where results will be stored
+ \param aVector The input vector of bases
+ \param bVector The input vector of indices
+ \param num_points Number of points for which pow is to be computed
+*/
+static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector, const float*
bVector, const float* aVector, unsigned int num_points){
+
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+ __m128 tmp, fx, mask, pow2n, z, y;
+ __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+ __m128i bias, exp, emm0, pi32_0x7f;
+
+ one = _mm_set1_ps(1.0);
+ exp_hi = _mm_set1_ps(88.3762626647949);
+ exp_lo = _mm_set1_ps(-88.3762626647949);
+ ln2 = _mm_set1_ps(0.6931471805);
+ log2EF = _mm_set1_ps(1.44269504088896341);
+ half = _mm_set1_ps(0.5);
+ exp_C1 = _mm_set1_ps(0.693359375);
+ exp_C2 = _mm_set1_ps(-2.12194440e-4);
+ pi32_0x7f = _mm_set1_epi32(0x7f);
+
+ exp_p0 = _mm_set1_ps(1.9875691500e-4);
+ exp_p1 = _mm_set1_ps(1.3981999507e-3);
+ exp_p2 = _mm_set1_ps(8.3334519073e-3);
+ exp_p3 = _mm_set1_ps(4.1665795894e-2);
+ exp_p4 = _mm_set1_ps(1.6666665459e-1);
+ exp_p5 = _mm_set1_ps(5.0000001201e-1);
+
+ for(;number < quarterPoints; number++){
+
+ // First compute the logarithm
+ aVal = _mm_load_ps(aPtr);
+ bias = _mm_set1_epi32(127);
+ leadingOne = _mm_set1_ps(1.0f);
+ exp =
_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal),
_mm_set1_epi32(0x7f800000)), 23), bias);
+ logarithm = _mm_cvtepi32_ps(exp);
+
+ frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal,
_mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+
+ #if LOG_POLY_DEGREE == 6
+ mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f,
-1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+ #elif LOG_POLY_DEGREE == 5
+ mantissa = POLY4( frac, 2.8882704548164776201f,
-2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f,
0.0596515482674574969533f);
+ #elif LOG_POLY_DEGREE == 4
+ mantissa = POLY3( frac, 2.61761038894603480148f,
-1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+ #elif LOG_POLY_DEGREE == 3
+ mantissa = POLY2( frac, 2.28330284476918490682f,
-1.04913055217340124191f, 0.204446009836232697516f);
+ #else
+ #error
+ #endif
+
+ logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac,
leadingOne)));
+ logarithm = _mm_mul_ps(logarithm, ln2);
+
+
+ // Now calculate b*lna
+ bVal = _mm_load_ps(bPtr);
+ bVal = _mm_mul_ps(bVal, logarithm);
+
+ // Now compute exp(b*lna)
+ tmp = _mm_setzero_ps();
+
+ bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
+
+ fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
+
+ emm0 = _mm_cvttps_epi32(fx);
+ tmp = _mm_cvtepi32_ps(emm0);
+
+ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
+ fx = _mm_sub_ps(tmp, mask);
+
+ tmp = _mm_mul_ps(fx, exp_C1);
+ z = _mm_mul_ps(fx, exp_C2);
+ bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
+ z = _mm_mul_ps(bVal, bVal);
+
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
+ y = _mm_add_ps(y, one);
+
+ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f),
23);
+
+ pow2n = _mm_castsi128_ps(emm0);
+ cVal = _mm_mul_ps(y, pow2n);
+
+ _mm_store_ps(cPtr, cVal);
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = pow(*aPtr++, *bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for aligned */
+
+#endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
+#define INCLUDED_volk_32f_x2_pow_32f_u_H
+
+#ifdef LV_HAVE_GENERIC
+/*!
+ \brief Computes pow(x,y) by using exp and log
+ \param cVector The vector where results will be stored
+ \param aVector The input vector of bases
+ \param bVector The input vector of indices
+ \param num_points Number of points for which pow is to be computed
+*/
+static inline void volk_32f_x2_pow_32f_u_generic(float* cVector, const float*
bVector, const float* aVector, unsigned int num_points){
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+
+ for(number = 0; number < num_points; number++){
+ *cPtr++ = pow(*aPtr++, *bPtr++);
+ }
+
+}
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_SSE4_1
+#include <smmintrin.h>
+/*!
+ \brief Computes pow(x,y) by using exp and log
+ \param cVector The vector where results will be stored
+ \param aVector The input vector of bases
+ \param bVector The input vector of indices
+ \param num_points Number of points for which pow is to be computed
+*/
+static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector, const float*
bVector, const float* aVector, unsigned int num_points){
+
+ float* cPtr = cVector;
+ const float* bPtr = bVector;
+ const float* aPtr = aVector;
+
+ unsigned int number = 0;
+ const unsigned int quarterPoints = num_points / 4;
+
+ __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
+ __m128 tmp, fx, mask, pow2n, z, y;
+ __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
+ __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
+ __m128i bias, exp, emm0, pi32_0x7f;
+
+ one = _mm_set1_ps(1.0);
+ exp_hi = _mm_set1_ps(88.3762626647949);
+ exp_lo = _mm_set1_ps(-88.3762626647949);
+ ln2 = _mm_set1_ps(0.6931471805);
+ log2EF = _mm_set1_ps(1.44269504088896341);
+ half = _mm_set1_ps(0.5);
+ exp_C1 = _mm_set1_ps(0.693359375);
+ exp_C2 = _mm_set1_ps(-2.12194440e-4);
+ pi32_0x7f = _mm_set1_epi32(0x7f);
+
+ exp_p0 = _mm_set1_ps(1.9875691500e-4);
+ exp_p1 = _mm_set1_ps(1.3981999507e-3);
+ exp_p2 = _mm_set1_ps(8.3334519073e-3);
+ exp_p3 = _mm_set1_ps(4.1665795894e-2);
+ exp_p4 = _mm_set1_ps(1.6666665459e-1);
+ exp_p5 = _mm_set1_ps(5.0000001201e-1);
+
+ for(;number < quarterPoints; number++){
+
+ // First compute the logarithm
+ aVal = _mm_loadu_ps(aPtr);
+ bias = _mm_set1_epi32(127);
+ leadingOne = _mm_set1_ps(1.0f);
+ exp =
_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(_mm_castps_si128(aVal),
_mm_set1_epi32(0x7f800000)), 23), bias);
+ logarithm = _mm_cvtepi32_ps(exp);
+
+ frac = _mm_or_ps(leadingOne, _mm_and_ps(aVal,
_mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
+
+ #if LOG_POLY_DEGREE == 6
+ mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f,
-1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
+ #elif LOG_POLY_DEGREE == 5
+ mantissa = POLY4( frac, 2.8882704548164776201f,
-2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f,
0.0596515482674574969533f);
+ #elif LOG_POLY_DEGREE == 4
+ mantissa = POLY3( frac, 2.61761038894603480148f,
-1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+ #elif LOG_POLY_DEGREE == 3
+ mantissa = POLY2( frac, 2.28330284476918490682f,
-1.04913055217340124191f, 0.204446009836232697516f);
+ #else
+ #error
+ #endif
+
+ logarithm = _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac,
leadingOne)));
+ logarithm = _mm_mul_ps(logarithm, ln2);
+
+
+ // Now calculate b*lna
+ bVal = _mm_loadu_ps(bPtr);
+ bVal = _mm_mul_ps(bVal, logarithm);
+
+ // Now compute exp(b*lna)
+ tmp = _mm_setzero_ps();
+
+ bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);
+
+ fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);
+
+ emm0 = _mm_cvttps_epi32(fx);
+ tmp = _mm_cvtepi32_ps(emm0);
+
+ mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
+ fx = _mm_sub_ps(tmp, mask);
+
+ tmp = _mm_mul_ps(fx, exp_C1);
+ z = _mm_mul_ps(fx, exp_C2);
+ bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
+ z = _mm_mul_ps(bVal, bVal);
+
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
+ y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
+ y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
+ y = _mm_add_ps(y, one);
+
+ emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f),
23);
+
+ pow2n = _mm_castsi128_ps(emm0);
+ cVal = _mm_mul_ps(y, pow2n);
+
+ _mm_storeu_ps(cPtr, cVal);
+
+ aPtr += 4;
+ bPtr += 4;
+ cPtr += 4;
+ }
+
+ number = quarterPoints * 4;
+ for(;number < num_points; number++){
+ *cPtr++ = pow(*aPtr++, *bPtr++);
+ }
+}
+
+#endif /* LV_HAVE_SSE4_1 for unaligned */
+
+#endif /* INCLUDED_volk_32f_x2_log2_32f_u_H */
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index bdea584..a809934 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -46,6 +46,7 @@ VOLK_RUN_TESTS(volk_32f_x2_add_32f, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_32fc_32f_multiply_32fc, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_32f_log2_32f, 1e-3, 0, 20462, 1);
VOLK_RUN_TESTS(volk_32f_expfast_32f, 1e-1, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_32f_x2_pow_32f, 1e-2, 0, 20462, 1);
VOLK_RUN_TESTS(volk_32fc_s32f_power_32fc, 1e-4, 0, 20462, 1);
VOLK_RUN_TESTS(volk_32f_s32f_calc_spectral_noise_floor_32f, 1e-4, 20.0, 20462,
1);
VOLK_RUN_TESTS(volk_32fc_s32f_atan2_32f, 1e-4, 10.0, 20462, 1);
- [Commit-gnuradio] [gnuradio] branch master updated (d50c57a -> 8ebe90f), git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 03/14: volk: temp log kernels., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 04/14: volk: Added log2, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 06/14: volk: expfast comments, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 08/14: volk: Added sin, cos kernels., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 12/14: volk: fixed some warnings, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 13/14: volk: fixed a problem with acos during some translation in the git history., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 09/14: volk: Added tan kernel., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 07/14: volk: added power kernel.,
git <=
- [Commit-gnuradio] [gnuradio] 10/14: volk: Added atan, asin, acos kernels., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 01/14: added new proto-kernels, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 05/14: volk: Added avx proto-kernel for fast exp., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 02/14: volk: Added proto-kernels for convert, multiply, conjugate, deinterleave, magnitude, mag-square, psd functions., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 11/14: volk (gsoc): whitespace, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 14/14: volk: adding copyright notice to all volk kernels., git, 2014/10/15