[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Commit-gnuradio] [gnuradio] 01/01: volk: adding tanh kernel with support for sse and avx.
From: |
git |
Subject: |
[Commit-gnuradio] [gnuradio] 01/01: volk: adding tanh kernel with support for sse and avx. |
Date: |
Tue, 2 Sep 2014 19:50:03 +0000 (UTC) |
This is an automated email from the git hooks/post-receive script.
trondeau pushed a commit to branch master
in repository gnuradio.
commit 01deede32858ef9e2fe4cc937f3245b5b0e6d7c9
Author: Tom Rondeau <address@hidden>
Date: Tue Sep 2 11:36:11 2014 -0400
volk: adding tanh kernel with support for sse and avx.
There are two generic kernels, one using tanh and another using a series
approximation. The SIMD code uses the series approximation.
---
volk/apps/volk_profile.cc | 1 +
volk/kernels/volk/volk_32f_tanh_32f.h | 296 ++++++++++++++++++++++++++++++++++
volk/lib/testqa.cc | 1 +
3 files changed, 298 insertions(+)
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index 074d1e7..1dd0f9b 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -178,6 +178,7 @@ int main(int argc, char *argv[]) {
VOLK_PROFILE(volk_32f_s32f_multiply_32f, 1e-4, 1.0, 204602, 10000,
&results, benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32f_binary_slicer_32i, 0, 1.0, 204602, 10000, &results,
benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32f_binary_slicer_8i, 0, 1.0, 204602, 10000, &results,
benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_32f_tanh_32f, 1e-6, 0, 204602, 1000, &results,
benchmark_mode, kernel_regex);
// Until we can update the config on a kernel by kernel basis
// do not overwrite volk_config when using a regex.
diff --git a/volk/kernels/volk/volk_32f_tanh_32f.h
b/volk/kernels/volk/volk_32f_tanh_32f.h
new file mode 100644
index 0000000..3f407d4
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_tanh_32f.h
@@ -0,0 +1,296 @@
+#ifndef INCLUDED_volk_32f_tanh_32f_a_H
+#define INCLUDED_volk_32f_tanh_32f_a_H
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Computes tanh(x) for each input sample using the C library tanh().
+  \param cVector Output buffer; receives num_points results
+  \param aVector Input buffer; num_points samples are read
+  \param num_points The number of values to calculate
+*/
+static inline void volk_32f_tanh_32f_generic(float* cVector, const float* aVector,
+                                             unsigned int num_points)
+{
+  unsigned int idx;
+  for(idx = 0; idx < num_points; idx++) {
+    // tanh() evaluates in double precision; the result is narrowed to float
+    // when it is stored.
+    cVector[idx] = tanh(aVector[idx]);
+  }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+#ifdef LV_HAVE_GENERIC
+/*!
+  \brief Calculates tanh(x) using a rational series approximation.
+
+  Inputs greater than 4.97 produce exactly 1 and inputs less than or equal to
+  -4.97 produce exactly -1. Every other input is evaluated as a ratio of two
+  polynomials in x. The original author documents the approximation as good
+  to within 1e-6 of the actual tanh (not verified here).
+
+  \param cVector The vector where the results will be stored
+  \param aVector Input vector
+  \param num_points The number of values to calculate
+*/
+static inline void volk_32f_tanh_32f_series(float* cVector, const float* aVector,
+                                            unsigned int num_points)
+{
+  unsigned int number = 0;
+  float* cPtr = cVector;
+  const float* aPtr = aVector;
+  for(; number < num_points; number++) {
+    if(*aPtr > 4.97) {
+      *cPtr++ = 1;
+    }
+    else if(*aPtr <= -4.97) {
+      *cPtr++ = -1;
+    }
+    else {
+      float x2 = (*aPtr) * (*aPtr);
+      float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+      float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+      *cPtr++ = a / b;
+    }
+    // Advance the input on every path. Previously this happened only in the
+    // polynomial branch, so the first saturated sample stalled the input
+    // pointer and every later output repeated that sample's result.
+    aPtr++;
+  }
+}
+
+#endif /* LV_HAVE_GENERIC */
+
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates tanh(x) using a rational series approximation (SSE, aligned).
+
+  Four samples are processed per iteration with aligned loads and stores
+  (_mm_load_ps / _mm_store_ps), so both buffers must be 16-byte aligned. Any
+  remaining samples are handled by a scalar loop. Inputs greater than 4.97
+  produce 1 and inputs less than or equal to -4.97 produce -1, in both the
+  vector and the scalar part. The original author documents the approximation
+  as good to within 1e-6 of the actual tanh (not verified here).
+
+  \param cVector The vector where the results will be stored
+  \param aVector Input vector
+  \param num_points The number of values to calculate
+*/
+static inline void volk_32f_tanh_32f_a_sse(float* cVector, const float* aVector,
+                                           unsigned int num_points)
+{
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  float* cPtr = cVector;
+  const float* aPtr = aVector;
+
+  __m128 aVal, cVal, x2, a, b, hiMask, loMask;
+  __m128 const1, const2, const3, const4, const5, const6;
+  __m128 upper, lower, one, minusOne;
+  const1 = _mm_set_ps1(135135.0f);
+  const2 = _mm_set_ps1(17325.0f);
+  const3 = _mm_set_ps1(378.0f);
+  const4 = _mm_set_ps1(62370.0f);
+  const5 = _mm_set_ps1(3150.0f);
+  const6 = _mm_set_ps1(28.0f);
+  upper = _mm_set_ps1(4.97f);
+  lower = _mm_set_ps1(-4.97f);
+  one = _mm_set_ps1(1.0f);
+  minusOne = _mm_set_ps1(-1.0f);
+  for(;number < quarterPoints; number++){
+
+    aVal = _mm_load_ps(aPtr);
+    x2 = _mm_mul_ps(aVal, aVal);
+    a = _mm_mul_ps(aVal,
+          _mm_add_ps(const1, _mm_mul_ps(x2,
+            _mm_add_ps(const2, _mm_mul_ps(x2,
+              _mm_add_ps(const3, x2))))));
+    b = _mm_add_ps(const1, _mm_mul_ps(x2,
+          _mm_add_ps(const4, _mm_mul_ps(x2,
+            _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
+
+    cVal = _mm_div_ps(a, b);
+
+    // Saturate out-of-range lanes the same way the scalar code does. Without
+    // this the rational polynomial keeps growing past +/-1 for large |x| and
+    // yields NaN for infinite inputs.
+    hiMask = _mm_cmpgt_ps(aVal, upper);
+    loMask = _mm_cmple_ps(aVal, lower);
+    cVal = _mm_andnot_ps(_mm_or_ps(hiMask, loMask), cVal); // clear saturated lanes
+    cVal = _mm_or_ps(cVal, _mm_and_ps(hiMask, one));
+    cVal = _mm_or_ps(cVal, _mm_and_ps(loMask, minusOne));
+
+    _mm_store_ps(cPtr, cVal); // Store the results back into the C container
+
+    aPtr += 4;
+    cPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++) {
+    if(*aPtr > 4.97) {
+      *cPtr++ = 1;
+    }
+    else if(*aPtr <= -4.97) {
+      *cPtr++ = -1;
+    }
+    else {
+      float x2 = (*aPtr) * (*aPtr);
+      float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+      float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+      *cPtr++ = a / b;
+    }
+    // Advance the input on every path; it used to advance only in the
+    // polynomial branch, which stalled the pointer on a saturated sample.
+    aPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Calculates tanh(x) using a rational series approximation (AVX, aligned).
+
+  Eight samples are processed per iteration with aligned loads and stores
+  (_mm256_load_ps / _mm256_store_ps), so both buffers must be 32-byte aligned.
+  Any remaining samples are handled by a scalar loop. Inputs greater than 4.97
+  produce 1 and inputs less than or equal to -4.97 produce -1, in both the
+  vector and the scalar part. The original author documents the approximation
+  as good to within 1e-6 of the actual tanh (not verified here).
+
+  \param cVector The vector where the results will be stored
+  \param aVector Input vector
+  \param num_points The number of values to calculate
+*/
+static inline void volk_32f_tanh_32f_a_avx(float* cVector, const float* aVector,
+                                           unsigned int num_points)
+{
+  unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;
+
+  float* cPtr = cVector;
+  const float* aPtr = aVector;
+
+  __m256 aVal, cVal, x2, a, b;
+  __m256 const1, const2, const3, const4, const5, const6;
+  __m256 upper, lower, one, minusOne;
+  const1 = _mm256_set1_ps(135135.0f);
+  const2 = _mm256_set1_ps(17325.0f);
+  const3 = _mm256_set1_ps(378.0f);
+  const4 = _mm256_set1_ps(62370.0f);
+  const5 = _mm256_set1_ps(3150.0f);
+  const6 = _mm256_set1_ps(28.0f);
+  upper = _mm256_set1_ps(4.97f);
+  lower = _mm256_set1_ps(-4.97f);
+  one = _mm256_set1_ps(1.0f);
+  minusOne = _mm256_set1_ps(-1.0f);
+  for(;number < eighthPoints; number++){
+
+    aVal = _mm256_load_ps(aPtr);
+    x2 = _mm256_mul_ps(aVal, aVal);
+    a = _mm256_mul_ps(aVal,
+          _mm256_add_ps(const1, _mm256_mul_ps(x2,
+            _mm256_add_ps(const2, _mm256_mul_ps(x2,
+              _mm256_add_ps(const3, x2))))));
+    b = _mm256_add_ps(const1, _mm256_mul_ps(x2,
+          _mm256_add_ps(const4, _mm256_mul_ps(x2,
+            _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
+
+    cVal = _mm256_div_ps(a, b);
+
+    // Saturate out-of-range lanes the same way the scalar code does. Without
+    // this the rational polynomial keeps growing past +/-1 for large |x| and
+    // yields NaN for infinite inputs. The ordered compares are false for NaN,
+    // so NaN inputs still propagate.
+    cVal = _mm256_blendv_ps(cVal, one, _mm256_cmp_ps(aVal, upper, _CMP_GT_OQ));
+    cVal = _mm256_blendv_ps(cVal, minusOne, _mm256_cmp_ps(aVal, lower, _CMP_LE_OQ));
+
+    _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
+
+    aPtr += 8;
+    cPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(;number < num_points; number++) {
+    if(*aPtr > 4.97) {
+      *cPtr++ = 1;
+    }
+    else if(*aPtr <= -4.97) {
+      *cPtr++ = -1;
+    }
+    else {
+      float x2 = (*aPtr) * (*aPtr);
+      float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+      float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+      *cPtr++ = a / b;
+    }
+    // Advance the input on every path; it used to advance only in the
+    // polynomial branch, which stalled the pointer on a saturated sample.
+    aPtr++;
+  }
+}
+#endif /* LV_HAVE_AVX */
+
+
+
+
+#ifdef LV_HAVE_SSE
+#include <xmmintrin.h>
+/*!
+  \brief Calculates tanh(x) using a rational series approximation (SSE, unaligned).
+
+  Four samples are processed per iteration with unaligned loads and stores
+  (_mm_loadu_ps / _mm_storeu_ps). Any remaining samples are handled by a scalar
+  loop. Inputs greater than 4.97 produce 1 and inputs less than or equal to
+  -4.97 produce -1, in both the vector and the scalar part. The original
+  author documents the approximation as good to within 1e-6 of the actual
+  tanh (not verified here).
+
+  \param cVector The vector where the results will be stored
+  \param aVector Input vector
+  \param num_points The number of values to calculate
+*/
+static inline void volk_32f_tanh_32f_u_sse(float* cVector, const float* aVector,
+                                           unsigned int num_points)
+{
+  unsigned int number = 0;
+  const unsigned int quarterPoints = num_points / 4;
+
+  float* cPtr = cVector;
+  const float* aPtr = aVector;
+
+  __m128 aVal, cVal, x2, a, b, hiMask, loMask;
+  __m128 const1, const2, const3, const4, const5, const6;
+  __m128 upper, lower, one, minusOne;
+  const1 = _mm_set_ps1(135135.0f);
+  const2 = _mm_set_ps1(17325.0f);
+  const3 = _mm_set_ps1(378.0f);
+  const4 = _mm_set_ps1(62370.0f);
+  const5 = _mm_set_ps1(3150.0f);
+  const6 = _mm_set_ps1(28.0f);
+  upper = _mm_set_ps1(4.97f);
+  lower = _mm_set_ps1(-4.97f);
+  one = _mm_set_ps1(1.0f);
+  minusOne = _mm_set_ps1(-1.0f);
+  for(;number < quarterPoints; number++){
+
+    aVal = _mm_loadu_ps(aPtr);
+    x2 = _mm_mul_ps(aVal, aVal);
+    a = _mm_mul_ps(aVal,
+          _mm_add_ps(const1, _mm_mul_ps(x2,
+            _mm_add_ps(const2, _mm_mul_ps(x2,
+              _mm_add_ps(const3, x2))))));
+    b = _mm_add_ps(const1, _mm_mul_ps(x2,
+          _mm_add_ps(const4, _mm_mul_ps(x2,
+            _mm_add_ps(const5, _mm_mul_ps(x2, const6))))));
+
+    cVal = _mm_div_ps(a, b);
+
+    // Saturate out-of-range lanes the same way the scalar code does. Without
+    // this the rational polynomial keeps growing past +/-1 for large |x| and
+    // yields NaN for infinite inputs.
+    hiMask = _mm_cmpgt_ps(aVal, upper);
+    loMask = _mm_cmple_ps(aVal, lower);
+    cVal = _mm_andnot_ps(_mm_or_ps(hiMask, loMask), cVal); // clear saturated lanes
+    cVal = _mm_or_ps(cVal, _mm_and_ps(hiMask, one));
+    cVal = _mm_or_ps(cVal, _mm_and_ps(loMask, minusOne));
+
+    _mm_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+    aPtr += 4;
+    cPtr += 4;
+  }
+
+  number = quarterPoints * 4;
+  for(;number < num_points; number++) {
+    if(*aPtr > 4.97) {
+      *cPtr++ = 1;
+    }
+    else if(*aPtr <= -4.97) {
+      *cPtr++ = -1;
+    }
+    else {
+      float x2 = (*aPtr) * (*aPtr);
+      float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+      float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+      *cPtr++ = a / b;
+    }
+    // Advance the input on every path; it used to advance only in the
+    // polynomial branch, which stalled the pointer on a saturated sample.
+    aPtr++;
+  }
+}
+#endif /* LV_HAVE_SSE */
+
+
+
+#ifdef LV_HAVE_AVX
+#include <immintrin.h>
+/*!
+  \brief Calculates tanh(x) using a rational series approximation (AVX, unaligned).
+
+  Eight samples are processed per iteration with unaligned loads and stores
+  (_mm256_loadu_ps / _mm256_storeu_ps). Any remaining samples are handled by a
+  scalar loop. Inputs greater than 4.97 produce 1 and inputs less than or equal
+  to -4.97 produce -1, in both the vector and the scalar part. The original
+  author documents the approximation as good to within 1e-6 of the actual
+  tanh (not verified here).
+
+  \param cVector The vector where the results will be stored
+  \param aVector Input vector
+  \param num_points The number of values to calculate
+*/
+static inline void volk_32f_tanh_32f_u_avx(float* cVector, const float* aVector,
+                                           unsigned int num_points)
+{
+  unsigned int number = 0;
+  const unsigned int eighthPoints = num_points / 8;
+
+  float* cPtr = cVector;
+  const float* aPtr = aVector;
+
+  __m256 aVal, cVal, x2, a, b;
+  __m256 const1, const2, const3, const4, const5, const6;
+  __m256 upper, lower, one, minusOne;
+  const1 = _mm256_set1_ps(135135.0f);
+  const2 = _mm256_set1_ps(17325.0f);
+  const3 = _mm256_set1_ps(378.0f);
+  const4 = _mm256_set1_ps(62370.0f);
+  const5 = _mm256_set1_ps(3150.0f);
+  const6 = _mm256_set1_ps(28.0f);
+  upper = _mm256_set1_ps(4.97f);
+  lower = _mm256_set1_ps(-4.97f);
+  one = _mm256_set1_ps(1.0f);
+  minusOne = _mm256_set1_ps(-1.0f);
+  for(;number < eighthPoints; number++){
+
+    aVal = _mm256_loadu_ps(aPtr);
+    x2 = _mm256_mul_ps(aVal, aVal);
+    a = _mm256_mul_ps(aVal,
+          _mm256_add_ps(const1, _mm256_mul_ps(x2,
+            _mm256_add_ps(const2, _mm256_mul_ps(x2,
+              _mm256_add_ps(const3, x2))))));
+    b = _mm256_add_ps(const1, _mm256_mul_ps(x2,
+          _mm256_add_ps(const4, _mm256_mul_ps(x2,
+            _mm256_add_ps(const5, _mm256_mul_ps(x2, const6))))));
+
+    cVal = _mm256_div_ps(a, b);
+
+    // Saturate out-of-range lanes the same way the scalar code does. Without
+    // this the rational polynomial keeps growing past +/-1 for large |x| and
+    // yields NaN for infinite inputs. The ordered compares are false for NaN,
+    // so NaN inputs still propagate.
+    cVal = _mm256_blendv_ps(cVal, one, _mm256_cmp_ps(aVal, upper, _CMP_GT_OQ));
+    cVal = _mm256_blendv_ps(cVal, minusOne, _mm256_cmp_ps(aVal, lower, _CMP_LE_OQ));
+
+    _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
+
+    aPtr += 8;
+    cPtr += 8;
+  }
+
+  number = eighthPoints * 8;
+  for(;number < num_points; number++) {
+    if(*aPtr > 4.97) {
+      *cPtr++ = 1;
+    }
+    else if(*aPtr <= -4.97) {
+      *cPtr++ = -1;
+    }
+    else {
+      float x2 = (*aPtr) * (*aPtr);
+      float a = (*aPtr) * (135135.0f + x2 * (17325.0f + x2 * (378.0f + x2)));
+      float b = 135135.0f + x2 * (62370.0f + x2 * (3150.0f + x2 * 28.0f));
+      *cPtr++ = a / b;
+    }
+    // Advance the input on every path; it used to advance only in the
+    // polynomial branch, which stalled the pointer on a saturated sample.
+    aPtr++;
+  }
+}
+#endif /* LV_HAVE_AVX */
+
+#endif /* INCLUDED_volk_32f_tanh_32f_a_H */
diff --git a/volk/lib/testqa.cc b/volk/lib/testqa.cc
index bc97ad1..9d83751 100644
--- a/volk/lib/testqa.cc
+++ b/volk/lib/testqa.cc
@@ -114,3 +114,4 @@ VOLK_RUN_TESTS(volk_8u_conv_k7_r2puppet_8u, 0, 0, 2060, 1);
VOLK_RUN_TESTS(volk_32f_invsqrt_32f, 1e-2, 0, 20462, 1);
VOLK_RUN_TESTS(volk_32f_binary_slicer_32i, 0, 0, 20462, 1);
VOLK_RUN_TESTS(volk_32f_binary_slicer_8i, 0, 0, 20462, 1);
+VOLK_RUN_TESTS(volk_32f_tanh_32f, 1e-6, 0, 20462, 1);