[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Commit-gnuradio] [gnuradio] 19/22: volk: add neon log2 implementation a
From: |
git |
Subject: |
[Commit-gnuradio] [gnuradio] 19/22: volk: add neon log2 implementation and fix QA to properly test |
Date: |
Fri, 31 Oct 2014 19:22:31 +0000 (UTC) |
This is an automated email from the git hooks/post-receive script.
jcorgan pushed a commit to branch master
in repository gnuradio.
commit eba094a3010ca326da747390be51da8326fff0c1
Author: Nathan West <address@hidden>
Date: Fri Oct 24 19:49:19 2014 -0500
volk: add neon log2 implementation and fix QA to properly test
The implementation adds the float exponent to a 6th order minimax fit
of log2(significand) which has domain [1,2).
---
volk/apps/volk_profile.cc | 2 +-
volk/kernels/volk/volk_32f_log2_32f.h | 120 +++++++++++++++++++++++++++++++---
volk/lib/qa_utils.cc | 2 +-
3 files changed, 112 insertions(+), 12 deletions(-)
diff --git a/volk/apps/volk_profile.cc b/volk/apps/volk_profile.cc
index 4167f4d..9bc1842 100644
--- a/volk/apps/volk_profile.cc
+++ b/volk/apps/volk_profile.cc
@@ -162,7 +162,7 @@ int main(int argc, char *argv[]) {
VOLK_PROFILE(volk_32f_accumulator_s32f, 1e-4, 0, 204602, 10000, &results,
benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32f_x2_add_32f, 1e-4, 0, 204602, 10000, &results,
benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32fc_32f_multiply_32fc, 1e-4, 0, 204602, 1000, &results,
benchmark_mode, kernel_regex);
- VOLK_PROFILE(volk_32f_log2_32f, 1e-3, 0, 204602, 1000, &results,
benchmark_mode, kernel_regex);
+ VOLK_PROFILE(volk_32f_log2_32f, 1.5e-1, 0, 204602, 1000, &results,
benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32f_expfast_32f, 1e-1, 0, 204602, 1000, &results,
benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32f_x2_pow_32f, 1e-2, 0, 204602, 1000, &results,
benchmark_mode, kernel_regex);
VOLK_PROFILE(volk_32f_sin_32f, 1e-6, 0, 204602, 1000, &results,
benchmark_mode, kernel_regex);
diff --git a/volk/kernels/volk/volk_32f_log2_32f.h
b/volk/kernels/volk/volk_32f_log2_32f.h
index 52c1b60..9452d35 100644
--- a/volk/kernels/volk/volk_32f_log2_32f.h
+++ b/volk/kernels/volk/volk_32f_log2_32f.h
@@ -145,30 +145,130 @@ static inline void volk_32f_log2_32f_a_sse4_1(float*
bVector, const float* aVect
#endif /* LV_HAVE_SSE4_1 for aligned */
-#endif /* INCLUDED_volk_32f_log2_32f_a_H */
-
-#ifndef INCLUDED_volk_32f_log2_32f_u_H
-#define INCLUDED_volk_32f_log2_32f_u_H
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+
+/* these macros allow us to embed logs in other kernels */
+#define VLOG2Q_NEON_PREAMBLE() \
+ int32x4_t one = vdupq_n_s32(0x000800000); \
+ /* minimax polynomial */ \
+ float32x4_t p0 = vdupq_n_f32(-3.0400402727048585); \
+ float32x4_t p1 = vdupq_n_f32(6.1129631282966113); \
+ float32x4_t p2 = vdupq_n_f32(-5.3419892024633207); \
+ float32x4_t p3 = vdupq_n_f32(3.2865287703753912); \
+ float32x4_t p4 = vdupq_n_f32(-1.2669182593441635); \
+ float32x4_t p5 = vdupq_n_f32(0.2751487703421256); \
+ float32x4_t p6 = vdupq_n_f32(-0.0256910888150985); \
+ int32x4_t exp_mask = vdupq_n_s32(0x7f800000); \
+ int32x4_t sig_mask = vdupq_n_s32(0x007fffff); \
+ int32x4_t exp_bias = vdupq_n_s32(127);
+
+
+#define VLOG2Q_NEON_F32(log2_approx, aval) \
+ int32x4_t exponent_i = vandq_s32(aval, exp_mask); \
+ int32x4_t significand_i = vandq_s32(aval, sig_mask); \
+ exponent_i = vshrq_n_s32(exponent_i, 23); \
+ \
+ /* extract the exponent and significand \
+ we can treat this as fixed point to save ~9% on the \
+ conversion + float add */ \
+ significand_i = vorrq_s32(one, significand_i); \
+ float32x4_t significand_f = vcvtq_n_f32_s32(significand_i,23); \
+ /* debias the exponent and convert to float */ \
+ exponent_i = vsubq_s32(exponent_i, exp_bias); \
+ float32x4_t exponent_f = vcvtq_f32_s32(exponent_i); \
+ \
+ /* put the significand through a polynomial fit of log2(x) [1,2]\
+ add the result to the exponent */ \
+ log2_approx = vaddq_f32(exponent_f, p0); /* p0 */ \
+ float32x4_t tmp1 = vmulq_f32(significand_f, p1); /* p1 * x */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_2 = vmulq_f32(significand_f, significand_f); /* x^2 */
\
+ tmp1 = vmulq_f32(sig_2, p2); /* p2 * x^2 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ \
+ float32x4_t sig_3 = vmulq_f32(sig_2, significand_f); /* x^3 */ \
+ tmp1 = vmulq_f32(sig_3, p3); /* p3 * x^3 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_4 = vmulq_f32(sig_2, sig_2); /* x^4 */ \
+ tmp1 = vmulq_f32(sig_4, p4); /* p4 * x^4 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_5 = vmulq_f32(sig_3, sig_2); /* x^5 */ \
+ tmp1 = vmulq_f32(sig_5, p5); /* p5 * x^5 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1); \
+ float32x4_t sig_6 = vmulq_f32(sig_3, sig_3); /* x^6 */ \
+ tmp1 = vmulq_f32(sig_6, p6); /* p6 * x^6 */ \
+ log2_approx = vaddq_f32(log2_approx, tmp1);
-#ifdef LV_HAVE_GENERIC
/*!
\brief Computes base 2 log of input vector and stores results in output
vector
\param bVector The vector where results will be stored
\param aVector The input vector of floats
\param num_points Number of points for which log is to be computed
*/
-static inline void volk_32f_log2_32f_u_generic(float* bVector, const float*
aVector, unsigned int num_points){
+static inline void volk_32f_log2_32f_neon(float* bVector, const float*
aVector, unsigned int num_points){
float* bPtr = bVector;
const float* aPtr = aVector;
- unsigned int number = 0;
+ unsigned int number;
+ const unsigned int quarterPoints = num_points / 4;
- for(number = 0; number < num_points; number++){
- *bPtr++ = log2(*aPtr++);
+ int32x4_t aval;
+ float32x4_t log2_approx;
+
+ VLOG2Q_NEON_PREAMBLE()
+ // lms
+ //p0 = vdupq_n_f32(-1.649132280361871);
+ //p1 = vdupq_n_f32(1.995047138579499);
+ //p2 = vdupq_n_f32(-0.336914839219728);
+
+ // keep in mind a single precision float is represented as
+ // (-1)^sign * 2^exp * 1.significand, so the log2 is
+ // log2(2^exp * sig) = exponent + log2(1 + significand/(1<<23)
+ for(number = 0; number < quarterPoints; ++number){
+ // load float in to an int register without conversion
+ aval = vld1q_s32((int*)aPtr);
+
+ VLOG2Q_NEON_F32(log2_approx, aval)
+
+ vst1q_f32(bPtr, log2_approx);
+
+ aPtr += 4;
+ bPtr += 4;
}
+ for(number = quarterPoints * 4; number < num_points; number++){
+ *bPtr++ = log2(*aPtr++);
+ }
}
-#endif /* LV_HAVE_GENERIC */
+
+#endif /* LV_HAVE_NEON */
+
+
+#endif /* INCLUDED_volk_32f_log2_32f_a_H */
+
+#ifndef INCLUDED_volk_32f_log2_32f_u_H
+#define INCLUDED_volk_32f_log2_32f_u_H
+
+
+//#ifdef LV_HAVE_GENERIC
+///*!
+// \brief Computes base 2 log of input vector and stores results in output
vector
+// \param bVector The vector where results will be stored
+// \param aVector The input vector of floats
+// \param num_points Number of points for which log is to be computed
+//*/
+//static inline void volk_32f_log2_32f_u_generic(float* bVector, const float*
aVector, unsigned int num_points){
+// float* bPtr = bVector;
+// const float* aPtr = aVector;
+// unsigned int number = 0;
+//
+// for(number = 0; number < num_points; number++){
+// *bPtr++ = log2(*aPtr++);
+// }
+//
+//}
+//#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE4_1
diff --git a/volk/lib/qa_utils.cc b/volk/lib/qa_utils.cc
index 3ab4a99..be20ed6 100644
--- a/volk/lib/qa_utils.cc
+++ b/volk/lib/qa_utils.cc
@@ -247,7 +247,7 @@ bool fcompare(t *in1, t *in2, unsigned int vlen, float tol)
{
}
}
// the primary test is the percent different greater than given tol
- else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/(((t *)in1)[i]) > tol)
{
+ else if(fabs(((t *)(in1))[i] - ((t *)(in2))[i])/fabs(((t *)in1)[i]) >
tol) {
fail=true;
if(print_max_errs-- > 0) {
std::cout << "offset " << i << " in1: " << t(((t *)(in1))[i])
<< " in2: " << t(((t *)(in2))[i]) << std::endl;
- [Commit-gnuradio] [gnuradio] 09/22: volk: add neon version for 32f_binary_slicer_8i, (continued)
- [Commit-gnuradio] [gnuradio] 09/22: volk: add neon version for 32f_binary_slicer_8i, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 08/22: volk: add neon protokernel for 16i_s32f_convert_32f., git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 02/22: volk: add neon kernels for 32fc->32f deinterleavers, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 07/22: volk: add neon kernel for 16i_32fc_dot_prod_32fc, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 11/22: volk: add neon version of 32u_byteswap, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 10/22: volk: removed unused variable from neon binary slicer, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 01/22: volk: add neon kernel for 16i_convert_8i, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 14/22: volk: fixing 32u_byteswap puppet for SSE, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 13/22: volk: add neon versions for 32i bitwise operators, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 22/22: Merge commit '0c92479f', git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 19/22: volk: add neon log2 implementation and fix QA to properly test,
git <=
- [Commit-gnuradio] [gnuradio] 15/22: volk: add neon version of complex<float> dot product, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 17/22: volk: add a neon table version of 16u_byteswap, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 21/22: volk: relax log2 qa constraints and use a higher order polynomial, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 20/22: volk: fix memory overrun/corruption in neon binary_slicer_8i, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 16/22: volk: add neon kernel for 64u_byteswap and puppets for 64/16 byteswap, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 18/22: volk: fixing *byteswap sse puppet signatures, git, 2014/10/31