[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Commit-gnuradio] [gnuradio] 09/22: volk: add neon version for 32f_binar
From: |
git |
Subject: |
[Commit-gnuradio] [gnuradio] 09/22: volk: add neon version for 32f_binary_slicer_8i |
Date: |
Fri, 31 Oct 2014 19:22:30 +0000 (UTC) |
This is an automated email from the git hooks/post-receive script.
jcorgan pushed a commit to branch master
in repository gnuradio.
commit c07d1a85c244215b5b9f1f4a078df99677175523
Author: Nathan West <address@hidden>
Date: Sun Oct 19 17:46:16 2014 -0500
volk: add neon version for 32f_binary_slicer_8i
---
volk/kernels/volk/volk_32f_binary_slicer_8i.h | 80 +++++++++++++++++++++++++++
1 file changed, 80 insertions(+)
diff --git a/volk/kernels/volk/volk_32f_binary_slicer_8i.h
b/volk/kernels/volk/volk_32f_binary_slicer_8i.h
index 88a25b7..aa14c79 100644
--- a/volk/kernels/volk/volk_32f_binary_slicer_8i.h
+++ b/volk/kernels/volk/volk_32f_binary_slicer_8i.h
@@ -206,4 +206,84 @@ volk_32f_binary_slicer_8i_u_sse2(int8_t* cVector, const
float* aVector,
#endif /* LV_HAVE_SSE2 */
+#ifdef LV_HAVE_NEON
+#include <arm_neon.h>
+/*!
+ \brief Returns integer 1 if float input is greater than or equal to 0, 1
otherwise
+ \param cVector The char (int8_t) output (either 0 or 1)
+ \param aVector The float input
+ \param num_points The number of values in aVector and stored into cVector
+*/
+static inline void
+volk_32f_binary_slicer_8i_neon(int8_t* cVector, const float* aVector,
+ unsigned int num_points)
+{
+ int8_t* cPtr = cVector;
+ const float* aPtr = aVector;
+ unsigned int number = 0;
+ unsigned int n8points = num_points / 8;
+
+ float32x4x2_t input_val0, input_val1;
+ float32x4_t zero_val;
+ uint32x4x2_t res0_u32, res1_u32;
+ uint16x4x2_t res0_u16x4, res1_u16x4;
+ uint16x8x2_t res_u16x8;
+ uint8x8x2_t res_u8;
+ uint8x8_t zero_u8, one;
+
+ zero_val = vdupq_n_f32(0.0);
+ one = vdup_n_u8(0x01);
+
+ // TODO: this is a good candidate for asm because the vcombines
+ // can be eliminated simply by picking dst registers that are
+ // adjacent.
+ for(number = 0; number < n8points; number++) {
+ input_val0 = vld2q_f32(aPtr);
+ input_val1 = vld2q_f32(aPtr+8);
+
+ // test against 0; return uint32
+ res0_u32.val[0] = vcgeq_f32(input_val0.val[0], zero_val);
+ res0_u32.val[1] = vcgeq_f32(input_val0.val[1], zero_val);
+ res1_u32.val[0] = vcgeq_f32(input_val1.val[0], zero_val);
+ res1_u32.val[1] = vcgeq_f32(input_val1.val[1], zero_val);
+
+ // narrow uint32 -> uint16 followed by combine to 8-element vectors
+ res0_u16x4.val[0] = vmovn_u32(res0_u32.val[0]);
+ res0_u16x4.val[1] = vmovn_u32(res0_u32.val[1]);
+ res1_u16x4.val[0] = vmovn_u32(res1_u32.val[0]);
+ res1_u16x4.val[1] = vmovn_u32(res1_u32.val[1]);
+
+ res_u16x8.val[0] = vcombine_u16(res0_u16x4.val[0], res1_u16x4.val[0]);
+ res_u16x8.val[1] = vcombine_u16(res0_u16x4.val[1], res1_u16x4.val[1]);
+
+ // narrow uint16x8 -> uint8x8
+ res_u8.val[0] = vmovn_u16(res_u16x8.val[0]);
+ res_u8.val[1] = vmovn_u16(res_u16x8.val[1]);
+ // we *could* load twice as much data and do another vcombine here
+ // to get a uint8x16x2 vector, still only do 2 vandqs and a single store
+ // but that turns out to be ~16% slower than this version on zc702
+ // it's possible register contention in GCC scheduler slows it down
+ // and a hand-written asm with quad-word u8 registers is much faster.
+
+ res_u8.val[0] = vand_u8(one, res_u8.val[0]);
+ res_u8.val[1] = vand_u8(one, res_u8.val[1]);
+
+ vst2_u8((unsigned char*)cPtr, res_u8);
+ cPtr += 8;
+ aPtr += 8;
+
+ }
+
+ for(number = n8points * 8; number < num_points; number++) {
+ if(*aPtr++ >= 0) {
+ *cPtr++ = 1;
+ }
+ else {
+ *cPtr++ = 0;
+ }
+ }
+}
+#endif /* LV_HAVE_NEON */
+
+
#endif /* INCLUDED_volk_32f_binary_slicer_8i_H */
- [Commit-gnuradio] [gnuradio] branch master updated (b3bbe56 -> 4869607), git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 03/22: volk: removing executable bit for volk_32f_x2_pow_32f.h, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 05/22: volk: popcnt support, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 04/22: volk: add neon support for 32fc_s32fc_multiply_32fc, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 12/22: volk: update profile to use the new 32u_byteswap puppet, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 06/22: volk: adding popcnt puppets to qa, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 09/22: volk: add neon version for 32f_binary_slicer_8i,
git <=
- [Commit-gnuradio] [gnuradio] 08/22: volk: add neon protokernel for 16i_s32f_convert_32f., git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 02/22: volk: add neon kernels for 32fc->32f deinterleavers, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 07/22: volk: add neon kernel for 16i_32fc_dot_prod_32fc, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 11/22: volk: add neon version of 32u_byteswap, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 10/22: volk: removed unused variable from neon binary slicer, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 01/22: volk: add neon kernel for 16i_convert_8i, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 14/22: volk: fixing 32u_byteswap puppet for SSE, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 13/22: volk: add neon versions for 32i bitwise operators, git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 22/22: Merge commit '0c92479f', git, 2014/10/31
- [Commit-gnuradio] [gnuradio] 19/22: volk: add neon log2 implementation and fix QA to properly test, git, 2014/10/31