[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Commit-gnuradio] [gnuradio] 03/14: volk: temp log kernels.
From: |
git |
Subject: |
[Commit-gnuradio] [gnuradio] 03/14: volk: temp log kernels. |
Date: |
Wed, 15 Oct 2014 23:25:08 +0000 (UTC) |
This is an automated email from the git hooks/post-receive script.
trondeau pushed a commit to branch master
in repository gnuradio.
commit d0c0e856c0618d87ff88ca53fd63584679e2fb56
Author: Abhishek Bhowmick <address@hidden>
Date: Thu Jun 5 11:16:41 2014 +0530
volk: temp log kernels.
---
volk/kernels/volk/volk_32f_log2_32f_mine.cc | 169 ++++++++++++++++++++++
volk/kernels/volk/volk_32f_log2_32f_mitLicense.cc | 115 +++++++++++++++
2 files changed, 284 insertions(+)
diff --git a/volk/kernels/volk/volk_32f_log2_32f_mine.cc
b/volk/kernels/volk/volk_32f_log2_32f_mine.cc
new file mode 100644
index 0000000..94d3daa
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_log2_32f_mine.cc
@@ -0,0 +1,169 @@
+
+#include <stdio.h>
+#include <math.h>
+#include <inttypes.h>
+#include <iostream>
+#include <immintrin.h>
+
+void print128_num(__m128i var)
+{
+ uint32_t *val = (uint32_t*) &var;//can also use uint32_t instead of 16_t
+ printf("Vector of Ints: %x %x %d %d \n",
+ val[3], val[2], val[1], val[0]);
+}
+
+void print128f_num(__m128 var)
+{
+ float *val = (float*) &var;//can also use uint32_t instead of 16_t
+ printf("Vector of Floats: %f %f %f %f \n",
+ val[3], val[2], val[1], val[0]);
+}
+
+static inline void volk_32f_log2_32f_a_avx(float* bVector, float* aVector,
unsigned int num_points){
+
+ float* bPtr = bVector;
+ float* aPtr = aVector;
+ int exp1 = 0x7f800000;
+
+ __m128 aVal, bVal, charac, mantissa;
+ __m128i bias = _mm_set1_epi32(127);
+ __m128i aVali = _mm_castps_si128(aVal);
+ __m128i exp = _mm_set1_epi32(exp1);
+ aVal = _mm_load_ps(aPtr);
+ exp = _mm_and_si128(aVali, exp);
+ exp = _mm_srli_epi32(exp, 23);
+ exp = _mm_sub_epi32(exp, bias);
+ charac = _mm_cvtepi32_ps(exp);
+
+ // Now to extract mantissa
+ int frac1 = 0x007fffff;
+ __m128i fracMask = _mm_set1_epi32(frac1);
+ __m128 frac = _mm_castsi128_ps(fracMask);
+ frac = _mm_and_ps(aVal, frac);
+ __m128 leadingOne = _mm_set1_ps(1.0f);
+ frac = _mm_or_ps(leadingOne, frac);
+
+
+ // Add the decimal bits one by one, can use LUT to speed up
+ __m128 compareResults, compareResultsNot, compareTo, fracByTwo, allOnes, a,
b, c;
+ __m128i allOnesi;
+ allOnesi = _mm_set1_epi32(0xffffffff);
+ allOnes = _mm_castsi128_ps(allOnesi);
+ compareTo = _mm_set1_ps(2.0);
+
+ mantissa = _mm_set1_ps(0.5);
+ frac = _mm_mul_ps(frac, frac);
+ compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare
+ compareResultsNot = _mm_andnot_ps(compareResults, allOnes);
+ fracByTwo = _mm_div_ps(frac, compareTo);
+ a = _mm_and_ps(compareResults, fracByTwo);
+ //b = _mm_and_ps(compareResultsNot, frac);
+ frac = _mm_sub_ps(frac, a);
+ c = _mm_and_ps(compareResults, mantissa);
+ charac = _mm_add_ps(charac, c);
+
+
+ mantissa = _mm_set1_ps(0.25);
+ frac = _mm_mul_ps(frac, frac);
+ compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare
+ compareResultsNot = _mm_andnot_ps(compareResults, allOnes);
+ fracByTwo = _mm_div_ps(frac, compareTo);
+ a = _mm_and_ps(compareResults, fracByTwo);
+ //b = _mm_and_ps(compareResultsNot, frac);
+ frac = _mm_sub_ps(frac, a);
+ charac = _mm_add_ps(charac, _mm_and_ps(compareResults, mantissa));
+
+
+ mantissa = _mm_set1_ps(0.125);
+ frac = _mm_mul_ps(frac, frac);
+ compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare
+ compareResultsNot = _mm_andnot_ps(compareResults, allOnes);
+ fracByTwo = _mm_div_ps(frac, compareTo);
+ a = _mm_and_ps(compareResults, fracByTwo);
+ b = _mm_and_ps(compareResultsNot, frac);
+ frac = _mm_add_ps(a, b);
+ charac = _mm_add_ps(charac, _mm_and_ps(compareResults, mantissa));
+
+/*
+ mantissa = _mm_set1_ps(0.0625);
+ frac = _mm_mul_ps(frac, frac);
+ compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare
+ compareResultsNot = _mm_andnot_ps(compareResults, allOnes);
+ fracByTwo = _mm_div_ps(frac, compareTo);
+ a = _mm_and_ps(compareResults, fracByTwo);
+ b = _mm_and_ps(compareResultsNot, frac);
+ frac = _mm_add_ps(a, b);
+ charac = _mm_add_ps(charac, _mm_and_ps(compareResults, mantissa));
+
+ mantissa = _mm_set1_ps(0.03125);
+ frac = _mm_mul_ps(frac, frac);
+ compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare
+ compareResultsNot = _mm_andnot_ps(compareResults, allOnes);
+ fracByTwo = _mm_div_ps(frac, compareTo);
+ a = _mm_and_ps(compareResults, fracByTwo);
+ b = _mm_and_ps(compareResultsNot, frac);
+ frac = _mm_add_ps(a, b);
+ charac = _mm_add_ps(charac, _mm_and_ps(compareResults, mantissa));
+
+ mantissa = _mm_set1_ps(0.015625);
+ frac = _mm_mul_ps(frac, frac);
+ compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare
+ compareResultsNot = _mm_andnot_ps(compareResults, allOnes);
+ fracByTwo = _mm_div_ps(frac, compareTo);
+ a = _mm_and_ps(compareResults, fracByTwo);
+ b = _mm_and_ps(compareResultsNot, frac);
+ frac = _mm_add_ps(a, b);
+ charac = _mm_add_ps(charac, _mm_and_ps(compareResults, mantissa));
+
+ mantissa = _mm_set1_ps(0.0078125);
+ frac = _mm_mul_ps(frac, frac);
+ compareResults = _mm_cmpge_ps(frac, compareTo); // ge compare
+ compareResultsNot = _mm_andnot_ps(compareResults, allOnes);
+ fracByTwo = _mm_div_ps(frac, compareTo);
+ a = _mm_and_ps(compareResults, fracByTwo);
+ b = _mm_and_ps(compareResultsNot, frac);
+ frac = _mm_add_ps(a, b);
+ charac = _mm_add_ps(charac, _mm_and_ps(compareResults, mantissa));
+*/
+ _mm_store_ps(bPtr, charac);
+}
+
+
+int main(){
+
+clock_t start, end;
+double time;
+float dummy;
+//float input[] = {0.5, 1.0, 4.0, 9.5};
+float* input = new float(4*sizeof(float));
+input[0] = 0.125;
+input[1] = 2.4;
+input[2] = 8.0;
+input[3] = 16.0;
+float* output = new float(4*sizeof(float));
+
+start = clock();
+for(int i = 0; i < 10000000; i++)
+ volk_32f_log2_32f_a_avx(output, input, 4);
+end = clock();
+time = 1000 * (double)(end-start);
+std::cout << "my time is " << time << std::endl;
+
+start = clock();
+for(int j = 0; j < 40000000; j++)
+ dummy = log2(input[0]);
+end = clock();
+time = 1000 * (double)(end-start);
+std::cout << "cmath time is " << time << std::endl;
+
+
+for(int i =0; i < 4; i ++){
+ std::cout << input[i] << "\t" << output[i];
+ std::cout << " cmath : \t " << log2(input[i]) << std::endl;
+}
+
+delete input;
+delete output;
+return 0;
+
+}
diff --git a/volk/kernels/volk/volk_32f_log2_32f_mitLicense.cc
b/volk/kernels/volk/volk_32f_log2_32f_mitLicense.cc
new file mode 100644
index 0000000..7ee18cc
--- /dev/null
+++ b/volk/kernels/volk/volk_32f_log2_32f_mitLicense.cc
@@ -0,0 +1,115 @@
+
+#include <stdio.h>
+#include <math.h>
+#include <inttypes.h>
+#include <iostream>
+#include <immintrin.h>
+
+
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x),
_mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x),
_mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3),
x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2,
c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1,
c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+#define LOG_POLY_DEGREE 3
+
+void print128_num(__m128i var)
+{
+ uint32_t *val = (uint32_t*) &var;//can also use uint32_t instead of 16_t
+ printf("Vector of Ints: %x %x %d %d \n",
+ val[3], val[2], val[1], val[0]);
+}
+
+void print128f_num(__m128 var)
+{
+ float *val = (float*) &var;//can also use uint32_t instead of 16_t
+ printf("Vector of Floats: %f %f %f %f \n",
+ val[3], val[2], val[1], val[0]);
+}
+
+static inline void volk_32f_log2_32f_a_avx(float* bVector, float* aVector,
unsigned int num_points){
+
+ float* bPtr = bVector;
+ float* aPtr = aVector;
+ int exp1 = 0x7f800000;
+
+ __m128 aVal, bVal, charac, mantissa, frac;
+ __m128i bias = _mm_set1_epi32(127);
+ __m128i aVali = _mm_castps_si128(aVal);
+ __m128i exp = _mm_set1_epi32(exp1);
+ aVal = _mm_load_ps(aPtr);
+ exp = _mm_and_si128(aVali, exp);
+ exp = _mm_srli_epi32(exp, 23);
+ exp = _mm_sub_epi32(exp, bias);
+ charac = _mm_cvtepi32_ps(exp);
+
+ // Now to extract mantissa
+ int frac1 = 0x007fffff;
+ __m128i fracMask = _mm_set1_epi32(frac1);
+ frac = _mm_castsi128_ps(fracMask);
+ frac = _mm_and_ps(aVal, frac);
+ __m128 leadingOne = _mm_set1_ps(1.0f);
+ frac = _mm_or_ps(leadingOne, frac);
+
+
+ // This is where MIT Licensed code starts
+ #if LOG_POLY_DEGREE == 6
+ mantissa = POLY5( frac, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f,
3.1821337e-1f, -3.4436006e-2f);
+#elif LOG_POLY_DEGREE == 5
+ mantissa = POLY4( frac, 2.8882704548164776201f, -2.52074962577807006663f,
1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+ mantissa = POLY3( frac, 2.61761038894603480148f, -1.75647175389045657003f,
0.688243882994381274313f, -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+ mantissa = POLY2( frac, 2.28330284476918490682f, -1.04913055217340124191f,
0.204446009836232697516f);
+#else
+#error
+#endif
+
+ mantissa = _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne));
+
+ bVal = _mm_add_ps(charac, mantissa);
+
+ _mm_store_ps(bPtr, bVal);
+}
+
+
+int main(){
+
+clock_t start, end;
+double time;
+float dummy;
+//float input[] = {0.5, 1.0, 4.0, 9.5};
+float* input = new float(4*sizeof(float));
+input[0] = 0.125;
+input[1] = 2.4;
+input[2] = 8.0;
+input[3] = 16.0;
+float* output = new float(4*sizeof(float));
+
+start = clock();
+for(int i = 0; i < 10000000; i++)
+ volk_32f_log2_32f_a_avx(output, input, 4);
+end = clock();
+time = 1000 * (double)(end-start);
+std::cout << "my time is " << time << std::endl;
+
+start = clock();
+for(int j = 0; j < 40000000; j++)
+ dummy = log2(input[0]);
+end = clock();
+time = 1000 * (double)(end-start);
+std::cout << "cmath time is " << time << std::endl;
+
+
+for(int i =0; i < 4; i ++){
+ std::cout << input[i] << "\t" << output[i];
+ std::cout << " cmath : \t " << log2(input[i]) << std::endl;
+}
+
+delete input;
+delete output;
+return 0;
+
+}
- [Commit-gnuradio] [gnuradio] branch master updated (d50c57a -> 8ebe90f), git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 03/14: volk: temp log kernels.,
git <=
- [Commit-gnuradio] [gnuradio] 04/14: volk: Added log2, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 06/14: volk: expfast comments, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 08/14: volk: Added sin, cos kernels., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 12/14: volk: fixed some warnings, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 13/14: volk: fixed a problem with acos during some translation in the git history., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 09/14: volk: Added tan kernel., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 07/14: volk: added power kernel., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 10/14: volk: Added atan, asin, acos kernels., git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 01/14: added new proto-kernels, git, 2014/10/15
- [Commit-gnuradio] [gnuradio] 05/14: volk: Added avx proto-kernel for fast exp., git, 2014/10/15