bug-gnubg
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

[Bug-gnubg] Vectorizing 2nd step


From: Øystein Johansen
Subject: [Bug-gnubg] Vectorizing 2nd step
Date: Sat, 16 Apr 2005 18:47:05 +0200
User-agent: Mozilla Thunderbird 0.8 (Windows/20040913)

-----BEGIN PGP SIGNED MESSAGE-----
Hash: SHA1

Here's a patch for i386 / GCC vectorizing of the inner loops of
Evaluate(). I see some improvement, but I believe this can be improved
even further.

Some comments:
I believe having an integer counter in the loop slows it down. Can I exit
the loop in another way?

I initialize a vector for scaling in the second loop. I believe this can
be made simpler. Any suggestions?

Please comment on these two issues.

I get an internal compiler error when I compile with a gcc-4.1 snapshot.
This is compiled with gcc-3.4. I will also vectorize the other loops in
the evaluation function. And I will also vectorize the sigmoid function.

- -Øystein
-----BEGIN PGP SIGNATURE-----
Version: GnuPG v1.2.4 (MingW32)
Comment: Using GnuPG with Thunderbird - http://enigmail.mozdev.org

iD8DBQFCYUGJ6kDTFPhwyqYRAv9IAJoCTyBmxfknnxaIyezApxtYkfLP9gCePLdx
kiN6PdG/t/RW54PI20mLPJ0=
=kjeM
-----END PGP SIGNATURE-----

Index: neuralnet.c
===================================================================
RCS file: /cvsroot/gnubg/gnubg/lib/neuralnet.c,v
retrieving revision 1.23
diff -u -r1.23 neuralnet.c
--- neuralnet.c 25 Feb 2005 11:34:24 -0000      1.23
+++ neuralnet.c 16 Apr 2005 16:43:36 -0000
@@ -444,6 +444,21 @@
   return 0;
 }
 
+typedef float v4sf __attribute__ ((vector_size(16)));
+
+typedef union _vec4f {
+  v4sf v;
+  float f[4];
+} vec4f;
+
+#if DEBUG
+void
+printvec( v4sf vec ){
+  float *pFloat = ( float *) &vec;
+  printf("%f, %f, %f, %f\n", pFloat[0], pFloat[1], pFloat[2], pFloat[3]);
+}
+#endif
+
 static int Evaluate( neuralnet *pnn, float arInput[], float ar[],
                         float arOutput[], float *saveAr ) {
 
@@ -452,6 +467,9 @@
 #else
     int i, j;
     float *prWeight;
+    v4sf sum, vec0, vec1, vec3;
+    
+    assert(pnn->cHidden == 128);
 
     /* Calculate activity at hidden nodes */
     for( i = 0; i < pnn->cHidden; i++ )
@@ -466,11 +484,28 @@
            float *pr = ar;
 
            if( ari == 1.0f )
-               for( j = pnn->cHidden; j; j-- )
-                   *pr++ += *prWeight++;
-           else
-               for( j = pnn->cHidden; j; j-- )
-                   *pr++ += *prWeight++ * ari;
+               for( j = 32; j; j--, pr += 4, prWeight += 4 ){
+                   vec0 = __builtin_ia32_loadups(pr);  
+                  vec1 = __builtin_ia32_loadups(prWeight); 
+                   sum = __builtin_ia32_addps(vec0, vec1);
+                   __builtin_ia32_storeups (pr, sum);
+               }
+//                 *pr++ += *prWeight++;
+           else {
+               float scale[4];
+               v4sf scalevector;
+                scale[0] = scale[1] = scale[2] = scale[3] = ari;
+               scalevector = __builtin_ia32_loadups(scale);
+               for( j = 32; j; j--, pr += 4, prWeight += 4 ){
+                   vec0 = __builtin_ia32_loadups(pr);  
+                  vec1 = __builtin_ia32_loadups(prWeight); 
+                  vec3 = __builtin_ia32_mulps(vec1, scalevector);
+                   sum = __builtin_ia32_addps(vec0, vec3);
+                   __builtin_ia32_storeups (pr, sum);
+               }
+//             for( j = pnn->cHidden; j; j-- )
+//                 *pr++ += *prWeight++ * ari;
+           }
        } else
            prWeight += pnn->cHidden;
     }
@@ -484,14 +519,22 @@
 
     /* Calculate activity at output nodes */
     prWeight = pnn->arOutputWeight;
-
+    
     for( i = 0; i < pnn->cOutput; i++ ) {
-       float r = pnn->arOutputThreshold[ i ];
-       
-       for( j = 0; j < pnn->cHidden; j++ )
-           r += ar[ j ] * *prWeight++;
-
-       arOutput[ i ] = sigmoid( -pnn->rBetaOutput * r );
+       float r = pnn->arOutputThreshold[ i ];
+       float *pr = ar;
+       vec4f sum;
+       v4sf vec0, vec1, vec3;
+       sum.v = __builtin_ia32_xorps(sum.v, sum.v);
+       for( j = 32; j ; j--, prWeight += 4, pr += 4 ){
+         vec0 = __builtin_ia32_loadups(pr);       /* Four floats into vec0 */
+         vec1 = __builtin_ia32_loadups(prWeight); /* Four weights into vect1 */ 
+         vec3 = __builtin_ia32_mulps(vec0, vec1); /* Multiply */
+         sum.v = __builtin_ia32_addps(sum.v, vec3); /* Add */
+       }
+       
+       r += sum.f[0] + sum.f[1] + sum.f[2] + sum.f[3]; 
+       arOutput[ i ] = sigmoid( -pnn->rBetaOutput * r );
     }
 
     return 0;


reply via email to

[Prev in Thread] Current Thread [Next in Thread]