From: Yves Renard
Subject: [Getfem-commits] r5477 - in /trunk/getfem: contrib/opt_assembly/ doc/sphinx/source/project/ src/
Date: Thu, 10 Nov 2016 16:40:52 -0000

Author: renard
Date: Thu Nov 10 17:40:51 2016
New Revision: 5477

URL: http://svn.gna.org/viewcvs/getfem?rev=5477&view=rev
Log:
an optimisation for vector source term

Modified:
    trunk/getfem/contrib/opt_assembly/opt_assembly.cc
    trunk/getfem/doc/sphinx/source/project/libdesc_high_gen_assemb.rst
    trunk/getfem/src/getfem_generic_assembly.cc

Modified: trunk/getfem/contrib/opt_assembly/opt_assembly.cc
URL: http://svn.gna.org/viewcvs/getfem/trunk/getfem/contrib/opt_assembly/opt_assembly.cc?rev=5477&r1=5476&r2=5477&view=diff
==============================================================================
--- trunk/getfem/contrib/opt_assembly/opt_assembly.cc   (original)
+++ trunk/getfem/contrib/opt_assembly/opt_assembly.cc   Thu Nov 10 17:40:51 2016
@@ -677,7 +677,7 @@
   // Non-homogeneous elast: 0.34 | 2.26 | 0.09 | 0.15 | 0.06 | 0.13 |
   if (all || only_one == 2) // ndofu = 151959 ndofp =  50653 ndofchi = 6553
     test_new_assembly(3, 36, 1);
-  // Vector source term   : 0.25 | 0.79 |
+  // Vector source term   : 0.23 | 0.79 |
   // Nonlinear residual   : 0.46 |      |
   // Mass (scalar)        : 0.21 | 0.58 | 0.05 | 0.09 | 0.08 | 0.05 |
   // Mass (vector)        : 0.36 | 1.37 | 0.12 | 0.17 | 0.08 | 0.11 |
@@ -695,7 +695,7 @@
   // Non-homogeneous elast: 0.26 | 2.38 | 0.06 | 0.10 | 0.03 | 0.13 |
   if (all || only_one == 4) // ndofu = 151959 ndofp =  50653 ndofchi = 6553
     test_new_assembly(3, 18, 2);
-  // Vector source term   : 0.12 | 0.23 |
+  // Vector source term   : 0.10 | 0.23 |
   // Nonlinear residual   : 0.28 |      |
   // Mass (scalar)        : 0.11 | 0.25 | 0.05 | 0.05 | 0.03 | 0.03 |
   // Mass (vector)        : 0.29 | 0.89 | 0.11 | 0.16 | 0.03 | 0.10 |
@@ -704,7 +704,7 @@
   // Non-homogeneous elast: 1.68 | 9.08 | 0.59 | 0.73 | 0.03 | 0.92 |
   if (all || only_one == 5) // ndofu = 151959 ndofp =  50653 ndofchi = 6553
     test_new_assembly(3, 9, 4);
-  // Vector source term   : 0.10 | 0.19 |
+  // Vector source term   : 0.08 | 0.19 |
   // Nonlinear residual   : 0.27 |      |
   // Mass (scalar)        : 0.51 | 0.34 | 0.09 | 0.16 | 0.01 | 0.34 |
   // Mass (vector)        : 1.29 | 1.31 | 0.23 | 0.41 | 0.01 | 0.87 |
@@ -716,7 +716,9 @@
  // - Deactivating the debug tests has no noticeable effect.
   // - Compile time of assembly strings is negligible (< 0.0004)
   // - (J, K, B) computation takes half the computational time of the exec part
-  // - The optimized instruction call is negligible
+  // - The overhead of calling the optimized instructions' virtual functions
+  //   is negligible, so a real compilation (g++) avoiding them is not
+  //   necessary and would be far more expensive in compilation time.
  // - For a uniform mesh_fem, the resize operations have been suppressed and
  //   the "update pfp" has been isolated in a set of instructions executed
  //   only when the integration method changes.
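
For reference, the reduction these instructions specialize is, in index notation, C(m,n) = sum_i A(n,i) * B(m,i) (the "Ani Bmi -> Cmn" of the code comments). A minimal standalone sketch of the shortcut, assuming a hypothetical flat row-major layout and hypothetical names rather than GetFEM's base_tensor storage: when the second tensor is "vectorized of type 1", each row m carries a single nonzero b(m/Q) at column m mod Q, so the inner sum over i collapses to one product.

    // reduction_sketch.cc: standalone illustration (hypothetical layout and
    // names, not GetFEM's base_tensor storage).  Generic reduction
    // C(m,n) = sum_i A(n,i) * B(m,i), then the shortcut available when B is
    // "vectorized of type 1": B(m,i) = b(m/Q) * (i == m mod Q), so the sum
    // over i keeps a single term.
    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main() {
      const std::size_t M = 4, N = 3, I = 2, Q = I;  // extents (hypothetical)
      std::vector<double> A(N * I), B(M * I), C(M * N), C2(M * N);
      for (std::size_t k = 0; k < A.size(); ++k) A[k] = 0.5 + k;

      std::vector<double> b(M / Q);                  // scalar factor b(m/Q)
      for (std::size_t q = 0; q < b.size(); ++q) b[q] = 1.0 + q;
      for (std::size_t m = 0; m < M; ++m)            // build vectorized B
        for (std::size_t i = 0; i < I; ++i)
          B[m * I + i] = (i == m % Q) ? b[m / Q] : 0.0;

      // Generic reduction: M*N*I multiply-adds.
      for (std::size_t m = 0; m < M; ++m)
        for (std::size_t n = 0; n < N; ++n) {
          double s = 0.0;
          for (std::size_t i = 0; i < I; ++i) s += A[n * I + i] * B[m * I + i];
          C[m * N + n] = s;
        }

      // Shortcut: only the term i = m mod Q survives, M*N multiplies.
      for (std::size_t m = 0; m < M; ++m)
        for (std::size_t n = 0; n < N; ++n)
          C2[m * N + n] = A[n * I + (m % Q)] * b[m / Q];

      for (std::size_t k = 0; k < C.size(); ++k)
        if (C[k] != C2[k]) { std::cout << "mismatch\n"; return 1; }
      std::cout << "shortcut matches the generic reduction\n";
      return 0;
    }

This is the arithmetic saving the new opt0_1 instructions aim for; the committed versions additionally keep GetFEM's iterator-based traversal and are only dispatched behind the (currently disabled) sparsity tests shown in the diff below.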

Modified: trunk/getfem/doc/sphinx/source/project/libdesc_high_gen_assemb.rst
URL: http://svn.gna.org/viewcvs/getfem/trunk/getfem/doc/sphinx/source/project/libdesc_high_gen_assemb.rst?rev=5477&r1=5476&r2=5477&view=diff
==============================================================================
--- trunk/getfem/doc/sphinx/source/project/libdesc_high_gen_assemb.rst   (original)
+++ trunk/getfem/doc/sphinx/source/project/libdesc_high_gen_assemb.rst   Thu Nov 10 17:40:51 2016
@@ -102,10 +102,8 @@
        :header: "Scalar tensor", "Vectorized tensor"
        :widths: 5, 12
 
-       ":math:`\bar{t}(i) = \varphi_i(x)`",
-       ":math:`t(j,k) = \varphi_{j/Q}(x) \delta_{k, (j \mbox{ mod } Q)}`"
-       ":math:`[\varphi_0(x), \varphi_1(x)]`",
-       ":math:`[\varphi_0(x), 0, \varphi_1(x), 0, 0, \varphi_0(x), 0, 
\varphi_1(x)]`"
+       ":math:`\bar{t}(i) = \varphi_i(x)`", ":math:`t(j,k) = \varphi_{j/Q}(x) 
\delta_{k, (j \mbox{ mod } Q)}`"
+       ":math:`[\varphi_0(x), \varphi_1(x)]`", ":math:`[\varphi_0(x), 0, 
\varphi_1(x), 0, 0, \varphi_0(x), 0, \varphi_1(x)]`"
   
   - 2: Grad condensed format
 
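The vectorization rule documented in the corrected table, t(j,k) = phi_{j/Q}(x) delta_{k, j mod Q}, can be reproduced in a few lines. A self-contained sketch (hypothetical names; the flattening puts the first index fastest, which is what makes the table's Q = 2 example come out as [phi_0(x), 0, phi_1(x), 0, 0, phi_0(x), 0, phi_1(x)]):

    // vectorize_sketch.cc: builds t(j,k) = phi_{j/Q}(x) * delta_{k, j mod Q}
    // from scalar basis values (hypothetical names).
    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main() {
      const std::size_t Q = 2;                 // target vector dimension
      std::vector<double> phi = {0.25, 0.75};  // scalar values phi_0, phi_1
      const std::size_t nd = phi.size() * Q;   // vectorized dof count

      // First index fastest, so row j contributes at flat position j + k*nd.
      std::vector<double> t(nd * Q, 0.0);
      for (std::size_t j = 0; j < nd; ++j)
        t[j + (j % Q) * nd] = phi[j / Q];      // single nonzero per row

      // Prints 0.25 0 0.75 0 0 0.25 0 0.75, i.e. the table's
      // [phi_0, 0, phi_1, 0, 0, phi_0, 0, phi_1].
      for (double v : t) std::cout << v << ' ';
      std::cout << '\n';
      return 0;
    }
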

Modified: trunk/getfem/src/getfem_generic_assembly.cc
URL: http://svn.gna.org/viewcvs/getfem/trunk/getfem/src/getfem_generic_assembly.cc?rev=5477&r1=5476&r2=5477&view=diff
==============================================================================
--- trunk/getfem/src/getfem_generic_assembly.cc (original)
+++ trunk/getfem/src/getfem_generic_assembly.cc Thu Nov 10 17:40:51 2016
@@ -4761,14 +4761,71 @@
       : t(t_), tc1(tc1_), tc2(tc2_), nn(n_) {}
   };
 
-
   // Performs Ani Bmi -> Cmn
-  struct ga_instruction_reduction_opt1 : public ga_instruction {
+  struct ga_instruction_reduction_opt0_1 : public ga_instruction {
+    base_tensor &t, &tc1, &tc2;
+    size_type nn;
+    virtual int exec() {
+      cout << "Not unrolled " << nn << " tc1 = " << tc1 << endl;
+      cout << "tc2 = " << tc2 << endl;
+      GA_DEBUG_INFO("Instruction: reduction operation of size " << nn <<
+                   " optimized for vectorized second tensor of type 1");
+      size_type ss1=tc1.size(), s2=tc2.size()/nn, s2_n=s2/nn;
+      auto it = t.begin(), it2 = tc2.begin();
+      for (size_type i = 0; i < s2_n; ++i, it2 += nn) {
+       auto it1 = tc1.begin();
+       for (size_type j = 0; j < ss1; ++j) *it++ = (*it1++) * (*it2);
+      }
+      cout << "t = " << t << endl;
+      ga_instruction_reduction toto(t, tc1, tc2, nn);
+      toto.exec();
+      cout << "t = " << t << endl;
+
+      getchar();
+      return 0;
+    }
+    ga_instruction_reduction_opt0_1(base_tensor &t_, base_tensor &tc1_,
+                                   base_tensor &tc2_, size_type n_)
+      : t(t_), tc1(tc1_), tc2(tc2_), nn(n_) {}
+  };
+
+  template<int N> inline void reduc_elem_unrolled_opt1_
+  (base_vector::iterator &it, base_vector::iterator &it1, scalar_type a) {
+    *it++ = (*it1++) * a;
+    reduc_elem_unrolled_opt1_<N-1>(it, it1, a);
+  }
+  template<> inline void reduc_elem_unrolled_opt1_<1>
+  (base_vector::iterator &it, base_vector::iterator &it1, scalar_type a)
+  { *it++ = (*it1++) * a; }
+
+  // Performs Ani Bmi -> Cmn
+  template <int N>
+  struct ga_instruction_reduction_opt0_1_unrolled : public ga_instruction {
+    base_tensor &t, &tc1, &tc2;
+    size_type nn;
+    virtual int exec() {
+      GA_DEBUG_INFO("Instruction: unrolled reduction operation of size " << nn <<
+                   " optimized for vectorized second tensor of type 1");
+      size_type s2=tc2.size()/nn, s2_n=s2/nn;
+      base_vector::iterator it = t.begin(), it2 = tc2.begin();
+      for (size_type i = 0; i < s2_n; ++i, it2 += nn) {
+       auto it1 = tc1.begin();
+       reduc_elem_unrolled_opt1_<N>(it, it1, *it2);
+      }
+      return 0;
+    }
+    ga_instruction_reduction_opt0_1_unrolled(base_tensor &t_, base_tensor &tc1_,
+                                            base_tensor &tc2_, size_type n_)
+      : t(t_), tc1(tc1_), tc2(tc2_), nn(n_) {}
+  };
+
+  // Performs Ani Bmi -> Cmn
+  struct ga_instruction_reduction_opt1_1 : public ga_instruction {
     base_tensor &t, &tc1, &tc2;
     size_type nn;
     virtual int exec() {
      GA_DEBUG_INFO("Instruction: reduction operation of size " << nn <<
-                   " optimized for vectorized tensor");
+                   " optimized for both vectorized tensors of type 1");
       size_type s1 = tc1.size()/nn, s2 = tc2.size()/nn, s2_1 = s2+1;
       GA_DEBUG_ASSERT(t.size() == s2*s1, "Internal error");
       size_type ss1 = s1/nn, ss2 = s2/nn;
@@ -4786,8 +4843,8 @@
       }      
       return 0;
     }
-    ga_instruction_reduction_opt1(base_tensor &t_, base_tensor &tc1_,
-                                  base_tensor &tc2_, size_type n_)
+    ga_instruction_reduction_opt1_1(base_tensor &t_, base_tensor &tc1_,
+                                   base_tensor &tc2_, size_type n_)
       : t(t_), tc1(tc1_), tc2(tc2_), nn(n_) {}
   };
 
@@ -4909,50 +4966,52 @@
 
 
   pga_instruction ga_instruction_reduction_switch
-  (assembly_tensor &t, assembly_tensor &tc1, assembly_tensor &tc2,
+  (assembly_tensor &t_, assembly_tensor &tc1_, assembly_tensor &tc2_,
    size_type n, bool &to_clear) {
-
-    if (tc1.sparsity() == 1 && tc2.sparsity() == 1 &&
-       tc1.qdim() == n && tc2.qdim() == n) {
+    base_tensor &t = t_.tensor(), &tc1 = tc1_.tensor(), &tc2 = tc2_.tensor();
+
+    if (tc1_.sparsity() == 1 && tc2_.sparsity() == 1 &&
+       tc1_.qdim() == n && tc2_.qdim() == n) {
       to_clear = true;
-      t.set_sparsity(10, tc1.qdim());
-      return std::make_shared<ga_instruction_reduction_opt1>
-       (t.tensor(), tc1.tensor(), tc2.tensor(), n);
-    }
-    
+      t_.set_sparsity(10, tc1_.qdim());
+      return std::make_shared<ga_instruction_reduction_opt1_1>(t, tc1, tc2, n);
+    }
+    if (tc2_.sparsity() == 1 && false)
+      return std::make_shared<ga_instruction_reduction_opt0_1>(t, tc1, tc2, n);
+
     switch(n) {
     case  2 : return std::make_shared<ga_instruction_reduction_unrolled< 2>>
-                    (t.tensor(), tc1.tensor(), tc2.tensor());
+                    (t, tc1, tc2);
     case  3 : return std::make_shared<ga_instruction_reduction_unrolled< 3>>
-                     (t.tensor(), tc1.tensor(), tc2.tensor());
+                    (t, tc1, tc2);
     case  4 : return std::make_shared<ga_instruction_reduction_unrolled< 4>>
-                     (t.tensor(), tc1.tensor(), tc2.tensor());
+                    (t, tc1, tc2);
     case  5 : return std::make_shared<ga_instruction_reduction_unrolled< 5>>
-                     (t.tensor(), tc1.tensor(), tc2.tensor());
+                     (t, tc1, tc2);
     case  6 : return std::make_shared<ga_instruction_reduction_unrolled< 6>>
-                     (t.tensor(), tc1.tensor(), tc2.tensor());
+                     (t, tc1, tc2);
     case  7 : return std::make_shared<ga_instruction_reduction_unrolled< 7>>
-                     (t.tensor(), tc1.tensor(), tc2.tensor());
+                     (t, tc1, tc2);
     case  8 : return std::make_shared<ga_instruction_reduction_unrolled< 8>>
-                     (t.tensor(), tc1.tensor(), tc2.tensor());
+                     (t, tc1, tc2);
     case  9 : return std::make_shared<ga_instruction_reduction_unrolled< 9>>
-                     (t.tensor(), tc1.tensor(), tc2.tensor());
+                     (t, tc1, tc2);
     case 10 : return std::make_shared<ga_instruction_reduction_unrolled<10>>
-                     (t.tensor(), tc1.tensor(), tc2.tensor());
+                     (t, tc1, tc2);
     case 11 : return std::make_shared<ga_instruction_reduction_unrolled<11>>
-                     (t.tensor(), tc1.tensor(), tc2.tensor());
+                     (t, tc1, tc2);
     case 12 : return std::make_shared<ga_instruction_reduction_unrolled<12>>
-                     (t.tensor(), tc1.tensor(), tc2.tensor());
+                     (t, tc1, tc2);
     case 13 : return std::make_shared<ga_instruction_reduction_unrolled<13>>
-                     (t.tensor(), tc1.tensor(), tc2.tensor());
+                     (t, tc1, tc2);
     case 14 : return std::make_shared<ga_instruction_reduction_unrolled<14>>
-                     (t.tensor(), tc1.tensor(), tc2.tensor());
+                     (t, tc1, tc2);
     case 15 : return std::make_shared<ga_instruction_reduction_unrolled<15>>
-                    (t.tensor(), tc1.tensor(), tc2.tensor());
+                    (t, tc1, tc2);
     case 16 : return std::make_shared<ga_instruction_reduction_unrolled<16>>
-                    (t.tensor(), tc1.tensor(), tc2.tensor());
+                    (t, tc1, tc2);
     default : return std::make_shared<ga_instruction_reduction>
-                    (t.tensor(), tc1.tensor(), tc2.tensor(), n);
+                    (t, tc1, tc2, n);
     }
   }
 
@@ -4965,8 +5024,40 @@
        tc1_.qdim() == n && tc2_.qdim() == n) {
       to_clear = true;
       t_.set_sparsity(10, tc1_.qdim());
-      return std::make_shared<ga_instruction_reduction_opt1>(t,tc1,tc2,n);
-    }
+      return std::make_shared<ga_instruction_reduction_opt1_1>(t,tc1,tc2,n);
+    }
+    if (tc2_.sparsity() == 1 && false) {
+      cout << "tc1.size() = " << tc1.size() << endl;
+      switch(tc1.size()) {
+      case 1 : break;
+      case 2:
+       if (tc2.size() > 16)
+         return std::make_shared<ga_instruction_reduction_opt0_1_unrolled<2>>
+           (t, tc1, tc2, n);
+       break;
+      case 3:
+       return std::make_shared<ga_instruction_reduction_opt0_1_unrolled<3>>
+         (t, tc1, tc2, n);
+      case 4:
+       return std::make_shared<ga_instruction_reduction_opt0_1_unrolled<4>>
+         (t, tc1, tc2, n);
+      case 5:
+       return std::make_shared<ga_instruction_reduction_opt0_1_unrolled<5>>
+         (t, tc1, tc2, n);
+      case 6:
+       return std::make_shared<ga_instruction_reduction_opt0_1_unrolled<6>>
+         (t, tc1, tc2, n);
+      case 7:
+       return std::make_shared<ga_instruction_reduction_opt0_1_unrolled<7>>
+         (t, tc1, tc2, n);
+      case 8:
+       return std::make_shared<ga_instruction_reduction_opt0_1_unrolled<8>>
+         (t, tc1, tc2, n);
+      default:
+       return std::make_shared<ga_instruction_reduction_opt0_1>(t,tc1,tc2, n);
+      }
+    }
+
     // Only specialized for certain values
     size_type s2 = tc2.size()/n;
     switch(s2) {
@@ -12662,6 +12753,17 @@
       const ga_instruction_list &gilb = instr.second.begin_instructions;
       const ga_instruction_list &gile = instr.second.elt_instructions;
       const ga_instruction_list &gil = instr.second.instructions;
+#if 0
+      if (gilb.size()) cout << "Begin instructions\n";
+      for (size_type j = 0; j < gilb.size(); ++j)
+       cout << typeid(*(gilb[j])).name() << endl;
+      if (gile.size()) cout << "\nElement instructions\n";
+      for (size_type j = 0; j < gile.size(); ++j)
+       cout << typeid(*(gile[j])).name() << endl;
+      cout << "\nGauss pt instructions\n";
+      for (size_type j = 0; j < gil.size(); ++j)
+       cout << typeid(*(gil[j])).name() << endl;
+#endif
       const mesh_region &region = *(instr.first.region());
 
       // iteration on elements (or faces of elements)
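
The unrolled opt0_1 variants rely on recursive template instantiation to peel the inner element loop at compile time, with an explicit specialization terminating the recursion at N = 1, the same idiom as reduc_elem_unrolled_opt1_ above. A minimal self-contained sketch of that idiom (simplified iterator types and hypothetical names):

    // unroll_sketch.cc: recursive template unrolling, the idiom behind
    // reduc_elem_unrolled_opt1_<N> (simplified signature, hypothetical names).
    #include <iostream>
    #include <vector>

    // Perform *it++ = (*it1++) * a exactly N times, the count resolved at
    // compile time: each instantiation emits one multiply-store and
    // delegates the rest to the next smaller N.
    template <int N>
    inline void scale_unrolled(std::vector<double>::iterator &it,
                               std::vector<double>::const_iterator &it1,
                               double a) {
      *it++ = (*it1++) * a;
      scale_unrolled<N - 1>(it, it1, a);
    }
    // Explicit specialization terminates the recursion at N = 1.
    template <>
    inline void scale_unrolled<1>(std::vector<double>::iterator &it,
                                  std::vector<double>::const_iterator &it1,
                                  double a) {
      *it++ = (*it1++) * a;
    }

    int main() {
      std::vector<double> src = {1, 2, 3, 4}, dst(4, 0.0);
      auto out = dst.begin();
      auto in  = src.cbegin();
      scale_unrolled<4>(out, in, 0.5);   // dst becomes {0.5, 1, 1.5, 2}
      for (double v : dst) std::cout << v << ' ';
      std::cout << '\n';
      return 0;
    }

The compiler then sees a fixed straight-line sequence instead of a counted loop, which is why the dispatch switch in the diff selects a dedicated instantiation for each small size.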



