Hello guys,

I've spent some time experimenting with various performance optimisations
and I would like to share my latest results with you:

I've run a lot of tests using Callgrind, which is part of the
Valgrind<http://valgrind.org/>tool (documentation
here <http://valgrind.org/docs/manual/cl-manual.html>). In doing so, I've
concluded that a disproportionate amount of time is spent copying values
(this can be parallelised; more about that below).

I set out to see how much faster I could make simple test program that
applied a monadic scalar function. Here is my test program:

∇Z←testinv;tmp
src←10000 4000⍴÷⍳100
'Starting'
tmp←{--------⍵} time src
Z←1
∇


This program calls my time operator which simply shows the amount of time
it took to execute the operation. This is of course needed for
benchmarking. For completeness, here is the implementation of time:

∇Z←L (OP time) R;start;end
start←⎕AI
→(0≠⎕NC 'L')/twoargs
Z←OP R
→finish
twoargs:
Z←L OP R
finish:
end←⎕AI
'Time:',((end[3]+end[2]×1E6) - (start[3]+start[2]×1E6))÷1E6
∇


The unmodified version of GNU APL runs this in *5037.00* milliseconds on my
machine.

I then set out to minimise the amount of cloning of values, taking
advantage of the existing temp functionality. Once I had done this, the
execution time was reduced to *2577.00* ms.

I then used the Threading Building
Blocks<https://www.threadingbuildingblocks.org/>library to parallelise
two operations: The clone operation and the monadic
SkalarFunction::eval_skalar_B(). After this, on my 4-core machine, the
runtime was reduced to *1430.00* ms.

Threading Building Blocks is available from the application repositories of
at least Arch Linux and Ubuntu, and I'm sure it's available elsewhere too.
To test in on OSX I had to download it separately.

To summarise:

   - Standard: 5037.00
   - Reduced cloning: 2577.00
   - Parallel: 1430.00

I have attached the patch, but it's definitely not something that should be
applied blindly. I have hacked around is several parts of the code, some of
which I can't say I understand fully, so see it as a proof-of-concept,
nothing else.

Note that the code that implements the parallelism using TBB is pretty
ugly, and the code ends up being duplicated in the parallel and
non-parallel version. This can, of course, be encapsulated much nicer if
one wants to make this generic.

Another thing, TBB is incredibly efficient, especially on Intel CPU's. I'd
be very interested to see how OpenMP performs on this same code.

Regards,
Elias
Index: src/SkalarFunction.cc
===================================================================
--- src/SkalarFunction.cc       (revision 164)
+++ src/SkalarFunction.cc       (working copy)
@@ -18,6 +18,9 @@
     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
+#include <tbb/tbb.h>
+#include <time.h>
+
 #include "SkalarFunction.hh"
 #include "Value.hh"
 #include "Workspace.hh"
@@ -57,27 +60,54 @@
 Bif_F12_WITHOUT   Bif_F12_WITHOUT::fun;
 
 //-----------------------------------------------------------------------------
+
+class ApplyRavel {
+public:
+    ApplyRavel( SkalarFunction &function_in,
+                Value_P &B_in,
+                Value_P &Z_in,
+                prim_f1 &fun_in )
+        : function( function_in ), B( B_in ), Z( Z_in ), fun( fun_in ) {}
+
+    void operator()( const tbb::blocked_range<ShapeItem> &range ) const {
+        for( ShapeItem c = range.begin() ; c != range.end() ; c++ ) {
+            const Cell * cell_B =  &B->get_ravel(c);
+            Cell * cell_Z = &Z->get_ravel(c);
+            if (cell_B->is_pointer_cell())
+            {
+                Token token = 
function.eval_skalar_B(cell_B->get_pointer_value(), fun);
+                new (cell_Z) PointerCell(token.get_apl_val());
+            }
+            else
+            {
+                (cell_B->*fun)(cell_Z);
+            }
+        }
+    }
+
+private:
+    SkalarFunction &function;
+    Value_P &B;
+    Value_P &Z;
+    prim_f1 &fun;
+};
+
 Token
 SkalarFunction::eval_skalar_B(Value_P B, prim_f1 fun)
 {
 const ShapeItem count = B->element_count();
    if (count == 0)   return eval_fill_B(B);
 
-Value_P Z(new Value(B->get_shape(), LOC));
-   loop(c, count)
-       {
-         const Cell * cell_B =  &B->get_ravel(c);
-         Cell * cell_Z = &Z->get_ravel(c);
-         if (cell_B->is_pointer_cell())
-            {
-              Token token = eval_skalar_B(cell_B->get_pointer_value(), fun);
-              new (cell_Z) PointerCell(token.get_apl_val());
-            }
-         else
-            {
-              (cell_B->*fun)(cell_Z);
-            }
-       }
+   Value_P Z;
+   if( B->is_temp() ) {
+       Z = B;
+   }
+   else {
+       Z = new Value(B->get_shape(), LOC);
+       Z->set_temp();
+   }
+//Value_P Z(new Value(B->get_shape(), LOC));
+//   Value_P Z(Zp);
 
    if (count == 0)   // Z was empty (hence B was empty)
       {
@@ -85,7 +115,36 @@
         if (cB.is_pointer_cell())   Z->get_ravel(0).init(cB);
         else                        new (&Z->get_ravel(0)) IntCell(0);
       }
+   else {
+       struct timespec start_ts, end_ts;
+       clock_gettime( CLOCK_REALTIME, &start_ts );
+#ifdef PARALLEL_DISABLED
+           loop(c, count)
+           {
+               const Cell * cell_B =  &B->get_ravel(c);
+               Cell * cell_Z = &Z->get_ravel(c);
+               if (cell_B->is_pointer_cell())
+               {
+                   Token token = eval_skalar_B(cell_B->get_pointer_value(), 
fun);
+                   new (cell_Z) PointerCell(token.get_apl_val());
+               }
+               else
+               {
+                   (cell_B->*fun)(cell_Z);
+               }
+           }
+#else
+           tbb::parallel_for( tbb::blocked_range<ShapeItem>( 0, count ), 
ApplyRavel( *this, B, Z, fun ) );
+#endif
 
+#if 0
+           clock_gettime( CLOCK_REALTIME, &end_ts );
+           long millis = (end_ts.tv_sec * 1000 + end_ts.tv_nsec / 1000000)
+               - (start_ts.tv_sec * 1000 + start_ts.tv_nsec / 1000000);
+           cout << "Total time: " << millis << endl;
+#endif
+   }
+
    Z->check_value(LOC);
    return Token(TOK_APL_VALUE1, Z);
 }
Index: src/Symbol.cc
===================================================================
--- src/Symbol.cc       (revision 164)
+++ src/Symbol.cc       (working copy)
@@ -126,7 +126,10 @@
    switch(vs.name_class)
       {
         case NC_UNUSED_USER_NAME:
-             new_value = new_value->clone(loc);
+            if( !new_value->is_temp() ) {
+                new_value = new_value->clone(loc);
+            }
+             new_value->clear_temp();
 
              vs.name_class = NC_VARIABLE;
              vs.apl_val = new_value;
@@ -136,7 +139,10 @@
         case NC_VARIABLE:
              if (vs.apl_val == new_value)   return;   // X←X
 
-             new_value = new_value->clone(loc);
+             if( !new_value->is_temp() ) {
+                 new_value = new_value->clone(loc);
+             }
+             new_value->clear_temp();
 
              // un-assign and erase old value
              //
Index: src/Value.cc
===================================================================
--- src/Value.cc        (revision 164)
+++ src/Value.cc        (working copy)
@@ -18,6 +18,8 @@
     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
+#include <tbb/tbb.h>
+
 #include "CDR_string.hh"
 #include "CharCell.hh"
 #include "Common.hh"
@@ -127,6 +129,7 @@
      flags(VF_NONE),
      valid_ravel_items(0)
 {
+    set_temp();
    ADD_EVENT(this, VHE_Create, 0, loc);
    init_ravel();
 }
@@ -136,6 +139,7 @@
      flags(VF_forever),
      valid_ravel_items(0)
 {
+    set_temp();
    // default shape is skalar
    //
    ADD_EVENT(this, VHE_Create, 0, loc);
@@ -249,6 +253,7 @@
      flags(VF_NONE),
      valid_ravel_items(0)
 {
+    set_temp();
    ADD_EVENT(this, VHE_Create, 0, loc);
    init_ravel();
 }
@@ -259,6 +264,7 @@
      flags(VF_NONE),
      valid_ravel_items(0)
 {
+    set_temp();
    ADD_EVENT(this, VHE_Create, 0, loc);
    init_ravel();
 
@@ -280,6 +286,7 @@
      flags(VF_NONE),
      valid_ravel_items(0)
 {
+    set_temp();
    ADD_EVENT(this, VHE_Create, 0, loc);
    init_ravel();
 
@@ -301,6 +308,7 @@
      flags(VF_NONE),
      valid_ravel_items(0)
 {
+    set_temp();
    ADD_EVENT(this, VHE_Create, 0, loc);
    init_ravel();
 
@@ -316,6 +324,7 @@
      flags(VF_NONE),
      valid_ravel_items(0)
 {
+    set_temp();
    ADD_EVENT(this, VHE_Create, 0, loc);
    init_ravel();
 }
@@ -325,6 +334,7 @@
      flags(VF_NONE),
      valid_ravel_items(0)
 {
+    set_temp();
    ADD_EVENT(this, VHE_Create, 0, loc);
    init_ravel();
 
@@ -1634,19 +1644,43 @@
 
 }
 //-----------------------------------------------------------------------------
+
+class ApplyClone {
+public:
+    ApplyClone( const Cell *src_in, Cell *dst_in ) : src( src_in ), dst( 
dst_in ) {}
+    void operator()( const tbb::blocked_range<size_t> &range ) const {
+        for( size_t p = range.begin() ; p != range.end() ; p++ ) {
+            dst[p].init( src[p] );
+        }
+    }
+
+private:
+    const Cell *src;
+    Cell *dst;
+};
+
 Value_P
 Value::clone(const char * loc) const
 {
-Value_P ret(new Value(get_shape(), loc));
+    if( is_temp() ) {
+        return Value_P( const_cast<Value *>( this ), LOC );
+    }
+    else {
+        Value_P ret(new Value(get_shape(), loc));
 
-const Cell * src = &get_ravel(0);
-Cell * dst = &ret->get_ravel(0);
-const ShapeItem count = nz_element_count();
+        const Cell * src = &get_ravel(0);
+        Cell * dst = &ret->get_ravel(0);
+        const ShapeItem count = nz_element_count();
 
-   loop(c, count)   dst++->init(*src++);
+#ifdef PARALLEL_DISABLED
+        loop(c, count)   dst++->init(*src++);
+#else
+        tbb::parallel_for( tbb::blocked_range<size_t>( 0, count ), ApplyClone( 
src, dst ) );
+#endif
 
-   ret->check_value(LOC);
-   return ret;
+        ret->check_value(LOC);
+        return ret;
+    }
 }
 //-----------------------------------------------------------------------------
 /// lrp p.138: S←⍴⍴A + NOTCHAR (per column)
Index: src/Value.hh
===================================================================
--- src/Value.hh        (revision 164)
+++ src/Value.hh        (working copy)
@@ -340,6 +340,7 @@
 
 # define clear_forever()  CLEAR_forever(_LOC)
 # define clear_marked()   CLEAR_marked(_LOC)
+# define clear_temp()     CLEAR_temp(_LOC);
 
    VF_flag(forever)
    VF_flag(complete)

Reply via email to