Hello, I have taken the chance to improve the patch by addressing the comments above in this thread. Namely: - used a simple random generator managed inside the library only - removed floating-point usage and replaced it with fixed-point arithmetic - added some comments where relevant
Re-running the STAMP benchmarks shows similar gains (to those shown above) with respect to an unmodified GCC 5.0.0: benchmark: speedup genome: 1.58 intruder: 1.78 labyrinth: 1.0 ssca2: 1.01 yada: 1.0 kmeans-high: 1.16 kmeans-low: 1.16 vacation-high: 2.05 vacation-low: 2.81 I appreciate any feedback and comments! Index: libitm/libitm_i.h =================================================================== --- libitm/libitm_i.h (revision 219316) +++ libitm/libitm_i.h (working copy) @@ -242,6 +242,9 @@ struct gtm_thread uint32_t restart_reason[NUM_RESTARTS]; uint32_t restart_total; + // Keeps track of how many transactions were successfully executed. + uint64_t number_executed_txns; + // *** The shared part of gtm_thread starts here. *** // Shared state is on separate cachelines to avoid false sharing with // thread-local parts of gtm_thread. @@ -286,6 +289,8 @@ struct gtm_thread static void *operator new(size_t); static void operator delete(void *); + static void reoptimize_htm_execution(); + // Invoked from assembly language, thus the "asm" specifier on // the name, avoiding complex name mangling. static uint32_t begin_transaction(uint32_t, const gtm_jmpbuf *) @@ -309,6 +314,59 @@ struct gtm_thread void commit_user_actions (); }; +// Different ways to deal with capacity aborts in HTM execution. +enum gtm_capacity_abort_mode +{ + STUBBORN, + HALVEN, + GIVEUP +}; + +// Definition of fixed point arithmetic types. +// Half the bits are dedicated to the fractional type, and the rest to the +// "whole" part. 
+#define FIXED_PT_WIDTH 64 +#define FIXED_PT_INTEGER_WIDTH 32 +typedef uint64_t fixed_pt_t; +typedef __uint128_t fixed_pt_td; + +#define FIXED_PT_FRAC_WIDTH (FIXED_PT_WIDTH - FIXED_PT_INTEGER_WIDTH) +#define FIXED_PT_ONE ((fixed_pt_t)((fixed_pt_t)1 << FIXED_PT_FRAC_WIDTH)) +#define FIXED_PT_TWO (FIXED_PT_ONE + FIXED_PT_ONE) + +#define fixed_mul(A,B) \ + ((fixed_pt_t)(((fixed_pt_td)(A) * (fixed_pt_td)(B)) >> FIXED_PT_FRAC_WIDTH)) +#define fixed_div(A,B) \ + ((fixed_pt_t)(((fixed_pt_td)(A) << FIXED_PT_FRAC_WIDTH) / (fixed_pt_td)(B))) +#define fixed_const(R) \ + ((fixed_pt_t)((R) * FIXED_PT_ONE + ((R) >= 0 ? 0.5 : -0.5))) + +// Maintains the current values optimized for HTM execution and the +// corresponding statistics gathered for the decision-making. +struct gtm_global_optimizer +{ + // Mode chosen to currently deal with capacity aborts. + gtm_capacity_abort_mode optimized_mode; + // Number of attempts chosen to currently insist on HTM execution. + uint32_t optimized_attempts; + + uint64_t last_cycles; + uint64_t last_total_txs_executed; + + fixed_pt_t last_throughput; + uint32_t last_attempts; + + fixed_pt_t best_ever_throughput; + uint32_t best_ever_attempts; + + uint64_t txns_while_stubborn; + uint64_t cycles_while_stubborn; + uint64_t txns_while_halven; + uint64_t cycles_while_halven; + uint64_t txns_while_giveup; + uint64_t cycles_while_giveup; +}; + } // namespace GTM #include "tls.h" @@ -346,6 +404,9 @@ extern gtm_cacheline_mask gtm_mask_stack(gtm_cache // the name, avoiding complex name mangling. extern uint32_t htm_fastpath __asm__(UPFX "gtm_htm_fastpath"); +// Maintains the optimization for HTM execution. +extern gtm_global_optimizer optimizer; + } // namespace GTM #endif // LIBITM_I_H Index: libitm/libitm.h =================================================================== --- libitm/libitm.h (revision 219316) +++ libitm/libitm.h (working copy) @@ -101,7 +101,8 @@ typedef enum /* These are not part of the ABI but used for custom HTM fast paths. 
See ITM_beginTransaction and gtm_thread::begin_transaction. */ pr_HTMRetryableAbort = 0x800000, - pr_HTMRetriedAfterAbort = 0x1000000 + pr_HTMRetriedAfterAbort = 0x1000000, + pr_HTMCapacityAbort = 0x2000000 } _ITM_codeProperties; /* Result from startTransaction that describes what actions to take. Index: libitm/method-serial.cc =================================================================== --- libitm/method-serial.cc (revision 219316) +++ libitm/method-serial.cc (working copy) @@ -223,7 +223,23 @@ struct htm_mg : public method_group // initially disabled. #ifdef USE_HTM_FASTPATH htm_fastpath = htm_init(); +#ifdef HAVE_AS_RTM + optimizer.optimized_mode = STUBBORN; + optimizer.optimized_attempts = htm_fastpath; + optimizer.last_cycles = rdtsc(); + optimizer.last_total_txs_executed = 0; + optimizer.last_throughput = fixed_const(0.0001); + optimizer.last_attempts = htm_fastpath > 0 ? htm_fastpath - 1 : 1; + optimizer.best_ever_throughput = optimizer.last_throughput; + optimizer.best_ever_attempts = htm_fastpath; + optimizer.txns_while_stubborn = 1; + optimizer.cycles_while_stubborn = 100; + optimizer.txns_while_halven = 1; + optimizer.cycles_while_halven = 100; + optimizer.txns_while_giveup = 1; + optimizer.cycles_while_giveup = 100; #endif +#endif } virtual void fini() { Index: libitm/beginend.cc =================================================================== --- libitm/beginend.cc (revision 219316) +++ libitm/beginend.cc (working copy) @@ -25,7 +25,6 @@ #include "libitm_i.h" #include <pthread.h> - using namespace GTM; #if !defined(HAVE_ARCH_GTM_THREAD) || !defined(HAVE_ARCH_GTM_THREAD_DISP) @@ -39,6 +38,9 @@ unsigned GTM::gtm_thread::number_of_threads = 0; gtm_stmlock GTM::gtm_stmlock_array[LOCK_ARRAY_SIZE]; atomic<gtm_version> GTM::gtm_clock; +// Optimization of HTM executions. +gtm_global_optimizer GTM::optimizer; + /* ??? Move elsewhere when we figure out library initialization. 
*/ uint64_t GTM::gtm_spin_count_var = 1000; @@ -149,6 +151,225 @@ choose_code_path(uint32_t prop, abi_dispatch *disp return a_runInstrumentedCode; } +static inline fixed_pt_t +fixed_sqrt(fixed_pt_t n) +{ + int invert = 0; + int iter = FIXED_PT_FRAC_WIDTH; + int l, i; + + if (n == 0 || n == FIXED_PT_ONE) + { + return n; + } + if (n < FIXED_PT_ONE && n > 6) + { + invert = 1; + n = fixed_div(FIXED_PT_ONE, n); + } + if (n > FIXED_PT_ONE) + { + int s = n; + iter = 0; + while (s > 0) + { + s >>= 2; + iter++; + } + } + + l = (n >> 1) + 1; + for (i = 0; i < iter; i++) + { + l = (l + fixed_div(n, l)) >> 1; + } + if (invert) + { + return (fixed_div(FIXED_PT_ONE, l)); + } + return l; +} + +static inline fixed_pt_t +fixed_ln(fixed_pt_t x) +{ + const fixed_pt_t LN2 = fixed_const(0.69314718055994530942); + const fixed_pt_t LG[7] = { + fixed_const(6.666666666666735130e-01), + fixed_const(3.999999999940941908e-01), + fixed_const(2.857142874366239149e-01), + fixed_const(2.222219843214978396e-01), + fixed_const(1.818357216161805012e-01), + fixed_const(1.531383769920937332e-01), + fixed_const(1.479819860511658591e-01) + }; + + if (x == 0) + { + return 0xffffffff; + } + + fixed_pt_t log2 = 0; + fixed_pt_t xi = x; + while (xi > FIXED_PT_TWO) + { + xi >>= 1; + log2++; + } + + fixed_pt_t f = xi - FIXED_PT_ONE; + fixed_pt_t s = fixed_div(f, FIXED_PT_TWO + f); + fixed_pt_t z = fixed_mul(s, s); + fixed_pt_t w = fixed_mul(z, z); + fixed_pt_t R = + fixed_mul(w, LG[1] + fixed_mul(w, LG[3] + fixed_mul(w, LG[5]))) + + fixed_mul(z, LG[0] + fixed_mul(w, LG[2] + + fixed_mul(w, LG[4] + fixed_mul(w, LG[6])))); + return fixed_mul(LN2, (log2 << FIXED_PT_FRAC_WIDTH)) + + f - fixed_mul(s, f - R); +} + +// State not synchronized; assumes single thread usage at any time. +// Invoked only by the thread reoptimizing, so assumption holds. 
+int +obtainRandomInt(int max) +{ + static int seed = 123456789; + seed = (1103515245 * seed + 12345) % 4294967296; + return seed; +} + +// Called by the thread at the tail of the list of threads. +void +GTM::gtm_thread::reoptimize_htm_execution() +{ + gtm_thread *tx = gtm_thr(); + uint64_t total_txs_executed = 0; + + // Collect the statistics obtained so far. + serial_lock.read_lock(tx); + gtm_thread *ptr = list_of_threads; + for (; ptr; ptr = ptr->next_thread) + { + total_txs_executed += ptr->number_executed_txns; + } + serial_lock.read_unlock(tx); + + // Obtain the delta performance with respect to the last period. + const uint64_t current_cycles = rdtsc(); + const uint64_t cycles_used = current_cycles - optimizer.last_cycles; + const uint64_t new_txs_executed = + total_txs_executed - optimizer.last_total_txs_executed; + optimizer.last_cycles = current_cycles; + optimizer.last_total_txs_executed = total_txs_executed; + if (optimizer.optimized_mode == STUBBORN) + { + optimizer.txns_while_stubborn += new_txs_executed; + optimizer.cycles_while_stubborn += cycles_used; + } + else if (optimizer.optimized_mode == HALVEN) + { + optimizer.txns_while_halven += new_txs_executed; + optimizer.cycles_while_halven += cycles_used; + } + else + { + optimizer.txns_while_giveup += new_txs_executed; + optimizer.cycles_while_giveup += cycles_used; + } + + // Compute Upper Confidence Bounds for the mode to choose next. + // Use fixed point arithmetic types to spare floating point usage. 
+ const fixed_pt_t log_sum = + fixed_mul(FIXED_PT_TWO, + fixed_ln(fixed_const(optimizer.txns_while_stubborn) + + fixed_const(optimizer.txns_while_halven) + + fixed_const(optimizer.txns_while_giveup))); + const fixed_pt_t ucb_stubborn = + fixed_div(fixed_div(fixed_const(optimizer.txns_while_stubborn), + fixed_const(optimizer.cycles_while_stubborn)), + optimizer.best_ever_throughput) + + fixed_sqrt(fixed_div(log_sum, + fixed_const(optimizer.txns_while_stubborn))); + const fixed_pt_t ucb_halven = + fixed_div(fixed_div(fixed_const(optimizer.txns_while_halven), + fixed_const(optimizer.cycles_while_halven)), + optimizer.best_ever_throughput) + + fixed_sqrt(fixed_div(log_sum, fixed_const(optimizer.txns_while_halven))); + const fixed_pt_t ucb_giveup = + fixed_div(fixed_div(fixed_const(optimizer.txns_while_giveup), + fixed_const(optimizer.cycles_while_giveup)), + optimizer.best_ever_throughput) + + fixed_sqrt(fixed_div(log_sum, fixed_const(optimizer.txns_while_giveup))); + + if (ucb_stubborn > ucb_halven && ucb_stubborn > ucb_giveup) + { + optimizer.optimized_mode = STUBBORN; + } + else if (ucb_halven > ucb_giveup) + { + optimizer.optimized_mode = HALVEN; + } + else + { + optimizer.optimized_mode = GIVEUP; + } + + // Compute gradient descent for the number of retries. 
+ const fixed_pt_t current_throughput = + fixed_div(fixed_const(new_txs_executed), fixed_const(cycles_used)); + const fixed_pt_t change_for_better = + fixed_div(current_throughput, optimizer.last_throughput); + const fixed_pt_t change_for_worse = + fixed_div(optimizer.last_throughput, current_throughput); + int32_t last_attempts = optimizer.last_attempts; + int32_t current_attempts = optimizer.optimized_attempts; + int32_t new_attempts = current_attempts; + if (unlikely(change_for_worse > fixed_const(1.40))) + { + optimizer.optimized_attempts = optimizer.best_ever_attempts; + optimizer.last_throughput = current_throughput; + optimizer.last_attempts = current_attempts; + return; + } + + if (unlikely(obtainRandomInt(100) == 0)) + { + optimizer.last_attempts = optimizer.optimized_attempts; + optimizer.last_throughput = current_throughput; + optimizer.optimized_attempts = obtainRandomInt(18) + 2; + return; + } + + if (change_for_better > fixed_const(1.05)) + { + new_attempts += current_attempts - last_attempts; + if (current_attempts == last_attempts) + new_attempts += obtainRandomInt(2) == 0 ? -1 : 1; + } + else if (change_for_worse > fixed_const(1.05)) + { + new_attempts -= current_attempts - last_attempts; + if (current_attempts == last_attempts) + new_attempts += obtainRandomInt(2) == 0 ? 
-1 : 1; + } + if (unlikely(new_attempts > 20)) + new_attempts = 20; + else if (unlikely(new_attempts < 2)) + new_attempts = 2; + if (current_attempts != new_attempts) + { + optimizer.last_attempts = current_attempts; + optimizer.last_throughput = current_throughput; + } + optimizer.optimized_attempts = new_attempts; + if (unlikely(current_throughput > optimizer.best_ever_throughput)) + { + optimizer.best_ever_throughput = current_throughput; + optimizer.best_ever_attempts = current_attempts; + } +} + uint32_t GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb) { @@ -190,7 +411,7 @@ GTM::gtm_thread::begin_transaction (uint32_t prop, #ifndef HTM_CUSTOM_FASTPATH if (likely(htm_fastpath && (prop & pr_hasNoAbort))) { - for (uint32_t t = htm_fastpath; t; t--) + for (uint32_t t = optimizer.optimized_attempts; t; t--) { uint32_t ret = htm_begin(); if (htm_begin_success(ret)) @@ -209,19 +430,36 @@ GTM::gtm_thread::begin_transaction (uint32_t prop, } // The transaction has aborted. Don't retry if it's unlikely that // retrying the transaction will be successful. - if (!htm_abort_should_retry(ret)) +#ifdef HAVE_AS_RTM + if (htm_is_capacity_abort(ret)) + { + gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode; + if (capacity_mode == HALVEN) + t = t / 2 + 1; + else if (capacity_mode == GIVEUP) + t = 1; + } + // Only one thread performs this to avoid the need for + // synchronization. We use the oldest thread that is active. + // We also reoptimize at most once per transaction. + if (unlikely(tx->next_thread == NULL && + tx->number_executed_txns % 500 == 0 && t == 1)) + reoptimize_htm_execution(); +#else + if (!htm_abort_should_retry(ret)) break; +#endif // Wait until any concurrent serial-mode transactions have finished. // This is an empty critical section, but won't be elided. if (serial_lock.is_write_locked()) { - tx = gtm_thr(); - if (unlikely(tx == NULL)) - { - // See below. 
- tx = new gtm_thread(); - set_gtm_thr(tx); - } + tx = gtm_thr(); + if (unlikely(tx == NULL)) + { + // See below. + tx = new gtm_thread(); + set_gtm_thr(tx); + } // Check whether there is an enclosing serial-mode transaction; // if so, we just continue as a nested transaction and don't // try to use the HTM fastpath. This case can happen when an @@ -262,12 +500,26 @@ GTM::gtm_thread::begin_transaction (uint32_t prop, // other fallback will use serial transactions, which don't use // restart_total but will reset it when committing. if (!(prop & pr_HTMRetriedAfterAbort)) - tx->restart_total = htm_fastpath; + tx->restart_total = optimizer.optimized_attempts; if (--tx->restart_total > 0) { // Wait until any concurrent serial-mode transactions have finished. // Essentially the same code as above. +#ifdef HAVE_AS_RTM + if (prop & pr_HTMCapacityAbort) + { + gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode; + if (capacity_mode == HALVEN) + tx->restart_total = tx->restart_total; + else if (capacity_mode == GIVEUP) + goto stop_custom_htm_fastpath; + } + + if (unlikely(tx->next_thread == NULL && + tx->number_executed_txns % 500 == 0 && tx->restart_total == 1)) + reoptimize_htm_execution(); +#endif if (serial_lock.is_write_locked()) { if (tx->nesting > 0) @@ -665,12 +917,21 @@ _ITM_commitTransaction(void) if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked())) { htm_commit(); + gtm_thread *tx = gtm_thr(); + if (unlikely(tx == NULL)) + { + tx = new gtm_thread(); + set_gtm_thr(tx); + } + tx->number_executed_txns++; return; } #endif gtm_thread *tx = gtm_thr(); if (!tx->trycommit ()) tx->restart (RESTART_VALIDATE_COMMIT); + + tx->number_executed_txns++; } void ITM_REGPARM @@ -681,6 +942,13 @@ _ITM_commitTransactionEH(void *exc_ptr) if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked())) { htm_commit(); + gtm_thread *tx = gtm_thr(); + if (unlikely(tx == NULL)) + { + tx = new gtm_thread(); + set_gtm_thr(tx); + } + 
tx->number_executed_txns++; return; } #endif @@ -690,4 +958,5 @@ _ITM_commitTransactionEH(void *exc_ptr) tx->eh_in_flight = exc_ptr; tx->restart (RESTART_VALIDATE_COMMIT); } + tx->number_executed_txns++; } Index: libitm/config/x86/target.h =================================================================== --- libitm/config/x86/target.h (revision 219316) +++ libitm/config/x86/target.h (working copy) @@ -126,12 +126,25 @@ htm_abort_should_retry (uint32_t begin_ret) return begin_ret & _XABORT_RETRY; } +static inline bool +htm_is_capacity_abort(uint32_t begin_ret) +{ + return begin_ret & _XABORT_CAPACITY; +} + /* Returns true iff a hardware transaction is currently being executed. */ static inline bool htm_transaction_active () { return _xtest() != 0; } + +static inline uint64_t rdtsc() +{ + uint32_t hi, lo; + __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); + return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 ); +} #endif Index: libitm/config/x86/sjlj.S =================================================================== --- libitm/config/x86/sjlj.S (revision 219316) +++ libitm/config/x86/sjlj.S (working copy) @@ -59,12 +59,14 @@ #define pr_hasNoAbort 0x08 #define pr_HTMRetryableAbort 0x800000 #define pr_HTMRetriedAfterAbort 0x1000000 +#define pr_HTMCapacityAbort 0x2000000 #define a_runInstrumentedCode 0x01 #define a_runUninstrumentedCode 0x02 #define a_tryHTMFastPath 0x20 #define _XABORT_EXPLICIT (1 << 0) #define _XABORT_RETRY (1 << 1) +#define _XABORT_CAPACITY (1 << 3) .text @@ -108,9 +110,12 @@ SYM(_ITM_beginTransaction): .Ltxn_abort: /* If it might make sense to retry the HTM fast path, let the C++ code decide. */ - testl $(_XABORT_RETRY|_XABORT_EXPLICIT), %eax + testl $(_XABORT_RETRY|_XABORT_EXPLICIT|_XABORT_CAPACITY), %eax jz .Lno_htm orl $pr_HTMRetryableAbort, %edi + testl $(_XABORT_CAPACITY), %eax + jz .Lno_htm + orl $pr_HTMCapacityAbort, %edi /* Let the C++ code handle the retry policy. 
*/ .Lno_htm: #endif On Fri, Apr 10, 2015 at 8:24 AM, Andi Kleen <a...@firstfloor.org> wrote: > Nuno Diegues <n...@ist.utl.pt> writes: >> >> One general question on how to proceed: >> given that I make some further changes, should I post the whole patch again? > > Yes please resend the patch. > > -Andi