Hello, I have taken the chance to improve the patch by addressing the comments above in this thread. Namely: - used a simple random generator managed inside the library only - removed floating-point usage and replaced it with fixed-point arithmetic - added some comments where relevant
Re-running the STAMP benchmarks shows similar gains (to those shown above) with respect to an unmodified GCC 5.0.0: benchmark: speedup genome: 1.58 intruder: 1.78 labyrinth: 1.0 ssca2: 1.01 yada: 1.0 kmeans-high: 1.16 kmeans-low: 1.16 vacation-high: 2.05 vacation-low: 2.81 I appreciate any feedback and comments! Index: libitm/libitm_i.h =================================================================== --- libitm/libitm_i.h (revision 219316) +++ libitm/libitm_i.h (working copy) @@ -242,6 +242,9 @@ struct gtm_thread uint32_t restart_reason[NUM_RESTARTS]; uint32_t restart_total; + // Keeps track of how many transactions were successfully executed. + uint64_t number_executed_txns; + // *** The shared part of gtm_thread starts here. *** // Shared state is on separate cachelines to avoid false sharing with // thread-local parts of gtm_thread. @@ -286,6 +289,8 @@ struct gtm_thread static void *operator new(size_t); static void operator delete(void *); + static void reoptimize_htm_execution(); + // Invoked from assembly language, thus the "asm" specifier on // the name, avoiding complex name mangling. static uint32_t begin_transaction(uint32_t, const gtm_jmpbuf *) @@ -309,6 +314,59 @@ struct gtm_thread void commit_user_actions (); }; +// Different ways to deal with capacity aborts in HTM execution. +enum gtm_capacity_abort_mode +{ + STUBBORN, + HALVEN, + GIVEUP +}; + +// Definition of fixed point arithmetic types. +// Half the bits are dedicated to the fractional type, and the rest to the +// "whole" part. 
+#define FIXED_PT_WIDTH 64 +#define FIXED_PT_INTEGER_WIDTH 32 +typedef uint64_t fixed_pt_t; +typedef __uint128_t fixed_pt_td; + +#define FIXED_PT_FRAC_WIDTH (FIXED_PT_WIDTH - FIXED_PT_INTEGER_WIDTH) +#define FIXED_PT_ONE ((fixed_pt_t)((fixed_pt_t)1 << FIXED_PT_FRAC_WIDTH)) +#define FIXED_PT_TWO (FIXED_PT_ONE + FIXED_PT_ONE) + +#define fixed_mul(A,B) \ + ((fixed_pt_t)(((fixed_pt_td)(A) * (fixed_pt_td)(B)) >> FIXED_PT_FRAC_WIDTH)) +#define fixed_div(A,B) \ + ((fixed_pt_t)(((fixed_pt_td)(A) << FIXED_PT_FRAC_WIDTH) / (fixed_pt_td)(B))) +#define fixed_const(R) \ + ((fixed_pt_t)((R) * FIXED_PT_ONE + ((R) >= 0 ? 0.5 : -0.5))) + +// Maintains the current values optimized for HTM execution and the +// corresponding statistics gathered for the decision-making. +struct gtm_global_optimizer +{ + // Mode chosen to currently deal with capacity aborts. + gtm_capacity_abort_mode optimized_mode; + // Number of attempts chosen to currently insist on HTM execution. + uint32_t optimized_attempts; + + uint64_t last_cycles; + uint64_t last_total_txs_executed; + + fixed_pt_t last_throughput; + uint32_t last_attempts; + + fixed_pt_t best_ever_throughput; + uint32_t best_ever_attempts; + + uint64_t txns_while_stubborn; + uint64_t cycles_while_stubborn; + uint64_t txns_while_halven; + uint64_t cycles_while_halven; + uint64_t txns_while_giveup; + uint64_t cycles_while_giveup; +}; + } // namespace GTM #include "tls.h" @@ -346,6 +404,9 @@ extern gtm_cacheline_mask gtm_mask_stack(gtm_cache // the name, avoiding complex name mangling. extern uint32_t htm_fastpath __asm__(UPFX "gtm_htm_fastpath"); +// Maintains the optimization for HTM execution. +extern gtm_global_optimizer optimizer; + } // namespace GTM #endif // LIBITM_I_H Index: libitm/libitm.h =================================================================== --- libitm/libitm.h (revision 219316) +++ libitm/libitm.h (working copy) @@ -101,7 +101,8 @@ typedef enum /* These are not part of the ABI but used for custom HTM fast paths. 
See ITM_beginTransaction and gtm_thread::begin_transaction. */ pr_HTMRetryableAbort = 0x800000, - pr_HTMRetriedAfterAbort = 0x1000000 + pr_HTMRetriedAfterAbort = 0x1000000, + pr_HTMCapacityAbort = 0x2000000 } _ITM_codeProperties; /* Result from startTransaction that describes what actions to take. Index: libitm/method-serial.cc =================================================================== --- libitm/method-serial.cc (revision 219316) +++ libitm/method-serial.cc (working copy) @@ -223,7 +223,23 @@ struct htm_mg : public method_group // initially disabled. #ifdef USE_HTM_FASTPATH htm_fastpath = htm_init(); +#ifdef HAVE_AS_RTM + optimizer.optimized_mode = STUBBORN; + optimizer.optimized_attempts = htm_fastpath; + optimizer.last_cycles = rdtsc(); + optimizer.last_total_txs_executed = 0; + optimizer.last_throughput = fixed_const(0.0001); + optimizer.last_attempts = htm_fastpath > 0 ? htm_fastpath - 1 : 1; + optimizer.best_ever_throughput = optimizer.last_throughput; + optimizer.best_ever_attempts = htm_fastpath; + optimizer.txns_while_stubborn = 1; + optimizer.cycles_while_stubborn = 100; + optimizer.txns_while_halven = 1; + optimizer.cycles_while_halven = 100; + optimizer.txns_while_giveup = 1; + optimizer.cycles_while_giveup = 100; #endif +#endif } virtual void fini() { Index: libitm/beginend.cc =================================================================== --- libitm/beginend.cc (revision 219316) +++ libitm/beginend.cc (working copy) @@ -25,7 +25,6 @@ #include "libitm_i.h" #include <pthread.h> - using namespace GTM; #if !defined(HAVE_ARCH_GTM_THREAD) || !defined(HAVE_ARCH_GTM_THREAD_DISP) @@ -39,6 +38,9 @@ unsigned GTM::gtm_thread::number_of_threads = 0; gtm_stmlock GTM::gtm_stmlock_array[LOCK_ARRAY_SIZE]; atomic<gtm_version> GTM::gtm_clock; +// Optimization of HTM executions. +gtm_global_optimizer GTM::optimizer; + /* ??? Move elsewhere when we figure out library initialization. 
*/ uint64_t GTM::gtm_spin_count_var = 1000; @@ -149,6 +151,225 @@ choose_code_path(uint32_t prop, abi_dispatch *disp return a_runInstrumentedCode; } +static inline fixed_pt_t +fixed_sqrt(fixed_pt_t n) +{ + int invert = 0; + int iter = FIXED_PT_FRAC_WIDTH; + int l, i; + + if (n == 0 || n == FIXED_PT_ONE) + { + return n; + } + if (n < FIXED_PT_ONE && n > 6) + { + invert = 1; + n = fixed_div(FIXED_PT_ONE, n); + } + if (n > FIXED_PT_ONE) + { + int s = n; + iter = 0; + while (s > 0) + { + s >>= 2; + iter++; + } + } + + l = (n >> 1) + 1; + for (i = 0; i < iter; i++) + { + l = (l + fixed_div(n, l)) >> 1; + } + if (invert) + { + return (fixed_div(FIXED_PT_ONE, l)); + } + return l; +} + +static inline fixed_pt_t +fixed_ln(fixed_pt_t x) +{ + const fixed_pt_t LN2 = fixed_const(0.69314718055994530942); + const fixed_pt_t LG[7] = { + fixed_const(6.666666666666735130e-01), + fixed_const(3.999999999940941908e-01), + fixed_const(2.857142874366239149e-01), + fixed_const(2.222219843214978396e-01), + fixed_const(1.818357216161805012e-01), + fixed_const(1.531383769920937332e-01), + fixed_const(1.479819860511658591e-01) + }; + + if (x == 0) + { + return 0xffffffff; + } + + fixed_pt_t log2 = 0; + fixed_pt_t xi = x; + while (xi > FIXED_PT_TWO) + { + xi >>= 1; + log2++; + } + + fixed_pt_t f = xi - FIXED_PT_ONE; + fixed_pt_t s = fixed_div(f, FIXED_PT_TWO + f); + fixed_pt_t z = fixed_mul(s, s); + fixed_pt_t w = fixed_mul(z, z); + fixed_pt_t R = + fixed_mul(w, LG[1] + fixed_mul(w, LG[3] + fixed_mul(w, LG[5]))) + + fixed_mul(z, LG[0] + fixed_mul(w, LG[2] + + fixed_mul(w, LG[4] + fixed_mul(w, LG[6])))); + return fixed_mul(LN2, (log2 << FIXED_PT_FRAC_WIDTH)) + + f - fixed_mul(s, f - R); +} + +// State not synchronized; assumes single thread usage at any time. +// Invoked only by the thread reoptimizing, so assumption holds. 
+int +obtainRandomInt(int max) +{ + static int seed = 123456789; + seed = (1103515245 * seed + 12345) % 4294967296; + return seed; +} + +// Called by the thread at the tail of the list of threads. +void +GTM::gtm_thread::reoptimize_htm_execution() +{ + gtm_thread *tx = gtm_thr(); + uint64_t total_txs_executed = 0; + + // Collect the statistics obtained so far. + serial_lock.read_lock(tx); + gtm_thread *ptr = list_of_threads; + for (; ptr; ptr = ptr->next_thread) + { + total_txs_executed += ptr->number_executed_txns; + } + serial_lock.read_unlock(tx); + + // Obtain the delta performance with respect to the last period. + const uint64_t current_cycles = rdtsc(); + const uint64_t cycles_used = current_cycles - optimizer.last_cycles; + const uint64_t new_txs_executed = + total_txs_executed - optimizer.last_total_txs_executed; + optimizer.last_cycles = current_cycles; + optimizer.last_total_txs_executed = total_txs_executed; + if (optimizer.optimized_mode == STUBBORN) + { + optimizer.txns_while_stubborn += new_txs_executed; + optimizer.cycles_while_stubborn += cycles_used; + } + else if (optimizer.optimized_mode == HALVEN) + { + optimizer.txns_while_halven += new_txs_executed; + optimizer.cycles_while_halven += cycles_used; + } + else + { + optimizer.txns_while_giveup += new_txs_executed; + optimizer.cycles_while_giveup += cycles_used; + } + + // Compute Upper Confidence Bounds for the mode to choose next. + // Use fixed point arithmetic types to spare floating point usage. 
+ const fixed_pt_t log_sum = + fixed_mul(FIXED_PT_TWO, + fixed_ln(fixed_const(optimizer.txns_while_stubborn) + + fixed_const(optimizer.txns_while_halven) + + fixed_const(optimizer.txns_while_giveup))); + const fixed_pt_t ucb_stubborn = + fixed_div(fixed_div(fixed_const(optimizer.txns_while_stubborn), + fixed_const(optimizer.cycles_while_stubborn)), + optimizer.best_ever_throughput) + + fixed_sqrt(fixed_div(log_sum, + fixed_const(optimizer.txns_while_stubborn))); + const fixed_pt_t ucb_halven = + fixed_div(fixed_div(fixed_const(optimizer.txns_while_halven), + fixed_const(optimizer.cycles_while_halven)), + optimizer.best_ever_throughput) + + fixed_sqrt(fixed_div(log_sum, fixed_const(optimizer.txns_while_halven))); + const fixed_pt_t ucb_giveup = + fixed_div(fixed_div(fixed_const(optimizer.txns_while_giveup), + fixed_const(optimizer.cycles_while_giveup)), + optimizer.best_ever_throughput) + + fixed_sqrt(fixed_div(log_sum, fixed_const(optimizer.txns_while_giveup))); + + if (ucb_stubborn > ucb_halven && ucb_stubborn > ucb_giveup) + { + optimizer.optimized_mode = STUBBORN; + } + else if (ucb_halven > ucb_giveup) + { + optimizer.optimized_mode = HALVEN; + } + else + { + optimizer.optimized_mode = GIVEUP; + } + + // Compute gradient descent for the number of retries. 
+ const fixed_pt_t current_throughput = + fixed_div(fixed_const(new_txs_executed), fixed_const(cycles_used)); + const fixed_pt_t change_for_better = + fixed_div(current_throughput, optimizer.last_throughput); + const fixed_pt_t change_for_worse = + fixed_div(optimizer.last_throughput, current_throughput); + int32_t last_attempts = optimizer.last_attempts; + int32_t current_attempts = optimizer.optimized_attempts; + int32_t new_attempts = current_attempts; + if (unlikely(change_for_worse > fixed_const(1.40))) + { + optimizer.optimized_attempts = optimizer.best_ever_attempts; + optimizer.last_throughput = current_throughput; + optimizer.last_attempts = current_attempts; + return; + } + + if (unlikely(obtainRandomInt(100) == 0)) + { + optimizer.last_attempts = optimizer.optimized_attempts; + optimizer.last_throughput = current_throughput; + optimizer.optimized_attempts = obtainRandomInt(18) + 2; + return; + } + + if (change_for_better > fixed_const(1.05)) + { + new_attempts += current_attempts - last_attempts; + if (current_attempts == last_attempts) + new_attempts += obtainRandomInt(2) == 0 ? -1 : 1; + } + else if (change_for_worse > fixed_const(1.05)) + { + new_attempts -= current_attempts - last_attempts; + if (current_attempts == last_attempts) + new_attempts += obtainRandomInt(2) == 0 ? 
-1 : 1; + } + if (unlikely(new_attempts > 20)) + new_attempts = 20; + else if (unlikely(new_attempts < 2)) + new_attempts = 2; + if (current_attempts != new_attempts) + { + optimizer.last_attempts = current_attempts; + optimizer.last_throughput = current_throughput; + } + optimizer.optimized_attempts = new_attempts; + if (unlikely(current_throughput > optimizer.best_ever_throughput)) + { + optimizer.best_ever_throughput = current_throughput; + optimizer.best_ever_attempts = current_attempts; + } +} + uint32_t GTM::gtm_thread::begin_transaction (uint32_t prop, const gtm_jmpbuf *jb) { @@ -190,7 +411,7 @@ GTM::gtm_thread::begin_transaction (uint32_t prop, #ifndef HTM_CUSTOM_FASTPATH if (likely(htm_fastpath && (prop & pr_hasNoAbort))) { - for (uint32_t t = htm_fastpath; t; t--) + for (uint32_t t = optimizer.optimized_attempts; t; t--) { uint32_t ret = htm_begin(); if (htm_begin_success(ret)) @@ -209,19 +430,36 @@ GTM::gtm_thread::begin_transaction (uint32_t prop, } // The transaction has aborted. Don't retry if it's unlikely that // retrying the transaction will be successful. - if (!htm_abort_should_retry(ret)) +#ifdef HAVE_AS_RTM + if (htm_is_capacity_abort(ret)) + { + gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode; + if (capacity_mode == HALVEN) + t = t / 2 + 1; + else if (capacity_mode == GIVEUP) + t = 1; + } + // Only one thread performs this to avoid the need for + // synchronization. We use the oldest thread that is active. + // We also reoptimize at most once per transaction. + if (unlikely(tx->next_thread == NULL && + tx->number_executed_txns % 500 == 0 && t == 1)) + reoptimize_htm_execution(); +#else + if (!htm_abort_should_retry(ret)) break; +#endif // Wait until any concurrent serial-mode transactions have finished. // This is an empty critical section, but won't be elided. if (serial_lock.is_write_locked()) { - tx = gtm_thr(); - if (unlikely(tx == NULL)) - { - // See below. 
- tx = new gtm_thread(); - set_gtm_thr(tx); - } + tx = gtm_thr(); + if (unlikely(tx == NULL)) + { + // See below. + tx = new gtm_thread(); + set_gtm_thr(tx); + } // Check whether there is an enclosing serial-mode transaction; // if so, we just continue as a nested transaction and don't // try to use the HTM fastpath. This case can happen when an @@ -262,12 +500,26 @@ GTM::gtm_thread::begin_transaction (uint32_t prop, // other fallback will use serial transactions, which don't use // restart_total but will reset it when committing. if (!(prop & pr_HTMRetriedAfterAbort)) - tx->restart_total = htm_fastpath; + tx->restart_total = optimizer.optimized_attempts; if (--tx->restart_total > 0) { // Wait until any concurrent serial-mode transactions have finished. // Essentially the same code as above. +#ifdef HAVE_AS_RTM + if (prop & pr_HTMCapacityAbort) + { + gtm_capacity_abort_mode capacity_mode = optimizer.optimized_mode; + if (capacity_mode == HALVEN) + tx->restart_total = tx->restart_total; + else if (capacity_mode == GIVEUP) + goto stop_custom_htm_fastpath; + } + + if (unlikely(tx->next_thread == NULL && + tx->number_executed_txns % 500 == 0 && tx->restart_total == 1)) + reoptimize_htm_execution(); +#endif if (serial_lock.is_write_locked()) { if (tx->nesting > 0) @@ -665,12 +917,21 @@ _ITM_commitTransaction(void) if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked())) { htm_commit(); + gtm_thread *tx = gtm_thr(); + if (unlikely(tx == NULL)) + { + tx = new gtm_thread(); + set_gtm_thr(tx); + } + tx->number_executed_txns++; return; } #endif gtm_thread *tx = gtm_thr(); if (!tx->trycommit ()) tx->restart (RESTART_VALIDATE_COMMIT); + + tx->number_executed_txns++; } void ITM_REGPARM @@ -681,6 +942,13 @@ _ITM_commitTransactionEH(void *exc_ptr) if (likely(htm_fastpath && !gtm_thread::serial_lock.is_write_locked())) { htm_commit(); + gtm_thread *tx = gtm_thr(); + if (unlikely(tx == NULL)) + { + tx = new gtm_thread(); + set_gtm_thr(tx); + } + 
tx->number_executed_txns++; return; } #endif @@ -690,4 +958,5 @@ _ITM_commitTransactionEH(void *exc_ptr) tx->eh_in_flight = exc_ptr; tx->restart (RESTART_VALIDATE_COMMIT); } + tx->number_executed_txns++; } Index: libitm/config/x86/target.h =================================================================== --- libitm/config/x86/target.h (revision 219316) +++ libitm/config/x86/target.h (working copy) @@ -126,12 +126,25 @@ htm_abort_should_retry (uint32_t begin_ret) return begin_ret & _XABORT_RETRY; } +static inline bool +htm_is_capacity_abort(uint32_t begin_ret) +{ + return begin_ret & _XABORT_CAPACITY; +} + /* Returns true iff a hardware transaction is currently being executed. */ static inline bool htm_transaction_active () { return _xtest() != 0; } + +static inline uint64_t rdtsc() +{ + uint32_t hi, lo; + __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); + return ( (unsigned long long)lo)|( ((unsigned long long)hi)<<32 ); +} #endif Index: libitm/config/x86/sjlj.S =================================================================== --- libitm/config/x86/sjlj.S (revision 219316) +++ libitm/config/x86/sjlj.S (working copy) @@ -59,12 +59,14 @@ #define pr_hasNoAbort 0x08 #define pr_HTMRetryableAbort 0x800000 #define pr_HTMRetriedAfterAbort 0x1000000 +#define pr_HTMCapacityAbort 0x2000000 #define a_runInstrumentedCode 0x01 #define a_runUninstrumentedCode 0x02 #define a_tryHTMFastPath 0x20 #define _XABORT_EXPLICIT (1 << 0) #define _XABORT_RETRY (1 << 1) +#define _XABORT_CAPACITY (1 << 3) .text @@ -108,9 +110,12 @@ SYM(_ITM_beginTransaction): .Ltxn_abort: /* If it might make sense to retry the HTM fast path, let the C++ code decide. */ - testl $(_XABORT_RETRY|_XABORT_EXPLICIT), %eax + testl $(_XABORT_RETRY|_XABORT_EXPLICIT|_XABORT_CAPACITY), %eax jz .Lno_htm orl $pr_HTMRetryableAbort, %edi + testl $(_XABORT_CAPACITY), %eax + jz .Lno_htm + orl $pr_HTMCapacityAbort, %edi /* Let the C++ code handle the retry policy. 
*/ .Lno_htm: #endif On Fri, Apr 10, 2015 at 8:24 AM, Andi Kleen <a...@firstfloor.org> wrote: > Nuno Diegues <n...@ist.utl.pt> writes: >> >> One general question on how to proceed: >> given that I make some further changes, should I post the whole patch again? > > Yes please resend the patch. > > -Andi