Hi devs, I reworked the patch set according to the feedback I got here on this list. This is what changed in the first part:
* split original part 1/2 into 1/3 (append char) and 2/3 (apply text delta). * move memcpy optimization to a new fast_memcpy * add rationales to the commentary -- Stefan^2. [[[ svn_txdelta_apply_instructions is relatively slow for long instruction sequences copying small pieces of data. This seems to be particularly visible in non-packed FSFS repositories. This patch extracts invariants out from the 'for' loop, optimizes overlapping copies as well as small data copy runtime. As a result, the performance is about doubled and the total svn export runtime gets reduced by 2 %. * subversion/libsvn_delta/text_delta.c (fast_memcpy): new function (svn_txdelta_apply_instructions): reduce loop overhead, use fast_memcpy, optimize overlapped copy code patch by stefanfuhrmann < at > alice-dsl.de ]]]
Index: subversion/libsvn_delta/text_delta.c =================================================================== --- subversion/libsvn_delta/text_delta.c (revision 939976) +++ subversion/libsvn_delta/text_delta.c (working copy) @@ -564,61 +591,109 @@ return SVN_NO_ERROR; } +/* memcpy() is hard to tune for a wide range of buffer lengths. + * Therefore, it is often tuned for high throughput on large + * buffers and relatively low latency for mid-sized buffers + * (tens of bytes). However, the overhead for very small buffers + * (<10 bytes) is still high. Even passing the parameters, for + * instance, may take as long as copying 3 bytes. + * + * Because short copy sequences seem to be a common case in + * non-packed FSFS repositories, we copy them directly. Larger + * buffer sizes aren't hurt measurably by the exta 'if' clause. + */ +static APR_INLINE char* +fast_memcpy(char* target, const char* source, apr_size_t len) +{ + if (len > 7) + { + memcpy(target, source, len); + target += len; + } + else + { + /* memcpy is not exactly fast for small block sizes. + Since they are common, let's run optimized code for them. */ + const char* end = source + len; + for (; source != end; source++) + *(target++) = *source; + } + return target; +} + void svn_txdelta_apply_instructions(svn_txdelta_window_t *window, const char *sbuf, char *tbuf, apr_size_t *tlen) { - const svn_txdelta_op_t *op; - apr_size_t i, j, tpos = 0; + /* The main loop is run quite often. + * Pre-calculate as much data as possible. + */ + const svn_txdelta_op_t *op, *last_op = window->ops + window->num_ops; + apr_size_t to_fill = *tlen < window->tview_len ? *tlen : window->tview_len; + apr_size_t left = to_fill; + const char* end, *source; + char *target = tbuf; - for (op = window->ops; op < window->ops + window->num_ops; op++) + for (op = window->ops; left > 0; op++) { - const apr_size_t buf_len = (op->length < *tlen - tpos - ? op->length : *tlen - tpos); + const apr_size_t buf_len = op->length < left ? op->length : left; + left -= buf_len; /* Check some invariants common to all instructions. */ - assert(tpos + op->length <= window->tview_len); + assert(target - tbuf + op->length <= window->tview_len); switch (op->action_code) { case svn_txdelta_source: /* Copy from source area. */ assert(op->offset + op->length <= window->sview_len); - memcpy(tbuf + tpos, sbuf + op->offset, buf_len); + target = fast_memcpy(target, sbuf + op->offset, buf_len); break; case svn_txdelta_target: - /* Copy from target area. Don't use memcpy() since its - semantics aren't guaranteed for overlapping memory areas, - and target copies are allowed to overlap to generate - repeated data. */ - assert(op->offset < tpos); - for (i = op->offset, j = tpos; i < op->offset + buf_len; i++) - tbuf[j++] = tbuf[i]; + /* Copy from target area. We can't use memcpy() or the like + since we need a specific semantics for overlapping copies: + they must result in repeating patterns. + */ + assert(op->offset < target - *tbuf); + source = tbuf + op->offset; + end = tbuf + op->offset + buf_len; + + /* Copy in larger chunks if source and target don't overlap + * closer than the size of the chunks (or don't overlap at all). + * Use the natural machine word as chunk size + * (for some architectures size_t is even a bit larger). + */ + if (end + sizeof(apr_size_t) <= target) + for (; source + sizeof (apr_size_t) <= end; + source += sizeof (apr_size_t), + target += sizeof (apr_size_t)) + *(apr_size_t*)(target) = *(apr_size_t*)(source); + + /* Copy trailing bytes */ + for (; source != end; source++) + *(target++) = *source; break; case svn_txdelta_new: /* Copy from window new area. */ assert(op->offset + op->length <= window->new_data->len); - memcpy(tbuf + tpos, - window->new_data->data + op->offset, - buf_len); + target = fast_memcpy(target, + window->new_data->data + op->offset, + buf_len); break; default: assert(!"Invalid delta instruction code"); } - - tpos += op->length; - if (tpos >= *tlen) - return; /* The buffer is full. */ } - /* Check that we produced the right amount of data. */ - assert(tpos == window->tview_len); - *tlen = tpos; + /* We always fill the whole target buffer unless we stumble across + * an invalid instruction (caught by debug assertions only). + */ + *tlen = to_fill; } /* This is a private interlibrary compatibility wrapper. */