Hi there,

this is an improved version of the patch posted here:

 http://svn.haxx.se/dev/archive-2010-05/0002.shtml

The improvements address the issues listed there:

 http://svn.haxx.se/dev/archive-2010-05/0216.shtml

-- Stefan^2.


[[[
svn_txdelta_apply_instructions is relatively slow for long
instruction sequences copying small pieces of data. This
seems to be particularly visible in non-packed FSFS
repositories.

This patch extracts invariants out from the 'for' loop,
optimizes overlapping copies as well as small data copy
runtime.
* subversion/libsvn_delta/text_delta.c
 (fast_memcpy, patterning_copy): new functions,
 optimized for our specific workload
 (svn_txdelta_apply_instructions): reduce loop overhead,
 use fast_memcpy and patterning_copy

patch by stefanfuhrmann < at > alice-dsl.de
]]]

Index: subversion/libsvn_delta/text_delta.c
===================================================================
--- subversion/libsvn_delta/text_delta.c        (revision 944296)
+++ subversion/libsvn_delta/text_delta.c        (working copy)
@@ -564,61 +564,147 @@
   return SVN_NO_ERROR;
 }
 
+/* memcpy() is hard to tune for a wide range of buffer lengths.
+ * Therefore, it is often tuned for high throughput on large
+ * buffers and relatively low latency for mid-sized buffers
+ * (tens of bytes). However, the overhead for very small buffers
+ * (<10 bytes) is still high. Even passing the parameters, for
+ * instance, may take as long as copying 3 bytes.
+ *
+ * Because short copy sequences seem to be a common case in
+ * non-packed FSFS repositories, we copy them directly. Larger
+ * buffer sizes aren't hurt measurably by the exta 'if' clause.
+ *
+ * As a further optimization, we return a pointer to the first
+ * byte after the copied target range.
+ */
+static APR_INLINE char*
+fast_memcpy(char *target, const char *source, apr_size_t len)
+{
+  if (len > 7)
+    {
+      memcpy(target, source, len);
+      target += len;
+    }
+  else
+    {
+      /* memcpy is not exactly fast for small block sizes.
+         Since they are common, let's run optimized code for them. */
+      const char *end = source + len;
+      for (; source != end; source++)
+        *(target++) = *source;
+    }
 
+  return target;
+}
+
+/* Unlike memmove() or memcpy(), create repeating patterns when 
+ * source and target range overlap. Returns a pointer to the first
+ * byte after the copied target range.
+ */
+static APR_INLINE char*
+patterning_copy(char *target, const char *source, apr_size_t len)
+{
+  const char *end = source + len;
+
+  /* Copy in larger chunks if source and target don't overlap
+   * closer than the size of the chunks (or don't overlap at all).
+   * Use the natural machine word as chunk size
+   * (for some architectures size_t is even a bit larger).
+   */
+  if (end + sizeof(apr_size_t) <= target)
+    for (; source + sizeof (apr_size_t) <= end; 
+           source += sizeof (apr_size_t), 
+           target += sizeof (apr_size_t))
+      *(apr_size_t*)(target) = *(apr_size_t*)(source);
+
+  /* Copy trailing bytes */
+  for (; source != end; source++)
+    *(target++) = *source;
+
+  return target;
+}
+
 void
 svn_txdelta_apply_instructions(svn_txdelta_window_t *window,
                                const char *sbuf, char *tbuf,
                                apr_size_t *tlen)
 {
-  const svn_txdelta_op_t *op;
-  apr_size_t i, j, tpos = 0;
+  /* The main loop is run quite often.
+   * Pre-calculate as much data as possible.
+   */
+  const svn_txdelta_op_t *op = window->ops;
+  const svn_txdelta_op_t *last_op = window->ops + window->num_ops;
 
-  for (op = window->ops; op < window->ops + window->num_ops; op++)
+  /* target buffer range is limited by target buffer length 
+   * as well as the window length
+   */
+  apr_size_t to_fill = *tlen < window->tview_len ? *tlen : window->tview_len;
+
+  /* we copy data until left is exactly 0 */
+  apr_size_t left = to_fill;
+
+  /* where to copy to */
+  char *target = tbuf;
+
+  /* a temporary */
+  apr_size_t buf_len;
+
+  /* Process the window copy ops until we filled the target buffer. */
+  while (left > 0)
     {
-      const apr_size_t buf_len = (op->length < *tlen - tpos
-                                  ? op->length : *tlen - tpos);
+      /* If there were to few window ops, the value for window->tview_len 
+       * would be inconsistent with the sum of op->length values. 
+       */
+      assert(op < last_op);
 
       /* Check some invariants common to all instructions.  */
-      assert(tpos + op->length <= window->tview_len);
+      assert(to_fill - left + op->length <= window->tview_len);
 
+      /* under no circumstance will we write beyond the end 
+       * of the target buffer.
+       */
+      buf_len = op->length < left ? op->length : left;
+      left -= buf_len;
+
       switch (op->action_code)
         {
         case svn_txdelta_source:
           /* Copy from source area.  */
           assert(op->offset + op->length <= window->sview_len);
-          memcpy(tbuf + tpos, sbuf + op->offset, buf_len);
+          target = fast_memcpy(target, sbuf + op->offset, buf_len);
           break;
 
         case svn_txdelta_target:
-          /* Copy from target area.  Don't use memcpy() since its
-             semantics aren't guaranteed for overlapping memory areas,
-             and target copies are allowed to overlap to generate
-             repeated data.  */
-          assert(op->offset < tpos);
-          for (i = op->offset, j = tpos; i < op->offset + buf_len; i++)
-            tbuf[j++] = tbuf[i];
+          /* Copy from target area.  We can't use memcpy() or the like 
+             since we need a specific semantics for overlapping copies: 
+             they must result in repeating patterns.
+           */
+          assert(op->offset < target - *tbuf);
+          target = patterning_copy(target, tbuf + op->offset, buf_len);
           break;
 
         case svn_txdelta_new:
           /* Copy from window new area.  */
           assert(op->offset + op->length <= window->new_data->len);
-          memcpy(tbuf + tpos,
-                 window->new_data->data + op->offset,
-                 buf_len);
+          target = fast_memcpy(target, 
+                               window->new_data->data + op->offset,
+                               buf_len);
           break;
 
         default:
           assert(!"Invalid delta instruction code");
         }
 
-      tpos += op->length;
-      if (tpos >= *tlen)
-        return;                 /* The buffer is full. */
+      /* next operation */
+      ++op;
     }
 
-  /* Check that we produced the right amount of data.  */
-  assert(tpos == window->tview_len);
-  *tlen = tpos;
+  /* We always fill the whole target buffer, i.e. left gets exactly 0,
+   * unless we stumble across an invalid instruction (caught by debug 
+   * assertions only).
+   */
+  *tlen = to_fill;
 }
 
 /* This is a private interlibrary compatibility wrapper. */

Reply via email to