My bad; I had forgotten to wrap two of the counting loops in NULL
checks. This version fixes it. Thanks again for catching that bug,
Mark.

I've also figured out how to run the test C programs. The new
version passes all libsvn_diff tests.

I haven't changed the index/count types yet. What's the right type
to use to get signed 32 bit on 32-bit machines and signed 64 bit
on 64-bit machines?

Johan, there are no heuristics involved, but I'll discuss that in detail
in a new thread for that patch.

When diffing each of the first 200 versions of merge.c against the
previous version, the total running time is 14 secs for original
version and 12 secs for patched for release, 29 secs and 19 secs
for debug. In this case most of the time is spent reading
the tokens from file anyway, so it doesn't really test the speed of
the LCS algorithm itself.


Morten
Index: subversion/libsvn_diff/diff.c
===================================================================
--- subversion/libsvn_diff/diff.c       (revision 1128318)
+++ subversion/libsvn_diff/diff.c       (working copy)
@@ -105,6 +105,10 @@
 {
   svn_diff__tree_t *tree;
   svn_diff__position_t *position_list[2];
+  apr_uint32_t num_tokens;
+  apr_uint32_t token_index;
+  svn_diff__position_t *current;
+  apr_uint32_t *token_counts[2];
   svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
                                         svn_diff_datasource_modified};
   svn_diff__lcs_t *lcs;
@@ -138,6 +142,8 @@
                                prefix_lines,
                                subpool));
 
+  num_tokens = svn_diff__get_node_count(tree);
+
   /* The cool part is that we don't need the tokens anymore.
    * Allow the app to clean them up if it wants to.
    */
@@ -147,8 +153,35 @@
   /* We don't need the nodes in the tree either anymore, nor the tree itself */
   svn_pool_destroy(treepool);
 
+  token_counts[0] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+  token_counts[1] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+  for (token_index = 0; token_index < num_tokens; token_index++)
+    token_counts[0][token_index] = token_counts[1][token_index] = 0;
+
+  current = position_list[0];
+  if (current != NULL)
+    {
+      do
+        {
+          token_counts[0][current->node]++;
+          current = current->next;
+        }
+      while (current != position_list[0]);
+    }
+  current = position_list[1];
+  if (current != NULL)
+    {
+      do
+        {
+          token_counts[1][current->node]++;
+          current = current->next;
+        }
+      while (current != position_list[1]);
+    }
+
   /* Get the lcs */
-  lcs = svn_diff__lcs(position_list[0], position_list[1], prefix_lines,
+  lcs = svn_diff__lcs(position_list[0], position_list[1], token_counts[0],
+                         token_counts[1], num_tokens, prefix_lines,
                       suffix_lines, subpool);
 
   /* Produce the diff */
Index: subversion/libsvn_diff/diff.h
===================================================================
--- subversion/libsvn_diff/diff.h       (revision 1128318)
+++ subversion/libsvn_diff/diff.h       (working copy)
@@ -62,7 +62,7 @@
 struct svn_diff__position_t
 {
   svn_diff__position_t *next;
-  svn_diff__node_t     *node;
+  apr_int32_t          node;
   apr_off_t             offset;
 };
 
@@ -91,12 +91,21 @@
 svn_diff__lcs_t *
 svn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring) 
*/
               svn_diff__position_t *position_list2, /* pointer to tail (ring) 
*/
+              apr_uint32_t *token_counts_list1, /* array of counts */
+              apr_uint32_t *token_counts_list2, /* array of counts */
+              apr_uint32_t num_tokens,
               apr_off_t prefix_lines,
               apr_off_t suffix_lines,
               apr_pool_t *pool);
 
 
 /*
+ * Returns number of tokens in a tree
+ */
+apr_uint32_t
+svn_diff__get_node_count(svn_diff__tree_t *tree);
+
+/*
  * Support functions to build a tree of token positions
  */
 void
@@ -128,6 +137,7 @@
 svn_diff__resolve_conflict(svn_diff_t *hunk,
                            svn_diff__position_t **position_list1,
                            svn_diff__position_t **position_list2,
+                           apr_uint32_t num_tokens,
                            apr_pool_t *pool);
 
 
Index: subversion/libsvn_diff/diff3.c
===================================================================
--- subversion/libsvn_diff/diff3.c      (revision 1128318)
+++ subversion/libsvn_diff/diff3.c      (working copy)
@@ -38,6 +38,7 @@
 svn_diff__resolve_conflict(svn_diff_t *hunk,
                            svn_diff__position_t **position_list1,
                            svn_diff__position_t **position_list2,
+                           apr_uint32_t num_tokens,
                            apr_pool_t *pool)
 {
     apr_off_t modified_start = hunk->modified_start + 1;
@@ -47,6 +48,9 @@
     apr_off_t latest_length = hunk->latest_length;
     svn_diff__position_t *start_position[2];
     svn_diff__position_t *position[2];
+    apr_uint32_t token_index;
+    svn_diff__position_t *current;
+    apr_uint32_t *token_counts[2];
     svn_diff__lcs_t *lcs = NULL;
     svn_diff__lcs_t **lcs_ref = &lcs;
     svn_diff_t **diff_ref = &hunk->resolved_diff;
@@ -173,9 +177,35 @@
         position[1]->next = start_position[1];
       }
 
-    *lcs_ref = svn_diff__lcs(position[0], position[1], 0, 0,
-                             subpool);
+    token_counts[0] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+    token_counts[1] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+    for (token_index = 0; token_index < num_tokens; token_index++)
+      token_counts[0][token_index] = token_counts[1][token_index] = 0;
 
+    current = position[0];
+    if (current != NULL)
+      {
+        do
+          {
+            token_counts[0][current->node]++;
+            current = current->next;
+          }
+        while (current != position[0]);
+      }
+    current = position[1];
+    if (current != NULL)
+      {
+        do
+          {
+            token_counts[1][current->node]++;
+            current = current->next;
+          }
+        while (current !=  position[1]);
+      }
+
+    *lcs_ref = svn_diff__lcs(position[0], position[1], token_counts[0],
+                             token_counts[1], num_tokens, 0, 0, subpool);
+
     /* Fix up the EOF lcs element in case one of
      * the two sequences was NULL.
      */
@@ -251,6 +281,10 @@
 {
   svn_diff__tree_t *tree;
   svn_diff__position_t *position_list[3];
+  apr_uint32_t num_tokens;
+  apr_uint32_t token_index;
+  svn_diff__position_t *current;
+  apr_uint32_t *token_counts[3];
   svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
                                         svn_diff_datasource_modified,
                                         svn_diff_datasource_latest};
@@ -292,6 +326,8 @@
                                prefix_lines,
                                subpool));
 
+  num_tokens = svn_diff__get_node_count(tree);
+
   /* Get rid of the tokens, we don't need them to calc the diff */
   if (vtable->token_discard_all != NULL)
     vtable->token_discard_all(diff_baton);
@@ -299,10 +335,52 @@
   /* We don't need the nodes in the tree either anymore, nor the tree itself */
   svn_pool_destroy(treepool);
 
+  token_counts[0] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+  token_counts[1] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+  token_counts[2] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+  for (token_index = 0; token_index < num_tokens; token_index++)
+    {
+      token_counts[0][token_index] = token_counts[1][token_index] = 0;
+      token_counts[2][token_index] = 0;
+    }
+
+  current = position_list[0];
+  if (current != NULL)
+    {
+      do
+        {
+          token_counts[0][current->node]++;
+          current = current->next;
+        }
+      while (current != position_list[0]);
+    }
+  current = position_list[1];
+  if (current != NULL)
+    {
+      do
+        {
+          token_counts[1][current->node]++;
+          current = current->next;
+        }
+      while (current != position_list[1]);
+    }
+  current = position_list[2];
+  if (current != NULL)
+    {
+      do
+        {
+          token_counts[2][current->node]++;
+          current = current->next;
+        }
+      while (current != position_list[2]);
+    }
+
   /* Get the lcs for original-modified and original-latest */
-  lcs_om = svn_diff__lcs(position_list[0], position_list[1], prefix_lines,
+  lcs_om = svn_diff__lcs(position_list[0], position_list[1], token_counts[0],
+                         token_counts[1], num_tokens, prefix_lines,
                          suffix_lines, subpool);
-  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], prefix_lines,
+  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], token_counts[0],
+                         token_counts[2], num_tokens, prefix_lines,
                          suffix_lines, subpool);
 
   /* Produce a merged diff */
@@ -435,6 +513,7 @@
                 svn_diff__resolve_conflict(*diff_ref,
                                            &position_list[1],
                                            &position_list[2],
+                                           num_tokens,
                                            pool);
               }
             else if (is_modified)
Index: subversion/libsvn_diff/diff4.c
===================================================================
--- subversion/libsvn_diff/diff4.c      (revision 1128318)
+++ subversion/libsvn_diff/diff4.c      (working copy)
@@ -173,6 +173,10 @@
 {
   svn_diff__tree_t *tree;
   svn_diff__position_t *position_list[4];
+  apr_uint32_t num_tokens;
+  apr_uint32_t token_index;
+  svn_diff__position_t *current;
+  apr_uint32_t *token_counts[4];
   svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
                                         svn_diff_datasource_modified,
                                         svn_diff_datasource_latest,
@@ -227,6 +231,8 @@
                                prefix_lines,
                                subpool2));
 
+  num_tokens = svn_diff__get_node_count(tree);
+
   /* Get rid of the tokens, we don't need them to calc the diff */
   if (vtable->token_discard_all != NULL)
     vtable->token_discard_all(diff_baton);
@@ -234,8 +240,61 @@
   /* We don't need the nodes in the tree either anymore, nor the tree itself */
   svn_pool_clear(subpool3);
 
+  token_counts[0] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+  token_counts[1] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+  token_counts[2] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+  token_counts[3] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+  for (token_index = 0; token_index < num_tokens; token_index++)
+    {
+      token_counts[0][token_index] = token_counts[1][token_index] = 0;
+      token_counts[2][token_index] = token_counts[3][token_index] = 0;
+    }
+
+  current = position_list[0];
+  if (current != NULL)
+    {
+      do
+        {
+          token_counts[0][current->node]++;
+          current = current->next;
+        }
+      while (current != position_list[0]);
+    }
+  current = position_list[1];
+  if (current != NULL)
+    {
+      do
+        {
+          token_counts[1][current->node]++;
+          current = current->next;
+        }
+      while (current != position_list[1]);
+    }
+  current = position_list[2];
+  if (current != NULL)
+    {
+      do
+        {
+          token_counts[2][current->node]++;
+          current = current->next;
+        }
+      while (current != position_list[2]);
+    }
+  current = position_list[3];
+  if (current != NULL)
+    {
+      do
+        {
+          token_counts[3][current->node]++;
+          current = current->next;
+        }
+      while (current != position_list[3]);
+    }
+
   /* Get the lcs for original - latest */
-  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], prefix_lines,
+  lcs_ol = svn_diff__lcs(position_list[0], position_list[2],
+                         token_counts[0], token_counts[2],
+                         num_tokens, prefix_lines,
                          suffix_lines, subpool3);
   diff_ol = svn_diff__diff(lcs_ol, 1, 1, TRUE, pool);
 
@@ -257,7 +316,9 @@
   /* Get the lcs for common ancestor - original
    * Do reverse adjustements
    */
-  lcs_adjust = svn_diff__lcs(position_list[3], position_list[2], prefix_lines,
+  lcs_adjust = svn_diff__lcs(position_list[3], position_list[2],
+                             token_counts[3], token_counts[2],
+                             num_tokens, prefix_lines,
                              suffix_lines, subpool3);
   diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
   adjust_diff(diff_ol, diff_adjust);
@@ -267,7 +328,9 @@
   /* Get the lcs for modified - common ancestor
    * Do forward adjustments
    */
-  lcs_adjust = svn_diff__lcs(position_list[1], position_list[3], prefix_lines,
+  lcs_adjust = svn_diff__lcs(position_list[1], position_list[3],
+                             token_counts[1], token_counts[3],
+                             num_tokens, prefix_lines,
                              suffix_lines, subpool3);
   diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
   adjust_diff(diff_ol, diff_adjust);
@@ -283,7 +346,7 @@
       if (hunk->type == svn_diff__type_conflict)
         {
           svn_diff__resolve_conflict(hunk, &position_list[1],
-                                     &position_list[2], pool);
+                                     &position_list[2], num_tokens, pool);
         }
     }
 
Index: subversion/libsvn_diff/lcs.c
===================================================================
--- subversion/libsvn_diff/lcs.c        (revision 1128318)
+++ subversion/libsvn_diff/lcs.c        (working copy)
@@ -51,6 +51,7 @@
 svn_diff__snake(apr_off_t k,
                 svn_diff__snake_t *fp,
                 int idx,
+                               apr_uint32_t *token_counts[2],
                 svn_diff__lcs_t **freelist,
                 apr_pool_t *pool)
 {
@@ -92,6 +93,11 @@
     }
 
 
+  if (previous_lcs)
+    {
+      previous_lcs->refcount++;
+    }
+
   /* ### Optimization, skip all positions that don't have matchpoints
    * ### anyway. Beware of the sentinel, don't skip it!
    */
@@ -99,6 +105,8 @@
   position[0] = start_position[0];
   position[1] = start_position[1];
 
+  while (1)
+    {
   while (position[0]->node == position[1]->node)
     {
       position[0] = position[0]->next;
@@ -122,18 +130,19 @@
       lcs->length = position[1]->offset - start_position[1]->offset;
       lcs->next = previous_lcs;
       lcs->refcount = 1;
-      fp[k].lcs = lcs;
+      previous_lcs = lcs;
+      start_position[0] = position[0];
+      start_position[1] = position[1];
     }
-  else
-    {
-      fp[k].lcs = previous_lcs;
+      if (position[0]->node >= 0 && token_counts[1][position[0]->node] == 0)
+        start_position[0] = position[0] = position[0]->next;
+      else if (position[1]->node >= 0 && token_counts[0][position[1]->node] == 
0)
+        start_position[1] = position[1] = position[1]->next;
+      else
+        break;
     }
 
-  if (previous_lcs)
-    {
-      previous_lcs->refcount++;
-    }
-
+  fp[k].lcs = previous_lcs;
   fp[k].position[0] = position[0];
   fp[k].position[1] = position[1];
 
@@ -188,12 +197,18 @@
 svn_diff__lcs_t *
 svn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring) 
*/
               svn_diff__position_t *position_list2, /* pointer to tail (ring) 
*/
+              apr_uint32_t *token_counts_list1, /* array of counts */
+              apr_uint32_t *token_counts_list2, /* array of counts */
+              apr_uint32_t num_tokens,
               apr_off_t prefix_lines,
               apr_off_t suffix_lines,
               apr_pool_t *pool)
 {
   int idx;
   apr_off_t length[2];
+  apr_uint32_t *token_counts[2];
+  apr_uint32_t unique_count[2];
+  apr_uint32_t token_index;
   svn_diff__snake_t *fp;
   apr_off_t d;
   apr_off_t k;
@@ -231,10 +246,19 @@
       return lcs;
     }
 
+  unique_count[1] = unique_count[0] = 0;
+  for (token_index = 0; token_index < num_tokens; token_index++)
+    {
+      if (token_counts_list1[token_index] == 0)
+        unique_count[1] += token_counts_list2[token_index];
+      if (token_counts_list2[token_index] == 0)
+        unique_count[0] += token_counts_list1[token_index];
+    }
+
   /* Calculate length of both sequences to be compared */
   length[0] = position_list1->offset - position_list1->next->offset + 1;
   length[1] = position_list2->offset - position_list2->next->offset + 1;
-  idx = length[0] > length[1] ? 1 : 0;
+  idx = length[0] - unique_count[0] > length[1] - unique_count[1] ? 1 : 0;
 
   /* strikerXXX: here we allocate the furthest point array, which is
    * strikerXXX: sized M + N + 3 (!)
@@ -246,18 +270,20 @@
   sentinel_position[idx].next = position_list1->next;
   position_list1->next = &sentinel_position[idx];
   sentinel_position[idx].offset = position_list1->offset + 1;
+  token_counts[idx] = token_counts_list1;
 
   sentinel_position[abs(1 - idx)].next = position_list2->next;
   position_list2->next = &sentinel_position[abs(1 - idx)];
   sentinel_position[abs(1 - idx)].offset = position_list2->offset + 1;
+  token_counts[abs(1 - idx)] = token_counts_list2;
 
-  /* These are never dereferenced, only compared by value, so we
-   * can safely fake these up and the void* cast is OK.
+  /* Negative indices will not be used elsewhere
    */
-  sentinel_position[0].node = (void*)&sentinel_position[0];
-  sentinel_position[1].node = (void*)&sentinel_position[1];
+  sentinel_position[0].node = -1;
+  sentinel_position[1].node = -2;
 
-  d = length[abs(1 - idx)] - length[idx];
+  d = length[abs(1 - idx)] - unique_count[abs(1 - idx)]
+      - (length[idx] - unique_count[idx]);
 
   /* k = -1 will be the first to be used to get previous
    * position information from, make sure it holds sane
@@ -272,12 +298,12 @@
       /* Forward */
       for (k = -p; k < d; k++)
         {
-          svn_diff__snake(k, fp, idx, &lcs_freelist, pool);
+          svn_diff__snake(k, fp, idx, token_counts, &lcs_freelist, pool);
         }
 
       for (k = d + p; k >= d; k--)
         {
-          svn_diff__snake(k, fp, idx, &lcs_freelist, pool);
+          svn_diff__snake(k, fp, idx, token_counts, &lcs_freelist, pool);
         }
 
       p++;
Index: subversion/libsvn_diff/token.c
===================================================================
--- subversion/libsvn_diff/token.c      (revision 1128318)
+++ subversion/libsvn_diff/token.c      (working copy)
@@ -46,6 +46,7 @@
   svn_diff__node_t     *right;
 
   apr_uint32_t          hash;
+  apr_int32_t           index;
   void                 *token;
 };
 
@@ -53,10 +54,20 @@
 {
   svn_diff__node_t     *root[SVN_DIFF__HASH_SIZE];
   apr_pool_t           *pool;
+  apr_uint32_t          node_count;
 };
 
 
 /*
+ * Returns number of tokens in a tree
+ */
+apr_uint32_t
+svn_diff__get_node_count(svn_diff__tree_t *tree)
+{
+  return tree->node_count;
+}
+
+/*
  * Support functions to build a tree of token positions
  */
 
@@ -65,6 +76,7 @@
 {
   *tree = apr_pcalloc(pool, sizeof(**tree));
   (*tree)->pool = pool;
+  (*tree)->node_count = 0;
 }
 
 
@@ -122,6 +134,7 @@
   new_node->right = NULL;
   new_node->hash = hash;
   new_node->token = token;
+  new_node->index = tree->node_count++;
 
   *node = *node_ref = new_node;
 
@@ -168,7 +181,7 @@
       /* Create a new position */
       position = apr_palloc(pool, sizeof(*position));
       position->next = NULL;
-      position->node = node;
+      position->node = node->index;
       position->offset = offset;
 
       *position_ref = position;

Reply via email to