On Wed, May 18, 2011 at 8:33 AM, Johan Corveleyn <jcor...@gmail.com> wrote:
> On Wed, May 18, 2011 at 1:56 AM, Morten Kloster <mor...@gmail.com> wrote:
>> Log message:
>>
>> Speed-up of libsvn_diff using token counts
>> By using indices, not node pointers, to refer to tokens, and counting
>> the number of each token, the longest common subsequence (lcs)
>> algorithm at the heart of libsvn_diff becomes much faster in many
>> situations by skipping tokens that are unique to one source or the other.
>>
>> * subversion/libsvn_diff/token.c
>>  (svn_diff__node_t): Added 'index' member.
>>  (svn_diff__tree_t): Added 'node_count' member.
>>  (svn_diff__get_token_count): New method.
>>  (tree_insert_token): Assigns indices to nodes.
>>  (svn_diff__get_tokens): Uses index, not pointer,
>>   to identify node in position struct.
>>
>> * subversion/libsvn_diff/lcs.c
>>  (svn_diff__snake): Takes token counts as additional argument.
>>   Skips tokens that are unique to one or the other file.
>>  (svn_diff__lcs): Takes token counts and number of different tokens
>>   as additional arguments. Calculates number of tokens unique to
>>   each source and uses this to compute effective length difference.
>>
>> * subversion/libsvn_diff/diff.h
>>  (svn_diff__position_t): Changed node member from being
>>   a pointer to the node to an integer index for the node.
>>  Updated method declarations.
>>
>> * subversion/libsvn_diff/diff.c
>> * subversion/libsvn_diff/diff3.c
>> * subversion/libsvn_diff/diff4.c
>>  (svn_diff_diff_2, svn_diff_diff_3, svn_diff_diff_4): Count the
>>   number of times each token appears in each source.
>>  (svn_diff__resolve_conflict): Takes number of different tokens
>>   as additional argument. Counts the number of times
>>   each token appears in each (partial) source.
>>
>>
>>
>>
>>
>>
>> Comments:
>> This patch can reduce the time required for a diff by a large factor
>> when comparing long files with large differences. As an extreme case,
>> splitting sqlite.c into two roughly equal parts and diffing them against
>> each other took about a minute on my laptop with the original code,
>> and 7-8 seconds with my patch. The speed-up is gained only if a
>> large fraction of the changed tokens are unique to one source or the
>> other. I can not see that the change would cause a significant
>> performance degradation in any reasonable case.
>>
>> Counting the number of times each token appears is also useful for
>> potential additional features in the future, such as identifying moved
>> blocks outside of the longest common subsequence.
>>
>> I have tested only svn_diff_diff_2 (that it gives identical results as
>> before), but the changes to svn_diff_diff3 and svn_diff_diff4 are
>> completely analogous. Tested on a Windows 7 64-bit machine,
>> TortoiseSVN build script.
>>
>>
>> Morten Kloster (first patch)
>
> Interesting! The patch seems to be missing though. The list software
> tends to strip off attachments if they don't have a text/plain
> mime-type. Can you try sending it again with a .txt extension?
>
> --
> Johan
>

Sorry, here's the patch with .txt extension. I found a bug in the
first version anyway, so it was just as well. :)

And of course I meant sqlite3.c for the stress test.

Morten
Index: subversion/libsvn_diff/diff.c
===================================================================
--- subversion/libsvn_diff/diff.c       (revision 1104713)
+++ subversion/libsvn_diff/diff.c       (working copy)
@@ -105,6 +105,10 @@
 {
   svn_diff__tree_t *tree;
   svn_diff__position_t *position_list[2];
+  apr_uint32_t num_tokens;
+  apr_uint32_t token_index;
+  svn_diff__position_t *current;
+  apr_uint32_t *token_counts[2];
   svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
                                         svn_diff_datasource_modified};
   svn_diff__lcs_t *lcs;
@@ -138,6 +142,8 @@
                                prefix_lines,
                                subpool));
 
+  num_tokens = svn_diff__get_node_count(tree);
+
   /* The cool part is that we don't need the tokens anymore.
    * Allow the app to clean them up if it wants to.
    */
@@ -147,8 +153,35 @@
   /* We don't need the nodes in the tree either anymore, nor the tree itself */
   svn_pool_destroy(treepool);
 
+  token_counts[0] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+  token_counts[1] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+  for (token_index = 0; token_index < num_tokens; token_index++)
+    token_counts[0][token_index] = token_counts[1][token_index] = 0;
+
+  current = position_list[0];
+  if (current != NULL)
+    {
+      do
+        {
+          token_counts[0][current->node]++;
+          current = current->next;
+        }
+      while (current != position_list[0]);
+    }
+  current = position_list[1];
+  if (current != NULL)
+    {
+      do
+        {
+          token_counts[1][current->node]++;
+          current = current->next;
+        }
+      while (current != position_list[1]);
+    }
+
   /* Get the lcs */
-  lcs = svn_diff__lcs(position_list[0], position_list[1], prefix_lines,
+  lcs = svn_diff__lcs(position_list[0], position_list[1], token_counts[0],
+                         token_counts[1], num_tokens, prefix_lines,
                       suffix_lines, subpool);
 
   /* Produce the diff */
Index: subversion/libsvn_diff/diff.h
===================================================================
--- subversion/libsvn_diff/diff.h       (revision 1104713)
+++ subversion/libsvn_diff/diff.h       (working copy)
@@ -62,7 +62,7 @@
 struct svn_diff__position_t
 {
   svn_diff__position_t *next;
-  svn_diff__node_t     *node;
+  apr_int32_t          node;
   apr_off_t             offset;
 };
 
@@ -91,12 +91,21 @@
 svn_diff__lcs_t *
 svn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring) 
*/
               svn_diff__position_t *position_list2, /* pointer to tail (ring) 
*/
+              apr_uint32_t *token_counts_list1, /* array of counts */
+              apr_uint32_t *token_counts_list2, /* array of counts */
+              apr_uint32_t num_tokens,
               apr_off_t prefix_lines,
               apr_off_t suffix_lines,
               apr_pool_t *pool);
 
 
 /*
+ * Returns number of tokens in a tree
+ */
+apr_uint32_t
+svn_diff__get_node_count(svn_diff__tree_t *tree);
+
+/*
  * Support functions to build a tree of token positions
  */
 void
@@ -128,6 +137,7 @@
 svn_diff__resolve_conflict(svn_diff_t *hunk,
                            svn_diff__position_t **position_list1,
                            svn_diff__position_t **position_list2,
+                           apr_uint32_t num_tokens,
                            apr_pool_t *pool);
 
 
Index: subversion/libsvn_diff/diff3.c
===================================================================
--- subversion/libsvn_diff/diff3.c      (revision 1104713)
+++ subversion/libsvn_diff/diff3.c      (working copy)
@@ -38,208 +38,232 @@
 svn_diff__resolve_conflict(svn_diff_t *hunk,
                            svn_diff__position_t **position_list1,
                            svn_diff__position_t **position_list2,
+                           apr_uint32_t num_tokens,
                            apr_pool_t *pool)
 {
-    apr_off_t modified_start = hunk->modified_start + 1;
-    apr_off_t latest_start = hunk->latest_start + 1;
-    apr_off_t common_length;
-    apr_off_t modified_length = hunk->modified_length;
-    apr_off_t latest_length = hunk->latest_length;
-    svn_diff__position_t *start_position[2];
-    svn_diff__position_t *position[2];
-    svn_diff__lcs_t *lcs = NULL;
-    svn_diff__lcs_t **lcs_ref = &lcs;
-    svn_diff_t **diff_ref = &hunk->resolved_diff;
-    apr_pool_t *subpool;
+  apr_off_t modified_start = hunk->modified_start + 1;
+  apr_off_t latest_start = hunk->latest_start + 1;
+  apr_off_t common_length;
+  apr_off_t modified_length = hunk->modified_length;
+  apr_off_t latest_length = hunk->latest_length;
+  svn_diff__position_t *start_position[2];
+  svn_diff__position_t *position[2];
+  apr_uint32_t token_index;
+  svn_diff__position_t *current;
+  apr_uint32_t *token_counts[2];
+  svn_diff__lcs_t *lcs = NULL;
+  svn_diff__lcs_t **lcs_ref = &lcs;
+  svn_diff_t **diff_ref = &hunk->resolved_diff;
+  apr_pool_t *subpool;
 
-    /* First find the starting positions for the
-     * comparison
-     */
+  /* First find the starting positions for the
+   * comparison
+   */
 
-    start_position[0] = *position_list1;
-    start_position[1] = *position_list2;
+  start_position[0] = *position_list1;
+  start_position[1] = *position_list2;
 
-    while (start_position[0]->offset < modified_start)
-      start_position[0] = start_position[0]->next;
+  while (start_position[0]->offset < modified_start)
+    start_position[0] = start_position[0]->next;
 
-    while (start_position[1]->offset < latest_start)
-      start_position[1] = start_position[1]->next;
+  while (start_position[1]->offset < latest_start)
+    start_position[1] = start_position[1]->next;
 
-    position[0] = start_position[0];
-    position[1] = start_position[1];
+  position[0] = start_position[0];
+  position[1] = start_position[1];
 
-    common_length = modified_length < latest_length
-                  ? modified_length : latest_length;
+  common_length = modified_length < latest_length
+                ? modified_length : latest_length;
 
-    while (common_length > 0
-           && position[0]->node == position[1]->node)
-      {
-        position[0] = position[0]->next;
-        position[1] = position[1]->next;
+  while (common_length > 0
+         && position[0]->node == position[1]->node)
+    {
+      position[0] = position[0]->next;
+      position[1] = position[1]->next;
 
-        common_length--;
-      }
+      common_length--;
+    }
 
-    if (common_length == 0
-        && modified_length == latest_length)
-      {
-        hunk->type = svn_diff__type_diff_common;
-        hunk->resolved_diff = NULL;
+  if (common_length == 0
+      && modified_length == latest_length)
+    {
+      hunk->type = svn_diff__type_diff_common;
+      hunk->resolved_diff = NULL;
 
-        *position_list1 = position[0];
-        *position_list2 = position[1];
+      *position_list1 = position[0];
+      *position_list2 = position[1];
 
-        return;
-      }
+      return;
+    }
 
-    hunk->type = svn_diff__type_conflict;
+  hunk->type = svn_diff__type_conflict;
 
-    /* ### If we have a conflict we can try to find the
-     * ### common parts in it by getting an lcs between
-     * ### modified (start to start + length) and
-     * ### latest (start to start + length).
-     * ### We use this lcs to create a simple diff.  Only
-     * ### where there is a diff between the two, we have
-     * ### a conflict.
-     * ### This raises a problem; several common diffs and
-     * ### conflicts can occur within the same original
-     * ### block.  This needs some thought.
-     * ###
-     * ### NB: We can use the node _pointers_ to identify
-     * ###     different tokens
-     */
+  /* ### If we have a conflict we can try to find the
+   * ### common parts in it by getting an lcs between
+   * ### modified (start to start + length) and
+   * ### latest (start to start + length).
+   * ### We use this lcs to create a simple diff.  Only
+   * ### where there is a diff between the two, we have
+   * ### a conflict.
+   * ### This raises a problem; several common diffs and
+   * ### conflicts can occur within the same original
+   * ### block.  This needs some thought.
+   * ###
+   * ### NB: We can use the node _pointers_ to identify
+   * ###     different tokens
+   */
 
-    subpool = svn_pool_create(pool);
+  subpool = svn_pool_create(pool);
 
-    /* Calculate how much of the two sequences was
-     * actually the same.
-     */
-    common_length = (modified_length < latest_length
-                    ? modified_length : latest_length)
-                  - common_length;
+  /* Calculate how much of the two sequences was
+   * actually the same.
+   */
+  common_length = (modified_length < latest_length
+                  ? modified_length : latest_length)
+                - common_length;
 
-    /* If there were matching symbols at the start of
-     * both sequences, record that fact.
-     */
-    if (common_length > 0)
-      {
-        lcs = apr_palloc(subpool, sizeof(*lcs));
-        lcs->next = NULL;
-        lcs->position[0] = start_position[0];
-        lcs->position[1] = start_position[1];
-        lcs->length = common_length;
+  /* If there were matching symbols at the start of
+   * both sequences, record that fact.
+   */
+  if (common_length > 0)
+    {
+      lcs = apr_palloc(subpool, sizeof(*lcs));
+      lcs->next = NULL;
+      lcs->position[0] = start_position[0];
+      lcs->position[1] = start_position[1];
+      lcs->length = common_length;
 
-        lcs_ref = &lcs->next;
-      }
+      lcs_ref = &lcs->next;
+    }
 
-    modified_length -= common_length;
-    latest_length -= common_length;
+  modified_length -= common_length;
+  latest_length -= common_length;
 
-    modified_start = start_position[0]->offset;
-    latest_start = start_position[1]->offset;
+  modified_start = start_position[0]->offset;
+  latest_start = start_position[1]->offset;
 
-    start_position[0] = position[0];
-    start_position[1] = position[1];
+  start_position[0] = position[0];
+  start_position[1] = position[1];
 
-    /* Create a new ring for svn_diff__lcs to grok.
-     * We can safely do this given we don't need the
-     * positions we processed anymore.
-     */
-    if (modified_length == 0)
-      {
-        *position_list1 = position[0];
-        position[0] = NULL;
-      }
-    else
-      {
-        while (--modified_length)
-          position[0] = position[0]->next;
+  /* Create a new ring for svn_diff__lcs to grok.
+   * We can safely do this given we don't need the
+   * positions we processed anymore.
+   */
+  if (modified_length == 0)
+    {
+      *position_list1 = position[0];
+      position[0] = NULL;
+    }
+  else
+    {
+      while (--modified_length)
+        position[0] = position[0]->next;
 
-        *position_list1 = position[0]->next;
-        position[0]->next = start_position[0];
-      }
+      *position_list1 = position[0]->next;
+      position[0]->next = start_position[0];
+    }
 
-    if (latest_length == 0)
-      {
-        *position_list2 = position[1];
-        position[1] = NULL;
-      }
-    else
-      {
-        while (--latest_length)
-          position[1] = position[1]->next;
+  if (latest_length == 0)
+    {
+      *position_list2 = position[1];
+      position[1] = NULL;
+    }
+  else
+    {
+      while (--latest_length)
+        position[1] = position[1]->next;
 
-        *position_list2 = position[1]->next;
-        position[1]->next = start_position[1];
-      }
+      *position_list2 = position[1]->next;
+      position[1]->next = start_position[1];
+    }
 
-    *lcs_ref = svn_diff__lcs(position[0], position[1], 0, 0,
-                             subpool);
+  token_counts[0] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+  token_counts[1] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+  for (token_index = 0; token_index < num_tokens; token_index++)
+    token_counts[0][token_index] = token_counts[1][token_index] = 0;
 
-    /* Fix up the EOF lcs element in case one of
-     * the two sequences was NULL.
-     */
-    if ((*lcs_ref)->position[0]->offset == 1)
-      (*lcs_ref)->position[0] = *position_list1;
+  current = position[0];
+  do
+    {
+      token_counts[0][current->node]++;
+      current = current->next;
+    }
+  while (current != position[0]);
+  current = position[1];
+  do
+    {
+      token_counts[1][current->node]++;
+      current = current->next;
+    }
+  while (current != position[1]);
 
-    if ((*lcs_ref)->position[1]->offset == 1)
-      (*lcs_ref)->position[1] = *position_list2;
+  *lcs_ref = svn_diff__lcs(position[0], position[1], token_counts[0],
+                           token_counts[1], num_tokens, 0, 0, subpool);
 
-    /* Restore modified_length and latest_length */
-    modified_length = hunk->modified_length;
-    latest_length = hunk->latest_length;
+  /* Fix up the EOF lcs element in case one of
+   * the two sequences was NULL.
+   */
+  if ((*lcs_ref)->position[0]->offset == 1)
+    (*lcs_ref)->position[0] = *position_list1;
 
-    /* Produce the resolved diff */
-    while (1)
-      {
-        if (modified_start < lcs->position[0]->offset
-            || latest_start < lcs->position[1]->offset)
-          {
-            (*diff_ref) = apr_palloc(pool, sizeof(**diff_ref));
+  if ((*lcs_ref)->position[1]->offset == 1)
+    (*lcs_ref)->position[1] = *position_list2;
 
-            (*diff_ref)->type = svn_diff__type_conflict;
-            (*diff_ref)->original_start = hunk->original_start;
-            (*diff_ref)->original_length = hunk->original_length;
-            (*diff_ref)->modified_start = modified_start - 1;
-            (*diff_ref)->modified_length = lcs->position[0]->offset
+  /* Restore modified_length and latest_length */
+  modified_length = hunk->modified_length;
+  latest_length = hunk->latest_length;
+
+  /* Produce the resolved diff */
+  while (1)
+    {
+      if (modified_start < lcs->position[0]->offset
+          || latest_start < lcs->position[1]->offset)
+        {
+          (*diff_ref) = apr_palloc(pool, sizeof(**diff_ref));
+
+          (*diff_ref)->type = svn_diff__type_conflict;
+          (*diff_ref)->original_start = hunk->original_start;
+          (*diff_ref)->original_length = hunk->original_length;
+          (*diff_ref)->modified_start = modified_start - 1;
+          (*diff_ref)->modified_length = lcs->position[0]->offset
                                            - modified_start;
-            (*diff_ref)->latest_start = latest_start - 1;
-            (*diff_ref)->latest_length = lcs->position[1]->offset
+          (*diff_ref)->latest_start = latest_start - 1;
+          (*diff_ref)->latest_length = lcs->position[1]->offset
                                          - latest_start;
-            (*diff_ref)->resolved_diff = NULL;
+          (*diff_ref)->resolved_diff = NULL;
 
-            diff_ref = &(*diff_ref)->next;
-          }
+          diff_ref = &(*diff_ref)->next;
+        }
 
-        /* Detect the EOF */
-        if (lcs->length == 0)
-          break;
+      /* Detect the EOF */
+      if (lcs->length == 0)
+        break;
 
-        modified_start = lcs->position[0]->offset;
-        latest_start = lcs->position[1]->offset;
+      modified_start = lcs->position[0]->offset;
+      latest_start = lcs->position[1]->offset;
 
-        (*diff_ref) = apr_palloc(pool, sizeof(**diff_ref));
+      (*diff_ref) = apr_palloc(pool, sizeof(**diff_ref));
 
-        (*diff_ref)->type = svn_diff__type_diff_common;
-        (*diff_ref)->original_start = hunk->original_start;
-        (*diff_ref)->original_length = hunk->original_length;
-        (*diff_ref)->modified_start = modified_start - 1;
-        (*diff_ref)->modified_length = lcs->length;
-        (*diff_ref)->latest_start = latest_start - 1;
-        (*diff_ref)->latest_length = lcs->length;
-        (*diff_ref)->resolved_diff = NULL;
+      (*diff_ref)->type = svn_diff__type_diff_common;
+      (*diff_ref)->original_start = hunk->original_start;
+      (*diff_ref)->original_length = hunk->original_length;
+      (*diff_ref)->modified_start = modified_start - 1;
+      (*diff_ref)->modified_length = lcs->length;
+      (*diff_ref)->latest_start = latest_start - 1;
+      (*diff_ref)->latest_length = lcs->length;
+      (*diff_ref)->resolved_diff = NULL;
 
-        diff_ref = &(*diff_ref)->next;
+      diff_ref = &(*diff_ref)->next;
 
-        modified_start += lcs->length;
-        latest_start += lcs->length;
+      modified_start += lcs->length;
+      latest_start += lcs->length;
 
-        lcs = lcs->next;
-      }
+      lcs = lcs->next;
+    }
 
-    *diff_ref = NULL;
+  *diff_ref = NULL;
 
-    svn_pool_destroy(subpool);
+  svn_pool_destroy(subpool);
 }
 
 
@@ -251,6 +275,10 @@
 {
   svn_diff__tree_t *tree;
   svn_diff__position_t *position_list[3];
+  apr_uint32_t num_tokens;
+  apr_uint32_t token_index;
+  svn_diff__position_t *current;
+  apr_uint32_t *token_counts[3];
   svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
                                         svn_diff_datasource_modified,
                                         svn_diff_datasource_latest};
@@ -292,6 +320,8 @@
                                prefix_lines,
                                subpool));
 
+  num_tokens = svn_diff__get_node_count(tree);
+
   /* Get rid of the tokens, we don't need them to calc the diff */
   if (vtable->token_discard_all != NULL)
     vtable->token_discard_all(diff_baton);
@@ -299,10 +329,52 @@
   /* We don't need the nodes in the tree either anymore, nor the tree itself */
   svn_pool_destroy(treepool);
 
+  token_counts[0] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+  token_counts[1] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+  token_counts[2] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+  for (token_index = 0; token_index < num_tokens; token_index++)
+    {
+      token_counts[0][token_index] = token_counts[1][token_index] = 0;
+      token_counts[2][token_index] = 0;
+    }
+
+  current = position_list[0];
+  if (current != NULL)
+    {
+      do
+        {
+          token_counts[0][current->node]++;
+          current = current->next;
+        }
+      while (current != position_list[0]);
+    }
+  current = position_list[1];
+  if (current != NULL)
+    {
+      do
+        {
+          token_counts[1][current->node]++;
+          current = current->next;
+        }
+      while (current != position_list[1]);
+    }
+  current = position_list[2];
+  if (current != NULL)
+    {
+      do
+        {
+          token_counts[2][current->node]++;
+          current = current->next;
+        }
+      while (current != position_list[2]);
+    }
+
   /* Get the lcs for original-modified and original-latest */
-  lcs_om = svn_diff__lcs(position_list[0], position_list[1], prefix_lines,
+  lcs_om = svn_diff__lcs(position_list[0], position_list[1], token_counts[0],
+                         token_counts[1], num_tokens, prefix_lines,
                          suffix_lines, subpool);
-  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], prefix_lines,
+  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], token_counts[0],
+                         token_counts[2], num_tokens, prefix_lines,
                          suffix_lines, subpool);
 
   /* Produce a merged diff */
@@ -435,6 +507,7 @@
                 svn_diff__resolve_conflict(*diff_ref,
                                            &position_list[1],
                                            &position_list[2],
+                                           num_tokens,
                                            pool);
               }
             else if (is_modified)
Index: subversion/libsvn_diff/diff4.c
===================================================================
--- subversion/libsvn_diff/diff4.c      (revision 1104713)
+++ subversion/libsvn_diff/diff4.c      (working copy)
@@ -173,6 +173,10 @@
 {
   svn_diff__tree_t *tree;
   svn_diff__position_t *position_list[4];
+  apr_uint32_t num_tokens;
+  apr_uint32_t token_index;
+  svn_diff__position_t *current;
+  apr_uint32_t *token_counts[4];
   svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
                                         svn_diff_datasource_modified,
                                         svn_diff_datasource_latest,
@@ -227,6 +231,8 @@
                                prefix_lines,
                                subpool2));
 
+  num_tokens = svn_diff__get_node_count(tree);
+
   /* Get rid of the tokens, we don't need them to calc the diff */
   if (vtable->token_discard_all != NULL)
     vtable->token_discard_all(diff_baton);
@@ -234,8 +240,61 @@
   /* We don't need the nodes in the tree either anymore, nor the tree itself */
   svn_pool_clear(subpool3);
 
+  token_counts[0] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+  token_counts[1] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+  token_counts[2] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+  token_counts[3] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+  for (token_index = 0; token_index < num_tokens; token_index++)
+    {
+      token_counts[0][token_index] = token_counts[1][token_index] = 0;
+      token_counts[2][token_index] = token_counts[3][token_index] = 0;
+    }
+
+  current = position_list[0];
+  if (current != NULL)
+    {
+      do
+        {
+          token_counts[0][current->node]++;
+          current = current->next;
+        }
+      while (current != position_list[0]);
+    }
+  current = position_list[1];
+  if (current != NULL)
+    {
+      do
+        {
+          token_counts[1][current->node]++;
+          current = current->next;
+        }
+      while (current != position_list[1]);
+    }
+  current = position_list[2];
+  if (current != NULL)
+    {
+      do
+        {
+          token_counts[2][current->node]++;
+          current = current->next;
+        }
+      while (current != position_list[2]);
+    }
+  current = position_list[3];
+  if (current != NULL)
+    {
+      do
+        {
+          token_counts[3][current->node]++;
+          current = current->next;
+        }
+      while (current != position_list[3]);
+    }
+
   /* Get the lcs for original - latest */
-  lcs_ol = svn_diff__lcs(position_list[0], position_list[2], prefix_lines,
+  lcs_ol = svn_diff__lcs(position_list[0], position_list[2],
+                         token_counts[0], token_counts[2],
+                         num_tokens, prefix_lines,
                          suffix_lines, subpool3);
   diff_ol = svn_diff__diff(lcs_ol, 1, 1, TRUE, pool);
 
@@ -257,7 +316,9 @@
   /* Get the lcs for common ancestor - original
    * Do reverse adjustements
    */
-  lcs_adjust = svn_diff__lcs(position_list[3], position_list[2], prefix_lines,
+  lcs_adjust = svn_diff__lcs(position_list[3], position_list[2],
+                             token_counts[3], token_counts[2],
+                             num_tokens, prefix_lines,
                              suffix_lines, subpool3);
   diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
   adjust_diff(diff_ol, diff_adjust);
@@ -267,7 +328,9 @@
   /* Get the lcs for modified - common ancestor
    * Do forward adjustments
    */
-  lcs_adjust = svn_diff__lcs(position_list[1], position_list[3], prefix_lines,
+  lcs_adjust = svn_diff__lcs(position_list[1], position_list[3],
+                             token_counts[1], token_counts[3],
+                             num_tokens, prefix_lines,
                              suffix_lines, subpool3);
   diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
   adjust_diff(diff_ol, diff_adjust);
@@ -283,7 +346,7 @@
       if (hunk->type == svn_diff__type_conflict)
         {
           svn_diff__resolve_conflict(hunk, &position_list[1],
-                                     &position_list[2], pool);
+                                     &position_list[2], num_tokens, pool);
         }
     }
 
Index: subversion/libsvn_diff/lcs.c
===================================================================
--- subversion/libsvn_diff/lcs.c        (revision 1104713)
+++ subversion/libsvn_diff/lcs.c        (working copy)
@@ -51,6 +51,7 @@
 svn_diff__snake(apr_off_t k,
                 svn_diff__snake_t *fp,
                 int idx,
+                               apr_uint32_t *token_counts[2],
                 svn_diff__lcs_t **freelist,
                 apr_pool_t *pool)
 {
@@ -92,6 +93,11 @@
     }
 
 
+  if (previous_lcs)
+    {
+      previous_lcs->refcount++;
+    }
+
   /* ### Optimization, skip all positions that don't have matchpoints
    * ### anyway. Beware of the sentinel, don't skip it!
    */
@@ -99,41 +105,44 @@
   position[0] = start_position[0];
   position[1] = start_position[1];
 
-  while (position[0]->node == position[1]->node)
+  while (1)
     {
-      position[0] = position[0]->next;
-      position[1] = position[1]->next;
-    }
-
-  if (position[1] != start_position[1])
-    {
-      lcs = *freelist;
-      if (lcs)
+      while (position[0]->node == position[1]->node)
         {
-          *freelist = lcs->next;
+          position[0] = position[0]->next;
+          position[1] = position[1]->next;
         }
-      else
+
+      if (position[1] != start_position[1])
         {
-          lcs = apr_palloc(pool, sizeof(*lcs));
+          lcs = *freelist;
+          if (lcs)
+            {
+              *freelist = lcs->next;
+            }
+          else
+            {
+              lcs = apr_palloc(pool, sizeof(*lcs));
+            }
+
+          lcs->position[idx] = start_position[0];
+          lcs->position[abs(1 - idx)] = start_position[1];
+          lcs->length = position[1]->offset - start_position[1]->offset;
+          lcs->next = previous_lcs;
+          lcs->refcount = 1;
+          previous_lcs = lcs;
+             start_position[0] = position[0];
+             start_position[1] = position[1];
         }
-
-      lcs->position[idx] = start_position[0];
-      lcs->position[abs(1 - idx)] = start_position[1];
-      lcs->length = position[1]->offset - start_position[1]->offset;
-      lcs->next = previous_lcs;
-      lcs->refcount = 1;
-      fp[k].lcs = lcs;
+      if (position[0]->node >= 0 && token_counts[1][position[0]->node] == 0)
+        start_position[0] = position[0] = position[0]->next;
+      else if (position[1]->node >= 0 && token_counts[0][position[1]->node] == 
0)
+        start_position[1] = position[1] = position[1]->next;
+         else
+        break;
     }
-  else
-    {
-      fp[k].lcs = previous_lcs;
-    }
 
-  if (previous_lcs)
-    {
-      previous_lcs->refcount++;
-    }
-
+  fp[k].lcs = previous_lcs;
   fp[k].position[0] = position[0];
   fp[k].position[1] = position[1];
 
@@ -188,12 +197,18 @@
 svn_diff__lcs_t *
 svn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring) 
*/
               svn_diff__position_t *position_list2, /* pointer to tail (ring) 
*/
+              apr_uint32_t *token_counts_list1, /* array of counts */
+              apr_uint32_t *token_counts_list2, /* array of counts */
+              apr_uint32_t num_tokens,
               apr_off_t prefix_lines,
               apr_off_t suffix_lines,
               apr_pool_t *pool)
 {
   int idx;
   apr_off_t length[2];
+  apr_uint32_t *token_counts[2];
+  apr_uint32_t unique_count[2];
+  apr_uint32_t token_index;
   svn_diff__snake_t *fp;
   apr_off_t d;
   apr_off_t k;
@@ -231,10 +246,19 @@
       return lcs;
     }
 
+  unique_count[1] = unique_count[0] = 0;
+  for (token_index = 0; token_index < num_tokens; token_index++)
+    {
+      if (token_counts_list1[token_index] == 0)
+        unique_count[1] += token_counts_list2[token_index];
+      if (token_counts_list2[token_index] == 0)
+        unique_count[0] += token_counts_list1[token_index];
+    }
+
   /* Calculate length of both sequences to be compared */
   length[0] = position_list1->offset - position_list1->next->offset + 1;
   length[1] = position_list2->offset - position_list2->next->offset + 1;
-  idx = length[0] > length[1] ? 1 : 0;
+  idx = length[0] - unique_count[0] > length[1] - unique_count[1] ? 1 : 0;
 
   /* strikerXXX: here we allocate the furthest point array, which is
    * strikerXXX: sized M + N + 3 (!)
@@ -246,18 +270,20 @@
   sentinel_position[idx].next = position_list1->next;
   position_list1->next = &sentinel_position[idx];
   sentinel_position[idx].offset = position_list1->offset + 1;
+  token_counts[idx] = token_counts_list1;
 
   sentinel_position[abs(1 - idx)].next = position_list2->next;
   position_list2->next = &sentinel_position[abs(1 - idx)];
   sentinel_position[abs(1 - idx)].offset = position_list2->offset + 1;
+  token_counts[abs(1 - idx)] = token_counts_list2;
 
-  /* These are never dereferenced, only compared by value, so we
-   * can safely fake these up and the void* cast is OK.
+  /* Negative indices will not be used elsewhere
    */
-  sentinel_position[0].node = (void*)&sentinel_position[0];
-  sentinel_position[1].node = (void*)&sentinel_position[1];
+  sentinel_position[0].node = -1;
+  sentinel_position[1].node = -2;
 
-  d = length[abs(1 - idx)] - length[idx];
+  d = length[abs(1 - idx)] - unique_count[abs(1 - idx)]
+      - (length[idx] - unique_count[idx]);
 
   /* k = -1 will be the first to be used to get previous
    * position information from, make sure it holds sane
@@ -272,12 +298,12 @@
       /* Forward */
       for (k = -p; k < d; k++)
         {
-          svn_diff__snake(k, fp, idx, &lcs_freelist, pool);
+          svn_diff__snake(k, fp, idx, token_counts, &lcs_freelist, pool);
         }
 
       for (k = d + p; k >= d; k--)
         {
-          svn_diff__snake(k, fp, idx, &lcs_freelist, pool);
+          svn_diff__snake(k, fp, idx, token_counts, &lcs_freelist, pool);
         }
 
       p++;
Index: subversion/libsvn_diff/token.c
===================================================================
--- subversion/libsvn_diff/token.c      (revision 1104713)
+++ subversion/libsvn_diff/token.c      (working copy)
@@ -46,6 +46,7 @@
   svn_diff__node_t     *right;
 
   apr_uint32_t          hash;
+  apr_int32_t           index;
   void                 *token;
 };
 
@@ -53,10 +54,20 @@
 {
   svn_diff__node_t     *root[SVN_DIFF__HASH_SIZE];
   apr_pool_t           *pool;
+  apr_uint32_t          node_count;
 };
 
 
 /*
+ * Returns number of tokens in a tree
+ */
+apr_uint32_t
+svn_diff__get_node_count(svn_diff__tree_t *tree)
+{
+  return tree->node_count;
+}
+
+/*
  * Support functions to build a tree of token positions
  */
 
@@ -65,6 +76,7 @@
 {
   *tree = apr_pcalloc(pool, sizeof(**tree));
   (*tree)->pool = pool;
+  (*tree)->node_count = 0;
 }
 
 
@@ -122,6 +134,7 @@
   new_node->right = NULL;
   new_node->hash = hash;
   new_node->token = token;
+  new_node->index = tree->node_count++;
 
   *node = *node_ref = new_node;
 
@@ -168,7 +181,7 @@
       /* Create a new position */
       position = apr_palloc(pool, sizeof(*position));
       position->next = NULL;
-      position->node = node;
+      position->node = node->index;
       position->offset = offset;
 
       *position_ref = position;

Reply via email to