Re: new module 'fstrcmp', generic diff algorithm

Paul Eggert Fri, 17 Aug 2007 16:36:56 -0700

Here are the diffutils changes I just installed, in order to use the new
diffseq module in gnulib.


2007-08-17  Paul Eggert  <[EMAIL PROTECTED]>

        Break out diffseq.h into a separate file, so that gettext can use
        this code.  Idea and code from Bruno Haible.
        * bootstrap.conf (gnulib_modules): Add diffseq.
        * src/analyze.c (xvec, yvec, fdiag, bdiag, too_expensive, SNAKE_LIMIT):
        (struct partition, diag, compareseq): Remove; now in diffseq.h.
        (ELEMENT, EQUAL, OFFSET, EXTRA_CONTEXT_FIELDS, NOTE_DELETE, 
NOTE_INSERT):
        (USE_HEURISTIC): New macros.
        Include "diffseq.h".
        (diff_2_files): Rewrite to use new diffseq.h interface.

Index: bootstrap.conf
===================================================================
RCS file: /cvsroot/diffutils/diffutils/bootstrap.conf,v
retrieving revision 1.3
diff -u -p -r1.3 bootstrap.conf
--- bootstrap.conf      19 Jul 2007 17:45:26 -0000      1.3
+++ bootstrap.conf      17 Aug 2007 23:35:45 -0000
@@ -18,7 +18,7 @@

 # gnulib modules used by this package.
 gnulib_modules='
-       c-stack config-h dirname dup2 error exclude exit exitfail
+       c-stack config-h diffseq dirname dup2 error exclude exit exitfail
        extensions fcntl fdl file-type fnmatch-gnu getopt gettext
        gettime hard-locale inttostr inttypes mkstemp regex sh-quote
        stat-macros stat-time strcase strftime strtoumax unistd
Index: src/analyze.c
===================================================================
RCS file: /cvsroot/diffutils/diffutils/src/analyze.c,v
retrieving revision 1.26
diff -u -p -r1.26 analyze.c
--- src/analyze.c       19 Jul 2007 17:45:28 -0000      1.26
+++ src/analyze.c       17 Aug 2007 23:35:45 -0000
@@ -1,7 +1,7 @@
 /* Analyze file differences for GNU DIFF.

    Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002,
-   2004, 2006 Free Software Foundation, Inc.
+   2004, 2006, 2007 Free Software Foundation, Inc.

    This file is part of GNU DIFF.

@@ -18,345 +18,22 @@
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.  */

-/* The basic algorithm is described in:
-   "An O(ND) Difference Algorithm and its Variations", Eugene Myers,
-   Algorithmica Vol. 1 No. 2, 1986, pp. 251-266;
-   see especially section 4.2, which describes the variation used below.
-   Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE
-   heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N)
-   at the price of producing suboptimal output for large inputs with
-   many differences.
-
-   The basic algorithm was independently discovered as described in:
-   "Algorithms for Approximate String Matching", E. Ukkonen,
-   Information and Control Vol. 64, 1985, pp. 100-118.  */
-
 #include "diff.h"
 #include <cmpbuf.h>
 #include <error.h>
 #include <file-type.h>
 #include <xalloc.h>

-static lin *xvec, *yvec;       /* Vectors being compared. */
-static lin *fdiag;             /* Vector, indexed by diagonal, containing
-                                  1 + the X coordinate of the point furthest
-                                  along the given diagonal in the forward
-                                  search of the edit matrix. */
-static lin *bdiag;             /* Vector, indexed by diagonal, containing
-                                  the X coordinate of the point furthest
-                                  along the given diagonal in the backward
-                                  search of the edit matrix. */
-static lin too_expensive;      /* Edit scripts longer than this are too
-                                  expensive to compute.  */
-
-#define SNAKE_LIMIT 20 /* Snakes bigger than this are considered `big'.  */
-
-struct partition
-{
-  lin xmid, ymid;      /* Midpoints of this partition.  */
-  bool lo_minimal;     /* Nonzero if low half will be analyzed minimally.  */
-  bool hi_minimal;     /* Likewise for high half.  */
-};
-
-/* Find the midpoint of the shortest edit script for a specified
-   portion of the two files.
-
-   Scan from the beginnings of the files, and simultaneously from the ends,
-   doing a breadth-first search through the space of edit-sequence.
-   When the two searches meet, we have found the midpoint of the shortest
-   edit sequence.
-
-   If FIND_MINIMAL is nonzero, find the minimal edit script regardless
-   of expense.  Otherwise, if the search is too expensive, use
-   heuristics to stop the search and report a suboptimal answer.
-
-   Set PART->(xmid,ymid) to the midpoint (XMID,YMID).  The diagonal number
-   XMID - YMID equals the number of inserted lines minus the number
-   of deleted lines (counting only lines before the midpoint).
-
-   Set PART->lo_minimal to true iff the minimal edit script for the
-   left half of the partition is known; similarly for PART->hi_minimal.
-
-   This function assumes that the first lines of the specified portions
-   of the two files do not match, and likewise that the last lines do not
-   match.  The caller must trim matching lines from the beginning and end
-   of the portions it is going to specify.
-
-   If we return the "wrong" partitions,
-   the worst this can do is cause suboptimal diff output.
-   It cannot cause incorrect diff output.  */
-
-static void
-diag (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal,
-      struct partition *part)
-{
-  lin *const fd = fdiag;       /* Give the compiler a chance. */
-  lin *const bd = bdiag;       /* Additional help for the compiler. */
-  lin const *const xv = xvec;  /* Still more help for the compiler. */
-  lin const *const yv = yvec;  /* And more and more . . . */
-  lin const dmin = xoff - ylim;        /* Minimum valid diagonal. */
-  lin const dmax = xlim - yoff;        /* Maximum valid diagonal. */
-  lin const fmid = xoff - yoff;        /* Center diagonal of top-down search. 
*/
-  lin const bmid = xlim - ylim;        /* Center diagonal of bottom-up search. 
*/
-  lin fmin = fmid, fmax = fmid;        /* Limits of top-down search. */
-  lin bmin = bmid, bmax = bmid;        /* Limits of bottom-up search. */
-  lin c;                       /* Cost. */
-  bool odd = (fmid - bmid) & 1;        /* True if southeast corner is on an odd
-                                  diagonal with respect to the northwest. */
-
-  fd[fmid] = xoff;
-  bd[bmid] = xlim;
-
-  for (c = 1;; ++c)
-    {
-      lin d;                   /* Active diagonal. */
-      bool big_snake = false;
-
-      /* Extend the top-down search by an edit step in each diagonal. */
-      fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin;
-      fmax < dmax ? fd[++fmax + 1] = -1 : --fmax;
-      for (d = fmax; d >= fmin; d -= 2)
-       {
-         lin x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1];
-
-         if (tlo >= thi)
-           x = tlo + 1;
-         else
-           x = thi;
-         oldx = x;
-         y = x - d;
-         while (x < xlim && y < ylim && xv[x] == yv[y])
-           ++x, ++y;
-         if (x - oldx > SNAKE_LIMIT)
-           big_snake = true;
-         fd[d] = x;
-         if (odd && bmin <= d && d <= bmax && bd[d] <= x)
-           {
-             part->xmid = x;
-             part->ymid = y;
-             part->lo_minimal = part->hi_minimal = true;
-             return;
-           }
-       }
-
-      /* Similarly extend the bottom-up search.  */
-      bmin > dmin ? bd[--bmin - 1] = LIN_MAX : ++bmin;
-      bmax < dmax ? bd[++bmax + 1] = LIN_MAX : --bmax;
-      for (d = bmax; d >= bmin; d -= 2)
-       {
-         lin x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1];
-
-         if (tlo < thi)
-           x = tlo;
-         else
-           x = thi - 1;
-         oldx = x;
-         y = x - d;
-         while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1])
-           --x, --y;
-         if (oldx - x > SNAKE_LIMIT)
-           big_snake = true;
-         bd[d] = x;
-         if (!odd && fmin <= d && d <= fmax && x <= fd[d])
-           {
-             part->xmid = x;
-             part->ymid = y;
-             part->lo_minimal = part->hi_minimal = true;
-             return;
-           }
-       }
-
-      if (find_minimal)
-       continue;
-
-      /* Heuristic: check occasionally for a diagonal that has made
-        lots of progress compared with the edit distance.
-        If we have any such, find the one that has made the most
-        progress and return it as if it had succeeded.
-
-        With this heuristic, for files with a constant small density
-        of changes, the algorithm is linear in the file size.  */
-
-      if (200 < c && big_snake && speed_large_files)
-       {
-         lin best = 0;
-
-         for (d = fmax; d >= fmin; d -= 2)
-           {
-             lin dd = d - fmid;
-             lin x = fd[d];
-             lin y = x - d;
-             lin v = (x - xoff) * 2 - dd;
-             if (v > 12 * (c + (dd < 0 ? -dd : dd)))
-               {
-                 if (v > best
-                     && xoff + SNAKE_LIMIT <= x && x < xlim
-                     && yoff + SNAKE_LIMIT <= y && y < ylim)
-                   {
-                     /* We have a good enough best diagonal;
-                        now insist that it end with a significant snake.  */
-                     int k;
-
-                     for (k = 1; xv[x - k] == yv[y - k]; k++)
-                       if (k == SNAKE_LIMIT)
-                         {
-                           best = v;
-                           part->xmid = x;
-                           part->ymid = y;
-                           break;
-                         }
-                   }
-               }
-           }
-         if (best > 0)
-           {
-             part->lo_minimal = true;
-             part->hi_minimal = false;
-             return;
-           }
-
-         best = 0;
-         for (d = bmax; d >= bmin; d -= 2)
-           {
-             lin dd = d - bmid;
-             lin x = bd[d];
-             lin y = x - d;
-             lin v = (xlim - x) * 2 + dd;
-             if (v > 12 * (c + (dd < 0 ? -dd : dd)))
-               {
-                 if (v > best
-                     && xoff < x && x <= xlim - SNAKE_LIMIT
-                     && yoff < y && y <= ylim - SNAKE_LIMIT)
-                   {
-                     /* We have a good enough best diagonal;
-                        now insist that it end with a significant snake.  */
-                     int k;
-
-                     for (k = 0; xv[x + k] == yv[y + k]; k++)
-                       if (k == SNAKE_LIMIT - 1)
-                         {
-                           best = v;
-                           part->xmid = x;
-                           part->ymid = y;
-                           break;
-                         }
-                   }
-               }
-           }
-         if (best > 0)
-           {
-             part->lo_minimal = false;
-             part->hi_minimal = true;
-             return;
-           }
-       }
-
-      /* Heuristic: if we've gone well beyond the call of duty,
-        give up and report halfway between our best results so far.  */
-      if (c >= too_expensive)
-       {
-         lin fxybest;
-         lin bxybest;
-         lin fxbest IF_LINT (= 0);
-         lin bxbest IF_LINT (= 0);
-
-         /* Find forward diagonal that maximizes X + Y.  */
-         fxybest = -1;
-         for (d = fmax; d >= fmin; d -= 2)
-           {
-             lin x = MIN (fd[d], xlim);
-             lin y = x - d;
-             if (ylim < y)
-               x = ylim + d, y = ylim;
-             if (fxybest < x + y)
-               {
-                 fxybest = x + y;
-                 fxbest = x;
-               }
-           }
+/* The core of the Diff algorithm.  */
+#define ELEMENT lin
+#define EQUAL(x,y) ((x) == (y))
+#define OFFSET lin
+#define EXTRA_CONTEXT_FIELDS /* none */
+#define NOTE_DELETE(c, xoff) (files[0].changed[files[0].realindexes[xoff]] = 1)
+#define NOTE_INSERT(c, yoff) (files[1].changed[files[1].realindexes[yoff]] = 1)
+#define USE_HEURISTIC 1
+#include <diffseq.h>

-         /* Find backward diagonal that minimizes X + Y.  */
-         bxybest = LIN_MAX;
-         for (d = bmax; d >= bmin; d -= 2)
-           {
-             lin x = MAX (xoff, bd[d]);
-             lin y = x - d;
-             if (y < yoff)
-               x = yoff + d, y = yoff;
-             if (x + y < bxybest)
-               {
-                 bxybest = x + y;
-                 bxbest = x;
-               }
-           }
-
-         /* Use the better of the two diagonals.  */
-         if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff))
-           {
-             part->xmid = fxbest;
-             part->ymid = fxybest - fxbest;
-             part->lo_minimal = true;
-             part->hi_minimal = false;
-           }
-         else
-           {
-             part->xmid = bxbest;
-             part->ymid = bxybest - bxbest;
-             part->lo_minimal = false;
-             part->hi_minimal = true;
-           }
-         return;
-       }
-    }
-}
-
-/* Compare in detail contiguous subsequences of the two files
-   which are known, as a whole, to match each other.
-
-   The results are recorded in the vectors files[N].changed, by
-   storing 1 in the element for each line that is an insertion or deletion.
-
-   The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
-
-   Note that XLIM, YLIM are exclusive bounds.
-   All line numbers are origin-0 and discarded lines are not counted.
-
-   If FIND_MINIMAL, find a minimal difference no matter how
-   expensive it is.  */
-
-static void
-compareseq (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal)
-{
-  lin const *xv = xvec; /* Help the compiler.  */
-  lin const *yv = yvec;
-
-  /* Slide down the bottom initial diagonal. */
-  while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff])
-    ++xoff, ++yoff;
-  /* Slide up the top initial diagonal. */
-  while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1])
-    --xlim, --ylim;
-
-  /* Handle simple cases. */
-  if (xoff == xlim)
-    while (yoff < ylim)
-      files[1].changed[files[1].realindexes[yoff++]] = 1;
-  else if (yoff == ylim)
-    while (xoff < xlim)
-      files[0].changed[files[0].realindexes[xoff++]] = 1;
-  else
-    {
-      struct partition part IF_LINT (= {0});
-
-      /* Find a point of correspondence in the middle of the files.  */
-      diag (xoff, xlim, yoff, ylim, find_minimal, &part);
-
-      /* Use the partitions to split this problem into subproblems.  */
-      compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal);
-      compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal);
-    }
-}
-
 /* Discard lines from one file that have no matches in the other file.

    A line which is discarded will not be considered by the actual
@@ -789,7 +466,6 @@ briefly_report (int changes, struct file
 int
 diff_2_files (struct comparison *cmp)
 {
-  lin diags;
   int f;
   struct change *e, *p;
   struct change *script;
@@ -859,6 +535,10 @@ diff_2_files (struct comparison *cmp)
     }
   else
     {
+      struct context ctxt;
+      lin diags;
+      lin too_expensive;
+
       /* Allocate vectors for the results of comparison:
         a flag for each line of each file, saying whether that line
         is an insertion or deletion.
@@ -878,29 +558,31 @@ diff_2_files (struct comparison *cmp)
       /* Now do the main comparison algorithm, considering just the
         undiscarded lines.  */

-      xvec = cmp->file[0].undiscarded;
-      yvec = cmp->file[1].undiscarded;
+      ctxt.xvec = cmp->file[0].undiscarded;
+      ctxt.yvec = cmp->file[1].undiscarded;
       diags = (cmp->file[0].nondiscarded_lines
               + cmp->file[1].nondiscarded_lines + 3);
-      fdiag = xmalloc (diags * (2 * sizeof *fdiag));
-      bdiag = fdiag + diags;
-      fdiag += cmp->file[1].nondiscarded_lines + 1;
-      bdiag += cmp->file[1].nondiscarded_lines + 1;
+      ctxt.fdiag = xmalloc (diags * (2 * sizeof *ctxt.fdiag));
+      ctxt.bdiag = ctxt.fdiag + diags;
+      ctxt.fdiag += cmp->file[1].nondiscarded_lines + 1;
+      ctxt.bdiag += cmp->file[1].nondiscarded_lines + 1;
+
+      ctxt.heuristic = speed_large_files;

       /* Set TOO_EXPENSIVE to be approximate square root of input size,
         bounded below by 256.  */
       too_expensive = 1;
       for (;  diags != 0;  diags >>= 2)
        too_expensive <<= 1;
-      too_expensive = MAX (256, too_expensive);
+      ctxt.too_expensive = MAX (256, too_expensive);

       files[0] = cmp->file[0];
       files[1] = cmp->file[1];

       compareseq (0, cmp->file[0].nondiscarded_lines,
-                 0, cmp->file[1].nondiscarded_lines, minimal);
+                 0, cmp->file[1].nondiscarded_lines, minimal, &ctxt);

-      free (fdiag - (cmp->file[1].nondiscarded_lines + 1));
+      free (ctxt.fdiag - (cmp->file[1].nondiscarded_lines + 1));

       /* Modify the results slightly to make them prettier
         in cases where that can validly be done.  */

Re: new module 'fstrcmp', generic diff algorithm

Reply via email to