Here are the diffutils changes I just installed, in order to use the new diffseq module in gnulib.
2007-08-17 Paul Eggert <[EMAIL PROTECTED]> Break out diffseq.h into a separate file, so that gettext can use this code. Idea and code from Bruno Haible. * bootstrap.conf (gnulib_modules): Add diffseq. * src/analyze.c (xvec, yvec, fdiag, bdiag, too_expensive, SNAKE_LIMIT): (struct partition, diag, compareseq): Remove; now in diffseq.h. (ELEMENT, EQUAL, OFFSET, EXTRA_CONTEXT_FIELDS, NOTE_DELETE, NOTE_INSERT): (USE_HEURISTIC): New macros. Include "diffseq.h". (diff_2_files): Rewrite to use new diffseq.h interface. Index: bootstrap.conf =================================================================== RCS file: /cvsroot/diffutils/diffutils/bootstrap.conf,v retrieving revision 1.3 diff -u -p -r1.3 bootstrap.conf --- bootstrap.conf 19 Jul 2007 17:45:26 -0000 1.3 +++ bootstrap.conf 17 Aug 2007 23:35:45 -0000 @@ -18,7 +18,7 @@ # gnulib modules used by this package. gnulib_modules=' - c-stack config-h dirname dup2 error exclude exit exitfail + c-stack config-h diffseq dirname dup2 error exclude exit exitfail extensions fcntl fdl file-type fnmatch-gnu getopt gettext gettime hard-locale inttostr inttypes mkstemp regex sh-quote stat-macros stat-time strcase strftime strtoumax unistd Index: src/analyze.c =================================================================== RCS file: /cvsroot/diffutils/diffutils/src/analyze.c,v retrieving revision 1.26 diff -u -p -r1.26 analyze.c --- src/analyze.c 19 Jul 2007 17:45:28 -0000 1.26 +++ src/analyze.c 17 Aug 2007 23:35:45 -0000 @@ -1,7 +1,7 @@ /* Analyze file differences for GNU DIFF. Copyright (C) 1988, 1989, 1992, 1993, 1994, 1995, 1998, 2001, 2002, - 2004, 2006 Free Software Foundation, Inc. + 2004, 2006, 2007 Free Software Foundation, Inc. This file is part of GNU DIFF. @@ -18,345 +18,22 @@ You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. */ -/* The basic algorithm is described in: - "An O(ND) Difference Algorithm and its Variations", Eugene Myers, - Algorithmica Vol. 1 No. 2, 1986, pp. 251-266; - see especially section 4.2, which describes the variation used below. - Unless the --minimal option is specified, this code uses the TOO_EXPENSIVE - heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N) - at the price of producing suboptimal output for large inputs with - many differences. - - The basic algorithm was independently discovered as described in: - "Algorithms for Approximate String Matching", E. Ukkonen, - Information and Control Vol. 64, 1985, pp. 100-118. */ - #include "diff.h" #include <cmpbuf.h> #include <error.h> #include <file-type.h> #include <xalloc.h> -static lin *xvec, *yvec; /* Vectors being compared. */ -static lin *fdiag; /* Vector, indexed by diagonal, containing - 1 + the X coordinate of the point furthest - along the given diagonal in the forward - search of the edit matrix. */ -static lin *bdiag; /* Vector, indexed by diagonal, containing - the X coordinate of the point furthest - along the given diagonal in the backward - search of the edit matrix. */ -static lin too_expensive; /* Edit scripts longer than this are too - expensive to compute. */ - -#define SNAKE_LIMIT 20 /* Snakes bigger than this are considered `big'. */ - -struct partition -{ - lin xmid, ymid; /* Midpoints of this partition. */ - bool lo_minimal; /* Nonzero if low half will be analyzed minimally. */ - bool hi_minimal; /* Likewise for high half. */ -}; - -/* Find the midpoint of the shortest edit script for a specified - portion of the two files. - - Scan from the beginnings of the files, and simultaneously from the ends, - doing a breadth-first search through the space of edit-sequence. - When the two searches meet, we have found the midpoint of the shortest - edit sequence. - - If FIND_MINIMAL is nonzero, find the minimal edit script regardless - of expense. Otherwise, if the search is too expensive, use - heuristics to stop the search and report a suboptimal answer. - - Set PART->(xmid,ymid) to the midpoint (XMID,YMID). The diagonal number - XMID - YMID equals the number of inserted lines minus the number - of deleted lines (counting only lines before the midpoint). - - Set PART->lo_minimal to true iff the minimal edit script for the - left half of the partition is known; similarly for PART->hi_minimal. - - This function assumes that the first lines of the specified portions - of the two files do not match, and likewise that the last lines do not - match. The caller must trim matching lines from the beginning and end - of the portions it is going to specify. - - If we return the "wrong" partitions, - the worst this can do is cause suboptimal diff output. - It cannot cause incorrect diff output. */ - -static void -diag (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal, - struct partition *part) -{ - lin *const fd = fdiag; /* Give the compiler a chance. */ - lin *const bd = bdiag; /* Additional help for the compiler. */ - lin const *const xv = xvec; /* Still more help for the compiler. */ - lin const *const yv = yvec; /* And more and more . . . */ - lin const dmin = xoff - ylim; /* Minimum valid diagonal. */ - lin const dmax = xlim - yoff; /* Maximum valid diagonal. */ - lin const fmid = xoff - yoff; /* Center diagonal of top-down search. */ - lin const bmid = xlim - ylim; /* Center diagonal of bottom-up search. */ - lin fmin = fmid, fmax = fmid; /* Limits of top-down search. */ - lin bmin = bmid, bmax = bmid; /* Limits of bottom-up search. */ - lin c; /* Cost. */ - bool odd = (fmid - bmid) & 1; /* True if southeast corner is on an odd - diagonal with respect to the northwest. */ - - fd[fmid] = xoff; - bd[bmid] = xlim; - - for (c = 1;; ++c) - { - lin d; /* Active diagonal. */ - bool big_snake = false; - - /* Extend the top-down search by an edit step in each diagonal. */ - fmin > dmin ? fd[--fmin - 1] = -1 : ++fmin; - fmax < dmax ? fd[++fmax + 1] = -1 : --fmax; - for (d = fmax; d >= fmin; d -= 2) - { - lin x, y, oldx, tlo = fd[d - 1], thi = fd[d + 1]; - - if (tlo >= thi) - x = tlo + 1; - else - x = thi; - oldx = x; - y = x - d; - while (x < xlim && y < ylim && xv[x] == yv[y]) - ++x, ++y; - if (x - oldx > SNAKE_LIMIT) - big_snake = true; - fd[d] = x; - if (odd && bmin <= d && d <= bmax && bd[d] <= x) - { - part->xmid = x; - part->ymid = y; - part->lo_minimal = part->hi_minimal = true; - return; - } - } - - /* Similarly extend the bottom-up search. */ - bmin > dmin ? bd[--bmin - 1] = LIN_MAX : ++bmin; - bmax < dmax ? bd[++bmax + 1] = LIN_MAX : --bmax; - for (d = bmax; d >= bmin; d -= 2) - { - lin x, y, oldx, tlo = bd[d - 1], thi = bd[d + 1]; - - if (tlo < thi) - x = tlo; - else - x = thi - 1; - oldx = x; - y = x - d; - while (x > xoff && y > yoff && xv[x - 1] == yv[y - 1]) - --x, --y; - if (oldx - x > SNAKE_LIMIT) - big_snake = true; - bd[d] = x; - if (!odd && fmin <= d && d <= fmax && x <= fd[d]) - { - part->xmid = x; - part->ymid = y; - part->lo_minimal = part->hi_minimal = true; - return; - } - } - - if (find_minimal) - continue; - - /* Heuristic: check occasionally for a diagonal that has made - lots of progress compared with the edit distance. - If we have any such, find the one that has made the most - progress and return it as if it had succeeded. - - With this heuristic, for files with a constant small density - of changes, the algorithm is linear in the file size. */ - - if (200 < c && big_snake && speed_large_files) - { - lin best = 0; - - for (d = fmax; d >= fmin; d -= 2) - { - lin dd = d - fmid; - lin x = fd[d]; - lin y = x - d; - lin v = (x - xoff) * 2 - dd; - if (v > 12 * (c + (dd < 0 ? -dd : dd))) - { - if (v > best - && xoff + SNAKE_LIMIT <= x && x < xlim - && yoff + SNAKE_LIMIT <= y && y < ylim) - { - /* We have a good enough best diagonal; - now insist that it end with a significant snake. */ - int k; - - for (k = 1; xv[x - k] == yv[y - k]; k++) - if (k == SNAKE_LIMIT) - { - best = v; - part->xmid = x; - part->ymid = y; - break; - } - } - } - } - if (best > 0) - { - part->lo_minimal = true; - part->hi_minimal = false; - return; - } - - best = 0; - for (d = bmax; d >= bmin; d -= 2) - { - lin dd = d - bmid; - lin x = bd[d]; - lin y = x - d; - lin v = (xlim - x) * 2 + dd; - if (v > 12 * (c + (dd < 0 ? -dd : dd))) - { - if (v > best - && xoff < x && x <= xlim - SNAKE_LIMIT - && yoff < y && y <= ylim - SNAKE_LIMIT) - { - /* We have a good enough best diagonal; - now insist that it end with a significant snake. */ - int k; - - for (k = 0; xv[x + k] == yv[y + k]; k++) - if (k == SNAKE_LIMIT - 1) - { - best = v; - part->xmid = x; - part->ymid = y; - break; - } - } - } - } - if (best > 0) - { - part->lo_minimal = false; - part->hi_minimal = true; - return; - } - } - - /* Heuristic: if we've gone well beyond the call of duty, - give up and report halfway between our best results so far. */ - if (c >= too_expensive) - { - lin fxybest; - lin bxybest; - lin fxbest IF_LINT (= 0); - lin bxbest IF_LINT (= 0); - - /* Find forward diagonal that maximizes X + Y. */ - fxybest = -1; - for (d = fmax; d >= fmin; d -= 2) - { - lin x = MIN (fd[d], xlim); - lin y = x - d; - if (ylim < y) - x = ylim + d, y = ylim; - if (fxybest < x + y) - { - fxybest = x + y; - fxbest = x; - } - } +/* The core of the Diff algorithm. */ +#define ELEMENT lin +#define EQUAL(x,y) ((x) == (y)) +#define OFFSET lin +#define EXTRA_CONTEXT_FIELDS /* none */ +#define NOTE_DELETE(c, xoff) (files[0].changed[files[0].realindexes[xoff]] = 1) +#define NOTE_INSERT(c, yoff) (files[1].changed[files[1].realindexes[yoff]] = 1) +#define USE_HEURISTIC 1 +#include <diffseq.h> - /* Find backward diagonal that minimizes X + Y. */ - bxybest = LIN_MAX; - for (d = bmax; d >= bmin; d -= 2) - { - lin x = MAX (xoff, bd[d]); - lin y = x - d; - if (y < yoff) - x = yoff + d, y = yoff; - if (x + y < bxybest) - { - bxybest = x + y; - bxbest = x; - } - } - - /* Use the better of the two diagonals. */ - if ((xlim + ylim) - bxybest < fxybest - (xoff + yoff)) - { - part->xmid = fxbest; - part->ymid = fxybest - fxbest; - part->lo_minimal = true; - part->hi_minimal = false; - } - else - { - part->xmid = bxbest; - part->ymid = bxybest - bxbest; - part->lo_minimal = false; - part->hi_minimal = true; - } - return; - } - } -} - -/* Compare in detail contiguous subsequences of the two files - which are known, as a whole, to match each other. - - The results are recorded in the vectors files[N].changed, by - storing 1 in the element for each line that is an insertion or deletion. - - The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1. - - Note that XLIM, YLIM are exclusive bounds. - All line numbers are origin-0 and discarded lines are not counted. - - If FIND_MINIMAL, find a minimal difference no matter how - expensive it is. */ - -static void -compareseq (lin xoff, lin xlim, lin yoff, lin ylim, bool find_minimal) -{ - lin const *xv = xvec; /* Help the compiler. */ - lin const *yv = yvec; - - /* Slide down the bottom initial diagonal. */ - while (xoff < xlim && yoff < ylim && xv[xoff] == yv[yoff]) - ++xoff, ++yoff; - /* Slide up the top initial diagonal. */ - while (xlim > xoff && ylim > yoff && xv[xlim - 1] == yv[ylim - 1]) - --xlim, --ylim; - - /* Handle simple cases. */ - if (xoff == xlim) - while (yoff < ylim) - files[1].changed[files[1].realindexes[yoff++]] = 1; - else if (yoff == ylim) - while (xoff < xlim) - files[0].changed[files[0].realindexes[xoff++]] = 1; - else - { - struct partition part IF_LINT (= {0}); - - /* Find a point of correspondence in the middle of the files. */ - diag (xoff, xlim, yoff, ylim, find_minimal, &part); - - /* Use the partitions to split this problem into subproblems. */ - compareseq (xoff, part.xmid, yoff, part.ymid, part.lo_minimal); - compareseq (part.xmid, xlim, part.ymid, ylim, part.hi_minimal); - } -} - /* Discard lines from one file that have no matches in the other file. A line which is discarded will not be considered by the actual @@ -789,7 +466,6 @@ briefly_report (int changes, struct file int diff_2_files (struct comparison *cmp) { - lin diags; int f; struct change *e, *p; struct change *script; @@ -859,6 +535,10 @@ diff_2_files (struct comparison *cmp) } else { + struct context ctxt; + lin diags; + lin too_expensive; + /* Allocate vectors for the results of comparison: a flag for each line of each file, saying whether that line is an insertion or deletion. @@ -878,29 +558,31 @@ diff_2_files (struct comparison *cmp) /* Now do the main comparison algorithm, considering just the undiscarded lines. */ - xvec = cmp->file[0].undiscarded; - yvec = cmp->file[1].undiscarded; + ctxt.xvec = cmp->file[0].undiscarded; + ctxt.yvec = cmp->file[1].undiscarded; diags = (cmp->file[0].nondiscarded_lines + cmp->file[1].nondiscarded_lines + 3); - fdiag = xmalloc (diags * (2 * sizeof *fdiag)); - bdiag = fdiag + diags; - fdiag += cmp->file[1].nondiscarded_lines + 1; - bdiag += cmp->file[1].nondiscarded_lines + 1; + ctxt.fdiag = xmalloc (diags * (2 * sizeof *ctxt.fdiag)); + ctxt.bdiag = ctxt.fdiag + diags; + ctxt.fdiag += cmp->file[1].nondiscarded_lines + 1; + ctxt.bdiag += cmp->file[1].nondiscarded_lines + 1; + + ctxt.heuristic = speed_large_files; /* Set TOO_EXPENSIVE to be approximate square root of input size, bounded below by 256. */ too_expensive = 1; for (; diags != 0; diags >>= 2) too_expensive <<= 1; - too_expensive = MAX (256, too_expensive); + ctxt.too_expensive = MAX (256, too_expensive); files[0] = cmp->file[0]; files[1] = cmp->file[1]; compareseq (0, cmp->file[0].nondiscarded_lines, - 0, cmp->file[1].nondiscarded_lines, minimal); + 0, cmp->file[1].nondiscarded_lines, minimal, &ctxt); - free (fdiag - (cmp->file[1].nondiscarded_lines + 1)); + free (ctxt.fdiag - (cmp->file[1].nondiscarded_lines + 1)); /* Modify the results slightly to make them prettier in cases where that can validly be done. */