Teach the patience diff to attempt preventing user-specified lines from
appearing as a deletion or addition in the end result. The end user can
use this by specifying "--anchor=<text>" one or more times when using
Git commands like "diff" and "show".

Signed-off-by: Jonathan Tan <jonathanta...@google.com>
---
Actual patch instead of RFC.

One thing that might help is to warn if --anchor is used without
--patience, but I couldn't find a good place to put that warning. Let me
know if you know of a good place.

Replying to Stefan's and Junio's comments:

> The solution you provide is a good thing to experiment with, but
> longer term, I would want to have huge record of configs in which
> humans selected the best diff, such that we can use that data
> to reason about better automatic diff generation.
> The diff heuristic was based on a lot of human generated data,
> that was generated by Michael at the time. I wonder if we want to
> permanently store the anchor so the data collection will happen
> automatically over time.

I think machine learning is beyond the scope of this patch :-)

> or rather: "c is not moved, we don't care how the diff actually looks
> like",
> so maybe
>       ! grep "+c" diff

I think it's less error-prone to show "a" moving. With this, if the
command somehow prints nothing, the test would still pass.

> While this is RFC, I should not expect comments, though it would
> be nice to have them in the final series. ;-)

Added comments :-)

> This is a natural extension of the idea the patience algorithm is
> built upon.  If this were a cumulative command line option that can
> be given to specify multiple lines and can be used across the diff
> family, it would make a welcome addition, I would think.

Specifying multiple lines - done.

By diff family, do you mean "diff", "show", and others? If yes, that has
also been done in this patch.
---
 Documentation/diff-options.txt |  9 +++++++
 diff.c                         |  8 ++++++
 diff.h                         |  4 +++
 t/t4033-diff-patience.sh       | 59 ++++++++++++++++++++++++++++++++++++++++++
 xdiff/xdiff.h                  |  4 +++
 xdiff/xpatience.c              | 42 ++++++++++++++++++++++++++----
 6 files changed, 121 insertions(+), 5 deletions(-)

diff --git a/Documentation/diff-options.txt b/Documentation/diff-options.txt
index dd0dba5b1..b17d62696 100644
--- a/Documentation/diff-options.txt
+++ b/Documentation/diff-options.txt
@@ -100,6 +100,15 @@ For instance, if you configured diff.algorithm variable to 
a
 non-default value and want to use the default one, then you
 have to use `--diff-algorithm=default` option.
 
+--anchor=<text>::
+       This option may be specified more than once.
++
+If the "patience diff" algorithm is used, Git attempts to prevent lines
+starting with this text from appearing as a deletion or addition in the
+output.
++
+If another diff algorithm is used, this has no effect.
+
 --stat[=<width>[,<name-width>[,<count>]]]::
        Generate a diffstat. By default, as much space as necessary
        will be used for the filename part, and the rest for the graph
diff --git a/diff.c b/diff.c
index 0763e8926..29a7176ee 100644
--- a/diff.c
+++ b/diff.c
@@ -3210,6 +3210,8 @@ static void builtin_diff(const char *name_a,
                ecbdata.opt = o;
                ecbdata.header = header.len ? &header : NULL;
                xpp.flags = o->xdl_opts;
+               xpp.anchors = o->anchors;
+               xpp.anchors_nr = o->anchors_nr;
                xecfg.ctxlen = o->context;
                xecfg.interhunkctxlen = o->interhunkcontext;
                xecfg.flags = XDL_EMIT_FUNCNAMES;
@@ -3302,6 +3304,8 @@ static void builtin_diffstat(const char *name_a, const 
char *name_b,
                memset(&xpp, 0, sizeof(xpp));
                memset(&xecfg, 0, sizeof(xecfg));
                xpp.flags = o->xdl_opts;
+               xpp.anchors = o->anchors;
+               xpp.anchors_nr = o->anchors_nr;
                xecfg.ctxlen = o->context;
                xecfg.interhunkctxlen = o->interhunkcontext;
                if (xdi_diff_outf(&mf1, &mf2, diffstat_consume, diffstat,
@@ -4608,6 +4612,10 @@ int diff_opt_parse(struct diff_options *options,
                options->xdl_opts &= ~XDF_DIFF_ALGORITHM_MASK;
                options->xdl_opts |= value;
                return argcount;
+       } else if (skip_prefix(arg, "--anchor=", &arg)) {
+               ALLOC_GROW(options->anchors, options->anchors_nr + 1,
+                          options->anchors_alloc);
+               options->anchors[options->anchors_nr++] = xstrdup(arg);
        }
 
        /* flags options */
diff --git a/diff.h b/diff.h
index 0fb18dd73..7cf276f07 100644
--- a/diff.h
+++ b/diff.h
@@ -166,6 +166,10 @@ struct diff_options {
        const char *stat_sep;
        long xdl_opts;
 
+       /* see Documentation/diff-options.txt */
+       char **anchors;
+       size_t anchors_nr, anchors_alloc;
+
        int stat_width;
        int stat_name_width;
        int stat_graph_width;
diff --git a/t/t4033-diff-patience.sh b/t/t4033-diff-patience.sh
index 113304dc5..2d00d1056 100755
--- a/t/t4033-diff-patience.sh
+++ b/t/t4033-diff-patience.sh
@@ -13,6 +13,65 @@ test_expect_success '--ignore-space-at-eol with a single 
appended character' '
        grep "^+.*X" diff
 '
 
+test_expect_success '--anchor' '
+       printf "a\nb\nc\n" >pre &&
+       printf "c\na\nb\n" >post &&
+
+       # without anchor, c is moved
+       test_expect_code 1 git diff --no-index --patience pre post >diff &&
+       grep "^+c" diff &&
+
+       # with anchor, a is moved
+       test_expect_code 1 git diff --no-index --patience --anchor=c pre post 
>diff &&
+       grep "^+a" diff
+'
+
+test_expect_success '--anchor multiple' '
+       printf "a\nb\nc\nd\ne\nf\n" >pre &&
+       printf "c\na\nb\nf\nd\ne\n" >post &&
+
+       # with 1 anchor, c is not moved, but f is moved
+       test_expect_code 1 git diff --no-index --patience --anchor=c pre post 
>diff &&
+       grep "^+a" diff && # a is moved instead of c
+       grep "^+f" diff &&
+
+       # with 2 anchors, c and f are not moved
+       test_expect_code 1 git diff --no-index --patience --anchor=c --anchor=f 
pre post >diff &&
+       grep "^+a" diff &&
+       grep "^+d" diff # d is moved instead of f
+'
+
+test_expect_success '--anchor with nonexistent line has no effect' '
+       printf "a\nb\nc\n" >pre &&
+       printf "c\na\nb\n" >post &&
+
+       test_expect_code 1 git diff --no-index --patience --anchor=x pre post 
>diff &&
+       grep "^+c" diff
+'
+
+test_expect_success 'diff still produced with impossible multiple --anchor' '
+       printf "a\nb\nc\n" >pre &&
+       printf "c\na\nb\n" >post &&
+
+       test_expect_code 1 git diff --no-index --patience --anchor=a --anchor=c 
pre post >diff &&
+       mv post expected_post &&
+       git apply diff &&
+       diff expected_post post # make sure that the diff is correct
+'
+
+test_expect_success '--anchor works with other commands like "git show"' '
+       printf "a\nb\nc\n" >file &&
+       git add file &&
+       git commit -m foo &&
+       printf "c\na\nb\n" >file &&
+       git add file &&
+       git commit -m foo &&
+
+       # with anchor, a is moved
+       git show --patience --anchor=c >diff &&
+       grep "^+a" diff
+'
+
 test_diff_frobnitz "patience"
 
 test_diff_unique "patience"
diff --git a/xdiff/xdiff.h b/xdiff/xdiff.h
index 785ecb089..e6217e1d7 100644
--- a/xdiff/xdiff.h
+++ b/xdiff/xdiff.h
@@ -80,6 +80,10 @@ typedef struct s_mmbuffer {
 
 typedef struct s_xpparam {
        unsigned long flags;
+
+       /* See Documentation/diff-options.txt. */
+       char **anchors;
+       size_t anchors_nr;
 } xpparam_t;
 
 typedef struct s_xdemitcb {
diff --git a/xdiff/xpatience.c b/xdiff/xpatience.c
index a44e77632..f55c9fb11 100644
--- a/xdiff/xpatience.c
+++ b/xdiff/xpatience.c
@@ -62,6 +62,12 @@ struct hashmap {
                 * initially, "next" reflects only the order in file1.
                 */
                struct entry *next, *previous;
+
+               /*
+                * If 1, this entry can serve as the anchor. See
+                * Documentation/diff-options.txt for more information.
+                */
+               unsigned anchor : 1;
        } *entries, *first, *last;
        /* were common records found? */
        unsigned long has_matches;
@@ -70,8 +76,19 @@ struct hashmap {
        xpparam_t const *xpp;
 };
 
+static int is_anchor(xpparam_t const *xpp, const char *line)
+{
+       int i;
+       for (i = 0; i < xpp->anchors_nr; i++) {
+               if (!strncmp(line, xpp->anchors[i], strlen(xpp->anchors[i])))
+                       return 1;
+       }
+       return 0;
+}
+
 /* The argument "pass" is 1 for the first file, 2 for the second. */
-static void insert_record(int line, struct hashmap *map, int pass)
+static void insert_record(xpparam_t const *xpp, int line, struct hashmap *map,
+                         int pass)
 {
        xrecord_t **records = pass == 1 ?
                map->env->xdf1.recs : map->env->xdf2.recs;
@@ -110,6 +127,7 @@ static void insert_record(int line, struct hashmap *map, 
int pass)
                return;
        map->entries[index].line1 = line;
        map->entries[index].hash = record->ha;
+       map->entries[index].anchor = is_anchor(xpp, map->env->xdf1.recs[line - 
1]->ptr);
        if (!map->first)
                map->first = map->entries + index;
        if (map->last) {
@@ -147,11 +165,11 @@ static int fill_hashmap(mmfile_t *file1, mmfile_t *file2,
 
        /* First, fill with entries from the first file */
        while (count1--)
-               insert_record(line1++, result, 1);
+               insert_record(xpp, line1++, result, 1);
 
        /* Then search for matches in the second file */
        while (count2--)
-               insert_record(line2++, result, 2);
+               insert_record(xpp, line2++, result, 2);
 
        return 0;
 }
@@ -192,14 +210,28 @@ static struct entry *find_longest_common_sequence(struct 
hashmap *map)
        int longest = 0, i;
        struct entry *entry;
 
+       /*
+        * If not -1, this entry in sequence must never be overridden.
+        * Therefore, overriding entries before this has no effect, so
+        * do not do that either.
+        */
+       int anchor_i = -1;
+
        for (entry = map->first; entry; entry = entry->next) {
                if (!entry->line2 || entry->line2 == NON_UNIQUE)
                        continue;
                i = binary_search(sequence, longest, entry);
                entry->previous = i < 0 ? NULL : sequence[i];
-               sequence[++i] = entry;
-               if (i == longest)
+               ++i;
+               if (i <= anchor_i)
+                       continue;
+               sequence[i] = entry;
+               if (entry->anchor) {
+                       anchor_i = i;
+                       longest = anchor_i + 1;
+               } else if (i == longest) {
                        longest++;
+               }
        }
 
        /* No common unique lines were found */
-- 
2.15.0.448.gf294e3d99a-goog

Reply via email to