From: Michael Platings <mich...@platin.gs>

---
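Notes for reviewers (this part is not meant for the commit message): the
heuristic fingerprints each line as the set of byte pairs it contains, scores
candidate parent lines by how many pairs they share (weighted towards lines
near the expected position), and then recursively partitions around the most
confidently matched line so that matches stay in order. Below is a rough
standalone sketch of just the fingerprinting part, to illustrate the idea;
fp_of() and fp_similarity() are names invented for this sketch only, the real
code is in the blame.c hunk further down.

/*
 * Toy illustration of byte-pair fingerprinting (sketch only, not the
 * patch code).
 */
#include <ctype.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define FP_WORDS (8 * 256) /* 256*256 possible byte pairs, 32 bits per word */

static void fp_of(uint32_t *bits, const char *s)
{
	size_t i, n = strlen(s);

	memset(bits, 0, FP_WORDS * sizeof(*bits));
	for (i = 0; i + 1 < n; i++) {
		unsigned c = tolower((unsigned char)s[i]) |
			     (tolower((unsigned char)s[i + 1]) << 8);
		bits[c >> 5] |= 1u << (c & 0x1f);
	}
}

static int fp_similarity(const uint32_t *a, const uint32_t *b)
{
	int i, shared = 0;

	for (i = 0; i < FP_WORDS; i++) {
		uint32_t v = a[i] & b[i];
		while (v) { /* Kernighan count of shared pairs */
			v &= v - 1;
			shared++;
		}
	}
	return shared;
}

int main(void)
{
	static uint32_t a[FP_WORDS], b[FP_WORDS];

	fp_of(a, "    if (rows) *rows = field->rows;");
	fp_of(b, "    if (rows) {");
	printf("shared byte pairs: %d\n", fp_similarity(a, b));
	return 0;
}
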
 blame.c                | 352 +++++++++++++++++++++++++++++++++++++++++++++++--
 blame.h                |   1 +
 builtin/blame.c        |   3 +
 t/t8020-blame-fuzzy.sh | 264 +++++++++++++++++++++++++++++++++++++
 4 files changed, 609 insertions(+), 11 deletions(-)
 create mode 100755 t/t8020-blame-fuzzy.sh
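
The heuristic is opt-in: blame output is unchanged unless the new -F/--fuzzy
option is given, e.g.

	git blame --fuzzy <file>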

diff --git a/blame.c b/blame.c
index 5c07dec190..b5a40c8e9f 100644
--- a/blame.c
+++ b/blame.c
@@ -997,6 +997,326 @@ static void pass_blame_to_parent(struct blame_scoreboard *sb,
        return;
 }
 
+/* https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel */
+static int bitcount(uint32_t v) {
+       v = v - ((v >> 1) & 0x55555555u);
+       v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u);
+       return ((v + (v >> 4) & 0xf0f0f0fu) * 0x1010101u) >> 24;
+}
+
+#define FINGERPRINT_LENGTH (8*256)
+/* This is just a bitset indicating which byte pairs are present.
+ e.g. the string "good goo" has pairs "go", "oo", "od", "d ", " g"
+ String similarity is calculated by ANDing the two bitsets together and counting the set bits.
+ TODO for the string lengths we typically deal with, this would probably be
+ implemented more efficiently with a set data structure.
+ */
+struct fingerprint {
+       uint32_t bits[FINGERPRINT_LENGTH];
+};
+
+static void get_fingerprint(struct fingerprint *result,
+                           const char *line_begin, const char *line_end) {
+       memset(result, 0, sizeof(struct fingerprint));
+       for (const char *p = line_begin; p + 1 < line_end; ++p) {
+               unsigned c = tolower((unsigned char)*p) | (tolower((unsigned char)*(p + 1)) << 8);
+               result->bits[c >> 5] |= 1u << (c & 0x1f);
+       }
+}
+
+static int fingerprint_similarity(const struct fingerprint *a,
+                                 const struct fingerprint *b) {
+       int intersection = 0;
+       for (int i = 0; i < FINGERPRINT_LENGTH; ++i) {
+               intersection += bitcount(a->bits[i] & b->bits[i]);
+       }
+       return intersection;
+}
+
+struct fuzzy_blame_parent_data {
+       struct blame_origin *parent;
+       long offset;
+       struct blame_entry **processed_entries;
+       struct blame_entry **target_entries;
+       const char *parent_content;
+       const char *target_content;
+       int *parent_line_starts;
+       int *target_line_starts;
+       int parent_line_count;
+       int target_line_count;
+};
+
+static void get_chunk_fingerprints(struct fingerprint *fingerprints,
+                                  const char *content,
+                                  const int *line_starts,
+                                  long chunk_start,
+                                  long chunk_length) {
+       line_starts += chunk_start;
+       for (int i = 0; i != chunk_length; ++i) {
+               const char* linestart = content + line_starts[i];
+               const char* lineend = content + line_starts[i + 1];
+               get_fingerprint(fingerprints + i, linestart, lineend);
+       }
+}
+
+/* This finds the line that we can match with the most confidence, and
+ uses it as a partition. It then calls itself on the lines on either side of
+ that partition. In this way we avoid lines appearing out of order, and retain
+ a sensible line ordering.
+ TODO: so much optimisation. Currently this does the same work repeatedly.
+ */
+static void fuzzy_find_matching_lines_recurse(
+               const char *content_a, const char *content_b,
+               const int *line_starts_a, const int *line_starts_b,
+               int start_a, int start_b,
+               int length_a, int length_b,
+               int *result,
+               struct fingerprint *fingerprints_a,
+               struct fingerprint *fingerprints_b) {
+
+       int most_certain_line = -1;
+       int most_certain_line_certainty = -1;
+
+       for (int i = 0; i < length_b; ++i) {
+               const struct fingerprint *fingerprint_b = fingerprints_b + i;
+
+               int closest_line_a = (i * 2 + 1) * length_a /
+               (length_b * 2);
+
+               /* Limit range of search to a reasonable number of lines.
+                TODO consider scaling this up if length_a is greater than
+                length_b. */
+               const int MAX_SEARCH_DISTANCE = 5;
+               int search_start = closest_line_a - (MAX_SEARCH_DISTANCE - 1);
+               int search_end = closest_line_a + MAX_SEARCH_DISTANCE;
+               if (search_start < 0) search_start = 0;
+               if (search_end > length_a) search_end = length_a;
+
+               /* Find both the best and 2nd best matches. The match certainty
+                is the difference between these values. */
+               int best_similarity = 0, second_best_similarity = 0;
+               int best_similarity_index = 0;
+
+               for (int j = search_start; j < search_end; ++j) {
+                       int similarity = fingerprint_similarity(
+                                                               fingerprint_b,
+                                                               fingerprints_a + j) *
+                               (1000 - abs(j - closest_line_a));
+
+                       if (similarity > best_similarity) {
+                               second_best_similarity = best_similarity;
+                               best_similarity = similarity;
+                               best_similarity_index = j;
+                       }
+                       else if (similarity > second_best_similarity) {
+                               second_best_similarity = similarity;
+                       }
+               }
+
+               if (best_similarity == 0) {
+                       result[i] = -1;
+                       continue;
+               }
+
+               result[i] = start_a + best_similarity_index;
+
+               int certainty = best_similarity - second_best_similarity;
+               if (certainty > most_certain_line_certainty) {
+                       most_certain_line_certainty = certainty;
+                       most_certain_line = i;
+               }
+       }
+
+       if (most_certain_line == -1) {
+               return;
+       }
+
+       if (most_certain_line > 0) {
+               fuzzy_find_matching_lines_recurse(content_a, content_b, line_starts_a, line_starts_b, start_a, start_b, result[most_certain_line] + 1 - start_a, most_certain_line, result, fingerprints_a, fingerprints_b);
+       }
+       if (most_certain_line + 1 < length_b) {
+               int second_half_start_a = result[most_certain_line];
+               int second_half_start_b = start_b + most_certain_line + 1;
+               int second_half_length_a = length_a + start_a - second_half_start_a;
+               int second_half_length_b = length_b + start_b - second_half_start_b;
+               fuzzy_find_matching_lines_recurse(content_a, content_b, line_starts_a, line_starts_b, second_half_start_a, second_half_start_b, second_half_length_a, second_half_length_b, result + most_certain_line + 1, fingerprints_a + second_half_start_a - start_a, fingerprints_b + most_certain_line + 1);
+       }
+}
+
+/* Find line numbers in "a" that match with lines in "b"
+ Returns an array of either line indices or -1 where no match is found.
+ The returned array must be free()d after use.
+ */
+static int *fuzzy_find_matching_lines(
+       const char *content_a, const char *content_b,
+       const int *line_starts_a, const int *line_starts_b,
+       int start_a, int start_b,
+       int length_a, int length_b) {
+
+       int *result = malloc(sizeof(int) * length_b);
+
+       struct fingerprint *fingerprints_a =
+               malloc(sizeof(struct fingerprint) * length_a);
+       struct fingerprint *fingerprints_b =
+               malloc(sizeof(struct fingerprint) * length_b);
+
+       get_chunk_fingerprints(fingerprints_a, content_a,
+                              line_starts_a,
+                              start_a, length_a);
+       get_chunk_fingerprints(fingerprints_b, content_b,
+                              line_starts_b,
+                              start_b, length_b);
+
+       fuzzy_find_matching_lines_recurse(content_a, content_b,
+                                           line_starts_a, line_starts_b,
+                                           start_a, start_b,
+                                           length_a, length_b,
+                                           result,
+                                           fingerprints_a,
+                                           fingerprints_b);
+
+       free(fingerprints_a);
+       free(fingerprints_b);
+
+       return result;
+}
+
+static int blame_chunk_fuzzy(long parent_chunk_start,
+                                 long parent_chunk_length,
+                                 long target_chunk_start,
+                                 long target_chunk_length,
+                                 void *data)
+{
+       struct fuzzy_blame_parent_data *d = data;
+
+       if (parent_chunk_start - target_chunk_start != d->offset)
+               die("internal error in blame::blame_chunk_fuzzy");
+
+       int target_chunk_end = target_chunk_start + target_chunk_length;
+
+       struct blame_origin *parent = d->parent;
+       struct blame_entry *e = *d->target_entries;
+       struct blame_entry *parent_tail = NULL;
+       struct blame_entry *target_tail = NULL;
+
+       if (parent_chunk_length == 0) {
+               /* Don't try to blame parent for newly added lines */
+               while (e && e->s_lno < target_chunk_end) {
+                       target_tail = e;
+                       e = e->next;
+               }
+               d->target_entries = &target_tail->next;
+               goto finish;
+       }
+
+       int *matched_lines = fuzzy_find_matching_lines(
+               d->parent_content, d->target_content,
+               d->parent_line_starts, d->target_line_starts,
+               parent_chunk_start, target_chunk_start,
+               parent_chunk_length, target_chunk_length);
+
+       while (e && e->s_lno < target_chunk_end) {
+               struct blame_entry *next = e->next;
+
+               for (int i = 0; i < e->num_lines; ++i) {
+                       struct blame_entry *n =
+                               xcalloc(1, sizeof (struct blame_entry));
+                       n->lno = e->lno + i;
+                       n->num_lines = 1;
+                       n->score = 0;
+
+                       int matched_line = matched_lines[i + e->s_lno -
+                               target_chunk_start];
+
+                       if (matched_line != -1) {
+                               n->suspect = blame_origin_incref(parent);
+                               n->s_lno = matched_line;
+                               n->next = parent_tail;
+                               parent_tail = n;
+                       } else {
+                               n->suspect = blame_origin_incref(e->suspect);
+                               n->s_lno = e->s_lno + i;
+                               n->next = target_tail;
+                               target_tail = n;
+                       }
+               }
+
+               blame_origin_decref(e->suspect);
+               free(e);
+
+               e = next;
+       }
+
+       if (parent_tail) {
+               parent_tail = llist_mergesort(parent_tail, get_next_blame,
+                                             set_next_blame,
+                                             compare_blame_suspect);
+               *d->processed_entries = parent_tail;
+               while (parent_tail->next) parent_tail = parent_tail->next;
+               d->processed_entries = &parent_tail->next;
+       }
+
+       if (target_tail) {
+               target_tail = llist_mergesort(target_tail, get_next_blame,
+                                             set_next_blame,
+                                             compare_blame_suspect);
+               *d->target_entries = target_tail;
+               while (target_tail->next) target_tail = target_tail->next;
+               d->target_entries = &target_tail->next;
+       }
+
+       *d->target_entries = e;
+
+       free(matched_lines);
+
+finish:
+       d->offset = parent_chunk_start + parent_chunk_length -
+               (target_chunk_start + target_chunk_length);
+
+       return 0;
+}
+
+static int find_line_starts(int **line_starts, const char *buf, unsigned long len);
+
+static void pass_blame_to_parent_fuzzy(struct blame_scoreboard *sb,
+                                      struct blame_origin *target,
+                                      struct blame_origin *parent)
+{
+       mmfile_t file_p, file_o;
+       struct fuzzy_blame_parent_data d;
+       struct blame_entry *newdest = NULL;
+
+       if (!target->suspects)
+               return; /* nothing remains for this target */
+
+       d.parent = parent;
+       d.offset = 0;
+       d.processed_entries = &newdest;
+       d.target_entries = &target->suspects;
+
+       fill_origin_blob(&sb->revs->diffopt, parent, &file_p, &sb->num_read_blob);
+       fill_origin_blob(&sb->revs->diffopt, target, &file_o, &sb->num_read_blob);
+       sb->num_get_patch++;
+
+       d.parent_content = file_p.ptr;
+       d.target_content = file_o.ptr;
+       d.parent_line_count = find_line_starts(&d.parent_line_starts, file_p.ptr, file_p.size);
+       d.target_line_count = find_line_starts(&d.target_line_starts, file_o.ptr, file_o.size);
+
+       if (diff_hunks(&file_p, &file_o, blame_chunk_fuzzy, &d, sb->xdl_opts))
+               die("unable to generate diff (%s -> %s)",
+                       oid_to_hex(&parent->commit->object.oid),
+                       oid_to_hex(&target->commit->object.oid));
+
+       *d.processed_entries = NULL;
+       queue_blames(sb, parent, newdest);
+
+       free(d.target_line_starts);
+       free(d.parent_line_starts);
+
+       return;
+}
+
 /*
  * The lines in blame_entry after splitting blames many times can become
  * very small and trivial, and at some point it becomes pointless to
@@ -1433,7 +1753,7 @@ static void pass_blame(struct blame_scoreboard *sb, struct blame_origin *origin,
        struct commit *commit = origin->commit;
        struct commit_list *sg;
        struct blame_origin *sg_buf[MAXSG];
-       struct blame_origin *porigin, **sg_origin = sg_buf;
+       struct blame_origin *porigin = NULL, **sg_origin = sg_buf;
        struct blame_entry *toosmall = NULL;
        struct blame_entry *blames, **blametail = &blames;
 
@@ -1560,6 +1880,11 @@ static void pass_blame(struct blame_scoreboard *sb, struct blame_origin *origin,
                *tail = origin->suspects;
                origin->suspects = toosmall;
        }
+
+       if (sb->fuzzy && porigin) {
+               pass_blame_to_parent_fuzzy(sb, origin, porigin);
+       }
+
        for (i = 0; i < num_sg; i++) {
                if (sg_origin[i]) {
                        drop_origin_blob(sg_origin[i]);
@@ -1645,14 +1970,8 @@ static const char *get_next_line(const char *start, const char *end)
        return nl ? nl + 1 : end;
 }
 
-/*
- * To allow quick access to the contents of nth line in the
- * final image, prepare an index in the scoreboard.
- */
-static int prepare_lines(struct blame_scoreboard *sb)
+static int find_line_starts(int **line_starts, const char *buf, unsigned long len)
 {
-       const char *buf = sb->final_buf;
-       unsigned long len = sb->final_buf_size;
        const char *end = buf + len;
        const char *p;
        int *lineno;
@@ -1661,15 +1980,26 @@ static int prepare_lines(struct blame_scoreboard *sb)
        for (p = buf; p < end; p = get_next_line(p, end))
                num++;
 
-       ALLOC_ARRAY(sb->lineno, num + 1);
-       lineno = sb->lineno;
+       ALLOC_ARRAY(*line_starts, num + 1);
+       lineno = *line_starts;
 
        for (p = buf; p < end; p = get_next_line(p, end))
                *lineno++ = p - buf;
 
        *lineno = len;
 
-       sb->num_lines = num;
+       return num;
+}
+
+/*
+ * To allow quick access to the contents of nth line in the
+ * final image, prepare an index in the scoreboard.
+ */
+static int prepare_lines(struct blame_scoreboard *sb)
+{
+       sb->num_lines = find_line_starts(&sb->lineno,
+                                        sb->final_buf,
+                                        sb->final_buf_size);
        return sb->num_lines;
 }
 
diff --git a/blame.h b/blame.h
index be3a895043..eb528f9f80 100644
--- a/blame.h
+++ b/blame.h
@@ -142,6 +142,7 @@ struct blame_scoreboard {
        int xdl_opts;
        int no_whole_file_rename;
        int debug;
+       int fuzzy;
 
        /* callbacks */
        void(*on_sanity_fail)(struct blame_scoreboard *, int);
diff --git a/builtin/blame.c b/builtin/blame.c
index 177c1022a0..d0a0dfff79 100644
--- a/builtin/blame.c
+++ b/builtin/blame.c
@@ -57,6 +57,7 @@ static struct date_mode blame_date_mode = { DATE_ISO8601 };
 static size_t blame_date_width;
 
 static struct string_list mailmap = STRING_LIST_INIT_NODUP;
+static int fuzzy;
 
 #ifndef DEBUG
 #define DEBUG 0
@@ -808,6 +809,7 @@ int cmd_blame(int argc, const char **argv, const char *prefix)
                OPT_BIT('w', NULL, &xdl_opts, N_("Ignore whitespace differences"), XDF_IGNORE_WHITESPACE),
                OPT_BIT(0, "color-lines", &output_option, N_("color redundant metadata from previous line differently"), OUTPUT_COLOR_LINE),
                OPT_BIT(0, "color-by-age", &output_option, N_("color lines by age"), OUTPUT_SHOW_AGE_WITH_COLOR),
+               OPT_BOOL('F', "fuzzy", &fuzzy, N_("Try to assign blame to similar lines in the parent")),
 
                /*
                 * The following two options are parsed by parse_revision_opt()
@@ -996,6 +998,7 @@ int cmd_blame(int argc, const char **argv, const char *prefix)
 
        init_scoreboard(&sb);
        sb.revs = &revs;
+       sb.fuzzy = fuzzy;
        sb.contents_from = contents_from;
        sb.reverse = reverse;
        sb.repo = the_repository;
diff --git a/t/t8020-blame-fuzzy.sh b/t/t8020-blame-fuzzy.sh
new file mode 100755
index 0000000000..d26c945722
--- /dev/null
+++ b/t/t8020-blame-fuzzy.sh
@@ -0,0 +1,264 @@
+#!/bin/sh
+
+test_description='git blame --fuzzy assigns blame to similar lines in the parent'
+. ./test-lib.sh
+
+pick_author='s/^[0-9a-f^]* *(\([^ ]*\) .*/\1/'
+
+file_count=8
+
+# Each test is composed of 4 variables:
+# titleN - the test name
+# aN - the initial content
+# bN - the final content
+# expectedN - the line numbers from aN that we expect git blame
+#             on bN to identify, or "Final" if bN itself should
+#             be identified as the origin of that line.
+
+title1="Expand lines"
+cat <<EOF >a1
+aaa
+bbb
+ccc
+ddd
+eee
+EOF
+cat <<EOF >b1
+aaa
+bbbx
+bbbx
+ccc
+dddx
+dddx
+eee
+EOF
+cat <<EOF >expected1
+1
+2
+2
+3
+4
+4
+5
+EOF
+
+title2="Combine 3 lines into 2"
+cat <<EOF >a2
+if ((maxgrow==0) ||
+    ( single_line_field && (field->dcols < maxgrow)) ||
+    (!single_line_field && (field->drows < maxgrow)))
+EOF
+cat <<EOF >b2
+if ((maxgrow == 0) || (single_line_field && (field->dcols < maxgrow)) ||
+    (!single_line_field && (field->drows < maxgrow))) {
+EOF
+cat <<EOF >expected2
+2
+3
+EOF
+
+title3="Add curly brackets"
+cat <<EOF >a3
+    if (rows) *rows = field->rows;
+    if (cols) *cols = field->cols;
+    if (frow) *frow = field->frow;
+    if (fcol) *fcol = field->fcol;
+EOF
+cat <<EOF >b3
+    if (rows) {
+      *rows = field->rows;
+    }
+    if (cols) {
+      *cols = field->cols;
+    }
+    if (frow) {
+      *frow = field->frow;
+    }
+    if (fcol) {
+      *fcol = field->fcol;
+    }
+EOF
+cat <<EOF >expected3
+1
+1
+Final
+2
+2
+Final
+3
+3
+Final
+4
+4
+Final
+EOF
+
+
+title4="Combine many lines and change case"
+cat <<EOF >a4
+for(row=0,pBuffer=field->buf;
+    row<height;
+    row++,pBuffer+=width )
+  {
+    if ((len = (int)( After_End_Of_Data( pBuffer, width ) - pBuffer )) > 0)
+      {
+        wmove( win, row, 0 );
+        waddnstr( win, pBuffer, len );
+EOF
+cat <<EOF >b4
+for (Row = 0, PBuffer = field->buf; Row < Height; Row++, PBuffer += Width) {
+  if ((Len = (int)(afterEndOfData(PBuffer, Width) - PBuffer)) > 0) {
+    wmove(win, Row, 0);
+    waddnstr(win, PBuffer, Len);
+EOF
+cat <<EOF >expected4
+1
+5
+7
+8
+EOF
+
+title5="Rename and combine lines"
+cat <<EOF >a5
+bool need_visual_update = ((form != (FORM *)0)      &&
+                           (form->status & _POSTED) &&
+                           (form->current==field));
+
+if (need_visual_update)
+  Synchronize_Buffer(form);
+
+if (single_line_field)
+  {
+    growth = field->cols * amount;
+    if (field->maxgrow)
+      growth = Minimum(field->maxgrow - field->dcols,growth);
+    field->dcols += growth;
+    if (field->dcols == field->maxgrow)
+EOF
+cat <<EOF >b5
+bool NeedVisualUpdate = ((Form != (FORM *)0) && (Form->status & _POSTED) &&
+                         (Form->current == field));
+
+if (NeedVisualUpdate) {
+  synchronizeBuffer(Form);
+}
+
+if (SingleLineField) {
+  Growth = field->cols * amount;
+  if (field->maxgrow) {
+    Growth = Minimum(field->maxgrow - field->dcols, Growth);
+  }
+  field->dcols += Growth;
+  if (field->dcols == field->maxgrow) {
+EOF
+cat <<EOF >expected5
+1
+3
+4
+5
+6
+Final
+7
+8
+10
+11
+12
+Final
+13
+14
+EOF
+
+# Both lines match identically so position must be used to tie-break.
+title6="Same line twice"
+cat <<EOF >a6
+abc
+abc
+EOF
+cat <<EOF >b6
+abcd
+abcd
+EOF
+cat <<EOF >expected6
+1
+2
+EOF
+
+title7="Enforce line order"
+cat <<EOF >a7
+abcdef
+ghijkl
+ab
+EOF
+cat <<EOF >b7
+ghijk
+abcd
+EOF
+cat <<EOF >expected7
+2
+3
+EOF
+
+title8="Expand lines and rename variables"
+cat <<EOF >a8
+int myFunction(int ArgumentOne, Thing *ArgTwo, Blah XuglyBug) {
+  Squiggle FabulousResult = squargle(ArgumentOne, *ArgTwo,
+    XuglyBug) + EwwwGlobalWithAReallyLongNameYepTooLong;
+  return FabulousResult * 42;
+}
+EOF
+cat <<EOF >b8
+int myFunction(int argument_one, Thing *arg_asdfgh,
+    Blah xugly_bug) {
+  Squiggle fabulous_result = squargle(argument_one,
+    *arg_asdfgh, xugly_bug)
+    + g_ewww_global_with_a_really_long_name_yep_too_long;
+  return fabulous_result * 42;
+}
+EOF
+cat <<EOF >expected8
+1
+1
+2
+3
+3
+4
+5
+EOF
+
+test_expect_success setup '
+       { for ((i=1;i<=$file_count;i++))
+       do
+               # Append each line in a separate commit to make it easy to
+               # check which original line the blame output relates to.
+
+               line_count=0 &&
+               { while read line
+               do
+                       line_count=$((line_count+1)) &&
+                       echo $line >>"$i" &&
+                       git add "$i" &&
+                       test_tick &&
+                       GIT_AUTHOR_NAME="$line_count" git commit -m "$line_count"
+               done } <"a$i"
+       done } &&
+
+       { for ((i=1;i<=$file_count;i++))
+       do
+               # Overwrite the files with the final content.
+               cp b$i $i &&
+               git add $i &&
+               test_tick
+       done } &&
+
+       # Commit the final content all at once so it can all be
+       # referred to with the same commit ID.
+       GIT_AUTHOR_NAME=Final git commit -m Final
+'
+
+for ((i=1;i<=$file_count;i++)); do
+       title="title$i"
+       test_expect_success "${!title}" \
+       "git blame --fuzzy -- $i | sed -e \"$pick_author\" >actual && test_cmp expected$i actual"
+done
+
+test_done
-- 
2.14.3 (Apple Git-98)
