Automatic re-encoding of commit messages (and dropping of the encoding
header) hurts attempts to do reversible history rewrites (e.g. sha1sum
<-> sha256sum transitions, some subtree rewrites), and seems
inconsistent with the general principle followed elsewhere in
fast-export of requiring explicit user requests to modify the output
(e.g. --signed-tags=strip, --tag-of-filtered-object=rewrite).  Add a
--reencode flag that the user can use to specify, and like other
fast-export flags, default it to 'abort'.

Signed-off-by: Elijah Newren <new...@gmail.com>
---
 builtin/fast-export.c  | 35 ++++++++++++++++++++++++++++++++---
 t/t9350-fast-export.sh | 37 ++++++++++++++++++++++++++++++++++---
 2 files changed, 66 insertions(+), 6 deletions(-)

diff --git a/builtin/fast-export.c b/builtin/fast-export.c
index 66331fa401..43cc52331c 100644
--- a/builtin/fast-export.c
+++ b/builtin/fast-export.c
@@ -33,6 +33,7 @@ static const char *fast_export_usage[] = {
 static int progress;
 static enum { SIGNED_TAG_ABORT, VERBATIM, WARN, WARN_STRIP, STRIP } 
signed_tag_mode = SIGNED_TAG_ABORT;
 static enum { TAG_FILTERING_ABORT, DROP, REWRITE } tag_of_filtered_mode = 
TAG_FILTERING_ABORT;
+static enum { REENCODE_ABORT, REENCODE_PLEASE, REENCODE_NEVER } reencode_mode 
= REENCODE_ABORT;
 static int fake_missing_tagger;
 static int use_done_feature;
 static int no_data;
@@ -77,6 +78,20 @@ static int parse_opt_tag_of_filtered_mode(const struct 
option *opt,
        return 0;
 }
 
+static int parse_opt_reencode_mode(const struct option *opt,
+                                  const char *arg, int unset)
+{
+       if (unset || !strcmp(arg, "abort"))
+               reencode_mode = REENCODE_ABORT;
+       else if (!strcmp(arg, "yes"))
+               reencode_mode = REENCODE_PLEASE;
+       else if (!strcmp(arg, "no"))
+               reencode_mode = REENCODE_NEVER;
+       else
+               return error("Unknown reencoding mode: %s", arg);
+       return 0;
+}
+
 static struct decoration idnums;
 static uint32_t last_idnum;
 
@@ -633,10 +648,21 @@ static void handle_commit(struct commit *commit, struct 
rev_info *rev,
        }
 
        mark_next_object(&commit->object);
-       if (anonymize)
+       if (anonymize) {
                reencoded = anonymize_commit_message(message);
-       else if (!is_encoding_utf8(encoding))
-               reencoded = reencode_string(message, "UTF-8", encoding);
+       } else if (encoding) {
+               switch(reencode_mode) {
+               case REENCODE_PLEASE:
+                       reencoded = reencode_string(message, "UTF-8", encoding);
+                       break;
+               case REENCODE_NEVER:
+                       break;
+               case REENCODE_ABORT:
+                       die("Encountered commit-specific encoding %s in commit "
+                           "%s; use --reencode=<mode> to handle it",
+                           encoding, oid_to_hex(&commit->object.oid));
+               }
+       }
        if (!commit->parents)
                printf("reset %s\n", refname);
        printf("commit %s\nmark :%"PRIu32"\n", refname, last_idnum);
@@ -1091,6 +1117,9 @@ int cmd_fast_export(int argc, const char **argv, const 
char *prefix)
                OPT_CALLBACK(0, "tag-of-filtered-object", 
&tag_of_filtered_mode, N_("mode"),
                             N_("select handling of tags that tag filtered 
objects"),
                             parse_opt_tag_of_filtered_mode),
+               OPT_CALLBACK(0, "reencode", &reencode_mode, N_("mode"),
+                            N_("select handling of commit messages in an 
alternate encoding"),
+                            parse_opt_reencode_mode),
                OPT_STRING(0, "export-marks", &export_filename, N_("file"),
                             N_("Dump marks to this file")),
                OPT_STRING(0, "import-marks", &import_filename, N_("file"),
diff --git a/t/t9350-fast-export.sh b/t/t9350-fast-export.sh
index fa124f4842..3cf4c7fc0c 100755
--- a/t/t9350-fast-export.sh
+++ b/t/t9350-fast-export.sh
@@ -94,14 +94,14 @@ test_expect_success 'fast-export --show-original-ids | git 
fast-import' '
        test $MUSS = $(git rev-parse --verify refs/tags/muss)
 '
 
-test_expect_success 'iso-8859-7' '
+test_expect_success 'reencoding iso-8859-7' '
 
        test_when_finished "git reset --hard HEAD~1" &&
        test_config i18n.commitencoding iso-8859-7 &&
        test_tick &&
        echo rosten >file &&
        git commit -s -F 
"$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
-       git fast-export wer^..wer >iso-8859-7.fi &&
+       git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
        sed "s/wer/i18n/" iso-8859-7.fi |
                (cd new &&
                 git fast-import &&
@@ -120,13 +120,44 @@ test_expect_success 'iso-8859-7' '
                 ! grep ^encoding actual)
 '
 
+test_expect_success 'aborting on iso-8859-7' '
+
+       test_when_finished "git reset --hard HEAD~1" &&
+       test_config i18n.commitencoding iso-8859-7 &&
+       echo rosten >file &&
+       git commit -s -F 
"$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
+       test_must_fail git fast-export --reencode=abort wer^..wer >iso-8859-7.fi
+'
+
+test_expect_success 'preserving iso-8859-7' '
+
+       test_when_finished "git reset --hard HEAD~1" &&
+       test_config i18n.commitencoding iso-8859-7 &&
+       echo rosten >file &&
+       git commit -s -F 
"$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
+       git fast-export --reencode=no wer^..wer >iso-8859-7.fi &&
+       sed "s/wer/i18n-no-recoding/" iso-8859-7.fi |
+               (cd new &&
+                git fast-import &&
+                # The commit object, if not re-encoded, is 240 bytes.
+                # Removing the "encoding iso-8859-7\n" header would drops 20
+                # bytes.  Re-encoding the Pi character from \xF0 in
+                # iso-8859-7 to \xCF\x80 in utf-8 would add a byte.  I would
+                # grep for the # specific bytes, but Windows lamely does not
+                # allow that, so just search for the expected size.
+                test 240 -eq "$(git cat-file -s i18n-no-recoding)" &&
+                # Also make sure the commit has the "encoding" header
+                git cat-file commit i18n-no-recoding >actual &&
+                grep ^encoding actual)
+'
+
 test_expect_success 'encoding preserved if reencoding fails' '
 
        test_when_finished "git reset --hard HEAD~1" &&
        test_config i18n.commitencoding iso-8859-7 &&
        echo rosten >file &&
        git commit -s -F 
"$TEST_DIRECTORY/t9350/broken-iso-8859-7-commit-message.txt" file &&
-       git fast-export wer^..wer >iso-8859-7.fi &&
+       git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
        sed "s/wer/i18n-invalid/" iso-8859-7.fi |
                (cd new &&
                 git fast-import &&
-- 
2.21.0.782.g2063122293

Reply via email to