Junio C Hamano <gits...@pobox.com> writes:

> I am not saying that we shouldn't have support for users to fix
> their repository and get out of this transitory broken state.  A
> recent work by Torsten Bögershausen to have ls-files report the end
> of line convention used in the blob in the index and the settings
> that affect conversion for each path (among other things) is a step
> in the right direction.  With support like that, those who noticed
> that they by mistake added CRLF files to the index as-is when they
> wanted their project to be cross platform can recover from it by
> setting necessary attributes (i.e. mark them as "text") and then
> find paths that are broken out of "ls-files --eol" output to see
> which ones are not using lf end-of-line in the index.
>
> I do not think there is a canned command to help deal with these
> broken paths right now.  You would have to check them out of the
> index (you would get a CRLF file in the working tree in the example
> we are discussing), fix the line endings (you would run dos2unix on
> it in this example, as you would want "text=true" attribute) and
> "git add" them to recover manually, but I can imagine that Torsten's
> work can be extended to do all of these, without molesting the
> working tree files, with minimum work by the end user.  That is:
>
>  * Reuse Torsten's "ls-files --eol" code to find paths that record
>    the blob in the index that does not follow the eol convention
>    specified for the path;
>
>  * For each of these index entries, run convert_to_working_tree() on
>    the indexed contents, and then on the result of it, run
>    convert_to_git().  The result is the blob that the index ought to
>    have had, if it were to be consistent with the attribute
>    settings.  So add that to the index.
>
>  * Write the index out.
>
>  * Tell the user to commit or commit it automatically with a canned
>    log message "fix broken encoding" or something.

Here is what I whipped up as a lunch-break hack.  I do not claim
that "git add" would be the best place to do this, but it should be
sufficient to illustrate the overall idea.

The user can say "git add --fix-index" and have a simplified version
of the above happen, i.e. for each path in the index, we convert the
contents recorded there to the working tree representation (i.e.
passing them through core.eol and smudge filter conversion) and then
convert the result back to the Git blob representation (i.e. clean
filter and core.autocrlf).  When the result is different from what we
started from, we know we have an unnormalized blob registered in the
index, so we replace it.  After this, "git diff --cached" would show
the correction made by this operation, and committing it would let
you fix the earlier mistake that added CRLF content when the path
was marked with text=true attribute.

We could go even fancier and attempt the round-trip twice or more.
It is possible that the in-index representation will not converge
when you use a misconfigured pair of clean/smudge filters (e.g.
using "gzip -d -c" as the smudge filter, and then using "gzip -c"
without "-n" option as the clean filter would most likely make the
in-index representation fuzzy, as each time the cycle is run, the
compressed contents will be made with different timestamps, even
though the working tree representation will be the same), and an
operation "we screwed up the filters, please repair the damage!"
like this "add --fix-index" is probably the best place to catch such
a misconfiguration.

 builtin/add.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/builtin/add.c b/builtin/add.c
index 145f06e..36d3915 100644
--- a/builtin/add.c
+++ b/builtin/add.c
@@ -233,6 +233,7 @@ N_("The following paths are ignored by one of your 
.gitignore files:\n");
 
 static int verbose, show_only, ignored_too, refresh_only;
 static int ignore_add_errors, intent_to_add, ignore_missing;
+static int fix_index;
 
 #define ADDREMOVE_DEFAULT 1
 static int addremove = ADDREMOVE_DEFAULT;
@@ -263,6 +264,7 @@ static struct option builtin_add_options[] = {
        OPT_BOOL( 0 , "refresh", &refresh_only, N_("don't add, only refresh the 
index")),
        OPT_BOOL( 0 , "ignore-errors", &ignore_add_errors, N_("just skip files 
which cannot be added because of errors")),
        OPT_BOOL( 0 , "ignore-missing", &ignore_missing, N_("check if - even 
missing - files are ignored in dry run")),
+       OPT_BOOL( 0 , "fix-index", &fix_index, N_("fix contents in the index 
that is inconsistent with the eol and clean/smudge filters")),
        OPT_END(),
 };
 
@@ -297,6 +299,64 @@ static int add_files(struct dir_struct *dir, int flags)
        return exit_status;
 }
 
+/*
+ * Implement "git add --fix-index": re-normalize blobs recorded in the index.
+ *
+ * Entries with a non-zero stage, entries that are not regular files, and
+ * paths for which would_convert_to_git() returns zero are left alone.
+ * Every other blob is passed through convert_to_working_tree() and the
+ * result through convert_to_git().  When the hash of the round-tripped
+ * contents differs from the one recorded in the entry, the entry is
+ * updated, and the index is written out if anything changed.
+ *
+ * "ac" must be zero; "av" and "prefix" are not used.  The only return
+ * value is 0; all failures are reported through die().
+ *
+ * NOTE(review): the call site is not fully visible here - confirm that
+ * "ac" does not still count argv[0] when this is called, and that
+ * "lock_file" has been acquired before write_locked_index() runs.
+ */
+static int fix_index_roundtrip(int ac, const char **av, const char *prefix)
+{
+       int i;
+
+       if (ac)
+               die(_("git add --fix-index does not take any other argument"));
+
+       if (read_cache() < 0)
+               die(_("index file corrupt"));
+
+       for (i = 0; i < active_nr; i++) {
+               struct cache_entry *ce = active_cache[i];
+               struct strbuf buf = STRBUF_INIT;
+               char *contents;
+               unsigned long size;
+               size_t len;
+               enum object_type type;
+               unsigned char sha1[20];
+
+               if (ce_stage(ce) || !S_ISREG(ce->ce_mode))
+                       continue;
+               if (!would_convert_to_git(ce->name))
+                       continue;
+
+               contents = read_sha1_file(ce->sha1, &type, &size);
+               /* Check the pointer first; do not rely on "type" after a failed read. */
+               if (!contents)
+                       die(_("unable to read object in the index at path '%s'"),
+                           ce->name);
+               if (type != OBJ_BLOB)
+                       die(_("object in the index at path '%s' is not a blob"),
+                           ce->name);
+
+               /*
+                * Round-trip conversion; act as if we wrote it out to the
+                * working tree and then re-read it, with clean/smudge and
+                * eol conversions.  Do we get the same result?
+                *
+                * Both converters are handled the same way: a zero return
+                * is taken to mean that no converted output was placed in
+                * "buf", so the input is carried over verbatim.
+                */
+               if (!convert_to_working_tree(ce->name, contents, size, &buf))
+                       strbuf_add(&buf, contents, size);
+               free(contents);
+
+               /* strbuf_detach() reports the length through a size_t. */
+               contents = strbuf_detach(&buf, &len);
+
+               if (!convert_to_git(ce->name, contents, len, &buf, 0))
+                       strbuf_add(&buf, contents, len);
+               free(contents);
+
+               /*
+                * Hash the result - does it match?
+                *
+                * NOTE(review): hash_sha1_file() looks like it only computes
+                * the object name; confirm that the normalized blob also
+                * ends up in the object database before the index is
+                * allowed to refer to it.
+                */
+               hash_sha1_file(buf.buf, buf.len, "blob", sha1);
+               if (hashcmp(sha1, ce->sha1)) {
+                       hashcpy(ce->sha1, sha1);
+                       active_cache_changed = 1;
+               }
+               strbuf_release(&buf);
+       }
+
+       if (active_cache_changed &&
+           write_locked_index(&the_index, &lock_file, COMMIT_LOCK))
+               die(_("Unable to write new index file"));
+       return 0;
+}
+
 int cmd_add(int argc, const char **argv, const char *prefix)
 {
        int exit_status = 0;
@@ -318,6 +378,10 @@ int cmd_add(int argc, const char **argv, const char 
*prefix)
 
        if (edit_interactive)
                return(edit_patch(argc, argv, prefix));
+
+       if (fix_index)
+               exit(fix_index_roundtrip(argc, argv, prefix));
+
        argc--;
        argv++;
 



--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to