Thanks Jonathan for your thoughtful comments,
here is the product of the discussion:
* I split up the patch to fetch in the worktree to be 2 patches,
  each giving motivation on its own.
* the last patch is vastly simplified in code, but takes an extra test
* in [1], you remark "commits" ought not to be a pointer, but I decided against
  that, as we keep the pointed-at value around for the same time span (until
  we're done with that submodule) and we don't need to copy over the pointed-at
  value into the new struct.

[1] 
https://public-inbox.org/git/20181018003954.139498-1-jonathanta...@google.com/

This is still based on ao/submodule-wo-gitmodules-checked-out.
previous version
https://public-inbox.org/git/20181016181327.107186-1-sbel...@google.com/

Stefan Beller (10):
  sha1-array: provide oid_array_filter
  submodule.c: fix indentation
  submodule.c: sort changed_submodule_names before searching it
  submodule.c: tighten scope of changed_submodule_names struct
  submodule: store OIDs in changed_submodule_names
  repository: repo_submodule_init to take a submodule struct
  submodule: migrate get_next_submodule to use repository structs
  submodule.c: fetch in submodules git directory instead of in worktree
  fetch: try fetching submodules if needed objects were not fetched
  builtin/fetch: check for submodule updates in any ref update

 Documentation/technical/api-oid-array.txt    |   5 +
 builtin/fetch.c                              |  11 +-
 builtin/grep.c                               |  17 +-
 builtin/ls-files.c                           |  12 +-
 builtin/submodule--helper.c                  |   2 +-
 repository.c                                 |  27 +-
 repository.h                                 |  12 +-
 sha1-array.c                                 |  17 ++
 sha1-array.h                                 |   3 +
 submodule.c                                  | 265 ++++++++++++++++---
 t/helper/test-submodule-nested-repo-config.c |   8 +-
 t/t5526-fetch-submodules.sh                  |  55 ++++
 12 files changed, 346 insertions(+), 88 deletions(-)

  git range-diff origin/sb/submodule-recursive-fetch-gets-the-tip... 

1:  3fbb06aedd ! 1:  0fdb0e2ad9 submodule.c: move global 
changed_submodule_names into fetch submodule struct
    @@ -1,12 +1,11 @@
     Author: Stefan Beller <sbel...@google.com>
     
    -    submodule.c: move global changed_submodule_names into fetch submodule 
struct
    +    submodule.c: tighten scope of changed_submodule_names struct
     
         The `changed_submodule_names` are only used for fetching, so let's 
make it
         part of the struct that is passed around for fetching submodules.
     
         Signed-off-by: Stefan Beller <sbel...@google.com>
    -    Signed-off-by: Junio C Hamano <gits...@pobox.com>
     
     diff --git a/submodule.c b/submodule.c
     --- a/submodule.c
    @@ -16,7 +15,6 @@
      
      static int config_update_recurse_submodules = RECURSE_SUBMODULES_OFF;
     -static struct string_list changed_submodule_names = STRING_LIST_INIT_DUP;
    -+
      static int initialized_fetch_ref_tips;
      static struct oid_array ref_tips_before_fetch;
      static struct oid_array ref_tips_after_fetch;
    @@ -25,22 +23,8 @@
      }
      
     -static void calculate_changed_submodule_paths(void)
    -+struct submodule_parallel_fetch {
    -+  int count;
    -+  struct argv_array args;
    -+  struct repository *r;
    -+  const char *prefix;
    -+  int command_line_option;
    -+  int default_option;
    -+  int quiet;
    -+  int result;
    -+
    -+  struct string_list changed_submodule_names;
    -+};
    -+#define SPF_INIT {0, ARGV_ARRAY_INIT, NULL, NULL, 0, 0, 0, 0, 
STRING_LIST_INIT_DUP }
    -+
     +static void calculate_changed_submodule_paths(
    -+  struct submodule_parallel_fetch *spf)
    ++  struct string_list *changed_submodule_names)
      {
        struct argv_array argv = ARGV_ARRAY_INIT;
        struct string_list changed_submodules = STRING_LIST_INIT_DUP;
    @@ -49,30 +33,23 @@
      
                if (!submodule_has_commits(path, commits))
     -                  string_list_append(&changed_submodule_names, 
name->string);
    -+                  string_list_append(&spf->changed_submodule_names,
    ++                  string_list_append(changed_submodule_names,
     +                                     name->string);
        }
      
        free_submodules_oids(&changed_submodules);
     @@
    -   return ret;
    - }
    - 
    --struct submodule_parallel_fetch {
    --  int count;
    --  struct argv_array args;
    --  struct repository *r;
    --  const char *prefix;
    --  int command_line_option;
    --  int default_option;
    --  int quiet;
    --  int result;
    --};
    +   int default_option;
    +   int quiet;
    +   int result;
    ++
    ++  struct string_list changed_submodule_names;
    + };
     -#define SPF_INIT {0, ARGV_ARRAY_INIT, NULL, NULL, 0, 0, 0, 0}
    --
    ++#define SPF_INIT {0, ARGV_ARRAY_INIT, NULL, NULL, 0, 0, 0, 0, 
STRING_LIST_INIT_DUP }
    + 
      static int get_fetch_recurse_config(const struct submodule *submodule,
                                    struct submodule_parallel_fetch *spf)
    - {
     @@
                case RECURSE_SUBMODULES_ON_DEMAND:
                        if (!submodule ||
    @@ -88,7 +65,7 @@
      
     -  calculate_changed_submodule_paths();
     -  string_list_sort(&changed_submodule_names);
    -+  calculate_changed_submodule_paths(&spf);
    ++  calculate_changed_submodule_paths(&spf.changed_submodule_names);
     +  string_list_sort(&spf.changed_submodule_names);
        run_processes_parallel(max_parallel_jobs,
                               get_next_submodule,
2:  a64a8322a1 ! 2:  a11e6e39a2 submodule.c: do not copy around submodule list
    @@ -1,6 +1,6 @@
     Author: Stefan Beller <sbel...@google.com>
     
    -    submodule.c: do not copy around submodule list
    +    submodule: store OIDs in changed_submodule_names
     
         'calculate_changed_submodule_paths' uses a local list to compute the
         changed submodules, and then produces the result by copying appropriate
    @@ -14,13 +14,13 @@
         useful in a later patch.
     
         Signed-off-by: Stefan Beller <sbel...@google.com>
    -    Signed-off-by: Junio C Hamano <gits...@pobox.com>
    +    Reviewed-by: Jonathan Tan <jonathanta...@google.com>
     
     diff --git a/submodule.c b/submodule.c
     --- a/submodule.c
     +++ b/submodule.c
     @@
    -   struct submodule_parallel_fetch *spf)
    +   struct string_list *changed_submodule_names)
      {
        struct argv_array argv = ARGV_ARRAY_INIT;
     -  struct string_list changed_submodules = STRING_LIST_INIT_DUP;
    @@ -34,10 +34,10 @@
         * commits have been recorded upstream in "changed_submodule_names".
         */
     -  collect_changed_submodules(&changed_submodules, &argv);
    -+  collect_changed_submodules(&spf->changed_submodule_names, &argv);
    ++  collect_changed_submodules(changed_submodule_names, &argv);
      
     -  for_each_string_list_item(name, &changed_submodules) {
    -+  for_each_string_list_item(name, &spf->changed_submodule_names) {
    ++  for_each_string_list_item(name, changed_submodule_names) {
                struct oid_array *commits = name->util;
                const struct submodule *submodule;
                const char *path = NULL;
    @@ -46,7 +46,7 @@
                        continue;
      
     -          if (!submodule_has_commits(path, commits))
    --                  string_list_append(&spf->changed_submodule_names,
    +-                  string_list_append(changed_submodule_names,
     -                                     name->string);
     +          if (submodule_has_commits(path, commits)) {
     +                  oid_array_clear(commits);
    @@ -55,7 +55,7 @@
        }
      
     -  free_submodules_oids(&changed_submodules);
    -+  string_list_remove_empty_items(&spf->changed_submodule_names, 1);
    ++  string_list_remove_empty_items(changed_submodule_names, 1);
     +
        argv_array_clear(&argv);
        oid_array_clear(&ref_tips_before_fetch);
3:  9341b92c83 ! 3:  3f4e0d4b8d repository: repo_submodule_init to take a 
submodule struct
    @@ -5,17 +5,19 @@
         When constructing a struct repository for a submodule for some revision
         of the superproject where the submodule is not contained in the index,
         it may not be present in the working tree currently either. In that
    -    siutation giving a 'path' argument is not useful. Upgrade the
    +    situation giving a 'path' argument is not useful. Upgrade the
         repo_submodule_init function to take a struct submodule instead.
    +    The submodule struct can be obtained via submodule_from_{path, name} or
    +    an artificial submodule struct can be passed in.
     
    -    While we are at it, overhaul the repo_submodule_init function by 
renaming
    -    the submodule repository struct, which is to be initialized, to a name
    -    that is not confused with the struct submodule as easily.
    +    While we are at it, rename the repository struct in the 
repo_submodule_init
    +    function, which is to be initialized, to a name that is not confused 
with
    +    the struct submodule as easily. Perform such renames in similar 
functions
    +    as well.
     
         Also move its documentation into the header file.
     
         Signed-off-by: Stefan Beller <sbel...@google.com>
    -    Signed-off-by: Junio C Hamano <gits...@pobox.com>
     
     diff --git a/builtin/grep.c b/builtin/grep.c
     --- a/builtin/grep.c
    @@ -104,6 +106,19 @@
      
      static void show_ce(struct repository *repo, struct dir_struct *dir,
     
    +diff --git a/builtin/submodule--helper.c b/builtin/submodule--helper.c
    +--- a/builtin/submodule--helper.c
    ++++ b/builtin/submodule--helper.c
    +@@
    +   if (!sub)
    +           BUG("We could get the submodule handle before?");
    + 
    +-  if (repo_submodule_init(&subrepo, the_repository, path))
    ++  if (repo_submodule_init(&subrepo, the_repository, sub))
    +           die(_("could not get a repository handle for submodule '%s'"), 
path);
    + 
    +   if (!repo_config_get_string(&subrepo, "core.worktree", &cw)) {
    +
     diff --git a/repository.c b/repository.c
     --- a/repository.c
     +++ b/repository.c
    @@ -178,7 +193,8 @@
     +/*
     + * Initialize the repository 'subrepo' as the submodule given by the
     + * struct submodule 'sub' in parent repository 'superproject'.
    -+ * Return 0 upon success and a non-zero value upon failure.
    ++ * Return 0 upon success and a non-zero value upon failure, which may 
happen
    ++ * if the submodule is not found, or 'sub' is NULL.
     + */
     +struct submodule;
     +int repo_submodule_init(struct repository *subrepo,
4:  cea909cbd4 ! 4:  b1566069e7 submodule: fetch in submodules git directory 
instead of in worktree
    @@ -1,44 +1,19 @@
     Author: Stefan Beller <sbel...@google.com>
     
    -    submodule: fetch in submodules git directory instead of in worktree
    +    submodule: migrate get_next_submodule to use repository structs
     
    -    This patch started as a refactoring to make 'get_next_submodule' more
    -    readable, but upon doing so, I realized that "git fetch" of the 
submodule
    -    actually doesn't need to be run in the submodules worktree. So let's 
run
    -    it in its git dir instead.
    +    We used to recurse into submodules, even if they were broken having
    +    only an objects directory. The child process executed in the submodule
    +    would fail though if the submodule was broken.
     
    -    That should pave the way towards fetching submodules that are currently
    -    not checked out.
    -
    -    This patch leaks the cp->dir in get_next_submodule, as any further
    -    callback in run_processes_parallel doesn't have access to the child
    -    process any more. In an early iteration of this patch, the function
    -    get_submodule_repo_for directly returned the string containing the
    -    git directory, which would be a better design choice for this patch.
    -
    -    However the next patch both fixes the memory leak of cp->dir and also 
has
    -    a use case for using the full repository handle of the submodule, so
    -    it makes sense to introduce the get_submodule_repo_for here already.
    +    This patch tightens the check upfront, such that we do not need
    +    to spawn a child process to find out if the submodule is broken.
     
         Signed-off-by: Stefan Beller <sbel...@google.com>
    -    Signed-off-by: Junio C Hamano <gits...@pobox.com>
     
     diff --git a/submodule.c b/submodule.c
     --- a/submodule.c
     +++ b/submodule.c
    -@@
    -                    DEFAULT_GIT_DIR_ENVIRONMENT);
    - }
    - 
    -+static void prepare_submodule_repo_env_in_gitdir(struct argv_array *out)
    -+{
    -+  prepare_submodule_repo_env_no_git_dir(out);
    -+  argv_array_pushf(out, "%s=.", GIT_DIR_ENVIRONMENT);
    -+}
    -+
    - /* Helper function to display the submodule header line prior to the full
    -  * summary output. If it can locate the submodule objects directory it 
will
    -  * attempt to lookup both the left and right commits and put them into the
     @@
        return spf->default_option;
      }
    @@ -99,9 +74,8 @@
     +          if (repo) {
                        child_process_init(cp);
     -                  cp->dir = strbuf_detach(&submodule_path, NULL);
    --                  prepare_submodule_repo_env(&cp->env_array);
    -+                  prepare_submodule_repo_env_in_gitdir(&cp->env_array);
    -+                  cp->dir = xstrdup(repo->gitdir);
    +                   prepare_submodule_repo_env(&cp->env_array);
    ++                  cp->dir = xstrdup(repo->worktree);
                        cp->git_cmd = 1;
                        if (!spf->quiet)
                                strbuf_addf(err, "Fetching submodule %s%s\n",
    @@ -113,27 +87,17 @@
     +                  repo_clear(repo);
     +                  free(repo);
                        ret = 1;
    ++          } else {
    ++                  /*
    ++                   * An empty directory is normal,
    ++                   * the submodule is not initialized
    ++                   */
    ++                  if (S_ISGITLINK(ce->ce_mode) &&
    ++                      !is_empty_dir(ce->name))
    ++                          die(_("Could not access submodule '%s'"), 
ce->name);
                }
     -          strbuf_release(&submodule_path);
     -          strbuf_release(&submodule_git_dir);
                strbuf_release(&submodule_prefix);
                if (ret) {
                        spf->count++;
    -
    -diff --git a/t/t5526-fetch-submodules.sh b/t/t5526-fetch-submodules.sh
    ---- a/t/t5526-fetch-submodules.sh
    -+++ b/t/t5526-fetch-submodules.sh
    -@@
    - 
    -   test_must_fail git -C dst status &&
    -   test_must_fail git -C dst diff &&
    --  test_must_fail git -C dst fetch --recurse-submodules
    -+
    -+  # git-fetch cannot find the git directory of the submodule,
    -+  # so it will do nothing, successfully, as it cannot distinguish between
    -+  # this broken submodule and a submodule that was just set active but
    -+  # not cloned yet
    -+  git -C dst fetch --recurse-submodules
    - '
    - 
    - test_expect_success "fetch new commits when submodule got renamed" '
-:  ---------- > 5:  2d98ff1201 submodule.c: fetch in submodules git directory 
instead of in worktree
5:  9ad0125310 ! 6:  092b9cbcba fetch: retry fetching submodules if needed 
objects were not fetched
    @@ -1,25 +1,23 @@
     Author: Stefan Beller <sbel...@google.com>
     
    -    fetch: retry fetching submodules if needed objects were not fetched
    +    fetch: try fetching submodules if needed objects were not fetched
     
         Currently when git-fetch is asked to recurse into submodules, it 
dispatches
         a plain "git-fetch -C <submodule-dir>" (with some submodule related 
options
         such as prefix and recusing strategy, but) without any information of 
the
         remote or the tip that should be fetched.
     
    -    This works surprisingly well in some workflows (such as using 
submodules
    -    as a third party library), while not so well in other scenarios, such
    -    as in a Gerrit topic-based workflow, that can tie together changes
    -    (potentially across repositories) on the server side. One of the parts
    -    of such a Gerrit workflow is to download a change when wanting to 
examine
    -    it, and you'd want to have its submodule changes that are in the same
    -    topic downloaded as well. However these submodule changes reside in 
their
    -    own repository in their own ref (refs/changes/<int>).
    +    But this default fetch is not sufficient, as a newly fetched commit in
    +    the superproject could point to a commit in the submodule that is not
    +    in the default refspec. This is common in workflows like Gerrit's.
    +    When fetching a Gerrit change under review (from refs/changes/??), the
    +    commits in that change likely point to submodule commits that have not
    +    been merged to a branch yet.
     
    -    Retry fetching a submodule by object name if the object id that the
    +    Try fetching a submodule by object id if the object id that the
         superproject points to, cannot be found.
     
    -    This retrying does not happen when the "git fetch" done at the
    +    The try does not happen when the "git fetch" done at the
         superproject is not storing the fetched results in remote
         tracking branches (i.e. instead just recording them to
         FETCH_HEAD) in this step. A later patch will fix this.
    @@ -32,7 +30,6 @@
         in case the submodule recursion is set to "on".
     
         Signed-off-by: Stefan Beller <sbel...@google.com>
    -    Signed-off-by: Junio C Hamano <gits...@pobox.com>
     
     diff --git a/builtin/fetch.c b/builtin/fetch.c
     --- a/builtin/fetch.c
    @@ -75,16 +72,16 @@
        int result;
      
        struct string_list changed_submodule_names;
    -+  struct get_next_submodule_task **retry;
    -+  int retry_nr, retry_alloc;
    ++  struct get_next_submodule_task **fetch_specific_oids;
    ++  int fetch_specific_oids_nr, fetch_specific_oids_alloc;
      };
     -#define SPF_INIT {0, ARGV_ARRAY_INIT, NULL, NULL, 0, 0, 0, 0, 
STRING_LIST_INIT_DUP }
     +#define SPF_INIT {0, ARGV_ARRAY_INIT, NULL, NULL, 0, 0, 0, 0, \
     +            STRING_LIST_INIT_DUP, \
     +            NULL, 0, 0}
      
    - static void calculate_changed_submodule_paths(
    -   struct submodule_parallel_fetch *spf)
    + static int get_fetch_recurse_config(const struct submodule *submodule,
    +                               struct submodule_parallel_fetch *spf)
     @@
        return spf->default_option;
      }
    @@ -93,6 +90,8 @@
     +  struct repository *repo;
     +  const struct submodule *sub;
     +  unsigned free_sub : 1; /* Do we need to free the submodule? */
    ++
    ++  /* fetch specific oids if set, otherwise fetch default refspec */
     +  struct oid_array *commits;
     +};
     +
    @@ -224,24 +223,29 @@
     -                  repo_clear(repo);
     -                  free(repo);
     -                  ret = 1;
    --          }
    --          strbuf_release(&submodule_prefix);
    --          if (ret) {
    -                   spf->count++;
    ++                  spf->count++;
     +                  *task_cb = task;
     +
     +                  strbuf_release(&submodule_prefix);
    -                   return 1;
    -+          } else {
    ++                  return 1;
    +           } else {
    ++
     +                  get_next_submodule_task_release(task);
     +                  free(task);
    ++
    +                   /*
    +                    * An empty directory is normal,
    +                    * the submodule is not initialized
    +@@
    +                       !is_empty_dir(ce->name))
    +                           die(_("Could not access submodule '%s'"), 
ce->name);
                }
    -   }
    ++  }
     +
    -+  if (spf->retry_nr) {
    -+          struct get_next_submodule_task *task = spf->retry[spf->retry_nr 
- 1];
    ++  if (spf->fetch_specific_oids_nr) {
    ++          struct get_next_submodule_task *task = 
spf->fetch_specific_oids[spf->fetch_specific_oids_nr - 1];
     +          struct strbuf submodule_prefix = STRBUF_INIT;
    -+          spf->retry_nr--;
    ++          spf->fetch_specific_oids_nr--;
     +
     +          strbuf_addf(&submodule_prefix, "%s%s/", spf->prefix, 
task->sub->path);
     +
    @@ -262,9 +266,13 @@
     +                                    append_oid_to_argv, &cp->args);
     +
     +          *task_cb = task;
    -+          strbuf_release(&submodule_prefix);
    +           strbuf_release(&submodule_prefix);
    +-          if (ret) {
    +-                  spf->count++;
    +-                  return 1;
    +-          }
     +          return 1;
    -+  }
    +   }
     +
        return 0;
      }
    @@ -326,9 +334,11 @@
     +                  return 0;
     +
     +          task->commits = commits;
    -+          ALLOC_GROW(spf->retry, spf->retry_nr + 1, spf->retry_alloc);
    -+          spf->retry[spf->retry_nr] = task;
    -+          spf->retry_nr++;
    ++          ALLOC_GROW(spf->fetch_specific_oids,
    ++                     spf->fetch_specific_oids_nr + 1,
    ++                     spf->fetch_specific_oids_alloc);
    ++          spf->fetch_specific_oids[spf->fetch_specific_oids_nr] = task;
    ++          spf->fetch_specific_oids_nr++;
     +          return 0;
     +  }
     +
    @@ -346,18 +356,33 @@
        test_cmp expect actual
      '
      
    -+test_expect_success "fetch new commits on-demand when they are not 
reachable" '
    ++test_expect_success "fetch new submodule commits on-demand outside 
standard refspec" '
    ++  # add a second submodule and ensure it is around in downstream first
    ++  git clone submodule sub1 &&
    ++  git submodule add ./sub1 &&
    ++  git commit -m "adding a second submodule" &&
    ++  git -C downstream pull &&
    ++  git -C downstream submodule update --init --recursive &&
    ++
     +  git checkout --detach &&
    ++
     +  C=$(git -C submodule commit-tree -m "new change outside refs/heads" 
HEAD^{tree}) &&
     +  git -C submodule update-ref refs/changes/1 $C &&
     +  git update-index --cacheinfo 160000 $C submodule &&
    -+  git commit -m "updated submodule outside of refs/heads" &&
    -+  D=$(git rev-parse HEAD) &&
    -+  git update-ref refs/changes/2 $D &&
    ++  test_tick &&
    ++
    ++  D=$(git -C sub1 commit-tree -m "new change outside refs/heads" 
HEAD^{tree}) &&
    ++  git -C sub1 update-ref refs/changes/2 $D &&
    ++  git update-index --cacheinfo 160000 $D sub1 &&
    ++
    ++  git commit -m "updated submodules outside of refs/heads" &&
    ++  E=$(git rev-parse HEAD) &&
    ++  git update-ref refs/changes/2 $E &&
     +  (
     +          cd downstream &&
    -+          git fetch --recurse-submodules --recurse-submodules-default 
on-demand origin refs/changes/2:refs/heads/my_branch &&
    ++          git fetch --recurse-submodules origin 
refs/changes/2:refs/heads/my_branch &&
     +          git -C submodule cat-file -t $C &&
    ++          git -C sub1 cat-file -t $D &&
     +          git checkout --recurse-submodules FETCH_HEAD
     +  )
     +'
6:  b8db3b45bf < -:  ---------- builtin/fetch: check for submodule updates for 
non branch fetches
7:  905a4f0d4f < -:  ---------- fixup! repository: repo_submodule_init to take 
a submodule struct
-:  ---------- > 7:  11bf819782 builtin/fetch: check for submodule updates in 
any ref update

Reply via email to