On Sun, Apr 05, 2015 at 02:52:59PM -0400, Jeff King wrote:
> Right now we parse all of the packed-refs file into an in-memory cache,
> and then do single lookups from that cache. Doing an mmap() and a binary
> search is way faster (and costs less memory) for doing individual
> lookups. It relies on the list being sorted. This is generally true, but
> not something we currently rely on (however, it would be easy to add a
> "sorted" flag to top of the file and have the readers fall back when the
> flag is missing). I've played with a patch to do this (it's not entirely
> trivial, because you jump into the middle of a line, and then have to
> walk backwards to find the start of the record).
>
> For traversals, it's more complicated. Obviously if you are traversing
> all refs, you have to read the whole thing anyway. If you are traversing
> a subset of the refs, you can binary-search the start of the subset, and
> then walk forward. But that's where it gets tricky with the current
> code.
In case you are curious, here is my proof-of-concept for the packed-refs
binary search. You'll note that it's a separate program, and not
integrated into refs.c. I wrote this last August, and after trying to
integrate it into refs.c, I found the ref_cache problems I described,
and I haven't touched it since.
I also seem to have saved the patch for stuffing it into refs.c, but I
am not sure if it even compiles (I wrote only "horrible wip" in the
commit message ;) ).
-- >8 --
Subject: [PATCH] add git-quick-list
This is a proof of concept for binary-searching the
packed-refs file in order to traverse an ordered subset of
it. Note that it _only_ reads the packed-refs file
currently. To really compare to for-each-ref, it would need
to also walk the loose ref area for its prefix. On a
mostly-packed repository that shouldn't make a big speed
difference, though.
And of course we don't _really_ want a separate command here
at all. This should be part of refs.c, and everyone who
calls for_each_ref should benefit from it.
Still, the numbers are promising. Here's are comparisons
against for-each-ref on torvalds/linux, which has a 218M
packed-refs file:
$ time git for-each-ref \
--format='%(objectname) %(refname)' \
refs/remotes/2325298/ |
wc -c
44139
real 0m1.649s
user 0m1.332s
sys 0m0.304s
$ time ~peff/git-quick-list refs/remotes/2325298/ | wc -c
44139
real 0m0.012s
user 0m0.004s
sys 0m0.004s
---
Makefile | 1 +
quick-list.c | 174 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 175 insertions(+)
create mode 100644 quick-list.c
diff --git a/Makefile b/Makefile
index 2457065..aa32598 100644
--- a/Makefile
+++ b/Makefile
@@ -541,6 +541,7 @@ PROGRAM_OBJS += shell.o
PROGRAM_OBJS += show-index.o
PROGRAM_OBJS += upload-pack.o
PROGRAM_OBJS += remote-testsvn.o
+PROGRAM_OBJS += quick-list.o
# Binary suffix, set to .exe for Windows builds
X =
diff --git a/quick-list.c b/quick-list.c
new file mode 100644
index 0000000..e423f1f
--- /dev/null
+++ b/quick-list.c
@@ -0,0 +1,174 @@
+#include "cache.h"
+#include "refs.h"
+
+struct packed_refs_iterator {
+ const char *start;
+ const char *end;
+
+ const char *cur;
+ const char *ref;
+ const char *eol;
+ const char *next;
+};
+
+static void iterator_init(struct packed_refs_iterator *pos,
+ const char *buf, size_t len)
+{
+ pos->start = buf;
+ pos->end = buf + len;
+
+ /* skip past header line */
+ if (pos->start < pos->end && *pos->start == '#') {
+ while (pos->start < pos->end && *pos->start != '\n')
+ pos->start++;
+ if (pos->start < pos->end)
+ pos->start++;
+ }
+}
+
+static int iterator_cmp(const char *key, struct packed_refs_iterator *pos)
+{
+ const char *ref = pos->ref;
+ for (; *key && ref < pos->eol; key++, ref++)
+ if (*key != *ref)
+ return (unsigned char)*key - (unsigned char)*ref;
+ return ref == pos->eol ? *key ? 1 : 0 : -1;
+}
+
+static const char *find_eol(const char *p, const char *end)
+{
+ p = memchr(p, '\n', end - p);
+ return p ? p : end;
+}
+
+static void parse_line(struct packed_refs_iterator *pos, const char *p)
+{
+ pos->cur = p;
+ if (pos->end - p < 41)
+ die("truncated packed-refs file");
+ p += 41;
+
+ pos->ref = p;
+ pos->eol = p = find_eol(p, pos->end);
+
+ /* skip newline, and then past any peel records */
+ if (p < pos->end)
+ p++;
+ while (p < pos->end && *p == '^') {
+ p = find_eol(p, pos->end);
+ if (p < pos->end)
+ p++;
+ }
+ pos->next = p;
+}
+
+static void iterator_next(struct packed_refs_iterator *pos)
+{
+ if (pos->next < pos->end)
+ parse_line(pos, pos->next);
+ else
+ pos->cur = NULL;
+}
+
+static void iterator_start(struct packed_refs_iterator *pos, const char
*prefix)
+{
+ const char *lo = pos->start, *hi = pos->end;
+
+ while (lo < hi) {
+ const char *mi = lo + ((hi - lo) / 2);
+ int cmp;
+
+ /*
+ * We landed somewhere on a line. Walk back to find
+ * the start of the line.
+ */
+ while (mi > lo && *(mi-1) != '\n')
+ mi--;
+
+ /*
+ * We may have hit a peel-line. In that case, try
+ * to walk back to the actual ref line (and skip as
+ * many peel lines as we find, for future-proofing).
+ */
+ while (*mi == '^') {
+ if (mi == lo)
+ die("peel line without a record before it?");
+ mi--;
+ if (mi == lo)
+ die("peel line with bare newline before it?");
+ mi--;
+ while (mi > lo && *(mi-1) != '\n')
+ mi--;
+ }
+
+ /* Now we should be at a real ref line. */
+ parse_line(pos, mi);
+ cmp = iterator_cmp(prefix, pos);
+ if (!cmp)
+ return;
+ else if (cmp < 0)
+ hi = pos->cur;
+ else
+ lo = pos->next;
+ }
+
+ if (hi < pos->end)
+ parse_line(pos, hi);
+ else
+ pos->cur = NULL;
+}
+
+static void quick_list(const char *prefix, each_ref_fn fn, void *data)
+{
+ int fd = open(git_path("packed-refs"), O_RDONLY);
+ struct stat st;
+ const char *buf = NULL;
+ size_t len;
+ struct packed_refs_iterator pos;
+
+ if (fd < 0)
+ goto out;
+ if (fstat(fd, &st) < 0)
+ goto out;
+ len = xsize_t(st.st_size);
+ buf = xmmap(NULL, len, PROT_READ, MAP_PRIVATE, fd, 0);
+ if (!buf)
+ goto out;
+
+ iterator_init(&pos, buf, len);
+ for (iterator_start(&pos, prefix);
+ pos.cur && starts_with(pos.ref, prefix);
+ iterator_next(&pos)) {
+ unsigned char sha1[20];
+ char *refname;
+
+ if (get_sha1_hex(pos.cur, sha1) < 0)
+ die("packed-refs contained invalid sha1");
+ refname = xmemdupz(pos.ref, pos.eol - pos.ref);
+ fn(refname, sha1, 0, data);
+ free(refname);
+ }
+
+out:
+ close(fd);
+ if (buf)
+ munmap((void *)buf, len);
+}
+
+static int show_ref(const char *refname, const unsigned char *sha1,
+ int flags, void *data)
+{
+ printf("%s %s\n", sha1_to_hex(sha1), refname);
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ if (argc != 2)
+ usage("git quick-list <prefix>");
+
+ setup_git_directory();
+ quick_list(argv[1], show_ref, NULL);
+
+ return 0;
+}
--
2.4.0.rc0.363.gf9f328b
--
To unsubscribe from this list: send the line "unsubscribe git" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html