Hello, According to [1], using directory lists ordered by inode can speed up tar archivation by up to 40% on ext[34] filesystems (confirmed by my own measurements). Please take a look at the attached patch. Is it OK to apply?
Regards, Sergey [1] https://savannah.gnu.org/patch/?7892
>From 92ddbfb74f05e5cded933f760cddd0ce007001ae Mon Sep 17 00:00:00 2001 From: Sergey Poznyakoff <g...@gnu.org.ua> Date: Tue, 11 Feb 2014 10:52:44 +0200 Subject: [PATCH] savedir: optionally produce ordered directory list New function streamsavedir_ordered returns directory entries ordered by names or inode numbers. This can be used by archivers to store archive members in reproducible order (when sorted by name) or to speed-up archivation on some filesystems (when sorted by inode). Patch based on an idea by Dick Streefland. * lib/savedir.h (SAVEDIR_SORT_NONE, SAVEDIR_SORT_NAME) (SAVEDIR_SORT_INODE): New constants. (streamsavedir_ordered) (set_savedir_sort_order): New prototypes. * lib/savedir.c (streamsavedir_ordered): New function. (set_savedir_sort_order): New function. (streamsavedir): Rewrite as an entry point to streamsavedir_ordered. --- lib/savedir.c | 133 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- lib/savedir.h | 10 +++++ 2 files changed, 140 insertions(+), 3 deletions(-) diff --git a/lib/savedir.c b/lib/savedir.c index 6d5ed7f..5fc2ff2 100644 --- a/lib/savedir.c +++ b/lib/savedir.c @@ -41,25 +41,92 @@ # define NAME_SIZE_DEFAULT 512 #endif +typedef struct +{ + union + { + char *ptr; + size_t off; + } name; +#if D_INO_IN_DIRENT + ino_t ino; +#endif +} direntry_t; + +/* Compare the names of two directory entries */ + +static int +direntry_cmp_name (const void *a, const void *b) +{ + const direntry_t *dea = (const direntry_t *) a; + const direntry_t *deb = (const direntry_t *) b; + + return strcmp(dea->name.ptr, deb->name.ptr); +} + +#if D_INO_IN_DIRENT +/* Compare the inode numbers of two directory entries */ + +static int +direntry_cmp_inode (const void *a, const void *b) +{ + const direntry_t *dea = (const direntry_t *) a; + const direntry_t *deb = (const direntry_t *) b; + + return (int) (dea->ino - deb->ino); +} +#endif + /* Return a freshly allocated string containing the file names in directory DIRP, separated by '\0' characters; the end is marked by two '\0' characters in a row. - Return NULL (setting errno) if DIRP cannot be read. - If DIRP is NULL, return NULL without affecting errno. */ + Returned values are sorted according to ORDER. + + Return NULL (setting errno) if DIRP cannot be read or ORDER + is invalid. + + If DIRP is NULL, return NULL without affecting errno. + */ char * -streamsavedir (DIR *dirp) +streamsavedir_ordered (DIR *dirp, int order) { char *name_space; + char *sorted; size_t allocated = NAME_SIZE_DEFAULT; + direntry_t *entries = NULL; + size_t entries_allocated = 0; + size_t entries_used = 0; + size_t i; size_t used = 0; int save_errno; + int (*cmp) (const void *a, const void *b) = NULL; if (dirp == NULL) return NULL; name_space = xmalloc (allocated); + switch (order) + { + case SAVEDIR_SORT_NONE: + break; + + case SAVEDIR_SORT_NAME: + cmp = direntry_cmp_name; + break; + +#if D_INO_IN_DIRENT + case SAVEDIR_SORT_INODE: + cmp = direntry_cmp_inode; + break; +#endif + + default: + errno = EINVAL; + return NULL; + } + for (;;) { struct dirent const *dp; @@ -91,9 +158,41 @@ streamsavedir (DIR *dirp) name_space = xrealloc (name_space, allocated); } memcpy (name_space + used, entry, entry_size); + if (cmp) + { + if (entries_allocated == entries_used) + { + if (entries_allocated == 0) + entries_allocated = NAME_SIZE_DEFAULT / sizeof (direntry_t); + entries = x2nrealloc (entries, &entries_allocated, + sizeof (entries[0])); + } + entries[entries_used].name.off = used; +#if D_INO_IN_DIRENT + entries[entries_used].ino = dp->d_ino; +#endif + entries_used++; + } used += entry_size; } } + + if (cmp) + { + for (i = 0; i < entries_used; i++) + entries[i].name.ptr = name_space + entries[i].name.off; + qsort (entries, entries_used, sizeof (direntry_t), cmp); + sorted = xmalloc (used + 1); + used = 0; + for (i = 0; i < entries_used; i++) + { + strcpy (sorted + used, entries[i].name.ptr); + used += strlen (sorted + used) + 1; + } + free (entries); + free (name_space); + name_space = sorted; + } name_space[used] = '\0'; save_errno = errno; if (save_errno != 0) @@ -105,6 +204,34 @@ streamsavedir (DIR *dirp) return name_space; } +static int default_sort_order; + +int +set_savedir_sort_order (int order) +{ + switch (order) + { + case SAVEDIR_SORT_NONE: + case SAVEDIR_SORT_NAME: +#if D_INO_IN_DIRENT + case SAVEDIR_SORT_INODE: +#endif + default_sort_order = order; + break; + + default: + errno = EINVAL; + return 1; + } + return 0; +} + +char * +streamsavedir (DIR *dirp) +{ + return streamsavedir_ordered (dirp, default_sort_order); +} + /* Like streamsavedir (DIRP), except also close DIRP. */ static char * diff --git a/lib/savedir.h b/lib/savedir.h index eedb0c4..fb59211 100644 --- a/lib/savedir.h +++ b/lib/savedir.h @@ -22,6 +22,16 @@ #define _GL_SAVEDIR_H #include <dirent.h> + +enum + { + SAVEDIR_SORT_NONE, + SAVEDIR_SORT_NAME, + SAVEDIR_SORT_INODE + }; +int set_savedir_sort_order (int order); + +char *streamsavedir_ordered (DIR *dirp, int sortorder); char *streamsavedir (DIR *dirp); char *savedir (char const *dir); char *fdsavedir (int fd); /* deprecated */ -- 1.7.12.1