I did some experimenting, and I figured out what was going on. You're right, (c) doesn't quite work, because delayed allocation meant that the writeout didn't take place until the fsync() for each file happened. I didn't see this at first; my apologies.
However, this *does* work: extract(a); sync_file_range(fd.a, 0, 0, SYNC_FILE_RANGE_WRITE); extract(b.dpkg-new); sync_file_range(fd.b, 0, 0, SYNC_FILE_RANGE_WRITE); extract(c.dpkg-new); sync_file_range(fd.c, 0, 0, SYNC_FILE_RANGE_WRITE); sync_file_range(fd.a, 0, 0, SYNC_FILE_RANGE_WAIT_BEFORE); sync_file_range(fd.b, 0, 0, SYNC_FILE_RANGE_WAIT_BEFORE); sync_file_range(fd.c, 0, 0, SYNC_FILE_RANGE_WAIT_BEFORE); fdatasync(a); fdatasync(b.dpkg-new); fdatasync(c.dpkg-new); rename(b.dpkg-new, b); rename(c.dpkg-new, c); This assumes that files b and c existed beforehand, but a is a new file. What's going on here? sync_file_range() is a Linux specific system call that has been around for a while. It allows program to control when writeback happens in a very low-level fashion. The first set of sync_file_range() system calls causes the system to start writing back each file once it has finished being extracted. It doesn't actually wait for the write to finish; it just starts the writeback. The second series of sync_file_range() calls, with the operation SYNC_FILE_RANGE_WAIT_BEFORE, will block until the previously initiated writeback has completed. This basically ensures that the delayed allocation has been resolved; that is, the data blocks have been allocated and written, and the inode updated (in memory), but not necessarily pushed out to disk. The fdatasync() call will actually force the inode to disk. In the case of the ext4 file system, the first fdatasync() will actually push all of the inodes to disk, and all of the subsequent fdatasync() calls are in fact no-ops (assuming that files 'a', 'b', and 'c' are all on the same file system). But what it means is that it minimizes the number of (heavyweight) jbd2 commits to a minimum. It uses a linux-specific system call --- sync_file_range --- but the result should be faster performance across the board for all file systems. So I don't consider this an ext4-specific hack, although it probably does makes things faster for ext4 more than any other file system. I've attached the program I used to test and prove this mechanism, as well as the kernel tracepoint script I used to debug why (c) wasn't working, which might be of interest to folks on debian-kernel. Basically it's a demonstration of how cool ftrace is. :-) But using this program on a file system composed of a 5400rpm laptop drive running LVM and LUKS, I get: mass-sync-tester -d: dpkg current: time: 0.83/ 0.01/ 0.00 versus mass-sync-tester -n: dpkg fixed: time: 0.07/ 0.00/ 0.01 - Ted
/* * Mass sync tester */ #define _GNU_SOURCE #include <stdio.h> #include <unistd.h> #include <stdlib.h> #include <sys/types.h> #include <sys/time.h> #include <sys/stat.h> #include <fcntl.h> #include <sys/resource.h> #include <getopt.h> #include <errno.h> #include <string.h> void write_file(const char *name, int sync, int sync_range) { int fd, i, ret; char buf[1024]; fd = open(name, O_WRONLY|O_TRUNC|O_CREAT, 0666); if (fd < 0) { fprintf(stderr, "open(%s) in write_file: %s\n", name, strerror(errno)); exit(1); } memset(buf, 0, sizeof(buf)); for (i=0; i < 16; i++) { ret = write(fd, buf, sizeof(buf)); if (ret < 0) { fprintf(stderr, "writing %s: %s\n", name, strerror(errno)); exit(1); } } if (sync) { ret = fsync(fd); if (ret < 0) { fprintf(stderr, "fsyncing %s in write_file: %s\n", name, strerror(errno)); exit(1); } } if (sync_range) { ret = sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE); if (ret < 0) { fprintf(stderr, "sync_file_range %s in write_file: %s\n", name, strerror(errno)); exit(1); } } ret = close(fd); if (ret < 0) { fprintf(stderr, "closing %s in write_file: %s\n", name, strerror(errno)); exit(1); } } void rename_file(const char *src, const char *dest) { int ret; ret = rename(src, dest); if (ret) { fprintf(stderr, "renaming %s to %s: %s\n", src, dest, strerror(errno)); exit(1); } } void sync_file(const char *name) { int fd, i, ret; fd = open(name, O_RDONLY|O_NOATIME, 0666); if (fd < 0) { fprintf(stderr, "open(%s) in sync_file: %s\n", name, strerror(errno)); exit(1); } ret = fsync(fd); if (ret < 0) { fprintf(stderr, "fsyncing %s in sync_file: %s\n", name, strerror(errno)); exit(1); } ret = close(fd); if (ret < 0) { fprintf(stderr, "closing %s in sync_file: %s\n", name, strerror(errno)); exit(1); } } void datasync_file(const char *name) { int fd, i, ret; fd = open(name, O_RDONLY|O_NOATIME, 0666); if (fd < 0) { fprintf(stderr, "open(%s) in datasync_file: %s\n", name, strerror(errno)); exit(1); } ret = fdatasync(fd); if (ret < 0) { fprintf(stderr, "fsyncing %s in datasync_file: %s\n", name, strerror(errno)); exit(1); } ret = close(fd); if (ret < 0) { fprintf(stderr, "closing %s in datasync_file: %s\n", name, strerror(errno)); exit(1); } } void sync_range(const char *name) { int fd, i, ret; fd = open(name, O_RDONLY|O_NOATIME, 0666); if (fd < 0) { fprintf(stderr, "open(%s) in sync_range: %s\n", name, strerror(errno)); exit(1); } ret = sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WAIT_BEFORE); if (ret < 0) { fprintf(stderr, "sync_file_range %s in sync_range: %s\n", name, strerror(errno)); exit(1); } ret = close(fd); if (ret < 0) { fprintf(stderr, "closing %s in sync_range: %s\n", name, strerror(errno)); exit(1); } } /* * Begin resource tracking code */ struct resource_track { struct timeval time_start; struct timeval user_start; struct timeval system_start; }; void init_resource_track(struct resource_track *track) { struct rusage r; gettimeofday(&track->time_start, 0); getrusage(RUSAGE_SELF, &r); track->user_start = r.ru_utime; track->system_start = r.ru_stime; } static inline float timeval_subtract(struct timeval *tv1, struct timeval *tv2) { return ((tv1->tv_sec - tv2->tv_sec) + ((float) (tv1->tv_usec - tv2->tv_usec)) / 1000000); } void print_resource_track(const char *desc, struct resource_track *track) { struct rusage r; struct timeval time_end; gettimeofday(&time_end, 0); if (desc) printf("%s: ", desc); getrusage(RUSAGE_SELF, &r); printf("time: %5.2f/%5.2f/%5.2f\n", timeval_subtract(&time_end, &track->time_start), timeval_subtract(&r.ru_utime, &track->user_start), timeval_subtract(&r.ru_stime, &track->system_start)); } /* * End resource tracking code */ const char *setup[] = { "exist_1", "exist_2", "exist_3", "exist_4", "exist_5", "exist_6", "exist_7", "exist_8", "exist_9", "exist_10", 0 }; const char *pkg_list[] = { "exist_1", "new_1", "exist_2", "exist_3", "new_2", "new_3", "exist_4", "exist_5", "exist_6", "new_4", "new_5", "new_6", "exist_7", "exist_8", "exist_9", "exist_10", "new_7", "new_8", "new_9", "new_10", 0 }; /* * Do the setup for a test */ void do_setup(const char *dirname) { int ret, i; ret = mkdir(dirname, 0700); if (ret < 0) { fprintf(stderr, "mkdir(%s) in do_setup: %s\n", dirname, strerror(errno)); exit(1); } ret = chdir(dirname); if (ret < 0) { fprintf(stderr, "chdir(%s) in do_setup: %s\n", dirname, strerror(errno)); exit(1); } for (i=0; setup[i]; i++) { write_file(setup[i], 1, 0); sync_file(setup[i]); } } void do_cleanup(const char *dirname) { char buf[1024]; int ret; ret = chdir(".."); if (ret < 0) { fprintf(stderr, "chdir(..) in do_cleanup: %s\n", dirname, strerror(errno)); exit(1); } sprintf(buf, "/bin/rm -rf %s\n", dirname); ret = system(buf); if (ret < 0) { fprintf(stderr, "system clal in do_cleanup: %s\n", strerror(errno)); exit(1); } } void do_dpkg_method() { const char *dirname = "test_pkg_dir"; struct resource_track rtrack; char buf[1024]; int i; do_setup(dirname); init_resource_track(&rtrack); for (i=0; pkg_list[i]; i++) { sprintf(buf, "%s.dpkg-new", pkg_list[i]); write_file(buf, 0, 0); } for (i=0; pkg_list[i]; i++) { sprintf(buf, "%s.dpkg-new", pkg_list[i]); sync_file(buf); rename(buf, pkg_list[i]); } print_resource_track("dpkg current", &rtrack); do_cleanup(dirname); } void do_fixed_method() { const char *dirname = "fixed_pkg_dir"; struct resource_track rtrack; char buf[1024]; int i; do_setup(dirname); init_resource_track(&rtrack); for (i=0; pkg_list[i]; i++) { sprintf(buf, "%s.dpkg-new", pkg_list[i]); write_file(buf, 0, 1); } for (i=0; pkg_list[i]; i++) { sprintf(buf, "%s.dpkg-new", pkg_list[i]); sync_range(buf); } for (i=0; pkg_list[i]; i++) { sprintf(buf, "%s.dpkg-new", pkg_list[i]); datasync_file(buf); } for (i=0; pkg_list[i]; i++) { sprintf(buf, "%s.dpkg-new", pkg_list[i]); rename(buf, pkg_list[i]); } print_resource_track("dpkg fixed", &rtrack); do_cleanup(dirname); } main(int argc, char **argv) { int opt; int d_flag = 0, n_flag = 0; while ((opt = getopt(argc, argv, "dn")) != -1) { switch(opt) { case 'd': d_flag = 1; break; case 'n': n_flag = 1; break; default: fprintf(stderr, "Bad option: %c\n", opt); exit(1); } } if (d_flag) do_dpkg_method(); if (n_flag) do_fixed_method(); }
#!/bin/sh cd /sys/kernel/debug/tracing echo blk > current_tracer echo 1 > /sys/block/dm-5/trace/enable echo 1 > events/ext4/ext4_sync_file/enable echo 1 > events/ext4/ext4_da_writepages/enable echo 1 > events/ext4/ext4_mark_inode_dirty/enable echo 1 > events/jbd2/jbd2_run_stats/enable echo 1 > events/jbd2/jbd2_start_commit/enable echo 1 > events/jbd2/jbd2_end_commit/enable (cd /kbuild; /home/tytso/src/mass-sync-tester -n) cat trace > /tmp/trace echo 0 > events/jbd2/jbd2_start_commit/enable echo 0 > events/jbd2/jbd2_end_commit/enable echo 0 > events/jbd2/jbd2_run_stats/enable echo 0 > events/ext4/ext4_sync_file/enable echo 0 > events/ext4/ext4_da_writepages/enable echo 0 > events/ext4/ext4_mark_inode_dirty/enable echo 0 > /sys/block/dm-5/trace/enable echo nop > current_tracer