On 12/6/10 6:13 PM, Tom Lane wrote: > Josh Berkus <j...@agliodbs.com> writes: >> OK, patch coming then. Right now test_fsync aborts when O_DIRECT fails. >> What should I have it do instead? > > Report that it fails, and keep testing the other methods.
Patch attached. It includes a fair amount of comment cleanup, since the existing comments did not meet our current project standards. It tests all 6 of the methods we support separately. Some questions, though: (1) Why are we doing the open_sync different-size write test? AFAIK, this doesn't match any behavior which PostgreSQL has. (2) In this patch, I'm stepping down the number of loops which fsync_writethrough does by 90%. The reason for that is that on the platforms where I tested writethrough (desktop machines), doing 10,000 loops took 15-20 *minutes*, which seems hard on the user. This would be easy to revert if you think it's a bad idea. Possibly auto-sizing the number of loops based on the first fsync test might be a good idea, but that seems like going a bit too far. (3) Should the multi-descriptor test be using writethrough on platforms which support it? -- Josh Berkus PostgreSQL Experts Inc. http://www.pgexperts.com
diff --git a/src/tools/fsync/Makefile b/src/tools/fsync/Makefile index 252c087..2ddbbe9 100644 *** a/src/tools/fsync/Makefile --- b/src/tools/fsync/Makefile *************** *** 4,10 **** # # Copyright (c) 2003-2010, PostgreSQL Global Development Group # ! # src/tools/fsync/Makefile # #------------------------------------------------------------------------- --- 4,10 ---- # # Copyright (c) 2003-2010, PostgreSQL Global Development Group # ! # $PostgreSQL: pgsql/src/tools/fsync/Makefile,v 1.9 2010/07/05 18:54:38 tgl Exp $ # #------------------------------------------------------------------------- *************** override CPPFLAGS := -I$(libpq_srcdir) $ *** 16,24 **** OBJS= test_fsync.o ! all: test_fsync ! test_fsync: test_fsync.o | submake-libpq submake-libpgport $(CC) $(CFLAGS) test_fsync.o $(libpq_pgport) $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) clean distclean maintainer-clean: --- 16,24 ---- OBJS= test_fsync.o ! all: submake-libpq submake-libpgport test_fsync ! test_fsync: test_fsync.o $(libpq_builddir)/libpq.a $(CC) $(CFLAGS) test_fsync.o $(libpq_pgport) $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X) clean distclean maintainer-clean: diff --git a/src/tools/fsync/README b/src/tools/fsync/README index 6d9acd3..5b45581 100644 *** a/src/tools/fsync/README --- b/src/tools/fsync/README *************** *** 1,4 **** ! src/tools/fsync/README fsync ===== --- 1,4 ---- ! $PostgreSQL: pgsql/src/tools/fsync/README,v 1.5 2009/11/28 15:04:54 momjian Exp $ fsync ===== *************** fsync *** 6,11 **** This program tests fsync. The tests are described as part of the program output. Usage: test_fsync [-f filename] [loops] ! Loops defaults to 5000. The default output file is /var/tmp/test_fsync.out. ! Consider that /tmp or /var/tmp might be memory-based file systems. --- 6,25 ---- This program tests fsync. The tests are described as part of the program output. 
Usage: test_fsync [-f filename] [loops] + + test_fsync is intended to give you a reasonable idea of what the fastest + fsync_method is on your specific system, as well as supplying diagnostic + information in the event of an identified I/O problem. However, differences + shown by test_fsync may not make any difference in real database throughput, + especially since many database servers are not speed-limited by their + transaction logs. ! Filename defaults to test_fsync.out in the current directory. test_fsync ! should be run on the same filesystem where your transaction log currently ! resides. ! ! Loops default to 10000, except for writethrough tests, where there are 1/10 of ! that in order to make the user not wait forever. You should lower loops if you ! have a slow system and the tests are taking more than 5 minutes each. You should ! raise loops if your system is faster than 5000/second, in order to get useful ! statistics. diff --git a/src/tools/fsync/test_fsync.c b/src/tools/fsync/test_fsync.c index 28c2119..5980b70 100644 *** a/src/tools/fsync/test_fsync.c --- b/src/tools/fsync/test_fsync.c *************** *** 3,9 **** * * * test_fsync.c ! * test various fsync() methods */ #include "postgres.h" --- 3,9 ---- * * * test_fsync.c ! * tests all supported fsync() methods */ #include "postgres.h" *************** *** 22,55 **** #include <unistd.h> #include <string.h> ! ! #ifdef WIN32 #define FSYNC_FILENAME "./test_fsync.out" - #else - /* /tmp might be a memory file system */ - #define FSYNC_FILENAME "/var/tmp/test_fsync.out" - #endif #define WRITE_SIZE (8 * 1024) /* 8k */ #define LABEL_FORMAT "\t%-30s" int loops = 10000; void die(char *str); void print_elapse(struct timeval start_t, struct timeval stop_t); int main(int argc, char *argv[]) { struct timeval start_t; struct timeval stop_t; ! int tmpfile, ! 
i; char *full_buf = (char *) malloc(XLOG_SEG_SIZE), *buf; char *filename = FSYNC_FILENAME; if (argc > 2 && strcmp(argv[1], "-f") == 0) { filename = argv[2]; --- 22,58 ---- #include <unistd.h> #include <string.h> ! /* ! * put the temp files in the local directory ! * unless the user specifies otherwise ! */ #define FSYNC_FILENAME "./test_fsync.out" #define WRITE_SIZE (8 * 1024) /* 8k */ #define LABEL_FORMAT "\t%-30s" int loops = 10000; + int writethrough_loops = 1000; void die(char *str); void print_elapse(struct timeval start_t, struct timeval stop_t); + void print_elapse_writethrough(struct timeval start_t, struct timeval stop_t); int main(int argc, char *argv[]) { struct timeval start_t; struct timeval stop_t; ! int tmpfile; ! int i; char *full_buf = (char *) malloc(XLOG_SEG_SIZE), *buf; char *filename = FSYNC_FILENAME; + /* + * arguments: loops and filename (optional) + */ if (argc > 2 && strcmp(argv[1], "-f") == 0) { filename = argv[2]; *************** main(int argc, char *argv[]) *** 57,73 **** argc -= 2; } ! if (argc > 1) loops = atoi(argv[1]); for (i = 0; i < XLOG_SEG_SIZE; i++) full_buf[i] = random(); if ((tmpfile = open(filename, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR)) == -1) die("Cannot open output file."); if (write(tmpfile, full_buf, XLOG_SEG_SIZE) != XLOG_SEG_SIZE) die("write failed"); ! /* fsync now so later fsync's don't have to do it */ if (fsync(tmpfile) != 0) die("fsync failed"); close(tmpfile); --- 60,88 ---- argc -= 2; } ! /* ! * set writethrough_loops to be 1/10 of loops ! * since writethroughs are very slow ! */ ! if (argc > 1) ! { loops = atoi(argv[1]); + writethrough_loops = loops / 10; + } for (i = 0; i < XLOG_SEG_SIZE; i++) full_buf[i] = random(); + /* + * test if we can open the target file + */ if ((tmpfile = open(filename, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR)) == -1) die("Cannot open output file."); if (write(tmpfile, full_buf, XLOG_SEG_SIZE) != XLOG_SEG_SIZE) die("write failed"); ! /* ! 
* fsync now so that dirty buffers don't skew later tests ! */ if (fsync(tmpfile) != 0) die("fsync failed"); close(tmpfile); *************** main(int argc, char *argv[]) *** 77,83 **** printf("Loops = %d\n\n", loops); /* ! * Simple write */ printf("Simple write:\n"); printf(LABEL_FORMAT, "8k write"); --- 92,98 ---- printf("Loops = %d\n\n", loops); /* ! * Test a simple write without fsync */ printf("Simple write:\n"); printf(LABEL_FORMAT, "8k write"); *************** main(int argc, char *argv[]) *** 95,104 **** print_elapse(start_t, stop_t); /* ! * Compare file sync methods with one 8k write */ printf("\nCompare file sync methods using one write:\n"); #ifdef OPEN_DATASYNC_FLAG printf(LABEL_FORMAT, "open_datasync 8k write"); fflush(stdout); --- 110,122 ---- print_elapse(start_t, stop_t); /* ! * Test all fsync methods using single 8k writes */ printf("\nCompare file sync methods using one write:\n"); + /* + * Test open_datasync if available + */ #ifdef OPEN_DATASYNC_FLAG printf(LABEL_FORMAT, "open_datasync 8k write"); fflush(stdout); *************** main(int argc, char *argv[]) *** 115,124 **** --- 133,174 ---- gettimeofday(&stop_t, NULL); close(tmpfile); print_elapse(start_t, stop_t); + + /* + * If O_DIRECT is enabled, test that with open_datasync + */ + if ( PG_O_DIRECT != 0 ) + { + printf(LABEL_FORMAT, "open_datasync 8k directIO write"); + fflush(stdout); + if ((tmpfile = open(filename, O_RDWR | O_DSYNC | PG_O_DIRECT, 0)) == -1) + printf("\t(unavailable: o_direct on this filesystem)\n"); + else + { + gettimeofday(&start_t, NULL); + for (i = 0; i < loops; i++) + { + if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE) + die("write failed"); + if (lseek(tmpfile, 0, SEEK_SET) == -1) + die("seek failed"); + } + gettimeofday(&stop_t, NULL); + close(tmpfile); + print_elapse(start_t, stop_t); + } + } + else + { + printf("\t(unavailable: o_direct)\n"); + } #else printf("\t(unavailable: open_datasync)\n"); #endif + /* + * Test open_sync if available + */ #ifdef OPEN_SYNC_FLAG 
printf(LABEL_FORMAT, "open_sync 8k write"); fflush(stdout); *************** main(int argc, char *argv[]) *** 135,144 **** --- 185,226 ---- gettimeofday(&stop_t, NULL); close(tmpfile); print_elapse(start_t, stop_t); + + /* + * If O_DIRECT is enabled, test that with open_sync + */ + if ( PG_O_DIRECT != 0 ) + { + printf(LABEL_FORMAT, "open_sync 8k directIO write"); + fflush(stdout); + if ((tmpfile = open(filename, O_RDWR | O_SYNC | PG_O_DIRECT, 0)) == -1) + printf("\t(unavailable: o_direct on this filesystem)\n"); + else + { + gettimeofday(&start_t, NULL); + for (i = 0; i < loops; i++) + { + if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE) + die("write failed"); + if (lseek(tmpfile, 0, SEEK_SET) == -1) + die("seek failed"); + } + gettimeofday(&stop_t, NULL); + close(tmpfile); + print_elapse(start_t, stop_t); + } + } + else + { + printf("\t(unavailable: o_direct)\n"); + } #else printf("\t(unavailable: open_sync)\n"); #endif + /* + * Test fdatasync if available + */ #ifdef HAVE_FDATASYNC printf(LABEL_FORMAT, "8k write, fdatasync"); fflush(stdout); *************** main(int argc, char *argv[]) *** 160,165 **** --- 242,250 ---- printf("\t(unavailable: fdatasync)\n"); #endif + /* + * Test fsync + */ printf(LABEL_FORMAT, "8k write, fsync"); fflush(stdout); if ((tmpfile = open(filename, O_RDWR, 0)) == -1) *************** main(int argc, char *argv[]) *** 177,188 **** gettimeofday(&stop_t, NULL); close(tmpfile); print_elapse(start_t, stop_t); /* ! * Compare file sync methods with two 8k write */ printf("\nCompare file sync methods using two writes:\n"); #ifdef OPEN_DATASYNC_FLAG printf(LABEL_FORMAT, "2 open_datasync 8k writes"); fflush(stdout); --- 262,304 ---- gettimeofday(&stop_t, NULL); close(tmpfile); print_elapse(start_t, stop_t); + + /* + * If fsync_writethrough is available, test as well + * This uses 1/10 the number of loops because it tends + * to take forever otherwise. 
+ */ + #ifdef HAVE_FSYNC_WRITETHROUGH + printf(LABEL_FORMAT, "8k write, fsync_writethrough"); + fflush(stdout); + if ((tmpfile = open(filename, O_RDWR, 0)) == -1) + die("Cannot open output file."); + gettimeofday(&start_t, NULL); + for (i = 0; i < writethrough_loops; i++) + { + if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE) + die("write failed"); + if (fcntl(tmpfile, F_FULLFSYNC ) != 0) + die("fsync failed"); + if (lseek(tmpfile, 0, SEEK_SET) == -1) + die("seek failed"); + } + gettimeofday(&stop_t, NULL); + close(tmpfile); + print_elapse_writethrough(start_t, stop_t); + #else + printf("\t(unavailable: fsync_writethrough)\n"); + #endif /* ! * Compare some of the file sync methods with ! * two 8k writes to see if timing is different */ printf("\nCompare file sync methods using two writes:\n"); + /* + * Test open_datasync with and without o_direct + */ #ifdef OPEN_DATASYNC_FLAG printf(LABEL_FORMAT, "2 open_datasync 8k writes"); fflush(stdout); *************** main(int argc, char *argv[]) *** 201,210 **** --- 317,354 ---- gettimeofday(&stop_t, NULL); close(tmpfile); print_elapse(start_t, stop_t); + + if ( PG_O_DIRECT != 0 ) + { + printf(LABEL_FORMAT, "2 open_datasync directIO 8k writes"); + fflush(stdout); + if ((tmpfile = open(filename, O_RDWR | O_DSYNC | PG_O_DIRECT, 0)) == -1) + die("Cannot open output file."); + gettimeofday(&start_t, NULL); + for (i = 0; i < loops; i++) + { + if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE) + die("write failed"); + if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE) + die("write failed"); + if (lseek(tmpfile, 0, SEEK_SET) == -1) + die("seek failed"); + } + gettimeofday(&stop_t, NULL); + close(tmpfile); + print_elapse(start_t, stop_t); + } + else + { + printf("\t(unavailable: o_direct)\n"); + } #else printf("\t(unavailable: open_datasync)\n"); #endif + /* + * Test open_sync with and without o_direct + */ #ifdef OPEN_SYNC_FLAG printf(LABEL_FORMAT, "2 open_sync 8k writes"); fflush(stdout); *************** main(int argc, char 
*argv[]) *** 223,230 **** --- 367,404 ---- gettimeofday(&stop_t, NULL); close(tmpfile); print_elapse(start_t, stop_t); + + if ( PG_O_DIRECT != 0 ) + { + printf(LABEL_FORMAT, "2 open_sync directIO 8k writes"); + fflush(stdout); + if ((tmpfile = open(filename, O_RDWR | O_SYNC | PG_O_DIRECT, 0)) == -1) + die("Cannot open output file."); + gettimeofday(&start_t, NULL); + for (i = 0; i < loops; i++) + { + if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE) + die("write failed"); + if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE) + die("write failed"); + if (lseek(tmpfile, 0, SEEK_SET) == -1) + die("seek failed"); + } + gettimeofday(&stop_t, NULL); + close(tmpfile); + print_elapse(start_t, stop_t); + } + else + { + printf("\t(unavailable: o_direct)\n"); + } + #else + printf("\t(unavailable: open_sync)\n"); #endif + /* + * Test fdatasync + */ #ifdef HAVE_FDATASYNC printf(LABEL_FORMAT, "8k write, 8k write, fdatasync"); fflush(stdout); *************** main(int argc, char *argv[]) *** 248,253 **** --- 422,430 ---- printf("\t(unavailable: fdatasync)\n"); #endif + /* + * Test basic fsync + */ printf(LABEL_FORMAT, "8k write, 8k write, fsync"); fflush(stdout); if ((tmpfile = open(filename, O_RDWR, 0)) == -1) *************** main(int argc, char *argv[]) *** 267,278 **** --- 444,488 ---- gettimeofday(&stop_t, NULL); close(tmpfile); print_elapse(start_t, stop_t); + + /* + * Test fsync_writethrough if available + * Again, using 1/10 as many loops + */ + #ifdef HAVE_FSYNC_WRITETHROUGH + printf(LABEL_FORMAT, "8k write, 8k write, fsync_writethrough"); + fflush(stdout); + if ((tmpfile = open(filename, O_RDWR, 0)) == -1) + die("Cannot open output file."); + gettimeofday(&start_t, NULL); + for (i = 0; i < writethrough_loops; i++) + { + if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE) + die("write failed"); + if (write(tmpfile, buf, WRITE_SIZE) != WRITE_SIZE) + die("write failed"); + if (fcntl(tmpfile, F_FULLFSYNC) != 0) + die("fsync failed"); + if (lseek(tmpfile, 0, SEEK_SET) == -1) + 
die("seek failed"); + } + gettimeofday(&stop_t, NULL); + close(tmpfile); + print_elapse_writethrough(start_t, stop_t); + #else + printf("\t(unavailable: fsync_writethrough)\n"); + #endif /* * Compare 1 to 2 writes */ printf("\nCompare open_sync with different sizes:\n"); + /* + * Test open_sync with different size files + * It's unclear why this is in test_fsync, since it's + * not anything PostgreSQL does + */ #ifdef OPEN_SYNC_FLAG printf(LABEL_FORMAT, "open_sync 16k write"); fflush(stdout); *************** main(int argc, char *argv[]) *** 312,323 **** #endif /* ! * Fsync another file descriptor? */ printf("\nTest if fsync on non-write file descriptor is honored:\n"); printf("(If the times are similar, fsync() can sync data written\n"); printf("on a different descriptor.)\n"); printf(LABEL_FORMAT, "8k write, fsync, close"); fflush(stdout); gettimeofday(&start_t, NULL); --- 522,541 ---- #endif /* ! * Test whether fsync can sync data written on a different ! * descriptor for the same file. This checks the efficiency ! * of multi-process fsyncs against the same file. ! * Possibly this should be done with writethrough on platforms ! * which support it. */ printf("\nTest if fsync on non-write file descriptor is honored:\n"); printf("(If the times are similar, fsync() can sync data written\n"); printf("on a different descriptor.)\n"); + /* + * first write, fsync and close, which is the + * normal behavior without multiple descriptors + */ printf(LABEL_FORMAT, "8k write, fsync, close"); fflush(stdout); gettimeofday(&start_t, NULL); *************** main(int argc, char *argv[]) *** 330,343 **** if (fsync(tmpfile) != 0) die("fsync failed"); close(tmpfile); if ((tmpfile = open(filename, O_RDWR, 0)) == -1) die("Cannot open output file."); - /* do nothing but the open/close the tests are consistent. 
*/ close(tmpfile); } gettimeofday(&stop_t, NULL); print_elapse(start_t, stop_t); printf(LABEL_FORMAT, "8k write, close, fsync"); fflush(stdout); gettimeofday(&start_t, NULL); --- 548,569 ---- if (fsync(tmpfile) != 0) die("fsync failed"); close(tmpfile); + /* + * open and close the file again to be consistent + * with the following test + */ if ((tmpfile = open(filename, O_RDWR, 0)) == -1) die("Cannot open output file."); close(tmpfile); } gettimeofday(&stop_t, NULL); print_elapse(start_t, stop_t); + /* + * Now open, write, close, open again and fsync + * This simulates processes fsyncing each other's + * writes. + */ printf(LABEL_FORMAT, "8k write, close, fsync"); fflush(stdout); gettimeofday(&start_t, NULL); *************** main(int argc, char *argv[]) *** 358,381 **** gettimeofday(&stop_t, NULL); print_elapse(start_t, stop_t); ! /* cleanup */ free(full_buf); unlink(filename); return 0; } void print_elapse(struct timeval start_t, struct timeval stop_t) { double total_time = (stop_t.tv_sec - start_t.tv_sec) + - /* usec subtraction might be negative, e.g. 5.4 - 4.8 */ (stop_t.tv_usec - start_t.tv_usec) * 0.000001; double per_second = loops / total_time; printf("%9.3f/second\n", per_second); } void die(char *str) { --- 584,624 ---- gettimeofday(&stop_t, NULL); print_elapse(start_t, stop_t); ! /* ! * cleanup ! 
*/ free(full_buf); unlink(filename); return 0; } + /* + * print out the writes per second for most tests + */ void print_elapse(struct timeval start_t, struct timeval stop_t) { double total_time = (stop_t.tv_sec - start_t.tv_sec) + (stop_t.tv_usec - start_t.tv_usec) * 0.000001; double per_second = loops / total_time; printf("%9.3f/second\n", per_second); } + /* + * print out the writes per second for writethrough tests + */ + void + print_elapse_writethrough(struct timeval start_t, struct timeval stop_t) + { + double total_time = (stop_t.tv_sec - start_t.tv_sec) + + (stop_t.tv_usec - start_t.tv_usec) * 0.000001; + double per_second = writethrough_loops / total_time; + + printf("%9.3f/second\n", per_second); + } + void die(char *str) {
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers