Mark Kirkwood wrote: > This is a well-worn thread title - apologies, but these results seemed > interesting, and hopefully useful in the quest to get better performance > on Solaris: > > I was curious to see if the rather uninspiring pgbench performance > obtained from a Sun 280R (see General: ATA Disks and RAID controllers > for database servers) could be improved if more time was spent > tuning. > > With the help of a fellow workmate who is a bit of a Solaris guy, we > decided to have a go. > > The major performance killer appeared to be mounting the filesystem with > the logging option. The next most significant seemed to be the choice of > sync_method for Pg - the default (open_datasync), which we initially > thought should be the best - appears noticeably slower than fdatasync.
I thought the default was fdatasync, but looking at the code it seems the default is open_datasync if O_DSYNC is available. I assume the logic is that we usually do only one write() before fsync(), so open_datasync should be faster. Why do we not use O_FSYNC over fsync()? Looking at the code: #if defined(O_SYNC) #define OPEN_SYNC_FLAG O_SYNC #else #if defined(O_FSYNC) #define OPEN_SYNC_FLAG O_FSYNC #endif #endif #if defined(OPEN_SYNC_FLAG) #if defined(O_DSYNC) && (O_DSYNC != OPEN_SYNC_FLAG) #define OPEN_DATASYNC_FLAG O_DSYNC #endif #endif #if defined(OPEN_DATASYNC_FLAG) #define DEFAULT_SYNC_METHOD_STR "open_datasync" #define DEFAULT_SYNC_METHOD SYNC_METHOD_OPEN #define DEFAULT_SYNC_FLAGBIT OPEN_DATASYNC_FLAG #else #if defined(HAVE_FDATASYNC) #define DEFAULT_SYNC_METHOD_STR "fdatasync" #define DEFAULT_SYNC_METHOD SYNC_METHOD_FDATASYNC #define DEFAULT_SYNC_FLAGBIT 0 #else #define DEFAULT_SYNC_METHOD_STR "fsync" #define DEFAULT_SYNC_METHOD SYNC_METHOD_FSYNC #define DEFAULT_SYNC_FLAGBIT 0 #endif #endif I think the problem is that we prefer O_DSYNC over fdatasync, but do not prefer O_FSYNC over fsync. Running the attached test program shows on BSD/OS 4.3: write 0.000360 write & fsync 0.001391 write, close & fsync 0.001308 open o_fsync, write 0.000924 showing O_FSYNC faster than fsync(). -- Bruce Momjian | http://candle.pha.pa.us [EMAIL PROTECTED] | (610) 359-1001 + If your life is a hard drive, | 13 Roberts Road + Christ can be your backup. | Newtown Square, Pennsylvania 19073
/*
 * test_fsync.c
 *
 * Times four ways of getting a small write onto a freshly created file:
 *   1. open, write, close
 *   2. open, write, fsync, close
 *   3. open, write, close, reopen, fsync, close
 *   4. open with O_FSYNC (or O_SYNC), write, close
 *
 * Each elapsed time is printed as seconds.microseconds on stdout.
 * Everything runs inside a single process.
 */
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#define TEST_FILE "/var/tmp/test_fsync.out"

/* Number of bytes written by every test case. */
enum { WRITE_SIZE = 200 };

/* Pick whichever "synchronous write" open flag this platform offers. */
#if defined(O_FSYNC)
#define TEST_OPEN_SYNC_FLAG O_FSYNC
#elif defined(O_SYNC)
#define TEST_OPEN_SYNC_FLAG O_SYNC
#endif

void die(char *str);
void print_elapse(struct timeval start_t, struct timeval elapse_t);

/*
 * Open (creating if needed) the test file read/write with the given extra
 * open flags.  Exits via die() on failure; otherwise returns the descriptor.
 */
static int
open_test_file(int extra_flags)
{
    /* O_CREAT requires the third (mode) argument. */
    int fd = open(TEST_FILE, O_RDWR | O_CREAT | extra_flags, S_IRUSR | S_IWUSR);

    if (fd == -1)
        die("can't open /var/tmp/test_fsync.out");
    return fd;
}

/*
 * Write WRITE_SIZE bytes of 'a' to fd.  Exits via die() on error or on a
 * short write, so a failed write is never timed as if it had succeeded.
 */
static void
write_test_data(int fd)
{
    char    buf[WRITE_SIZE];
    ssize_t written;

    memset(buf, 'a', sizeof buf);
    written = write(fd, buf, sizeof buf);
    if (written < 0)
        die("can't write /var/tmp/test_fsync.out");
    if ((size_t) written != sizeof buf)
    {
        errno = 0;              /* short write: no errno to report */
        die("short write to /var/tmp/test_fsync.out");
    }
}

/* fsync() fd, exiting via die() on failure. */
static void
sync_test_file(int fd)
{
    if (fsync(fd) != 0)
        die("can't fsync /var/tmp/test_fsync.out");
}

/* close() fd, exiting via die() on failure. */
static void
close_test_file(int fd)
{
    if (close(fd) != 0)
        die("can't close /var/tmp/test_fsync.out");
}

/* Print one result line: the label, the elapsed time, and a newline. */
static void
report(const char *label, struct timeval start_t, struct timeval elapse_t)
{
    printf("%s", label);
    print_elapse(start_t, elapse_t);
    printf("\n");
}

int
main(int argc, char *argv[])
{
    struct timeval start_t;
    struct timeval elapse_t;
    int            fd;

    (void) argc;
    (void) argv;

    /* write only */
    gettimeofday(&start_t, NULL);
    fd = open_test_file(0);
    write_test_data(fd);
    close_test_file(fd);
    gettimeofday(&elapse_t, NULL);
    unlink(TEST_FILE);          /* removal is deliberately not timed */
    report("write ", start_t, elapse_t);

    /* write & fsync */
    gettimeofday(&start_t, NULL);
    fd = open_test_file(0);
    write_test_data(fd);
    sync_test_file(fd);
    close_test_file(fd);
    gettimeofday(&elapse_t, NULL);
    unlink(TEST_FILE);
    report("write & fsync ", start_t, elapse_t);

    /* write, close & fsync */
    gettimeofday(&start_t, NULL);
    fd = open_test_file(0);
    write_test_data(fd);
    close_test_file(fd);
    /* reopen file and fsync through the new descriptor */
    fd = open_test_file(0);
    sync_test_file(fd);
    close_test_file(fd);
    gettimeofday(&elapse_t, NULL);
    unlink(TEST_FILE);
    report("write, close & fsync ", start_t, elapse_t);

    /* open_fsync, write */
#if defined(TEST_OPEN_SYNC_FLAG)
    gettimeofday(&start_t, NULL);
    fd = open_test_file(TEST_OPEN_SYNC_FLAG);
    write_test_data(fd);
    close_test_file(fd);
    gettimeofday(&elapse_t, NULL);
    unlink(TEST_FILE);
    report("open o_fsync, write ", start_t, elapse_t);
#else
    printf("open o_fsync, write (O_FSYNC/O_SYNC not available)\n");
#endif

    return 0;
}

/*
 * Print elapse_t - start_t as "seconds.microseconds" (six-digit fraction),
 * without a trailing newline.
 */
void
print_elapse(struct timeval start_t, struct timeval elapse_t)
{
    long secs = (long) (elapse_t.tv_sec - start_t.tv_sec);
    long usecs = (long) elapse_t.tv_usec - (long) start_t.tv_usec;

    /* Borrow one second when the microsecond difference is negative. */
    if (usecs < 0)
    {
        secs--;
        usecs += 1000000;
    }

    printf("%ld.%06ld", secs, usecs);
}

/*
 * Print str (plus the current errno description, if errno is set) to stderr
 * and exit with status 1.
 */
void
die(char *str)
{
    int saved_errno = errno;    /* fprintf may overwrite errno */

    if (saved_errno != 0)
        fprintf(stderr, "%s: %s\n", str, strerror(saved_errno));
    else
        fprintf(stderr, "%s\n", str);
    exit(1);
}
---------------------------(end of broadcast)--------------------------- TIP 4: Don't 'kill -9' the postmaster