Hi, Using sync_file_range(2) as the wal_sync_method might speed up the XLOG flush. So I wrote a patch that adds a new valid value (sync_file_range) for wal_sync_method, and used it to compare the performance of fdatasync and sync_file_range. The patch is attached to this mail. This is just for reference; I'm not planning to submit the patch to a CommitFest for now.
Environment: - PowerEdge1850 (Xeon 2.8GHz, Mem 512MB) - Fedora11 - PostgreSQL v8.4 with the patch Measurement: - pgbench -i -s64 - pgbench -c16 -t1000 -Mprepared * [20 times] - postgresql.conf checkpoint_segments = 64 - The above measurement was repeated 3 times Result: - The following values indicate throughput of pgbench (tps) The first set ---------------- fdatasync sync_file_range 1 60.6 58.9 2 63.1 58.8 3 61.3 62.3 4 70.3 66.8 5 67.4 66.2 6 67.8 71.1 7 74.3 67.5 8 70.0 71.9 9 71.7 72.8 10 74.0 72.0 11 72.3 72.1 12 79.9 78.6 13 73.3 73.3 14 72.9 71.2 15 78.6 78.6 16 81.7 76.7 17 75.5 75.9 18 78.0 73.3 19 75.3 78.9 20 83.0 77.3 avg 72.5 71.2 The second set --------------------- fdatasync sync_file_range 1 52.6 60.3 2 57.4 65.9 3 62.6 63.7 4 59.0 68.9 5 67.0 72.2 6 61.5 72.2 7 69.0 73.4 8 64.3 75.6 9 67.6 74.8 10 69.1 75.7 11 65.7 77.7 12 72.6 76.6 13 68.8 75.5 14 69.4 79.4 15 74.2 81.2 16 71.4 77.5 17 71.3 78.0 18 73.1 80.4 19 73.5 80.2 20 73.7 80.7 avg 67.2 74.5 The third set ----------------- fdatasync sync_file_range 1 60.9 59.5 2 58.3 64.1 3 64.7 62.9 4 66.6 68.0 5 67.9 70.9 6 69.9 69.4 7 70.0 72.6 8 72.3 76.6 9 70.7 74.7 10 70.3 70.2 11 77.2 78.2 12 74.8 73.9 13 69.6 79.0 14 79.3 80.7 15 78.0 74.6 16 77.8 78.9 17 73.6 81.0 18 81.5 77.6 19 76.1 78.5 20 79.1 83.7 avg 71.9 73.8 According to these results, using sync_file_range instead of fdatasync has little effect on the performance of postgres. This time I just used sync_file_range with the following combination of flags: SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER This might be a naive approach, so there might be room for improvement. Regards, -- Fujii Masao NIPPON TELEGRAPH AND TELEPHONE CORPORATION NTT Open Source Software Center
Index: configure =================================================================== RCS file: /projects/cvsroot/pgsql/configure,v retrieving revision 1.644 diff -c -r1.644 configure *** configure 27 Jun 2009 00:14:46 -0000 1.644 --- configure 30 Jun 2009 04:54:13 -0000 *************** *** 16587,16592 **** --- 16587,16761 ---- fi + # sync_file_range() is a no-op on Solaris, so don't incur function overhead + # by calling it. + if test "$PORTNAME" != "solaris"; then + + for ac_func in sync_file_range + do + as_ac_var=`echo "ac_cv_func_$ac_func" | $as_tr_sh` + { echo "$as_me:$LINENO: checking for $ac_func" >&5 + echo $ECHO_N "checking for $ac_func... $ECHO_C" >&6; } + if { as_var=$as_ac_var; eval "test \"\${$as_var+set}\" = set"; }; then + echo $ECHO_N "(cached) $ECHO_C" >&6 + else + cat >conftest.$ac_ext <<_ACEOF + /* confdefs.h. */ + _ACEOF + cat confdefs.h >>conftest.$ac_ext + cat >>conftest.$ac_ext <<_ACEOF + /* end confdefs.h. */ + /* Define $ac_func to an innocuous variant, in case <limits.h> declares $ac_func. + For example, HP-UX 11i <limits.h> declares gettimeofday. */ + #define $ac_func innocuous_$ac_func + + /* System header to define __stub macros and hopefully few prototypes, + which can conflict with char $ac_func (); below. + Prefer <limits.h> to <assert.h> if __STDC__ is defined, since + <limits.h> exists even on freestanding compilers. */ + + #ifdef __STDC__ + # include <limits.h> + #else + # include <assert.h> + #endif + + #undef $ac_func + + /* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ + #ifdef __cplusplus + extern "C" + #endif + char $ac_func (); + /* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. 
*/ + #if defined __stub_$ac_func || defined __stub___$ac_func + choke me + #endif + + int + main () + { + return $ac_func (); + ; + return 0; + } + _ACEOF + rm -f conftest.$ac_objext conftest$ac_exeext + if { (ac_try="$ac_link" + case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; + esac + eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_link") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest$ac_exeext && + $as_test_x conftest$ac_exeext; then + eval "$as_ac_var=yes" + else + echo "$as_me: failed program was:" >&5 + sed 's/^/| /' conftest.$ac_ext >&5 + + eval "$as_ac_var=no" + fi + + rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \ + conftest$ac_exeext conftest.$ac_ext + fi + ac_res=`eval echo '${'$as_ac_var'}'` + { echo "$as_me:$LINENO: result: $ac_res" >&5 + echo "${ECHO_T}$ac_res" >&6; } + if test `eval echo '${'$as_ac_var'}'` = yes; then + cat >>confdefs.h <<_ACEOF + #define `echo "HAVE_$ac_func" | $as_tr_cpp` 1 + _ACEOF + + fi + done + + { echo "$as_me:$LINENO: checking whether sync_file_range is declared" >&5 + echo $ECHO_N "checking whether sync_file_range is declared... $ECHO_C" >&6; } + if test "${ac_cv_have_decl_sync_file_range+set}" = set; then + echo $ECHO_N "(cached) $ECHO_C" >&6 + else + cat >conftest.$ac_ext <<_ACEOF + /* confdefs.h. */ + _ACEOF + cat confdefs.h >>conftest.$ac_ext + cat >>conftest.$ac_ext <<_ACEOF + /* end confdefs.h. 
*/ + #define _GNU_SOURCE + #include <fcntl.h> + + int + main () + { + #ifndef sync_file_range + (void) sync_file_range; + #endif + + ; + return 0; + } + _ACEOF + rm -f conftest.$ac_objext + if { (ac_try="$ac_compile" + case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; + esac + eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5 + (eval "$ac_compile") 2>conftest.er1 + ac_status=$? + grep -v '^ *+' conftest.er1 >conftest.err + rm -f conftest.er1 + cat conftest.err >&5 + echo "$as_me:$LINENO: \$? = $ac_status" >&5 + (exit $ac_status); } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then + ac_cv_have_decl_sync_file_range=yes + else + echo "$as_me: failed program was:" >&5 + sed 's/^/| /' conftest.$ac_ext >&5 + + ac_cv_have_decl_sync_file_range=no + fi + + rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + fi + { echo "$as_me:$LINENO: result: $ac_cv_have_decl_sync_file_range" >&5 + echo "${ECHO_T}$ac_cv_have_decl_sync_file_range" >&6; } + if test $ac_cv_have_decl_sync_file_range = yes; then + + cat >>confdefs.h <<_ACEOF + #define HAVE_DECL_SYNC_FILE_RANGE 1 + _ACEOF + + + else + cat >>confdefs.h <<_ACEOF + #define HAVE_DECL_SYNC_FILE_RANGE 0 + _ACEOF + + + fi + + + fi + { echo "$as_me:$LINENO: checking whether fdatasync is declared" >&5 echo $ECHO_N "checking whether fdatasync is declared... $ECHO_C" >&6; } if test "${ac_cv_have_decl_fdatasync+set}" = set; then Index: configure.in =================================================================== RCS file: /projects/cvsroot/pgsql/configure.in,v retrieving revision 1.602 diff -c -r1.602 configure.in *** configure.in 27 Jun 2009 00:14:47 -0000 1.602 --- configure.in 30 Jun 2009 04:54:13 -0000 *************** *** 1151,1156 **** --- 1151,1163 ---- AC_CHECK_DECLS(posix_fadvise, [], [], [#include <fcntl.h>]) fi + # sync_file_range() is a no-op on Solaris, so don't incur function overhead + # by calling it. 
+ if test "$PORTNAME" != "solaris"; then + AC_CHECK_FUNCS(sync_file_range) + AC_CHECK_DECLS(sync_file_range, [], [], [#include <fcntl.h>]) + fi + AC_CHECK_DECLS(fdatasync, [], [], [#include <unistd.h>]) AC_CHECK_DECLS([strlcat, strlcpy]) # This is probably only present on Darwin, but may as well check always Index: doc/src/sgml/config.sgml =================================================================== RCS file: /projects/cvsroot/pgsql/doc/src/sgml/config.sgml,v retrieving revision 1.220 diff -c -r1.220 config.sgml *** doc/src/sgml/config.sgml 17 Jun 2009 21:58:48 -0000 1.220 --- doc/src/sgml/config.sgml 30 Jun 2009 04:54:13 -0000 *************** *** 1406,1411 **** --- 1406,1416 ---- <literal>open_sync</> (write WAL files with <function>open()</> option <symbol>O_SYNC</>) </para> </listitem> + <listitem> + <para> + <literal>sync_file_range</> (call <function>sync_file_range()</> at each commit) + </para> + </listitem> </itemizedlist> <para> Not all of these choices are available on all platforms. 
Index: src/backend/access/transam/xlog.c =================================================================== RCS file: /projects/cvsroot/pgsql/src/backend/access/transam/xlog.c,v retrieving revision 1.345 diff -c -r1.345 xlog.c *** src/backend/access/transam/xlog.c 26 Jun 2009 20:29:04 -0000 1.345 --- src/backend/access/transam/xlog.c 30 Jun 2009 04:54:13 -0000 *************** *** 99,104 **** --- 99,107 ---- #ifdef HAVE_FDATASYNC {"fdatasync", SYNC_METHOD_FDATASYNC, false}, #endif + #ifdef HAVE_SYNC_FILE_RANGE + {"sync_file_range", SYNC_METHOD_SYNC_FILE_RANGE, false}, + #endif #ifdef OPEN_SYNC_FLAG {"open_sync", SYNC_METHOD_OPEN, false}, #endif *************** *** 501,507 **** --- 504,514 ---- #ifdef WAL_DEBUG static void xlog_outrec(StringInfo buf, XLogRecord *record); #endif + #ifdef HAVE_SYNC_FILE_RANGE + static void issue_xlog_fsync(uint32 offset, Size nbytes); + #else static void issue_xlog_fsync(void); + #endif static void pg_start_backup_callback(int code, Datum arg); static bool read_backup_label(XLogRecPtr *checkPointLoc, XLogRecPtr *minRecoveryLoc); *************** *** 1526,1531 **** --- 1533,1542 ---- int npages; int startidx; uint32 startoffset; + #ifdef HAVE_SYNC_FILE_RANGE + uint32 fsyncoffset; + Size fsyncnbytes; + #endif /* We should always be inside a critical section here */ Assert(CritSectionCount > 0); *************** *** 1548,1553 **** --- 1559,1570 ---- startidx = 0; startoffset = 0; + /* Initialize info about sync of a file segment */ + #ifdef HAVE_SYNC_FILE_RANGE + fsyncoffset = 0; + fsyncnbytes = 0; + #endif + /* * Within the loop, curridx is the cache block index of the page to * consider writing. 
We advance Write->curridx only after successfully *************** *** 1656,1661 **** --- 1673,1685 ---- openLogOff, (unsigned long) nbytes))); } + /* Update state for sync */ + #ifdef HAVE_SYNC_FILE_RANGE + if (fsyncnbytes == 0) + fsyncoffset = startoffset; + fsyncnbytes += nbytes; + #endif + /* Update state for write */ openLogOff += nbytes; Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx); *************** *** 1679,1685 **** --- 1703,1714 ---- */ if (finishing_seg || (xlog_switch && last_iteration)) { + #ifdef HAVE_SYNC_FILE_RANGE + issue_xlog_fsync(fsyncoffset, fsyncnbytes); + fsyncnbytes = 0; + #else issue_xlog_fsync(); + #endif LogwrtResult.Flush = LogwrtResult.Write; /* end of page */ if (XLogArchivingActive()) *************** *** 1743,1749 **** --- 1772,1783 ---- openLogFile = XLogFileOpen(openLogId, openLogSeg); openLogOff = 0; } + #ifdef HAVE_SYNC_FILE_RANGE + issue_xlog_fsync(fsyncoffset, fsyncnbytes); + fsyncnbytes = 0; + #else issue_xlog_fsync(); + #endif } LogwrtResult.Flush = LogwrtResult.Write; } *************** *** 7107,7112 **** --- 7141,7147 ---- case SYNC_METHOD_FSYNC: case SYNC_METHOD_FSYNC_WRITETHROUGH: case SYNC_METHOD_FDATASYNC: + case SYNC_METHOD_SYNC_FILE_RANGE: return 0; #ifdef OPEN_SYNC_FLAG case SYNC_METHOD_OPEN: *************** *** 7160,7166 **** * Issue appropriate kind of fsync (if any) on the current XLOG output file */ static void ! issue_xlog_fsync(void) { switch (sync_method) { --- 7195,7205 ---- * Issue appropriate kind of fsync (if any) on the current XLOG output file */ static void ! #ifdef HAVE_SYNC_FILE_RANGE ! issue_xlog_fsync(uint32 offset, Size nbytes) ! #else ! issue_xlog_fsync() ! 
#endif { switch (sync_method) { *************** *** 7193,7198 **** --- 7232,7246 ---- case SYNC_METHOD_OPEN_DSYNC: /* write synced it already */ break; + #ifdef HAVE_SYNC_FILE_RANGE + case SYNC_METHOD_SYNC_FILE_RANGE: + if (pg_sync_file_range(openLogFile, offset, nbytes) != 0) + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not sync_file_range log file %u, segment %u: %m", + openLogId, openLogSeg))); + break; + #endif default: elog(PANIC, "unrecognized wal_sync_method: %d", sync_method); break; Index: src/backend/storage/file/fd.c =================================================================== RCS file: /projects/cvsroot/pgsql/src/backend/storage/file/fd.c,v retrieving revision 1.149 diff -c -r1.149 fd.c *** src/backend/storage/file/fd.c 11 Jun 2009 14:49:01 -0000 1.149 --- src/backend/storage/file/fd.c 30 Jun 2009 04:54:13 -0000 *************** *** 44,49 **** --- 44,52 ---- #include <sys/param.h> #include <sys/stat.h> #include <unistd.h> + #ifdef HAVE_SYNC_FILE_RANGE + #define _GNU_SOURCE + #endif #include <fcntl.h> #ifdef HAVE_SYS_RESOURCE_H #include <sys/resource.h> /* for getrlimit */ *************** *** 318,323 **** --- 321,350 ---- } /* + * pg_sync_file_range --- same as sync_file_range except does nothing if + * enableFsync is off + * + * Not all platforms have fdatasync; treat as fsync if not available. + */ + int + pg_sync_file_range(int fd, uint32 offset, Size nbytes) + { + if (enableFsync) + { + #ifdef HAVE_SYNC_FILE_RANGE + return sync_file_range(fd, (off64_t) offset, (off64_t) nbytes, + SYNC_FILE_RANGE_WAIT_BEFORE | + SYNC_FILE_RANGE_WRITE | + SYNC_FILE_RANGE_WAIT_AFTER); + #else + return fsync(fd); + #endif + } + else + return 0; + } + + /* * InitFileAccess --- initialize this module during backend startup * * This is called during either normal or standalone backend start. 
Index: src/backend/utils/misc/postgresql.conf.sample =================================================================== RCS file: /projects/cvsroot/pgsql/src/backend/utils/misc/postgresql.conf.sample,v retrieving revision 1.260 diff -c -r1.260 postgresql.conf.sample *** src/backend/utils/misc/postgresql.conf.sample 23 Apr 2009 00:23:45 -0000 1.260 --- src/backend/utils/misc/postgresql.conf.sample 30 Jun 2009 04:54:13 -0000 *************** *** 156,161 **** --- 156,162 ---- # fsync # fsync_writethrough # open_sync + # sync_file_range #full_page_writes = on # recover from partial page writes #wal_buffers = 64kB # min 32kB # (change requires restart) Index: src/include/pg_config.h.in =================================================================== RCS file: /projects/cvsroot/pgsql/src/include/pg_config.h.in,v retrieving revision 1.139 diff -c -r1.139 pg_config.h.in *** src/include/pg_config.h.in 4 Apr 2009 21:55:50 -0000 1.139 --- src/include/pg_config.h.in 30 Jun 2009 04:54:13 -0000 *************** *** 114,119 **** --- 114,123 ---- don't. */ #undef HAVE_DECL_STRLCPY + /* Define to 1 if you have the declaration of `sync_file_range', and to 0 if you + don't. */ + #undef HAVE_DECL_SYNC_FILE_RANGE + /* Define to 1 if you have the declaration of `sys_siglist', and to 0 if you don't. */ #undef HAVE_DECL_SYS_SIGLIST *************** *** 508,513 **** --- 512,520 ---- /* Define to 1 if you have the `symlink' function. */ #undef HAVE_SYMLINK + /* Define to 1 if you have the `sync_file_range' function. */ + #undef HAVE_SYNC_FILE_RANGE + /* Define to 1 if you have the `sysconf' function. 
*/ #undef HAVE_SYSCONF Index: src/include/access/xlog.h =================================================================== RCS file: /projects/cvsroot/pgsql/src/include/access/xlog.h,v retrieving revision 1.93 diff -c -r1.93 xlog.h *** src/include/access/xlog.h 26 Jun 2009 20:29:04 -0000 1.93 --- src/include/access/xlog.h 30 Jun 2009 04:54:13 -0000 *************** *** 91,96 **** --- 91,97 ---- #define SYNC_METHOD_OPEN 2 /* for O_SYNC */ #define SYNC_METHOD_FSYNC_WRITETHROUGH 3 #define SYNC_METHOD_OPEN_DSYNC 4 /* for O_DSYNC */ + #define SYNC_METHOD_SYNC_FILE_RANGE 5 extern int sync_method; /* Index: src/include/access/xlogdefs.h =================================================================== RCS file: /projects/cvsroot/pgsql/src/include/access/xlogdefs.h,v retrieving revision 1.23 diff -c -r1.23 xlogdefs.h *** src/include/access/xlogdefs.h 1 Jan 2009 17:23:56 -0000 1.23 --- src/include/access/xlogdefs.h 30 Jun 2009 04:54:13 -0000 *************** *** 114,119 **** --- 114,121 ---- #define DEFAULT_SYNC_METHOD SYNC_METHOD_OPEN_DSYNC #elif defined(HAVE_FDATASYNC) #define DEFAULT_SYNC_METHOD SYNC_METHOD_FDATASYNC + #elif defined(HAVE_SYNC_FILE_RANGE) + #define DEFAULT_SYNC_METHOD SYNC_METHOD_SYNC_FILE_RANGE #elif defined(HAVE_FSYNC_WRITETHROUGH_ONLY) #define DEFAULT_SYNC_METHOD SYNC_METHOD_FSYNC_WRITETHROUGH #else Index: src/include/storage/fd.h =================================================================== RCS file: /projects/cvsroot/pgsql/src/include/storage/fd.h,v retrieving revision 1.64 diff -c -r1.64 fd.h *** src/include/storage/fd.h 12 Jan 2009 05:10:45 -0000 1.64 --- src/include/storage/fd.h 30 Jun 2009 04:54:13 -0000 *************** *** 97,102 **** --- 97,103 ---- extern int pg_fsync_no_writethrough(int fd); extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); + extern int pg_sync_file_range(int fd, uint32 offset, Size nbytes); /* Filename components for OpenTemporaryFile */ #define PG_TEMP_FILES_DIR "pgsql_tmp"
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers