Hi,

Using sync_file_range(2) as wal_sync_method might speed up
the XLOG flush. So I wrote a patch that adds a new valid
value (sync_file_range) to wal_sync_method, and used it to
compare the performance of fdatasync and sync_file_range.
The patch is attached to this mail. This is just for
reference; I'm not planning to submit the patch to a
CommitFest for now.

Environment:
- PowerEdge1850 (Xeon 2.8GHz, Mem 512MB)
- Fedora11
- PostgreSQL v8.4 with the patch

Measurement:
- pgbench -i -s64
- pgbench -c16 -t1000 -Mprepared  * [20 times]
- postgresql.conf
  checkpoint_segments = 64
- The above measurement was repeated 3 times

Result:
- The following values indicate throughput of pgbench (tps)

The first set
----------------
       fdatasync   sync_file_range
1       60.6         58.9
2       63.1         58.8
3       61.3         62.3
4       70.3         66.8
5       67.4         66.2
6       67.8         71.1
7       74.3         67.5
8       70.0         71.9
9       71.7         72.8
10     74.0         72.0
11     72.3         72.1
12     79.9         78.6
13     73.3         73.3
14     72.9         71.2
15     78.6         78.6
16     81.7         76.7
17     75.5         75.9
18     78.0         73.3
19     75.3         78.9
20     83.0         77.3
avg   72.5         71.2

The second set
---------------------
       fdatasync   sync_file_range
1       52.6         60.3
2       57.4         65.9
3       62.6         63.7
4       59.0         68.9
5       67.0         72.2
6       61.5         72.2
7       69.0         73.4
8       64.3         75.6
9       67.6         74.8
10     69.1         75.7
11     65.7         77.7
12     72.6         76.6
13     68.8         75.5
14     69.4         79.4
15     74.2         81.2
16     71.4         77.5
17     71.3         78.0
18     73.1         80.4
19     73.5         80.2
20     73.7         80.7
avg   67.2         74.5

The third set
-----------------
       fdatasync   sync_file_range
1       60.9         59.5
2       58.3         64.1
3       64.7         62.9
4       66.6         68.0
5       67.9         70.9
6       69.9         69.4
7       70.0         72.6
8       72.3         76.6
9       70.7         74.7
10     70.3         70.2
11     77.2         78.2
12     74.8         73.9
13     69.6         79.0
14     79.3         80.7
15     78.0         74.6
16     77.8         78.9
17     73.6         81.0
18     81.5         77.6
19     76.1         78.5
20     79.1         83.7
avg   71.9         73.8

According to the results, using sync_file_range instead of fdatasync
has little effect on the performance of postgres. This time I just used
sync_file_range with the following combination of flags:

   SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE |
      SYNC_FILE_RANGE_WAIT_AFTER

This might be a naive way to use it, so there may be room for improvement.

Regards,

--
Fujii Masao
NIPPON TELEGRAPH AND TELEPHONE CORPORATION
NTT Open Source Software Center
Index: configure
===================================================================
RCS file: /projects/cvsroot/pgsql/configure,v
retrieving revision 1.644
diff -c -r1.644 configure
*** configure	27 Jun 2009 00:14:46 -0000	1.644
--- configure	30 Jun 2009 04:54:13 -0000
***************
*** 16587,16592 ****
--- 16587,16761 ----
  
  fi
  
+ # sync_file_range() is a no-op on Solaris, so don't incur function overhead
+ # by calling it.
+ if test "$PORTNAME" != "solaris"; then
+ 
+ for ac_func in sync_file_range
+ do
+ as_ac_var=`echo "ac_cv_func_$ac_func" | $as_tr_sh`
+ { echo "$as_me:$LINENO: checking for $ac_func" >&5
+ echo $ECHO_N "checking for $ac_func... $ECHO_C" >&6; }
+ if { as_var=$as_ac_var; eval "test \"\${$as_var+set}\" = set"; }; then
+   echo $ECHO_N "(cached) $ECHO_C" >&6
+ else
+   cat >conftest.$ac_ext <<_ACEOF
+ /* confdefs.h.  */
+ _ACEOF
+ cat confdefs.h >>conftest.$ac_ext
+ cat >>conftest.$ac_ext <<_ACEOF
+ /* end confdefs.h.  */
+ /* Define $ac_func to an innocuous variant, in case <limits.h> declares $ac_func.
+    For example, HP-UX 11i <limits.h> declares gettimeofday.  */
+ #define $ac_func innocuous_$ac_func
+ 
+ /* System header to define __stub macros and hopefully few prototypes,
+     which can conflict with char $ac_func (); below.
+     Prefer <limits.h> to <assert.h> if __STDC__ is defined, since
+     <limits.h> exists even on freestanding compilers.  */
+ 
+ #ifdef __STDC__
+ # include <limits.h>
+ #else
+ # include <assert.h>
+ #endif
+ 
+ #undef $ac_func
+ 
+ /* Override any GCC internal prototype to avoid an error.
+    Use char because int might match the return type of a GCC
+    builtin and then its argument prototype would still apply.  */
+ #ifdef __cplusplus
+ extern "C"
+ #endif
+ char $ac_func ();
+ /* The GNU C library defines this for functions which it implements
+     to always fail with ENOSYS.  Some functions are actually named
+     something starting with __ and the normal name is an alias.  */
+ #if defined __stub_$ac_func || defined __stub___$ac_func
+ choke me
+ #endif
+ 
+ int
+ main ()
+ {
+ return $ac_func ();
+   ;
+   return 0;
+ }
+ _ACEOF
+ rm -f conftest.$ac_objext conftest$ac_exeext
+ if { (ac_try="$ac_link"
+ case "(($ac_try" in
+   *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+   *) ac_try_echo=$ac_try;;
+ esac
+ eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+   (eval "$ac_link") 2>conftest.er1
+   ac_status=$?
+   grep -v '^ *+' conftest.er1 >conftest.err
+   rm -f conftest.er1
+   cat conftest.err >&5
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   (exit $ac_status); } && {
+ 	 test -z "$ac_c_werror_flag" ||
+ 	 test ! -s conftest.err
+        } && test -s conftest$ac_exeext &&
+        $as_test_x conftest$ac_exeext; then
+   eval "$as_ac_var=yes"
+ else
+   echo "$as_me: failed program was:" >&5
+ sed 's/^/| /' conftest.$ac_ext >&5
+ 
+ 	eval "$as_ac_var=no"
+ fi
+ 
+ rm -f core conftest.err conftest.$ac_objext conftest_ipa8_conftest.oo \
+       conftest$ac_exeext conftest.$ac_ext
+ fi
+ ac_res=`eval echo '${'$as_ac_var'}'`
+ 	       { echo "$as_me:$LINENO: result: $ac_res" >&5
+ echo "${ECHO_T}$ac_res" >&6; }
+ if test `eval echo '${'$as_ac_var'}'` = yes; then
+   cat >>confdefs.h <<_ACEOF
+ #define `echo "HAVE_$ac_func" | $as_tr_cpp` 1
+ _ACEOF
+ 
+ fi
+ done
+ 
+ { echo "$as_me:$LINENO: checking whether sync_file_range is declared" >&5
+ echo $ECHO_N "checking whether sync_file_range is declared... $ECHO_C" >&6; }
+ if test "${ac_cv_have_decl_sync_file_range+set}" = set; then
+   echo $ECHO_N "(cached) $ECHO_C" >&6
+ else
+   cat >conftest.$ac_ext <<_ACEOF
+ /* confdefs.h.  */
+ _ACEOF
+ cat confdefs.h >>conftest.$ac_ext
+ cat >>conftest.$ac_ext <<_ACEOF
+ /* end confdefs.h.  */
+ #define _GNU_SOURCE
+ #include <fcntl.h>
+ 
+ int
+ main ()
+ {
+ #ifndef sync_file_range
+   (void) sync_file_range;
+ #endif
+ 
+   ;
+   return 0;
+ }
+ _ACEOF
+ rm -f conftest.$ac_objext
+ if { (ac_try="$ac_compile"
+ case "(($ac_try" in
+   *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+   *) ac_try_echo=$ac_try;;
+ esac
+ eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
+   (eval "$ac_compile") 2>conftest.er1
+   ac_status=$?
+   grep -v '^ *+' conftest.er1 >conftest.err
+   rm -f conftest.er1
+   cat conftest.err >&5
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   (exit $ac_status); } && {
+ 	 test -z "$ac_c_werror_flag" ||
+ 	 test ! -s conftest.err
+        } && test -s conftest.$ac_objext; then
+   ac_cv_have_decl_sync_file_range=yes
+ else
+   echo "$as_me: failed program was:" >&5
+ sed 's/^/| /' conftest.$ac_ext >&5
+ 
+ 	ac_cv_have_decl_sync_file_range=no
+ fi
+ 
+ rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
+ fi
+ { echo "$as_me:$LINENO: result: $ac_cv_have_decl_sync_file_range" >&5
+ echo "${ECHO_T}$ac_cv_have_decl_sync_file_range" >&6; }
+ if test $ac_cv_have_decl_sync_file_range = yes; then
+ 
+ cat >>confdefs.h <<_ACEOF
+ #define HAVE_DECL_SYNC_FILE_RANGE 1
+ _ACEOF
+ 
+ 
+ else
+   cat >>confdefs.h <<_ACEOF
+ #define HAVE_DECL_SYNC_FILE_RANGE 0
+ _ACEOF
+ 
+ 
+ fi
+ 
+ 
+ fi
+ 
  { echo "$as_me:$LINENO: checking whether fdatasync is declared" >&5
  echo $ECHO_N "checking whether fdatasync is declared... $ECHO_C" >&6; }
  if test "${ac_cv_have_decl_fdatasync+set}" = set; then
Index: configure.in
===================================================================
RCS file: /projects/cvsroot/pgsql/configure.in,v
retrieving revision 1.602
diff -c -r1.602 configure.in
*** configure.in	27 Jun 2009 00:14:47 -0000	1.602
--- configure.in	30 Jun 2009 04:54:13 -0000
***************
*** 1151,1156 ****
--- 1151,1163 ----
  AC_CHECK_DECLS(posix_fadvise, [], [], [#include <fcntl.h>])
  fi
  
+ # sync_file_range() is a no-op on Solaris, so don't incur function overhead
+ # by calling it.
+ if test "$PORTNAME" != "solaris"; then
+ AC_CHECK_FUNCS(sync_file_range)
+ AC_CHECK_DECLS(sync_file_range, [], [], [#include <fcntl.h>])
+ fi
+ 
  AC_CHECK_DECLS(fdatasync, [], [], [#include <unistd.h>])
  AC_CHECK_DECLS([strlcat, strlcpy])
  # This is probably only present on Darwin, but may as well check always
Index: doc/src/sgml/config.sgml
===================================================================
RCS file: /projects/cvsroot/pgsql/doc/src/sgml/config.sgml,v
retrieving revision 1.220
diff -c -r1.220 config.sgml
*** doc/src/sgml/config.sgml	17 Jun 2009 21:58:48 -0000	1.220
--- doc/src/sgml/config.sgml	30 Jun 2009 04:54:13 -0000
***************
*** 1406,1411 ****
--- 1406,1416 ----
           <literal>open_sync</> (write WAL files with <function>open()</> option <symbol>O_SYNC</>)
          </para>
          </listitem>
+         <listitem>
+         <para>
+          <literal>sync_file_range</> (call <function>sync_file_range()</> at each commit)
+         </para>
+         </listitem>
         </itemizedlist>
         <para>
          Not all of these choices are available on all platforms.
Index: src/backend/access/transam/xlog.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/access/transam/xlog.c,v
retrieving revision 1.345
diff -c -r1.345 xlog.c
*** src/backend/access/transam/xlog.c	26 Jun 2009 20:29:04 -0000	1.345
--- src/backend/access/transam/xlog.c	30 Jun 2009 04:54:13 -0000
***************
*** 99,104 ****
--- 99,107 ----
  #ifdef HAVE_FDATASYNC
  	{"fdatasync", SYNC_METHOD_FDATASYNC, false},
  #endif
+ #ifdef HAVE_SYNC_FILE_RANGE
+ 	{"sync_file_range", SYNC_METHOD_SYNC_FILE_RANGE, false},
+ #endif
  #ifdef OPEN_SYNC_FLAG
  	{"open_sync", SYNC_METHOD_OPEN, false},
  #endif
***************
*** 501,507 ****
--- 504,514 ----
  #ifdef WAL_DEBUG
  static void xlog_outrec(StringInfo buf, XLogRecord *record);
  #endif
+ #ifdef HAVE_SYNC_FILE_RANGE
+ static void issue_xlog_fsync(uint32 offset, Size nbytes);
+ #else
  static void issue_xlog_fsync(void);
+ #endif
  static void pg_start_backup_callback(int code, Datum arg);
  static bool read_backup_label(XLogRecPtr *checkPointLoc,
  				  XLogRecPtr *minRecoveryLoc);
***************
*** 1526,1531 ****
--- 1533,1542 ----
  	int			npages;
  	int			startidx;
  	uint32		startoffset;
+ #ifdef HAVE_SYNC_FILE_RANGE
+ 	uint32		fsyncoffset;
+ 	Size		fsyncnbytes;
+ #endif
  
  	/* We should always be inside a critical section here */
  	Assert(CritSectionCount > 0);
***************
*** 1548,1553 ****
--- 1559,1570 ----
  	startidx = 0;
  	startoffset = 0;
  
+ 	/* Initialize info about sync of a file segment */
+ #ifdef HAVE_SYNC_FILE_RANGE
+ 	fsyncoffset = 0;
+ 	fsyncnbytes = 0;
+ #endif
+ 
  	/*
  	 * Within the loop, curridx is the cache block index of the page to
  	 * consider writing.  We advance Write->curridx only after successfully
***************
*** 1656,1661 ****
--- 1673,1685 ----
  								openLogOff, (unsigned long) nbytes)));
  			}
  
+ 			/* Update state for sync */
+ #ifdef HAVE_SYNC_FILE_RANGE
+ 			if (fsyncnbytes == 0)
+ 				fsyncoffset = startoffset;
+ 			fsyncnbytes += nbytes;
+ #endif
+ 
  			/* Update state for write */
  			openLogOff += nbytes;
  			Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
***************
*** 1679,1685 ****
--- 1703,1714 ----
  			 */
  			if (finishing_seg || (xlog_switch && last_iteration))
  			{
+ #ifdef HAVE_SYNC_FILE_RANGE
+ 				issue_xlog_fsync(fsyncoffset, fsyncnbytes);
+ 				fsyncnbytes = 0;
+ #else
  				issue_xlog_fsync();
+ #endif
  				LogwrtResult.Flush = LogwrtResult.Write;		/* end of page */
  
  				if (XLogArchivingActive())
***************
*** 1743,1749 ****
--- 1772,1783 ----
  				openLogFile = XLogFileOpen(openLogId, openLogSeg);
  				openLogOff = 0;
  			}
+ #ifdef HAVE_SYNC_FILE_RANGE
+ 			issue_xlog_fsync(fsyncoffset, fsyncnbytes);
+ 			fsyncnbytes = 0;
+ #else
  			issue_xlog_fsync();
+ #endif
  		}
  		LogwrtResult.Flush = LogwrtResult.Write;
  	}
***************
*** 7107,7112 ****
--- 7141,7147 ----
  		case SYNC_METHOD_FSYNC:
  		case SYNC_METHOD_FSYNC_WRITETHROUGH:
  		case SYNC_METHOD_FDATASYNC:
+ 		case SYNC_METHOD_SYNC_FILE_RANGE:
  			return 0;
  #ifdef OPEN_SYNC_FLAG
  		case SYNC_METHOD_OPEN:
***************
*** 7160,7166 ****
   * Issue appropriate kind of fsync (if any) on the current XLOG output file
   */
  static void
! issue_xlog_fsync(void)
  {
  	switch (sync_method)
  	{
--- 7195,7205 ----
   * Issue appropriate kind of fsync (if any) on the current XLOG output file
   */
  static void
! #ifdef HAVE_SYNC_FILE_RANGE
! issue_xlog_fsync(uint32 offset, Size nbytes)
! #else
! issue_xlog_fsync()
! #endif
  {
  	switch (sync_method)
  	{
***************
*** 7193,7198 ****
--- 7232,7246 ----
  		case SYNC_METHOD_OPEN_DSYNC:
  			/* write synced it already */
  			break;
+ #ifdef HAVE_SYNC_FILE_RANGE
+ 		case SYNC_METHOD_SYNC_FILE_RANGE:
+ 			if (pg_sync_file_range(openLogFile, offset, nbytes) != 0)
+ 				ereport(PANIC,
+ 						(errcode_for_file_access(),
+ 						 errmsg("could not sync_file_range log file %u, segment %u: %m",
+ 								openLogId, openLogSeg)));
+ 			break;
+ #endif
  		default:
  			elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
  			break;
Index: src/backend/storage/file/fd.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/storage/file/fd.c,v
retrieving revision 1.149
diff -c -r1.149 fd.c
*** src/backend/storage/file/fd.c	11 Jun 2009 14:49:01 -0000	1.149
--- src/backend/storage/file/fd.c	30 Jun 2009 04:54:13 -0000
***************
*** 44,49 ****
--- 44,52 ----
  #include <sys/param.h>
  #include <sys/stat.h>
  #include <unistd.h>
+ #ifdef HAVE_SYNC_FILE_RANGE
+ #define _GNU_SOURCE
+ #endif
  #include <fcntl.h>
  #ifdef HAVE_SYS_RESOURCE_H
  #include <sys/resource.h>		/* for getrlimit */
***************
*** 318,323 ****
--- 321,350 ----
  }
  
  /*
+  * pg_sync_file_range --- same as sync_file_range except does nothing if
+  * enableFsync is off
+  *
+  * Not all platforms have sync_file_range; treat as fsync if not available.
+  */
+ int
+ pg_sync_file_range(int fd, uint32 offset, Size nbytes)
+ {
+ 	if (enableFsync)
+ 	{
+ #ifdef HAVE_SYNC_FILE_RANGE
+ 		return sync_file_range(fd, (off64_t) offset, (off64_t) nbytes,
+ 							   SYNC_FILE_RANGE_WAIT_BEFORE |
+ 							   SYNC_FILE_RANGE_WRITE |
+ 							   SYNC_FILE_RANGE_WAIT_AFTER);
+ #else
+ 		return fsync(fd);
+ #endif
+ 	}
+ 	else
+ 		return 0;
+ }
+ 
+ /*
   * InitFileAccess --- initialize this module during backend startup
   *
   * This is called during either normal or standalone backend start.
Index: src/backend/utils/misc/postgresql.conf.sample
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/utils/misc/postgresql.conf.sample,v
retrieving revision 1.260
diff -c -r1.260 postgresql.conf.sample
*** src/backend/utils/misc/postgresql.conf.sample	23 Apr 2009 00:23:45 -0000	1.260
--- src/backend/utils/misc/postgresql.conf.sample	30 Jun 2009 04:54:13 -0000
***************
*** 156,161 ****
--- 156,162 ----
  					#   fsync
  					#   fsync_writethrough
  					#   open_sync
+ 					#   sync_file_range
  #full_page_writes = on			# recover from partial page writes
  #wal_buffers = 64kB			# min 32kB
  					# (change requires restart)
Index: src/include/pg_config.h.in
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/pg_config.h.in,v
retrieving revision 1.139
diff -c -r1.139 pg_config.h.in
*** src/include/pg_config.h.in	4 Apr 2009 21:55:50 -0000	1.139
--- src/include/pg_config.h.in	30 Jun 2009 04:54:13 -0000
***************
*** 114,119 ****
--- 114,123 ----
     don't. */
  #undef HAVE_DECL_STRLCPY
  
+ /* Define to 1 if you have the declaration of `sync_file_range', and to 0 if you
+    don't. */
+ #undef HAVE_DECL_SYNC_FILE_RANGE
+ 
  /* Define to 1 if you have the declaration of `sys_siglist', and to 0 if you
     don't. */
  #undef HAVE_DECL_SYS_SIGLIST
***************
*** 508,513 ****
--- 512,520 ----
  /* Define to 1 if you have the `symlink' function. */
  #undef HAVE_SYMLINK
  
+ /* Define to 1 if you have the `sync_file_range' function. */
+ #undef HAVE_SYNC_FILE_RANGE
+ 
  /* Define to 1 if you have the `sysconf' function. */
  #undef HAVE_SYSCONF
  
Index: src/include/access/xlog.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/access/xlog.h,v
retrieving revision 1.93
diff -c -r1.93 xlog.h
*** src/include/access/xlog.h	26 Jun 2009 20:29:04 -0000	1.93
--- src/include/access/xlog.h	30 Jun 2009 04:54:13 -0000
***************
*** 91,96 ****
--- 91,97 ----
  #define SYNC_METHOD_OPEN		2		/* for O_SYNC */
  #define SYNC_METHOD_FSYNC_WRITETHROUGH	3
  #define SYNC_METHOD_OPEN_DSYNC	4		/* for O_DSYNC */
+ #define SYNC_METHOD_SYNC_FILE_RANGE	5
  extern int	sync_method;
  
  /*
Index: src/include/access/xlogdefs.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/access/xlogdefs.h,v
retrieving revision 1.23
diff -c -r1.23 xlogdefs.h
*** src/include/access/xlogdefs.h	1 Jan 2009 17:23:56 -0000	1.23
--- src/include/access/xlogdefs.h	30 Jun 2009 04:54:13 -0000
***************
*** 114,119 ****
--- 114,121 ----
  #define DEFAULT_SYNC_METHOD		SYNC_METHOD_OPEN_DSYNC
  #elif defined(HAVE_FDATASYNC)
  #define DEFAULT_SYNC_METHOD		SYNC_METHOD_FDATASYNC
+ #elif defined(HAVE_SYNC_FILE_RANGE)
+ #define DEFAULT_SYNC_METHOD		SYNC_METHOD_SYNC_FILE_RANGE
  #elif defined(HAVE_FSYNC_WRITETHROUGH_ONLY)
  #define DEFAULT_SYNC_METHOD		SYNC_METHOD_FSYNC_WRITETHROUGH
  #else
Index: src/include/storage/fd.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/storage/fd.h,v
retrieving revision 1.64
diff -c -r1.64 fd.h
*** src/include/storage/fd.h	12 Jan 2009 05:10:45 -0000	1.64
--- src/include/storage/fd.h	30 Jun 2009 04:54:13 -0000
***************
*** 97,102 ****
--- 97,103 ----
  extern int	pg_fsync_no_writethrough(int fd);
  extern int	pg_fsync_writethrough(int fd);
  extern int	pg_fdatasync(int fd);
+ extern int	pg_sync_file_range(int fd, uint32 offset, Size nbytes);
  
  /* Filename components for OpenTemporaryFile */
  #define PG_TEMP_FILES_DIR "pgsql_tmp"
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to