At 2015-05-26 22:44:03 +0200, and...@anarazel.de wrote:
>
> So what I propose is:
> 1) Remove the automatic symlink following
> 2) Follow pg_tbspc/*, pg_xlog if it's a symlink, fix the latter in
>    initdb -S
> 3) Add a elevel argument to walkdir(), return if AllocateDir() fails,
>    continue for stat() failures in the readdir() loop.
> 4) Add elevel argument to pre_sync_fname, fsync_fname, return after
>    errors.
> 5) Accept EACCESS, ETXTBSY (if defined) when open()ing the files. By
>    virtue of not following symlinks we should not need to worry about
>    EROFS

Here's a WIP patch for discussion.

I've (a) removed the S_ISLNK() branch in walkdir, (b) reintroduced
walktblspc_links to call walkdir on each of the entries within pg_tblspc
(simpler than trying to make walkdir follow links only for pg_xlog and
under pg_tblspc), (c) call walkdir on pg_xlog if it's a symlink (not
done for initdb -S; will submit separately), (d) add elevel arguments as
described, (e) ignore EACCES and ETXTBSY.

This correctly fsync()s stuff according to strace, and doesn't die if
there are unreadable files/links in PGDATA.

What I haven't done is return if AllocateDir() fails. I'm not convinced
that's correct, because it'll not complain if PGDATA is unreadable (but
this will break other things, so it doesn't matter), but also will die
if readdir fails rather than opendir.

I'm trying a couple of approaches to that (e.g. using readdir directly
instead of ReadDir), but other suggestions are welcome.

-- Abhijit
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 087b6be..6956e88 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -11609,16 +11609,41 @@ SetWalWriterSleeping(bool sleeping)
 static void
 fsync_pgdata(char *datadir)
 {
+	bool		xlog_is_symlink;
+	char		pg_xlog[MAXPGPATH];
+	char		pg_tblspc[MAXPGPATH];
+	struct stat st;
+
 	if (!enableFsync)
 		return;
 
+	snprintf(pg_xlog, MAXPGPATH, "%s/pg_xlog", datadir);
+	snprintf(pg_tblspc, MAXPGPATH, "%s/pg_tblspc", datadir);
+
+	if (lstat(pg_xlog, &st) < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				errmsg("could not stat directory \"pg_xlog\": %m")));
+
+	xlog_is_symlink = false;
+#ifndef WIN32
+	if (S_ISLNK(st.st_mode))
+		xlog_is_symlink = true;
+#else
+	if (pgwin32_is_junction(subpath))
+		xlog_is_symlink = true;
+#endif
+
 	/*
 	 * If possible, hint to the kernel that we're soon going to fsync the data
 	 * directory and its contents.
 	 */
 #if defined(HAVE_SYNC_FILE_RANGE) || \
 	(defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED))
-	walkdir(datadir, pre_sync_fname);
+	walkdir(DEBUG1, datadir, pre_sync_fname);
+	if (xlog_is_symlink)
+		walkdir(DEBUG1, pg_xlog, pre_sync_fname);
+	walktblspc_links(DEBUG1, pg_tblspc, pre_sync_fname);
 #endif
 
 	/*
@@ -11628,5 +11653,8 @@ fsync_pgdata(char *datadir)
 	 * file fsyncs don't guarantee that the directory entry for the file is
 	 * synced.
 	 */
-	walkdir(datadir, fsync_fname);
+	walkdir(WARNING, datadir, fsync_fname_ext);
+	if (xlog_is_symlink)
+		walkdir(DEBUG1, pg_xlog, fsync_fname_ext);
+	walktblspc_links(WARNING, pg_tblspc, fsync_fname_ext);
 }
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 68d43c6..c855004 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -2443,42 +2443,99 @@ looks_like_temp_rel_name(const char *name)
 /*
  * Hint to the OS that it should get ready to fsync() this file.
  *
- * Adapted from pre_sync_fname in initdb.c
+ * Ignores errors trying to open unreadable files, and logs other errors at a
+ * caller-specified level.
  */
 void
-pre_sync_fname(char *fname, bool isdir)
+pre_sync_fname(int elevel, char *fname, bool isdir)
 {
 	int			fd;
 
 	fd = BasicOpenFile(fname, O_RDONLY | PG_BINARY, 0);
 
+	if (fd < 0)
+	{
+		if (errno == EACCES || (isdir && errno == EISDIR))
+			return;
+
+#ifdef ETXTBSY
+		if (errno == ETXTBSY)
+			return;
+#endif
+
+		ereport(elevel,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", fname)));
+	}
+
+	(void) pg_flush_data(fd, 0, 0);
+	(void) close(fd);
+}
+
+/*
+ * fsync_fname_ext -- Try to fsync a file or directory
+ *
+ * Ignores errors trying to open unreadable files, or trying to fsync
+ * directories on systems where that isn't allowed/required, and logs other
+ * errors at a caller-specified level.
+ */
+void
+fsync_fname_ext(int elevel, char *fname, bool isdir)
+{
+	int			fd;
+	int			returncode;
+
 	/*
-	 * Some OSs don't allow us to open directories at all (Windows returns
-	 * EACCES)
+	 * Some OSs require directories to be opened read-only whereas other
+	 * systems don't allow us to fsync files opened read-only; so we need both
+	 * cases here
 	 */
-	if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
-		return;
+	if (!isdir)
+		fd = OpenTransientFile(fname,
+							   O_RDWR | PG_BINARY,
+							   S_IRUSR | S_IWUSR);
+	else
+		fd = OpenTransientFile(fname,
+							   O_RDONLY | PG_BINARY,
+							   S_IRUSR | S_IWUSR);
 
 	if (fd < 0)
-		ereport(FATAL,
-				(errmsg("could not open file \"%s\": %m",
-						fname)));
+	{
+		if (errno == EACCES || (isdir && errno == EISDIR))
+			return;
 
-	pg_flush_data(fd, 0, 0);
+#ifdef ETXTBSY
+		if (errno == ETXTBSY)
+			return;
+#endif
 
-	close(fd);
+		ereport(elevel,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", fname)));
+	}
+
+	returncode = pg_fsync(fd);
+
+	/*
+	 * Some OSes don't allow us to fsync directories at all, so we can ignore
+	 * those errors. Anything else needs to be logged.
+	 */
+	if (returncode != 0 && !(isdir && errno == EBADF))
+		ereport(elevel,
+				(errcode_for_file_access(),
+				 errmsg("could not fsync file \"%s\": %m", fname)));
+
+	CloseTransientFile(fd);
 }
 
 /*
  * walkdir: recursively walk a directory, applying the action to each
- * regular file and directory (including the named directory itself)
- * and following symbolic links.
+ * regular file and directory (including the named directory itself).
  *
- * NB: There is another version of walkdir in initdb.c, but that version
- * behaves differently with respect to symbolic links.  Caveat emptor!
+ * Adapted from walkdir in initdb.c
  */
 void
-walkdir(char *path, void (*action) (char *fname, bool isdir))
+walkdir(int elevel, char *path, void (*action) (int elevel, char *fname, bool isdir))
 {
 	DIR		   *dir;
 	struct dirent *de;
@@ -2498,59 +2555,55 @@ walkdir(char *path, void (*action) (char *fname, bool isdir))
 		snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name);
 
 		if (lstat(subpath, &fst) < 0)
-			ereport(ERROR,
+			ereport(elevel,
 					(errcode_for_file_access(),
 					 errmsg("could not stat file \"%s\": %m", subpath)));
 
 		if (S_ISREG(fst.st_mode))
-			(*action) (subpath, false);
+			(*action) (elevel, subpath, false);
 		else if (S_ISDIR(fst.st_mode))
-			walkdir(subpath, action);
-#ifndef WIN32
-		else if (S_ISLNK(fst.st_mode))
-#else
-		else if (pgwin32_is_junction(subpath))
-#endif
-		{
-#if defined(HAVE_READLINK) || defined(WIN32)
-			char		linkpath[MAXPGPATH];
-			int			len;
-			struct stat lst;
+			walkdir(elevel, subpath, action);
+	}
+	FreeDir(dir);
 
-			len = readlink(subpath, linkpath, sizeof(linkpath) - 1);
-			if (len < 0)
-				ereport(ERROR,
-						(errcode_for_file_access(),
-						 errmsg("could not read symbolic link \"%s\": %m",
-								subpath)));
+	(*action) (elevel, path, true);
+}
 
-			if (len >= sizeof(linkpath) - 1)
-				ereport(ERROR,
-						(errmsg("symbolic link \"%s\" target is too long",
-								subpath)));
+/*
+ * walktblspc_links: call walkdir on each entry under the given
+ * pg_tblspc directory, or do nothing if pg_tblspc doesn't exist.
+ *
+ * Adapted from walktblspc_links in initdb.c
+ */
+void
+walktblspc_links(int elevel, char *path, void (*action) (int elevel, char *fname, bool isdir))
+{
+	DIR		   *dir;
+	struct dirent *de;
+	char		subpath[MAXPGPATH];
 
-			linkpath[len] = '\0';
+	dir = AllocateDir(path);
+	if (dir == NULL)
+	{
+		if (errno != ENOENT)
+			ereport(elevel,
+					(errcode_for_file_access(),
+					 errmsg("could not open directory \"%s\": %m", path)));
+		return;
+	}
 
-			if (lstat(linkpath, &lst) == 0)
-			{
-				if (S_ISREG(lst.st_mode))
-					(*action) (linkpath, false);
-				else if (S_ISDIR(lst.st_mode))
-					walkdir(subpath, action);
-			}
-			else if (errno != ENOENT)
-				ereport(ERROR,
-						(errcode_for_file_access(),
-						 errmsg("could not stat file \"%s\": %m", linkpath)));
-#else
-			ereport(WARNING,
-					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
-					 errmsg("this platform does not support symbolic links; ignoring \"%s\"",
-							subpath)));
-#endif
-		}
+	while ((de = ReadDir(dir, path)) != NULL)
+	{
+		if (strcmp(de->d_name, ".") == 0 ||
+			strcmp(de->d_name, "..") == 0)
+			continue;
+
+		/* fsync the version specific tablespace subdirectory */
+		snprintf(subpath, sizeof(subpath), "%s/%s/%s",
+				 path, de->d_name, TABLESPACE_VERSION_DIRECTORY);
+
+		walkdir(elevel, subpath, action);
 	}
-	FreeDir(dir);
 
-	(*action) (path, true);
+	FreeDir(dir);
 }
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 5b563a6..322fd40 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -114,8 +114,10 @@ extern int	pg_fsync_writethrough(int fd);
 extern int	pg_fdatasync(int fd);
 extern int	pg_flush_data(int fd, off_t offset, off_t amount);
 extern void fsync_fname(char *fname, bool isdir);
-extern void pre_sync_fname(char *fname, bool isdir);
-extern void walkdir(char *path, void (*action) (char *fname, bool isdir));
+extern void fsync_fname_ext(int elevel, char *fname, bool isdir);
+extern void pre_sync_fname(int elevel, char *fname, bool isdir);
+extern void walkdir(int elevel, char *path, void (*action) (int elevel, char *fname, bool isdir));
+extern void walktblspc_links(int elevel, char *path, void (*action) (int elevel, char *fname, bool isdir));
 
 /* Filename components for OpenTemporaryFile */
 #define PG_TEMP_FILES_DIR "pgsql_tmp"
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to