On Sat, Mar 5, 2016 at 7:47 AM, Andres Freund <and...@anarazel.de> wrote:
> On 2016-03-05 07:43:00 +0900, Michael Paquier wrote:
>> On Sat, Mar 5, 2016 at 7:35 AM, Andres Freund <and...@anarazel.de> wrote:
>> > On 2016-03-04 14:51:50 +0900, Michael Paquier wrote:
>> >> On Fri, Mar 4, 2016 at 4:06 AM, Andres Freund <and...@anarazel.de> wrote:
>> >> Hm. OK. I don't see any reason why switching to link() even in code
>> >> paths like KeepFileRestoredFromArchive() or pgarch_archiveDone() would
>> >> be a problem thinking about it. Should HAVE_WORKING_LINK be available
>> >> on a platform we can combine it with unlink. Is that in line with what
>> >> you think?
>> >
>> > I wasn't trying to suggest we should replace all rename codepaths with
>> > the link wrapper, just the ones that already have a HAVE_WORKING_LINK
>> > check. The name of the routine I suggested is bad though...
>>
>> So we'd introduce a first routine rename_or_link_safe(), say replace_safe().
>
> Or actually maybe just link_safe(), which falls back to access() &&
> rename() if !HAVE_WORKING_LINK.
>
>> > That's one approach, yes. Combined with the fact that you can't actually
>> > reliably rename across directories, the two could be on different
>> > filesystems after all, that'd be a suitable defense. It just needs to be
>> > properly documented in the function header, not at the bottom.
>>
>> OK. Got it. Or the two could be on the same filesystem.
>
>> Still, link() and rename() do not support doing their stuff on
>> different filesystems (EXDEV).
>
> That's my point ...

OK, I hacked a v7:
- Move the link()/rename() group with HAVE_WORKING_LINK into a single
routine, making the previous link_safe renamed to replace_safe. This
is sharing a lot of things with rename_safe. I am not sure it is worth
complicating the code more this way by having a common single routine
for whole. Thoughts welcome. Honestly, I kind of liked the separation
with link_safe/rename_safe of previous patches because link_safe could
have been directly used by extensions and plugins btw.
- Remove the call of stat() in rename_safe() and implement a logic
depending on OpenTransientFile()/pg_fsync() to flush any existing
target file before performing the rename.
Andres, feel free to use this patch as a base, perhaps that will help.
-- 
Michael
diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c
index f6da673..80ec293 100644
--- a/src/backend/access/transam/timeline.c
+++ b/src/backend/access/transam/timeline.c
@@ -417,25 +417,12 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 	 */
 	TLHistoryFilePath(path, newTLI);
 
-	/*
-	 * Prefer link() to rename() here just to be really sure that we don't
-	 * overwrite an existing file.  However, there shouldn't be one, so
-	 * rename() is an acceptable substitute except for the truly paranoid.
-	 */
-#if HAVE_WORKING_LINK
-	if (link(tmppath, path) < 0)
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not link file \"%s\" to \"%s\": %m",
-						tmppath, path)));
-	unlink(tmppath);
-#else
-	if (rename(tmppath, path) < 0)
+	/* And perform the rename */
+	if (replace_safe(tmppath, path) < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
-				 errmsg("could not rename file \"%s\" to \"%s\": %m",
+				 errmsg("could not replace file \"%s\" to \"%s\": %m",
 						tmppath, path)));
-#endif
 
 	/* The history file can be archived immediately. */
 	if (XLogArchivingActive())
@@ -507,25 +494,12 @@ writeTimeLineHistoryFile(TimeLineID tli, char *content, int size)
 	 */
 	TLHistoryFilePath(path, tli);
 
-	/*
-	 * Prefer link() to rename() here just to be really sure that we don't
-	 * overwrite an existing logfile.  However, there shouldn't be one, so
-	 * rename() is an acceptable substitute except for the truly paranoid.
-	 */
-#if HAVE_WORKING_LINK
-	if (link(tmppath, path) < 0)
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not link file \"%s\" to \"%s\": %m",
-						tmppath, path)));
-	unlink(tmppath);
-#else
-	if (rename(tmppath, path) < 0)
+	/* And perform the rename */
+	if (replace_safe(tmppath, path) < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
-				 errmsg("could not rename file \"%s\" to \"%s\": %m",
+				 errmsg("could not replace file \"%s\" to \"%s\": %m",
 						tmppath, path)));
-#endif
 }
 
 /*
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 94b79ac..6c4f36d 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -3298,35 +3298,17 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 		}
 	}
 
-	/*
-	 * Prefer link() to rename() here just to be really sure that we don't
-	 * overwrite an existing logfile.  However, there shouldn't be one, so
-	 * rename() is an acceptable substitute except for the truly paranoid.
-	 */
-#if HAVE_WORKING_LINK
-	if (link(tmppath, path) < 0)
+	/* rename the segment file */
+	if (replace_safe(tmppath, path) < 0)
 	{
 		if (use_lock)
 			LWLockRelease(ControlFileLock);
 		ereport(LOG,
 				(errcode_for_file_access(),
-				 errmsg("could not link file \"%s\" to \"%s\" (initialization of log file): %m",
+				 errmsg("could not replace file \"%s\" to \"%s\" (initialization of log file): %m",
 						tmppath, path)));
 		return false;
 	}
-	unlink(tmppath);
-#else
-	if (rename(tmppath, path) < 0)
-	{
-		if (use_lock)
-			LWLockRelease(ControlFileLock);
-		ereport(LOG,
-				(errcode_for_file_access(),
-				 errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file): %m",
-						tmppath, path)));
-		return false;
-	}
-#endif
 
 	if (use_lock)
 		LWLockRelease(ControlFileLock);
@@ -3840,7 +3822,7 @@ RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
 		 * flag, rename will fail. We'll try again at the next checkpoint.
 		 */
 		snprintf(newpath, MAXPGPATH, "%s.deleted", path);
-		if (rename(path, newpath) != 0)
+		if (rename_safe(path, newpath) != 0)
 		{
 			ereport(LOG,
 					(errcode_for_file_access(),
@@ -5339,7 +5321,7 @@ exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
 	 * re-enter archive recovery mode in a subsequent crash.
 	 */
 	unlink(RECOVERY_COMMAND_DONE);
-	if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
+	if (rename_safe(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
 		ereport(FATAL,
 				(errcode_for_file_access(),
 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -6186,7 +6168,7 @@ StartupXLOG(void)
 		if (stat(TABLESPACE_MAP, &st) == 0)
 		{
 			unlink(TABLESPACE_MAP_OLD);
-			if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0)
+			if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0)
 				ereport(LOG,
 					(errmsg("ignoring file \"%s\" because no file \"%s\" exists",
 							TABLESPACE_MAP, BACKUP_LABEL_FILE),
@@ -6549,7 +6531,7 @@ StartupXLOG(void)
 		if (haveBackupLabel)
 		{
 			unlink(BACKUP_LABEL_OLD);
-			if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
+			if (rename_safe(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
 				ereport(FATAL,
 						(errcode_for_file_access(),
 						 errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -6566,7 +6548,7 @@ StartupXLOG(void)
 		if (haveTblspcMap)
 		{
 			unlink(TABLESPACE_MAP_OLD);
-			if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) != 0)
+			if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) != 0)
 				ereport(FATAL,
 						(errcode_for_file_access(),
 						 errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -7347,7 +7329,7 @@ StartupXLOG(void)
 				 */
 				XLogArchiveCleanup(partialfname);
 
-				if (rename(origpath, partialpath) != 0)
+				if (rename_safe(origpath, partialpath) != 0)
 					ereport(ERROR,
 							(errcode_for_file_access(),
 						 errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -10907,7 +10889,7 @@ CancelBackup(void)
 	/* remove leftover file from previously canceled backup if it exists */
 	unlink(BACKUP_LABEL_OLD);
 
-	if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
+	if (rename_safe(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
 	{
 		ereport(WARNING,
 				(errcode_for_file_access(),
@@ -10930,7 +10912,7 @@ CancelBackup(void)
 	/* remove leftover file from previously canceled backup if it exists */
 	unlink(TABLESPACE_MAP_OLD);
 
-	if (rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0)
+	if (rename_safe(TABLESPACE_MAP, TABLESPACE_MAP_OLD) == 0)
 	{
 		ereport(LOG,
 				(errmsg("online backup mode canceled"),
diff --git a/src/backend/access/transam/xlogarchive.c b/src/backend/access/transam/xlogarchive.c
index 277c14a..65c03b2 100644
--- a/src/backend/access/transam/xlogarchive.c
+++ b/src/backend/access/transam/xlogarchive.c
@@ -451,7 +451,7 @@ KeepFileRestoredFromArchive(char *path, char *xlogfname)
 		 */
 		snprintf(oldpath, MAXPGPATH, "%s.deleted%u",
 				 xlogfpath, deletedcounter++);
-		if (rename(xlogfpath, oldpath) != 0)
+		if (rename_safe(xlogfpath, oldpath) != 0)
 		{
 			ereport(ERROR,
 					(errcode_for_file_access(),
@@ -470,7 +470,7 @@ KeepFileRestoredFromArchive(char *path, char *xlogfname)
 		reload = true;
 	}
 
-	if (rename(path, xlogfpath) < 0)
+	if (rename_safe(path, xlogfpath) < 0)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
@@ -580,7 +580,7 @@ XLogArchiveForceDone(const char *xlog)
 	StatusFilePath(archiveReady, xlog, ".ready");
 	if (stat(archiveReady, &stat_buf) == 0)
 	{
-		if (rename(archiveReady, archiveDone) < 0)
+		if (rename_safe(archiveReady, archiveDone) < 0)
 			ereport(WARNING,
 					(errcode_for_file_access(),
 					 errmsg("could not rename file \"%s\" to \"%s\": %m",
diff --git a/src/backend/postmaster/pgarch.c b/src/backend/postmaster/pgarch.c
index 397f802..db54889 100644
--- a/src/backend/postmaster/pgarch.c
+++ b/src/backend/postmaster/pgarch.c
@@ -728,7 +728,7 @@ pgarch_archiveDone(char *xlog)
 
 	StatusFilePath(rlogready, xlog, ".ready");
 	StatusFilePath(rlogdone, xlog, ".done");
-	if (rename(rlogready, rlogdone) < 0)
+	if (rename_safe(rlogready, rlogdone) < 0)
 		ereport(WARNING,
 				(errcode_for_file_access(),
 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
diff --git a/src/backend/replication/logical/origin.c b/src/backend/replication/logical/origin.c
index 0caf7a3..96368c7 100644
--- a/src/backend/replication/logical/origin.c
+++ b/src/backend/replication/logical/origin.c
@@ -617,16 +617,13 @@ CheckPointReplicationOrigin(void)
 	CloseTransientFile(tmpfd);
 
 	/* rename to permanent file, fsync file and directory */
-	if (rename(tmppath, path) != 0)
+	if (rename_safe(tmppath, path) != 0)
 	{
 		ereport(PANIC,
 				(errcode_for_file_access(),
 				 errmsg("could not rename file \"%s\" to \"%s\": %m",
 						tmppath, path)));
 	}
-
-	fsync_fname((char *) path, false);
-	fsync_fname("pg_logical", true);
 }
 
 /*
diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index affa9b9..ead221d 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -1095,7 +1095,7 @@ SaveSlotToPath(ReplicationSlot *slot, const char *dir, int elevel)
 	START_CRIT_SECTION();
 
 	fsync_fname(path, false);
-	fsync_fname((char *) dir, true);
+	fsync_fname(dir, true);
 	fsync_fname("pg_replslot", true);
 
 	END_CRIT_SECTION();
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 1b30100..85bab37 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -307,6 +307,7 @@ static void walkdir(const char *path,
 static void pre_sync_fname(const char *fname, bool isdir, int elevel);
 #endif
 static void fsync_fname_ext(const char *fname, bool isdir, int elevel);
+static void fsync_parent_path(const char *fname);
 
 
 /*
@@ -413,7 +414,7 @@ pg_flush_data(int fd, off_t offset, off_t amount)
  * indicate the OS just doesn't allow/require fsyncing directories.
  */
 void
-fsync_fname(char *fname, bool isdir)
+fsync_fname(const char *fname, bool isdir)
 {
 	int			fd;
 	int			returncode;
@@ -424,11 +425,11 @@ fsync_fname(char *fname, bool isdir)
 	 * cases here
 	 */
 	if (!isdir)
-		fd = OpenTransientFile(fname,
+		fd = OpenTransientFile((char *) fname,
 							   O_RDWR | PG_BINARY,
 							   S_IRUSR | S_IWUSR);
 	else
-		fd = OpenTransientFile(fname,
+		fd = OpenTransientFile((char *) fname,
 							   O_RDONLY | PG_BINARY,
 							   S_IRUSR | S_IWUSR);
 
@@ -461,6 +462,100 @@ fsync_fname(char *fname, bool isdir)
 	CloseTransientFile(fd);
 }
 
+/*
+ * rename_safe -- rename of a file, making it on-disk persistent
+ *
+ * This routine ensures that a rename file persists in case of a crash by
+ * using fsync on the old and new files before and after performing the
+ * rename so as this categorizes as an all-or-nothing operation.
+ *
+ * rename() is not reliable across directories, particularly if the origin
+ * point and the target point are located on different mounted partitions
+ * so this routine should be called when the replacement of a file is
+ * located in the same directory as its origin file.
+ */
+int
+rename_safe(const char *oldfile, const char *newfile)
+{
+	int		fd;
+
+	/*
+	 * First fsync the old entry and new entry, it this one exists, to ensure
+	 * that they are properly persistent on disk. Calling this routine with
+	 * an existing new target file is fine, rename() will first remove it
+	 * before performing its operation.
+	 */
+	fsync_fname(oldfile, false);
+
+	fd = OpenTransientFile((char *) newfile, PG_BINARY | O_RDONLY, 0);
+	if (fd < 0)
+	{
+		if (errno != ENOENT)
+			 return -1;
+	}
+	else
+	{
+		if (pg_fsync(fd) != 0)
+			ereport(LOG,
+					(errcode_for_file_access(),
+					 errmsg("could not write to file \"%s\": %m",
+							newfile)));
+		(void) CloseTransientFile(fd);
+	}
+
+	/* Time to do the real deal... */
+	if (rename(oldfile, newfile) != 0)
+		return -1;
+
+	/*
+	 * Make change persistent in case of an OS crash, both the new entry and
+	 * its parent directory need to be flushed.
+	 */
+	fsync_fname(newfile, false);
+
+	/* Same for parent directory */
+	fsync_parent_path(newfile);
+	return 0;
+}
+
+/*
+ * replace_safe -- replace a file, making it on-disk persistent
+ *
+ * This routine ensures that a file link or rename on a file persists on
+ * system in case of a crash by using fsync where on the link generated
+ * as well as on its parent directory. link() is preferred to rename() just
+ * to be really sure that an existing file is not overwritten. However,
+ * there should not be an existing file when calling this routine, so rename()
+ * is an acceptable substitute except for the truly paranoid.
+ *
+ * rename() and link() are not reliable across directories, particularly
+ * if the origin point and the target point are located on different mounted
+ * partitions, so this routine should be called when the replacement of a
+ * file is located in the same directory as its origin file.
+ */
+int
+replace_safe(const char *oldfile, const char *newfile)
+{
+#if HAVE_WORKING_LINK
+	if (link(oldfile, newfile) < 0)
+		return -1;
+	unlink(oldfile);
+#else
+	if (rename(oldfile, newfile) < 0)
+		return -1;
+#endif
+
+	/*
+	 * Make change persistent in case of an OS crash, both the new entry and
+	 * its parent directory need to be flushed.
+	 */
+	fsync_fname(newfile, false);
+
+	/* Same for parent directory */
+	fsync_parent_path(newfile);
+	return 0;
+}
+
 
 /*
  * InitFileAccess --- initialize this module during backend startup
@@ -2719,3 +2814,29 @@ fsync_fname_ext(const char *fname, bool isdir, int elevel)
 
 	(void) CloseTransientFile(fd);
 }
+
+/*
+ * fsync_parent_path -- fsync the parent path of a file or directory
+ *
+ * This is aimed at making file operations persistent on disk in case of
+ * an OS crash or power failure.
+ */
+static void
+fsync_parent_path(const char *fname)
+{
+	char	parentpath[MAXPGPATH];
+
+	/* Same for parent directory */
+	snprintf(parentpath, MAXPGPATH, "%s", fname);
+	get_parent_directory(parentpath);
+
+	/*
+	 * get_parent_directory() returns an empty string if the input argument
+	 * is just a file name (see comments in path.c), so handle that as being
+	 * the current directory.
+	 */
+	if (strlen(parentpath) == 0)
+		fsync_fname(".", true);
+	else
+		fsync_fname(parentpath, true);
+}
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 4a3fccb..60836a1 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -113,7 +113,9 @@ extern int	pg_fsync_no_writethrough(int fd);
 extern int	pg_fsync_writethrough(int fd);
 extern int	pg_fdatasync(int fd);
 extern int	pg_flush_data(int fd, off_t offset, off_t amount);
-extern void fsync_fname(char *fname, bool isdir);
+extern void fsync_fname(const char *fname, bool isdir);
+extern int	rename_safe(const char *oldfile, const char *newfile);
+extern int	replace_safe(const char *oldfile, const char *newfile);
 extern void SyncDataDirectory(void);
 
 /* Filename components for OpenTemporaryFile */
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to