At Thu, 30 Jun 2022 12:28:43 +0900, Fujii Masao <masao.fu...@oss.nttdata.com> 
wrote in 
> The root cause of these failures seems that sessionBackupState flag
> is not reset to SESSION_BACKUP_NONE even when BASE_BACKUP is aborted.
> So attached patch changes do_pg_abort_backup callback so that
> it resets sessionBackupState. I confirmed that, with the patch,
> those assertion failure and segmentation fault didn't happen.
> 
> But this change has one issue that; if BASE_BACKUP is run while
> a backup is already in progress in the session by pg_backup_start()
> and that session is terminated, the change causes
> XLogCtl->Insert.runningBackups
> to be decremented incorrectly. That is, XLogCtl->Insert.runningBackups
> is incremented by two by pg_backup_start() and BASE_BACKUP,
> but it's decremented only by one by the termination of the session.
> 
> To address this issue, I think that we should disallow BASE_BACKUP
> to run while a backup is already in progress in the *same* session
> as we already do this for pg_backup_start(). Thought? I included
> the code to disallow that in the attached patch.

It seems like to me that the root cause is the callback is registered
twice.  The callback does not expect to be called more than once (at
least per one increment of runningBackups).

register_persistent_abort_backup_hanedler() prevents duplicate
regsitration of the callback so I think perform_base_backup should use
this function instead of protecting by the PG_*_ERROR_CLEANUP()
section.

Please find the attached.

regards.

-- 
Kyotaro Horiguchi
NTT Open Source Software Center
>From 76af9ecee495b34b6a1d2abfd0f35bb7aeb64178 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horikyota....@gmail.com>
Date: Fri, 1 Jul 2022 11:38:34 +0900
Subject: [PATCH 1/2] Use permanent backup-abort call back in
 perform_base_backup

---
 src/backend/replication/basebackup.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index 95440013c0..e4345dbff2 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -255,19 +255,14 @@ perform_base_backup(basebackup_options *opt, bbsink *sink)
 	total_checksum_failures = 0;
 
 	basebackup_progress_wait_checkpoint();
+
+	register_persistent_abort_backup_handler();
+
 	state.startptr = do_pg_backup_start(opt->label, opt->fastcheckpoint,
 										&state.starttli,
 										labelfile, &state.tablespaces,
 										tblspc_map_file);
 
-	/*
-	 * Once do_pg_backup_start has been called, ensure that any failure causes
-	 * us to abort the backup so we don't "leak" a backup counter. For this
-	 * reason, *all* functionality between do_pg_backup_start() and the end of
-	 * do_pg_backup_stop() should be inside the error cleanup block!
-	 */
-
-	PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false));
 	{
 		ListCell   *lc;
 		tablespaceinfo *ti;
@@ -375,8 +370,6 @@ perform_base_backup(basebackup_options *opt, bbsink *sink)
 		basebackup_progress_wait_wal_archive(&state);
 		endptr = do_pg_backup_stop(labelfile->data, !opt->nowait, &endtli);
 	}
-	PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false));
-
 
 	if (opt->includewal)
 	{
-- 
2.31.1

>From 477c470fe22f3f98f4ca1f990d8150fbc19bd778 Mon Sep 17 00:00:00 2001
From: Kyotaro Horiguchi <horikyota....@gmail.com>
Date: Fri, 1 Jul 2022 11:40:14 +0900
Subject: [PATCH 2/2] Remove extra code block.

---
 src/backend/replication/basebackup.c | 185 +++++++++++++--------------
 1 file changed, 91 insertions(+), 94 deletions(-)

diff --git a/src/backend/replication/basebackup.c b/src/backend/replication/basebackup.c
index e4345dbff2..ec9bc6f52b 100644
--- a/src/backend/replication/basebackup.c
+++ b/src/backend/replication/basebackup.c
@@ -233,6 +233,8 @@ perform_base_backup(basebackup_options *opt, bbsink *sink)
 	StringInfo	labelfile;
 	StringInfo	tblspc_map_file;
 	backup_manifest_info manifest;
+	ListCell   *lc;
+	tablespaceinfo *ti;
 
 	/* Initial backup state, insofar as we know it now. */
 	state.tablespaces = NIL;
@@ -263,114 +265,109 @@ perform_base_backup(basebackup_options *opt, bbsink *sink)
 										labelfile, &state.tablespaces,
 										tblspc_map_file);
 
+	/* Add a node for the base directory at the end */
+	ti = palloc0(sizeof(tablespaceinfo));
+	ti->size = -1;
+	state.tablespaces = lappend(state.tablespaces, ti);
+
+	/*
+	 * Calculate the total backup size by summing up the size of each
+	 * tablespace
+	 */
+	if (opt->progress)
 	{
-		ListCell   *lc;
-		tablespaceinfo *ti;
-
-		/* Add a node for the base directory at the end */
-		ti = palloc0(sizeof(tablespaceinfo));
-		ti->size = -1;
-		state.tablespaces = lappend(state.tablespaces, ti);
-
-		/*
-		 * Calculate the total backup size by summing up the size of each
-		 * tablespace
-		 */
-		if (opt->progress)
-		{
-			basebackup_progress_estimate_backup_size();
-
-			foreach(lc, state.tablespaces)
-			{
-				tablespaceinfo *tmp = (tablespaceinfo *) lfirst(lc);
-
-				if (tmp->path == NULL)
-					tmp->size = sendDir(sink, ".", 1, true, state.tablespaces,
-										true, NULL, NULL);
-				else
-					tmp->size = sendTablespace(sink, tmp->path, tmp->oid, true,
-											   NULL);
-				state.bytes_total += tmp->size;
-			}
-			state.bytes_total_is_valid = true;
-		}
-
-		/* notify basebackup sink about start of backup */
-		bbsink_begin_backup(sink, &state, SINK_BUFFER_LENGTH);
-
-		/* Send off our tablespaces one by one */
+		basebackup_progress_estimate_backup_size();
+		
 		foreach(lc, state.tablespaces)
 		{
-			tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
+			tablespaceinfo *tmp = (tablespaceinfo *) lfirst(lc);
+			
+			if (tmp->path == NULL)
+				tmp->size = sendDir(sink, ".", 1, true, state.tablespaces,
+									true, NULL, NULL);
+			else
+				tmp->size = sendTablespace(sink, tmp->path, tmp->oid, true,
+										   NULL);
+			state.bytes_total += tmp->size;
+		}
+		state.bytes_total_is_valid = true;
+	}
 
-			if (ti->path == NULL)
+	/* notify basebackup sink about start of backup */
+	bbsink_begin_backup(sink, &state, SINK_BUFFER_LENGTH);
+
+	/* Send off our tablespaces one by one */
+	foreach(lc, state.tablespaces)
+	{
+		tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
+
+		if (ti->path == NULL)
+		{
+			struct stat statbuf;
+			bool		sendtblspclinks = true;
+
+			bbsink_begin_archive(sink, "base.tar");
+
+			/* In the main tar, include the backup_label first... */
+			sendFileWithContent(sink, BACKUP_LABEL_FILE, labelfile->data,
+								&manifest);
+
+			/* Then the tablespace_map file, if required... */
+			if (opt->sendtblspcmapfile)
 			{
-				struct stat statbuf;
-				bool		sendtblspclinks = true;
-
-				bbsink_begin_archive(sink, "base.tar");
-
-				/* In the main tar, include the backup_label first... */
-				sendFileWithContent(sink, BACKUP_LABEL_FILE, labelfile->data,
+				sendFileWithContent(sink, TABLESPACE_MAP, tblspc_map_file->data,
 									&manifest);
-
-				/* Then the tablespace_map file, if required... */
-				if (opt->sendtblspcmapfile)
-				{
-					sendFileWithContent(sink, TABLESPACE_MAP, tblspc_map_file->data,
-										&manifest);
-					sendtblspclinks = false;
-				}
-
-				/* Then the bulk of the files... */
-				sendDir(sink, ".", 1, false, state.tablespaces,
-						sendtblspclinks, &manifest, NULL);
-
-				/* ... and pg_control after everything else. */
-				if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0)
-					ereport(ERROR,
-							(errcode_for_file_access(),
-							 errmsg("could not stat file \"%s\": %m",
-									XLOG_CONTROL_FILE)));
-				sendFile(sink, XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf,
-						 false, InvalidOid, &manifest, NULL);
+				sendtblspclinks = false;
 			}
-			else
-			{
-				char	   *archive_name = psprintf("%s.tar", ti->oid);
 
-				bbsink_begin_archive(sink, archive_name);
-
-				sendTablespace(sink, ti->path, ti->oid, false, &manifest);
-			}
+			/* Then the bulk of the files... */
+			sendDir(sink, ".", 1, false, state.tablespaces,
+					sendtblspclinks, &manifest, NULL);
+
+			/* ... and pg_control after everything else. */
+			if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0)
+				ereport(ERROR,
+						(errcode_for_file_access(),
+						 errmsg("could not stat file \"%s\": %m",
+								XLOG_CONTROL_FILE)));
+			sendFile(sink, XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf,
+					 false, InvalidOid, &manifest, NULL);
+		}
+		else
+		{
+			char	   *archive_name = psprintf("%s.tar", ti->oid);
 
-			/*
-			 * If we're including WAL, and this is the main data directory we
-			 * don't treat this as the end of the tablespace. Instead, we will
-			 * include the xlog files below and stop afterwards. This is safe
-			 * since the main data directory is always sent *last*.
-			 */
-			if (opt->includewal && ti->path == NULL)
-			{
-				Assert(lnext(state.tablespaces, lc) == NULL);
-			}
-			else
-			{
-				/* Properly terminate the tarfile. */
-				StaticAssertStmt(2 * TAR_BLOCK_SIZE <= BLCKSZ,
-								 "BLCKSZ too small for 2 tar blocks");
-				memset(sink->bbs_buffer, 0, 2 * TAR_BLOCK_SIZE);
-				bbsink_archive_contents(sink, 2 * TAR_BLOCK_SIZE);
+			bbsink_begin_archive(sink, archive_name);
 
-				/* OK, that's the end of the archive. */
-				bbsink_end_archive(sink);
-			}
+			sendTablespace(sink, ti->path, ti->oid, false, &manifest);
 		}
 
-		basebackup_progress_wait_wal_archive(&state);
-		endptr = do_pg_backup_stop(labelfile->data, !opt->nowait, &endtli);
+		/*
+		 * If we're including WAL, and this is the main data directory we
+		 * don't treat this as the end of the tablespace. Instead, we will
+		 * include the xlog files below and stop afterwards. This is safe
+		 * since the main data directory is always sent *last*.
+		 */
+		if (opt->includewal && ti->path == NULL)
+		{
+			Assert(lnext(state.tablespaces, lc) == NULL);
+		}
+		else
+		{
+			/* Properly terminate the tarfile. */
+			StaticAssertStmt(2 * TAR_BLOCK_SIZE <= BLCKSZ,
+							 "BLCKSZ too small for 2 tar blocks");
+			memset(sink->bbs_buffer, 0, 2 * TAR_BLOCK_SIZE);
+			bbsink_archive_contents(sink, 2 * TAR_BLOCK_SIZE);
+
+			/* OK, that's the end of the archive. */
+			bbsink_end_archive(sink);
+		}
 	}
 
+	basebackup_progress_wait_wal_archive(&state);
+	endptr = do_pg_backup_stop(labelfile->data, !opt->nowait, &endtli);
+
 	if (opt->includewal)
 	{
 		/*
-- 
2.31.1

Reply via email to