On Thu, Sep 29, 2022 at 7:05 PM Robert Haas <robertmh...@gmail.com> wrote:
>
> On Thu, Sep 29, 2022 at 4:29 AM Bharath Rupireddy
> <bharath.rupireddyforpostg...@gmail.com> wrote:
> > Please review the attached v3 patch.
>
> template1=# select * from pg_backup_start('sdgkljsdgkjdsg', true);
>  pg_backup_start
> -----------------
>  0/2000028
> (1 row)
>
> template1=# select 1/0;
> ERROR:  division by zero
> template1=# select * from pg_backup_stop();
> server closed the connection unexpectedly
> This probably means the server terminated abnormally
> before or while processing the request.
> The connection to the server was lost. Attempting reset: Failed.
> The connection to the server was lost. Attempting reset: Failed.
> !?>

Thanks! I used a variable to define the scope to clean up the backup
memory context for SQL functions/on-line backup. We don't have this
problem in case of base backup because we don't give control in
between start and stop backup in perform_base_backup().

Please review the v4 patch.

--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
From f22c5e3afcdf2bf0d7bbcab9356f9ad70a7ec201 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Thu, 29 Sep 2022 16:31:59 +0000
Subject: [PATCH v4] Avoid memory leaks during backups

Postgres currently can leak memory if a failure occurs during base
backup in do_pg_backup_start() or do_pg_backup_stop() or
perform_base_backup(). The palloc'd memory such as backup_state or
tablespace_map in xlogfuncs.c or basebackup.c or tablespaceinfo in
perform_base_backup() or any other, is left-out which may cause
memory bloat on the server eventually. To experience this issue,
run pg_basebackup with --label name longer than 1024 characters and
observe memory with watch command, the memory usage goes up. It looks
like the memory leak issue has been there for quite some time.

To solve the above problem, we create separate memory context for
backup and add clean up callback in PostgresMain()'s error handling
code using sigsetjmp().

The design of the patch is as follows:

For backup functions or on-line backups, a new memory context is
created as a child of TopMemoryContext in pg_backup_start() which
gets deleted upon any error or at the end of pg_backup_stop().

For perform_base_backup() or base backups, a new memory context is
created as a child of CurrentMemoryContext which gets deleted upon
any error or at the end of perform_base_backup().
---
 src/backend/access/transam/xlogfuncs.c | 103 +++++++++++++++++++------
 src/backend/backup/basebackup.c        |  40 ++++++++++
 src/backend/replication/walsender.c    |   2 +
 src/backend/tcop/postgres.c            |   4 +
 src/include/access/xlogbackup.h        |   1 +
 src/include/backup/basebackup.h        |   1 +
 6 files changed, 127 insertions(+), 24 deletions(-)

diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c
index a801a94fe8..985be59ca8 100644
--- a/src/backend/access/transam/xlogfuncs.c
+++ b/src/backend/access/transam/xlogfuncs.c
@@ -45,6 +45,30 @@
 static BackupState *backup_state = NULL;
 static StringInfo tablespace_map = NULL;
 
+/* A workspace for on-line backup. */
+static MemoryContext backupcontext = NULL;
+
+/*
+ * A session-level variable to define the scope of backup memory context clean
+ * up.
+ */
+static bool allowedToCleanBackupMemCxt = false;
+
+/*
+ * Clean up backup memory context we created.
+ */
+void
+BackupMemoryContextCleanup(void)
+{
+	if (backupcontext == NULL || allowedToCleanBackupMemCxt == false)
+		return;		/* Nothing to do. */
+
+	MemoryContextDelete(backupcontext);
+	backupcontext = NULL;
+	backup_state = NULL;
+	tablespace_map = NULL;
+}
+
 /*
  * pg_backup_start: set up for taking an on-line backup dump
  *
@@ -71,34 +95,45 @@ pg_backup_start(PG_FUNCTION_ARGS)
 				 errmsg("a backup is already in progress in this session")));
 
 	/*
-	 * backup_state and tablespace_map need to be long-lived as they are used
-	 * in pg_backup_stop().
+	 * We're okay to clean backup memory context while an error occurs within
+	 * this function.
 	 */
-	oldcontext = MemoryContextSwitchTo(TopMemoryContext);
+	allowedToCleanBackupMemCxt = true;
 
-	/* Allocate backup state or reset it, if it comes from a previous run */
-	if (backup_state == NULL)
-		backup_state = (BackupState *) palloc0(sizeof(BackupState));
-	else
-		MemSet(backup_state, 0, sizeof(BackupState));
+	/*
+	 * backup_state and tablespace_map need to be long-lived as they are used
+	 * in pg_backup_stop(). Create a special memory context as a direct child
+	 * of TopMemoryContext so that the memory allocated is carried till the
+	 * pg_backup_stop() and cleaned in case of errors (see error handling code
+	 * using sigsetjmp() in PostgresMain()).
+	 */
+	if (backupcontext == NULL)
+		backupcontext = AllocSetContextCreate(TopMemoryContext,
+											  "on-line backup context",
+											  ALLOCSET_DEFAULT_SIZES);
 
 	/*
-	 * tablespace_map may have been created in a previous backup, so take this
-	 * occasion to clean it.
+	 * We switch to backup memory context to clean up the allocated memory even
+	 * if an error occurs.
 	 */
-	if (tablespace_map != NULL)
-	{
-		pfree(tablespace_map->data);
-		pfree(tablespace_map);
-		tablespace_map = NULL;
-	}
+	oldcontext = MemoryContextSwitchTo(backupcontext);
 
+	Assert(backup_state == NULL);
+	Assert(tablespace_map == NULL);
+	backup_state = (BackupState *) palloc0(sizeof(BackupState));
 	tablespace_map = makeStringInfo();
-	MemoryContextSwitchTo(oldcontext);
 
 	register_persistent_abort_backup_handler();
 	do_pg_backup_start(backupidstr, fast, NULL, backup_state, tablespace_map);
 
+	MemoryContextSwitchTo(oldcontext);
+
+	/*
+	 * We're not okay to clean backup memory context while an error occurs
+	 * after this function until we reach pg_backup_stop().
+	 */
+	allowedToCleanBackupMemCxt = false;
+
 	PG_RETURN_LSN(backup_state->startpoint);
 }
 
@@ -132,6 +167,7 @@ pg_backup_stop(PG_FUNCTION_ARGS)
 	bool		waitforarchive = PG_GETARG_BOOL(0);
 	char	   *backup_label;
 	SessionBackupState status = get_backup_status();
+	MemoryContext oldcontext;
 
 	/* Initialize attributes information in the tuple descriptor */
 	if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
@@ -143,8 +179,21 @@ pg_backup_stop(PG_FUNCTION_ARGS)
 				 errmsg("backup is not in progress"),
 				 errhint("Did you call pg_backup_start()?")));
 
+	/*
+	 * We're okay to clean backup memory context while an error occurs within
+	 * this function.
+	 */
+	allowedToCleanBackupMemCxt = true;
+
 	Assert(backup_state != NULL);
 	Assert(tablespace_map != NULL);
+	Assert(backupcontext != NULL);
+
+	/*
+	 * We switch to backup memory context to clean up the allocated memory even
+	 * if an error occurs.
+	 */
+	oldcontext = MemoryContextSwitchTo(backupcontext);
 
 	/* Stop the backup */
 	do_pg_backup_stop(backup_state, waitforarchive);
@@ -152,17 +201,23 @@ pg_backup_stop(PG_FUNCTION_ARGS)
 	/* Build the contents of backup_label */
 	backup_label = build_backup_content(backup_state, false);
 
+	/*
+	 * We switch back from backup memory context so that the results are
+	 * created in right memory context.
+	 */
+	MemoryContextSwitchTo(oldcontext);
+
 	values[0] = LSNGetDatum(backup_state->stoppoint);
 	values[1] = CStringGetTextDatum(backup_label);
 	values[2] = CStringGetTextDatum(tablespace_map->data);
 
-	/* Deallocate backup-related variables */
-	pfree(backup_state);
-	backup_state = NULL;
-	pfree(tablespace_map->data);
-	pfree(tablespace_map);
-	tablespace_map = NULL;
-	pfree(backup_label);
+	BackupMemoryContextCleanup();
+
+	/*
+	 * Reset session-level variable for further backups that may happen within
+	 * this session.
+	 */
+	allowedToCleanBackupMemCxt = false;
 
 	/* Returns the record as Datum */
 	PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c
index e252ad7421..fe07a3dc8f 100644
--- a/src/backend/backup/basebackup.c
+++ b/src/backend/backup/basebackup.c
@@ -42,6 +42,7 @@
 #include "storage/reinit.h"
 #include "utils/builtins.h"
 #include "utils/guc.h"
+#include "utils/memutils.h"
 #include "utils/ps_status.h"
 #include "utils/relcache.h"
 #include "utils/resowner.h"
@@ -75,6 +76,9 @@ typedef struct
 	pg_checksum_type manifest_checksum_type;
 } basebackup_options;
 
+/* A workspace for base backup. */
+static MemoryContext backupcontext = NULL;
+
 static int64 sendTablespace(bbsink *sink, char *path, char *spcoid, bool sizeonly,
 							struct backup_manifest_info *manifest);
 static int64 sendDir(bbsink *sink, const char *path, int basepathlen, bool sizeonly,
@@ -220,6 +224,19 @@ static const struct exclude_list_item noChecksumFiles[] = {
 	{NULL, false}
 };
 
+/*
+ * Clean up base backup memory context we created.
+ */
+void
+BaseBackupMemoryContextCleanup(void)
+{
+	if (backupcontext == NULL)
+		return;		/* Nothing to do. */
+
+	MemoryContextDelete(backupcontext);
+	backupcontext = NULL;
+}
+
 /*
  * Actually do a base backup for the specified tablespaces.
  *
@@ -235,6 +252,7 @@ perform_base_backup(basebackup_options *opt, bbsink *sink)
 	backup_manifest_info manifest;
 	BackupState *backup_state;
 	StringInfo	tablespace_map;
+	MemoryContext oldcontext;
 
 	/* Initial backup state, insofar as we know it now. */
 	state.tablespaces = NIL;
@@ -252,6 +270,23 @@ perform_base_backup(basebackup_options *opt, bbsink *sink)
 	InitializeBackupManifest(&manifest, opt->manifest,
 							 opt->manifest_checksum_type);
 
+	/*
+	 * Perform the base backup in a special memory context to not leak any
+	 * memory in case of errors (see error handling code using sigsetjmp() in
+	 * PostgresMain()).
+	 */
+	if (backupcontext == NULL)
+		backupcontext = AllocSetContextCreate(CurrentMemoryContext,
+											  "base backup context",
+											  ALLOCSET_DEFAULT_SIZES);
+
+	/*
+	 * We switch to base backup memory context only after
+	 * InitializeBackupManifest() as it uses BufFile which needs to be in the
+	 * ResourceOwner's memory context.
+	 */
+	oldcontext = MemoryContextSwitchTo(backupcontext);
+
 	total_checksum_failures = 0;
 
 	/* Allocate backup related varilables. */
@@ -660,6 +695,11 @@ perform_base_backup(basebackup_options *opt, bbsink *sink)
 	 */
 	FreeBackupManifest(&manifest);
 
+	MemoryContextSwitchTo(oldcontext);
+
+	/* clean up the base backup memory context we created */
+	BaseBackupMemoryContextCleanup();
+
 	/* clean up the resource owner we created */
 	WalSndResourceCleanup(true);
 
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index e9ba500a15..c58091253e 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -328,6 +328,8 @@ WalSndErrorCleanup(void)
 
 	replication_active = false;
 
+	BaseBackupMemoryContextCleanup();
+
 	/*
 	 * If there is a transaction in progress, it will clean up our
 	 * ResourceOwner, but if a replication command set up a resource owner
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 5352d5f4c6..1ef345dd9d 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -31,6 +31,7 @@
 #include "access/parallel.h"
 #include "access/printtup.h"
 #include "access/xact.h"
+#include "access/xlogbackup.h"
 #include "catalog/pg_type.h"
 #include "commands/async.h"
 #include "commands/prepare.h"
@@ -4277,6 +4278,9 @@ PostgresMain(const char *dbname, const char *username)
 		 */
 		AbortCurrentTransaction();
 
+		/* Clean up the backup memory context if created */
+		BackupMemoryContextCleanup();
+
 		if (am_walsender)
 			WalSndErrorCleanup();
 
diff --git a/src/include/access/xlogbackup.h b/src/include/access/xlogbackup.h
index 8ec3d88b0a..1f88511d61 100644
--- a/src/include/access/xlogbackup.h
+++ b/src/include/access/xlogbackup.h
@@ -37,5 +37,6 @@ typedef struct BackupState
 
 extern char *build_backup_content(BackupState *state,
 								  bool ishistoryfile);
+extern void BackupMemoryContextCleanup(void);
 
 #endif							/* XLOG_BACKUP_H */
diff --git a/src/include/backup/basebackup.h b/src/include/backup/basebackup.h
index 593479afdc..33ef95ada8 100644
--- a/src/include/backup/basebackup.h
+++ b/src/include/backup/basebackup.h
@@ -35,5 +35,6 @@ typedef struct
 } tablespaceinfo;
 
 extern void SendBaseBackup(BaseBackupCmd *cmd);
+extern void BaseBackupMemoryContextCleanup(void);
 
 #endif							/* _BASEBACKUP_H */
-- 
2.34.1

Reply via email to