Hello, hackers!

Recently I've faced an issue when playing with pg_controldata.
I made some rubbish pg_control files and caught a SIGSEGV.
Then I tried fuzzing REL_17_STABLE pg_controldata with AFL++ and got 7 crash 
cases.
Also, it is not necessary to use fuzzing. You can simply generate
any pg_control by 32-bit initdb and give it to the 64-bit pg_controldata.

SIGSEGV was caused by two places in pg_controldata.c where there
is a work with localtime(&time_tmp));. So I added a check for not NULL.

The reason is such:
The 64-bit ControlFileData has gaps for alignment after DBState,
but the 32-bit ControlFileData does not have such alignment,
so the zero bytes from the gap are read in pg_time_t time (which is actually a 
long long time):

(gdb output)

64-bit
type = struct ControlFileData {

/* 0   | 8 */ uint64 system_identifier;
/* 8   | 4 */ uint32 pg_control_version;
/* 12 | 4 */ uint32 catalog_version_no;
/* 16 | 4 */ DBState state;
/* XXX 4-byte hole */
/* 24 | 8 */ pg_time_t time;

32 bit
type = struct ControlFileData {
/* 0     | 8 */ uint64 system_identifier;
/* 8     | 4 */ uint32 pg_control_version;
/* 12   | 4 */ uint32 catalog_version_no;
/* 16   | 4 */ DBState state;
/* 20   | 8 */ pg_time_t time;
/* 28   | 8 */ XLogRecPtr checkPoint;
/* 36   | 76 */ CheckPoint checkPointCopy;
/* 112 | 8 */ XLogRecPtr unloggedLSN;
/* 120 | 8 */ XLogRecPtr minRecoveryPoint;
/* 128 | 4 */ TimeLineID minRecoveryPointTLI;
/* 132 | 8 */ XLogRecPtr backupStartPoint;
/* 140 | 8 */ XLogRecPtr backupEndPoint;
/* 148 | 1 */ _Bool backupEndRequired;
/* XXX 3-byte hole */

The other problem is FPE, caused by WalSegSz in pg_controldata.c.
For some reason this variable is signed, it is signed everywhere.
And when pg_control is not valid it can get a negative value, that
then causes an FPE. I changed it to unsigned in my patches.
This raised a question. What is the rationale behind using a signed
type for WAL segment size. Is there a historical or technical reason for this 
choice?

I’ve implemented fixes tailored to the specific contexts of the affected 
versions:


For REL_16 and later:

These versions utilize a function XLogFileName. So this raises a question as 
changing wal_segsz_bytes
argument to unsigned seems logical. But it will be unsigned only here. Maybe it 
is better to change it
in the other code?


For REL_12 to REL_15:

In these older versions, there is a macro in 
./src/include/access/xlog_internal.h
#define XLogFileName(fname, tli, logSegNo, wal_segsz_bytes)     \
        snprintf(fname, MAXFNAMELEN, "%08X%08X%08X", tli,               \
                         (uint32) ((logSegNo) / 
XLogSegmentsPerXLogId(wal_segsz_bytes)), \
                         (uint32) ((logSegNo) % 
XLogSegmentsPerXLogId(wal_segsz_bytes)))

Where casting second operand in % (XLogSegmentsPerXLogId(wal_segsz_bytes)) to 
unsigned seems enough. Would be glad to hear your thoughts.

The versions that contain the problem:
REL_12_STABLE,
REL_13_STABLE,
REL_14_STABLE,
REL_15_STABLE,
REL_16_STABLE,
REL_17_STABLE


Kind regards,
Ian Ilyasov.

Junior Software Developer at Postgres Professional
From 104a8f2f1bfe791b625752e25479b35b8f6bed23 Mon Sep 17 00:00:00 2001
From: Ian Ilyasov <ianilya...@outlook.com>
Date: Tue, 29 Oct 2024 17:59:29 +0300
Subject: [PATCH] fix possible SIGSEGV situations on rubbish
 pg_control

uint32 for WalSegSz is crucial as xlog_seg_size in pg_control.h
is also uint32 and there could be situation when WalSegSz gets
a negative value and pg_controldata triggers FPE (division by zero):

((0x100000000UL) / (WalSegSz)) can turn into zero in

XLogFileName(xlogfilename, ControlFile->checkPointCopy.ThisTimeLineID,
				 segno, WalSegSz);

because if WalSegSz is -1 then by arithmetic rules in C we get
0x100000000UL / 0xFFFFFFFFFFFFFFFFUL == 0.
---
 src/bin/pg_controldata/pg_controldata.c | 30 +++++++++++++++++++------
 src/include/access/xlog_internal.h      |  2 +-
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index 93a05d80ca7..bf759c050d5 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -97,6 +97,7 @@ main(int argc, char *argv[])
 	bool		crc_ok;
 	char	   *DataDir = NULL;
 	time_t		time_tmp;
+	struct tm  *tm_tmp;
 	char		pgctime_str[128];
 	char		ckpttime_str[128];
 	char		mock_auth_nonce_str[MOCK_AUTH_NONCE_LEN * 2 + 1];
@@ -105,7 +106,7 @@ main(int argc, char *argv[])
 	char		xlogfilename[MAXFNAMELEN];
 	int			c;
 	int			i;
-	int			WalSegSz;
+	uint32			WalSegSz;
 
 	pg_logging_init(argv[0]);
 	set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_controldata"));
@@ -178,8 +179,8 @@ main(int argc, char *argv[])
 
 	if (!IsValidWalSegSize(WalSegSz))
 	{
-		pg_log_warning(ngettext("invalid WAL segment size in control file (%d byte)",
-								"invalid WAL segment size in control file (%d bytes)",
+		pg_log_warning(ngettext("invalid WAL segment size in control file (%u byte)",
+								"invalid WAL segment size in control file (%u bytes)",
 								WalSegSz),
 					   WalSegSz);
 		pg_log_warning_detail("The WAL segment size must be a power of two between 1 MB and 1 GB.");
@@ -196,11 +197,26 @@ main(int argc, char *argv[])
 	 * about %c
 	 */
 	time_tmp = (time_t) ControlFile->time;
-	strftime(pgctime_str, sizeof(pgctime_str), strftime_fmt,
-			 localtime(&time_tmp));
+	tm_tmp = localtime(&time_tmp);
+
+	if (tm_tmp != NULL)
+	{
+		strftime(pgctime_str, sizeof(pgctime_str), strftime_fmt,
+			 tm_tmp);
+	} else {
+		snprintf(pgctime_str, sizeof(pgctime_str), "(corrupted timestamp)");
+	}
+
 	time_tmp = (time_t) ControlFile->checkPointCopy.time;
-	strftime(ckpttime_str, sizeof(ckpttime_str), strftime_fmt,
-			 localtime(&time_tmp));
+	tm_tmp = localtime(&time_tmp);
+
+	if (tm_tmp != NULL)
+	{
+		strftime(ckpttime_str, sizeof(ckpttime_str), strftime_fmt,
+			 tm_tmp);
+	} else {
+		snprintf(ckpttime_str, sizeof(ckpttime_str), "(corrupted timestamp)");
+	}
 
 	/*
 	 * Calculate name of the WAL file containing the latest checkpoint's REDO
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index c6a91fb4560..5ecbbba71dd 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -163,7 +163,7 @@ typedef XLogLongPageHeaderData *XLogLongPageHeader;
  * function allocating the result generated.
  */
 static inline void
-XLogFileName(char *fname, TimeLineID tli, XLogSegNo logSegNo, int wal_segsz_bytes)
+XLogFileName(char *fname, TimeLineID tli, XLogSegNo logSegNo, unsigned int wal_segsz_bytes)
 {
 	snprintf(fname, MAXFNAMELEN, "%08X%08X%08X", tli,
 			 (uint32) (logSegNo / XLogSegmentsPerXLogId(wal_segsz_bytes)),
-- 
2.39.5

From 0f1a78411a195c1c51cabb158ca1b58858a12d99 Mon Sep 17 00:00:00 2001
From: Ian Ilyasov <ianilya...@outlook.com>
Date: Tue, 29 Oct 2024 17:59:29 +0300
Subject: [PATCH] fix possible SIGSEGV situations on rubbish
 pg_control

uint32 for WalSegSz is crucial as xlog_seg_size in pg_control.h
is also uint32 and there could be situation when WalSegSz gets
a negative value and pg_controldata triggers FPE (division by zero):

((0x100000000UL) / (WalSegSz)) can turn into zero in

XLogFileName(xlogfilename, ControlFile->checkPointCopy.ThisTimeLineID,
				 segno, WalSegSz);

because if WalSegSz is -1 then by arithmetic rules in C we get
0x100000000UL / 0xFFFFFFFFFFFFFFFFUL == 0.
---
 src/bin/pg_controldata/pg_controldata.c | 28 ++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index e73639df744..1b7cbb03167 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -97,6 +97,7 @@ main(int argc, char *argv[])
 	bool		crc_ok;
 	char	   *DataDir = NULL;
 	time_t		time_tmp;
+	struct tm  *tm_tmp;
 	char		pgctime_str[128];
 	char		ckpttime_str[128];
 	char		mock_auth_nonce_str[MOCK_AUTH_NONCE_LEN * 2 + 1];
@@ -105,7 +106,7 @@ main(int argc, char *argv[])
 	char		xlogfilename[MAXFNAMELEN];
 	int			c;
 	int			i;
-	int			WalSegSz;
+	uint32			WalSegSz;
 
 	pg_logging_init(argv[0]);
 	set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_controldata"));
@@ -177,10 +178,10 @@ main(int argc, char *argv[])
 	if (!IsValidWalSegSize(WalSegSz))
 	{
 		printf(_("WARNING: invalid WAL segment size\n"));
-		printf(ngettext("The WAL segment size stored in the file, %d byte, is not a power of two\n"
+		printf(ngettext("The WAL segment size stored in the file, %u byte, is not a power of two\n"
 						"between 1 MB and 1 GB.  The file is corrupt and the results below are\n"
 						"untrustworthy.\n\n",
-						"The WAL segment size stored in the file, %d bytes, is not a power of two\n"
+						"The WAL segment size stored in the file, %u bytes, is not a power of two\n"
 						"between 1 MB and 1 GB.  The file is corrupt and the results below are\n"
 						"untrustworthy.\n\n",
 						WalSegSz),
@@ -197,11 +198,28 @@ main(int argc, char *argv[])
 	 * about %c
 	 */
 	time_tmp = (time_t) ControlFile->time;
+	tm_tmp = localtime(&time_tmp);
+
+	if (tm_tmp == NULL)
+	{
+		pg_log_error("timestamp is corrupted");
+		exit(1);
+	}
+
 	strftime(pgctime_str, sizeof(pgctime_str), strftime_fmt,
-			 localtime(&time_tmp));
+			 tm_tmp);
+
 	time_tmp = (time_t) ControlFile->checkPointCopy.time;
+	tm_tmp = localtime(&time_tmp);
+
+	if (tm_tmp == NULL)
+	{
+		pg_log_error("timestamp is corrupted");
+		exit(1);
+	}
+
 	strftime(ckpttime_str, sizeof(ckpttime_str), strftime_fmt,
-			 localtime(&time_tmp));
+			 tm_tmp);
 
 	/*
 	 * Calculate name of the WAL file containing the latest checkpoint's REDO
-- 
2.39.5

Reply via email to