On Sat, Mar 30, 2019 at 09:07:41PM +0100, Michael Banck wrote:
> The way you've now changed this is that there's a function call into
> progress_report() for every block that's being read, even if there is no
> progress reporting requested. That looks like a pretty severe
> performance problem so I suggest to at least stick with checking
> showprogress before calling progress_report() and not the other way
> round.
>
> So my vote is in favour of only calling progress_report() every once in
> a while - I saw quite a speedup (or removal of slowdown) due to this in
> my tests, this was not just some unwarranted microoptimization.

Do you have some runtime numbers?  If all the pages are in the OS
cache you should be able to see more impact.  I have been testing with
a pgbench database at scale 300 and --check, and perf is showing me
that progress_report counts for 0.10% with or without the option of
the profile while pg_checksum_page() counts for 36%.  Switching the
switch in or out of it makes the performance change lost in the noise.
I'd rather keep the check for showprogress within progress_report() so
as extra callers don't miss bypassing the report if --progress is not
enabled, still we are talking about only one caller now so the
attached does it the other way around with an assertion.

>> +     fprintf(stderr, "\r");
>
> I think the isatty() check that was in our versions of the patch is
> useful here, did you check how this looks when piping the output to a
> file?

Oops, fixed.  The output was strange.

> This hunk is from the performance optimization of calling
> progress_report only every 1024 blocks and could be removed if we stick
> with calling it for every block anyway (but see above).

Indeed, removed this one.

>> -static void
>> -scan_directory(const char *basedir, const char *subdir)
>> +/*
>> + * Scan the given directory for items which can be checksummed and
>> + * operate on each one of them.  If "sizeonly" is true, the size of
>> + * all the items which have checksums is computed and returned back
> 
> "computed" is maybe a bit strong a word here, how about "added up"?

"computed" sounds fine to me.
--
Michael
diff --git a/doc/src/sgml/ref/pg_checksums.sgml b/doc/src/sgml/ref/pg_checksums.sgml
index 70339eaec9..29507ff98e 100644
--- a/doc/src/sgml/ref/pg_checksums.sgml
+++ b/doc/src/sgml/ref/pg_checksums.sgml
@@ -135,6 +135,17 @@ PostgreSQL documentation
       </listitem>
      </varlistentry>
 
+     <varlistentry>
+      <term><option>-P</option></term>
+      <term><option>--progress</option></term>
+      <listitem>
+       <para>
+        Enable progress reporting. Turning this on will deliver a progress
+        report while checking or enabling checksums.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry>
        <term><option>-V</option></term>
        <term><option>--version</option></term>
diff --git a/src/bin/pg_checksums/pg_checksums.c b/src/bin/pg_checksums/pg_checksums.c
index 5c185ebed8..c85e7febed 100644
--- a/src/bin/pg_checksums/pg_checksums.c
+++ b/src/bin/pg_checksums/pg_checksums.c
@@ -15,6 +15,7 @@
 #include "postgres_fe.h"
 
 #include <dirent.h>
+#include <time.h>
 #include <sys/stat.h>
 #include <unistd.h>
 
@@ -37,6 +38,7 @@ static ControlFileData *ControlFile;
 static char *only_relfilenode = NULL;
 static bool do_sync = true;
 static bool verbose = false;
+static bool showprogress = false;
 
 typedef enum
 {
@@ -59,6 +61,13 @@ static PgChecksumMode mode = PG_MODE_CHECK;
 
 static const char *progname;
 
+/*
+ * Progress status information.
+ */
+int64		total_size = 0;
+int64		current_size = 0;
+static pg_time_t last_progress_report = 0;
+
 static void
 usage(void)
 {
@@ -71,6 +80,7 @@ usage(void)
 	printf(_("  -d, --disable          disable data checksums\n"));
 	printf(_("  -e, --enable           enable data checksums\n"));
 	printf(_("  -N, --no-sync          do not wait for changes to be written safely to disk\n"));
+	printf(_("  -P, --progress         show progress information\n"));
 	printf(_("  -v, --verbose          output verbose messages\n"));
 	printf(_("  -r RELFILENODE         check only relation with specified relfilenode\n"));
 	printf(_("  -V, --version          output version information, then exit\n"));
@@ -97,6 +107,52 @@ static const char *const skip[] = {
 	NULL,
 };
 
+/*
+ * Report current progress status.  Parts borrowed from
+ * src/bin/pg_basebackup.c.
+ */
+static void
+progress_report(bool force)
+{
+	int			percent;
+	char		total_size_str[32];
+	char		current_size_str[32];
+	pg_time_t	now;
+
+	Assert(showprogress);
+
+	now = time(NULL);
+	if (now == last_progress_report && !force)
+		return;					/* Max once per second */
+
+	/* Save current time */
+	last_progress_report = now;
+
+	/* Adjust total size if current_size is larger */
+	if (current_size > total_size)
+		total_size = current_size;
+
+	/* Calculate current percentage of size done */
+	percent = total_size ? (int) ((current_size) * 100 / total_size) : 0;
+
+	snprintf(total_size_str, sizeof(total_size_str), INT64_FORMAT,
+			 total_size / (1024 * 1024));
+	snprintf(current_size_str, sizeof(current_size_str), INT64_FORMAT,
+			 current_size / (1024 * 1024));
+
+	/*
+	 * Separate step to keep platform-dependent format code out of
+	 * translatable strings.  And we only test for INT64_FORMAT availability
+	 * in snprintf, not fprintf.
+	 */
+	fprintf(stderr, "%*s/%s MB (%d%%) computed",
+			(int) strlen(current_size_str), current_size_str, total_size_str,
+			percent);
+
+	/* Stay on the same line if reporting to a terminal */
+	fprintf(stderr, isatty(fileno(stderr)) ? "\r" : "\n");
+}
+
 static bool
 skipfile(const char *fn)
 {
@@ -153,6 +209,7 @@ scan_file(const char *fn, BlockNumber segmentno)
 			continue;
 
 		csum = pg_checksum_page(buf.data, blockno + segmentno * RELSEG_SIZE);
+		current_size += r;
 		if (mode == PG_MODE_CHECK)
 		{
 			if (csum != header->pd_checksum)
@@ -183,6 +240,9 @@ scan_file(const char *fn, BlockNumber segmentno)
 				exit(1);
 			}
 		}
+
+		if (showprogress)
+			progress_report(false);
 	}
 
 	if (verbose)
@@ -198,9 +258,17 @@ scan_file(const char *fn, BlockNumber segmentno)
 	close(f);
 }
 
-static void
-scan_directory(const char *basedir, const char *subdir)
+/*
+ * Scan the given directory for items which can be checksummed and
+ * operate on each one of them.  If "sizeonly" is true, the size of
+ * all the items which have checksums is computed and returned back
+ * to the caller without operating on the files.  This is used to compile
+ * the total size of the data directory for progress reports.
+ */
+static int64
+scan_directory(const char *basedir, const char *subdir, bool sizeonly)
 {
+	int64		dirsize = 0;
 	char		path[MAXPGPATH];
 	DIR		   *dir;
 	struct dirent *de;
@@ -279,16 +347,24 @@ scan_directory(const char *basedir, const char *subdir)
 				/* Relfilenode not to be included */
 				continue;
 
-			scan_file(fn, segmentno);
+			dirsize += st.st_size;
+
+			/*
+			 * No need to work on the file when calculating only the size
+			 * of the items in the data folder.
+			 */
+			if (!sizeonly)
+				scan_file(fn, segmentno);
 		}
 #ifndef WIN32
 		else if (S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode))
 #else
 		else if (S_ISDIR(st.st_mode) || pgwin32_is_junction(fn))
 #endif
-			scan_directory(path, de->d_name);
+			dirsize += scan_directory(path, de->d_name, sizeonly);
 	}
 	closedir(dir);
+	return dirsize;
 }
 
 int
@@ -300,6 +376,7 @@ main(int argc, char *argv[])
 		{"disable", no_argument, NULL, 'd'},
 		{"enable", no_argument, NULL, 'e'},
 		{"no-sync", no_argument, NULL, 'N'},
+		{"progress", no_argument, NULL, 'P'},
 		{"verbose", no_argument, NULL, 'v'},
 		{NULL, 0, NULL, 0}
 	};
@@ -327,7 +404,7 @@ main(int argc, char *argv[])
 		}
 	}
 
-	while ((c = getopt_long(argc, argv, "cD:deNr:v", long_options, &option_index)) != -1)
+	while ((c = getopt_long(argc, argv, "cD:deNPr:v", long_options, &option_index)) != -1)
 	{
 		switch (c)
 		{
@@ -357,6 +434,9 @@ main(int argc, char *argv[])
 				}
 				only_relfilenode = pstrdup(optarg);
 				break;
+			case 'P':
+				showprogress = true;
+				break;
 			default:
 				fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
 				exit(1);
@@ -436,6 +516,18 @@ main(int argc, char *argv[])
 		exit(1);
 	}
 
+	/*
+	 * If progress status information is requested, we need to scan the
+	 * directory tree twice: once to know how much total data needs to
+	 * be processed and once to do the real work.
+	 */
+	if (showprogress)
+	{
+		total_size = scan_directory(DataDir, "global", true);
+		total_size += scan_directory(DataDir, "base", true);
+		total_size += scan_directory(DataDir, "pg_tblspc", true);
+	}
+
 	if (ControlFile->data_checksum_version == 0 &&
 		mode == PG_MODE_DISABLE)
 	{
@@ -453,9 +545,15 @@ main(int argc, char *argv[])
 	/* Operate on all files if checking or enabling checksums */
 	if (mode == PG_MODE_CHECK || mode == PG_MODE_ENABLE)
 	{
-		scan_directory(DataDir, "global");
-		scan_directory(DataDir, "base");
-		scan_directory(DataDir, "pg_tblspc");
+		(void) scan_directory(DataDir, "global", false);
+		(void) scan_directory(DataDir, "base", false);
+		(void) scan_directory(DataDir, "pg_tblspc", false);
+
+		if (showprogress)
+		{
+			progress_report(true);
+			fprintf(stderr, "\n");	/* Need to move to next line */
+		}
 
 		printf(_("Checksum operation completed\n"));
 		printf(_("Files scanned:  %s\n"), psprintf(INT64_FORMAT, files));

Attachment: signature.asc
Description: PGP signature

Reply via email to