From 0fecbea4fc4c49a9f8657005095e1cab081451be Mon Sep 17 00:00:00 2001
From: Sami Imseih <simseih@amazon.com>
Date: Fri, 9 Aug 2024 18:02:12 -0500
Subject: [PATCH v9 1/1] vacuum_delay with absolute time nanosleep

---
 src/backend/commands/vacuum.c        |  2 +-
 src/backend/port/win32/signal.c      | 10 +++++++
 src/include/port.h                   |  1 +
 src/include/portability/instr_time.h | 10 +++++++
 src/port/pgsleep.c                   | 43 ++++++++++++++++++++++++++++
 5 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 48f8eab202..43333c4698 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -2384,7 +2384,7 @@ vacuum_delay_point(void)
 			msec = vacuum_cost_delay * 4;
 
 		pgstat_report_wait_start(WAIT_EVENT_VACUUM_DELAY);
-		pg_usleep(msec * 1000);
+		pg_usleep_non_interruptible(msec * 1000);
 		pgstat_report_wait_end();
 
 		/*
diff --git a/src/backend/port/win32/signal.c b/src/backend/port/win32/signal.c
index 285cb611b4..edcb181215 100644
--- a/src/backend/port/win32/signal.c
+++ b/src/backend/port/win32/signal.c
@@ -73,6 +73,16 @@ pg_usleep(long microsec)
 	}
 }
 
+/*
+ * pg_usleep_non_interruptible --- delay the specified number of microseconds.
+ * The delay is rounded to the nearest millisecond, with a 1 ms minimum.
+ * Unlike pg_usleep, this uses a non-alertable SleepEx() (bAlertable = FALSE).
+ */
+void
+pg_usleep_non_interruptible(long microsec)
+{
+	SleepEx((microsec < 500 ? 1 : (microsec + 500) / 1000), FALSE);
+}
 
 /* Initialization */
 void
diff --git a/src/include/port.h b/src/include/port.h
index c740005267..c8ff23e5ee 100644
--- a/src/include/port.h
+++ b/src/include/port.h
@@ -162,6 +162,7 @@ extern int	pg_disable_aslr(void);
 
 /* Portable delay handling */
 extern void pg_usleep(long microsec);
+extern void pg_usleep_non_interruptible(long microsec);
 
 /* Portable SQL-like case-independent comparisons and conversions */
 extern int	pg_strcasecmp(const char *s1, const char *s2);
diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h
index e66ecf34cd..6e4b0f1b17 100644
--- a/src/include/portability/instr_time.h
+++ b/src/include/portability/instr_time.h
@@ -36,6 +36,10 @@
  *
  * INSTR_TIME_GET_NANOSEC(t)		convert t to int64 (in nanoseconds)
  *
+ * INSTR_TIME_ADD_MICROSEC(x,t)		add t (in microseconds) to x
+ *
+ * INSTR_TIME_IS_GREATER(x,y)		is x greater than y?
+ *
  * Note that INSTR_TIME_SUBTRACT and INSTR_TIME_ACCUM_DIFF convert
  * absolute times to intervals.  The INSTR_TIME_GET_xxx operations are
  * only useful on intervals.
@@ -194,4 +198,10 @@ GetTimerFrequency(void)
 #define INSTR_TIME_GET_MICROSEC(t) \
 	(INSTR_TIME_GET_NANOSEC(t) / NS_PER_US)
 
+#define INSTR_TIME_ADD_MICROSEC(x,t) \
+	((x).ticks += (t) * NS_PER_US)	/* NOTE(review): assumes ticks are ns; confirm */
+
+#define INSTR_TIME_IS_GREATER(x,y) \
+	((x).ticks > (y).ticks)		/* compares raw tick counts */
+
 #endif							/* INSTR_TIME_H */
diff --git a/src/port/pgsleep.c b/src/port/pgsleep.c
index 1284458bfc..9525156abc 100644
--- a/src/port/pgsleep.c
+++ b/src/port/pgsleep.c
@@ -14,6 +14,8 @@
 
 #include <time.h>
 
+#include "portability/instr_time.h"
+
 /*
  * In a Windows backend, we don't use this implementation, but rather
  * the signal-aware version in src/backend/port/win32/signal.c.
@@ -54,4 +56,45 @@ pg_usleep(long microsec)
 	}
 }
 
+/*
+ * pg_usleep_non_interruptible --- delay the specified number of microseconds.
+ *
+ * Unlike pg_usleep, this function resumes the delay after an interrupt.  In
+ * Windows frontend builds, where nanosleep() may be absent, it uses SleepEx().
+ */
+void
+pg_usleep_non_interruptible(long microsec)
+{
+#ifndef WIN32
+	/*
+	 * Retry with the time nanosleep() reports as remaining.  Frequent
+	 * interruptions can still make the sleep drift, so also stop once the
+	 * current time is past the expected end time.
+	 */
+	struct timespec delay;
+	struct timespec remain;
+	instr_time	end_time;
+
+	INSTR_TIME_SET_CURRENT(end_time);
+	INSTR_TIME_ADD_MICROSEC(end_time, microsec);
+
+	delay.tv_sec = microsec / 1000000L;
+	delay.tv_nsec = (microsec % 1000000L) * 1000;
+
+	while (nanosleep(&delay, &remain) == -1 && errno == EINTR)
+	{
+		instr_time	current_time;
+
+		INSTR_TIME_SET_CURRENT(current_time);
+		if (INSTR_TIME_IS_GREATER(current_time, end_time))
+			break;
+
+		delay = remain;
+	}
+#else
+	/* Non-alertable sleep; round to whole milliseconds, at least 1. */
+	SleepEx((microsec < 500 ? 1 : (microsec + 500) / 1000), FALSE);
+#endif
+}
+
 #endif							/* defined(FRONTEND) || !defined(WIN32) */
-- 
2.39.3 (Apple Git-146)

