I missed attaching the patches. -- David Geier (ServiceNow)
From f4e962729ca605498d0c8bfc97d0f42d68a0df06 Mon Sep 17 00:00:00 2001 From: David Geier <geidav...@gmail.com> Date: Thu, 17 Nov 2022 10:22:01 +0100 Subject: [PATCH 1/2] WIP: Change instr_time to just store nanoseconds, that's cheaper.
--- src/include/portability/instr_time.h | 62 ++++++++++++---------------- 1 file changed, 26 insertions(+), 36 deletions(-) diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index 22bcf3d288..4bd555113b 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -80,63 +80,53 @@ #define PG_INSTR_CLOCK CLOCK_REALTIME #endif -typedef struct timespec instr_time; +typedef int64 instr_time; +#define NS_PER_S INT64CONST(1000000000) +#define US_PER_S INT64CONST(1000000) +#define MS_PER_S INT64CONST(1000) -#define INSTR_TIME_IS_ZERO(t) ((t).tv_nsec == 0 && (t).tv_sec == 0) +#define NS_PER_MS INT64CONST(1000000) +#define NS_PER_US INT64CONST(1000) -#define INSTR_TIME_SET_ZERO(t) ((t).tv_sec = 0, (t).tv_nsec = 0) +#define INSTR_TIME_IS_ZERO(t) ((t) == 0) -#define INSTR_TIME_SET_CURRENT(t) ((void) clock_gettime(PG_INSTR_CLOCK, &(t))) +#define INSTR_TIME_SET_ZERO(t) ((t) = 0) + +static inline instr_time pg_clock_gettime_ns(void) +{ + struct timespec tmp; + + clock_gettime(PG_INSTR_CLOCK, &tmp); + + return tmp.tv_sec * NS_PER_S + tmp.tv_nsec; +} + +#define INSTR_TIME_SET_CURRENT(t) \ + (t) = pg_clock_gettime_ns() #define INSTR_TIME_ADD(x,y) \ do { \ - (x).tv_sec += (y).tv_sec; \ - (x).tv_nsec += (y).tv_nsec; \ - /* Normalize */ \ - while ((x).tv_nsec >= 1000000000) \ - { \ - (x).tv_nsec -= 1000000000; \ - (x).tv_sec++; \ - } \ + (x) += (y); \ } while (0) #define INSTR_TIME_SUBTRACT(x,y) \ do { \ - (x).tv_sec -= (y).tv_sec; \ - (x).tv_nsec -= (y).tv_nsec; \ - /* Normalize */ \ - while ((x).tv_nsec < 0) \ - { \ - (x).tv_nsec += 1000000000; \ - (x).tv_sec--; \ - } \ + (x) -= (y); \ } while (0) #define INSTR_TIME_ACCUM_DIFF(x,y,z) \ do { \ - (x).tv_sec += (y).tv_sec - (z).tv_sec; \ - (x).tv_nsec += (y).tv_nsec - (z).tv_nsec; \ - /* Normalize after each add to avoid overflow/underflow of tv_nsec */ \ - while ((x).tv_nsec < 0) \ - { \ - (x).tv_nsec += 1000000000; \ - (x).tv_sec--; \ - } \ - while ((x).tv_nsec >= 1000000000) \ - { \ - (x).tv_nsec -= 1000000000; \ - (x).tv_sec++; \ - } \ + (x) += (y) - (z); \ } while (0) #define INSTR_TIME_GET_DOUBLE(t) \ - (((double) (t).tv_sec) + ((double) (t).tv_nsec) / 1000000000.0) + ((double) (t) / NS_PER_S) #define INSTR_TIME_GET_MILLISEC(t) \ - (((double) (t).tv_sec * 1000.0) + ((double) (t).tv_nsec) / 1000000.0) + ((double) (t) / NS_PER_MS) #define INSTR_TIME_GET_MICROSEC(t) \ - (((uint64) (t).tv_sec * (uint64) 1000000) + (uint64) ((t).tv_nsec / 1000)) + ((double) (t) / NS_PER_US) #else /* WIN32 */ -- 2.34.1
From 7a6317fdf5b1d82f120a4fd5535cfe40c8165153 Mon Sep 17 00:00:00 2001 From: David Geier <geidav...@gmail.com> Date: Thu, 17 Nov 2022 13:03:59 +0100 Subject: [PATCH 2/2] WIP: Use cpu reference cycles, via rdtsc, to measure time for instrumentation. For now this is only enabled on Linux/x86 when the system clocksource is marked tsc as well, as determined at runtime. This way we can rely on the Linux kernel to make a decision whether tsc is invariant and usable on the current CPU architecture. In all other cases we continue to use the clock_gettime() implementation like before. Note that this intentionally uses rdtsc, not rdtscp, as rdtscp waits for currently running CPU instructions to have finished, and that adds up to noticable latency for little benefit in the typical InstrStartNode() / InstrStopNode() use case. --- src/backend/utils/init/postinit.c | 3 + src/bin/pg_test_timing/pg_test_timing.c | 1 + src/bin/pgbench/pgbench.c | 3 + src/bin/psql/startup.c | 4 + src/common/Makefile | 1 + src/common/instr_time.c | 103 ++++++++++++++++++++++++ src/include/portability/instr_time.h | 50 +++++++++--- src/tools/msvc/Mkvcbuild.pm | 2 +- 8 files changed, 157 insertions(+), 10 deletions(-) create mode 100644 src/common/instr_time.c diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index a990c833c5..c684725af3 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -804,6 +804,9 @@ InitPostgres(const char *in_dbname, Oid dboid, /* Initialize portal manager */ EnablePortalManager(); + /* initialize high-precision interval timing */ + INSTR_TIME_INITIALIZE(); + /* Initialize status reporting */ pgstat_beinit(); diff --git a/src/bin/pg_test_timing/pg_test_timing.c b/src/bin/pg_test_timing/pg_test_timing.c index c29d6f8762..0d667ff5a7 100644 --- a/src/bin/pg_test_timing/pg_test_timing.c +++ b/src/bin/pg_test_timing/pg_test_timing.c @@ -132,6 +132,7 @@ test_timing(unsigned int duration) total_time = duration > 0 ? duration * INT64CONST(1000000) : 0; + INSTR_TIME_INITIALIZE(); INSTR_TIME_SET_CURRENT(start_time); cur = INSTR_TIME_GET_MICROSEC(start_time); diff --git a/src/bin/pgbench/pgbench.c b/src/bin/pgbench/pgbench.c index 36905a8968..1c5a265863 100644 --- a/src/bin/pgbench/pgbench.c +++ b/src/bin/pgbench/pgbench.c @@ -7076,6 +7076,9 @@ main(int argc, char **argv) initRandomState(&state[i].cs_func_rs); } + /* initialize high-precision interval timing */ + INSTR_TIME_INITIALIZE(); + /* opening connection... */ con = doConnect(); if (con == NULL) diff --git a/src/bin/psql/startup.c b/src/bin/psql/startup.c index f5b9e268f2..14f368b658 100644 --- a/src/bin/psql/startup.c +++ b/src/bin/psql/startup.c @@ -24,6 +24,7 @@ #include "help.h" #include "input.h" #include "mainloop.h" +#include "portability/instr_time.h" #include "settings.h" /* @@ -322,6 +323,9 @@ main(int argc, char *argv[]) PQsetNoticeProcessor(pset.db, NoticeProcessor, NULL); + /* initialize high-precision interval timing */ + INSTR_TIME_INITIALIZE(); + SyncVariables(); if (options.list_dbs) diff --git a/src/common/Makefile b/src/common/Makefile index e9af7346c9..437a018590 100644 --- a/src/common/Makefile +++ b/src/common/Makefile @@ -59,6 +59,7 @@ OBJS_COMMON = \ file_perm.o \ file_utils.o \ hashfn.o \ + instr_time.o \ ip.o \ jsonapi.o \ keywords.o \ diff --git a/src/common/instr_time.c b/src/common/instr_time.c new file mode 100644 index 0000000000..27653a8bc3 --- /dev/null +++ b/src/common/instr_time.c @@ -0,0 +1,103 @@ +/*------------------------------------------------------------------------- + * + * instr_time.c + * Non-inline parts of the portable high-precision interval timing + * implementation + * + * Portions Copyright (c) 2022, PostgreSQL Global Development Group + * + * + * IDENTIFICATION + * src/backend/port/instr_time.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "portability/instr_time.h" + +/* + * Stores what the number of cycles needs to be multiplied with to end up with + * seconds. This indirection exists to support the rtdsc instruction. + * + * As a default, assume we are using clock_gettime() as a fallback and treat it + * as 1 "cycle" per nanosecond (aka 1 GHz). + * + * When using the rtdsc instruction directly this is filled in during + * initialization based on the relevant cpuid fields. + */ +double cycles_to_sec = 1.0 / NS_PER_S; + +/* + * Determines whether rdtsc is used (Linux/x86 only, when OS uses tsc clocksource) + */ +bool use_rdtsc = false; + +#if defined(__x86_64__) && defined(__linux__) +/* + * Decide whether we use the rdtsc instruction at runtime, for Linux/x86, + * instead of incurring the overhead of a full clock_gettime() call. + * + * This can't be reliably determined at compile time, since the + * availability of an "invariant" TSC (that is not affected by CPU + * frequency changes) is dependent on the CPU architecture. Additionally, + * there are cases where TSC availability is impacted by virtualization, + * where a simple cpuid feature check would not be enough. + * + * Since Linux already does a significant amount of work to determine + * whether TSC is a viable clock source, decide based on that. + */ +void pg_clock_gettime_initialize_rdtsc(void) +{ + FILE *fp = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r"); + char buf[128]; + + if (fp) + { + if (fgets(buf, sizeof(buf), fp) != NULL && strcmp(buf, "tsc\n") == 0) + use_rdtsc = true; + + fclose(fp); + } + + /* + * Compute baseline cpu peformance, determines speed at which rdtsc advances + */ + if (use_rdtsc) + { + uint32 cpuinfo[4] = {0}; + + /* + * FIXME: We should probably not unnecessarily use floating point math + * here. And it's likely that the numbers are small enough that we are running + * into floating point inaccuracies already. Probably worthwhile to be a good + * bit smarter. + */ + + __get_cpuid(0x16, cpuinfo, cpuinfo + 1, cpuinfo + 2, cpuinfo + 3); + + if (cpuinfo[0] != 0) + cycles_to_sec = 1 / ((double) cpuinfo[0] * 1000 * 1000); + else + { + float cpu_mhz; + + fp = fopen("/proc/cpuinfo", "r"); + + if (fp) + { + while (fgets(buf, sizeof(buf), fp)) + { + if (sscanf(buf, "cpu MHz : %f", &cpu_mhz) == 1) + { + cycles_to_sec = 1 / ((double) cpu_mhz * 1000 * 1000); + break; + } + } + } + + fclose(fp); + } + } +} +#endif /* defined(__x86_64__) && defined(__linux__) */ diff --git a/src/include/portability/instr_time.h b/src/include/portability/instr_time.h index 4bd555113b..330c205924 100644 --- a/src/include/portability/instr_time.h +++ b/src/include/portability/instr_time.h @@ -4,9 +4,11 @@ * portable high-precision interval timing * * This file provides an abstraction layer to hide portability issues in - * interval timing. On Unix we use clock_gettime(), and on Windows we use - * QueryPerformanceCounter(). These macros also give some breathing room to - * use other high-precision-timing APIs. + * interval timing. On Linux/x86 we use the rdtsc instruction when a TSC + * clocksource is also used on the host OS. Otherwise, and on other + * Unix-like systems we use clock_gettime() and on Windows we use + * QueryPerformanceCounter(). These macros also give some breathing + * room to use other high-precision-timing APIs. * * The basic data type is instr_time, which all callers should treat as an * opaque typedef. instr_time can store either an absolute time (of @@ -56,10 +58,15 @@ #ifndef WIN32 -/* Use clock_gettime() */ +/* Uses rdtsc on Linux/x86 if available, otherwise clock_gettime() */ #include <time.h> +#if defined(__x86_64__) && defined(__linux__) +#include <x86intrin.h> +#include <cpuid.h> +#endif + /* * The best clockid to use according to the POSIX spec is CLOCK_MONOTONIC, * since that will give reliable interval timing even in the face of changes @@ -80,7 +87,9 @@ #define PG_INSTR_CLOCK CLOCK_REALTIME #endif +/* time in cpu reference cycles (when using rdtsc), otherwise nanoseconds */ typedef int64 instr_time; + #define NS_PER_S INT64CONST(1000000000) #define US_PER_S INT64CONST(1000000) #define MS_PER_S INT64CONST(1000) @@ -92,17 +101,38 @@ typedef int64 instr_time; #define INSTR_TIME_SET_ZERO(t) ((t) = 0) -static inline instr_time pg_clock_gettime_ns(void) +extern double cycles_to_sec; +extern bool use_rdtsc; + +#if defined(__x86_64__) && defined(__linux__) +extern void pg_clock_gettime_initialize_rdtsc(void); +#endif + +static inline instr_time pg_clock_gettime_ref_cycles(void) { struct timespec tmp; +#if defined(__x86_64__) && defined(__linux__) +#ifndef FRONTEND + if (use_rdtsc) + return __rdtsc(); +#endif +#endif + clock_gettime(PG_INSTR_CLOCK, &tmp); return tmp.tv_sec * NS_PER_S + tmp.tv_nsec; } +#if defined(__x86_64__) && defined(__linux__) +#define INSTR_TIME_INITIALIZE() \ + pg_clock_gettime_initialize_rdtsc() +#else +#define INSTR_TIME_INITIALIZE() +#endif + #define INSTR_TIME_SET_CURRENT(t) \ - (t) = pg_clock_gettime_ns() + (t) = pg_clock_gettime_ref_cycles() #define INSTR_TIME_ADD(x,y) \ do { \ @@ -120,13 +150,13 @@ static inline instr_time pg_clock_gettime_ns(void) } while (0) #define INSTR_TIME_GET_DOUBLE(t) \ - ((double) (t) / NS_PER_S) + ((double) (t) * cycles_to_sec) #define INSTR_TIME_GET_MILLISEC(t) \ - ((double) (t) / NS_PER_MS) + ((double) (t) * (cycles_to_sec * MS_PER_S)) #define INSTR_TIME_GET_MICROSEC(t) \ - ((double) (t) / NS_PER_US) + ((uint64) (t) * (cycles_to_sec * US_PER_S)) #else /* WIN32 */ @@ -138,6 +168,8 @@ typedef LARGE_INTEGER instr_time; #define INSTR_TIME_SET_ZERO(t) ((t).QuadPart = 0) +#define INSTR_TIME_INITIALIZE() + #define INSTR_TIME_SET_CURRENT(t) QueryPerformanceCounter(&(t)) #define INSTR_TIME_ADD(x,y) \ diff --git a/src/tools/msvc/Mkvcbuild.pm b/src/tools/msvc/Mkvcbuild.pm index 83a3e40425..09ef56c309 100644 --- a/src/tools/msvc/Mkvcbuild.pm +++ b/src/tools/msvc/Mkvcbuild.pm @@ -135,7 +135,7 @@ sub mkvcbuild our @pgcommonallfiles = qw( archive.c base64.c checksum_helper.c compression.c config_info.c controldata_utils.c d2s.c encnames.c exec.c - f2s.c file_perm.c file_utils.c hashfn.c ip.c jsonapi.c + f2s.c file_perm.c file_utils.c hashfn.c ip.c instr_time.o jsonapi.c keywords.c kwlookup.c link-canary.c md5_common.c pg_get_line.c pg_lzcompress.c pg_prng.c pgfnames.c psprintf.c relpath.c rmtree.c saslprep.c scram-common.c string.c stringinfo.c unicode_norm.c -- 2.34.1