Handling clock_gettime(CLOCK_MONOTONIC_RAW, &ts) by calling vdso_fallback_gettime(), i.e. with a system call, is too slow - latencies of 300-700ns are common on Haswell (06:3C) CPUs.
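For reference, the latencies quoted here and below are minimum measurable
deltas between back-to-back calls, which can be reproduced with a small
benchmark along these lines (a sketch, not part of the patch; older glibc
may additionally need -lrt):

/* min_gettime_latency.c - smallest observable delta between
 * back-to-back clock_gettime(CLOCK_MONOTONIC_RAW) calls.
 * Build: gcc -O2 -o min_gettime_latency min_gettime_latency.c
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <time.h>

int main(void)
{
	struct timespec t1, t2;
	uint64_t min_ns = UINT64_MAX;

	for (int i = 0; i < 1000000; i++) {
		clock_gettime(CLOCK_MONOTONIC_RAW, &t1);
		clock_gettime(CLOCK_MONOTONIC_RAW, &t2);
		/* unsigned arithmetic is still correct when tv_nsec wraps
		 * across a second boundary, since t2 >= t1
		 */
		uint64_t d = (uint64_t)(t2.tv_sec - t1.tv_sec) * 1000000000ULL
			   + (uint64_t)(t2.tv_nsec - t1.tv_nsec);
		if (d > 0 && d < min_ns)
			min_ns = d;
	}
	printf("minimum measurable delta: %llu ns\n",
	       (unsigned long long)min_ns);
	return 0;
}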
This patch against the 4.15.7 stable branch makes the VDSO handle
clock_gettime(CLOCK_MONOTONIC_RAW, &ts) by issuing rdtscp in user-space,
IFF the clock source is the TSC, and converting the result to nanoseconds
using the vsyscall_gtod_data 'mult' and 'shift' fields:

	volatile u32 tsc_lo, tsc_hi, tsc_cpu;
	asm volatile("rdtscp"
		     : "=a" (tsc_lo), "=d" (tsc_hi), "=c" (tsc_cpu));
	u64 tsc = (((u64)tsc_hi) << 32) | ((u64)tsc_lo);
	tsc *= gtod->mult;
	tsc >>= gtod->shift;
	/* tsc is now a number of nanoseconds */
	ts->tv_sec = __iter_div_u64_rem(tsc, NSEC_PER_SEC, &ts->tv_nsec);

Use of this "open-coded asm" style forces the compiler to always choose the
32-bit version of rdtscp, which sets only %eax, %edx, and %ecx and does not
clear the high bits of %rax, %rdx, and %rcx, because the output variables
are declared 32-bit - so the same 32-bit instruction is generated whether
the code is compiled with -m32 or -m64 (tested with gcc 5.4.0 and
gcc 6.4.1).

The full story and test programs are in Bug #198961:
	https://bugzilla.kernel.org/show_bug.cgi?id=198961

The patched VDSO now handles clock_gettime(CLOCK_MONOTONIC_RAW, &ts) on the
same machine with a latency (minimum measurable time) of around 100ns,
compared with 300-700ns before the patch.

I also think it makes sense to expose pointers to the live, updated
gtod->mult and gtod->shift values to user-space somehow. A user-space TSC
reader could then re-use previous values to avoid the long division in most
cases and obtain latencies of 10-20ns. Hence the patch adds a new method to
the VDSO:

	__vdso_linux_tsc_calibration()

which returns a pointer to a 'struct linux_tsc_calibration' declared in a
new header:

	arch/x86/include/uapi/asm/vdso_tsc_calibration.h

If the clock source is NOT the TSC, this function returns NULL; the pointer
is only valid while the system clock source is the TSC. User-space TSC
readers can already detect when the TSC is modified via events, and with
this function can now also detect when the clock source changes from / to
the TSC.
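For example, a user-space reader built on these values could cache the last
whole-second boundary and fall back to a division only when tv_sec rolls
over - a sketch (illustration only, not part of the patch; it assumes the
calibration pointer was already obtained from __vdso_linux_tsc_calibration()
and that the TSC is the clock source):

/* fast_monotonic_raw.c - caching reader sketch; not thread-safe
 * (static cache), illustration only.  The struct mirrors the uapi
 * header added by this patch.
 */
#include <stdint.h>
#include <time.h>

struct linux_tsc_calibration { unsigned int mult, shift; };

static inline uint64_t rdtscp_u64(void)
{
	uint32_t lo, hi, cpu;
	/* 32-bit outputs, as in the patch, so eax/edx/ecx are used */
	asm volatile("rdtscp" : "=a" (lo), "=d" (hi), "=c" (cpu));
	return ((uint64_t)hi << 32) | lo;
}

void fast_monotonic_raw(const struct linux_tsc_calibration *cal,
			struct timespec *ts)
{
	static uint64_t cached_sec, cached_sec_ns; /* last second boundary */
	/* NOTE: like do_monotonic_raw() below, this multiplies the full
	 * TSC value; production code would subtract a base cycle count
	 * first to keep (ticks * mult) within 64 bits.
	 */
	uint64_t ns = (rdtscp_u64() * cal->mult) >> cal->shift;

	if (ns - cached_sec_ns >= 1000000000ULL) {
		/* slow path, at most once per second: one 64-bit division */
		cached_sec    = ns / 1000000000ULL;
		cached_sec_ns = cached_sec * 1000000000ULL;
	}
	ts->tv_sec  = (time_t)cached_sec;
	ts->tv_nsec = (long)(ns - cached_sec_ns);
}

The fast path is then one rdtscp, one multiply, one shift and one subtract,
with no division - which is where the 10-20ns figure comes from.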
The patch:

---
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index f19856d..e840600 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -21,6 +21,7 @@
 #include <linux/math64.h>
 #include <linux/time.h>
 #include <linux/kernel.h>
+#include <uapi/asm/vdso_tsc_calibration.h>
 
 #define gtod (&VVAR(vsyscall_gtod_data))
 
@@ -246,6 +247,29 @@ notrace static int __always_inline do_monotonic(struct timespec *ts)
 	return mode;
 }
 
+notrace static int __always_inline do_monotonic_raw(struct timespec *ts)
+{
+	volatile u32 tsc_lo = 0, tsc_hi = 0, tsc_cpu = 0; /* so same instrs generated for 64-bit as for 32-bit builds */
+	u64 ns;
+	register u64 tsc = 0;
+	if (gtod->vclock_mode == VCLOCK_TSC)
+	{
+		asm volatile
+		    ( "rdtscp"
+		      : "=a" (tsc_lo)
+		      , "=d" (tsc_hi)
+		      , "=c" (tsc_cpu)
+		    ); /* : eax, edx, ecx used - NOT rax, rdx, rcx */
+		tsc   = ((((u64)tsc_hi) & 0xffffffffUL) << 32) | (((u64)tsc_lo) & 0xffffffffUL);
+		tsc  *= gtod->mult;
+		tsc >>= gtod->shift;
+		ts->tv_sec  = __iter_div_u64_rem(tsc, NSEC_PER_SEC, &ns);
+		ts->tv_nsec = ns;
+		return VCLOCK_TSC;
+	}
+	return VCLOCK_NONE;
+}
+
 notrace static void do_realtime_coarse(struct timespec *ts)
 {
 	unsigned long seq;
@@ -277,6 +301,10 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 		if (do_monotonic(ts) == VCLOCK_NONE)
 			goto fallback;
 		break;
+	case CLOCK_MONOTONIC_RAW:
+		if (do_monotonic_raw(ts) == VCLOCK_NONE)
+			goto fallback;
+		break;
 	case CLOCK_REALTIME_COARSE:
 		do_realtime_coarse(ts);
 		break;
@@ -326,3 +354,18 @@ notrace time_t __vdso_time(time_t *t)
 }
 time_t time(time_t *t)
 	__attribute__((weak, alias("__vdso_time")));
+
+extern const struct linux_tsc_calibration *
+	__vdso_linux_tsc_calibration(void);
+
+notrace const struct linux_tsc_calibration *
+	__vdso_linux_tsc_calibration(void)
+{
+	if (gtod->vclock_mode == VCLOCK_TSC)
+		return (const struct linux_tsc_calibration *)&gtod->mult;
+	return NULL;
+}
+
+const struct linux_tsc_calibration *linux_tsc_calibration(void)
+	__attribute__((weak, alias("__vdso_linux_tsc_calibration")));
+
diff --git a/arch/x86/entry/vdso/vdso.lds.S b/arch/x86/entry/vdso/vdso.lds.S
index d3a2dce..41a2ca5 100644
--- a/arch/x86/entry/vdso/vdso.lds.S
+++ b/arch/x86/entry/vdso/vdso.lds.S
@@ -24,7 +24,9 @@ VERSION {
 		getcpu;
 		__vdso_getcpu;
 		time;
-		__vdso_time;
+		__vdso_time;
+		linux_tsc_calibration;
+		__vdso_linux_tsc_calibration;
 	local: *;
 	};
 }
diff --git a/arch/x86/entry/vdso/vdso32/vdso32.lds.S b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
index 422764a..d53bd73 100644
--- a/arch/x86/entry/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/entry/vdso/vdso32/vdso32.lds.S
@@ -25,7 +25,8 @@ VERSION
 	global:
 		__vdso_clock_gettime;
 		__vdso_gettimeofday;
-		__vdso_time;
+		__vdso_time;
+		__vdso_linux_tsc_calibration;
 	};
 
 	LINUX_2.5 {
diff --git a/arch/x86/entry/vdso/vdsox32.lds.S b/arch/x86/entry/vdso/vdsox32.lds.S
index 05cd1c5..fb13b16 100644
--- a/arch/x86/entry/vdso/vdsox32.lds.S
+++ b/arch/x86/entry/vdso/vdsox32.lds.S
@@ -20,7 +20,8 @@ VERSION {
 		__vdso_clock_gettime;
 		__vdso_gettimeofday;
 		__vdso_getcpu;
-		__vdso_time;
+		__vdso_time;
+		__vdso_linux_tsc_calibration;
 	local: *;
 	};
 }
diff --git a/arch/x86/include/uapi/asm/vdso_tsc_calibration.h b/arch/x86/include/uapi/asm/vdso_tsc_calibration.h
new file mode 100644
index 0000000..6febb84
--- /dev/null
+++ b/arch/x86/include/uapi/asm/vdso_tsc_calibration.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _ASM_X86_VDSO_TSC_CALIBRATION_H
+#define _ASM_X86_VDSO_TSC_CALIBRATION_H
+/*
+ * Programs that want to use rdtsc / rdtscp instructions
+ * from user-space can make use of the Linux kernel TSC calibration
+ * by calling :
+ *     __vdso_linux_tsc_calibration(void);
+ * ( one has to resolve this symbol as in
+ *   tools/testing/selftests/vDSO/parse_vdso.c
+ * )
+ * which returns a pointer into the read-only
+ * vvar_page (the vsyscall_gtod_data structure),
+ * with the following layout :
+ */
+
+struct linux_tsc_calibration
+{
+	unsigned int mult;   /* amount to multiply 64-bit TSC value by */
+
+	unsigned int shift;  /* right shift to apply to (mult * TSC), yielding nanoseconds */
+};
+
+/* To use:
+ *
+ *   const struct linux_tsc_calibration *
+ *   (*linux_tsc_cal)(void) = vdso_sym("LINUX_2.6", "__vdso_linux_tsc_calibration");
+ *   if ( linux_tsc_cal == NULL )
+ *   {	fprintf(stderr, "the patch providing __vdso_linux_tsc_calibration"
+ *		        " is not applied to the kernel.\n");
+ *	return ERROR;
+ *   }
+ *   const struct linux_tsc_calibration *clock_source = (*linux_tsc_cal)();
+ *   if ( clock_source == NULL )
+ *	fprintf(stderr, "TSC is not the system clocksource.\n");
+ *   volatile unsigned int tsc_lo, tsc_hi, tsc_cpu;
+ *   asm volatile
+ *   ( "rdtscp" : "=a" (tsc_lo), "=d" (tsc_hi), "=c" (tsc_cpu) );
+ *   unsigned long tsc = (((unsigned long)tsc_hi) << 32) | tsc_lo;
+ *   unsigned long nanoseconds =
+ *	(( clock_source->mult ) * tsc ) >> (clock_source->shift);
+ *
+ * nanoseconds is now the TSC value converted to nanoseconds,
+ * according to Linux' clocksource calibration values.
+ */
+
+#endif
---
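To make the (mult, shift) calibration concrete: it encodes the
nanoseconds-per-tick ratio as the fixed-point fraction mult / 2^shift.
The worked example below uses made-up values for a hypothetical ~2.5 GHz
TSC - real values come from the kernel's calibration, e.g. via
__vdso_linux_tsc_calibration():

/* tsc_scale_example.c - worked example of (mult, shift) scaling.
 * Calibration values are hypothetical, for illustration only.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* ~2.5 GHz TSC => one tick is ~0.4 ns, encoded as
	 * mult / 2^shift ~= 0.4
	 */
	uint32_t mult  = 6710886;        /* ~0.4 * 2^24 */
	uint32_t shift = 24;
	uint64_t ticks = 2500000000ULL;  /* one second's worth of ticks */

	uint64_t ns = (ticks * (uint64_t)mult) >> shift;
	printf("%llu ticks -> %llu ns (expect ~1e9)\n",
	       (unsigned long long)ticks, (unsigned long long)ns);
	return 0;
}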