Add two new power management intrinsics, and provide an implementation
in eal/x86 based on the UMONITOR/UMWAIT/TPAUSE instructions. The
instructions are implemented as raw byte opcodes because compiler
support for them is not yet widespread.
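For reference, newer toolchains (roughly GCC 9+/Clang 7+, built with
-mwaitpkg) expose the same operations as intrinsics in immintrin.h,
which is what the raw opcodes below encode by hand. A minimal sketch of
the intrinsic-based equivalent, for illustration only (monitor_wait is
a made-up name, not part of this patch):

    #include <stdint.h>
    #include <immintrin.h>

    /* Equivalent of the UMONITOR + UMWAIT pair implemented below via
     * raw opcodes. _umwait() returns the carry flag: 1 if the TSC
     * deadline expired, 0 if woken by a write or other reasons. */
    static inline int
    monitor_wait(const volatile void *p, uint32_t state,
                 uint64_t tsc_timestamp)
    {
            /* arm address monitoring on the cacheline containing `p` */
            _umonitor((void *)(uintptr_t)p);
            return _umwait(state, tsc_timestamp);
    }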
The power management instructions provide an architecture-specific
function to either wait until a specified TSC timestamp is reached, or
optionally wait until either a TSC timestamp is reached or a memory
location is written to. The monitor function also provides an optional
comparison, to avoid sleeping when the expected write has already
happened, and no more writes are expected.

Signed-off-by: Liang Ma <liang.j...@intel.com>
Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com>
---
 .../include/generic/rte_power_intrinsics.h   |  64 ++++++++
 lib/librte_eal/include/meson.build            |   1 +
 lib/librte_eal/x86/include/meson.build        |   1 +
 lib/librte_eal/x86/include/rte_cpuflags.h     |   2 +
 .../x86/include/rte_power_intrinsics.h        | 143 ++++++++++++++++++
 lib/librte_eal/x86/rte_cpuflags.c             |   2 +
 6 files changed, 213 insertions(+)
 create mode 100644 lib/librte_eal/include/generic/rte_power_intrinsics.h
 create mode 100644 lib/librte_eal/x86/include/rte_power_intrinsics.h

diff --git a/lib/librte_eal/include/generic/rte_power_intrinsics.h b/lib/librte_eal/include/generic/rte_power_intrinsics.h
new file mode 100644
index 0000000000..cd7f8070ac
--- /dev/null
+++ b/lib/librte_eal/include/generic/rte_power_intrinsics.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#ifndef _RTE_POWER_INTRINSIC_H_
+#define _RTE_POWER_INTRINSIC_H_
+
+#include <inttypes.h>
+
+/**
+ * @file
+ * Advanced power management operations.
+ *
+ * This file defines APIs for advanced power management,
+ * which are architecture-dependent.
+ */
+
+/**
+ * Monitor a specific address for changes. This will cause the CPU to enter
+ * an architecture-defined optimized power state until either the specified
+ * memory address is written to, or a certain TSC timestamp is reached.
+ *
+ * Additionally, an `expected` 64-bit value and 64-bit mask are provided. If
+ * mask is non-zero, the current value pointed to by the `p` pointer will be
+ * checked against the expected value, and if they match, entering the
+ * optimized power state may be aborted.
+ *
+ * @param p
+ *   Address to monitor for changes. Must be aligned on a 64-byte boundary.
+ * @param expected_value
+ *   Before attempting the monitoring, the `p` address may be read and
+ *   compared against this value. If `value_mask` is zero, this step will
+ *   be skipped.
+ * @param value_mask
+ *   The 64-bit mask to use to extract the current value from `p`.
+ * @param state
+ *   Architecture-dependent optimized power state number.
+ * @param tsc_timestamp
+ *   Maximum TSC timestamp to wait for. Note that the wait behavior is
+ *   architecture-dependent.
+ *
+ * @return
+ *   Architecture-dependent return value.
+ */
+static inline int rte_power_monitor(const volatile void *p,
+		const uint64_t expected_value, const uint64_t value_mask,
+		const uint32_t state, const uint64_t tsc_timestamp);
+
+/**
+ * Enter an architecture-defined optimized power state until a certain TSC
+ * timestamp is reached.
+ *
+ * @param state
+ *   Architecture-dependent optimized power state number.
+ * @param tsc_timestamp
+ *   Maximum TSC timestamp to wait for. Note that the wait behavior is
+ *   architecture-dependent.
+ *
+ * @return
+ *   Architecture-dependent return value.
+ */
+static inline int rte_power_pause(const uint32_t state,
+		const uint64_t tsc_timestamp);
+
+#endif /* _RTE_POWER_INTRINSIC_H_ */
diff --git a/lib/librte_eal/include/meson.build b/lib/librte_eal/include/meson.build
index cd09027958..3a12e87e19 100644
--- a/lib/librte_eal/include/meson.build
+++ b/lib/librte_eal/include/meson.build
@@ -60,6 +60,7 @@ generic_headers = files(
 	'generic/rte_memcpy.h',
 	'generic/rte_pause.h',
 	'generic/rte_prefetch.h',
+	'generic/rte_power_intrinsics.h',
 	'generic/rte_rwlock.h',
 	'generic/rte_spinlock.h',
 	'generic/rte_ticketlock.h',
diff --git a/lib/librte_eal/x86/include/meson.build b/lib/librte_eal/x86/include/meson.build
index f0e998c2fe..494a8142a2 100644
--- a/lib/librte_eal/x86/include/meson.build
+++ b/lib/librte_eal/x86/include/meson.build
@@ -13,6 +13,7 @@ arch_headers = files(
 	'rte_io.h',
 	'rte_memcpy.h',
 	'rte_prefetch.h',
+	'rte_power_intrinsics.h',
 	'rte_pause.h',
 	'rte_rtm.h',
 	'rte_rwlock.h',
diff --git a/lib/librte_eal/x86/include/rte_cpuflags.h b/lib/librte_eal/x86/include/rte_cpuflags.h
index c1d20364d1..5041a830a7 100644
--- a/lib/librte_eal/x86/include/rte_cpuflags.h
+++ b/lib/librte_eal/x86/include/rte_cpuflags.h
@@ -132,6 +132,8 @@ enum rte_cpu_flag_t {
 	RTE_CPUFLAG_MOVDIR64B, /**< Direct Store Instructions 64B */
 	RTE_CPUFLAG_AVX512VP2INTERSECT, /**< AVX512 Two Register Intersection */
 
+	RTE_CPUFLAG_WAITPKG, /**< UMONITOR/UMWAIT/TPAUSE */
+
 	/* The last item */
 	RTE_CPUFLAG_NUMFLAGS, /**< This should always be the last! */
 };
diff --git a/lib/librte_eal/x86/include/rte_power_intrinsics.h b/lib/librte_eal/x86/include/rte_power_intrinsics.h
new file mode 100644
index 0000000000..6dd1cdc939
--- /dev/null
+++ b/lib/librte_eal/x86/include/rte_power_intrinsics.h
@@ -0,0 +1,143 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+
+#ifndef _RTE_POWER_INTRINSIC_X86_64_H_
+#define _RTE_POWER_INTRINSIC_X86_64_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_atomic.h>
+#include <rte_common.h>
+
+#include "generic/rte_power_intrinsics.h"
+
+/**
+ * Monitor a specific address for changes. This will cause the CPU to enter
+ * an architecture-defined optimized power state until either the specified
+ * memory address is written to, or a certain TSC timestamp is reached.
+ *
+ * Additionally, an `expected` 64-bit value and 64-bit mask are provided. If
+ * mask is non-zero, the current value pointed to by the `p` pointer will be
+ * checked against the expected value, and if they match, entering the
+ * optimized power state may be aborted.
+ *
+ * This function uses the UMONITOR/UMWAIT instructions. For more information
+ * about their usage, please refer to the Intel(R) 64 and IA-32 Architectures
+ * Software Developer's Manual.
+ *
+ * @param p
+ *   Address to monitor for changes. Must be aligned on a 64-byte boundary.
+ * @param expected_value
+ *   Before attempting the monitoring, the `p` address may be read and
+ *   compared against this value. If `value_mask` is zero, this step will
+ *   be skipped.
+ * @param value_mask
+ *   The 64-bit mask to use to extract the current value from `p`.
+ * @param state
+ *   Architecture-dependent optimized power state number. Can be 0 (C0.2)
+ *   or 1 (C0.1).
+ * @param tsc_timestamp
+ *   Maximum TSC timestamp to wait for.
+ *
+ * @return
+ *   - 1 if wakeup was due to TSC timeout expiration.
+ *   - 0 if wakeup was due to memory write or other reasons.
+ */
+static inline int rte_power_monitor(const volatile void *p,
+		const uint64_t expected_value, const uint64_t value_mask,
+		const uint32_t state, const uint64_t tsc_timestamp)
+{
+	const uint32_t tsc_l = (uint32_t)tsc_timestamp;
+	const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);
+	/* the rflags variable needs to match the native register size */
+#ifdef RTE_ARCH_I686
+	uint32_t rflags;
+#else
+	uint64_t rflags;
+#endif
+	/*
+	 * we're using raw byte codes for now as only the newest compiler
+	 * versions support this instruction natively.
+	 */
+
+	/* set address for UMONITOR */
+	asm volatile(".byte 0xf3, 0x0f, 0xae, 0xf7;"
+			:
+			: "D"(p));
+
+	if (value_mask) {
+		const uint64_t cur_value = *(const volatile uint64_t *)p;
+		const uint64_t masked = cur_value & value_mask;
+
+		/* if the masked value is already matching, abort */
+		if (masked == expected_value)
+			return 0;
+	}
+	/* execute UMWAIT */
+	asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;\n"
+		/*
+		 * UMWAIT sets CF flag in RFLAGS, so PUSHF to push them
+		 * onto the stack, then pop them back into `rflags` so that
+		 * we can read it.
+		 */
+		"pushf;\n"
+		"pop %0;\n"
+		: "=r"(rflags)
+		: "D"(state), "a"(tsc_l), "d"(tsc_h));
+
+	/* we're interested in the first bit (the carry flag) */
+	return rflags & 0x1;
+}
+
+/**
+ * Enter an architecture-defined optimized power state until a certain TSC
+ * timestamp is reached.
+ *
+ * This function uses the TPAUSE instruction. For more information about its
+ * usage, please refer to the Intel(R) 64 and IA-32 Architectures Software
+ * Developer's Manual.
+ *
+ * @param state
+ *   Architecture-dependent optimized power state number. Can be 0 (C0.2)
+ *   or 1 (C0.1).
+ * @param tsc_timestamp
+ *   Maximum TSC timestamp to wait for.
+ *
+ * @return
+ *   - 1 if wakeup was due to TSC timeout expiration.
+ *   - 0 if wakeup was due to other reasons.
+ */
+static inline int rte_power_pause(const uint32_t state,
+		const uint64_t tsc_timestamp)
+{
+	const uint32_t tsc_l = (uint32_t)tsc_timestamp;
+	const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);
+	/* the rflags variable needs to match the native register size */
+#ifdef RTE_ARCH_I686
+	uint32_t rflags;
+#else
+	uint64_t rflags;
+#endif
+
+	/* execute TPAUSE */
+	asm volatile(".byte 0x66, 0x0f, 0xae, 0xf7;\n"
+		/*
+		 * TPAUSE sets CF flag in RFLAGS, so PUSHF to push them
+		 * onto the stack, then pop them back into `rflags` so that
+		 * we can read it.
+		 */
+		"pushf;\n"
+		"pop %0;\n"
+		: "=r"(rflags)
+		: "D"(state), "a"(tsc_l), "d"(tsc_h));
+
+	/* we're interested in the first bit (the carry flag) */
+	return rflags & 0x1;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RTE_POWER_INTRINSIC_X86_64_H_ */
diff --git a/lib/librte_eal/x86/rte_cpuflags.c b/lib/librte_eal/x86/rte_cpuflags.c
index 30439e7951..0325c4b93b 100644
--- a/lib/librte_eal/x86/rte_cpuflags.c
+++ b/lib/librte_eal/x86/rte_cpuflags.c
@@ -110,6 +110,8 @@ const struct feature_entry rte_cpu_feature_table[] = {
 	FEAT_DEF(AVX512F, 0x00000007, 0, RTE_REG_EBX, 16)
 	FEAT_DEF(RDSEED, 0x00000007, 0, RTE_REG_EBX, 18)
 
+	FEAT_DEF(WAITPKG, 0x00000007, 0, RTE_REG_ECX, 5)
+
 	FEAT_DEF(LAHF_SAHF, 0x80000001, 0, RTE_REG_ECX, 0)
 	FEAT_DEF(LZCNT, 0x80000001, 0, RTE_REG_ECX, 4)
--
2.17.1
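For reviewers, a minimal usage sketch of the proposed API, illustration
only: wait_for_flag and the ~1 ms deadline are invented for this example
and are not part of the patch. It waits for another core to set a
64-byte-aligned flag to 1, skipping the sleep if the flag is already set:

    #include <rte_cycles.h>
    #include <rte_power_intrinsics.h>

    /* Sleep in C0.2 until *flag is written or ~1 ms of TSC time passes.
     * `flag` must be 64-byte aligned, per the UMONITOR requirement.
     * Returns 1 on deadline expiry, 0 on a write or other wakeup. */
    static inline int
    wait_for_flag(const volatile uint64_t *flag)
    {
            const uint64_t deadline =
                    rte_get_tsc_cycles() + rte_get_tsc_hz() / 1000;

            /* expected_value=1 with a full mask: the sleep is aborted
             * up front if the flag was already set before we got here */
            return rte_power_monitor(flag, 1, UINT64_MAX, 0, deadline);
    }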