> > Add two new power management intrinsics, and provide an implementation > in eal/x86 based on UMONITOR/UMWAIT instructions. The instructions > are implemented as raw byte opcodes because there is not yet widespread > compiler support for these instructions. > > The power management instructions provide an architecture-specific > function to either wait until a specified TSC timestamp is reached, or > optionally wait until either a TSC timestamp is reached or a memory > location is written to. The monitor function also provides an optional > comparison, to avoid sleeping when the expected write has already > happened, and no more writes are expected.
I think what this API is missing is a function to wake up a sleeping core. If the user can/should use some system call to achieve that, then at least it has to be clearly documented; even better, some wrapper should be provided. > > For more details, Please reference Intel SDM Volume 2. > > Signed-off-by: Liang Ma <liang.j...@intel.com> > Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com> > --- > .../include/generic/rte_power_intrinsics.h | 64 ++++++++ > lib/librte_eal/include/meson.build | 1 + > lib/librte_eal/x86/include/meson.build | 1 + > .../x86/include/rte_power_intrinsics.h | 143 ++++++++++++++++++ > 4 files changed, 209 insertions(+) > create mode 100644 lib/librte_eal/include/generic/rte_power_intrinsics.h > create mode 100644 lib/librte_eal/x86/include/rte_power_intrinsics.h > > diff --git a/lib/librte_eal/include/generic/rte_power_intrinsics.h > b/lib/librte_eal/include/generic/rte_power_intrinsics.h > new file mode 100644 > index 0000000000..cd7f8070ac > --- /dev/null > +++ b/lib/librte_eal/include/generic/rte_power_intrinsics.h > @@ -0,0 +1,64 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(c) 2020 Intel Corporation > + */ > + > +#ifndef _RTE_POWER_INTRINSIC_H_ > +#define _RTE_POWER_INTRINSIC_H_ > + > +#include <inttypes.h> > + > +/** > + * @file > + * Advanced power management operations. > + * > + * This file define APIs for advanced power management, > + * which are architecture-dependent. > + */ > + > +/** > + * Monitor specific address for changes. This will cause the CPU to enter an > + * architecture-defined optimized power state until either the specified > + * memory address is written to, or a certain TSC timestamp is reached. > + * > + * Additionally, an `expected` 64-bit value and 64-bit mask are provided. If > + * mask is non-zero, the current value pointed to by the `p` pointer will be > + * checked against the expected value, and if they match, the entering of > + * optimized power state may be aborted. 
> + * > + * @param p > + * Address to monitor for changes. Must be aligned on an 64-byte boundary. > + * @param expected_value > + * Before attempting the monitoring, the `p` address may be read and > compared > + * against this value. If `value_mask` is zero, this step will be skipped. > + * @param value_mask > + * The 64-bit mask to use to extract current value from `p`. > + * @param state > + * Architecture-dependent optimized power state number > + * @param tsc_timestamp > + * Maximum TSC timestamp to wait for. Note that the wait behavior is > + * architecture-dependent. > + * > + * @return > + * Architecture-dependent return value. > + */ > +static inline int rte_power_monitor(const volatile void *p, > + const uint64_t expected_value, const uint64_t value_mask, > + const uint32_t state, const uint64_t tsc_timestamp); > + > +/** > + * Enter an architecture-defined optimized power state until a certain TSC > + * timestamp is reached. > + * > + * @param state > + * Architecture-dependent optimized power state number > + * @param tsc_timestamp > + * Maximum TSC timestamp to wait for. Note that the wait behavior is > + * architecture-dependent. > + * > + * @return > + * Architecture-dependent return value. 
> + */ > +static inline int rte_power_pause(const uint32_t state, > + const uint64_t tsc_timestamp); > + > +#endif /* _RTE_POWER_INTRINSIC_H_ */ > diff --git a/lib/librte_eal/include/meson.build > b/lib/librte_eal/include/meson.build > index cd09027958..3a12e87e19 100644 > --- a/lib/librte_eal/include/meson.build > +++ b/lib/librte_eal/include/meson.build > @@ -60,6 +60,7 @@ generic_headers = files( > 'generic/rte_memcpy.h', > 'generic/rte_pause.h', > 'generic/rte_prefetch.h', > + 'generic/rte_power_intrinsics.h', > 'generic/rte_rwlock.h', > 'generic/rte_spinlock.h', > 'generic/rte_ticketlock.h', > diff --git a/lib/librte_eal/x86/include/meson.build > b/lib/librte_eal/x86/include/meson.build > index f0e998c2fe..494a8142a2 100644 > --- a/lib/librte_eal/x86/include/meson.build > +++ b/lib/librte_eal/x86/include/meson.build > @@ -13,6 +13,7 @@ arch_headers = files( > 'rte_io.h', > 'rte_memcpy.h', > 'rte_prefetch.h', > + 'rte_power_intrinsics.h', > 'rte_pause.h', > 'rte_rtm.h', > 'rte_rwlock.h', > diff --git a/lib/librte_eal/x86/include/rte_power_intrinsics.h > b/lib/librte_eal/x86/include/rte_power_intrinsics.h > new file mode 100644 > index 0000000000..6dd1cdc939 > --- /dev/null > +++ b/lib/librte_eal/x86/include/rte_power_intrinsics.h > @@ -0,0 +1,143 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(c) 2020 Intel Corporation > + */ > + > +#ifndef _RTE_POWER_INTRINSIC_X86_64_H_ > +#define _RTE_POWER_INTRINSIC_X86_64_H_ > + > +#ifdef __cplusplus > +extern "C" { > +#endif > + > +#include <rte_atomic.h> > +#include <rte_common.h> > + > +#include "generic/rte_power_intrinsics.h" > + > +/** > + * Monitor specific address for changes. This will cause the CPU to enter an > + * architecture-defined optimized power state until either the specified > + * memory address is written to, or a certain TSC timestamp is reached. > + * > + * Additionally, an `expected` 64-bit value and 64-bit mask are provided. 
If > + * mask is non-zero, the current value pointed to by the `p` pointer will be > + * checked against the expected value, and if they match, the entering of > + * optimized power state may be aborted. > + * > + * This function uses UMONITOR/UMWAIT instructions. For more information > about > + * their usage, please refer to Intel(R) 64 and IA-32 Architectures Software > + * Developer's Manual. > + * > + * @param p > + * Address to monitor for changes. Must be aligned on an 64-byte boundary. > + * @param expected_value > + * Before attempting the monitoring, the `p` address may be read and > compared > + * against this value. If `value_mask` is zero, this step will be skipped. > + * @param value_mask > + * The 64-bit mask to use to extract current value from `p`. > + * @param state > + * Architecture-dependent optimized power state number. Can be 0 (C0.2) or > + * 1 (C0.1). > + * @param tsc_timestamp > + * Maximum TSC timestamp to wait for. > + * > + * @return > + * - 1 if wakeup was due to TSC timeout expiration. > + * - 0 if wakeup was due to memory write or other reasons. > + */ > +static inline int rte_power_monitor(const volatile void *p, > + const uint64_t expected_value, const uint64_t value_mask, > + const uint32_t state, const uint64_t tsc_timestamp) > +{ > + const uint32_t tsc_l = (uint32_t)tsc_timestamp; > + const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32); > + /* the rflags need match native register size */ > +#ifdef RTE_ARCH_I686 > + uint32_t rflags; > +#else > + uint64_t rflags; > +#endif > + /* > + * we're using raw byte codes for now as only the newest compiler > + * versions support this instruction natively. 
> + */ > + > + /* set address for UMONITOR */ > + asm volatile(".byte 0xf3, 0x0f, 0xae, 0xf7;" > + : > + : "D"(p)); > + > + if (value_mask) { > + const uint64_t cur_value = *(const volatile uint64_t *)p; > + const uint64_t masked = cur_value & value_mask; > + /* if the masked value is already matching, abort */ > + if (masked == expected_value) > + return 0; > + } > + /* execute UMWAIT */ > + asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;\n" > + /* > + * UMWAIT sets CF flag in RFLAGS, so PUSHF to push them > + * onto the stack, then pop them back into `rflags` so that > + * we can read it. > + */ > + "pushf;\n" > + "pop %0;\n" > + : "=r"(rflags) > + : "D"(state), "a"(tsc_l), "d"(tsc_h)); > + > + /* we're interested in the first bit (the carry flag) */ > + return rflags & 0x1; > +} > + > +/** > + * Enter an architecture-defined optimized power state until a certain TSC > + * timestamp is reached. > + * > + * This function uses TPAUSE instruction. For more information about its > usage, > + * please refer to Intel(R) 64 and IA-32 Architectures Software Developer's > + * Manual. > + * > + * @param state > + * Architecture-dependent optimized power state number. Can be 0 (C0.2) or > + * 1 (C0.1). > + * @param tsc_timestamp > + * Maximum TSC timestamp to wait for. > + * > + * @return > + * - 1 if wakeup was due to TSC timeout expiration. > + * - 0 if wakeup was due to other reasons. > + */ > +static inline int rte_power_pause(const uint32_t state, > + const uint64_t tsc_timestamp) > +{ > + const uint32_t tsc_l = (uint32_t)tsc_timestamp; > + const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32); > + /* the rflags need match native register size */ > +#ifdef RTE_ARCH_I686 > + uint32_t rflags; > +#else > + uint64_t rflags; > +#endif > + > + /* execute TPAUSE */ > + asm volatile(".byte 0x66, 0x0f, 0xae, 0xf7;\n" > + /* > + * TPAUSE sets CF flag in RFLAGS, so PUSHF to push them > + * onto the stack, then pop them back into `rflags` so that > + * we can read it. 
> + */ > + "pushf;\n" > + "pop %0;\n" > + : "=r"(rflags) > + : "D"(state), "a"(tsc_l), "d"(tsc_h)); > + > + /* we're interested in the first bit (the carry flag) */ > + return rflags & 0x1; > +} > + > +#ifdef __cplusplus > +} > +#endif > + > +#endif /* _RTE_POWER_INTRINSIC_X86_64_H_ */ > -- > 2.17.1