> 
> From: Liang Ma <liang.j...@intel.com>
> 
> Add two new power management intrinsics, and provide an implementation
> in eal/x86 based on UMONITOR/UMWAIT instructions. The instructions
> are implemented as raw byte opcodes because there is not yet widespread
> compiler support for these instructions.
> 
> The power management instructions provide an architecture-specific
> function to either wait until a specified TSC timestamp is reached, or
> optionally wait until either a TSC timestamp is reached or a memory
> location is written to. The monitor function also provides an optional
> comparison, to avoid sleeping when the expected write has already
> happened, and no more writes are expected.
> 
> For more details, please refer to Intel(R) 64 and IA-32 Architectures
> Software Developer's Manual, Volume 2.
> 
> Signed-off-by: Liang Ma <liang.j...@intel.com>
> Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com>
> Acked-by: David Christensen <d...@linux.vnet.ibm.com>
> ---
> 
> Notes:
>     v6:
>     - Add spinlock-enabled version to allow pthread-wait-like
>       constructs with umwait
>     - Clarify comments
>     - Added experimental tags to intrinsics
>     - Added endianness support
>     v5:
>     - Removed return values
>     - Simplified intrinsics and hardcoded C0.2 state
>     - Added other arch stubs
> 
>  lib/librte_eal/arm/include/meson.build        |   1 +
>  .../arm/include/rte_power_intrinsics.h        |  58 ++++++++
>  .../include/generic/rte_power_intrinsics.h    | 111 +++++++++++++++
>  lib/librte_eal/include/meson.build            |   1 +
>  lib/librte_eal/ppc/include/meson.build        |   1 +
>  .../ppc/include/rte_power_intrinsics.h        |  58 ++++++++
>  lib/librte_eal/x86/include/meson.build        |   1 +
>  .../x86/include/rte_power_intrinsics.h        | 132 ++++++++++++++++++
>  8 files changed, 363 insertions(+)
>  create mode 100644 lib/librte_eal/arm/include/rte_power_intrinsics.h
>  create mode 100644 lib/librte_eal/include/generic/rte_power_intrinsics.h
>  create mode 100644 lib/librte_eal/ppc/include/rte_power_intrinsics.h
>  create mode 100644 lib/librte_eal/x86/include/rte_power_intrinsics.h
> 
> diff --git a/lib/librte_eal/arm/include/meson.build 
> b/lib/librte_eal/arm/include/meson.build
> index 73b750a18f..c6a9f70d73 100644
> --- a/lib/librte_eal/arm/include/meson.build
> +++ b/lib/librte_eal/arm/include/meson.build
> @@ -20,6 +20,7 @@ arch_headers = files(
>       'rte_pause_32.h',
>       'rte_pause_64.h',
>       'rte_pause.h',
> +     'rte_power_intrinsics.h',
>       'rte_prefetch_32.h',
>       'rte_prefetch_64.h',
>       'rte_prefetch.h',
> diff --git a/lib/librte_eal/arm/include/rte_power_intrinsics.h 
> b/lib/librte_eal/arm/include/rte_power_intrinsics.h
> new file mode 100644
> index 0000000000..b04ba10c76
> --- /dev/null
> +++ b/lib/librte_eal/arm/include/rte_power_intrinsics.h
> @@ -0,0 +1,58 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2020 Intel Corporation
> + */
> +
> +#ifndef _RTE_POWER_INTRINSIC_ARM_H_
> +#define _RTE_POWER_INTRINSIC_ARM_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <rte_common.h>
> +
> +#include "generic/rte_power_intrinsics.h"
> +
> +/**
> + * This function is not supported on ARM.
> + */

Here and in other places - please follow dpdk coding convention
for function definitions, i.e:
static inline void
rte_power_monitor(... 

> +static inline void rte_power_monitor(const volatile void *p,
> +             const uint64_t expected_value, const uint64_t value_mask,
> +             const uint64_t tsc_timestamp, const uint8_t data_sz)
> +{
> +     RTE_SET_USED(p);
> +     RTE_SET_USED(expected_value);
> +     RTE_SET_USED(value_mask);
> +     RTE_SET_USED(tsc_timestamp);
> +     RTE_SET_USED(data_sz);
> +}

You can probably put NOP implementations of these rte_powe_* functions
into generic/rte_power_intrinsics.h.
So, wouldn't need to duplicate them for every non-supported arch.
Same as it was done for rte_wait_until_equal_*().

> +
> +/**
> + * This function is not supported on ARM.
> + */
> +static inline void rte_power_monitor_sync(const volatile void *p,
> +             const uint64_t expected_value, const uint64_t value_mask,
> +             const uint64_t tsc_timestamp, const uint8_t data_sz,
> +             rte_spinlock_t *lck)
> +{
> +     RTE_SET_USED(p);
> +     RTE_SET_USED(expected_value);
> +     RTE_SET_USED(value_mask);
> +     RTE_SET_USED(tsc_timestamp);
> +     RTE_SET_USED(lck);
> +     RTE_SET_USED(data_sz);
> +}
> +
> +/**
> + * This function is not supported on ARM.
> + */
> +static inline void rte_power_pause(const uint64_t tsc_timestamp)
> +{
> +     RTE_SET_USED(tsc_timestamp);
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_POWER_INTRINSIC_ARM_H_ */
> diff --git a/lib/librte_eal/include/generic/rte_power_intrinsics.h 
> b/lib/librte_eal/include/generic/rte_power_intrinsics.h
> new file mode 100644
> index 0000000000..f9522f2776
> --- /dev/null
> +++ b/lib/librte_eal/include/generic/rte_power_intrinsics.h
> @@ -0,0 +1,111 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2020 Intel Corporation
> + */
> +
> +#ifndef _RTE_POWER_INTRINSIC_H_
> +#define _RTE_POWER_INTRINSIC_H_
> +
> +#include <inttypes.h>
> +
> +#include <rte_compat.h>
> +#include <rte_spinlock.h>
> +
> +/**
> + * @file
> + * Advanced power management operations.
> + *
> + * This file define APIs for advanced power management,
> + * which are architecture-dependent.
> + */
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Monitor specific address for changes. This will cause the CPU to enter an
> + * architecture-defined optimized power state until either the specified
> + * memory address is written to, a certain TSC timestamp is reached, or other
> + * reasons cause the CPU to wake up.
> + *
> + * Additionally, an `expected` 64-bit value and 64-bit mask are provided. If
> + * mask is non-zero, the current value pointed to by the `p` pointer will be
> + * checked against the expected value, and if they match, the entering of
> + * optimized power state may be aborted.
> + *
> + * @param p
> + *   Address to monitor for changes. Must be aligned on an 64-byte boundary.

Is 64B alignment really needed?


> + * @param expected_value
> + *   Before attempting the monitoring, the `p` address may be read and 
> compared
> + *   against this value. If `value_mask` is zero, this step will be skipped.
> + * @param value_mask
> + *   The 64-bit mask to use to extract current value from `p`.
> + * @param tsc_timestamp
> + *   Maximum TSC timestamp to wait for. Note that the wait behavior is
> + *   architecture-dependent.
> + * @param data_sz
> + *   Data size (in bytes) that will be used to compare expected value with 
> the
> + *   memory address. Can be 1, 2, 4 or 8. Supplying any other value will lead
> + *   to undefined result.
> + */
> +__rte_experimental
> +static inline void rte_power_monitor(const volatile void *p,
> +             const uint64_t expected_value, const uint64_t value_mask,
> +             const uint64_t tsc_timestamp, const uint8_t data_sz);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Monitor specific address for changes. This will cause the CPU to enter an
> + * architecture-defined optimized power state until either the specified
> + * memory address is written to, a certain TSC timestamp is reached, or other
> + * reasons cause the CPU to wake up.
> + *
> + * Additionally, an `expected` 64-bit value and 64-bit mask are provided. If
> + * mask is non-zero, the current value pointed to by the `p` pointer will be
> + * checked against the expected value, and if they match, the entering of
> + * optimized power state may be aborted.
> + *
> + * This call will also lock a spinlock on entering sleep, and release it on
> + * waking up the CPU.
> + *
> + * @param p
> + *   Address to monitor for changes. Must be aligned on an 64-byte boundary.
> + * @param expected_value
> + *   Before attempting the monitoring, the `p` address may be read and 
> compared
> + *   against this value. If `value_mask` is zero, this step will be skipped.
> + * @param value_mask
> + *   The 64-bit mask to use to extract current value from `p`.
> + * @param tsc_timestamp
> + *   Maximum TSC timestamp to wait for. Note that the wait behavior is
> + *   architecture-dependent.
> + * @param data_sz
> + *   Data size (in bytes) that will be used to compare expected value with 
> the
> + *   memory address. Can be 1, 2, 4 or 8. Supplying any other value will lead
> + *   to undefined result.
> + * @param lck
> + *   A spinlock that must be locked before entering the function, will be
> + *   unlocked while the CPU is sleeping, and will be locked again once the 
> CPU
> + *   wakes up.
> + */
> +__rte_experimental
> +static inline void rte_power_monitor_sync(const volatile void *p,
> +             const uint64_t expected_value, const uint64_t value_mask,
> +             const uint64_t tsc_timestamp, const uint8_t data_sz,
> +             rte_spinlock_t *lck);
> +
> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice
> + *
> + * Enter an architecture-defined optimized power state until a certain TSC
> + * timestamp is reached.
> + *
> + * @param tsc_timestamp
> + *   Maximum TSC timestamp to wait for. Note that the wait behavior is
> + *   architecture-dependent.
> + */
> +__rte_experimental
> +static inline void rte_power_pause(const uint64_t tsc_timestamp);
> +
> +#endif /* _RTE_POWER_INTRINSIC_H_ */
> diff --git a/lib/librte_eal/include/meson.build 
> b/lib/librte_eal/include/meson.build
> index cd09027958..3a12e87e19 100644
> --- a/lib/librte_eal/include/meson.build
> +++ b/lib/librte_eal/include/meson.build
> @@ -60,6 +60,7 @@ generic_headers = files(
>       'generic/rte_memcpy.h',
>       'generic/rte_pause.h',
>       'generic/rte_prefetch.h',
> +     'generic/rte_power_intrinsics.h',
>       'generic/rte_rwlock.h',
>       'generic/rte_spinlock.h',
>       'generic/rte_ticketlock.h',
> diff --git a/lib/librte_eal/ppc/include/meson.build 
> b/lib/librte_eal/ppc/include/meson.build
> index ab4bd28092..0873b2aecb 100644
> --- a/lib/librte_eal/ppc/include/meson.build
> +++ b/lib/librte_eal/ppc/include/meson.build
> @@ -10,6 +10,7 @@ arch_headers = files(
>       'rte_io.h',
>       'rte_memcpy.h',
>       'rte_pause.h',
> +     'rte_power_intrinsics.h',
>       'rte_prefetch.h',
>       'rte_rwlock.h',
>       'rte_spinlock.h',
> diff --git a/lib/librte_eal/ppc/include/rte_power_intrinsics.h 
> b/lib/librte_eal/ppc/include/rte_power_intrinsics.h
> new file mode 100644
> index 0000000000..3bceefdc3f
> --- /dev/null
> +++ b/lib/librte_eal/ppc/include/rte_power_intrinsics.h
> @@ -0,0 +1,58 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2020 Intel Corporation
> + */
> +
> +#ifndef _RTE_POWER_INTRINSIC_PPC_H_
> +#define _RTE_POWER_INTRINSIC_PPC_H_
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <rte_common.h>
> +
> +#include "generic/rte_power_intrinsics.h"
> +
> +/**
> + * This function is not supported on PPC64.
> + */
> +static inline void rte_power_monitor(const volatile void *p,
> +             const uint64_t expected_value, const uint64_t value_mask,
> +             const uint64_t tsc_timestamp, const uint8_t data_sz)
> +{
> +     RTE_SET_USED(p);
> +     RTE_SET_USED(expected_value);
> +     RTE_SET_USED(value_mask);
> +     RTE_SET_USED(tsc_timestamp);
> +     RTE_SET_USED(data_sz);
> +}
> +
> +/**
> + * This function is not supported on PPC64.
> + */
> +static inline void rte_power_monitor_sync(const volatile void *p,
> +             const uint64_t expected_value, const uint64_t value_mask,
> +             const uint64_t tsc_timestamp, const uint8_t data_sz,
> +             rte_spinlock_t *lck)
> +{
> +     RTE_SET_USED(p);
> +     RTE_SET_USED(expected_value);
> +     RTE_SET_USED(value_mask);
> +     RTE_SET_USED(tsc_timestamp);
> +     RTE_SET_USED(lck);
> +     RTE_SET_USED(data_sz);
> +}
> +
> +/**
> + * This function is not supported on PPC64.
> + */
> +static inline void rte_power_pause(const uint64_t tsc_timestamp)
> +{
> +     RTE_SET_USED(tsc_timestamp);
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_POWER_INTRINSIC_PPC_H_ */
> diff --git a/lib/librte_eal/x86/include/meson.build 
> b/lib/librte_eal/x86/include/meson.build
> index f0e998c2fe..494a8142a2 100644
> --- a/lib/librte_eal/x86/include/meson.build
> +++ b/lib/librte_eal/x86/include/meson.build
> @@ -13,6 +13,7 @@ arch_headers = files(
>       'rte_io.h',
>       'rte_memcpy.h',
>       'rte_prefetch.h',
> +     'rte_power_intrinsics.h',
>       'rte_pause.h',
>       'rte_rtm.h',
>       'rte_rwlock.h',
> diff --git a/lib/librte_eal/x86/include/rte_power_intrinsics.h 
> b/lib/librte_eal/x86/include/rte_power_intrinsics.h
> new file mode 100644
> index 0000000000..9ac8e6eef6
> --- /dev/null
> +++ b/lib/librte_eal/x86/include/rte_power_intrinsics.h
> @@ -0,0 +1,132 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2020 Intel Corporation
> + */
> +
> +#ifndef _RTE_POWER_INTRINSIC_X86_64_H_
> +#define _RTE_POWER_INTRINSIC_X86_64_H_

Why '_64_H'?
My understanding was these ops are supported 32-bit mode too. 

> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +#include <rte_common.h>
> +
> +#include "generic/rte_power_intrinsics.h"
> +
> +static inline uint64_t __get_umwait_val(const volatile void *p,
> +             const uint8_t sz)
> +{
> +     switch (sz) {
> +     case 1:

Just as a nit:
case sizeof(type_x):
        return *(const volatile type_x *)p;

> +             return *(const volatile uint8_t *)p;
> +     case 2:
> +             return *(const volatile uint16_t *)p;
> +     case 4:
> +             return *(const volatile uint32_t *)p;
> +     case 8:
> +             return *(const volatile uint64_t *)p;
> +     default:
> +             /* this is an intrinsic, so we can't have any error handling */

RTE_ASSERT(0); ?

> +             return 0;
> +     }
> +}
> +
> +/**
> + * This function uses UMONITOR/UMWAIT instructions and will enter C0.2 state.
> + * For more information about usage of these instructions, please refer to
> + * Intel(R) 64 and IA-32 Architectures Software Developer's Manual.
> + */
> +static inline void rte_power_monitor(const volatile void *p,
> +             const uint64_t expected_value, const uint64_t value_mask,
> +             const uint64_t tsc_timestamp, const uint8_t data_sz)
> +{
> +     const uint32_t tsc_l = (uint32_t)tsc_timestamp;
> +     const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);
> +     /*
> +      * we're using raw byte codes for now as only the newest compiler
> +      * versions support this instruction natively.
> +      */
> +
> +     /* set address for UMONITOR */
> +     asm volatile(".byte 0xf3, 0x0f, 0xae, 0xf7;"
> +                     :
> +                     : "D"(p));
> +
> +     if (value_mask) {
> +             const uint64_t cur_value = __get_umwait_val(p, data_sz);
> +             const uint64_t masked = cur_value & value_mask;
> +
> +             /* if the masked value is already matching, abort */
> +             if (masked == expected_value)
> +                     return;
> +     }
> +     /* execute UMWAIT */
> +     asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
> +                     : /* ignore rflags */
> +                     : "D"(0), /* enter C0.2 */
> +                       "a"(tsc_l), "d"(tsc_h));
> +}
> +
> +/**
> + * This function uses UMONITOR/UMWAIT instructions and will enter C0.2 state.
> + * For more information about usage of these instructions, please refer to
> + * Intel(R) 64 and IA-32 Architectures Software Developer's Manual.
> + */
> +static inline void rte_power_monitor_sync(const volatile void *p,
> +             const uint64_t expected_value, const uint64_t value_mask,
> +             const uint64_t tsc_timestamp, const uint8_t data_sz,
> +             rte_spinlock_t *lck)
> +{
> +     const uint32_t tsc_l = (uint32_t)tsc_timestamp;
> +     const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);
> +     /*
> +      * we're using raw byte codes for now as only the newest compiler
> +      * versions support this instruction natively.
> +      */
> +
> +     /* set address for UMONITOR */
> +     asm volatile(".byte 0xf3, 0x0f, 0xae, 0xf7;"
> +                     :
> +                     : "D"(p));
> +
> +     if (value_mask) {
> +             const uint64_t cur_value = __get_umwait_val(p, data_sz);
> +             const uint64_t masked = cur_value & value_mask;
> +
> +             /* if the masked value is already matching, abort */
> +             if (masked == expected_value)
> +                     return;
> +     }
> +     rte_spinlock_unlock(lck);
> +
> +     /* execute UMWAIT */
> +     asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;"
> +                     : /* ignore rflags */
> +                     : "D"(0), /* enter C0.2 */
> +                       "a"(tsc_l), "d"(tsc_h));
> +
> +     rte_spinlock_lock(lck);
> +}
> +
> +/**
> + * This function uses TPAUSE instruction  and will enter C0.2 state. For more
> + * information about usage of this instruction, please refer to Intel(R) 64 
> and
> + * IA-32 Architectures Software Developer's Manual.
> + */
> +static inline void rte_power_pause(const uint64_t tsc_timestamp)
> +{
> +     const uint32_t tsc_l = (uint32_t)tsc_timestamp;
> +     const uint32_t tsc_h = (uint32_t)(tsc_timestamp >> 32);
> +
> +     /* execute TPAUSE */
> +     asm volatile(".byte 0x66, 0x0f, 0xae, 0xf7;"
> +             : /* ignore rflags */
> +             : "D"(0), /* enter C0.2 */
> +               "a"(tsc_l), "d"(tsc_h));
> +}
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif /* _RTE_POWER_INTRINSIC_X86_64_H_ */
> --
> 2.17.1

Reply via email to