MWAITX allows EPYC processors to enter an implementation-dependent power/performance-optimized state (C1 state) for a specific period or until a store to the monitored address range.
Signed-off-by: Sivaprasad Tummala <sivaprasad.tumm...@amd.com> --- lib/eal/include/generic/rte_cpuflags.h | 2 + lib/eal/x86/include/rte_cpuflags.h | 1 + lib/eal/x86/rte_cpuflags.c | 3 + lib/eal/x86/rte_power_intrinsics.c | 80 +++++++++++++++++++++++++- lib/power/rte_power_pmd_mgmt.c | 3 +- 5 files changed, 86 insertions(+), 3 deletions(-) diff --git a/lib/eal/include/generic/rte_cpuflags.h b/lib/eal/include/generic/rte_cpuflags.h index d35551e931..db653a8dd7 100644 --- a/lib/eal/include/generic/rte_cpuflags.h +++ b/lib/eal/include/generic/rte_cpuflags.h @@ -26,6 +26,8 @@ struct rte_cpu_intrinsics { /**< indicates support for rte_power_pause function */ uint32_t power_monitor_multi : 1; /**< indicates support for rte_power_monitor_multi function */ + uint32_t amd_power_monitorx : 1; + /**< indicates amd support for rte_power_monitor function */ }; /** diff --git a/lib/eal/x86/include/rte_cpuflags.h b/lib/eal/x86/include/rte_cpuflags.h index 92e90fb6e0..5ccf4e7d98 100644 --- a/lib/eal/x86/include/rte_cpuflags.h +++ b/lib/eal/x86/include/rte_cpuflags.h @@ -102,6 +102,7 @@ enum rte_cpu_flag_t { /* (EAX 80000001h) ECX features */ RTE_CPUFLAG_LAHF_SAHF, /**< LAHF_SAHF */ RTE_CPUFLAG_LZCNT, /**< LZCNT */ + RTE_CPUFLAG_MONITORX, /**< MONITORX */ /* (EAX 80000001h) EDX features */ RTE_CPUFLAG_SYSCALL, /**< SYSCALL */ diff --git a/lib/eal/x86/rte_cpuflags.c b/lib/eal/x86/rte_cpuflags.c index d6b518251b..ae2e0a8470 100644 --- a/lib/eal/x86/rte_cpuflags.c +++ b/lib/eal/x86/rte_cpuflags.c @@ -133,6 +133,7 @@ const struct feature_entry rte_cpu_feature_table[] = { FEAT_DEF(LAHF_SAHF, 0x80000001, 0, RTE_REG_ECX, 0) FEAT_DEF(LZCNT, 0x80000001, 0, RTE_REG_ECX, 4) + FEAT_DEF(MONITORX, 0x80000001, 0, RTE_REG_ECX, 29) FEAT_DEF(SYSCALL, 0x80000001, 0, RTE_REG_EDX, 11) FEAT_DEF(XD, 0x80000001, 0, RTE_REG_EDX, 20) @@ -191,5 +192,7 @@ rte_cpu_get_intrinsics_support(struct rte_cpu_intrinsics *intrinsics) intrinsics->power_pause = 1; if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_RTM)) 
intrinsics->power_monitor_multi = 1; + } else if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_MONITORX)) { + intrinsics->amd_power_monitorx = 1; } } diff --git a/lib/eal/x86/rte_power_intrinsics.c b/lib/eal/x86/rte_power_intrinsics.c index f749da9b85..4e81870387 100644 --- a/lib/eal/x86/rte_power_intrinsics.c +++ b/lib/eal/x86/rte_power_intrinsics.c @@ -30,6 +30,7 @@ __umwait_wakeup(volatile void *addr) static bool wait_supported; static bool wait_multi_supported; +static bool amd_mwaitx_supported; static inline uint64_t __get_umwait_val(const volatile void *p, const uint8_t sz) @@ -65,6 +66,76 @@ __check_val_size(const uint8_t sz) } } +/** + * This function uses MONITORX/MWAITX instructions and will enter C1 state. + * For more information about usage of these instructions, please refer to + * AMD64 Architecture Programmer’s Manual. + */ +static inline int +amd_power_monitorx(const struct rte_power_monitor_cond *pmc, + const uint64_t tsc_timestamp) +{ + const unsigned int lcore_id = rte_lcore_id(); + struct power_wait_status *s; + uint64_t cur_value; + + RTE_SET_USED(tsc_timestamp); + + /* prevent non-EAL thread from using this API */ + if (lcore_id >= RTE_MAX_LCORE) + return -EINVAL; + + if (pmc == NULL) + return -EINVAL; + + if (__check_val_size(pmc->size) < 0) + return -EINVAL; + + if (pmc->fn == NULL) + return -EINVAL; + + s = &wait_status[lcore_id]; + + /* update sleep address */ + rte_spinlock_lock(&s->lock); + s->monitor_addr = pmc->addr; + + /* + * we're using raw byte codes for now as only the newest compiler + * versions support this instruction natively. 
+ */ + /* set address for MONITORX */ + asm volatile(".byte 0x0f, 0x01, 0xfa;" + : + : "a"(pmc->addr), + "c"(0), /* no extensions */ + "d"(0)); /* no hints */ + + /* now that we've put this address into monitor, we can unlock */ + rte_spinlock_unlock(&s->lock); + + cur_value = __get_umwait_val(pmc->addr, pmc->size); + + /* check if callback indicates we should abort */ + if (pmc->fn(cur_value, pmc->opaque) != 0) + goto end; + + /* execute MWAITX */ + asm volatile(".byte 0x0f, 0x01, 0xfb;" + : /* ignore rflags */ + : "a"(0), /* enter C1 */ + "c"(0), /* no time-out */ + "b"(0)); + +end: + /* erase sleep address */ + rte_spinlock_lock(&s->lock); + s->monitor_addr = NULL; + rte_spinlock_unlock(&s->lock); + + return 0; +} + /** * This function uses UMONITOR/UMWAIT instructions and will enter C0.2 state. * For more information about usage of these instructions, please refer to @@ -80,6 +151,9 @@ rte_power_monitor(const struct rte_power_monitor_cond *pmc, struct power_wait_status *s; uint64_t cur_value; + if (amd_mwaitx_supported) + return amd_power_monitorx(pmc, tsc_timestamp); + /* prevent user from running this instruction if it's not supported */ if (!wait_supported) return -ENOTSUP; @@ -126,7 +200,7 @@ rte_power_monitor(const struct rte_power_monitor_cond *pmc, asm volatile(".byte 0xf2, 0x0f, 0xae, 0xf7;" : /* ignore rflags */ : "D"(0), /* enter C0.2 */ - "a"(tsc_l), "d"(tsc_h)); + "a"(tsc_l), "d"(tsc_h)); end: /* erase sleep address */ @@ -170,6 +244,8 @@ RTE_INIT(rte_power_intrinsics_init) { wait_supported = 1; if (i.power_monitor_multi) wait_multi_supported = 1; + if (i.amd_power_monitorx) + amd_mwaitx_supported = 1; } int @@ -178,7 +254,7 @@ rte_power_monitor_wakeup(const unsigned int lcore_id) struct power_wait_status *s; /* prevent user from running this instruction if it's not supported */ - if (!wait_supported) + if (!wait_supported && !amd_mwaitx_supported) return -ENOTSUP; /* prevent buffer overrun */ diff --git a/lib/power/rte_power_pmd_mgmt.c 
b/lib/power/rte_power_pmd_mgmt.c index ca1840387c..43971e6014 100644 --- a/lib/power/rte_power_pmd_mgmt.c +++ b/lib/power/rte_power_pmd_mgmt.c @@ -447,7 +447,8 @@ check_monitor(struct pmd_core_cfg *cfg, const union queue *qdata) bool multimonitor_supported; /* check if rte_power_monitor is supported */ - if (!global_data.intrinsics_support.power_monitor) { + if ((!global_data.intrinsics_support.power_monitor) && + (!global_data.intrinsics_support.amd_power_monitorx)) { RTE_LOG(DEBUG, POWER, "Monitoring intrinsics are not supported\n"); return -ENOTSUP; } -- 2.34.1