From: Pavan Nikhilesh <pbhagavat...@marvell.com> Use a WFE/LDP loop while polling for GETWORK completion to improve power savings. This is disabled by default and can be enabled by configuring meson with -Dc_args='-DRTE_ARM_USE_WFE'.
Signed-off-by: Pavan Nikhilesh <pbhagavat...@marvell.com> --- doc/guides/eventdevs/cnxk.rst | 9 ++++++ drivers/event/cnxk/cn10k_worker.h | 52 +++++++++++++++++++++++++------ 2 files changed, 52 insertions(+), 9 deletions(-) diff --git a/doc/guides/eventdevs/cnxk.rst b/doc/guides/eventdevs/cnxk.rst index cccb8a0304..04f5b5025b 100644 --- a/doc/guides/eventdevs/cnxk.rst +++ b/doc/guides/eventdevs/cnxk.rst @@ -198,6 +198,15 @@ Runtime Config Options -a 0002:0e:00.0,tim_eclk_freq=122880000-1000000000-0 +Power Savings on CN10K +---------------------- + +ARM cores can additionally use WFE when polling for transactions on SSO bus +to save power i.e., in the event dequeue call ARM core can enter WFE and exit +when either work has been scheduled or dequeue timeout has reached. +This can be enabled by configuring meson with the following option +``-Dc_args='-DRTE_ARM_USE_WFE'``. + Debugging Options ----------------- diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h index 8aa916fa12..92d5190842 100644 --- a/drivers/event/cnxk/cn10k_worker.h +++ b/drivers/event/cnxk/cn10k_worker.h @@ -250,23 +250,57 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev, gw.get_work = ws->gw_wdata; #if defined(RTE_ARCH_ARM64) -#if !defined(__clang__) - asm volatile( - PLT_CPU_FEATURE_PREAMBLE - "caspal %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n" - : [wdata] "+r"(gw.get_work) - : [gw_loc] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0) - : "memory"); -#else +#if defined(__clang__) register uint64_t x0 __asm("x0") = (uint64_t)gw.u64[0]; register uint64_t x1 __asm("x1") = (uint64_t)gw.u64[1]; +#if defined(RTE_ARM_USE_WFE) + plt_write64(gw.u64[0], ws->base + SSOW_LF_GWS_OP_GET_WORK0); + asm volatile(PLT_CPU_FEATURE_PREAMBLE + " ldp %[x0], %[x1], [%[tag_loc]] \n" + " tbz %[x0], %[pend_gw], done%= \n" + " sevl \n" + "rty%=: wfe \n" + " ldp %[x0], %[x1], [%[tag_loc]] \n" + " tbnz %[x0], %[pend_gw], rty%= \n" + "done%=: \n" + " dmb ld \n" + : 
[x0] "+r" (x0), [x1] "+r" (x1) + : [tag_loc] "r"(ws->base + SSOW_LF_GWS_WQE0), + [pend_gw] "i"(SSOW_LF_GWS_TAG_PEND_GET_WORK_BIT) + : "memory"); +#else asm volatile(".arch armv8-a+lse\n" "caspal %[x0], %[x1], %[x0], %[x1], [%[dst]]\n" - : [x0] "+r"(x0), [x1] "+r"(x1) + : [x0] "+r" (x0), [x1] "+r" (x1) : [dst] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0) : "memory"); +#endif gw.u64[0] = x0; gw.u64[1] = x1; +#else +#if defined(RTE_ARM_USE_WFE) + plt_write64(gw.u64[0], ws->base + SSOW_LF_GWS_OP_GET_WORK0); + asm volatile(PLT_CPU_FEATURE_PREAMBLE + " ldp %[wdata], %H[wdata], [%[tag_loc]] \n" + " tbz %[wdata], %[pend_gw], done%= \n" + " sevl \n" + "rty%=: wfe \n" + " ldp %[wdata], %H[wdata], [%[tag_loc]] \n" + " tbnz %[wdata], %[pend_gw], rty%= \n" + "done%=: \n" + " dmb ld \n" + : [wdata] "=&r"(gw.get_work) + : [tag_loc] "r"(ws->base + SSOW_LF_GWS_WQE0), + [pend_gw] "i"(SSOW_LF_GWS_TAG_PEND_GET_WORK_BIT) + : "memory"); +#else + asm volatile( + PLT_CPU_FEATURE_PREAMBLE + "caspal %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n" + : [wdata] "+r"(gw.get_work) + : [gw_loc] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0) + : "memory"); +#endif #endif #else plt_write64(gw.u64[0], ws->base + SSOW_LF_GWS_OP_GET_WORK0); -- 2.25.1