From: Pavan Nikhilesh <pbhagavat...@marvell.com>

Use a WFE LDP loop while polling for GETWORK completion for better
power savings.
This is disabled by default and can be enabled by configuring meson with
-Dc_args='-DRTE_ARM_USE_WFE'.

Signed-off-by: Pavan Nikhilesh <pbhagavat...@marvell.com>
---
 doc/guides/eventdevs/cnxk.rst     |  9 ++++++
 drivers/event/cnxk/cn10k_worker.h | 52 +++++++++++++++++++++++++------
 2 files changed, 52 insertions(+), 9 deletions(-)

diff --git a/doc/guides/eventdevs/cnxk.rst b/doc/guides/eventdevs/cnxk.rst
index cccb8a0304..04f5b5025b 100644
--- a/doc/guides/eventdevs/cnxk.rst
+++ b/doc/guides/eventdevs/cnxk.rst
@@ -198,6 +198,15 @@ Runtime Config Options
 
     -a 0002:0e:00.0,tim_eclk_freq=122880000-1000000000-0
 
+Power Savings on CN10K
+----------------------
+
+ARM cores can additionally use WFE when polling for transactions on the SSO
+bus to save power, i.e., in the event dequeue call the ARM core can enter WFE
+and exit when either work has been scheduled or the dequeue timeout has been
+reached. This can be enabled by configuring meson with the following option:
+``-Dc_args='-DRTE_ARM_USE_WFE'``.
+
 Debugging Options
 -----------------
 
diff --git a/drivers/event/cnxk/cn10k_worker.h b/drivers/event/cnxk/cn10k_worker.h
index 8aa916fa12..92d5190842 100644
--- a/drivers/event/cnxk/cn10k_worker.h
+++ b/drivers/event/cnxk/cn10k_worker.h
@@ -250,23 +250,57 @@ cn10k_sso_hws_get_work(struct cn10k_sso_hws *ws, struct rte_event *ev,
 
        gw.get_work = ws->gw_wdata;
 #if defined(RTE_ARCH_ARM64)
-#if !defined(__clang__)
-       asm volatile(
-               PLT_CPU_FEATURE_PREAMBLE
-               "caspal %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
-               : [wdata] "+r"(gw.get_work)
-               : [gw_loc] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0)
-               : "memory");
-#else
+#if defined(__clang__)
        register uint64_t x0 __asm("x0") = (uint64_t)gw.u64[0];
        register uint64_t x1 __asm("x1") = (uint64_t)gw.u64[1];
+#if defined(RTE_ARM_USE_WFE)
+       plt_write64(gw.u64[0], ws->base + SSOW_LF_GWS_OP_GET_WORK0);
+       asm volatile(PLT_CPU_FEATURE_PREAMBLE
+                    "          ldp %[x0], %[x1], [%[tag_loc]]  \n"
+                    "          tbz %[x0], %[pend_gw], done%=   \n"
+                    "          sevl                                    \n"
+                    "rty%=:    wfe                                     \n"
+                    "          ldp %[x0], %[x1], [%[tag_loc]]  \n"
+                    "          tbnz %[x0], %[pend_gw], rty%=   \n"
+                    "done%=:                                           \n"
+                    "          dmb ld                                  \n"
+                    : [x0] "+r" (x0), [x1] "+r" (x1)
+                    : [tag_loc] "r"(ws->base + SSOW_LF_GWS_WQE0),
+                      [pend_gw] "i"(SSOW_LF_GWS_TAG_PEND_GET_WORK_BIT)
+                    : "memory");
+#else
        asm volatile(".arch armv8-a+lse\n"
                     "caspal %[x0], %[x1], %[x0], %[x1], [%[dst]]\n"
-                    : [x0] "+r"(x0), [x1] "+r"(x1)
+                    : [x0] "+r" (x0), [x1] "+r" (x1)
                     : [dst] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0)
                     : "memory");
+#endif
        gw.u64[0] = x0;
        gw.u64[1] = x1;
+#else
+#if defined(RTE_ARM_USE_WFE)
+       plt_write64(gw.u64[0], ws->base + SSOW_LF_GWS_OP_GET_WORK0);
+       asm volatile(PLT_CPU_FEATURE_PREAMBLE
+                    "          ldp %[wdata], %H[wdata], [%[tag_loc]]   \n"
+                    "          tbz %[wdata], %[pend_gw], done%=        \n"
+                    "          sevl                                    \n"
+                    "rty%=:    wfe                                     \n"
+                    "          ldp %[wdata], %H[wdata], [%[tag_loc]]   \n"
+                    "          tbnz %[wdata], %[pend_gw], rty%=        \n"
+                    "done%=:                                           \n"
+                    "          dmb ld                                  \n"
+                    : [wdata] "=&r"(gw.get_work)
+                    : [tag_loc] "r"(ws->base + SSOW_LF_GWS_WQE0),
+                      [pend_gw] "i"(SSOW_LF_GWS_TAG_PEND_GET_WORK_BIT)
+                    : "memory");
+#else
+       asm volatile(
+               PLT_CPU_FEATURE_PREAMBLE
+               "caspal %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
+               : [wdata] "+r"(gw.get_work)
+               : [gw_loc] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0)
+               : "memory");
+#endif
 #endif
 #else
        plt_write64(gw.u64[0], ws->base + SSOW_LF_GWS_OP_GET_WORK0);
-- 
2.25.1

Reply via email to