From: Pavan Nikhilesh <pbhagavat...@marvell.com>

Add CN20K SSO GWS event dequeue fastpath functions.

Signed-off-by: Pavan Nikhilesh <pbhagavat...@marvell.com>
---
 drivers/event/cnxk/cn20k_eventdev.c |   5 +
 drivers/event/cnxk/cn20k_worker.c   |  54 +++++++++++
 drivers/event/cnxk/cn20k_worker.h   | 137 +++++++++++++++++++++++++++-
 3 files changed, 195 insertions(+), 1 deletion(-)

diff --git a/drivers/event/cnxk/cn20k_eventdev.c b/drivers/event/cnxk/cn20k_eventdev.c
index a5dd03de6e..d1668a00c1 100644
--- a/drivers/event/cnxk/cn20k_eventdev.c
+++ b/drivers/event/cnxk/cn20k_eventdev.c
@@ -114,11 +114,16 @@ static void
 cn20k_sso_fp_fns_set(struct rte_eventdev *event_dev)
 {
 #if defined(RTE_ARCH_ARM64)
+       struct cnxk_sso_evdev *dev = cnxk_sso_pmd_priv(event_dev);
 
        event_dev->enqueue_burst = cn20k_sso_hws_enq_burst;
        event_dev->enqueue_new_burst = cn20k_sso_hws_enq_new_burst;
        event_dev->enqueue_forward_burst = cn20k_sso_hws_enq_fwd_burst;
 
+       event_dev->dequeue_burst = cn20k_sso_hws_deq_burst;
+       if (dev->deq_tmo_ns)
+               event_dev->dequeue_burst = cn20k_sso_hws_tmo_deq_burst;
+
 #else
        RTE_SET_USED(event_dev);
 #endif
diff --git a/drivers/event/cnxk/cn20k_worker.c b/drivers/event/cnxk/cn20k_worker.c
index c7de493681..2dcde0b444 100644
--- a/drivers/event/cnxk/cn20k_worker.c
+++ b/drivers/event/cnxk/cn20k_worker.c
@@ -382,3 +382,57 @@ cn20k_sso_hws_enq_fwd_burst(void *port, const struct rte_event ev[], uint16_t nb
 
        return 1;
 }
+
+uint16_t __rte_hot /* One get-work attempt, no retry; returns 0 or 1. */
+cn20k_sso_hws_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
+{
+       struct cn20k_sso_hws *ws = port;
+
+       RTE_SET_USED(timeout_ticks); /* the timeout is ignored by this variant */
+
+       if (ws->swtag_req) { /* flag is consumed here; where it gets set is not visible in this patch */
+               ws->swtag_req = 0;
+               cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0); /* presumably waits for the tag switch to finish - helper defined elsewhere */
+               return 1; /* reports one event without writing *ev */
+       }
+
+       return cn20k_sso_hws_get_work(ws, ev, 0); /* flags = 0 */
+}
+
+uint16_t __rte_hot /* Burst entry point that forwards to the single-event dequeue. */
+cn20k_sso_hws_deq_burst(void *port, struct rte_event ev[], uint16_t nb_events,
+                       uint64_t timeout_ticks)
+{
+       RTE_SET_USED(nb_events); /* burst size is ignored: at most one event is returned per call */
+
+       return cn20k_sso_hws_deq(port, ev, timeout_ticks); /* only ev[0] can be written */
+}
+
+uint16_t __rte_hot /* Dequeue with retry: repeats get-work until it yields 1 or the attempt budget is spent. */
+cn20k_sso_hws_tmo_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks)
+{
+       struct cn20k_sso_hws *ws = port;
+       uint16_t ret = 1;
+       uint64_t iter;
+
+       if (ws->swtag_req) { /* flag is consumed here; where it gets set is not visible in this patch */
+               ws->swtag_req = 0;
+               cnxk_sso_hws_swtag_wait(ws->base + SSOW_LF_GWS_WQE0);
+               return ret; /* ret is still 1 here; *ev is not written */
+       }
+
+       ret = cn20k_sso_hws_get_work(ws, ev, 0); /* first attempt always runs, even when timeout_ticks is 0 */
+       for (iter = 1; iter < timeout_ticks && (ret == 0); iter++) /* timeout_ticks is used as a cap on total attempts */
+               ret = cn20k_sso_hws_get_work(ws, ev, 0);
+
+       return ret; /* 1 if any attempt found work, otherwise 0 */
+}
+
+uint16_t __rte_hot /* Burst entry point that forwards to the retrying single-event dequeue. */
+cn20k_sso_hws_tmo_deq_burst(void *port, struct rte_event ev[], uint16_t nb_events,
+                           uint64_t timeout_ticks)
+{
+       RTE_SET_USED(nb_events); /* burst size is ignored: at most one event is returned per call */
+
+       return cn20k_sso_hws_tmo_deq(port, ev, timeout_ticks); /* only ev[0] can be written */
+}
diff --git a/drivers/event/cnxk/cn20k_worker.h b/drivers/event/cnxk/cn20k_worker.h
index 5ff8f11b38..8dc60a06ec 100644
--- a/drivers/event/cnxk/cn20k_worker.h
+++ b/drivers/event/cnxk/cn20k_worker.h
@@ -7,8 +7,136 @@
 
 #include <rte_eventdev.h>
 
-#include "cnxk_worker.h"
 #include "cn20k_eventdev.h"
+#include "cnxk_worker.h"
+
+static __rte_always_inline void /* Rewrites u64[0] in place; presumably maps the hardware tag word onto the rte_event layout - confirm against rte_eventdev.h. */
+cn20k_sso_hws_post_process(struct cn20k_sso_hws *ws, uint64_t *u64, const uint32_t flags)
+{
+       RTE_SET_USED(ws);
+       RTE_SET_USED(flags); /* neither ws nor flags is used here; u64[1] is left untouched */
+
+       u64[0] = (u64[0] & (0x3ull << 32)) << 6 | (u64[0] & (0x3FFull << 36)) << 4 |
+                (u64[0] & 0xffffffff); /* bits 33:32 -> 39:38, bits 45:36 -> 49:40, bits 31:0 kept, all other bits cleared */
+}
+
+static __rte_always_inline uint16_t /* Requests work using ws->gw_wdata and fills *ev; returns 1 when the second result word is non-zero, else 0. */
+cn20k_sso_hws_get_work(struct cn20k_sso_hws *ws, struct rte_event *ev, const uint32_t flags)
+{
+       union {
+               __uint128_t get_work;
+               uint64_t u64[2];
+       } gw; /* one 128-bit value viewed as two 64-bit words: u64[0] -> ev->event, u64[1] -> ev->u64 */
+
+       gw.get_work = ws->gw_wdata; /* request data handed to the GET_WORK operation */
+#if defined(RTE_ARCH_ARM64)
+#if defined(__clang__) /* clang variant: pins the pair to x0/x1 instead of using the %H operand modifier */
+       register uint64_t x0 __asm("x0") = (uint64_t)gw.u64[0];
+       register uint64_t x1 __asm("x1") = (uint64_t)gw.u64[1];
+#if defined(RTE_ARM_USE_WFE) /* write the request, then wfe-wait while the pending bit is set in the first word read from WQE0 */
+       plt_write64(gw.u64[0], ws->base + SSOW_LF_GWS_OP_GET_WORK0);
+       asm volatile(PLT_CPU_FEATURE_PREAMBLE
+                    "          ldp %[x0], %[x1], [%[tag_loc]]  \n"
+                    "          tbz %[x0], %[pend_gw], done%=   \n"
+                    "          sevl                                    \n"
+                    "rty%=:    wfe                                     \n"
+                    "          ldp %[x0], %[x1], [%[tag_loc]]  \n"
+                    "          tbnz %[x0], %[pend_gw], rty%=   \n"
+                    "done%=:                                           \n"
+                    "          dmb ld                                  \n"
+                    : [x0] "+r" (x0), [x1] "+r" (x1)
+                    : [tag_loc] "r"(ws->base + SSOW_LF_GWS_WQE0),
+                      [pend_gw] "i"(SSOW_LF_GWS_TAG_PEND_GET_WORK_BIT)
+                    : "memory");
+#else /* no WFE: a single caspal on GET_WORK0 passes x0/x1 in and leaves the result in x0/x1 */
+       asm volatile(".arch armv8-a+lse\n"
+                    "caspal %[x0], %[x1], %[x0], %[x1], [%[dst]]\n"
+                    : [x0] "+r" (x0), [x1] "+r" (x1)
+                    : [dst] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0)
+                    : "memory");
+#endif
+       gw.u64[0] = x0;
+       gw.u64[1] = x1;
+#else /* non-clang variant: same two strategies, operating on the 128-bit operand directly */
+#if defined(RTE_ARM_USE_WFE)
+       plt_write64(gw.u64[0], ws->base + SSOW_LF_GWS_OP_GET_WORK0);
+       asm volatile(PLT_CPU_FEATURE_PREAMBLE
+                    "          ldp %[wdata], %H[wdata], [%[tag_loc]]   \n"
+                    "          tbz %[wdata], %[pend_gw], done%=        \n"
+                    "          sevl                                    \n"
+                    "rty%=:    wfe                                     \n"
+                    "          ldp %[wdata], %H[wdata], [%[tag_loc]]   \n"
+                    "          tbnz %[wdata], %[pend_gw], rty%=        \n"
+                    "done%=:                                           \n"
+                    "          dmb ld                                  \n"
+                    : [wdata] "=&r"(gw.get_work)
+                    : [tag_loc] "r"(ws->base + SSOW_LF_GWS_WQE0),
+                      [pend_gw] "i"(SSOW_LF_GWS_TAG_PEND_GET_WORK_BIT)
+                    : "memory");
+#else
+       asm volatile(PLT_CPU_FEATURE_PREAMBLE
+                    "caspal %[wdata], %H[wdata], %[wdata], %H[wdata], [%[gw_loc]]\n"
+                    : [wdata] "+r"(gw.get_work)
+                    : [gw_loc] "r"(ws->base + SSOW_LF_GWS_OP_GET_WORK0)
+                    : "memory");
+#endif
+#endif
+#else /* non-ARM64 fallback: write the request, then poll WQE0 until bit 63 of the first word clears */
+       plt_write64(gw.u64[0], ws->base + SSOW_LF_GWS_OP_GET_WORK0);
+       do {
+               roc_load_pair(gw.u64[0], gw.u64[1], ws->base + SSOW_LF_GWS_WQE0);
+       } while (gw.u64[0] & BIT_ULL(63));
+       rte_atomic_thread_fence(rte_memory_order_seq_cst);
+#endif
+       ws->gw_rdata = gw.u64[0]; /* keep the raw first word before post-processing rewrites it */
+       if (gw.u64[1]) /* post-process only when the second word is non-zero */
+               cn20k_sso_hws_post_process(ws, gw.u64, flags);
+
+       ev->event = gw.u64[0]; /* *ev is written even when 0 is returned */
+       ev->u64 = gw.u64[1];
+
+       return !!gw.u64[1];
+}
+
+/* Used in cleaning up workslot. */
+static __rte_always_inline uint16_t /* Wait-and-read only: no GET_WORK request is written here. Returns 1 when the second word is non-zero, else 0. */
+cn20k_sso_hws_get_work_empty(struct cn20k_sso_hws *ws, struct rte_event *ev, const uint32_t flags)
+{
+       union {
+               __uint128_t get_work;
+               uint64_t u64[2];
+       } gw; /* both words are filled by the load from WQE0 below */
+
+#ifdef RTE_ARCH_ARM64 /* load the pair from WQE0 and wfe-wait while bit 63 of the first word is set */
+       asm volatile(PLT_CPU_FEATURE_PREAMBLE
+                    "          ldp %[tag], %[wqp], [%[tag_loc]]        \n"
+                    "          tbz %[tag], 63, .Ldone%=                \n"
+                    "          sevl                                    \n"
+                    ".Lrty%=:  wfe                                     \n"
+                    "          ldp %[tag], %[wqp], [%[tag_loc]]        \n"
+                    "          tbnz %[tag], 63, .Lrty%=                \n"
+                    ".Ldone%=: dmb ld                                  \n"
+                    : [tag] "=&r"(gw.u64[0]), [wqp] "=&r"(gw.u64[1])
+                    : [tag_loc] "r"(ws->base + SSOW_LF_GWS_WQE0)
+                    : "memory");
+#else /* portable fallback: poll WQE0 until bit 63 of the first word clears */
+       do {
+               roc_load_pair(gw.u64[0], gw.u64[1], ws->base + SSOW_LF_GWS_WQE0);
+       } while (gw.u64[0] & BIT_ULL(63));
+#endif
+
+       ws->gw_rdata = gw.u64[0]; /* keep the raw first word before it is rewritten */
+       if (gw.u64[1])
+               cn20k_sso_hws_post_process(ws, gw.u64, flags);
+       else /* second word is zero: apply the same bit rearrangement as cn20k_sso_hws_post_process() inline */
+               gw.u64[0] = (gw.u64[0] & (0x3ull << 32)) << 6 |
+                           (gw.u64[0] & (0x3FFull << 36)) << 4 | (gw.u64[0] & 0xffffffff);
+
+       ev->event = gw.u64[0]; /* *ev is written even when 0 is returned */
+       ev->u64 = gw.u64[1];
+
+       return !!gw.u64[1];
+}
 
 /* CN20K Fastpath functions. */
 uint16_t __rte_hot cn20k_sso_hws_enq_burst(void *port, const struct rte_event ev[],
@@ -18,4 +146,11 @@ uint16_t __rte_hot cn20k_sso_hws_enq_new_burst(void *port, const struct rte_even
 uint16_t __rte_hot cn20k_sso_hws_enq_fwd_burst(void *port, const struct rte_event ev[],
                                               uint16_t nb_events);
 
+uint16_t __rte_hot cn20k_sso_hws_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks);
+uint16_t __rte_hot cn20k_sso_hws_deq_burst(void *port, struct rte_event ev[], uint16_t nb_events,
+                                          uint64_t timeout_ticks);
+uint16_t __rte_hot cn20k_sso_hws_tmo_deq(void *port, struct rte_event *ev, uint64_t timeout_ticks);
+uint16_t __rte_hot cn20k_sso_hws_tmo_deq_burst(void *port, struct rte_event ev[],
+                                              uint16_t nb_events, uint64_t timeout_ticks);
+
 #endif
-- 
2.25.1

Reply via email to