<snip>
Add minimum burst throughout the scheduler pipeline and a flush counter.
Replace ring API calls with a local single-threaded implementation where
possible.
Signed-off-by: Radu Nicolau <radu.nico...@intel.com>
Thanks for the patch, a few comments inline.
---
drivers/event/sw/sw_evdev.h | 11 +++-
drivers/event/sw/sw_evdev_scheduler.c | 83 +++++++++++++++++++++++----
2 files changed, 81 insertions(+), 13 deletions(-)
diff --git a/drivers/event/sw/sw_evdev.h b/drivers/event/sw/sw_evdev.h
index 7c77b2495..95e51065f 100644
--- a/drivers/event/sw/sw_evdev.h
+++ b/drivers/event/sw/sw_evdev.h
@@ -29,7 +29,13 @@
 /* report dequeue burst sizes in buckets */
 #define SW_DEQ_STAT_BUCKET_SHIFT 2
 /* how many packets pulled from port by sched */
-#define SCHED_DEQUEUE_BURST_SIZE 32
+#define SCHED_DEQUEUE_BURST_SIZE 64
+
+#define SCHED_MIN_BURST_SIZE 8
+#define SCHED_NO_ENQ_CYCLE_FLUSH 256
+/* set SCHED_DEQUEUE_BURST_SIZE to 64 or 128 when setting this to 1 */
+#define SCHED_REFILL_ONCE_PER_CALL 1
Is it possible to make the above #define a runtime option?
E.g., --vdev event_sw,refill_iter=1
That would allow packaged versions of DPDK to be usable in both modes.
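A minimal sketch of how such a runtime key could be wired up with the
rte_kvargs API that vdev drivers already use for their devargs; the key
name follows the suggestion above, and the handler/function names here
are hypothetical, not part of the patch:

#include <stdlib.h>
#include <rte_common.h>
#include <rte_kvargs.h>

#define REFILL_ITER_ARG "refill_iter"

/* devarg handler: "refill_iter=1" enables the once-per-call refill */
static int
set_refill_iter(const char *key __rte_unused, const char *value,
		void *opaque)
{
	int *refill_once = opaque;
	*refill_once = atoi(value) != 0;
	return 0;
}

/* called from the vdev probe path with the devargs string,
 * e.g. --vdev event_sw0,refill_iter=1 */
static int
parse_refill_iter(const char *params, int *refill_once)
{
	static const char * const keys[] = { REFILL_ITER_ARG, NULL };
	struct rte_kvargs *kvlist = rte_kvargs_parse(params, keys);

	if (kvlist == NULL)
		return -1;
	rte_kvargs_process(kvlist, REFILL_ITER_ARG, set_refill_iter,
			refill_once);
	rte_kvargs_free(kvlist);
	return 0;
}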
+
 #define SW_PORT_HIST_LIST (MAX_SW_PROD_Q_DEPTH) /* size of our history list */
 #define NUM_SAMPLES 64 /* how many data points use for average stats */
@@ -214,6 +220,9 @@ struct sw_evdev {
 	uint32_t xstats_count_mode_port;
 	uint32_t xstats_count_mode_queue;
+	uint16_t sched_flush_count;
+	uint16_t sched_min_burst;
+
 	/* Contains all ports - load balanced and directed */
 	struct sw_port ports[SW_PORTS_MAX] __rte_cache_aligned;
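For context on the two new fields: based only on the commit message, a
self-contained sketch of how a minimum burst size and a flush counter
typically cooperate in a scheduler loop; the struct and function names
are illustrative, not the patch's:

#include <stdint.h>

#define SCHED_MIN_BURST_SIZE 8
#define SCHED_NO_ENQ_CYCLE_FLUSH 256

struct sched_state {
	uint16_t sched_flush_count;
	uint16_t sched_min_burst;
};

/* Return non-zero when buffered events should be processed now:
 * either the burst is big enough to amortize per-burst costs, or
 * enough idle iterations have passed that holding events back any
 * longer would add latency under low offered load. */
static inline int
sched_burst_ready(struct sched_state *s, uint32_t buffered)
{
	if (buffered >= s->sched_min_burst) {
		s->sched_flush_count = 0;
		return 1;
	}
	if (++s->sched_flush_count >= SCHED_NO_ENQ_CYCLE_FLUSH) {
		s->sched_flush_count = 0;
		return 1; /* forced flush: don't strand small bursts */
	}
	return 0;
}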
diff --git a/drivers/event/sw/sw_evdev_scheduler.c b/drivers/event/sw/sw_evdev_scheduler.c
index cff747da8..ca6d1caff 100644
--- a/drivers/event/sw/sw_evdev_scheduler.c
+++ b/drivers/event/sw/sw_evdev_scheduler.c
@@ -26,6 +26,29 @@
 /* use cheap bit mixing, we only need to lose a few bits */
 #define SW_HASH_FLOWID(f) (((f) ^ (f >> 10)) & FLOWID_MASK)
+
+/* single object enq and deq for non-MT ring */
+static __rte_always_inline void
+sw_nonmt_ring_dequeue(struct rte_ring *r, void **obj)
+{
+	if ((r->prod.tail - r->cons.tail) < 1)
+		return;
+	void **ring = (void **)&r[1];
+	*obj = ring[r->cons.tail & r->mask];
+	r->cons.tail++;
+}
+static __rte_always_inline int
+sw_nonmt_ring_enqueue(struct rte_ring *r, void *obj)
+{
+	if ((r->capacity + r->cons.tail - r->prod.tail) < 1)
+		return 0;
+	void **ring = (void **)&r[1];
+	ring[r->prod.tail & r->mask] = obj;
+	r->prod.tail++;
+	return 1;
+}
+
Why not make these APIs part of the rte_ring library? You could further
optimize them by keeping the indices on the same cacheline.
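To illustrate the cacheline point: struct rte_ring keeps the producer
and consumer index pairs on separate cachelines to avoid false sharing
between threads, so a single-threaded operation touches two lines where
one would do. A hypothetical dedicated layout (names invented for
illustration, not an existing DPDK type):

#include <stdint.h>

/* Hypothetical single-threaded ring header: both indices share one
 * cacheline, so an enqueue or dequeue touches a single line for all
 * of its bookkeeping. */
struct st_ring {
	uint32_t prod_tail;
	uint32_t cons_tail;
	uint32_t mask;       /* size - 1, size is a power of two */
	uint32_t capacity;
	void *slots[];       /* object storage follows the header */
};

static inline int
st_ring_enqueue(struct st_ring *r, void *obj)
{
	if ((r->capacity + r->cons_tail - r->prod_tail) < 1)
		return 0; /* full */
	r->slots[r->prod_tail & r->mask] = obj;
	r->prod_tail++;
	return 1;
}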
I'm not sure there is any need for non-thread-safe rings outside this
particular case.
[Honnappa] I think if we add the APIs, we will find the use cases.
But, more than that, I understand that the rte_ring structure is exposed to
the application; the reason for doing that is the inline functions that
rte_ring provides. IMO, we should still maintain modularity and should not
use the internals of the rte_ring structure outside of the library.
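For comparison, the modularity-preserving route would be the existing
public single-producer/single-consumer calls, which keep the same call
shape without touching rte_ring internals, at the cost of the full
head/tail update protocol the patch is avoiding; a minimal sketch:

#include <rte_common.h>
#include <rte_ring.h>

/* Same call shape as the patch's helpers, but via the public SP/SC
 * API: rte_ring_sp_enqueue()/rte_ring_sc_dequeue() return 0 on
 * success and a negative errno when the ring is full/empty. */
static __rte_always_inline int
sw_ring_enqueue(struct rte_ring *r, void *obj)
{
	return rte_ring_sp_enqueue(r, obj) == 0;
}

static __rte_always_inline void
sw_ring_dequeue(struct rte_ring *r, void **obj)
{
	if (rte_ring_sc_dequeue(r, obj) != 0)
		*obj = NULL; /* empty: leave caller's pointer defined */
}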
+1 to that.
BTW, is there any real perf benefit from such micro-optimisation?