Rearranging the code to prefetch the contents before the loop check
increases performance for single-stage and multi-stage atomic
pipelines.

Signed-off-by: Vipin Varghese <vipin.vargh...@intel.com>
---
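Note for reviewers: below is a minimal, self-contained sketch of the
load-hoisting pattern the patch applies. All names here (process_burst,
flow_entry, FLOW_HASH, struct event) are hypothetical stand-ins used for
illustration only, not the sw_evdev types; this is a sketch of the
technique, not the driver code.

/*
 * Load-hoisting sketch: the data needed by iteration 0 is fetched before
 * the loop, and the data for iteration i + 1 is fetched at the end of
 * iteration i, so those loads overlap with the remaining work of the
 * current iteration instead of stalling at the top of the next one.
 */
#include <stdint.h>

#define FLOW_HASH(f) ((f) & 0x3ff)	/* stand-in for SW_HASH_FLOWID() */

struct flow_entry {			/* stand-in for struct sw_fid_t */
	int cq;
};

struct event {				/* stand-in for struct rte_event */
	uint32_t flow_id;
};

static int
process_burst(struct flow_entry *fids, const struct event *evs,
		unsigned int count)
{
	unsigned int i;
	int work = 0;

	if (count == 0)
		return 0;

	/* Hoist the first iteration's dependent loads out of the loop. */
	uint16_t flow = FLOW_HASH(evs[0].flow_id);
	struct flow_entry *fid = &fids[flow];
	int cq = fid->cq;

	for (i = 0; i < count; i++) {
		/* ... consume cq / fid for the current event ... */
		work += cq;

		/* Kick off the next iteration's loads early. */
		if (i + 1 < count) {
			flow = FLOW_HASH(evs[i + 1].flow_id);
			fid = &fids[flow];
			cq = fid->cq;
		}
	}
	return work;
}

The same idea could be paired with an explicit rte_prefetch0() on the next
fid entry if the table is large, but the patch relies on plain early loads.
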
 drivers/event/sw/sw_evdev_scheduler.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/event/sw/sw_evdev_scheduler.c b/drivers/event/sw/sw_evdev_scheduler.c
index e3a41e0..70d1970 100644
--- a/drivers/event/sw/sw_evdev_scheduler.c
+++ b/drivers/event/sw/sw_evdev_scheduler.c
@@ -44,12 +44,13 @@ sw_schedule_atomic_to_cq(struct sw_evdev *sw, struct sw_qid * const qid,
        uint32_t qid_id = qid->id;
 
        iq_dequeue_burst(sw, &qid->iq[iq_num], qes, count);
-       for (i = 0; i < count; i++) {
-               const struct rte_event *qe = &qes[i];
-               const uint16_t flow_id = SW_HASH_FLOWID(qes[i].flow_id);
-               struct sw_fid_t *fid = &qid->fids[flow_id];
-               int cq = fid->cq;
 
+       const struct rte_event *qe = &qes[0];
+       uint16_t flow_id = SW_HASH_FLOWID(qes[0].flow_id);
+       struct sw_fid_t *fid = &qid->fids[flow_id];
+       int cq = fid->cq;
+
+       for (i = 0; i < count; i++) {
                if (cq < 0) {
                        uint32_t cq_idx = qid->cq_next_tx++;
                        if (qid->cq_next_tx == qid->cq_num_mapped_cqs)
@@ -101,6 +102,13 @@ sw_schedule_atomic_to_cq(struct sw_evdev *sw, struct sw_qid * const qid,
                                        &sw->cq_ring_space[cq]);
                        p->cq_buf_count = 0;
                }
+
+               if (likely(i + 1 < count)) {
+                       qe = &qes[i + 1];
+                       flow_id = SW_HASH_FLOWID(qes[i + 1].flow_id);
+                       fid = &qid->fids[flow_id];
+                       cq = fid->cq;
+               }
        }
        iq_put_back(sw, &qid->iq[iq_num], blocked_qes, nb_blocked);
 
-- 
2.7.4
