Prefetch mbuf headers, resulting in a ~10% throughput improvement when the Ethernet RX and TX Adapters are hosted on the same core (likely ~2x when a dedicated TX core is used).
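For reference, rte_mbuf_prefetch_part1() prefetches the first cache line of the mbuf, i.e. the header fields the TX path touches first. A minimal sketch of the idea; prefetch_mbuf_header() is an illustrative stand-in, not the library definition:

#include <rte_mbuf.h>
#include <rte_prefetch.h>

/* Illustration only: touch the mbuf's first cache line early, so the
 * dereference in the transmit loop further down hits in cache instead
 * of stalling on a memory load. */
static inline void
prefetch_mbuf_header(struct rte_mbuf *m)
{
	rte_prefetch0(m);
}

Issuing all prefetches in a separate pass ahead of the transmit loop lets the loads for later mbufs overlap with the processing of earlier ones.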
Signed-off-by: Mattias Rönnblom <mattias.ronnb...@ericsson.com>
Tested-by: Peter Nilsson <peter.j.nils...@ericsson.com>
---
 lib/eventdev/rte_event_eth_tx_adapter.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/lib/eventdev/rte_event_eth_tx_adapter.c b/lib/eventdev/rte_event_eth_tx_adapter.c
index 67fff8b7d6..d740ae00f9 100644
--- a/lib/eventdev/rte_event_eth_tx_adapter.c
+++ b/lib/eventdev/rte_event_eth_tx_adapter.c
@@ -598,6 +598,12 @@ txa_process_event_vector(struct txa_service_data *txa,
 	return nb_tx;
 }
 
+static inline void
+txa_prefetch_mbuf(struct rte_mbuf *mbuf)
+{
+	rte_mbuf_prefetch_part1(mbuf);
+}
+
 static void
 txa_service_tx(struct txa_service_data *txa, struct rte_event *ev,
 	       uint32_t n)
@@ -608,6 +614,20 @@ txa_service_tx(struct txa_service_data *txa, struct rte_event *ev,
 
 	stats = &txa->stats;
 
+	for (i = 0; i < n; i++) {
+		struct rte_event *event = &ev[i];
+
+		if (unlikely(event->event_type & RTE_EVENT_TYPE_VECTOR)) {
+			struct rte_event_vector *vec = event->vec;
+			struct rte_mbuf **mbufs = vec->mbufs;
+			uint32_t k;
+
+			for (k = 0; k < vec->nb_elem; k++)
+				txa_prefetch_mbuf(mbufs[k]);
+		} else
+			txa_prefetch_mbuf(event->mbuf);
+	}
+
 	nb_tx = 0;
 	for (i = 0; i < n; i++) {
 		uint16_t port;
-- 
2.43.0