[PATCH v2 1/3] net/octeon_ep: optimize Rx and Tx routines
From: Pavan Nikhilesh Preset rearm data to avoid writing multiple fields in fastpath, Increase maximum outstanding Tx instructions from 128 to 256. Signed-off-by: Pavan Nikhilesh --- v2 Changes: - Skip compiling for 32b x86 targets. drivers/net/octeon_ep/cnxk_ep_rx.c| 12 drivers/net/octeon_ep/otx_ep_common.h | 3 +++ drivers/net/octeon_ep/otx_ep_rxtx.c | 27 +++ drivers/net/octeon_ep/otx_ep_rxtx.h | 2 +- 4 files changed, 39 insertions(+), 5 deletions(-) diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.c b/drivers/net/octeon_ep/cnxk_ep_rx.c index 74f0011283..75bb7225d2 100644 --- a/drivers/net/octeon_ep/cnxk_ep_rx.c +++ b/drivers/net/octeon_ep/cnxk_ep_rx.c @@ -93,7 +93,7 @@ cnxk_ep_check_rx_pkts(struct otx_ep_droq *droq) new_pkts = val - droq->pkts_sent_ism_prev; droq->pkts_sent_ism_prev = val; - if (val > (uint32_t)(1 << 31)) { + if (val > RTE_BIT32(31)) { /* Only subtract the packet count in the HW counter * when count above halfway to saturation. */ @@ -128,7 +128,6 @@ cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, { struct rte_mbuf **recv_buf_list = droq->recv_buf_list; uint32_t bytes_rsvd = 0, read_idx = droq->read_idx; - uint16_t port_id = droq->otx_ep_dev->port_id; uint16_t nb_desc = droq->nb_desc; uint16_t pkts; @@ -137,14 +136,19 @@ cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, struct rte_mbuf *mbuf; uint16_t pkt_len; + rte_prefetch0(recv_buf_list[otx_ep_incr_index(read_idx, 2, nb_desc)]); + rte_prefetch0(rte_pktmbuf_mtod(recv_buf_list[otx_ep_incr_index(read_idx, + 2, nb_desc)], + void *)); + mbuf = recv_buf_list[read_idx]; info = rte_pktmbuf_mtod(mbuf, struct otx_ep_droq_info *); read_idx = otx_ep_incr_index(read_idx, 1, nb_desc); pkt_len = rte_bswap16(info->length >> 48); - mbuf->data_off += OTX_EP_INFO_SIZE; mbuf->pkt_len = pkt_len; mbuf->data_len = pkt_len; - mbuf->port = port_id; + + *(uint64_t *)&mbuf->rearm_data = droq->rearm_data; rx_pkts[pkts] = mbuf; bytes_rsvd += pkt_len; } diff 
--git a/drivers/net/octeon_ep/otx_ep_common.h b/drivers/net/octeon_ep/otx_ep_common.h index 82e57520d3..299b5122d8 100644 --- a/drivers/net/octeon_ep/otx_ep_common.h +++ b/drivers/net/octeon_ep/otx_ep_common.h @@ -365,6 +365,9 @@ struct otx_ep_droq { /* receive buffer list contains mbuf ptr list */ struct rte_mbuf **recv_buf_list; + /* Packet re-arm data. */ + uint64_t rearm_data; + /* Packets pending to be processed */ uint64_t pkts_pending; diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.c b/drivers/net/octeon_ep/otx_ep_rxtx.c index c421ef0a1c..40c4a16a38 100644 --- a/drivers/net/octeon_ep/otx_ep_rxtx.c +++ b/drivers/net/octeon_ep/otx_ep_rxtx.c @@ -284,6 +284,32 @@ otx_ep_droq_setup_ring_buffers(struct otx_ep_droq *droq) return 0; } +static inline uint64_t +otx_ep_set_rearm_data(struct otx_ep_device *otx_ep) +{ + uint16_t port_id = otx_ep->port_id; + struct rte_mbuf mb_def; + uint64_t *tmp; + + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, data_off) % 8 != 0); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, refcnt) - offsetof(struct rte_mbuf, data_off) != +2); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, nb_segs) - offsetof(struct rte_mbuf, data_off) != +4); + RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, port) - offsetof(struct rte_mbuf, data_off) != +6); + mb_def.nb_segs = 1; + mb_def.data_off = RTE_PKTMBUF_HEADROOM + OTX_EP_INFO_SIZE; + mb_def.port = port_id; + rte_mbuf_refcnt_set(&mb_def, 1); + + /* Prevent compiler reordering: rearm_data covers previous fields */ + rte_compiler_barrier(); + tmp = (uint64_t *)&mb_def.rearm_data; + + return *tmp; +} + /* OQ initialization */ static int otx_ep_init_droq(struct otx_ep_device *otx_ep, uint32_t q_no, @@ -340,6 +366,7 @@ otx_ep_init_droq(struct otx_ep_device *otx_ep, uint32_t q_no, goto init_droq_fail; droq->refill_threshold = c_refill_threshold; + droq->rearm_data = otx_ep_set_rearm_data(otx_ep); /* Set up OQ registers */ ret = otx_ep->fn_list.setup_oq_regs(otx_ep, q_no); diff --git a/drivers/net/octeon_ep/otx_ep_rxtx.h 
b/drivers/net/octeon_ep/otx_ep_rxtx.h index cb68ef3b41..b159c32cae 100644 --- a/drivers/net/octeon_ep/otx_ep_rxtx.h +++ b/drivers/net/octeon_ep/otx_ep_rxtx.h @@ -17,7 +17,7 @@ #define OTX_EP_FSZ 28 #def
[PATCH v2 2/3] net/octeon_ep: use SSE instructions for Rx routine
From: Pavan Nikhilesh Optimize Rx routine to use SSE instructions. Signed-off-by: Pavan Nikhilesh --- drivers/net/octeon_ep/cnxk_ep_rx.c | 159 +-- drivers/net/octeon_ep/cnxk_ep_rx.h | 167 + drivers/net/octeon_ep/cnxk_ep_rx_sse.c | 124 ++ drivers/net/octeon_ep/meson.build | 17 +++ drivers/net/octeon_ep/otx_ep_ethdev.c | 7 ++ drivers/net/octeon_ep/otx_ep_rxtx.h| 10 ++ 6 files changed, 326 insertions(+), 158 deletions(-) create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx.h create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx_sse.c diff --git a/drivers/net/octeon_ep/cnxk_ep_rx.c b/drivers/net/octeon_ep/cnxk_ep_rx.c index 75bb7225d2..f3e4fb27d1 100644 --- a/drivers/net/octeon_ep/cnxk_ep_rx.c +++ b/drivers/net/octeon_ep/cnxk_ep_rx.c @@ -2,164 +2,7 @@ * Copyright(C) 2023 Marvell. */ -#include "otx_ep_common.h" -#include "otx2_ep_vf.h" -#include "otx_ep_rxtx.h" - -static inline int -cnxk_ep_rx_refill_mbuf(struct otx_ep_droq *droq, uint32_t count) -{ - struct otx_ep_droq_desc *desc_ring = droq->desc_ring; - struct rte_mbuf **recv_buf_list = droq->recv_buf_list; - uint32_t refill_idx = droq->refill_idx; - struct rte_mbuf *buf; - uint32_t i; - int rc; - - rc = rte_pktmbuf_alloc_bulk(droq->mpool, &recv_buf_list[refill_idx], count); - if (unlikely(rc)) { - droq->stats.rx_alloc_failure++; - return rc; - } - - for (i = 0; i < count; i++) { - buf = recv_buf_list[refill_idx]; - desc_ring[refill_idx].buffer_ptr = rte_mbuf_data_iova_default(buf); - refill_idx++; - } - - droq->refill_idx = otx_ep_incr_index(droq->refill_idx, count, droq->nb_desc); - droq->refill_count -= count; - - return 0; -} - -static inline void -cnxk_ep_rx_refill(struct otx_ep_droq *droq) -{ - uint32_t desc_refilled = 0, count; - uint32_t nb_desc = droq->nb_desc; - uint32_t refill_idx = droq->refill_idx; - int rc; - - if (unlikely(droq->read_idx == refill_idx)) - return; - - if (refill_idx < droq->read_idx) { - count = droq->read_idx - refill_idx; - rc = cnxk_ep_rx_refill_mbuf(droq, count); - if (unlikely(rc)) { 
- droq->stats.rx_alloc_failure++; - return; - } - desc_refilled = count; - } else { - count = nb_desc - refill_idx; - rc = cnxk_ep_rx_refill_mbuf(droq, count); - if (unlikely(rc)) { - droq->stats.rx_alloc_failure++; - return; - } - - desc_refilled = count; - count = droq->read_idx; - rc = cnxk_ep_rx_refill_mbuf(droq, count); - if (unlikely(rc)) { - droq->stats.rx_alloc_failure++; - return; - } - desc_refilled += count; - } - - /* Flush the droq descriptor data to memory to be sure -* that when we update the credits the data in memory is -* accurate. -*/ - rte_io_wmb(); - rte_write32(desc_refilled, droq->pkts_credit_reg); -} - -static inline uint32_t -cnxk_ep_check_rx_pkts(struct otx_ep_droq *droq) -{ - uint32_t new_pkts; - uint32_t val; - - /* Batch subtractions from the HW counter to reduce PCIe traffic -* This adds an extra local variable, but almost halves the -* number of PCIe writes. -*/ - val = __atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED); - new_pkts = val - droq->pkts_sent_ism_prev; - droq->pkts_sent_ism_prev = val; - - if (val > RTE_BIT32(31)) { - /* Only subtract the packet count in the HW counter -* when count above halfway to saturation. -*/ - rte_write64((uint64_t)val, droq->pkts_sent_reg); - rte_mb(); - - rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg); - while (__atomic_load_n(droq->pkts_sent_ism, __ATOMIC_RELAXED) >= val) { - rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg); - rte_mb(); - } - - droq->pkts_sent_ism_prev = 0; - } - rte_write64(OTX2_SDP_REQUEST_ISM, droq->pkts_sent_reg); - droq->pkts_pending += new_pkts; - - return new_pkts; -} - -static inline int16_t __rte_hot -cnxk_ep_rx_pkts_to_process(struct otx_ep_droq *droq, uint16_t nb_pkts) -{ - if (droq->pkts_pending < nb_pkts) - cnxk_ep_check_rx_pkts(droq); - - return RTE_MIN(nb_pkts, droq->pkts_pending); -} - -static __rte_always_inline void -cnxk_ep_process_pkts_scalar(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts) -{ - struct rte_mbuf **re
[PATCH v2 3/3] net/octeon_ep: use AVX2 instructions for Rx
From: Pavan Nikhilesh Optimize Rx routine to use AVX2 instructions when underlying architecture supports it. Signed-off-by: Pavan Nikhilesh --- drivers/net/octeon_ep/cnxk_ep_rx_avx.c | 117 + drivers/net/octeon_ep/meson.build | 12 +++ drivers/net/octeon_ep/otx_ep_ethdev.c | 10 +++ drivers/net/octeon_ep/otx_ep_rxtx.h| 10 +++ 4 files changed, 149 insertions(+) create mode 100644 drivers/net/octeon_ep/cnxk_ep_rx_avx.c diff --git a/drivers/net/octeon_ep/cnxk_ep_rx_avx.c b/drivers/net/octeon_ep/cnxk_ep_rx_avx.c new file mode 100644 index 00..cbd797f98b --- /dev/null +++ b/drivers/net/octeon_ep/cnxk_ep_rx_avx.c @@ -0,0 +1,117 @@ +/* SPDX-License-Identifier: BSD-3-Clause + * Copyright(C) 2023 Marvell. + */ + +#include "cnxk_ep_rx.h" + +static __rte_always_inline void +cnxk_ep_process_pkts_vec_avx(struct rte_mbuf **rx_pkts, struct otx_ep_droq *droq, uint16_t new_pkts) +{ + struct rte_mbuf **recv_buf_list = droq->recv_buf_list; + uint32_t bytes_rsvd = 0, read_idx = droq->read_idx; + const uint64_t rearm_data = droq->rearm_data; + struct rte_mbuf *m[CNXK_EP_OQ_DESC_PER_LOOP_AVX]; + uint32_t pidx[CNXK_EP_OQ_DESC_PER_LOOP_AVX]; + uint32_t idx[CNXK_EP_OQ_DESC_PER_LOOP_AVX]; + uint16_t nb_desc = droq->nb_desc; + uint16_t pkts = 0; + uint8_t i; + + idx[0] = read_idx; + while (pkts < new_pkts) { + __m256i data[CNXK_EP_OQ_DESC_PER_LOOP_AVX]; + /* mask to shuffle from desc. 
to mbuf (2 descriptors)*/ + const __m256i mask = + _mm256_set_epi8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 20, 21, 0xFF, 0xFF, 20, + 21, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 7, 6, 5, 4, 3, 2, 1, 0); + + for (i = 1; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++) + idx[i] = otx_ep_incr_index(idx[i - 1], 1, nb_desc); + + if (new_pkts - pkts > 8) { + pidx[0] = otx_ep_incr_index(idx[i - 1], 1, nb_desc); + for (i = 1; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++) + pidx[i] = otx_ep_incr_index(pidx[i - 1], 1, nb_desc); + + for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++) { + rte_prefetch0(recv_buf_list[pidx[i]]); + rte_prefetch0(rte_pktmbuf_mtod(recv_buf_list[pidx[i]], void *)); + } + } + + for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++) + m[i] = recv_buf_list[idx[i]]; + + for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++) + data[i] = _mm256_set_epi64x(0, + rte_pktmbuf_mtod(m[i], struct otx_ep_droq_info *)->length >> 16, + 0, rearm_data); + + for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++) { + data[i] = _mm256_shuffle_epi8(data[i], mask); + bytes_rsvd += _mm256_extract_epi16(data[i], 10); + } + + for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++) + _mm256_storeu_si256((__m256i *)&m[i]->rearm_data, data[i]); + + for (i = 0; i < CNXK_EP_OQ_DESC_PER_LOOP_AVX; i++) + rx_pkts[pkts++] = m[i]; + idx[0] = otx_ep_incr_index(idx[i - 1], 1, nb_desc); + } + droq->read_idx = idx[0]; + + droq->refill_count += new_pkts; + droq->pkts_pending -= new_pkts; + /* Stats */ + droq->stats.pkts_received += new_pkts; + droq->stats.bytes_received += bytes_rsvd; +} + +uint16_t __rte_noinline __rte_hot +cnxk_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) +{ + struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue; + uint16_t new_pkts, vpkts; + + new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts); + vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX); + cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts); + 
cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts); + + /* Refill RX buffers */ + if (droq->refill_count >= DROQ_REFILL_THRESHOLD) + cnxk_ep_rx_refill(droq); + + return new_pkts; +} + +uint16_t __rte_noinline __rte_hot +cn9k_ep_recv_pkts_avx(void *rx_queue, struct rte_mbuf **rx_pkts, uint16_t nb_pkts) +{ + struct otx_ep_droq *droq = (struct otx_ep_droq *)rx_queue; + uint16_t new_pkts, vpkts; + + new_pkts = cnxk_ep_rx_pkts_to_process(droq, nb_pkts); + vpkts = RTE_ALIGN_FLOOR(new_pkts, CNXK_EP_OQ_DESC_PER_LOOP_AVX); + cnxk_ep_process_pkts_vec_avx(rx_pkts, droq, vpkts); + cnxk_ep_process_pkts_scalar(&rx_pkts[vpkts], droq, new_pkts - vpkts); + + /* Refill RX buffe
Re: DPDK Release Status Meeting 2023-11-23
On 2023/11/24 4:53 PM, Mcnamara, John wrote: Release status meeting minutes 2023-11-23 issues: * Build/link issue on Debian https://salsa.debian.org/debian/dpdk/-/jobs/4949787 * cpfl compilation issue https://build.opensuse.org/package/live_build_log/home:bluca:dpdk/dpdk/Debian_Next/i586 * LCOREs autotest timing out on ARM: https://build.opensuse.org/package/live_build_log/home:bluca:dpdk/dpdk/Debian_12/aarch64 The failure relates to the test environment. 50s is not enough for the lcores test to finish. Due to a relatively larger RTE_MAX_LCORE value on ARM, the unit test case would take a longer time to finish its iterations. In one of my runs, the case took about 100s. Proposed Schedule for 2023 -- See http://core.dpdk.org/roadmap/#dates LTS --- * 22.11.3 - In progress * 21.11.6 - In progress * 20.11.10 - In progress * 19.11.15 - Will only be updated with CVE and critical fixes. * Distros * Debian 12 contains DPDK v22.11 * Ubuntu 22.04-LTS contains DPDK v21.11 * Ubuntu 23.04 contains DPDK v22.11 Defects --- * Bugzilla links, 'Bugs', added for hosted projects * https://www.dpdk.org/hosted-projects/ DPDK Release Status Meetings The DPDK Release Status Meeting is intended for DPDK Committers to discuss the status of the master tree and sub-trees, and for project managers to track progress or milestone dates. The meeting occurs every Thursday at 9:30 UTC over Jitsi on https://meet.jit.si/DPDK You don't need an invite to join the meeting but if you want a calendar reminder just send an email to "John McNamara john.mcnam...@intel.com" for the invite.