Re: [PATCH] IGC: Remove I225_I_PHY_ID checking

2022-09-02 Thread David Marchand
On Thu, Sep 1, 2022 at 10:22 AM David Marchand
 wrote:
> Besides, please register to the @dev mailing list.

Bis.

Please register to the @dev mailing list.


-- 
David Marchand



[PATCH v3] IGC: Remove I225_I_PHY_ID checking

2022-09-02 Thread yock.gen.mah
From: Mah Yock Gen 

i225 devices have only one PHY vendor. It is unnecessary to check
I225_I_PHY_ID during the link establishment and auto-negotiation
process, and the check also caused devices such as the i225-IT to
fail. This patch removes the unnecessary check.

Cc: sta...@dpdk.org



Co-developed-by: Mah Yock Gen 
Signed-off-by: Mah Yock Gen 
Co-developed-by: Taripin Samuel 
Signed-off-by: Taripin Samuel 
Signed-off-by: Mah Yock Gen 
---
V3:
* Fixed coding style issue

V2:
* Removed the i225-IT device ID added in the v1 patch, as it has been superseded

 drivers/net/igc/base/igc_i225.c | 15 ++-
 drivers/net/igc/base/igc_phy.c  |  6 ++
 2 files changed, 4 insertions(+), 17 deletions(-)

diff --git a/drivers/net/igc/base/igc_i225.c b/drivers/net/igc/base/igc_i225.c
index 5f3d535490..180d3cf687 100644
--- a/drivers/net/igc/base/igc_i225.c
+++ b/drivers/net/igc/base/igc_i225.c
@@ -173,19 +173,8 @@ static s32 igc_init_phy_params_i225(struct igc_hw *hw)
phy->ops.write_reg = igc_write_phy_reg_gpy;
 
ret_val = igc_get_phy_id(hw);
-   /* Verify phy id and set remaining function pointers */
-   switch (phy->id) {
-   case I225_I_PHY_ID:
-   case I226_LM_PHY_ID:
-   phy->type   = igc_phy_i225;
-   phy->ops.set_d0_lplu_state = igc_set_d0_lplu_state_i225;
-   phy->ops.set_d3_lplu_state = igc_set_d3_lplu_state_i225;
-   /* TODO - complete with GPY PHY information */
-   break;
-   default:
-   ret_val = -IGC_ERR_PHY;
-   goto out;
-   }
+   phy->type = igc_phy_i225;
+
 
 out:
return ret_val;
diff --git a/drivers/net/igc/base/igc_phy.c b/drivers/net/igc/base/igc_phy.c
index 43bbe69bca..2906bae21a 100644
--- a/drivers/net/igc/base/igc_phy.c
+++ b/drivers/net/igc/base/igc_phy.c
@@ -1474,8 +1474,7 @@ s32 igc_phy_setup_autoneg(struct igc_hw *hw)
return ret_val;
}
 
-   if ((phy->autoneg_mask & ADVERTISE_2500_FULL) &&
-   hw->phy.id == I225_I_PHY_ID) {
+   if (phy->autoneg_mask & ADVERTISE_2500_FULL) {
/* Read the MULTI GBT AN Control Register - reg 7.32 */
ret_val = phy->ops.read_reg(hw, (STANDARD_AN_REG_MASK <<
MMD_DEVADDR_SHIFT) |
@@ -1615,8 +1614,7 @@ s32 igc_phy_setup_autoneg(struct igc_hw *hw)
ret_val = phy->ops.write_reg(hw, PHY_1000T_CTRL,
 mii_1000t_ctrl_reg);
 
-   if ((phy->autoneg_mask & ADVERTISE_2500_FULL) &&
-   hw->phy.id == I225_I_PHY_ID)
+   if (phy->autoneg_mask & ADVERTISE_2500_FULL)
ret_val = phy->ops.write_reg(hw,
 (STANDARD_AN_REG_MASK <<
 MMD_DEVADDR_SHIFT) |
-- 
2.36.1



[PATCH v3 1/3] ethdev: introduce pool sort capability

2022-09-02 Thread Hanumanth Pothula
This patch adds support for the pool sort capability.
Some HW can choose a memory pool based on the packet's size. The
pool sort capability allows the PMD to choose a memory pool based
on the packet's length.

This is often useful for saving memory: the application can create a
separate pool to steer packets of a specific size, enabling effective
use of memory.

For example, say the HW supports three pools:
 - pool-1: size up to 2K
 - pool-2: size > 2K and < 4K
 - pool-3: size > 4K
Here,
pool-1 can accommodate packets with sizes < 2K,
pool-2 can accommodate packets with sizes > 2K and < 4K, and
pool-3 can accommodate packets with sizes > 4K.

With the pool sort capability enabled in SW, an application may create
three pools of different sizes and pass them to the PMD, allowing the
PMD to program the HW based on packet lengths: packets shorter than 2K
are received on pool-1, packets with lengths between 2K and 4K are
received on pool-2, and packets greater than 4K are received on
pool-3.

The following two capabilities are added to the rte_eth_rxseg_capa
structure:
1. pool_sort --> indicates that the pool sort capability is supported
   by the HW.
2. max_npool --> maximum number of pools supported by the HW.

A new structure, rte_eth_rxseg_sort, is defined, to be used only when
the pool sort capability is present. If required, it may be extended
further to support more configurations.
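
For illustration, an application might configure this as follows (a
sketch based on the API proposed in this series: the
RTE_ETH_RX_OFFLOAD_BUFFER_SORT flag and the sort member of the
rte_eth_rxseg union come from these patches and may change in later
revisions; port_id and the pool_2k/pool_4k mempools are hypothetical):

	union rte_eth_rxseg rx_useg[2] = {0};
	struct rte_eth_rxconf rx_conf = {0};
	int ret;

	/* one pool for packets up to 2K, one for larger packets */
	rx_useg[0].sort.length = 2048;
	rx_useg[0].sort.mp = pool_2k;
	rx_useg[1].sort.length = 4096;
	rx_useg[1].sort.mp = pool_4k;

	rx_conf.offloads |= RTE_ETH_RX_OFFLOAD_BUFFER_SORT;
	rx_conf.rx_seg = rx_useg;
	rx_conf.rx_nseg = 2;
	ret = rte_eth_rx_queue_setup(port_id, 0, 512, rte_socket_id(),
				     &rx_conf, NULL);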

Signed-off-by: Hanumanth Pothula 

v3:
 - Implemented Pool Sort capability as new Rx offload capability,
   RTE_ETH_RX_OFFLOAD_BUFFER_SORT.
v2:
 - Along with spec changes, uploading testpmd and driver changes.
---
 lib/ethdev/rte_ethdev.c | 69 ++---
 lib/ethdev/rte_ethdev.h | 24 +-
 2 files changed, 88 insertions(+), 5 deletions(-)

diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c
index 1979dc0850..5152c08f1e 100644
--- a/lib/ethdev/rte_ethdev.c
+++ b/lib/ethdev/rte_ethdev.c
@@ -1634,6 +1634,58 @@ rte_eth_dev_is_removed(uint16_t port_id)
return ret;
 }
 
+static int
+rte_eth_rx_queue_check_sort(const struct rte_eth_rxseg_sort *rx_seg,
+uint16_t n_seg, uint32_t *mbp_buf_size,
+const struct rte_eth_dev_info *dev_info)
+{
+   const struct rte_eth_rxseg_capa *seg_capa = &dev_info->rx_seg_capa;
+   uint16_t seg_idx;
+
+   if (!seg_capa->multi_pools || n_seg > seg_capa->max_npool) {
+   RTE_ETHDEV_LOG(ERR,
+  "Invalid capabilities, multi_pools:%d different length segments %u exceed supported %u\n",
+  seg_capa->multi_pools, n_seg, seg_capa->max_nseg);
+   return -EINVAL;
+   }
+
+   for (seg_idx = 0; seg_idx < n_seg; seg_idx++) {
+   struct rte_mempool *mpl = rx_seg[seg_idx].mp;
+   uint32_t length = rx_seg[seg_idx].length;
+
+   if (mpl == NULL) {
+   RTE_ETHDEV_LOG(ERR, "null mempool pointer\n");
+   return -EINVAL;
+   }
+
+   if (mpl->private_data_size <
+   sizeof(struct rte_pktmbuf_pool_private)) {
+   RTE_ETHDEV_LOG(ERR,
+  "%s private_data_size %u < %u\n",
+  mpl->name, mpl->private_data_size,
+  (unsigned int)sizeof
+   (struct rte_pktmbuf_pool_private));
+   return -ENOSPC;
+   }
+
+   *mbp_buf_size = rte_pktmbuf_data_room_size(mpl);
+   /* On segment length == 0, update the segment's length with
+* the pool's length minus the headroom space, to make sure
+* enough space is accommodated for the header.
+*/
+   length = length != 0 ? length : (*mbp_buf_size - RTE_PKTMBUF_HEADROOM);
+   if (*mbp_buf_size < length + RTE_PKTMBUF_HEADROOM) {
+   RTE_ETHDEV_LOG(ERR,
+  "%s mbuf_data_room_size %u < %u))\n",
+  mpl->name, *mbp_buf_size,
+  length);
+   return -EINVAL;
+   }
+   }
+
+   return 0;
+}
+
 static int
 rte_eth_rx_queue_check_split(const struct rte_eth_rxseg_split *rx_seg,
 uint16_t n_seg, uint32_t *mbp_buf_size,
@@ -1764,7 +1816,6 @@ rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id,
return -EINVAL;
}
} else {
-   const struct rte_eth_rxseg_split *rx_seg;
uint16_t n_seg;
 
/* Extended multi-segment configuration check. */
@@ -1774,13 +1825,23 @@ rte_eth_rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id,
return -EINVAL;
}
 
-  

[PATCH v3 2/3] app/testpmd: Add support for pool sort capability

2022-09-02 Thread Hanumanth Pothula
This patch adds support for the pool sort capability.
Some HW can choose a memory pool based on the packet's size. The
pool sort capability allows the PMD to choose a memory pool based
on the packet's length.

Populate the Rx sort/split attributes based on the Rx offload value.
Also, print the name of the pool on which each packet is received, as
illustrated below.
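
With this change a dumped packet line also carries the receiving pool,
e.g. (field values and the pool name are illustrative):

  ... - pool=mb_pool_0 - type=0x0800 - length=64 - nb_segs=1 ...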

Signed-off-by: Hanumanth Pothula 
---
 app/test-pmd/testpmd.c | 31 ++-
 app/test-pmd/util.c|  4 ++--
 2 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index addcbcac85..57f1d806b1 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -2661,7 +2661,8 @@ rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id,
int ret;
 
if (rx_pkt_nb_segs <= 1 ||
-   (rx_conf->offloads & RTE_ETH_RX_OFFLOAD_BUFFER_SPLIT) == 0) {
+   (rx_conf->offloads & RTE_ETH_RX_OFFLOAD_BUFFER_SPLIT ||
+rx_conf->offloads & RTE_ETH_RX_OFFLOAD_BUFFER_SORT) == 0) {
rx_conf->rx_seg = NULL;
rx_conf->rx_nseg = 0;
ret = rte_eth_rx_queue_setup(port_id, rx_queue_id,
@@ -2670,7 +2671,8 @@ rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id,
goto exit;
}
for (i = 0; i < rx_pkt_nb_segs; i++) {
-   struct rte_eth_rxseg_split *rx_seg = &rx_useg[i].split;
+   struct rte_eth_rxseg_split *rx_split = &rx_useg[i].split;
+   struct rte_eth_rxseg_sort  *rx_sort = &rx_useg[i].sort;
struct rte_mempool *mpx;
/*
 * Use last valid pool for the segments with number
@@ -2678,13 +2680,24 @@ rx_queue_setup(uint16_t port_id, uint16_t rx_queue_id,
 */
mp_n = (i >= mbuf_data_size_n) ? mbuf_data_size_n - 1 : i;
mpx = mbuf_pool_find(socket_id, mp_n);
-   /* Handle zero as mbuf data buffer size. */
-   rx_seg->length = rx_pkt_seg_lengths[i] ?
-  rx_pkt_seg_lengths[i] :
-  mbuf_data_size[mp_n];
-   rx_seg->offset = i < rx_pkt_nb_offs ?
-  rx_pkt_seg_offsets[i] : 0;
-   rx_seg->mp = mpx ? mpx : mp;
+   if (rx_conf->offloads & RTE_ETH_RX_OFFLOAD_BUFFER_SPLIT) {
+   /**
+* On segment length zero, update the length as
+* buffer size - headroom size,
+* to make sure enough space is accommodated for the header.
+*/
+   rx_split->length = rx_pkt_seg_lengths[i] ?
+  rx_pkt_seg_lengths[i] :
+  mbuf_data_size[mp_n] - RTE_PKTMBUF_HEADROOM;
+   rx_split->offset = i < rx_pkt_nb_offs ?
+  rx_pkt_seg_offsets[i] : 0;
+   rx_split->mp = mpx ? mpx : mp;
+   } else if (rx_conf->offloads & RTE_ETH_RX_OFFLOAD_BUFFER_SORT) {
+   rx_sort->length = rx_pkt_seg_lengths[i] ?
+ rx_pkt_seg_lengths[i] :
+ mbuf_data_size[mp_n] - RTE_PKTMBUF_HEADROOM;
+   rx_sort->mp = mpx ? mpx : mp;
+   }
}
rx_conf->rx_nseg = rx_pkt_nb_segs;
rx_conf->rx_seg = rx_useg;
diff --git a/app/test-pmd/util.c b/app/test-pmd/util.c
index fd98e8b51d..f9df5f69ef 100644
--- a/app/test-pmd/util.c
+++ b/app/test-pmd/util.c
@@ -150,8 +150,8 @@ dump_pkt_burst(uint16_t port_id, uint16_t queue, struct rte_mbuf *pkts[],
print_ether_addr(" - dst=", &eth_hdr->dst_addr,
 print_buf, buf_size, &cur_len);
MKDUMPSTR(print_buf, buf_size, cur_len,
- " - type=0x%04x - length=%u - nb_segs=%d",
- eth_type, (unsigned int) mb->pkt_len,
+ " - pool=%s - type=0x%04x - length=%u - nb_segs=%d",
+ mb->pool->name, eth_type, (unsigned int) mb->pkt_len,
  (int)mb->nb_segs);
ol_flags = mb->ol_flags;
if (ol_flags & RTE_MBUF_F_RX_RSS_HASH) {
-- 
2.25.1



[PATCH v3 3/3] net/cnxk: introduce pool sort capability

2022-09-02 Thread Hanumanth Pothula
Presently, the HW is programmed to receive packets only from the LPB
pool, so all packets are received from the LPB pool.

But the CNXK HW supports two pools:
 - SPB -> packets with smaller size (less than 4K)
 - LPB -> packets with bigger size (greater than 4K)

This patch enables the pool sorting capability: the pool is selected
based on the packet's length. So, basically, the PMD programs the HW
to receive packets from both the SPB and LPB pools based on the
packet's length.

This is achieved by enabling the Rx buffer split offload,
RTE_ETH_RX_OFFLOAD_BUFFER_SPLIT. This allows the application to send
more than one pool (in our case, two) to the driver, with different
segment (packet) lengths, which helps the driver configure both
pools based on the segment lengths.

This is often useful for saving memory: the application can create a
separate pool to steer packets of a specific size, enabling effective
use of memory.

Signed-off-by: Hanumanth Pothula 
---
 doc/guides/nics/features/cnxk.ini |  1 +
 doc/guides/nics/features/cnxk_vec.ini |  1 +
 drivers/net/cnxk/cnxk_ethdev.c| 93 ---
 drivers/net/cnxk/cnxk_ethdev.h|  4 +-
 drivers/net/cnxk/cnxk_ethdev_ops.c|  7 ++
 5 files changed, 96 insertions(+), 10 deletions(-)

diff --git a/doc/guides/nics/features/cnxk.ini b/doc/guides/nics/features/cnxk.ini
index 1876fe86c7..e1584ed740 100644
--- a/doc/guides/nics/features/cnxk.ini
+++ b/doc/guides/nics/features/cnxk.ini
@@ -4,6 +4,7 @@
 ; Refer to default.ini for the full list of available PMD features.
 ;
 [Features]
+pool sort   = Y
 Speed capabilities   = Y
 Rx interrupt = Y
 Lock-free Tx queue   = Y
diff --git a/doc/guides/nics/features/cnxk_vec.ini b/doc/guides/nics/features/cnxk_vec.ini
index 5d0976e6ce..a63d35aae7 100644
--- a/doc/guides/nics/features/cnxk_vec.ini
+++ b/doc/guides/nics/features/cnxk_vec.ini
@@ -4,6 +4,7 @@
 ; Refer to default.ini for the full list of available PMD features.
 ;
 [Features]
+pool sort   = Y
 Speed capabilities   = Y
 Rx interrupt = Y
 Lock-free Tx queue   = Y
diff --git a/drivers/net/cnxk/cnxk_ethdev.c b/drivers/net/cnxk/cnxk_ethdev.c
index cfcc4df916..376c5274d3 100644
--- a/drivers/net/cnxk/cnxk_ethdev.c
+++ b/drivers/net/cnxk/cnxk_ethdev.c
@@ -537,6 +537,64 @@ cnxk_nix_tx_queue_release(struct rte_eth_dev *eth_dev, uint16_t qid)
plt_free(txq_sp);
 }
 
+static int
+cnxk_nix_process_rx_conf(const struct rte_eth_rxconf *rx_conf,
+struct rte_mempool **lpb_pool, struct rte_mempool **spb_pool,
+uint16_t *lpb_len, uint16_t *spb_len)
+{
+   struct rte_eth_rxseg_sort rx_seg0;
+   struct rte_eth_rxseg_sort rx_seg1;
+   const char *platform_ops;
+   struct rte_mempool_ops *ops;
+
+   if (*lpb_pool || !rx_conf->rx_seg || rx_conf->rx_nseg != CNXK_NIX_NUM_POOLS_MAX ||
+   !rx_conf->rx_seg[0].sort.mp || !rx_conf->rx_seg[1].sort.mp) {
+   plt_err("invalid arguments");
+   return -EINVAL;
+   }
+
+   rx_seg0 = rx_conf->rx_seg[0].sort;
+   rx_seg1 = rx_conf->rx_seg[1].sort;
+
+   if (rx_seg0.length >= rx_seg0.mp->elt_size || rx_seg1.length >= rx_seg1.mp->elt_size) {
+   plt_err("mismatch in packet length & pool length seg0_len:%u pool0_len:%u"\
+   "seg1_len:%u pool1_len:%u", rx_seg0.length, rx_seg0.mp->elt_size,
+   rx_seg1.length, rx_seg1.mp->elt_size);
+   return -EINVAL;
+   }
+
+   if (rx_seg0.length > rx_seg1.length) {
+   *lpb_pool = rx_seg0.mp;
+   *spb_pool = rx_seg1.mp;
+
+   *lpb_len = rx_seg0.length;
+   *spb_len = rx_seg1.length;
+   } else {
+   *lpb_pool = rx_seg1.mp;
+   *spb_pool = rx_seg0.mp;
+
+   *lpb_len = rx_seg1.length;
+   *spb_len = rx_seg0.length;
+   }
+
+   if ((*spb_pool)->pool_id == 0) {
+   plt_err("Invalid pool_id");
+   return -EINVAL;
+   }
+
+   platform_ops = rte_mbuf_platform_mempool_ops();
+   ops = rte_mempool_get_ops((*spb_pool)->ops_index);
+   if (strncmp(ops->name, platform_ops, RTE_MEMPOOL_OPS_NAMESIZE)) {
+   plt_err("mempool ops should be of cnxk_npa type");
+   return -EINVAL;
+   }
+
+   plt_info("spb_pool:%s lpb_pool:%s lpb_len:%u spb_len:%u\n", 
(*spb_pool)->name,
+(*lpb_pool)->name, *lpb_len, *spb_len);
+
+   return 0;
+}
+
 int
 cnxk_nix_rx_queue_setup(struct rte_eth_dev *eth_dev, uint16_t qid,
uint32_t nb_desc, uint16_t fp_rx_q_sz,
@@ -553,6 +611,10 @@ cnxk_nix_rx_queue_setup(struct rte_eth_dev *eth_dev, uint16_t qid,
uint16_t first_skip;
int rc = -EINVAL;
size_t rxq_sz;
+   uint16_t lpb_len = 0;
+   uint16_t spb_len = 0;
+   struct rte_mempool *lpb_pool = mp;
+   struct rte_mempool *spb_pool = NULL;
 
/* Sanity checks */

Re: about RTL8168 PMD on ARM SoC

2022-09-02 Thread Thomas Monjalon
When do you expect or desire to merge it into upstream DPDK?
It's always better to know the intended roadmap, thanks.


29/08/2022 17:40, Hau:
> Hi Honnappa,
> 
> Thanks for prompt reply.
> Currently our PMD driver still has other issues. After we fix them, we will
> submit our code as an RFC.
> 
> Thanks,
> Hau
> 
> From: Honnappa Nagarahalli [mailto:honnappa.nagaraha...@arm.com]
> Sent: Saturday, August 27, 2022 12:44 AM
> To: xing_wang ; dev@dpdk.org
> Cc: dali_chen ; howard_wang 
> ; Ruifeng Wang ; Hau 
> ; nd 
> Subject: RE: about RTL8168 PMD on ARM SoC
> 
> Hi Xing Wang,
> Can you submit this code as an RFC? It will enable us to provide
> review comments.
> 
> Thanks,
> Honnappa
> 
> 
> From: 王星 <xing_w...@realsil.com.cn>
> Sent: Thursday, August 25, 2022 9:36 PM
> To: Honnappa Nagarahalli <honnappa.nagaraha...@arm.com>; dev@dpdk.org
> Cc: 陈立 <dali_c...@realsil.com.cn>; 王颢 <howard_w...@realsil.com.cn>;
> Ruifeng Wang <ruifeng.w...@arm.com>; nd <n...@arm.com>; nd <n...@arm.com>;
> Hau <h...@realtek.com>
> Subject: Re: about RTL8168 PMD on ARM SoC
> 
> Hi Honnappa,
> 
> I'm sorry, I have to make a little correction:
> this issue was actually on our new 2.5 Gigabit RTL8125 series; the attachment
> is the latest r8125pmd.
> There is no essential difference w.r.t. the ARM platform, though.
> 
> BRs,
> Xing Wang
> 
> From: 王星
> Sent: August 26, 2022 10:06
> To: 'Honnappa Nagarahalli' <honnappa.nagaraha...@arm.com>; dev@dpdk.org
> Cc: 陈立 <dali_c...@realsil.com.cn>; 王颢 <howard_w...@realsil.com.cn>;
> Ruifeng Wang <ruifeng.w...@arm.com>; nd <n...@arm.com>; nd <n...@arm.com>;
> Hau <h...@realtek.com>
> Subject: Re: about RTL8168 PMD on ARM SoC
> 
> Hi Honnappa,
> 
> The attachment is our current r8168pmd code for the RTL8111/8168 Giga series
> (currently 8111G and 8111H are supported; we will add others later).
> I will contact the SoC vendor to consult with you about this issue and let
> you know some details about that SoC.
> Thanks a lot!
> 
> BRs
> Xing Wang
> From: Honnappa Nagarahalli [mailto:honnappa.nagaraha...@arm.com]
> Sent: August 25, 2022 22:41
> To: 王星 <xing_w...@realsil.com.cn>; dev@dpdk.org
> Cc: 陈立 <dali_c...@realsil.com.cn>; 王颢 <howard_w...@realsil.com.cn>;
> Ruifeng Wang <ruifeng.w...@arm.com>; nd <n...@arm.com>; nd <n...@arm.com>
> Subject: RE: about RTL8168 PMD on ARM SoC
> 
> Hello,
>I cannot find many details of the SoC on the internet. Does it 
> use coherent IO? Depending on that, different barriers might be needed. Other 
> than this, I would not think it needs anything special.
> 
> If you could send an RFC to the DPDK mailing list, I am happy to review and 
> provide any feedback.
> 
> Thanks,
> Honnappa
> 
> 
> From: 王星 <xing_w...@realsil.com.cn>
> Sent: Wednesday, August 24, 2022 9:53 PM
> To: dev@dpdk.org
> Cc: 陈立 <dali_c...@realsil.com.cn>; 王颢 <howard_w...@realsil.com.cn>
> Subject: about RTL8168 PMD on ARM SoC
> 
> Hi DPDK,
> 
> I am a PMD driver developer from the Realtek NIC department.
> When I was porting r8168pmd (already verified on x86) to an ARM64 SoC, the
> Unisoc UIS8650, I found that after NIC Rx init (in general, the Rx ring and
> buffers should have been prepared for the NIC to DMA-read), the NIC status
> register showed RDU (Rx Descriptor Unavailable), which means the NIC cannot
> read the proper descriptor content.
> 
> Later I sent some packets to the NIC held by testpmd in rx_only mode. The HW
> internal Rx packet counter grew to some value and then got stuck; the 8168pmd
> Rx debug print reported that it received fewer packets than that value, and
> the print showed up even some minutes later!
> 
> I suspect the phenomenon is caused by improper HW-based IO coherency support
> on this ARM SoC.
> I have read the ARM SoC support list on the DPDK website, to name a few: NV
> Bluefield, NXP DPAA, Marvell Octeon TX …
> 
> Does DPDK (or the UIO/VFIO driver or hugetlb driver) need special HW IO cache
> coherency support on the ARM platform, say, ACE and a device-side MMU, etc.?
> Should the SoC provide a specialized UIO/VFIO driver or hugetlb driver and/or
> a specific DPDK lib to support such user-mode DMA?
> Will you please give suggestions? Thanks a lot!
> 
> BRs
> 
> --Please consider the environment before printing this e-mail.
> 
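
For background on the barrier question in this thread: DPDK PMDs generally
assume cache-coherent I/O and rely only on ordering barriers when publishing
Rx descriptors to the NIC. A minimal sketch of the usual publish pattern
follows; the descriptor layout, DESC_OWN flag, and RX_TAIL_REG offset are
hypothetical, not from any real driver:

	#include <stdint.h>
	#include <rte_io.h>
	#include <rte_byteorder.h>

	struct rx_desc { uint64_t addr; uint32_t flags; }; /* hypothetical */
	#define DESC_OWN    0x1u  /* hypothetical "HW owns descriptor" flag */
	#define RX_TAIL_REG 0x38  /* hypothetical doorbell register offset */

	static void
	rx_publish_one(struct rx_desc *desc, uint64_t dma_addr,
		       volatile uint8_t *hw_base, uint32_t tail)
	{
		desc->addr  = rte_cpu_to_le_64(dma_addr);
		desc->flags = rte_cpu_to_le_32(DESC_OWN);

		/* rte_write32() implies an I/O write barrier, so the
		 * descriptor stores above are ordered before the doorbell.
		 * On an SoC without coherent I/O this is NOT sufficient:
		 * descriptor and buffer memory would additionally need
		 * cache maintenance, which DPDK does not perform -- one
		 * plausible explanation for the RDU status and the stale
		 * debug prints described above.
		 */
		rte_write32(rte_cpu_to_le_32(tail), hw_base + RX_TAIL_REG);
	}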







RE: [EXT] Re: [PATCH] examples: compilation fix for GCC-12

2022-09-02 Thread Amit Prakash Shukla
Thanks, Stephen, for the feedback. I will make the change in v2.

> -Original Message-
> From: Stephen Hemminger 
> Sent: Thursday, September 1, 2022 8:54 PM
> To: Amit Prakash Shukla 
> Cc: Ruifeng Wang ; dev@dpdk.org; Jerin Jacob
> Kollanukkaran ; sta...@dpdk.org; Akhil Goyal
> 
> Subject: [EXT] Re: [PATCH] examples: compilation fix for GCC-12
> 
> External Email
> 
> --
> On Thu, 1 Sep 2022 13:53:43 +0530
> Amit Prakash Shukla  wrote:
> 
> > diff --git a/examples/common/neon/port_group.h b/examples/common/neon/port_group.h
> > index 82c6ed6d73..97da604583 100644
> > --- a/examples/common/neon/port_group.h
> > +++ b/examples/common/neon/port_group.h
> > @@ -24,7 +24,7 @@ port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
> > union {
> > uint16_t u16[FWDSTEP + 1];
> > uint64_t u64;
> > -   } *pnum = (void *)pn;
> > +   } __attribute__((__packed__)) *pnum = (void *)pn;
> 
> Use __rte_packed instead of direct attribute


RE: [PATCH] vhost: compilation fix for GCC-12

2022-09-02 Thread Ruifeng Wang
> -Original Message-
> From: Amit Prakash Shukla 
> Sent: Thursday, September 1, 2022 4:50 PM
> To: Maxime Coquelin ; Chenbo Xia 
> 
> Cc: dev@dpdk.org; jer...@marvell.com; sta...@dpdk.org; Amit Prakash Shukla
> 
> Subject: [PATCH] vhost: compilation fix for GCC-12
> 
> ../lib/vhost/virtio_net.c:941:35: error:
>   'buf_vec[0].buf_len' may be used uninitialized
>   [-Werror=maybe-uninitialized]
>   941 | buf_len = buf_vec[vec_idx].buf_len;
>   |   ^~~~
> ../lib/vhost/virtio_net.c: In function 'virtio_dev_rx_packed':
> ../lib/vhost/virtio_net.c:1285:27: note: 'buf_vec' declared here
>  1285 | struct buf_vector buf_vec[BUF_VECTOR_MAX];
>   |   ^~~
> cc1: all warnings being treated as errors
> 
> Fixes: 93520085efda ("vhost: add packed ring single enqueue")
> Cc: sta...@dpdk.org
> 
> Signed-off-by: Amit Prakash Shukla 
> ---
>  lib/vhost/virtio_net.c | 12 
>  1 file changed, 12 insertions(+)
> 
> diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
> index b3d954aab4..0220bc923c 100644
> --- a/lib/vhost/virtio_net.c
> +++ b/lib/vhost/virtio_net.c
> @@ -1069,6 +1069,12 @@ vhost_enqueue_single_packed(struct virtio_net *dev,
>   else
>   max_tries = 1;
> 
> + /* To avoid GCC-12 warning.
> +  * GCC-12 is not evaluating sizeof at compile time.
Is this a compiler behavior change compared with previous versions?
I tried to find some clue in the GCC 12 docs but got nothing. Can you point me
to any material?

> +  */
> + if (unlikely(size == 0))
> + return -1;
> +
>   while (size > 0) {
Changing 'while () {}' to 'do {} while ()' could be a simpler solution. What
do you think?
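
For illustration, the do/while variant on a reduced version of the pattern
(a standalone sketch, not the actual vhost code):

	#include <stdint.h>

	struct buf_vector { uint32_t buf_len; };

	static uint32_t
	fill_vec(struct buf_vector *vec, uint32_t size, uint32_t chunk)
	{
		uint32_t idx = 0;

		do {			/* was: while (size > 0) { */
			vec[idx++].buf_len = chunk;
			size = size > chunk ? size - chunk : 0;
		} while (size > 0);

		/* The body provably ran at least once, so vec[0].buf_len
		 * is initialized and -Wmaybe-uninitialized is silenced
		 * without the extra size == 0 check. */
		return vec[0].buf_len;
	}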

Thanks.

>   /*
>* if we tried all available ring items, and still
> @@ -1574,6 +1580,12 @@ vhost_enqueue_async_packed(struct virtio_net *dev,
>   else
>   max_tries = 1;
> 
> + /* To avoid GCC-12 warning.
> +  * GCC-12 is not evaluating sizeof at compile time.
> +  */
> + if (unlikely(size == 0))
> + return -1;
> +
>   while (size > 0) {
>   /*
>* if we tried all available ring items, and still
> --
> 2.25.1



[PATCH v2] examples: compilation fix for GCC-12

2022-09-02 Thread Amit Prakash Shukla
GCC-12 warns when a union pointer points at an array sized for the
union's data, because the union is internally padded to the alignment
of its largest member and so extends past the end of the array.

../examples/common/neon/port_group.h:42:21: error: array subscript
'union <anonymous>[0]' is partly outside array bounds of
'uint16_t[5]' {aka 'short unsigned int[5]'}
[-Werror=array-bounds]
   42 | pnum->u64 = gptbl[v].pnum;
  | ^~
../examples/common/neon/port_group.h:21:23: note: object 'pn' of
size [0, 10]
   21 | port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1
  |  ~^~~
../examples/common/neon/port_group.h:43:21: error: array subscript
'union <anonymous>[0]' is partly outside array bounds of
'uint16_t[5]' {aka 'short unsigned int[5]'} [-Werror=array-bounds]
   43 | pnum->u16[FWDSTEP] = 1;
  | ^~
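
The padding can be seen in isolation (a standalone sketch; sizes shown
are for common 64-bit ABIs):

	#include <stdint.h>

	#define FWDSTEP 4

	union grp {			/* as in port_group.h */
		uint16_t u16[FWDSTEP + 1];	/* 10 bytes of data */
		uint64_t u64;			/* forces 8-byte alignment */
	};

	union grp_packed {
		uint16_t u16[FWDSTEP + 1];
		uint64_t u64;
	} __attribute__((__packed__));

	/* The unpacked union is rounded up to a multiple of the uint64_t
	 * alignment, so it is 16 bytes -- larger than the uint16_t[5]
	 * array it aliases, which is exactly what -Warray-bounds reports.
	 * The packed variant is 10 bytes and fits the array. */
	_Static_assert(sizeof(union grp) == 16, "padded to u64 alignment");
	_Static_assert(sizeof(union grp_packed) == 10, "matches the array");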

Fixes: bdfc3816fbfc ("examples: common packet group functionality")
Cc: sta...@dpdk.org

Signed-off-by: Amit Prakash Shukla 
---
v2:
- Changed to __rte_packed instead of direct attribute

 examples/common/neon/port_group.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/common/neon/port_group.h 
b/examples/common/neon/port_group.h
index 82c6ed6d73..04e5699f70 100644
--- a/examples/common/neon/port_group.h
+++ b/examples/common/neon/port_group.h
@@ -24,7 +24,7 @@ port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, 
uint16x8_t dp1,
union {
uint16_t u16[FWDSTEP + 1];
uint64_t u64;
-   } *pnum = (void *)pn;
+   } __rte_packed *pnum = (void *)pn;
 
uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
int32_t v;
-- 
2.25.1



Re: [PATCH v2] examples/distributor: update dynamic configuration

2022-09-02 Thread Hunt, David



On 02/09/2022 06:20, Omer Yamac wrote:

Hi David,

I applied the changes as a new version (v3). Thank you.


--snip--



With the above suggested changes to the commit message:

Reviewed-by: David Hunt 



Hi Omer,

2 things:

Usually you submit subsequent versions of patches "in-reply-to" the 
previous versions to keep them together. Please see other examples of 
this on the mailing list.


It is normal to include any Reviewed-by or Acked-by tags in subsequent 
versions if the patch has not changed significantly.


Rgds,

Dave.



[PATCH v6 0/2] Add l2reflect measurement application

2022-09-02 Thread Felix Moessbauer
Dear DPDK community,

this patch provides the l2reflect measurement tool
which will be discussed in our 2022 DPDK Userspace Summit talk:
"Using DPDK OVS for deterministic low latency communication"

While the code still might need some polish, we believe it is
a good starting point for discussions about low latency networking
in DPDK.

The tool can also be used in a CI environment to continuously
measure latencies across the evolution of DPDK and Linux.

Best regards,
Felix Moessbauer
Siemens AG

Changes since v5:

- rebased against current main
- updated API to DPDK 22.x
- use RTE tracing
- rework handling of mempool
- fixes around configurable packet size
- backported changes from the out-of-tree version

Changes since v4:

All mentioned points from Thomas Monjalon's review are addressed.
This includes:

- remove obsolete makefile
- remove authors from headers
- include subdirs in alphabetical order
- use rte functions to compare mac addrs
- use rte functions to format mac addrs
- use jansson instead of cjson for json creation
- print histogram to stderr to decouple from TUI
- add option to disable color (autodisable if redirected)
- improve documentation on how to use the tool
- improve inline documentation (mainly l2reflect_rx_filter)

This patch is still targeted towards dpdk/main until the
final decision is made to put it in DTS.
Further, currently only Linux is supported, due to the RT tuning
stuff and a missing Windows testing environment on our side.
We would be happy about contributions to port it to other
platforms as well.

Changes since v3:

- check for sys/io.h header
- fix linking issues on gcc 10

Changes since v2:

- add missing internal dependency
- improve wording of commit message

Changes since v1:

- move to app folder, as suggested by maintainer
- fix issues reported by checkpatch



Felix Moessbauer (2):
  Fix build of apps with external dependencies
  Add l2reflect measurement application

 app/l2reflect/colors.c|   34 ++
 app/l2reflect/colors.h|   19 +
 app/l2reflect/l2reflect.h |   53 ++
 app/l2reflect/main.c  | 1007 +
 app/l2reflect/meson.build |   21 +
 app/l2reflect/payload.h   |   26 +
 app/l2reflect/stats.c |  225 +
 app/l2reflect/stats.h |   67 +++
 app/l2reflect/utils.c |   67 +++
 app/l2reflect/utils.h |   20 +
 app/meson.build   |3 +-
 11 files changed, 1541 insertions(+), 1 deletion(-)
 create mode 100644 app/l2reflect/colors.c
 create mode 100644 app/l2reflect/colors.h
 create mode 100644 app/l2reflect/l2reflect.h
 create mode 100644 app/l2reflect/main.c
 create mode 100644 app/l2reflect/meson.build
 create mode 100644 app/l2reflect/payload.h
 create mode 100644 app/l2reflect/stats.c
 create mode 100644 app/l2reflect/stats.h
 create mode 100644 app/l2reflect/utils.c
 create mode 100644 app/l2reflect/utils.h

-- 
2.30.2



[PATCH v6 1/2] Fix build of apps with external dependencies

2022-09-02 Thread Felix Moessbauer
This fix initializes the dependency object with the external
dependency list. Previously, the external dependencies were
just ignored.

Signed-off-by: Felix Moessbauer 
Acked-by: Bruce Richardson 
---
 app/meson.build | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/app/meson.build b/app/meson.build
index 93d8c15032..0ea04cadeb 100644
--- a/app/meson.build
+++ b/app/meson.build
@@ -44,7 +44,7 @@ foreach app:apps
 subdir(name)
 
 if build
-dep_objs = []
+dep_objs = ext_deps
 foreach d:deps
 var_name = get_option('default_library') + '_rte_' + d
 if not is_variable(var_name)
-- 
2.30.2



[PATCH v6 2/2] Add l2reflect measurement application

2022-09-02 Thread Felix Moessbauer
The l2reflect application implements a ping-pong benchmark to
measure the latency between two instances. For communication,
we use raw ethernet and send one packet at a time. The timing data
is collected locally and min/max/avg values are displayed in a TUI.
Finally, a histogram of the latencies is printed which can be
further processed with the jitterdebugger visualization scripts.
To debug latency spikes, a max threshold can be defined.
If it is hit, a trace point is created on both instances.

Signed-off-by: Felix Moessbauer 
Signed-off-by: Henning Schild 
---
 app/l2reflect/colors.c|   34 ++
 app/l2reflect/colors.h|   19 +
 app/l2reflect/l2reflect.h |   53 ++
 app/l2reflect/main.c  | 1007 +
 app/l2reflect/meson.build |   21 +
 app/l2reflect/payload.h   |   26 +
 app/l2reflect/stats.c |  225 +
 app/l2reflect/stats.h |   67 +++
 app/l2reflect/utils.c |   67 +++
 app/l2reflect/utils.h |   20 +
 app/meson.build   |1 +
 11 files changed, 1540 insertions(+)
 create mode 100644 app/l2reflect/colors.c
 create mode 100644 app/l2reflect/colors.h
 create mode 100644 app/l2reflect/l2reflect.h
 create mode 100644 app/l2reflect/main.c
 create mode 100644 app/l2reflect/meson.build
 create mode 100644 app/l2reflect/payload.h
 create mode 100644 app/l2reflect/stats.c
 create mode 100644 app/l2reflect/stats.h
 create mode 100644 app/l2reflect/utils.c
 create mode 100644 app/l2reflect/utils.h

diff --git a/app/l2reflect/colors.c b/app/l2reflect/colors.c
new file mode 100644
index 00..af881d8788
--- /dev/null
+++ b/app/l2reflect/colors.c
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Siemens AG
+ */
+
+#include "colors.h"
+
+const struct color_palette *colors;
+
+static const struct color_palette color_palette_default = {
+   .red = "\x1b[01;31m",
+   .green = "\x1b[01;32m",
+   .yellow = "\x1b[01;33m",
+   .blue = "\x1b[01;34m",
+   .magenta = "\x1b[01;35m",
+   .cyan = "\x1b[01;36m",
+   .reset = "\x1b[0m"
+};
+
+static const struct color_palette color_palette_bw = { .red = "",
+  .green = "",
+  .yellow = "",
+  .blue = "",
+  .magenta = "",
+  .cyan = "",
+  .reset = "" };
+
+void
+enable_colors(int enable)
+{
+   if (enable)
+   colors = &color_palette_default;
+   else
+   colors = &color_palette_bw;
+}
diff --git a/app/l2reflect/colors.h b/app/l2reflect/colors.h
new file mode 100644
index 00..346547138b
--- /dev/null
+++ b/app/l2reflect/colors.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Siemens AG
+ */
+#ifndef _L2REFLECT_COLORS_H_
+#define _L2REFLECT_COLORS_H_
+
+/* posix terminal colors */
+struct color_palette {
+   const char *red, *green, *yellow, *blue, *magenta, *cyan, *reset;
+};
+
+/* ptr to the current tui color palette */
+extern const struct color_palette *colors;
+
+/* disable colored output */
+void
+enable_colors(int enable);
+
+#endif /* _L2REFLECT_COLORS_H_ */
diff --git a/app/l2reflect/l2reflect.h b/app/l2reflect/l2reflect.h
new file mode 100644
index 00..922bd7c281
--- /dev/null
+++ b/app/l2reflect/l2reflect.h
@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Siemens AG
+ */
+
+#include 
+
+#ifndef _L2REFLECT_L2REFLECT_H_
+#define _L2REFLECT_L2REFLECT_H_
+
+#define RTE_LOGTYPE_L2REFLECT RTE_LOGTYPE_USER1
+
+/* max size that common 1G NICs support */
+#define MAX_JUMBO_PKT_LEN 9600
+
+/* Used to compare MAC addresses. */
+#define MAC_ADDR_CMP 0xFFFFFFFFFFFFull
+
+#define MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+enum {
+   TRACE_TYPE_DATA,
+   TRACE_TYPE_HELO,
+   TRACE_TYPE_EHLO,
+   TRACE_TYPE_RSET,
+   TRACE_TYPE_QUIT,
+};
+
+enum STATE {
+   /* elect the initial sender */
+   S_ELECT_LEADER = 1,
+   /* reset the counters */
+   S_RESET_TRX = 2,
+   /* measurement running */
+   S_RUNNING = 4,
+   /* terminated by local event */
+   S_LOCAL_TERM = 8,
+   /* terminated by remote event */
+   S_REMOTE_TERM = 16
+};
+
+extern int l2reflect_hist;
+extern unsigned int l2reflect_hist_buckets;
+extern atomic_int l2reflect_output_hist;
+extern int l2reflect_interrupt;
+extern uint64_t l2reflect_sleep_msec;
+extern uint64_t l2reflect_pkt_bytes;
+extern uint16_t l2reflect_port_number;
+extern atomic_int l2reflect_state;
+extern struct rte_ether_addr l2reflect_port_eth_addr;
+extern struct rte_ether_addr l2reflect_remote_eth_addr;
+
+#endif /* _L2REFLECT_L2REFLECT_H_ */
diff --git a/app/l2reflect/main.c b/app/l2reflect/main.c

[PATCH v3 1/4] net/axgbe: fix scattered Rx

2022-09-02 Thread Bhagyada Modali
The error check needs to be done only for the last segment of a jumbo
packet. Free first_seg and reset eop to 0 in the error case.

Fixes: 965b3127d425 ("net/axgbe: support scattered Rx")
Cc: sta...@dpdk.org

Signed-off-by: Bhagyada Modali 

---
v2:
* removed the extra parentheses
* initialised the variables
* updated the commit logs
---
 drivers/net/axgbe/axgbe_rxtx.c | 35 +++---
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/drivers/net/axgbe/axgbe_rxtx.c b/drivers/net/axgbe/axgbe_rxtx.c
index f38bb64fab..89ed6fd74a 100644
--- a/drivers/net/axgbe/axgbe_rxtx.c
+++ b/drivers/net/axgbe/axgbe_rxtx.c
@@ -343,8 +343,8 @@ uint16_t eth_axgbe_recv_scattered_pkts(void *rx_queue,
uint64_t old_dirty = rxq->dirty;
struct rte_mbuf *first_seg = NULL;
struct rte_mbuf *mbuf, *tmbuf;
-   unsigned int err, etlt;
-   uint32_t error_status;
+   unsigned int err = 0, etlt;
+   uint32_t error_status = 0;
uint16_t idx, pidx, data_len = 0, pkt_len = 0;
uint64_t offloads;
 
@@ -381,19 +381,6 @@ uint16_t eth_axgbe_recv_scattered_pkts(void *rx_queue,
}
 
mbuf = rxq->sw_ring[idx];
-   /* Check for any errors and free mbuf*/
-   err = AXGMAC_GET_BITS_LE(desc->write.desc3,
-RX_NORMAL_DESC3, ES);
-   error_status = 0;
-   if (unlikely(err)) {
-   error_status = desc->write.desc3 & AXGBE_ERR_STATUS;
-   if ((error_status != AXGBE_L3_CSUM_ERR)
-   && (error_status != AXGBE_L4_CSUM_ERR)) {
-   rxq->errors++;
-   rte_pktmbuf_free(mbuf);
-   goto err_set;
-   }
-   }
rte_prefetch1(rte_pktmbuf_mtod(mbuf, void *));
 
if (!AXGMAC_GET_BITS_LE(desc->write.desc3,
@@ -406,6 +393,24 @@ uint16_t eth_axgbe_recv_scattered_pkts(void *rx_queue,
pkt_len = AXGMAC_GET_BITS_LE(desc->write.desc3,
RX_NORMAL_DESC3, PL);
data_len = pkt_len - rxq->crc_len;
+   /* Check for any errors and free mbuf*/
+   err = AXGMAC_GET_BITS_LE(desc->write.desc3,
+   RX_NORMAL_DESC3, ES);
+   error_status = 0;
+   if (unlikely(err)) {
+   error_status = desc->write.desc3 &
+   AXGBE_ERR_STATUS;
+   if (error_status != AXGBE_L3_CSUM_ERR &&
+   error_status != AXGBE_L4_CSUM_ERR) {
+   rxq->errors++;
+   rte_pktmbuf_free(mbuf);
+   rte_pktmbuf_free(first_seg);
+   first_seg = NULL;
+   eop = 0;
+   goto err_set;
+   }
+   }
+
}
 
if (first_seg != NULL) {
-- 
2.25.1



[PATCH v3 2/4] net/axgbe: fix mbuf lengths in scattered Rx

2022-09-02 Thread Bhagyada Modali
Updated pkt_len and data_len in the last segment of the packet.

Fixes: 965b3127d425 ("net/axgbe: support scattered Rx")
Cc: sta...@dpdk.org

Signed-off-by: Bhagyada Modali 
---
 drivers/net/axgbe/axgbe_rxtx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/axgbe/axgbe_rxtx.c b/drivers/net/axgbe/axgbe_rxtx.c
index 89ed6fd74a..2c2554e900 100644
--- a/drivers/net/axgbe/axgbe_rxtx.c
+++ b/drivers/net/axgbe/axgbe_rxtx.c
@@ -391,8 +391,8 @@ uint16_t eth_axgbe_recv_scattered_pkts(void *rx_queue,
} else {
eop = 1;
pkt_len = AXGMAC_GET_BITS_LE(desc->write.desc3,
-   RX_NORMAL_DESC3, PL);
-   data_len = pkt_len - rxq->crc_len;
+   RX_NORMAL_DESC3, PL) - rxq->crc_len;
+   data_len = pkt_len % rxq->buf_size;
/* Check for any errors and free mbuf*/
err = AXGMAC_GET_BITS_LE(desc->write.desc3,
RX_NORMAL_DESC3, ES);
-- 
2.25.1



[PATCH v3 3/4] net/axgbe: fix length of each segment in scattered Rx

2022-09-02 Thread Bhagyada Modali
Update the mbuf data_len and pkt_len for each segment before chaining them.

Fixes: 965b3127d425 ("net/axgbe: support scattered Rx")
Cc: sta...@dpdk.org

Signed-off-by: Bhagyada Modali 
---
 drivers/net/axgbe/axgbe_rxtx.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/axgbe/axgbe_rxtx.c b/drivers/net/axgbe/axgbe_rxtx.c
index 2c2554e900..7c07fd90ef 100644
--- a/drivers/net/axgbe/axgbe_rxtx.c
+++ b/drivers/net/axgbe/axgbe_rxtx.c
@@ -412,6 +412,10 @@ uint16_t eth_axgbe_recv_scattered_pkts(void *rx_queue,
}
 
}
+   /* Mbuf populate */
+   mbuf->data_off = RTE_PKTMBUF_HEADROOM;
+   mbuf->data_len = data_len;
+   mbuf->pkt_len = data_len;
 
if (first_seg != NULL) {
if (rte_pktmbuf_chain(first_seg, mbuf) != 0)
@@ -443,9 +447,6 @@ uint16_t eth_axgbe_recv_scattered_pkts(void *rx_queue,
mbuf->vlan_tci = 0;
}
}
-   /* Mbuf populate */
-   mbuf->data_off = RTE_PKTMBUF_HEADROOM;
-   mbuf->data_len = data_len;
 
 err_set:
rxq->cur++;
-- 
2.25.1



[PATCH v3 4/4] net/axgbe: fix checksum and RSS in scattered Rx

2022-09-02 Thread Bhagyada Modali
Updated the RSS hash and checksum checks to use first_seg instead of mbuf.

Fixes: 965b3127d425 ("net/axgbe: support scattered Rx")
Cc: sta...@dpdk.org

Signed-off-by: Bhagyada Modali 
---
 drivers/net/axgbe/axgbe_rxtx.c | 41 +-
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/drivers/net/axgbe/axgbe_rxtx.c b/drivers/net/axgbe/axgbe_rxtx.c
index 7c07fd90ef..2bad638f79 100644
--- a/drivers/net/axgbe/axgbe_rxtx.c
+++ b/drivers/net/axgbe/axgbe_rxtx.c
@@ -427,24 +427,27 @@ uint16_t eth_axgbe_recv_scattered_pkts(void *rx_queue,
 
/* Get the RSS hash */
if (AXGMAC_GET_BITS_LE(desc->write.desc3, RX_NORMAL_DESC3, RSV))
-   mbuf->hash.rss = rte_le_to_cpu_32(desc->write.desc1);
+   first_seg->hash.rss =
+   rte_le_to_cpu_32(desc->write.desc1);
etlt = AXGMAC_GET_BITS_LE(desc->write.desc3,
RX_NORMAL_DESC3, ETLT);
offloads = rxq->pdata->eth_dev->data->dev_conf.rxmode.offloads;
if (!err || !etlt) {
if (etlt == RX_CVLAN_TAG_PRESENT) {
-   mbuf->ol_flags |= RTE_MBUF_F_RX_VLAN;
-   mbuf->vlan_tci =
+   first_seg->ol_flags |= RTE_MBUF_F_RX_VLAN;
+   first_seg->vlan_tci =
AXGMAC_GET_BITS_LE(desc->write.desc0,
RX_NORMAL_DESC0, OVT);
if (offloads & RTE_ETH_RX_OFFLOAD_VLAN_STRIP)
-   mbuf->ol_flags |= RTE_MBUF_F_RX_VLAN_STRIPPED;
+   first_seg->ol_flags |=
+   RTE_MBUF_F_RX_VLAN_STRIPPED;
else
-   mbuf->ol_flags &= ~RTE_MBUF_F_RX_VLAN_STRIPPED;
+   first_seg->ol_flags &=
+   ~RTE_MBUF_F_RX_VLAN_STRIPPED;
} else {
-   mbuf->ol_flags &=
+   first_seg->ol_flags &=
~(RTE_MBUF_F_RX_VLAN | RTE_MBUF_F_RX_VLAN_STRIPPED);
-   mbuf->vlan_tci = 0;
+   first_seg->vlan_tci = 0;
}
}
 
@@ -468,18 +471,24 @@ uint16_t eth_axgbe_recv_scattered_pkts(void *rx_queue,
 
first_seg->port = rxq->port_id;
if (rxq->pdata->rx_csum_enable) {
-   mbuf->ol_flags = 0;
-   mbuf->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD;
-   mbuf->ol_flags |= RTE_MBUF_F_RX_L4_CKSUM_GOOD;
+   first_seg->ol_flags = 0;
+   first_seg->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_GOOD;
+   first_seg->ol_flags |= RTE_MBUF_F_RX_L4_CKSUM_GOOD;
if (unlikely(error_status == AXGBE_L3_CSUM_ERR)) {
-   mbuf->ol_flags &= ~RTE_MBUF_F_RX_IP_CKSUM_GOOD;
-   mbuf->ol_flags |= RTE_MBUF_F_RX_IP_CKSUM_BAD;
-   mbuf->ol_flags &= ~RTE_MBUF_F_RX_L4_CKSUM_GOOD;
-   mbuf->ol_flags |= RTE_MBUF_F_RX_L4_CKSUM_UNKNOWN;
+   first_seg->ol_flags &=
+   ~RTE_MBUF_F_RX_IP_CKSUM_GOOD;
+   first_seg->ol_flags |=
+   RTE_MBUF_F_RX_IP_CKSUM_BAD;
+   first_seg->ol_flags &=
+   ~RTE_MBUF_F_RX_L4_CKSUM_GOOD;
+   first_seg->ol_flags |=
+   RTE_MBUF_F_RX_L4_CKSUM_UNKNOWN;
} else if (unlikely(error_status
== AXGBE_L4_CSUM_ERR)) {
-   mbuf->ol_flags &= ~RTE_MBUF_F_RX_L4_CKSUM_GOOD;
-   mbuf->ol_flags |= RTE_MBUF_F_RX_L4_CKSUM_BAD;
+   first_seg->ol_flags &=
+   ~RTE_MBUF_F_RX_L4_CKSUM_GOOD;
+   first_seg->ol_flags |=
+   RTE_MBUF_F_RX_L4_CKSUM_BAD;
}
}
 
-- 
2.25.1



Re: [PATCH v3] examples/distributor: update dynamic configuration

2022-09-02 Thread Hunt, David



On 01/09/2022 15:09, omer.yamac at ceng.metu.edu.tr (Abdullah Ömer 
Yamaç) wrote:

In this patch:
 * It is possible to switch the running mode of the distributor
using a command-line argument.
 * With the "-c" parameter, RX and distributor run on the same core.
 * Without the "-c" parameter, RX and distributor run on different
cores.
 * Consecutive termination of the lcores is fixed.
The termination order was wrong, and you couldn't terminate the
application while traffic was being captured. The current order is
RX -> Distributor -> TX -> Workers.
 * When the "-c" parameter is active, the wasted distributor core is
also deactivated in the main function (see the example invocations
below).
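
Example invocations (a sketch; the EAL options and port mask are
illustrative values, not taken from the patch):

  ./dpdk-distributor -l 1-8 -n 4 -- -p f -c   # RX and distributor share a core
  ./dpdk-distributor -l 1-8 -n 4 -- -p f      # RX and distributor on separate cores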

Signed-off-by: Abdullah Ömer Yamaç 

---
Cc: david.hunt at intel.com
---


LGTM

Reviewed-by: David Hunt 




[PATCH v2 1/5] examples/l3fwd: fix port group mask generation

2022-09-02 Thread pbhagavatula
From: Pavan Nikhilesh 

Fix port group mask generation in the Altivec path: vec_any_eq()
returns 0 or 1, while port_groupx4() expects a per-lane comparison
mask.
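
In scalar terms (a sketch; a[] and b[] stand in for the vector lanes):

	/* What port_groupx4() expects: bit i set iff lane i matches; the
	 * 4-bit value indexes gptbl[], one entry per match combination.
	 * vec_any_eq() instead collapses the comparison into a single 0/1
	 * ("is any lane equal?"), so only gptbl[0]/gptbl[1] were used.
	 */
	static int
	lane_match_mask(const uint16_t a[4], const uint16_t b[4])
	{
		return ((a[0] == b[0]) << 0) | ((a[1] == b[1]) << 1) |
		       ((a[2] == b[2]) << 2) | ((a[3] == b[3]) << 3);
	}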

Fixes: 2193b7467f7a ("examples/l3fwd: optimize packet processing on powerpc")
Cc: sta...@dpdk.org

Signed-off-by: Pavan Nikhilesh 
---
 v2 Changes:
 - Fix PPC, RISC-V, aarch32 compilation.

 examples/common/altivec/port_group.h | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/examples/common/altivec/port_group.h 
b/examples/common/altivec/port_group.h
index 5e209b02fa..592ef80b7f 100644
--- a/examples/common/altivec/port_group.h
+++ b/examples/common/altivec/port_group.h
@@ -26,12 +26,19 @@ port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp,
uint16_t u16[FWDSTEP + 1];
uint64_t u64;
} *pnum = (void *)pn;
+   union u_vec {
+   __vector unsigned short v_us;
+   unsigned short s[8];
+   };

+   union u_vec res;
int32_t v;

-   v = vec_any_eq(dp1, dp2);
-
+   dp1 = (__vector unsigned short)vec_cmpeq(dp1, dp2);
+   res.v_us = dp1;

+   v = (res.s[0] & 0x1) | (res.s[1] & 0x2) | (res.s[2] & 0x4) |
+   (res.s[3] & 0x8);
/* update last port counter. */
lp[0] += gptbl[v].lpv;

--
2.25.1



[PATCH v2 2/5] examples/l3fwd: split processing and send stages

2022-09-02 Thread pbhagavatula
From: Pavan Nikhilesh 

Split the packet processing stage from the packet send stage, as the
send stage is not common to the poll and event modes.

Signed-off-by: Pavan Nikhilesh 
---
 examples/l3fwd/l3fwd_em_hlm.h  | 39 +++---
 examples/l3fwd/l3fwd_lpm_altivec.h | 25 ---
 examples/l3fwd/l3fwd_lpm_neon.h| 35 ---
 examples/l3fwd/l3fwd_lpm_sse.h | 25 ---
 4 files changed, 95 insertions(+), 29 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index e76f2760b0..12b997e477 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -177,16 +177,12 @@ em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
return portid;
 }
 
-/*
- * Buffer optimized handling of packets, invoked
- * from main_loop.
- */
 static inline void
-l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-   uint16_t portid, struct lcore_conf *qconf)
+l3fwd_em_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+uint16_t *dst_port, uint16_t portid,
+struct lcore_conf *qconf, const uint8_t do_step3)
 {
int32_t i, j, pos;
-   uint16_t dst_port[MAX_PKT_BURST];
 
/*
 * Send nb_rx - nb_rx % EM_HASH_LOOKUP_COUNT packets
@@ -233,13 +229,30 @@ l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
dst_port[j + i] = em_get_dst_port(qconf,
pkts_burst[j + i], portid);
}
+
+   for (i = 0; i < EM_HASH_LOOKUP_COUNT && do_step3; i += FWDSTEP)
+   processx4_step3(&pkts_burst[j + i], &dst_port[j + i]);
}
 
-   for (; j < nb_rx; j++)
+   for (; j < nb_rx; j++) {
dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
+   if (do_step3)
+   process_packet(pkts_burst[j], &pkts_burst[j]->port);
+   }
+}
 
-   send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, uint16_t portid,
+ struct lcore_conf *qconf)
+{
+   uint16_t dst_port[MAX_PKT_BURST];
 
+   l3fwd_em_process_packets(nb_rx, pkts_burst, dst_port, portid, qconf, 0);
+   send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
 
 /*
@@ -260,11 +273,8 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
 */
int32_t n = RTE_ALIGN_FLOOR(nb_rx, EM_HASH_LOOKUP_COUNT);
 
-   for (j = 0; j < EM_HASH_LOOKUP_COUNT && j < nb_rx; j++) {
+   for (j = 0; j < nb_rx; j++)
pkts_burst[j] = ev[j]->mbuf;
-   rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
-  struct rte_ether_hdr *) + 1);
-   }
 
for (j = 0; j < n; j += EM_HASH_LOOKUP_COUNT) {
 
@@ -305,7 +315,8 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
}
continue;
}
-   processx4_step3(&pkts_burst[j], &dst_port[j]);
+   for (i = 0; i < EM_HASH_LOOKUP_COUNT; i += FWDSTEP)
+   processx4_step3(&pkts_burst[j + i], &dst_port[j + i]);
 
for (i = 0; i < EM_HASH_LOOKUP_COUNT; i++)
pkts_burst[j + i]->port = dst_port[j + i];
diff --git a/examples/l3fwd/l3fwd_lpm_altivec.h b/examples/l3fwd/l3fwd_lpm_altivec.h
index 0c6852a7bb..adb82f1478 100644
--- a/examples/l3fwd/l3fwd_lpm_altivec.h
+++ b/examples/l3fwd/l3fwd_lpm_altivec.h
@@ -96,11 +96,11 @@ processx4_step2(const struct lcore_conf *qconf,
  * from main_loop.
  */
 static inline void
-l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
-   uint8_t portid, struct lcore_conf *qconf)
+l3fwd_lpm_process_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+ uint8_t portid, uint16_t *dst_port,
+ struct lcore_conf *qconf, const uint8_t do_step3)
 {
int32_t j;
-   uint16_t dst_port[MAX_PKT_BURST];
__vector unsigned int dip[MAX_PKT_BURST / FWDSTEP];
uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
@@ -114,22 +114,41 @@ l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
ipv4_flag[j / FWDSTEP],
portid, &pkts_burst[j], &dst_port[j]);
 
+   if (do_step3)
+   for (j = 0; j != k; j += FWDSTEP)
+   processx4_step3(&pkts_burst[j], &dst_port[j]);
+
/* Classify last up to 3 packets one by one */
switch (nb_rx % FWDSTEP) {
case 3:
dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+   if 

[PATCH v2 4/5] examples/l3fwd: fix event vector processing in fib

2022-09-02 Thread pbhagavatula
From: Pavan Nikhilesh 

Fix a stack overflow that occurs when the event vector size is greater
than MAX_BURST_SIZE.
Add the missing MAC swap and rfc1812 stages.

Fixes: e8adca1951d4 ("examples/l3fwd: support event vector")

Signed-off-by: Pavan Nikhilesh 
---
 examples/l3fwd/l3fwd_fib.c | 124 ++---
 1 file changed, 87 insertions(+), 37 deletions(-)

diff --git a/examples/l3fwd/l3fwd_fib.c b/examples/l3fwd/l3fwd_fib.c
index e02e4b3f5a..ada5d0d430 100644
--- a/examples/l3fwd/l3fwd_fib.c
+++ b/examples/l3fwd/l3fwd_fib.c
@@ -77,27 +77,38 @@ fib_parse_packet(struct rte_mbuf *mbuf,
  */
 #if !defined FIB_SEND_MULTI
 static inline void
-fib_send_single(int nb_tx, struct lcore_conf *qconf,
-   struct rte_mbuf **pkts_burst, uint16_t hops[nb_tx])
+process_packet(struct rte_mbuf *pkt, uint16_t *hop)
 {
-   int32_t j;
struct rte_ether_hdr *eth_hdr;
 
-   for (j = 0; j < nb_tx; j++) {
-   /* Run rfc1812 if packet is ipv4 and checks enabled. */
+   /* Run rfc1812 if packet is ipv4 and checks enabled. */
 #if defined DO_RFC_1812_CHECKS
-   rfc1812_process((struct rte_ipv4_hdr *)(rte_pktmbuf_mtod(
-   pkts_burst[j], struct rte_ether_hdr *) + 1),
-   &hops[j], pkts_burst[j]->packet_type);
+   rfc1812_process(
+   (struct rte_ipv4_hdr *)(rte_pktmbuf_mtod(
+   pkt, struct rte_ether_hdr *) +
+   1),
+   hop, pkt->packet_type,
+   pkt->ol_flags & RTE_MBUF_F_RX_IP_CKSUM_MASK);
 #endif
 
-   /* Set MAC addresses. */
-   eth_hdr = rte_pktmbuf_mtod(pkts_burst[j],
-   struct rte_ether_hdr *);
-   *(uint64_t *)&eth_hdr->dst_addr = dest_eth_addr[hops[j]];
-   rte_ether_addr_copy(&ports_eth_addr[hops[j]],
-   &eth_hdr->src_addr);
+   /* Set MAC addresses. */
+   eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *);
+   *(uint64_t *)&eth_hdr->dst_addr = dest_eth_addr[*hop];
+   rte_ether_addr_copy(&ports_eth_addr[*hop], &eth_hdr->src_addr);
+}
+
+static inline void
+fib_send_single(int nb_tx, struct lcore_conf *qconf,
+   struct rte_mbuf **pkts_burst, uint16_t hops[nb_tx])
+{
+   int32_t j;
 
+   for (j = 0; j < nb_tx; j++) {
+   process_packet(pkts_burst[j], &hops[j]);
+   if (hops[j] == BAD_PORT) {
+   rte_pktmbuf_free(pkts_burst[j]);
+   continue;
+   }
/* Send single packet. */
send_single_packet(qconf, pkts_burst[j], hops[j]);
}
@@ -261,7 +272,7 @@ fib_event_loop(struct l3fwd_event_resources *evt_rsrc,
uint32_t ipv4_arr[MAX_PKT_BURST];
uint8_t ipv6_arr[MAX_PKT_BURST][RTE_FIB6_IPV6_ADDR_SIZE];
uint64_t hopsv4[MAX_PKT_BURST], hopsv6[MAX_PKT_BURST];
-   uint16_t nh;
+   uint16_t nh, hops[MAX_PKT_BURST];
uint8_t type_arr[MAX_PKT_BURST];
uint32_t ipv4_cnt, ipv6_cnt;
uint32_t ipv4_arr_assem, ipv6_arr_assem;
@@ -350,7 +361,13 @@ fib_event_loop(struct l3fwd_event_resources *evt_rsrc,
else
nh = (uint16_t)hopsv6[ipv6_arr_assem++];
if (nh != FIB_DEFAULT_HOP)
-   events[i].mbuf->port = nh;
+   hops[i] = nh != FIB_DEFAULT_HOP ?
+ nh :
+ events[i].mbuf->port;
+   process_packet(events[i].mbuf, &hops[i]);
+   events[i].mbuf->port = hops[i] != BAD_PORT ?
+  hops[i] :
+  events[i].mbuf->port;
}
 
if (flags & L3FWD_EVENT_TX_ENQ) {
@@ -418,14 +435,12 @@ fib_event_main_loop_tx_q_burst(__rte_unused void *dummy)
 }
 
 static __rte_always_inline void
-fib_process_event_vector(struct rte_event_vector *vec)
+fib_process_event_vector(struct rte_event_vector *vec, uint8_t *type_arr,
+uint8_t **ipv6_arr, uint64_t *hopsv4, uint64_t *hopsv6,
+uint32_t *ipv4_arr, uint16_t *hops)
 {
-   uint8_t ipv6_arr[MAX_PKT_BURST][RTE_FIB6_IPV6_ADDR_SIZE];
-   uint64_t hopsv4[MAX_PKT_BURST], hopsv6[MAX_PKT_BURST];
uint32_t ipv4_arr_assem, ipv6_arr_assem;
struct rte_mbuf **mbufs = vec->mbufs;
-   uint32_t ipv4_arr[MAX_PKT_BURST];
-   uint8_t type_arr[MAX_PKT_BURST];
uint32_t ipv4_cnt, ipv6_cnt;
struct lcore_conf *lconf;
uint16_t nh;
@@ -463,16 +478,10 @@ fib_process_event_vector(struct rte_event_vector *vec)
 
/* Lookup IPv6 hops if IPv6 packets are present. */
if (ipv6_cnt > 0)
-   rte_fib6_lookup_bulk(lconf->ip

[PATCH v2 3/5] examples/l3fwd: use lpm vector path for event vector

2022-09-02 Thread pbhagavatula
From: Pavan Nikhilesh 

Use lpm vector path to process event vector.

Signed-off-by: Pavan Nikhilesh 
---
 examples/l3fwd/l3fwd_altivec.h | 29 ++
 examples/l3fwd/l3fwd_event.h   | 71 ++
 examples/l3fwd/l3fwd_lpm.c | 38 ++
 examples/l3fwd/l3fwd_neon.h| 45 +
 examples/l3fwd/l3fwd_sse.h | 44 +
 5 files changed, 211 insertions(+), 16 deletions(-)

diff --git a/examples/l3fwd/l3fwd_altivec.h b/examples/l3fwd/l3fwd_altivec.h
index 87018f5dbe..e45e138e59 100644
--- a/examples/l3fwd/l3fwd_altivec.h
+++ b/examples/l3fwd/l3fwd_altivec.h
@@ -222,4 +222,33 @@ send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
}
 }
 
+static __rte_always_inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+   uint16_t i = 0, res;
+
+   while (nb_elem > 7) {
+   __vector unsigned short dp1;
+   __vector unsigned short dp;
+
+   dp = (__vector unsigned short)vec_splats((short)dst_ports[0]);
+   dp1 = *((__vector unsigned short *)&dst_ports[i]);
+   res = vec_all_eq(dp1, dp);
+   if (!res)
+   return BAD_PORT;
+
+   nb_elem -= 8;
+   i += 8;
+   }
+
+   while (nb_elem) {
+   if (dst_ports[i] != dst_ports[0])
+   return BAD_PORT;
+   nb_elem--;
+   i++;
+   }
+
+   return dst_ports[0];
+}
+
 #endif /* _L3FWD_ALTIVEC_H_ */
diff --git a/examples/l3fwd/l3fwd_event.h b/examples/l3fwd/l3fwd_event.h
index b93841a16f..3fe38aada0 100644
--- a/examples/l3fwd/l3fwd_event.h
+++ b/examples/l3fwd/l3fwd_event.h
@@ -82,6 +82,27 @@ struct l3fwd_event_resources {
uint64_t vector_tmo_ns;
 };
 
+#if defined(RTE_ARCH_X86)
+#include "l3fwd_sse.h"
+#elif defined __ARM_NEON
+#include "l3fwd_neon.h"
+#elif defined(RTE_ARCH_PPC_64)
+#include "l3fwd_altivec.h"
+#else
+static inline uint16_t
+process_dst_port(uint16_t *dst_ports, uint16_t nb_elem)
+{
+   int i;
+
+   for (i = 0; i < nb_elem; i++) {
+   if (dst_ports[i] != dst_ports[0])
+   return BAD_PORT;
+   }
+
+   return dst_ports[0];
+}
+#endif
+
 static inline void
 event_vector_attr_validate(struct rte_event_vector *vec, struct rte_mbuf *mbuf)
 {
@@ -103,7 +124,57 @@ event_vector_txq_set(struct rte_event_vector *vec, uint16_t txq)
}
 }
 
+static inline uint16_t
+filter_bad_packets(struct rte_mbuf **mbufs, uint16_t *dst_port,
+  uint16_t nb_pkts)
+{
+   uint16_t *des_pos, free = 0;
+   struct rte_mbuf **pos;
+   int i;
+
+   /* Filter out and free bad packets */
+   for (i = 0; i < nb_pkts; i++) {
+   if (dst_port[i] == BAD_PORT) {
+   rte_pktmbuf_free(mbufs[i]);
+   if (!free) {
+   pos = &mbufs[i];
+   des_pos = &dst_port[i];
+   }
+   free++;
+   continue;
+   }
+
+   if (free) {
+   *pos = mbufs[i];
+   pos++;
+   *des_pos = dst_port[i];
+   des_pos++;
+   }
+   }
 
+   return nb_pkts - free;
+}
+
+static inline void
+process_event_vector(struct rte_event_vector *vec, uint16_t *dst_port)
+{
+   uint16_t port, i;
+
+   vec->nb_elem = filter_bad_packets(vec->mbufs, dst_port, vec->nb_elem);
+   /* Verify destination array */
+   port = process_dst_port(dst_port, vec->nb_elem);
+   if (port == BAD_PORT) {
+   vec->attr_valid = 0;
+   for (i = 0; i < vec->nb_elem; i++) {
+   vec->mbufs[i]->port = dst_port[i];
+   rte_event_eth_tx_adapter_txq_set(vec->mbufs[i], 0);
+   }
+   } else {
+   vec->attr_valid = 1;
+   vec->port = port;
+   vec->queue = 0;
+   }
+}
 
 struct l3fwd_event_resources *l3fwd_get_eventdev_rsrc(void);
 void l3fwd_event_resource_setup(struct rte_eth_conf *port_conf);
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index d1b850dd5b..1652b7c470 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -425,24 +425,27 @@ lpm_event_main_loop_tx_q_burst(__rte_unused void *dummy)
 }
 
 static __rte_always_inline void
-lpm_process_event_vector(struct rte_event_vector *vec, struct lcore_conf *lconf)
+lpm_process_event_vector(struct rte_event_vector *vec, struct lcore_conf *lconf,
+uint16_t *dst_port)
 {
struct rte_mbuf **mbufs = vec->mbufs;
int i;
 
-   /* Process first packet to init vector attributes */
-   lpm_process_event_pkt(lconf, mbufs[0]);
+#if defined RTE_ARCH_X86 || defined __ARM_NEON || defined RTE_ARCH_PPC_64
if 

[PATCH v2 5/5] examples/l3fwd: use em vector path for event vector

2022-09-02 Thread pbhagavatula
From: Pavan Nikhilesh 

Use em vector path to process event vector.

Signed-off-by: Pavan Nikhilesh 
---
 examples/l3fwd/l3fwd_em.c| 12 +++--
 examples/l3fwd/l3fwd_em.h| 29 +--
 examples/l3fwd/l3fwd_em_hlm.h| 72 +---
 examples/l3fwd/l3fwd_em_sequential.h | 25 ++
 examples/l3fwd/l3fwd_event.h | 21 
 5 files changed, 47 insertions(+), 112 deletions(-)

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index 10be24c61d..e7b35cfbd9 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -852,10 +852,15 @@ em_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
int i, nb_enq = 0, nb_deq = 0;
struct lcore_conf *lconf;
unsigned int lcore_id;
+   uint16_t *dst_ports;
 
if (event_p_id < 0)
return;
 
+   dst_ports = rte_zmalloc("", sizeof(uint16_t) * evt_rsrc->vector_size,
+   RTE_CACHE_LINE_SIZE);
+   if (dst_ports == NULL)
+   return;
lcore_id = rte_lcore_id();
lconf = &lcore_conf[lcore_id];
 
@@ -877,13 +882,12 @@ em_event_loop_vector(struct l3fwd_event_resources *evt_rsrc,
}
 
 #if defined RTE_ARCH_X86 || defined __ARM_NEON
-   l3fwd_em_process_event_vector(events[i].vec, lconf);
+   l3fwd_em_process_event_vector(events[i].vec, lconf,
+ dst_ports);
 #else
l3fwd_em_no_opt_process_event_vector(events[i].vec,
-lconf);
+lconf, dst_ports);
 #endif
-   if (flags & L3FWD_EVENT_TX_DIRECT)
-   event_vector_txq_set(events[i].vec, 0);
}
 
if (flags & L3FWD_EVENT_TX_ENQ) {
diff --git a/examples/l3fwd/l3fwd_em.h b/examples/l3fwd/l3fwd_em.h
index fe2ee59f6a..7d051fc076 100644
--- a/examples/l3fwd/l3fwd_em.h
+++ b/examples/l3fwd/l3fwd_em.h
@@ -100,7 +100,7 @@ l3fwd_em_simple_forward(struct rte_mbuf *m, uint16_t portid,
}
 }
 
-static __rte_always_inline void
+static __rte_always_inline uint16_t
 l3fwd_em_simple_process(struct rte_mbuf *m, struct lcore_conf *qconf)
 {
struct rte_ether_hdr *eth_hdr;
@@ -117,6 +117,8 @@ l3fwd_em_simple_process(struct rte_mbuf *m, struct lcore_conf *qconf)
m->port = l3fwd_em_handle_ipv6(m, m->port, eth_hdr, qconf);
else
m->port = BAD_PORT;
+
+   return m->port;
 }
 
 /*
@@ -179,7 +181,8 @@ l3fwd_em_no_opt_process_events(int nb_rx, struct rte_event **events,
 
 static inline void
 l3fwd_em_no_opt_process_event_vector(struct rte_event_vector *vec,
-struct lcore_conf *qconf)
+struct lcore_conf *qconf,
+uint16_t *dst_ports)
 {
struct rte_mbuf **mbufs = vec->mbufs;
int32_t i;
@@ -188,30 +191,20 @@ l3fwd_em_no_opt_process_event_vector(struct rte_event_vector *vec,
for (i = 0; i < PREFETCH_OFFSET && i < vec->nb_elem; i++)
rte_prefetch0(rte_pktmbuf_mtod(mbufs[i], void *));
 
-   /* Process first packet to init vector attributes */
-   l3fwd_em_simple_process(mbufs[0], qconf);
-   if (vec->attr_valid) {
-   if (mbufs[0]->port != BAD_PORT)
-   vec->port = mbufs[0]->port;
-   else
-   vec->attr_valid = 0;
-   }
-
/*
 * Prefetch and forward already prefetched packets.
 */
-   for (i = 1; i < (vec->nb_elem - PREFETCH_OFFSET); i++) {
+   for (i = 0; i < (vec->nb_elem - PREFETCH_OFFSET); i++) {
rte_prefetch0(
rte_pktmbuf_mtod(mbufs[i + PREFETCH_OFFSET], void *));
-   l3fwd_em_simple_process(mbufs[i], qconf);
-   event_vector_attr_validate(vec, mbufs[i]);
+   dst_ports[i] = l3fwd_em_simple_process(mbufs[i], qconf);
}
 
/* Forward remaining prefetched packets */
-   for (; i < vec->nb_elem; i++) {
-   l3fwd_em_simple_process(mbufs[i], qconf);
-   event_vector_attr_validate(vec, mbufs[i]);
-   }
+   for (; i < vec->nb_elem; i++)
+   dst_ports[i] = l3fwd_em_simple_process(mbufs[i], qconf);
+
+   process_event_vector(vec, dst_ports);
 }
 
 #endif /* __L3FWD_EM_H__ */
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index 12b997e477..2e11eefad7 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -332,70 +332,20 @@ l3fwd_em_process_events(int nb_rx, struct rte_event **ev,
 
 static inline void
 l3fwd_em_process_event_vector(struct rte_event_vector *vec,
- struct lcore_conf *qconf)
+  

[PATCH v3 0/4] support protocol based buffer split

2022-09-02 Thread Yuan Wang
Protocol type based buffer split consists of splitting a received packet
into several separate segments based on the packet content. It is useful
in some scenarios, such as GPU acceleration. The splitting will help to
enable true zero copy and hence improve the performance significantly.

This patchset aims to support protocol header split based on the current
buffer split. When an Rx queue is configured with the
RTE_ETH_RX_OFFLOAD_BUFFER_SPLIT offload and a corresponding protocol,
received packets will be split directly into different mempools.

Change log:
v3:
Fix mail thread.

v2:
Add mbuf dump to the driver's buffer split path.
Add buffer split to the driver feature list.
Remove unsupported header protocols from the driver.

Yuan Wang (4):
  ethdev: introduce protocol header API
  ethdev: introduce protocol hdr based buffer split
  app/testpmd: add rxhdrs commands and parameters
  net/ice: support buffer split in Rx path

 app/test-pmd/cmdline.c | 123 +-
 app/test-pmd/config.c  |  70 
 app/test-pmd/parameters.c  |  16 +-
 app/test-pmd/testpmd.c |   2 +
 app/test-pmd/testpmd.h |   6 +
 doc/guides/nics/features/ice.ini   |   1 +
 doc/guides/rel_notes/release_22_11.rst |  14 ++
 drivers/net/ice/ice_ethdev.c   |  30 +++-
 drivers/net/ice/ice_rxtx.c | 220 +
 drivers/net/ice/ice_rxtx.h |  16 ++
 drivers/net/ice/ice_rxtx_vec_common.h  |   3 +
 lib/ethdev/ethdev_driver.h |  15 ++
 lib/ethdev/rte_ethdev.c|  88 --
 lib/ethdev/rte_ethdev.h|  41 -
 lib/ethdev/version.map |   3 +
 15 files changed, 600 insertions(+), 48 deletions(-)

-- 
2.25.1



[PATCH v3 1/4] ethdev: introduce protocol header API

2022-09-02 Thread Yuan Wang
Add a new ethdev API to retrieve supported protocol headers
of a PMD, which helps to configure protocol header based buffer split.

Signed-off-by: Yuan Wang 
Signed-off-by: Xuan Ding 
Signed-off-by: Wenxuan Wu 
---
 doc/guides/rel_notes/release_22_11.rst |  5 
 lib/ethdev/ethdev_driver.h | 15 
 lib/ethdev/rte_ethdev.c| 33 ++
 lib/ethdev/rte_ethdev.h| 24 +++
 lib/ethdev/version.map |  3 +++
 5 files changed, 80 insertions(+)

diff --git a/doc/guides/rel_notes/release_22_11.rst 
b/doc/guides/rel_notes/release_22_11.rst
index 8c021cf050..4d90514a9a 100644
--- a/doc/guides/rel_notes/release_22_11.rst
+++ b/doc/guides/rel_notes/release_22_11.rst
@@ -55,6 +55,11 @@ New Features
  Also, make sure to start the actual text at the margin.
  ===
 
+* **Added new ethdev API for PMD to get buffer split supported protocol 
types.**
+
+  Added ``rte_eth_buffer_split_get_supported_hdr_ptypes()``, to get supported
+  header protocols of a PMD to split.
+
 
 Removed Items
 -
diff --git a/lib/ethdev/ethdev_driver.h b/lib/ethdev/ethdev_driver.h
index 5101868ea7..f64ceb9907 100644
--- a/lib/ethdev/ethdev_driver.h
+++ b/lib/ethdev/ethdev_driver.h
@@ -1054,6 +1054,18 @@ typedef int (*eth_ip_reassembly_conf_get_t)(struct 
rte_eth_dev *dev,
 typedef int (*eth_ip_reassembly_conf_set_t)(struct rte_eth_dev *dev,
const struct rte_eth_ip_reassembly_params *conf);
 
+/**
+ * @internal
+ * Get supported header protocols of a PMD to split.
+ *
+ * @param dev
+ *   Ethdev handle of port.
+ *
+ * @return
+ *   An array pointer to store supported protocol headers.
+ */
+typedef const uint32_t *(*eth_buffer_split_supported_hdr_ptypes_get_t)(struct 
rte_eth_dev *dev);
+
 /**
  * @internal
  * Dump private info from device to a file.
@@ -1301,6 +1313,9 @@ struct eth_dev_ops {
/** Set IP reassembly configuration */
eth_ip_reassembly_conf_set_t ip_reassembly_conf_set;
 
+   /** Get supported header ptypes to split */
+   eth_buffer_split_supported_hdr_ptypes_get_t 
buffer_split_supported_hdr_ptypes_get;
+
/** Dump private info from device */
eth_dev_priv_dump_t eth_dev_priv_dump;
 
diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c
index 1979dc0850..093c577add 100644
--- a/lib/ethdev/rte_ethdev.c
+++ b/lib/ethdev/rte_ethdev.c
@@ -5917,6 +5917,39 @@ rte_eth_dev_priv_dump(uint16_t port_id, FILE *file)
return eth_err(port_id, (*dev->dev_ops->eth_dev_priv_dump)(dev, file));
 }
 
+int
+rte_eth_buffer_split_get_supported_hdr_ptypes(uint16_t port_id, uint32_t 
*ptypes, int num)
+{
+   int i, j;
+   struct rte_eth_dev *dev;
+   const uint32_t *all_types;
+
+   RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
+   dev = &rte_eth_devices[port_id];
+
+   if (ptypes == NULL && num > 0) {
+   RTE_ETHDEV_LOG(ERR,
+   "Cannot get ethdev port %u supported header protocol 
types to NULL "
+   "when array size is non zero\n",
+   port_id);
+   return -EINVAL;
+   }
+
+   
RTE_FUNC_PTR_OR_ERR_RET(*dev->dev_ops->buffer_split_supported_hdr_ptypes_get, 
-ENOTSUP);
+   all_types = (*dev->dev_ops->buffer_split_supported_hdr_ptypes_get)(dev);
+
+   if (!all_types)
+   return 0;
+
+   for (i = 0, j = 0; all_types[i] != RTE_PTYPE_UNKNOWN; ++i) {
+   if (j < num)
+   ptypes[j] = all_types[i];
+   j++;
+   }
+
+   return j;
+}
+
 RTE_LOG_REGISTER_DEFAULT(rte_eth_dev_logtype, INFO);
 
 RTE_INIT(ethdev_init_telemetry)
diff --git a/lib/ethdev/rte_ethdev.h b/lib/ethdev/rte_ethdev.h
index de9e970d4d..c58c908c3a 100644
--- a/lib/ethdev/rte_ethdev.h
+++ b/lib/ethdev/rte_ethdev.h
@@ -6206,6 +6206,30 @@ rte_eth_tx_buffer(uint16_t port_id, uint16_t queue_id,
return rte_eth_tx_buffer_flush(port_id, queue_id, buffer);
 }
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice
+ *
+ * Get supported header protocols to split on Rx.
+ *
+ * @param port_id
+ *   The port identifier of the device.
+ * @param[out] ptypes
+ *   An array pointer to store supported protocol headers, allocated by caller.
+ *   These ptypes are composed with RTE_PTYPE_*.
+ * @param num
+ *   Size of the array pointed by param ptypes.
+ * @return
+ *   - (>=0) Number of supported ptypes. If the number of types exceeds num,
+ *   only num entries will be filled into the ptypes array, but the 
full
+ *   count of supported ptypes will be returned.
+ *   - (-ENOTSUP) if header protocol is not supported by device.
+ *   - (-ENODEV) if *port_id* invalid.
+ *   - (-EINVAL) if bad parameter.
+ */
+__rte_experimental
+int rte_eth_buffer_split_get_supported_hdr_ptypes(uint16_t port_id, uint32_t 
*ptypes, int num);
+
 #ifdef __

[PATCH v3 2/4] ethdev: introduce protocol hdr based buffer split

2022-09-02 Thread Yuan Wang
Currently, Rx buffer split supports length based split. With Rx queue
offload RTE_ETH_RX_OFFLOAD_BUFFER_SPLIT enabled and Rx packet segment
configured, PMD will be able to split the received packets into
multiple segments.

However, length based buffer split is not suitable for NICs that do split
based on protocol headers. Given an arbitrarily variable length in Rx
packet segment, it is almost impossible to pass a fixed protocol header to
driver. Besides, the existence of tunneling results in the composition of
a packet is various, which makes the situation even worse.

This patch extends the current buffer split to support protocol header
based buffer split. A new proto_hdr field is introduced in the reserved
field of the rte_eth_rxseg_split structure to specify the protocol header.
The proto_hdr field defines the split position of a packet; splitting
always happens after the protocol header defined in the Rx packet segment.
When the Rx queue offload RTE_ETH_RX_OFFLOAD_BUFFER_SPLIT is enabled and
the corresponding protocol header is configured, the driver will split the
ingress packets into multiple segments.

struct rte_eth_rxseg_split {
struct rte_mempool *mp; /* memory pools to allocate segment from */
uint16_t length; /* segment maximal data length,
configures split point */
uint16_t offset; /* data offset from beginning
of mbuf data buffer */
uint32_t proto_hdr; /* supported ptype of a specific pmd,
   configures split point.
   It should be defined by RTE_PTYPE_*
 */
};

If protocol header split is supported by a PMD, the
rte_eth_buffer_split_get_supported_hdr_ptypes function can
be used to obtain a list of these protocol headers.
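
As a rough usage sketch (not part of this patch; the helper name is
invented and error handling is minimal), an application could query the
list in two calls, first to size the array and then to fill it:

#include <errno.h>
#include <stdlib.h>
#include <rte_ethdev.h>

/* Hypothetical helper: fetch the full list of header ptypes a port can
 * split on. Returns the count (or a negative error) and stores a
 * malloc'd array in *out, which the caller frees. */
static int
get_split_ptypes(uint16_t port_id, uint32_t **out)
{
	int num;

	/* a NULL array with num == 0 only returns the count */
	num = rte_eth_buffer_split_get_supported_hdr_ptypes(port_id, NULL, 0);
	if (num <= 0)
		return num;
	*out = malloc(sizeof(uint32_t) * num);
	if (*out == NULL)
		return -ENOMEM;
	/* the second call fills the array */
	return rte_eth_buffer_split_get_supported_hdr_ptypes(port_id, *out, num);
}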

For example, let's suppose we configured the Rx queue with the
following segments:
seg0 - pool0, proto_hdr0=RTE_PTYPE_L3_IPV4, off0=2B
seg1 - pool1, proto_hdr1=RTE_PTYPE_L4_UDP, off1=128B
seg2 - pool2, off1=0B

The packet consists of MAC_IPV4_UDP_PAYLOAD will be split like
following:
seg0 - ipv4 header @ RTE_PKTMBUF_HEADROOM + 2 in mbuf from pool0
seg1 - udp header @ 128 in mbuf from pool1
seg2 - payload @ 0 in mbuf from pool2

Note: the NIC will only do the split when the packets exactly match all
the protocol headers in the segments. For example, if ARP packets are
received with the above config, the NIC won't split them, since they
contain neither an ipv4 header nor a udp header.

Now buffer split can be configured in two modes. For length-based
buffer split, the mp, length, and offset fields in the Rx packet segment
should be configured, while the proto_hdr field is ignored.
For protocol header based buffer split, the mp, offset, and proto_hdr
fields in the Rx packet segment should be configured, while the length
field is ignored.
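
To make the protocol mode concrete, a minimal configuration sketch for the
three-segment example above might look as follows (pool0/pool1/pool2 and
the usual port initialization are assumed; this is not code from the
patch):

union rte_eth_rxseg rx_seg[3] = {
	{ .split = { .mp = pool0, .proto_hdr = RTE_PTYPE_L3_IPV4, .offset = 2 } },
	{ .split = { .mp = pool1, .proto_hdr = RTE_PTYPE_L4_UDP, .offset = 128 } },
	{ .split = { .mp = pool2, .offset = 0 } }, /* tail segment: payload */
};
struct rte_eth_rxconf rxconf = {
	.offloads = RTE_ETH_RX_OFFLOAD_BUFFER_SPLIT,
	.rx_seg = rx_seg,
	.rx_nseg = RTE_DIM(rx_seg),
};

/* passing a NULL mempool selects the multi-segment (rx_seg) path */
ret = rte_eth_rx_queue_setup(port_id, 0, nb_rxd, socket_id, &rxconf, NULL);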

The split limitations imposed by the underlying driver are reported in the
rte_eth_dev_info->rx_seg_capa field. The memory attributes for the split
parts may differ as well, e.g. DPDK memory and external memory,
respectively.

Signed-off-by: Yuan Wang 
Signed-off-by: Xuan Ding 
Signed-off-by: Wenxuan Wu 
---
 doc/guides/rel_notes/release_22_11.rst |  5 +++
 lib/ethdev/rte_ethdev.c| 55 --
 lib/ethdev/rte_ethdev.h| 17 +++-
 3 files changed, 65 insertions(+), 12 deletions(-)

diff --git a/doc/guides/rel_notes/release_22_11.rst 
b/doc/guides/rel_notes/release_22_11.rst
index 4d90514a9a..f3b58c7895 100644
--- a/doc/guides/rel_notes/release_22_11.rst
+++ b/doc/guides/rel_notes/release_22_11.rst
@@ -60,6 +60,11 @@ New Features
   Added ``rte_eth_buffer_split_get_supported_hdr_ptypes()``, to get supported
   header protocols of a PMD to split.
 
+* **Added protocol header based buffer split.**
+  Ethdev: The ``reserved`` field in the  ``rte_eth_rxseg_split`` structure is
+  replaced with ``proto_hdr`` to support protocol header based buffer split.
+  User can choose length or protocol header to configure buffer split
+  according to NIC's capability.
 
 Removed Items
 -
diff --git a/lib/ethdev/rte_ethdev.c b/lib/ethdev/rte_ethdev.c
index 093c577add..dfceb723ee 100644
--- a/lib/ethdev/rte_ethdev.c
+++ b/lib/ethdev/rte_ethdev.c
@@ -1635,9 +1635,10 @@ rte_eth_dev_is_removed(uint16_t port_id)
 }
 
 static int
-rte_eth_rx_queue_check_split(const struct rte_eth_rxseg_split *rx_seg,
-uint16_t n_seg, uint32_t *mbp_buf_size,
-const struct rte_eth_dev_info *dev_info)
+rte_eth_rx_queue_check_split(uint16_t port_id,
+   const struct rte_eth_rxseg_split *rx_seg,
+   uint16_t n_seg, uint32_t *mbp_buf_size,
+   const struct rte_eth_dev_info *dev_info)
 {
const struct rte_eth_rxseg_capa *seg_capa = &dev_info->rx_seg_capa;
struct rte_mempool *mp_first;
@@ -1660,6 +1661,7 @@ rte_eth_rx_queue_ch

[PATCH v3 3/4] app/testpmd: add rxhdrs commands and parameters

2022-09-02 Thread Yuan Wang
Add command line parameter:
--rxhdrs=mac,[ipv4,udp]

Set the protocol_hdr of segments to scatter packets on receiving if
the split feature is engaged, on the queues configured with the
BUFFER_SPLIT flag.

Add interactive mode command:
testpmd>set rxhdrs mac,ipv4,tcp,udp,sctp
(protocol sequence should be valid)

The protocol split feature is off by default. To enable protocol split,
you need:
1. Start testpmd with multiple mempools. E.g. --mbuf-size=2048,2048
2. Configure Rx queue with rx_offload buffer split on.
3. Set the protocol type of buffer split. E.g. set rxhdrs mac,ipv4
(default protocols of testpmd : mac|icmp|ipv4|ipv6|tcp|udp|
  sctp|inner_mac|inner_ipv4|inner_ipv6|
  inner_tcp|inner_udp|inner_sctp)
The above protocols can be configured in testpmd, but the configuration
can only be applied when it is supported by the specific PMD.

Signed-off-by: Yuan Wang 
Signed-off-by: Xuan Ding 
Signed-off-by: Wenxuan Wu 
---
 app/test-pmd/cmdline.c| 123 +-
 app/test-pmd/config.c |  70 ++
 app/test-pmd/parameters.c |  16 -
 app/test-pmd/testpmd.c|   2 +
 app/test-pmd/testpmd.h|   6 ++
 5 files changed, 213 insertions(+), 4 deletions(-)

diff --git a/app/test-pmd/cmdline.c b/app/test-pmd/cmdline.c
index b4fe9dfb17..f00b7bc6a4 100644
--- a/app/test-pmd/cmdline.c
+++ b/app/test-pmd/cmdline.c
@@ -183,7 +183,7 @@ static void cmd_help_long_parsed(void *parsed_result,
"show (rxq|txq) info (port_id) (queue_id)\n"
"Display information for configured RX/TX 
queue.\n\n"
 
-   "show config (rxtx|cores|fwd|rxoffs|rxpkts|txpkts)\n"
+   "show config 
(rxtx|cores|fwd|rxoffs|rxpkts|rxhdrs|txpkts)\n"
"Display the given configuration.\n\n"
 
"read rxd (port_id) (queue_id) (rxd_id)\n"
@@ -307,6 +307,14 @@ static void cmd_help_long_parsed(void *parsed_result,
" Affects only the queues configured with split"
" offloads.\n\n"
 
+   "set rxhdrs (mac[,ipv4])*\n"
+   "   Set the protocol hdr of each segment to scatter"
+   " packets on receiving if split feature is engaged."
+   " Affects only the queues configured with split"
+   " offloads.\n\n"
+   "   Supported proto header: 
mac|ipv4|ipv6|tcp|udp|sctp|"
+   
"inner_mac|inner_ipv4|inner_ipv6|inner_tcp|inner_udp|inner_sctp\n"
+
"set txpkts (x[,y]*)\n"
"Set the length of each segment of TXONLY"
" and optionally CSUM packets.\n\n"
@@ -3456,6 +3464,68 @@ static cmdline_parse_inst_t cmd_stop = {
},
 };
 
+static unsigned int
+get_ptype(char *value)
+{
+   uint32_t protocol;
+
+   if (!strcmp(value, "mac"))
+   protocol = RTE_PTYPE_L2_ETHER;
+   else if (!strcmp(value, "ipv4"))
+   protocol = RTE_PTYPE_L3_IPV4;
+   else if (!strcmp(value, "ipv6"))
+   protocol = RTE_PTYPE_L3_IPV6;
+   else if (!strcmp(value, "tcp"))
+   protocol = RTE_PTYPE_L4_TCP;
+   else if (!strcmp(value, "udp"))
+   protocol = RTE_PTYPE_L4_UDP;
+   else if (!strcmp(value, "sctp"))
+   protocol = RTE_PTYPE_L4_SCTP;
+   else if (!strcmp(value, "inner_mac"))
+   protocol = RTE_PTYPE_INNER_L2_ETHER;
+   else if (!strcmp(value, "inner_ipv4"))
+   protocol = RTE_PTYPE_INNER_L3_IPV4;
+   else if (!strcmp(value, "inner_ipv6"))
+   protocol = RTE_PTYPE_INNER_L3_IPV6;
+   else if (!strcmp(value, "inner_tcp"))
+   protocol = RTE_PTYPE_INNER_L4_TCP;
+   else if (!strcmp(value, "inner_udp"))
+   protocol = RTE_PTYPE_INNER_L4_UDP;
+   else if (!strcmp(value, "inner_sctp"))
+   protocol = RTE_PTYPE_INNER_L4_SCTP;
+   else {
+   fprintf(stderr, "Unsupported protocol: %s\n", value);
+   protocol = RTE_PTYPE_UNKNOWN;
+   }
+
+   return protocol;
+}
+/* *** SET RXHDRSLIST *** */
+
+unsigned int
+parse_hdrs_list(const char *str, const char *item_name, unsigned int max_items,
+   unsigned int *parsed_items, int 
check_hdrs_sequence)
+{
+   unsigned int nb_item;
+   char *cur;
+   char *tmp;
+
+   nb_item = 0;
+   char *str2 = strdup(str);
+   cur = strtok_r(str2, ",", &tmp);
+   while (cur != NULL) {
+   parsed_items[nb_item] = get_ptype(cur);
+   cur = strtok_r(NULL, ",", &tmp);
+   nb_item++;
+   }
+   if (nb_item > max_items)
+   fprintf(stderr, "Number of %s = %u > %u (maximum items)\n",
+   item_name, nb_item + 1, max_items);
+   free(str2);

[PATCH v3 4/4] net/ice: support buffer split in Rx path

2022-09-02 Thread Yuan Wang
This patch adds support for protocol-based buffer split in the normal Rx
data paths. When the Rx queue is configured with a specific protocol type,
received packets will be directly split into protocol header and payload
parts, within the limitations of the ice PMD, and the two parts will be
put into different mempools.

Currently, protocol based buffer split is not supported in vectorized
paths.

A new API, ice_buffer_split_supported_hdr_ptypes_get(), has been
introduced; it returns the header protocols the ice PMD supports
splitting on to the application.

Signed-off-by: Yuan Wang 
Signed-off-by: Xuan Ding 
Signed-off-by: Wenxuan Wu 
Reviewed-by: Qi Zhang 
---
 doc/guides/nics/features/ice.ini   |   1 +
 doc/guides/rel_notes/release_22_11.rst |   4 +
 drivers/net/ice/ice_ethdev.c   |  30 +++-
 drivers/net/ice/ice_rxtx.c | 220 +
 drivers/net/ice/ice_rxtx.h |  16 ++
 drivers/net/ice/ice_rxtx_vec_common.h  |   3 +
 6 files changed, 242 insertions(+), 32 deletions(-)

diff --git a/doc/guides/nics/features/ice.ini b/doc/guides/nics/features/ice.ini
index 7861790a51..bf978ab7f5 100644
--- a/doc/guides/nics/features/ice.ini
+++ b/doc/guides/nics/features/ice.ini
@@ -7,6 +7,7 @@
 ; is selected.
 ;
 [Features]
+Buffer split = P
 Speed capabilities   = Y
 Link status  = Y
 Link status event= Y
diff --git a/doc/guides/rel_notes/release_22_11.rst 
b/doc/guides/rel_notes/release_22_11.rst
index f3b58c7895..99af35714d 100644
--- a/doc/guides/rel_notes/release_22_11.rst
+++ b/doc/guides/rel_notes/release_22_11.rst
@@ -66,6 +66,10 @@ New Features
   User can choose length or protocol header to configure buffer split
   according to NIC's capability.
 
+* **Updated Intel ice driver.**
+
+  Added protocol based buffer split support in scalar path.
+
 Removed Items
 -
 
diff --git a/drivers/net/ice/ice_ethdev.c b/drivers/net/ice/ice_ethdev.c
index b2300790ae..3e140439ff 100644
--- a/drivers/net/ice/ice_ethdev.c
+++ b/drivers/net/ice/ice_ethdev.c
@@ -169,6 +169,7 @@ static int ice_timesync_read_time(struct rte_eth_dev *dev,
 static int ice_timesync_write_time(struct rte_eth_dev *dev,
   const struct timespec *timestamp);
 static int ice_timesync_disable(struct rte_eth_dev *dev);
+static const uint32_t *ice_buffer_split_supported_hdr_ptypes_get(struct 
rte_eth_dev *dev);
 
 static const struct rte_pci_id pci_id_ice_map[] = {
{ RTE_PCI_DEVICE(ICE_INTEL_VENDOR_ID, ICE_DEV_ID_E823L_BACKPLANE) },
@@ -280,6 +281,7 @@ static const struct eth_dev_ops ice_eth_dev_ops = {
.timesync_write_time  = ice_timesync_write_time,
.timesync_disable = ice_timesync_disable,
.tm_ops_get   = ice_tm_ops_get,
+   .buffer_split_supported_hdr_ptypes_get = 
ice_buffer_split_supported_hdr_ptypes_get,
 };
 
 /* store statistics names and its offset in stats structure */
@@ -3749,7 +3751,8 @@ ice_dev_info_get(struct rte_eth_dev *dev, struct 
rte_eth_dev_info *dev_info)
RTE_ETH_RX_OFFLOAD_OUTER_IPV4_CKSUM |
RTE_ETH_RX_OFFLOAD_VLAN_EXTEND |
RTE_ETH_RX_OFFLOAD_RSS_HASH |
-   RTE_ETH_RX_OFFLOAD_TIMESTAMP;
+   RTE_ETH_RX_OFFLOAD_TIMESTAMP |
+   RTE_ETH_RX_OFFLOAD_BUFFER_SPLIT;
dev_info->tx_offload_capa |=
RTE_ETH_TX_OFFLOAD_QINQ_INSERT |
RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
@@ -3761,7 +3764,7 @@ ice_dev_info_get(struct rte_eth_dev *dev, struct 
rte_eth_dev_info *dev_info)
dev_info->flow_type_rss_offloads |= ICE_RSS_OFFLOAD_ALL;
}
 
-   dev_info->rx_queue_offload_capa = 0;
+   dev_info->rx_queue_offload_capa = RTE_ETH_RX_OFFLOAD_BUFFER_SPLIT;
dev_info->tx_queue_offload_capa = RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
 
dev_info->reta_size = pf->hash_lut_size;
@@ -3830,6 +3833,11 @@ ice_dev_info_get(struct rte_eth_dev *dev, struct 
rte_eth_dev_info *dev_info)
dev_info->default_rxportconf.ring_size = ICE_BUF_SIZE_MIN;
dev_info->default_txportconf.ring_size = ICE_BUF_SIZE_MIN;
 
+   dev_info->rx_seg_capa.max_nseg = ICE_RX_MAX_NSEG;
+   dev_info->rx_seg_capa.multi_pools = 1;
+   dev_info->rx_seg_capa.offset_allowed = 0;
+   dev_info->rx_seg_capa.offset_align_log2 = 0;
+
return 0;
 }
 
@@ -5886,6 +5894,24 @@ ice_timesync_disable(struct rte_eth_dev *dev)
return 0;
 }
 
+static const uint32_t *
+ice_buffer_split_supported_hdr_ptypes_get(struct rte_eth_dev *dev __rte_unused)
+{
+/* Buffer split protocol header capability. */
+   static const uint32_t ptypes[] = {
+   RTE_PTYPE_L2_ETHER,
+   RTE_PTYPE_INNER_L2_ETHER,
+   RTE_PTYPE_INNER_L3_IPV4,
+   RTE_PTYPE_INNER_L3_IPV6,
+   RTE_PTYPE_INNER_L4_TCP,
+   RTE_PTYPE_INNER_L4_UDP,
+   RTE_PTYPE_INNER_

RE: [PATCH] vhost: compilation fix for GCC-12

2022-09-02 Thread Amit Prakash Shukla
Thanks Ruifeng for the code review and feedback. Please find my response inline.

> -Original Message-
> From: Ruifeng Wang 
> Sent: Friday, September 2, 2022 12:54 PM
> To: Amit Prakash Shukla ; Maxime Coquelin
> ; Chenbo Xia 
> Cc: dev@dpdk.org; Jerin Jacob Kollanukkaran ;
> sta...@dpdk.org; nd 
> Subject: [EXT] RE: [PATCH] vhost: compilation fix for GCC-12
> 
> External Email
> 
> --
> > -Original Message-
> > From: Amit Prakash Shukla 
> > Sent: Thursday, September 1, 2022 4:50 PM
> > To: Maxime Coquelin ; Chenbo Xia
> > 
> > Cc: dev@dpdk.org; jer...@marvell.com; sta...@dpdk.org; Amit Prakash
> > Shukla 
> > Subject: [PATCH] vhost: compilation fix for GCC-12
> >
> > ../lib/vhost/virtio_net.c:941:35: error:
> > 'buf_vec[0].buf_len' may be used uninitialized
> > [-Werror=maybe-uninitialized]
> >   941 | buf_len = buf_vec[vec_idx].buf_len;
> >   |   ^~~~
> > ../lib/vhost/virtio_net.c: In function 'virtio_dev_rx_packed':
> > ../lib/vhost/virtio_net.c:1285:27: note: 'buf_vec' declared here
> >  1285 | struct buf_vector buf_vec[BUF_VECTOR_MAX];
> >   |   ^~~
> > cc1: all warnings being treated as errors
> >
> > Fixes: 93520085efda ("vhost: add packed ring single enqueue")
> > Cc: sta...@dpdk.org
> >
> > Signed-off-by: Amit Prakash Shukla 
> > ---
> >  lib/vhost/virtio_net.c | 12 
> >  1 file changed, 12 insertions(+)
> >
> > diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c index
> > b3d954aab4..0220bc923c
> > 100644
> > --- a/lib/vhost/virtio_net.c
> > +++ b/lib/vhost/virtio_net.c
> > @@ -1069,6 +1069,12 @@ vhost_enqueue_single_packed(struct virtio_net
> *dev,
> > else
> > max_tries = 1;
> >
> > +   /* To avoid GCC-12 warning.
> > +* GCC-12 is not evaluating sizeof at compile time.
> Is this a compiler behavior change against previous versions?
> I tried to find some clue from gcc-12 doc but got nothing. Can you point me to
> any material?
Apologies for the wrong wording in the comment. What I meant is that the
point at which sizeof gets evaluated during compilation seems to have
changed; I am not sure about that, though.
I too could not find documentation regarding it.

> 
> > +*/
> > +   if (unlikely(size == 0))
> > +   return -1;
> > +
> > while (size > 0) {
> Change 'while(){}' to 'do{}while()' can be a simpler solution. What do you
> think?
I agree, the solution you suggested is better than the one in the patch.
I will make the suggested changes as part of v2. Thanks.

> 
> Thanks.
> 
> > /*
> >  * if we tried all available ring items, and still @@ -1574,6
> > +1580,12 @@ vhost_enqueue_async_packed(struct virtio_net *dev,
> > else
> > max_tries = 1;
> >
> > +   /* To avoid GCC-12 warning.
> > +* GCC-12 is not evaluating sizeof at compile time.
> > +*/
> > +   if (unlikely(size == 0))
> > +   return -1;
> > +
> > while (size > 0) {
> > /*
> >  * if we tried all available ring items, and still
> > --
> > 2.25.1



[PATCH v2] vhost: compilation fix for GCC-12

2022-09-02 Thread Amit Prakash Shukla
../lib/vhost/virtio_net.c:941:35: error:
'buf_vec[0].buf_len' may be used uninitialized
[-Werror=maybe-uninitialized]
  941 | buf_len = buf_vec[vec_idx].buf_len;
  |   ^~~~
../lib/vhost/virtio_net.c: In function 'virtio_dev_rx_packed':
../lib/vhost/virtio_net.c:1285:27: note: 'buf_vec' declared here
 1285 | struct buf_vector buf_vec[BUF_VECTOR_MAX];
  |   ^~~
cc1: all warnings being treated as errors

Fixes: 93520085efda ("vhost: add packed ring single enqueue")
Cc: sta...@dpdk.org

Signed-off-by: Amit Prakash Shukla 
---
v2:
- Changes for code review suggestion

 lib/vhost/virtio_net.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index b3d954aab4..9b77d3d10f 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -1069,7 +1069,7 @@ vhost_enqueue_single_packed(struct virtio_net *dev,
else
max_tries = 1;
 
-   while (size > 0) {
+   do {
/*
 * if we tried all available ring items, and still
 * can't get enough buf, it means something abnormal
@@ -1097,7 +1097,7 @@ vhost_enqueue_single_packed(struct virtio_net *dev,
avail_idx += desc_count;
if (avail_idx >= vq->size)
avail_idx -= vq->size;
-   }
+   } while (size > 0);
 
if (mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers, false) < 0)
return -1;
@@ -1574,7 +1574,7 @@ vhost_enqueue_async_packed(struct virtio_net *dev,
else
max_tries = 1;
 
-   while (size > 0) {
+   do {
/*
 * if we tried all available ring items, and still
 * can't get enough buf, it means something abnormal
@@ -1601,7 +1601,7 @@ vhost_enqueue_async_packed(struct virtio_net *dev,
avail_idx += desc_count;
if (avail_idx >= vq->size)
avail_idx -= vq->size;
-   }
+   } while (size > 0);
 
if (unlikely(mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers, 
true) < 0))
return -1;
-- 
2.25.1



Marvell DPDK v22.11 Roadmap

2022-09-02 Thread Jerin Jacob Kollanukkaran
Marvell DPDK v22.11 Roadmap
~~~

1) New device class:

a) Machine learning inference device library
http://patches.dpdk.org/project/dpdk/patch/20220803132839.2747858-2-jer...@marvell.com/


2) New library:
===
a) Add lib/pdcp library to enable PDCP protocol support similar to lib/ipsec/ 
library for enabling IPsec protocol support. 

 
3) ethdev:
==
a) support congestion management
http://patches.dpdk.org/project/dpdk/patch/20220713130340.2886839-1-jer...@marvell.com/

b) add IPsec SA expiry event subtypes
http://patches.dpdk.org/project/dpdk/patch/20220416192530.173895-8-gak...@marvell.com/

c) add trace points
http://patches.dpdk.org/project/dpdk/patch/20220804134430.6192-2-adwiv...@marvell.com/

d) introduce pool sort capability
https://patches.dpdk.org/project/dpdk/patch/20220902070047.2812906-1-hpoth...@marvell.com/

e) add protocol param to color table update
https://patches.dpdk.org/project/dpdk/patch/20220823102255.2905191-1-sk...@marvell.com/


4)security:
===
a) support MACsec
http://patches.dpdk.org/project/dpdk/patch/20220814184620.512343-3-gak...@marvell.com/


5) mbuf:

a) free up additional 8B mbuf space in first cacheline for IOVA mode as VA 
builds
https://patchwork.dpdk.org/project/dpdk/patch/20220829151626.2101336-2-sthot...@marvell.com/


b) mbuf: clarify meta data needed for Outbound Inline
https://patchwork.dpdk.org/project/dpdk/patch/20220822143812.30010-1-ndabilpu...@marvell.com/


6) eventdev:

a) add element offset to event vector
https://patchwork.dpdk.org/project/dpdk/patch/20220816154932.10168-1-pbhagavat...@marvell.com/

b) add weight and affinity attributes to queue conf
https://patchwork.dpdk.org/project/dpdk/patch/dcd3cf0ec034632f97223bb9df389f9cedf9753c.1660116951.git.sthot...@marvell.com/

c) introduce event cryptodev vector type
https://patches.dpdk.org/project/dpdk/patch/20220804095907.97895-2-vfia...@marvell.com/


7) example applications:

a) examples/fips_validation: enhancements
http://patches.dpdk.org/project/dpdk/list/?submitter=2301

b) examples/l3fwd: enhancements
https://patchwork.dpdk.org/project/dpdk/patch/20220829094442.3422-1-pbhagavat...@marvell.com/

c) examples/ipsec-secgw: support for per SA HW reassembly
https://patchwork.dpdk.org/project/dpdk/patch/20220829151233.2515424-1-rbhans...@marvell.com/

d) examples/ipsec-secgw: add event crypto adapter
https://patches.dpdk.org/project/dpdk/patch/20220804103626.102688-2-vfia...@marvell.com/

e) app/test-security-perf: add security perf app
http://patches.dpdk.org/project/dpdk/patch/20220811035933.802-2-ano...@marvell.com/


8) Compiler support

a) Additional GCC-12 compilation fixes
https://patches.dpdk.org/project/dpdk/list/?submitter=Amit+Prakash+Shukla


Re: [PATCH v12 1/6] eventdev/eth_rx: add adapter instance get API

2022-09-02 Thread Jerin Jacob
On Mon, Aug 29, 2022 at 1:44 PM Ganapati Kundapura
 wrote:
>
> Added rte_event_eth_rx_adapter_instance_get() to get
> adapter instance id for specified ethernet device id and
> rx queue index.
>
> Added rte_event_eth_rx_adapter_instance_get() details in
> prog_guide/event_ethernet_rx_adapter.rst
>
> Signed-off-by: Ganapati Kundapura 

No need for space between these lines.
Will fix on apply.

>
> Reviewed-by: Naga Harish K S V 
> Acked-by: Jay Jayatheerthan 

Acked-by: Jerin Jacob 

>
> ---
> V12:
> * Squashed 1/7 and 6/7
> * Squashed 4/7 and 7/7
> * Updated 22.11.rst for the new APIs
>
> v11:
> * added instance_get under 22.11 in version.map
>
> v10:
> * Add Review and Ack to series
>
> v9:
> * Corrected rte_event_eth_tx_adapter_instanceget to
> * rte_event_eth_tx_adapter_instance_get in event_ethernet_tx_adapter.rst
>
> v8:
> * Removed limits.h inclusion
>
> v7:
> * Remove allocation of instance array and storage of instnace id
> * in instance array
> * Use Rx adapter instance data to query instance id for specified
> * eth_dev_id and rx_queue_id
>
> v6:
> * rx adapter changes removed from patch4 and moved to patch1
>
> v5:
> * patch is split into saperate patches
>
> v4:
> * Moved instance array allocation and instance id storage
>   before adapter's nb_queue updation for handling the
>   error case  properly
>
> v3:
> * Fixed checkpatch error
>
> v2:
> * Fixed build issues
> * Added telemetry support for rte_event_eth_rx_adapter_instance_get
> * arranged functions in alphabetical order in version.map
>
> diff --git a/doc/guides/prog_guide/event_ethernet_rx_adapter.rst 
> b/doc/guides/prog_guide/event_ethernet_rx_adapter.rst
> index 3b4ef50..5b9d0cf 100644
> --- a/doc/guides/prog_guide/event_ethernet_rx_adapter.rst
> +++ b/doc/guides/prog_guide/event_ethernet_rx_adapter.rst
> @@ -177,6 +177,12 @@ used otherwise it returns -EINVAL.
>  The ``rte_event_eth_rx_adapter_queue_stats_reset`` function can be used to
>  reset queue level stats when queue level event buffer is in use.
>
> +Getting Adapter instance id
> +~~~
> +
> +The ``rte_event_eth_rx_adapter_instance_get()`` function reports
> +rx adapter instance id for a specified ethernet device id and rx queue index.
> +
>  Interrupt Based Rx Queues
>  ~~
>
> diff --git a/lib/eventdev/eventdev_pmd.h b/lib/eventdev/eventdev_pmd.h
> index 6940266..c58ba05 100644
> --- a/lib/eventdev/eventdev_pmd.h
> +++ b/lib/eventdev/eventdev_pmd.h
> @@ -888,6 +888,26 @@ typedef int 
> (*eventdev_eth_rx_adapter_vector_limits_get_t)(
> const struct rte_eventdev *dev, const struct rte_eth_dev *eth_dev,
> struct rte_event_eth_rx_adapter_vector_limits *limits);
>
> +/**
> + * Get Rx adapter instance id for Rx queue
> + *
> + * @param eth_dev_id
> + *  Port identifier of ethernet device
> + *
> + * @param rx_queue_id
> + *  Ethernet device Rx queue index
> + *
> + * @param[out] rxa_inst_id
> + *  Pointer to Rx adapter instance identifier.
> + *  Contains valid Rx adapter instance id when return value is 0
> + *
> + * @return
> + *   -  0: Success
> + *   - <0: Error code on failure
> + */
> +typedef int (*eventdev_eth_rx_adapter_instance_get_t)
> +   (uint16_t eth_dev_id, uint16_t rx_queue_id, uint8_t *rxa_inst_id);
> +
>  typedef uint32_t rte_event_pmd_selftest_seqn_t;
>  extern int rte_event_pmd_selftest_seqn_dynfield_offset;
>
> @@ -1321,6 +1341,8 @@ struct eventdev_ops {
> eventdev_eth_rx_adapter_vector_limits_get_t
> eth_rx_adapter_vector_limits_get;
> /**< Get event vector limits for the Rx adapter */
> +   eventdev_eth_rx_adapter_instance_get_t eth_rx_adapter_instance_get;
> +   /**< Get Rx adapter instance id for Rx queue */
>
> eventdev_timer_adapter_caps_get_t timer_adapter_caps_get;
> /**< Get timer adapter capabilities */
> diff --git a/lib/eventdev/rte_event_eth_rx_adapter.c 
> b/lib/eventdev/rte_event_eth_rx_adapter.c
> index bf8741d..ababe13 100644
> --- a/lib/eventdev/rte_event_eth_rx_adapter.c
> +++ b/lib/eventdev/rte_event_eth_rx_adapter.c
> @@ -1415,15 +1415,13 @@ rxa_service_func(void *args)
> return 0;
>  }
>
> -static int
> -rte_event_eth_rx_adapter_init(void)
> +static void *
> +rxa_memzone_array_get(const char *name, unsigned int elt_size, int nb_elems)
>  {
> -   const char *name = RXA_ADAPTER_ARRAY;
> const struct rte_memzone *mz;
> unsigned int sz;
>
> -   sz = sizeof(*event_eth_rx_adapter) *
> -   RTE_EVENT_ETH_RX_ADAPTER_MAX_INSTANCE;
> +   sz = elt_size * nb_elems;
> sz = RTE_ALIGN(sz, RTE_CACHE_LINE_SIZE);
>
> mz = rte_memzone_lookup(name);
> @@ -1431,13 +1429,34 @@ rte_event_eth_rx_adapter_init(void)
> mz = rte_memzone_reserve_aligned(name, sz, rte_socket_id(), 0,
>  RTE_CACHE_LINE_SIZE);
> if (mz == NULL) {
> -   RTE_EDEV_LOG_ERR("failed to reserve memzone err = %"
> - 

Re: [PATCH v12 4/6] eventdev/eth_tx: add instance get API

2022-09-02 Thread Jerin Jacob
On Mon, Aug 29, 2022 at 1:45 PM Ganapati Kundapura
 wrote:
>
> Added rte_event_eth_tx_adapter_instance_get() to get the
> adapter instance id for specified ethernet device id and
> tx queue index.
>
> Added rte_event_eth_tx_adapter_instance_get() details in
> prog_guide/event_ethernet_tx_adapter.rst
>
> Signed-off-by: Ganapati Kundapura 
>
> Reviewed-by: Naga Harish K S V 
> Acked-by: Jay Jayatheerthan 


Acked-by: Jerin Jacob 

>
> diff --git a/doc/guides/prog_guide/event_ethernet_tx_adapter.rst 
> b/doc/guides/prog_guide/event_ethernet_tx_adapter.rst
> index f80d226..4da9bcd 100644
> --- a/doc/guides/prog_guide/event_ethernet_tx_adapter.rst
> +++ b/doc/guides/prog_guide/event_ethernet_tx_adapter.rst
> @@ -165,6 +165,12 @@ in struct ``rte_event_eth_tx_adapter_stats``. The 
> counter values are the sum of
>  the counts from the eventdev PMD callback if the callback is supported, and
>  the counts maintained by the service function, if one exists.
>
> +Getting Adapter instance id
> +~~~
> +
> +The  ``rte_event_eth_tx_adapter_instance_get()`` function reports
> +tx adapter instance id for a specified ethernet device id and tx queue index.
> +
>  Tx event vectorization
>  ~~
>
> diff --git a/lib/eventdev/eventdev_pmd.h b/lib/eventdev/eventdev_pmd.h
> index c58ba05..f514a37 100644
> --- a/lib/eventdev/eventdev_pmd.h
> +++ b/lib/eventdev/eventdev_pmd.h
> @@ -1274,6 +1274,27 @@ typedef int (*eventdev_eth_tx_adapter_stats_get_t)(
>  typedef int (*eventdev_eth_tx_adapter_stats_reset_t)(uint8_t id,
> const struct rte_eventdev *dev);
>
> +/**
> + * Get TX adapter instance id for TX queue
> + *
> + * @param eth_dev_id
> + *  Port identifier of Ethernet device
> + *
> + * @param tx_queue_id
> + *  Ethernet device TX queue index
> + *
> + * @param[out] txa_inst_id
> + *  Pointer to TX adapter instance identifier
> + *  Contains valid Tx adapter instance id when return value is 0
> + *
> + * @return
> + *  -  0: Success
> + *  - <0: Error code on failure
> + */
> +typedef int (*eventdev_eth_tx_adapter_instance_get_t)
> +   (uint16_t eth_dev_id, uint16_t tx_queue_id, uint8_t *txa_inst_id);
> +
> +
>  /** Event device operations function pointer table */
>  struct eventdev_ops {
> eventdev_info_get_t dev_infos_get;  /**< Get device info. */
> @@ -1386,6 +1407,8 @@ struct eventdev_ops {
> /**< Get eth Tx adapter statistics */
> eventdev_eth_tx_adapter_stats_reset_t eth_tx_adapter_stats_reset;
> /**< Reset eth Tx adapter statistics */
> +   eventdev_eth_tx_adapter_instance_get_t eth_tx_adapter_instance_get;
> +   /**< Get Tx adapter instance id for Tx queue */
>
> eventdev_selftest dev_selftest;
> /**< Start eventdev Selftest */
> diff --git a/lib/eventdev/rte_event_eth_tx_adapter.c 
> b/lib/eventdev/rte_event_eth_tx_adapter.c
> index b4b37f1..aaef352 100644
> --- a/lib/eventdev/rte_event_eth_tx_adapter.c
> +++ b/lib/eventdev/rte_event_eth_tx_adapter.c
> @@ -18,6 +18,9 @@
>  #define TXA_INVALID_DEV_ID INT32_C(-1)
>  #define TXA_INVALID_SERVICE_ID INT64_C(-1)
>
> +#define TXA_ADAPTER_ARRAY "txa_adapter_array"
> +#define TXA_SERVICE_DATA_ARRAY "txa_service_data_array"
> +
>  #define txa_evdev(id) (&rte_eventdevs[txa_dev_id_array[(id)]])
>
>  #define txa_dev_caps_get(id) 
> txa_evdev((id))->dev_ops->eth_tx_adapter_caps_get
> @@ -41,6 +44,9 @@
>
>  #define txa_dev_stats_get(t) txa_evdev(t)->dev_ops->eth_tx_adapter_stats_get
>
> +#define txa_dev_instance_get(id) \
> +   txa_evdev(id)->dev_ops->eth_tx_adapter_instance_get
> +
>  #define RTE_EVENT_ETH_TX_ADAPTER_ID_VALID_OR_ERR_RET(id, retval) \
>  do { \
> if (!txa_valid_id(id)) { \
> @@ -194,12 +200,34 @@ txa_memzone_array_get(const char *name, unsigned int 
> elt_size, int nb_elems)
>  }
>
>  static int
> +txa_lookup(void)
> +{
> +   const struct rte_memzone *mz;
> +
> +   if (txa_dev_id_array == NULL) {
> +   mz = rte_memzone_lookup(TXA_ADAPTER_ARRAY);
> +   if (mz == NULL)
> +   return -ENOMEM;
> +   txa_dev_id_array = mz->addr;
> +   }
> +
> +   if (txa_service_data_array == NULL) {
> +   mz = rte_memzone_lookup(TXA_SERVICE_DATA_ARRAY);
> +   if (mz == NULL)
> +   return -ENOMEM;
> +   txa_service_data_array = mz->addr;
> +   }
> +
> +   return 0;
> +}
> +
> +static int
>  txa_dev_id_array_init(void)
>  {
> if (txa_dev_id_array == NULL) {
> int i;
>
> -   txa_dev_id_array = txa_memzone_array_get("txa_adapter_array",
> +   txa_dev_id_array = txa_memzone_array_get(TXA_ADAPTER_ARRAY,
> sizeof(int),
> 
> RTE_EVENT_ETH_TX_ADAPTER_MAX_INSTANCE);
> if (txa_dev_id_array == NULL)
> @@ -222,12 +250,18 @@ static int
>  txa_service_dat

Re: [PATCH v12 6/6] doc: added adapter instance get API

2022-09-02 Thread Jerin Jacob
On Mon, Aug 29, 2022 at 1:45 PM Ganapati Kundapura
 wrote:
>
> Added rx adapter instance get - rte_event_eth_rx_adapter_instance_get()
> and tx adapter instance get - rte_event_eth_tx_adapter_instance_get()
>
> Signed-off-by: Ganapati Kundapura 

Squashed this patch to the relevant API additions patch.

Series applied to dpdk-next-net-eventdev/for-main. Thanks



>
> diff --git a/doc/guides/rel_notes/release_22_11.rst 
> b/doc/guides/rel_notes/release_22_11.rst
> index 8c021cf..091cc84 100644
> --- a/doc/guides/rel_notes/release_22_11.rst
> +++ b/doc/guides/rel_notes/release_22_11.rst
> @@ -23,6 +23,14 @@ DPDK Release 22.11
>
>  New Features
>  
> +* **Added adapter instance get API**
> +
> +* Added ``rte_event_eth_rx_adapter_instance_get`` to get
> +  rx adapter instance id for specified ethernet device id and
> +  rx queue index.
> +* Added ``rte_event_eth_tx_adapter_instance_get`` to get the
> +  tx adapter instance id for specified ethernet device id and
> +  tx queue index.
>
>  .. This section should contain new features added in this release.
> Sample format:
> --
> 2.6.4
>


RE: TCP/IP stack recommendations

2022-09-02 Thread Morten Brørup
Hi Florin,

Ray Kinsella sent me in your direction...

I'm looking for recommendations (and warnings) regarding open source TCP/IP 
stacks for DPDK. Seeing that you are deeply involved in the FD.io VPP Host 
Stack, I hope you are willing to share some thoughts on the topic?


Med venlig hilsen / kind regards

Morten Brørup
CTO


SmartShare Systems A/S
Tonsbakken 16-18
DK-2740 Skovlunde
Denmark

Office  +45 70 20 00 93
Direct  +45 89 93 50 22
Mobile +45 25 40 82 12

m...@smartsharesystems.com
www.smartsharesystems.com

> -Original Message-
> From: Ray Kinsella [mailto:m...@ashroe.eu]
> Sent: Tuesday, 30 August 2022 11.45
> To: Morten Brørup
> Cc: dev@dpdk.org
> Subject: Re: TCP/IP stack recommendations
> 
> Hi Morten,
> 
> Reach out to Florin Coras over in VPP-land.
> 
> Morten Brørup  writes:
> 
> > Hi all.
> >
> > Can anyone in here recommend an actively maintained open source
> TCP/IP stack for DPDK?
> >
> >
> > Med venlig hilsen / Kind regards,
> > -Morten Brørup
> 
> 
> --
> Regards, Ray K



[PATCH v3] vhost: compilation fix for GCC-12

2022-09-02 Thread Amit Prakash Shukla
GCC-12 complains about the possible use of an uninitialized array. At
compile time it seems unable to evaluate the size, as it involves a
run-time variable, and it appears to assume the value of the "size"
variable to be zero, which makes GCC-12 skip the while loop.
"size = pkt->pkt_len + sizeof(struct virtio_net_hdr_mrg_rxbuf);"

As part of the fix, "while () {}" is replaced by "do {} while ()", which
makes the compiler generate code in which buf_vec is never used
uninitialized.

../lib/vhost/virtio_net.c:941:35: error:
'buf_vec[0].buf_len' may be used uninitialized
[-Werror=maybe-uninitialized]
  941 | buf_len = buf_vec[vec_idx].buf_len;
  |   ^~~~
../lib/vhost/virtio_net.c: In function 'virtio_dev_rx_packed':
../lib/vhost/virtio_net.c:1285:27: note: 'buf_vec' declared here
 1285 | struct buf_vector buf_vec[BUF_VECTOR_MAX];
  |   ^~~
cc1: all warnings being treated as errors
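
A reduced stand-alone illustration of the pattern (invented names, not the
vhost code): with "while (size > 0)" GCC-12 cannot prove the body
executes and may flag the read of vec[0], while the "do {} while" form
guarantees one iteration:

#include <stdint.h>

struct buf_vector { uint32_t buf_len; };

uint32_t
first_len(uint32_t size)
{
	struct buf_vector vec[8];
	uint32_t n = 0;

	do {
		/* body provably runs at least once, so vec[0] is written */
		vec[n].buf_len = size > 4096 ? 4096 : size;
		size -= vec[n].buf_len;
		n++;
	} while (size > 0 && n < 8);

	return vec[0].buf_len;
}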

Fixes: 93520085efda ("vhost: add packed ring single enqueue")
Cc: sta...@dpdk.org

Signed-off-by: Amit Prakash Shukla 
---
v2:
- Changes for code review suggestion

v3:
- Added a description

 lib/vhost/virtio_net.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/vhost/virtio_net.c b/lib/vhost/virtio_net.c
index b3d954aab4..9b77d3d10f 100644
--- a/lib/vhost/virtio_net.c
+++ b/lib/vhost/virtio_net.c
@@ -1069,7 +1069,7 @@ vhost_enqueue_single_packed(struct virtio_net *dev,
else
max_tries = 1;
 
-   while (size > 0) {
+   do {
/*
 * if we tried all available ring items, and still
 * can't get enough buf, it means something abnormal
@@ -1097,7 +1097,7 @@ vhost_enqueue_single_packed(struct virtio_net *dev,
avail_idx += desc_count;
if (avail_idx >= vq->size)
avail_idx -= vq->size;
-   }
+   } while (size > 0);
 
if (mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, num_buffers, false) < 0)
return -1;
@@ -1574,7 +1574,7 @@ vhost_enqueue_async_packed(struct virtio_net *dev,
else
max_tries = 1;
 
-   while (size > 0) {
+   do {
/*
 * if we tried all available ring items, and still
 * can't get enough buf, it means something abnormal
@@ -1601,7 +1601,7 @@ vhost_enqueue_async_packed(struct virtio_net *dev,
avail_idx += desc_count;
if (avail_idx >= vq->size)
avail_idx -= vq->size;
-   }
+   } while (size > 0);
 
if (unlikely(mbuf_to_desc(dev, vq, pkt, buf_vec, nr_vec, *nr_buffers, 
true) < 0))
return -1;
-- 
2.25.1



RE: [EXT] Re: [PATCH v2] vhost: compilation fix for GCC-12

2022-09-02 Thread Amit Prakash Shukla
Thanks for the feedback.

> -Original Message-
> From: Bagas Sanjaya 
> Sent: Friday, September 2, 2022 6:26 PM
> To: Amit Prakash Shukla 
> Cc: Maxime Coquelin ; Chenbo Xia
> ; dev@dpdk.org; Jerin Jacob Kollanukkaran
> ; sta...@dpdk.org; ruifeng.w...@arm.com
> Subject: [EXT] Re: [PATCH v2] vhost: compilation fix for GCC-12
> 
> External Email
> 
> --
> On Fri, Sep 02, 2022 at 06:11:52PM +0530, Amit Prakash Shukla wrote:
> > ../lib/vhost/virtio_net.c:941:35: error:
> > 'buf_vec[0].buf_len' may be used uninitialized
> > [-Werror=maybe-uninitialized]
> >   941 | buf_len = buf_vec[vec_idx].buf_len;
> >   |   ^~~~
> > ../lib/vhost/virtio_net.c: In function 'virtio_dev_rx_packed':
> > ../lib/vhost/virtio_net.c:1285:27: note: 'buf_vec' declared here
> >  1285 | struct buf_vector buf_vec[BUF_VECTOR_MAX];
> >   |   ^~~
> > cc1: all warnings being treated as errors
> >
> > Fixes: 93520085efda ("vhost: add packed ring single enqueue")
> > Cc: sta...@dpdk.org
> 
> Please describe what this patch is doing (the current state of code, why it
> errored, and how it is fixing the error). Write the description in imperative
> mood. I don't see the description other than error message above.
I have pushed v3 incorporating your suggestion regarding description.

> 
> Also, for stable patches submission, Cc sta...@vger.kernel.org.
> 
> Thanks.
> 
> --
> An old man doll... just what I always wanted! - Clara


[PATCH v5 0/3] Add lcore poll busyness telemetry

2022-09-02 Thread Kevin Laatz
Currently, there is no way to measure lcore polling busyness in a passive
way, without any modifications to the application. This patchset adds a new
EAL API that will be able to passively track core polling busyness. As part
of the set, new telemetry endpoints are added to read the generated metrics.

---
v5:
  * Fix Windows build
  * Make lcore_telemetry_free() an internal interface
  * Minor cleanup

v4:
  * Fix doc build
  * Rename timestamp macro to RTE_LCORE_POLL_BUSYNESS_TIMESTAMP
  * Make enable/disable read and write atomic
  * Change rte_lcore_poll_busyness_enabled_set() param to bool
  * Move mem alloc from enable/disable to init/cleanup
  * Other minor fixes

v3:
  * Fix missing renaming to poll busyness
  * Fix clang compilation
  * Fix arm compilation

v2:
  * Use rte_get_tsc_hz() to adjust the telemetry period
  * Rename to reflect polling busyness vs general busyness
  * Fix segfault when calling telemetry timestamp from an unregistered
non-EAL thread.
  * Minor cleanup

Anatoly Burakov (2):
  eal: add lcore poll busyness telemetry
  eal: add cpuset lcore telemetry entries

Kevin Laatz (1):
  doc: add howto guide for lcore poll busyness

 config/meson.build  |   1 +
 config/rte_config.h |   1 +
 doc/guides/howto/index.rst  |   1 +
 doc/guides/howto/lcore_poll_busyness.rst|  92 +
 lib/bbdev/rte_bbdev.h   |  17 +-
 lib/compressdev/rte_compressdev.c   |   2 +
 lib/cryptodev/rte_cryptodev.h   |   2 +
 lib/distributor/rte_distributor.c   |  21 +-
 lib/distributor/rte_distributor_single.c|  14 +-
 lib/dmadev/rte_dmadev.h |  15 +-
 lib/eal/common/eal_common_lcore_telemetry.c | 350 
 lib/eal/common/meson.build  |   1 +
 lib/eal/include/rte_lcore.h |  84 +
 lib/eal/linux/eal.c |   1 +
 lib/eal/meson.build |   3 +
 lib/eal/version.map |   7 +
 lib/ethdev/rte_ethdev.h |   2 +
 lib/eventdev/rte_eventdev.h |  10 +-
 lib/rawdev/rte_rawdev.c |   6 +-
 lib/regexdev/rte_regexdev.h |   5 +-
 lib/ring/rte_ring_elem_pvt.h|   1 +
 meson_options.txt   |   2 +
 22 files changed, 614 insertions(+), 24 deletions(-)
 create mode 100644 doc/guides/howto/lcore_poll_busyness.rst
 create mode 100644 lib/eal/common/eal_common_lcore_telemetry.c

-- 
2.31.1



[PATCH v5 1/3] eal: add lcore poll busyness telemetry

2022-09-02 Thread Kevin Laatz
From: Anatoly Burakov 

Currently, there is no way to measure lcore poll busyness in a passive way,
without any modifications to the application. This patch adds a new EAL API
that will be able to passively track core polling busyness.

The poll busyness is calculated by relying on the fact that most DPDK APIs
will poll for work (packets, completions, eventdev events, etc). Empty
polls can be counted as "idle", while non-empty polls can be counted as
busy. To measure lcore poll busyness, we simply call the telemetry
timestamping function with the number of polls a particular code section
has processed, and count the number of cycles we've spent processing empty
bursts. The more empty bursts we encounter, the less cycles we spend in
"busy" state, and the less core poll busyness will be reported.

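As an illustrative sketch of the mechanism (process(), quit, and the other
surrounding names are stand-ins, not code from the patch), this is the
kind of call the instrumented driver APIs make internally:

while (!quit) {
	uint16_t nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts, BURST_SIZE);

	/* nb_rx == 0 is accounted as idle cycles, nb_rx > 0 as busy */
	RTE_LCORE_POLL_BUSYNESS_TIMESTAMP(nb_rx);

	process(pkts, nb_rx);
}
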
In order for all of the above to work without modifications to the
application, the library code needs to be instrumented with calls to the
lcore telemetry busyness timestamping function. The following parts of DPDK
are instrumented with lcore poll busyness timestamping calls:

- All major driver APIs:
  - ethdev
  - cryptodev
  - compressdev
  - regexdev
  - bbdev
  - rawdev
  - eventdev
  - dmadev
- Some additional libraries:
  - ring
  - distributor

To avoid performance impact from having lcore telemetry support, a global
variable is exported by EAL, and a call to timestamping function is wrapped
into a macro, so that whenever telemetry is disabled, it only takes one
additional branch and no function calls are performed. It is disabled at
compile time by default.
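
The wrapping has roughly this shape (simplified; the exact macro body in
the patch may differ, and the timestamping function name below is assumed
from the description, while the exported global matches the patch):

/* global exported by EAL */
extern rte_atomic32_t __rte_lcore_telemetry_enabled;

#define RTE_LCORE_POLL_BUSYNESS_TIMESTAMP(nb_work) do {              \
	if (rte_atomic32_read(&__rte_lcore_telemetry_enabled))       \
		rte_lcore_poll_busyness_timestamp(nb_work);           \
} while (0)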

This patch also adds a telemetry endpoint to report lcore poll busyness, as
well as telemetry endpoints to enable/disable lcore telemetry. A
documentation entry has been added to the howto guides to explain the usage
of the new telemetry endpoints and API.

Signed-off-by: Kevin Laatz 
Signed-off-by: Conor Walsh 
Signed-off-by: David Hunt 
Signed-off-by: Anatoly Burakov 

---
v5:
  * Fix Windows build
  * Make lcore_telemetry_free() an internal interface
  * Minor cleanup

v4:
  * Fix doc build
  * Rename timestamp macro to RTE_LCORE_POLL_BUSYNESS_TIMESTAMP
  * Make enable/disable read and write atomic
  * Change rte_lcore_poll_busyness_enabled_set() param to bool
  * Move mem alloc from enable/disable to init/cleanup
  * Other minor fixes

v3:
  * Fix missed renaming to poll busyness
  * Fix clang compilation
  * Fix arm compilation

v2:
  * Use rte_get_tsc_hz() to adjust the telemetry period
  * Rename to reflect polling busyness vs general busyness
  * Fix segfault when calling telemetry timestamp from an unregistered
non-EAL thread.
  * Minor cleanup
---
 config/meson.build  |   1 +
 config/rte_config.h |   1 +
 lib/bbdev/rte_bbdev.h   |  17 +-
 lib/compressdev/rte_compressdev.c   |   2 +
 lib/cryptodev/rte_cryptodev.h   |   2 +
 lib/distributor/rte_distributor.c   |  21 +-
 lib/distributor/rte_distributor_single.c|  14 +-
 lib/dmadev/rte_dmadev.h |  15 +-
 lib/eal/common/eal_common_lcore_telemetry.c | 303 
 lib/eal/common/meson.build  |   1 +
 lib/eal/include/rte_lcore.h |  84 ++
 lib/eal/linux/eal.c |   1 +
 lib/eal/meson.build |   3 +
 lib/eal/version.map |   7 +
 lib/ethdev/rte_ethdev.h |   2 +
 lib/eventdev/rte_eventdev.h |  10 +-
 lib/rawdev/rte_rawdev.c |   6 +-
 lib/regexdev/rte_regexdev.h |   5 +-
 lib/ring/rte_ring_elem_pvt.h|   1 +
 meson_options.txt   |   2 +
 20 files changed, 474 insertions(+), 24 deletions(-)
 create mode 100644 lib/eal/common/eal_common_lcore_telemetry.c

diff --git a/config/meson.build b/config/meson.build
index 7f7b6c92fd..d5954a059c 100644
--- a/config/meson.build
+++ b/config/meson.build
@@ -297,6 +297,7 @@ endforeach
 dpdk_conf.set('RTE_MAX_ETHPORTS', get_option('max_ethports'))
 dpdk_conf.set('RTE_LIBEAL_USE_HPET', get_option('use_hpet'))
 dpdk_conf.set('RTE_ENABLE_TRACE_FP', get_option('enable_trace_fp'))
+dpdk_conf.set('RTE_LCORE_POLL_BUSYNESS', 
get_option('enable_lcore_poll_busyness'))
 # values which have defaults which may be overridden
 dpdk_conf.set('RTE_MAX_VFIO_GROUPS', 64)
 dpdk_conf.set('RTE_DRIVER_MEMPOOL_BUCKET_SIZE_KB', 64)
diff --git a/config/rte_config.h b/config/rte_config.h
index 46549cb062..498702c9c7 100644
--- a/config/rte_config.h
+++ b/config/rte_config.h
@@ -39,6 +39,7 @@
 #define RTE_LOG_DP_LEVEL RTE_LOG_INFO
 #define RTE_BACKTRACE 1
 #define RTE_MAX_VFIO_CONTAINERS 64
+#define RTE_LCORE_POLL_BUSYNESS_PERIOD_MS 2
 
 /* bsd module defines */
 #define RTE_CONTIGMEM_MAX_NUM_BUFS 64
diff --git a/lib/bbdev/rte_bbdev.h b/lib/bbdev/rte_bbdev.h
index b88c88167e..d6a98d3f11 100644
--- a/lib/bbdev/rte_bbdev.h
+++ b/lib/bbdev/rte_bbdev.

[PATCH v5 2/3] eal: add cpuset lcore telemetry entries

2022-09-02 Thread Kevin Laatz
From: Anatoly Burakov 

Expose per-lcore cpuset information to telemetry.
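
A hypothetical query against the new endpoint, in the same style as the
howto guide (lcore IDs and affinities below are illustrative only):

$ ./usertools/dpdk-telemetry.py
--> /eal/lcore/cpuset
{"/eal/lcore/cpuset": {"1": [1], "2": [2, 3], "3": [4]}}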

Signed-off-by: Anatoly Burakov 
---
 lib/eal/common/eal_common_lcore_telemetry.c | 47 +
 1 file changed, 47 insertions(+)

diff --git a/lib/eal/common/eal_common_lcore_telemetry.c 
b/lib/eal/common/eal_common_lcore_telemetry.c
index abef1ff86d..796a4a6a73 100644
--- a/lib/eal/common/eal_common_lcore_telemetry.c
+++ b/lib/eal/common/eal_common_lcore_telemetry.c
@@ -19,6 +19,8 @@ rte_atomic32_t __rte_lcore_telemetry_enabled;
 
 #ifdef RTE_LCORE_POLL_BUSYNESS
 
+#include "eal_private.h"
+
 struct lcore_telemetry {
int poll_busyness;
/**< Calculated poll busyness (gets set/returned by the API) */
@@ -247,6 +249,48 @@ lcore_handle_poll_busyness(const char *cmd __rte_unused,
return 0;
 }
 
+static int
+lcore_handle_cpuset(const char *cmd __rte_unused,
+   const char *params __rte_unused,
+   struct rte_tel_data *d)
+{
+   char corenum[64];
+   int i;
+
+   rte_tel_data_start_dict(d);
+
+   RTE_LCORE_FOREACH(i) {
+   const struct lcore_config *cfg = &lcore_config[i];
+   const rte_cpuset_t *cpuset = &cfg->cpuset;
+   struct rte_tel_data *ld;
+   unsigned int cpu;
+
+   if (!rte_lcore_is_enabled(i))
+   continue;
+
+   /* create an array of integers */
+   ld = rte_tel_data_alloc();
+   if (ld == NULL)
+   return -ENOMEM;
+   rte_tel_data_start_array(ld, RTE_TEL_INT_VAL);
+
+   /* add cpu ID's from cpuset to the array */
+   for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+   if (!CPU_ISSET(cpu, cpuset))
+   continue;
+   rte_tel_data_add_array_int(ld, cpu);
+   }
+
+   /* add array to the per-lcore container */
+   snprintf(corenum, sizeof(corenum), "%d", i);
+
+   /* tell telemetry library to free this array automatically */
+   rte_tel_data_add_dict_container(d, corenum, ld, 0);
+   }
+
+   return 0;
+}
+
 void
 lcore_telemetry_free(void)
 {
@@ -273,6 +317,9 @@ RTE_INIT(lcore_init_telemetry)
rte_telemetry_register_cmd("/eal/lcore/poll_busyness_disable", 
lcore_poll_busyness_disable,
   "disable lcore poll busyness measurement");
 
+   rte_telemetry_register_cmd("/eal/lcore/cpuset", lcore_handle_cpuset,
+  "list physical core affinity for each 
lcore");
+
rte_atomic32_set(&__rte_lcore_telemetry_enabled, true);
 }
 
-- 
2.31.1



[PATCH v5 3/3] doc: add howto guide for lcore poll busyness

2022-09-02 Thread Kevin Laatz
Add a new section to the howto guides for using the new lcore poll
busyness telemetry endpoints and describe general usage.

Signed-off-by: Kevin Laatz 

---
v4:
  * Include note on perf impact when the feature is enabled
  * Add doc to toctree
  * Updates to incorporate changes made earlier in the patchset

v3:
  * Update naming to poll busyness
---
 doc/guides/howto/index.rst   |  1 +
 doc/guides/howto/lcore_poll_busyness.rst | 92 
 2 files changed, 93 insertions(+)
 create mode 100644 doc/guides/howto/lcore_poll_busyness.rst

diff --git a/doc/guides/howto/index.rst b/doc/guides/howto/index.rst
index bf6337d021..0a9060c1d3 100644
--- a/doc/guides/howto/index.rst
+++ b/doc/guides/howto/index.rst
@@ -21,3 +21,4 @@ HowTo Guides
 debug_troubleshoot
 openwrt
 avx512
+lcore_poll_busyness
diff --git a/doc/guides/howto/lcore_poll_busyness.rst 
b/doc/guides/howto/lcore_poll_busyness.rst
new file mode 100644
index 00..ebbbd4c44e
--- /dev/null
+++ b/doc/guides/howto/lcore_poll_busyness.rst
@@ -0,0 +1,92 @@
+..  SPDX-License-Identifier: BSD-3-Clause
+Copyright(c) 2022 Intel Corporation.
+
+Lcore Poll Busyness Telemetry
+=
+
+The lcore poll busyness telemetry provides a built-in, generic method of 
gathering
+lcore utilization metrics for running applications. These metrics are exposed
+via a new telemetry endpoint.
+
+Since most DPDK APIs are polling based, the poll busyness is calculated based on
+APIs receiving 'work' (packets, completions, events, etc). Empty polls are
+considered as idle, while non-empty polls are considered busy. Using the amount
+of cycles spent processing empty polls, the busyness can be calculated and 
recorded.
+
+Application Specified Busyness
+--
+
+Improved accuracy of the reported busyness may need more contextual awareness
+from the application. For example, an application may make a number of calls to
+rx_burst before processing packets. If the last burst was an "empty poll", then
+the processing time of the packets would be falsely considered as "idle", since
+the last burst was empty. The application should track if any of the polls
+contained "work" to do and should mark the 'bulk' as "busy" cycles before
+proceeding to the processing. This type of awareness is only available within
+the application.
+
+Applications can be modified to incorporate the extra contextual awareness in
+order to improve the reported busyness by marking areas of code as "busy" or
+"idle" appropriately. This can be done by inserting the timestamping macro::
+
+RTE_LCORE_POLL_BUSYNESS_TIMESTAMP(0)/* to mark section as idle */
+RTE_LCORE_POLL_BUSYNESS_TIMESTAMP(32)   /* where 32 is nb_pkts to mark 
section as busy (non-zero is busy) */
+
+All cycles since the last state change (idle to busy, or vice versa) will be
+counted towards the current state's counter.
+
+Consuming the Telemetry
+---
+
+The telemetry gathered for lcore poll busyness can be read from the 
`telemetry.py`
+script via the new `/eal/lcore/poll_busyness` endpoint::
+
+$ ./usertools/dpdk-telemetry.py
+--> /eal/lcore/poll_busyness
+{"/eal/lcore/poll_busyness": {"12": -1, "13": 85, "14": 84}}
+
+* Cores not collecting poll busyness will report "-1", e.g. control cores or
+  inactive cores.
+* All enabled cores will report their poll busyness in the range 0-100.
+
+Enabling and Disabling Lcore Poll Busyness Telemetry
+
+
+By default, the lcore poll busyness telemetry is disabled at compile time. In
+order to allow DPDK to gather this metric, the ``enable_lcore_poll_busyness``
+meson option must be set to ``true``.
+
+.. note::
+Enabling lcore poll busyness telemetry may impact performance due to the
+additional timestamping, potentially per poll depending on the application.
+
+At compile time
+^^^
+
+Support can be enabled/disabled at compile time via the meson option.
+It is disabled by default::
+
+$ meson configure -Denable_lcore_poll_busyness=true #enable
+
+$ meson configure -Denable_lcore_poll_busyness=false#disable
+
+At run time
+^^^
+
+Support can also be enabled/disabled during runtime (if the meson option is
+enabled at compile time). Disabling at runtime comes at the cost of an
+additional branch; however, no additional function calls are performed.
+
+To enable/disable support at runtime, a call can be made to the appropriate
+telemetry endpoint.
+
+Disable::
+
+$ ./usertools/dpdk-telemetry.py
+--> /eal/lcore/poll_busyness_disable
+{"/eal/lcore/poll_busyness_disable": {"poll_busyness_enabled": 0}}
+
+Enable::
+
+$ ./usertools/dpdk-telemetry.py
+--> /eal/lcore/poll_busyness_enable
+{"/eal/lcore/poll_busyness_enable": {"poll_busyness_enabled": 1}}
-- 
2.31.1



Re: [PATCH v3 1/2] test/service: add perf measurements for with stats mode

2022-09-02 Thread Mattias Rönnblom

On 2022-07-11 15:18, Harry van Haaren wrote:

This commit improves the performance reporting of the service
cores polling loop to show both with and without statistics
collection modes. Collecting cycle statistics is costly, due
to calls to rte_rdtsc() per service iteration.


That is true for a service deployed on only a single core. For 
multi-core services, non-rdtsc-related overhead dominates. For example, 
if the service is deployed on 11 cores, the extra statistics-related 
overhead is ~1000 cc/service call on x86_64. 2x rdtsc shouldn't be more 
than ~50 cc.




Reported-by: Mattias Rönnblom 
Suggested-by: Honnappa Nagarahalli 
Suggested-by: Morten Brørup 
Signed-off-by: Harry van Haaren 

---

This is split out as a seperate patch from the fix to allow
measuring the before/after of the service stats atomic fixup.
---
  app/test/test_service_cores.c | 36 ---
  1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/app/test/test_service_cores.c b/app/test/test_service_cores.c
index ced6ed0081..7415b6b686 100644
--- a/app/test/test_service_cores.c
+++ b/app/test/test_service_cores.c
@@ -777,6 +777,22 @@ service_run_on_app_core_func(void *arg)
return rte_service_run_iter_on_app_lcore(*delay_service_id, 1);
  }
  
+static float

+service_app_lcore_perf_measure(uint32_t id)
+{
+   /* Performance test: call in a loop, and measure tsc() */
+   const uint32_t perf_iters = (1 << 12);
+   uint64_t start = rte_rdtsc();
+   uint32_t i;
+   for (i = 0; i < perf_iters; i++) {
+   int err = service_run_on_app_core_func(&id);


In a real-world scenario, the latency of this function isn't
representative of the overall service core overhead.


For example, consider a scenario where an lcore has a single service 
mapped to it. rte_service.c will call service_run() 64 times, but only
one will be a "hit" where the service is actually run. One iteration in the
service loop costs ~600 cc, on a machine where this performance 
benchmark reports 128 cc. (Both with statistics disabled.)


For low-latency services, this is a significant overhead.


+   TEST_ASSERT_EQUAL(0, err, "perf test: returned run failure");
+   }
+   uint64_t end = rte_rdtsc();
+
+   return (end - start)/(float)perf_iters;
+}
+
  static int
  service_app_lcore_poll_impl(const int mt_safe)
  {
@@ -828,17 +844,15 @@ service_app_lcore_poll_impl(const int mt_safe)
"MT Unsafe: App core1 didn't return -EBUSY");
}
  
-	/* Performance test: call in a loop, and measure tsc() */

-   const uint32_t perf_iters = (1 << 12);
-   uint64_t start = rte_rdtsc();
-   uint32_t i;
-   for (i = 0; i < perf_iters; i++) {
-   int err = service_run_on_app_core_func(&id);
-   TEST_ASSERT_EQUAL(0, err, "perf test: returned run failure");
-   }
-   uint64_t end = rte_rdtsc();
-   printf("perf test for %s: %0.1f cycles per call\n", mt_safe ?
-   "MT Safe" : "MT Unsafe", (end - start)/(float)perf_iters);
+   /* Measure performance of no-stats and with-stats. */
+   float cyc_no_stats = service_app_lcore_perf_measure(id);
+
+   TEST_ASSERT_EQUAL(0, rte_service_set_stats_enable(id, 1),
+   "failed to enable stats for service.");
+   float cyc_with_stats = service_app_lcore_perf_measure(id);
+
+   printf("perf test for %s, no stats: %0.1f, with stats %0.1f 
cycles/call\n",
+   mt_safe ? "MT Safe" : "MT Unsafe", cyc_no_stats, 
cyc_with_stats);
  
  	unregister_all();

return TEST_SUCCESS;


[PATCH v2] net/ring: add monitor callback

2022-09-02 Thread Herakliusz Lipiec
Currently ring pmd does not support ``rte_power_monitor`` api.
This patch adds support by adding monitor callback that is called
whenever we enter sleep state and need to check if it is time to wake
up.

Signed-off-by: Herakliusz Lipiec 
Acked-by: Bruce Richardson 

---
v2:
 - changed umonitor references to monitor as this is how it appears in
   dpdk api
 - fixed coding style issues
---
 drivers/net/ring/rte_eth_ring.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/drivers/net/ring/rte_eth_ring.c b/drivers/net/ring/rte_eth_ring.c
index cfb81da5fe..1050c22777 100644
--- a/drivers/net/ring/rte_eth_ring.c
+++ b/drivers/net/ring/rte_eth_ring.c
@@ -284,6 +284,29 @@ eth_dev_close(struct rte_eth_dev *dev)
return ret;
 }
 
+static int ring_monitor_callback(const uint64_t value,
+   const uint64_t arg[RTE_POWER_MONITOR_OPAQUE_SZ])
+{
+   /* Check if the head pointer has changed */
+   return value != arg[0];
+}
+
+static int
+eth_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)
+{
+   struct rte_ring *rng = ((struct ring_queue *)rx_queue)->rng;
+
+   /*
+* Monitor ring head since if head moves
+* there are packets to transmit
+*/
+   pmc->addr = &rng->prod.head;
+   pmc->size = sizeof(rng->prod.head);
+   pmc->opaque[0] = rng->prod.head;
+   pmc->fn = ring_monitor_callback;
+   return 0;
+}
+
 static const struct eth_dev_ops ops = {
.dev_close = eth_dev_close,
.dev_start = eth_dev_start,
@@ -303,6 +326,7 @@ static const struct eth_dev_ops ops = {
.promiscuous_disable = eth_promiscuous_disable,
.allmulticast_enable = eth_allmulticast_enable,
.allmulticast_disable = eth_allmulticast_disable,
+   .get_monitor_addr = eth_get_monitor_addr,
 };
 
 static int
-- 
2.36.1
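
With this callback in place, an application can let the PMD power-management
library sleep on the ring's producer head instead of busy polling. A minimal
sketch, assuming the generic rte_power_pmd_mgmt API (hypothetical application
code; error handling omitted):

    #include <rte_power_pmd_mgmt.h>

    /* Use the PMD's get_monitor_addr callback to wait (e.g. with
     * UMONITOR/UMWAIT where supported) while the ring stays empty.
     */
    ret = rte_power_ethdev_pmgmt_queue_enable(rte_lcore_id(), port_id,
                                              queue_id,
                                              RTE_POWER_MGMT_TYPE_MONITOR);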



RE: [Patch v6 06/18] net/mana: add device info

2022-09-02 Thread Long Li
> Subject: Re: [Patch v6 06/18] net/mana: add device info
> 
> On 2022/8/31 6:51, lon...@linuxonhyperv.com wrote:
> > From: Long Li 
> >
> > Add the function to get device info.
> >
> > Signed-off-by: Long Li 
> > ---
> >  doc/guides/nics/features/mana.ini |  1 +
> >  drivers/net/mana/mana.c   | 82 +++
> >  2 files changed, 83 insertions(+)
> >
> > diff --git a/doc/guides/nics/features/mana.ini
> > b/doc/guides/nics/features/mana.ini
> > index 8043e11f99..566b3e8770 100644
> > --- a/doc/guides/nics/features/mana.ini
> > +++ b/doc/guides/nics/features/mana.ini
> > @@ -8,5 +8,6 @@ Link status  = P
> >  Linux= Y
> >  Multiprocess aware   = Y
> >  Removal event= Y
> > +Speed capabilities   = P
> >  Usage doc= Y
> >  x86-64   = Y
> > diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c index
> > c9591035ac..e1550b3c08 100644
> > --- a/drivers/net/mana/mana.c
> > +++ b/drivers/net/mana/mana.c
> > @@ -116,6 +116,86 @@ mana_dev_close(struct rte_eth_dev *dev)
> > return 0;
> >  }
> >
> > +static int mana_dev_info_get(struct rte_eth_dev *dev,
> > +struct rte_eth_dev_info *dev_info) {
> > +   struct mana_priv *priv = dev->data->dev_private;
> > +
> > +   dev_info->max_mtu = RTE_ETHER_MTU;
> 
> ...
> 
> > +   dev_info->tx_desc_lim.nb_max = priv->max_tx_desc;
> > +   dev_info->tx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
> > +   dev_info->tx_desc_lim.nb_seg_max = priv->max_send_sge;
> > +   dev_info->rx_desc_lim.nb_mtu_seg_max = priv->max_recv_sge;
> > +
> > +   /* Speed */
> > +   dev_info->speed_capa = ETH_LINK_SPEED_100G;
> 
> I notice "[Patch v6 04/18] net/mana: add link update" always reports 200G;
> why is it 100G here?

Thanks for pointing this out. I will fix patch 04.

> 
> > +
> > +   /* RX params */
> > +   dev_info->default_rxportconf.burst_size = 1;
> > +   dev_info->default_rxportconf.ring_size =
> MAX_RECEIVE_BUFFERS_PER_QUEUE;
> > +   dev_info->default_rxportconf.nb_queues = 1;
> > +
> > +   /* TX params */
> > +   dev_info->default_txportconf.burst_size = 1;
> > +   dev_info->default_txportconf.ring_size =
> MAX_SEND_BUFFERS_PER_QUEUE;
> > +   dev_info->default_txportconf.nb_queues = 1;
> > +
> > +   return 0;
> > +}
> > +
> 
> ...


RE: [Patch v6 15/18] net/mana: add function to send packets

2022-09-02 Thread Long Li
> Subject: Re: [Patch v6 15/18] net/mana: add function to send packets
> 
> On 2022/8/31 6:51, lon...@linuxonhyperv.com wrote:
> > From: Long Li 
> >
> > With all the TX queues created, MANA can send packets over those queues.
> >
> > Signed-off-by: Long Li 
> 
> ...
> 
> >  }
> > +
> > +uint16_t mana_tx_burst(void *dpdk_txq, struct rte_mbuf **tx_pkts,
> > +  uint16_t nb_pkts)
> > +{
> > +   struct mana_txq *txq = dpdk_txq;
> > +   struct mana_priv *priv = txq->priv;
> > +   struct gdma_comp comp;
> > +   int ret;
> > +   void *db_page;
> > +
> > +   /* Process send completions from GDMA */
> > +   while (gdma_poll_completion_queue(&txq->gdma_cq, &comp) == 1) {
> > +   struct mana_txq_desc *desc =
> > +   &txq->desc_ring[txq->desc_ring_tail];
> > +   struct mana_tx_comp_oob *oob =
> > +   (struct mana_tx_comp_oob
> *)&comp.completion_data[0];
> > +
> > +   if (oob->cqe_hdr.cqe_type != CQE_TX_OKAY) {
> > +   DRV_LOG(ERR,
> > +   "mana_tx_comp_oob cqe_type %u
> vendor_err %u",
> > +   oob->cqe_hdr.cqe_type, oob-
> >cqe_hdr.vendor_err);
> > +   txq->stats.errors++;
> > +   } else {
> > +   DRV_LOG(DEBUG, "mana_tx_comp_oob
> CQE_TX_OKAY");
> > +   txq->stats.packets++;
> > +   }
> > +
> > +   if (!desc->pkt) {
> > +   DRV_LOG(ERR, "mana_txq_desc has a NULL pkt");
> > +   } else {
> > +   txq->stats.bytes += desc->pkt->data_len;
> > +   rte_pktmbuf_free(desc->pkt);
> > +   }
> > +
> > +   desc->pkt = NULL;
> > +   txq->desc_ring_tail = (txq->desc_ring_tail + 1) % txq->num_desc;
> > +   txq->gdma_sq.tail += desc->wqe_size_in_bu;
> > +   }
> > +
> > +   /* Post send requests to GDMA */
> > +   uint16_t pkt_idx;
> > +
> > +   for (pkt_idx = 0; pkt_idx < nb_pkts; pkt_idx++) {
> > +   struct rte_mbuf *m_pkt = tx_pkts[pkt_idx];
> > +   struct rte_mbuf *m_seg = m_pkt;
> > +   struct transmit_oob_v2 tx_oob = {0};
> > +   struct one_sgl sgl = {0};
> > +
> > +   /* Drop the packet if it exceeds max segments */
> > +   if (m_pkt->nb_segs > priv->max_send_sge) {
> > +   DRV_LOG(ERR, "send packet segments %d exceeding
> max",
> > +   m_pkt->nb_segs);
> 
> This branch violates the rte_eth_tx_burst definition, which defines the
> return value as
> " *   The maximum number of packets to transmit."

Will fix this.

> 
> Also, I notice the driver doesn't implement tx-prepare, which could hold
> such checking per the framework's definition.
> 
> > +   continue;
> > +   }
> > +
> > +   /* Fill in the oob */
> > +   tx_oob.short_oob.packet_format = short_packet_format;
> > +   tx_oob.short_oob.tx_is_outer_ipv4 =
> > +   m_pkt->ol_flags & RTE_MBUF_F_TX_IPV4 ? 1 : 0;
> > +   tx_oob.short_oob.tx_is_outer_ipv6 =
> > +   m_pkt->ol_flags & RTE_MBUF_F_TX_IPV6 ? 1 : 0;
> > +
> > +   tx_oob.short_oob.tx_compute_IP_header_checksum =
> > +   m_pkt->ol_flags & RTE_MBUF_F_TX_IP_CKSUM ? 1 : 0;
> > +
> > +   if ((m_pkt->ol_flags & RTE_MBUF_F_TX_L4_MASK) ==
> > +   RTE_MBUF_F_TX_TCP_CKSUM) {
> > +   struct rte_tcp_hdr *tcp_hdr;
> > +
> > +   /* HW needs partial TCP checksum */
> > +
> > +   tcp_hdr = rte_pktmbuf_mtod_offset(m_pkt,
> > + struct rte_tcp_hdr *,
> > + m_pkt->l2_len + m_pkt->l3_len);
> > +
> > +   if (m_pkt->ol_flags & RTE_MBUF_F_TX_IPV4) {
> > +   struct rte_ipv4_hdr *ip_hdr;
> > +
> > +   ip_hdr = rte_pktmbuf_mtod_offset(m_pkt,
> > +   struct rte_ipv4_hdr *,
> > +   m_pkt->l2_len);
> > +   tcp_hdr->cksum = rte_ipv4_phdr_cksum(ip_hdr,
> > +   m_pkt->ol_flags);
> > +
> > +   } else if (m_pkt->ol_flags & RTE_MBUF_F_TX_IPV6) {
> > +   struct rte_ipv6_hdr *ip_hdr;
> > +
> > +   ip_hdr = rte_pktmbuf_mtod_offset(m_pkt,
> > +   struct rte_ipv6_hdr *,
> > +   m_pkt->l2_len);
> > +   tcp_hdr->cksum = rte_ipv6_phdr_cksum(ip_hdr,
> > +   m_pkt->ol_flags);
> > +   } else {
> > +   DRV_LOG(ERR, "Invalid input for TCP CKSUM");
> > +   }
> > +
> > +   tx_oob.short_oob.tx_compute_TCP_checksum = 1;
> > +   tx_oob.sho

RE: [Patch v6 01/18] net/mana: add basic driver, build environment and doc

2022-09-02 Thread Long Li
> Subject: Re: [Patch v6 01/18] net/mana: add basic driver, build environment 
> and
> doc
> 
> On 2022/8/31 6:51, lon...@linuxonhyperv.com wrote:
> > From: Long Li 
> >
> > MANA is a PCI device. It uses IB verbs to access hardware through the
> > kernel RDMA layer. This patch introduces build environment and basic
> > device probe functions.
> >
> > Signed-off-by: Long Li 
> 
> ...
> 
> > +static int mana_mp_primary_handle(const struct rte_mp_msg *mp_msg,
> > + const void *peer)
> > +{
> > +   struct rte_eth_dev *dev;
> > +   const struct mana_mp_param *param =
> > +   (const struct mana_mp_param *)mp_msg->param;
> > +   struct rte_mp_msg mp_res = { 0 };
> > +   struct mana_mp_param *res = (struct mana_mp_param
> *)mp_res.param;
> > +   int ret;
> > +   struct mana_priv *priv;
> > +
> > +   if (!rte_eth_dev_is_valid_port(param->port_id)) {
> > +   DRV_LOG(ERR, "MP handle port ID %u invalid", param->port_id);
> > +   return -ENODEV;
> > +   }
> > +
> > +   dev = &rte_eth_devices[param->port_id];
> > +   priv = dev->data->dev_private;
> > +
> > +   mp_init_msg(&mp_res, param->type, param->port_id);
> > +
> > +   switch (param->type) {
> > +   case MANA_MP_REQ_VERBS_CMD_FD:
> > +   mp_res.num_fds = 1;
> > +   mp_res.fds[0] = priv->ib_ctx->cmd_fd;
> 
> The cmd_fd is system level handler?
> 
> If it's process private handler, it should not used directly in another 
> process.

According to rte_mp_xxx semantics, the file handle is duplicated to another
process. It's not used directly. It's required for the secondary process to
map the same doorbell pages.

> 
> > +   res->result = 0;
> > +   ret = rte_mp_reply(&mp_res, peer);
> > +   break;
> > +
> > +   default:
> > +   DRV_LOG(ERR, "Port %u unknown primary MP type %u",
> > +   param->port_id, param->type);
> > +   ret = -EINVAL;
> > +   }
> > +
> > +   return ret;
> > +}
> > +
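
For reference, a secondary process obtains the duplicated descriptor over the
rte_mp channel roughly as in the sketch below (hypothetical action name,
simplified error handling; the reply's fds[] entry is already valid in the
receiving process):

    #include <stdlib.h>
    #include <string.h>
    #include <rte_eal.h>

    static int request_verbs_cmd_fd(void)
    {
        struct rte_mp_msg req;
        struct rte_mp_reply reply;
        struct timespec ts = { .tv_sec = 5, .tv_nsec = 0 };
        int fd = -1;

        memset(&req, 0, sizeof(req));
        strcpy(req.name, "mp_mana"); /* hypothetical registered action name */

        if (rte_mp_request_sync(&req, &reply, &ts) == 0 &&
            reply.nb_received == 1) {
            fd = reply.msgs[0].fds[0]; /* duplicated into this process */
            free(reply.msgs);          /* caller frees the reply array */
        }
        return fd;
    }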



[Patch v7 00/18] Introduce Microsoft Azure Network Adapter (MANA) PMD

2022-09-02 Thread longli
From: Long Li 

MANA is a network interface card to be used in the Azure cloud environment.
MANA provides safe access to user memory through memory registration. It has
IOMMU built into the hardware.

MANA uses IB verbs and RDMA layer to configure hardware resources. It
requires the corresponding RDMA kernel-mode and user-mode drivers.

The MANA RDMA kernel-mode driver is being reviewed at:
https://patchwork.kernel.org/project/netdevbpf/cover/1655345240-26411-1-git-send-email-lon...@linuxonhyperv.com/

The MANA RDMA user-mode driver is being reviewed at:
https://github.com/linux-rdma/rdma-core/pull/1177


Long Li (18):
  net/mana: add basic driver, build environment and doc
  net/mana: add device configuration and stop
  net/mana: add function to report supported ptypes
  net/mana: add link update
  net/mana: add function for device removal interrupts
  net/mana: add device info
  net/mana: add function to configure RSS
  net/mana: add function to configure RX queues
  net/mana: add function to configure TX queues
  net/mana: implement memory registration
  net/mana: implement the hardware layer operations
  net/mana: add function to start/stop TX queues
  net/mana: add function to start/stop RX queues
  net/mana: add function to receive packets
  net/mana: add function to send packets
  net/mana: add function to start/stop device
  net/mana: add function to report queue stats
  net/mana: add function to support RX interrupts

 MAINTAINERS   |6 +
 doc/guides/nics/features/mana.ini |   21 +
 doc/guides/nics/index.rst |1 +
 doc/guides/nics/mana.rst  |   66 ++
 drivers/net/mana/gdma.c   |  289 ++
 drivers/net/mana/mana.c   | 1449 +
 drivers/net/mana/mana.h   |  552 +++
 drivers/net/mana/meson.build  |   48 +
 drivers/net/mana/mp.c |  323 +++
 drivers/net/mana/mr.c |  324 +++
 drivers/net/mana/rx.c |  519 +++
 drivers/net/mana/tx.c |  412 
 drivers/net/mana/version.map  |3 +
 drivers/net/meson.build   |1 +
 14 files changed, 4014 insertions(+)
 create mode 100644 doc/guides/nics/features/mana.ini
 create mode 100644 doc/guides/nics/mana.rst
 create mode 100644 drivers/net/mana/gdma.c
 create mode 100644 drivers/net/mana/mana.c
 create mode 100644 drivers/net/mana/mana.h
 create mode 100644 drivers/net/mana/meson.build
 create mode 100644 drivers/net/mana/mp.c
 create mode 100644 drivers/net/mana/mr.c
 create mode 100644 drivers/net/mana/rx.c
 create mode 100644 drivers/net/mana/tx.c
 create mode 100644 drivers/net/mana/version.map

-- 
2.17.1



[Patch v7 01/18] net/mana: add basic driver, build environment and doc

2022-09-02 Thread longli
From: Long Li 

MANA is a PCI device. It uses IB verbs to access hardware through the
kernel RDMA layer. This patch introduces build environment and basic
device probe functions.

Signed-off-by: Long Li 
---
Change log:
v2:
Fix typos.
Make the driver build only on x86-64 and Linux.
Remove unused header files.
Change port definition to uint16_t or uint8_t (for IB).
Use getline() in place of fgets() to read and truncate a line.
v3:
Add meson build check for required functions from RDMA direct verb header file
v4:
Remove extra "\n" in logging code.
Use "r" in place of "rb" in fopen() to read text files.
v7:
Remove RTE_ETH_TX_OFFLOAD_TCP_TSO from offload cap.

 MAINTAINERS   |   6 +
 doc/guides/nics/features/mana.ini |  10 +
 doc/guides/nics/index.rst |   1 +
 doc/guides/nics/mana.rst  |  66 +++
 drivers/net/mana/mana.c   | 704 ++
 drivers/net/mana/mana.h   | 209 +
 drivers/net/mana/meson.build  |  44 ++
 drivers/net/mana/mp.c | 235 ++
 drivers/net/mana/version.map  |   3 +
 drivers/net/meson.build   |   1 +
 10 files changed, 1279 insertions(+)
 create mode 100644 doc/guides/nics/features/mana.ini
 create mode 100644 doc/guides/nics/mana.rst
 create mode 100644 drivers/net/mana/mana.c
 create mode 100644 drivers/net/mana/mana.h
 create mode 100644 drivers/net/mana/meson.build
 create mode 100644 drivers/net/mana/mp.c
 create mode 100644 drivers/net/mana/version.map

diff --git a/MAINTAINERS b/MAINTAINERS
index 18d9edaf88..b8bda48a33 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -837,6 +837,12 @@ F: buildtools/options-ibverbs-static.sh
 F: doc/guides/nics/mlx5.rst
 F: doc/guides/nics/features/mlx5.ini
 
+Microsoft mana
+M: Long Li 
+F: drivers/net/mana
+F: doc/guides/nics/mana.rst
+F: doc/guides/nics/features/mana.ini
+
 Microsoft vdev_netvsc - EXPERIMENTAL
 M: Matan Azrad 
 F: drivers/net/vdev_netvsc/
diff --git a/doc/guides/nics/features/mana.ini 
b/doc/guides/nics/features/mana.ini
new file mode 100644
index 00..b92a27374c
--- /dev/null
+++ b/doc/guides/nics/features/mana.ini
@@ -0,0 +1,10 @@
+;
+; Supported features of the 'mana' network poll mode driver.
+;
+; Refer to default.ini for the full list of available PMD features.
+;
+[Features]
+Linux= Y
+Multiprocess aware   = Y
+Usage doc= Y
+x86-64   = Y
diff --git a/doc/guides/nics/index.rst b/doc/guides/nics/index.rst
index 1c94caccea..2725d1d9f0 100644
--- a/doc/guides/nics/index.rst
+++ b/doc/guides/nics/index.rst
@@ -41,6 +41,7 @@ Network Interface Controller Drivers
 intel_vf
 kni
 liquidio
+mana
 memif
 mlx4
 mlx5
diff --git a/doc/guides/nics/mana.rst b/doc/guides/nics/mana.rst
new file mode 100644
index 00..40e18fe810
--- /dev/null
+++ b/doc/guides/nics/mana.rst
@@ -0,0 +1,66 @@
+..  SPDX-License-Identifier: BSD-3-Clause
+Copyright 2022 Microsoft Corporation
+
+MANA poll mode driver library
+=
+
+The MANA poll mode driver library (**librte_net_mana**) implements support
+for Microsoft Azure Network Adapter VF in SR-IOV context.
+
+Features
+
+
+Features of the MANA Ethdev PMD are:
+
+Prerequisites
+-
+
+This driver relies on external libraries and kernel drivers for resources
+allocations and initialization. The following dependencies are not part of
+DPDK and must be installed separately:
+
+- **libibverbs** (provided by rdma-core package)
+
+  User space verbs framework used by librte_net_mana. This library provides
+  a generic interface between the kernel and low-level user space drivers
+  such as libmana.
+
+  It allows slow and privileged operations (context initialization, hardware
+  resources allocations) to be managed by the kernel and fast operations to
+  never leave user space.
+
+- **libmana** (provided by rdma-core package)
+
+  Low-level user space driver library for Microsoft Azure Network Adapter
+  devices, it is automatically loaded by libibverbs.
+
+- **Kernel modules**
+
+  They provide the kernel-side verbs API and low level device drivers that
+  manage actual hardware initialization and resources sharing with user
+  space processes.
+
+  Unlike most other PMDs, these modules must remain loaded and bound to
+  their devices:
+
+  - mana: Ethernet device driver that provides kernel network interfaces.
+  - mana_ib: InifiniBand device driver.
+  - ib_uverbs: user space driver for verbs (entry point for libibverbs).
+
+Driver compilation and testing
+--
+
+Refer to the document :ref:`compiling and testing a PMD for a NIC 
`
+for details.
+
+MANA PMD arguments
+
+
+The user can specify the following argument in devargs.
+
+#.  ``mac``:
+
+Specify the MAC address for this device. If it is set, the driver
+probes and loads the NIC with a matching mac address. If it is not
+set, the driver probes on all the NICs on the PCI

[Patch v7 02/18] net/mana: add device configuration and stop

2022-09-02 Thread longli
From: Long Li 

MANA defines its own memory allocation functions to override the IB layer
defaults when allocating device queues. This patch adds the code for device
configuration and stop.

Signed-off-by: Long Li 
---
Change log:
v2:
Removed validation for offload settings in mana_dev_configure().

 drivers/net/mana/mana.c | 75 +++--
 drivers/net/mana/mana.h |  3 ++
 2 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index cb59eb6882..147ab144d5 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -40,7 +40,79 @@ static rte_spinlock_t mana_shared_data_lock = 
RTE_SPINLOCK_INITIALIZER;
 int mana_logtype_driver;
 int mana_logtype_init;
 
+void *mana_alloc_verbs_buf(size_t size, void *data)
+{
+   void *ret;
+   size_t alignment = rte_mem_page_size();
+   int socket = (int)(uintptr_t)data;
+
+   DRV_LOG(DEBUG, "size=%zu socket=%d", size, socket);
+
+   if (alignment == (size_t)-1) {
+   DRV_LOG(ERR, "Failed to get mem page size");
+   rte_errno = ENOMEM;
+   return NULL;
+   }
+
+   ret = rte_zmalloc_socket("mana_verb_buf", size, alignment, socket);
+   if (!ret && size)
+   rte_errno = ENOMEM;
+   return ret;
+}
+
+void mana_free_verbs_buf(void *ptr, void *data __rte_unused)
+{
+   rte_free(ptr);
+}
+
+static int mana_dev_configure(struct rte_eth_dev *dev)
+{
+   struct mana_priv *priv = dev->data->dev_private;
+   struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
+
+   if (dev_conf->rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG)
+   dev_conf->rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH;
+
+   if (dev->data->nb_rx_queues != dev->data->nb_tx_queues) {
+   DRV_LOG(ERR, "Only support equal number of rx/tx queues");
+   return -EINVAL;
+   }
+
+   if (!rte_is_power_of_2(dev->data->nb_rx_queues)) {
+   DRV_LOG(ERR, "number of TX/RX queues must be power of 2");
+   return -EINVAL;
+   }
+
+   priv->num_queues = dev->data->nb_rx_queues;
+
+   manadv_set_context_attr(priv->ib_ctx, MANADV_CTX_ATTR_BUF_ALLOCATORS,
+   (void *)((uintptr_t)&(struct 
manadv_ctx_allocators){
+   .alloc = &mana_alloc_verbs_buf,
+   .free = &mana_free_verbs_buf,
+   .data = 0,
+   }));
+
+   return 0;
+}
+
+static int
+mana_dev_close(struct rte_eth_dev *dev)
+{
+   struct mana_priv *priv = dev->data->dev_private;
+   int ret;
+
+   ret = ibv_close_device(priv->ib_ctx);
+   if (ret) {
+   ret = errno;
+   return ret;
+   }
+
+   return 0;
+}
+
 const struct eth_dev_ops mana_dev_ops = {
+   .dev_configure  = mana_dev_configure,
+   .dev_close  = mana_dev_close,
 };
 
 const struct eth_dev_ops mana_dev_sec_ops = {
@@ -627,8 +699,7 @@ static int mana_pci_probe(struct rte_pci_driver *pci_drv 
__rte_unused,
 
 static int mana_dev_uninit(struct rte_eth_dev *dev)
 {
-   RTE_SET_USED(dev);
-   return 0;
+   return mana_dev_close(dev);
 }
 
 static int mana_pci_remove(struct rte_pci_device *pci_dev)
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index a1184c579f..4e654e07d1 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -206,4 +206,7 @@ int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev);
 
 void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type);
 
+void *mana_alloc_verbs_buf(size_t size, void *data);
+void mana_free_verbs_buf(void *ptr, void *data __rte_unused);
+
 #endif
-- 
2.17.1



[Patch v7 03/18] net/mana: add function to report supported ptypes

2022-09-02 Thread longli
From: Long Li 

Report supported protocol types.

Signed-off-by: Long Li 
---
Change log.
v7: change link_speed to RTE_ETH_SPEED_NUM_100G

 drivers/net/mana/mana.c | 16 
 drivers/net/mana/mana.h |  2 --
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 147ab144d5..4559632056 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -110,9 +110,25 @@ mana_dev_close(struct rte_eth_dev *dev)
return 0;
 }
 
+static const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev 
__rte_unused)
+{
+   static const uint32_t ptypes[] = {
+   RTE_PTYPE_L2_ETHER,
+   RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
+   RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
+   RTE_PTYPE_L4_FRAG,
+   RTE_PTYPE_L4_TCP,
+   RTE_PTYPE_L4_UDP,
+   RTE_PTYPE_UNKNOWN
+   };
+
+   return ptypes;
+}
+
 const struct eth_dev_ops mana_dev_ops = {
.dev_configure  = mana_dev_configure,
.dev_close  = mana_dev_close,
+   .dev_supported_ptypes_get = mana_supported_ptypes,
 };
 
 const struct eth_dev_ops mana_dev_sec_ops = {
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index 4e654e07d1..2be68093c0 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -167,8 +167,6 @@ extern int mana_logtype_init;
 
 #define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
 
-const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev);
-
 uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
   uint16_t pkts_n);
 
-- 
2.17.1



[Patch v7 04/18] net/mana: add link update

2022-09-02 Thread longli
From: Long Li 

The carrier state is managed by the Azure host. MANA runs as a VF and
always reports "up".

Signed-off-by: Long Li 
---
 doc/guides/nics/features/mana.ini |  1 +
 drivers/net/mana/mana.c   | 17 +
 2 files changed, 18 insertions(+)

diff --git a/doc/guides/nics/features/mana.ini 
b/doc/guides/nics/features/mana.ini
index b92a27374c..62554b0a0a 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -4,6 +4,7 @@
 ; Refer to default.ini for the full list of available PMD features.
 ;
 [Features]
+Link status  = P
 Linux= Y
 Multiprocess aware   = Y
 Usage doc= Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 4559632056..46a7bbcca0 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -125,10 +125,27 @@ static const uint32_t *mana_supported_ptypes(struct 
rte_eth_dev *dev __rte_unuse
return ptypes;
 }
 
+static int mana_dev_link_update(struct rte_eth_dev *dev,
+   int wait_to_complete __rte_unused)
+{
+   struct rte_eth_link link;
+
+   /* MANA has no concept of carrier state, always reporting UP */
+   link = (struct rte_eth_link) {
+   .link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
+   .link_autoneg = RTE_ETH_LINK_SPEED_FIXED,
+   .link_speed = RTE_ETH_SPEED_NUM_100G,
+   .link_status = RTE_ETH_LINK_UP,
+   };
+
+   return rte_eth_linkstatus_set(dev, &link);
+}
+
 const struct eth_dev_ops mana_dev_ops = {
.dev_configure  = mana_dev_configure,
.dev_close  = mana_dev_close,
.dev_supported_ptypes_get = mana_supported_ptypes,
+   .link_update= mana_dev_link_update,
 };
 
 const struct eth_dev_ops mana_dev_sec_ops = {
-- 
2.17.1



[Patch v7 05/18] net/mana: add function for device removal interrupts

2022-09-02 Thread longli
From: Long Li 

MANA supports PCI hot plug events. Add this interrupt to the DPDK core so its
parent PMD can detect device removal during Azure servicing or live
migration.

Signed-off-by: Long Li 
---
 doc/guides/nics/features/mana.ini |  1 +
 drivers/net/mana/mana.c   | 97 +++
 drivers/net/mana/mana.h   |  1 +
 3 files changed, 99 insertions(+)

diff --git a/doc/guides/nics/features/mana.ini 
b/doc/guides/nics/features/mana.ini
index 62554b0a0a..8043e11f99 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -7,5 +7,6 @@
 Link status  = P
 Linux= Y
 Multiprocess aware   = Y
+Removal event= Y
 Usage doc= Y
 x86-64   = Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 46a7bbcca0..00c5bdbf9f 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -95,12 +95,18 @@ static int mana_dev_configure(struct rte_eth_dev *dev)
return 0;
 }
 
+static int mana_intr_uninstall(struct mana_priv *priv);
+
 static int
 mana_dev_close(struct rte_eth_dev *dev)
 {
struct mana_priv *priv = dev->data->dev_private;
int ret;
 
+   ret = mana_intr_uninstall(priv);
+   if (ret)
+   return ret;
+
ret = ibv_close_device(priv->ib_ctx);
if (ret) {
ret = errno;
@@ -327,6 +333,90 @@ static int mana_ibv_device_to_pci_addr(const struct 
ibv_device *device,
return 0;
 }
 
+static void mana_intr_handler(void *arg)
+{
+   struct mana_priv *priv = arg;
+   struct ibv_context *ctx = priv->ib_ctx;
+   struct ibv_async_event event;
+
+   /* Read and ack all messages from IB device */
+   while (true) {
+   if (ibv_get_async_event(ctx, &event))
+   break;
+
+   if (event.event_type == IBV_EVENT_DEVICE_FATAL) {
+   struct rte_eth_dev *dev;
+
+   dev = &rte_eth_devices[priv->port_id];
+   if (dev->data->dev_conf.intr_conf.rmv)
+   rte_eth_dev_callback_process(dev,
+   RTE_ETH_EVENT_INTR_RMV, NULL);
+   }
+
+   ibv_ack_async_event(&event);
+   }
+}
+
+static int mana_intr_uninstall(struct mana_priv *priv)
+{
+   int ret;
+
+   ret = rte_intr_callback_unregister(priv->intr_handle,
+  mana_intr_handler, priv);
+   if (ret <= 0) {
+   DRV_LOG(ERR, "Failed to unregister intr callback ret %d", ret);
+   return ret;
+   }
+
+   rte_intr_instance_free(priv->intr_handle);
+
+   return 0;
+}
+
+static int mana_intr_install(struct mana_priv *priv)
+{
+   int ret, flags;
+   struct ibv_context *ctx = priv->ib_ctx;
+
+   priv->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_SHARED);
+   if (!priv->intr_handle) {
+   DRV_LOG(ERR, "Failed to allocate intr_handle");
+   rte_errno = ENOMEM;
+   return -ENOMEM;
+   }
+
+   rte_intr_fd_set(priv->intr_handle, -1);
+
+   flags = fcntl(ctx->async_fd, F_GETFL);
+   ret = fcntl(ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
+   if (ret) {
+   DRV_LOG(ERR, "Failed to change async_fd to NONBLOCK");
+   goto free_intr;
+   }
+
+   rte_intr_fd_set(priv->intr_handle, ctx->async_fd);
+   rte_intr_type_set(priv->intr_handle, RTE_INTR_HANDLE_EXT);
+
+   ret = rte_intr_callback_register(priv->intr_handle,
+mana_intr_handler, priv);
+   if (ret) {
+   DRV_LOG(ERR, "Failed to register intr callback");
+   rte_intr_fd_set(priv->intr_handle, -1);
+   goto restore_fd;
+   }
+
+   return 0;
+
+restore_fd:
+   fcntl(ctx->async_fd, F_SETFL, flags);
+
+free_intr:
+   rte_intr_instance_free(priv->intr_handle);
+   priv->intr_handle = NULL;
+
+   return ret;
+}
+
 static int mana_proc_priv_init(struct rte_eth_dev *dev)
 {
struct mana_process_priv *priv;
@@ -640,6 +730,13 @@ static int mana_pci_probe_mac(struct rte_pci_driver 
*pci_drv __rte_unused,
name, priv->max_rx_queues, priv->max_rx_desc,
priv->max_send_sge);
 
+   /* Create async interrupt handler */
+   ret = mana_intr_install(priv);
+   if (ret) {
+   DRV_LOG(ERR, "Failed to install intr handler");
+   goto failed;
+   }
+
rte_spinlock_lock(&mana_shared_data->lock);
mana_shared_data->primary_cnt++;
rte_spinlock_unlock(&mana_shared_data->lock);
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index 2be68093c0..1c5ea9b44d 100644
--- a
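
On the application side, this event is consumed by registering a callback for
RTE_ETH_EVENT_INTR_RMV. A minimal sketch (hypothetical application code; real
handlers should defer the stop/close work outside the callback):

    #include <stdio.h>
    #include <rte_ethdev.h>

    static int
    rmv_event_cb(uint16_t port_id, enum rte_eth_event_type type __rte_unused,
                 void *param __rte_unused, void *ret_param __rte_unused)
    {
        printf("Port %u: device removal event\n", port_id);
        return 0;
    }

    /* at initialization time */
    rte_eth_dev_callback_register(port_id, RTE_ETH_EVENT_INTR_RMV,
                                  rmv_event_cb, NULL);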

[Patch v7 06/18] net/mana: add device info

2022-09-02 Thread longli
From: Long Li 

Add the function to get device info.

Signed-off-by: Long Li 
---
 doc/guides/nics/features/mana.ini |  1 +
 drivers/net/mana/mana.c   | 82 +++
 2 files changed, 83 insertions(+)

diff --git a/doc/guides/nics/features/mana.ini 
b/doc/guides/nics/features/mana.ini
index 8043e11f99..566b3e8770 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -8,5 +8,6 @@ Link status  = P
 Linux= Y
 Multiprocess aware   = Y
 Removal event= Y
+Speed capabilities   = P
 Usage doc= Y
 x86-64   = Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 00c5bdbf9f..c7c8d8c4ec 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -116,6 +116,86 @@ mana_dev_close(struct rte_eth_dev *dev)
return 0;
 }
 
+static int mana_dev_info_get(struct rte_eth_dev *dev,
+struct rte_eth_dev_info *dev_info)
+{
+   struct mana_priv *priv = dev->data->dev_private;
+
+   dev_info->max_mtu = RTE_ETHER_MTU;
+
+   /* RX params */
+   dev_info->min_rx_bufsize = MIN_RX_BUF_SIZE;
+   dev_info->max_rx_pktlen = MAX_FRAME_SIZE;
+
+   dev_info->max_rx_queues = priv->max_rx_queues;
+   dev_info->max_tx_queues = priv->max_tx_queues;
+
+   dev_info->max_mac_addrs = BNIC_MAX_MAC_ADDR;
+   dev_info->max_hash_mac_addrs = 0;
+
+   dev_info->max_vfs = 1;
+
+   /* Offload params */
+   dev_info->rx_offload_capa = BNIC_DEV_RX_OFFLOAD_SUPPORT;
+
+   dev_info->tx_offload_capa = BNIC_DEV_TX_OFFLOAD_SUPPORT;
+
+   /* RSS */
+   dev_info->reta_size = INDIRECTION_TABLE_NUM_ELEMENTS;
+   dev_info->hash_key_size = TOEPLITZ_HASH_KEY_SIZE_IN_BYTES;
+   dev_info->flow_type_rss_offloads = BNIC_ETH_RSS_SUPPORT;
+
+   /* Thresholds */
+   dev_info->default_rxconf = (struct rte_eth_rxconf){
+   .rx_thresh = {
+   .pthresh = 8,
+   .hthresh = 8,
+   .wthresh = 0,
+   },
+   .rx_free_thresh = 32,
+   /* If no descriptors available, pkts are dropped by default */
+   .rx_drop_en = 1,
+   };
+
+   dev_info->default_txconf = (struct rte_eth_txconf){
+   .tx_thresh = {
+   .pthresh = 32,
+   .hthresh = 0,
+   .wthresh = 0,
+   },
+   .tx_rs_thresh = 32,
+   .tx_free_thresh = 32,
+   };
+
+   /* Buffer limits */
+   dev_info->rx_desc_lim.nb_min = MIN_BUFFERS_PER_QUEUE;
+   dev_info->rx_desc_lim.nb_max = priv->max_rx_desc;
+   dev_info->rx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
+   dev_info->rx_desc_lim.nb_seg_max = priv->max_recv_sge;
+   dev_info->rx_desc_lim.nb_mtu_seg_max = priv->max_recv_sge;
+
+   dev_info->tx_desc_lim.nb_min = MIN_BUFFERS_PER_QUEUE;
+   dev_info->tx_desc_lim.nb_max = priv->max_tx_desc;
+   dev_info->tx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
+   dev_info->tx_desc_lim.nb_seg_max = priv->max_send_sge;
+   dev_info->rx_desc_lim.nb_mtu_seg_max = priv->max_recv_sge;
+
+   /* Speed */
+   dev_info->speed_capa = ETH_LINK_SPEED_100G;
+
+   /* RX params */
+   dev_info->default_rxportconf.burst_size = 1;
+   dev_info->default_rxportconf.ring_size = MAX_RECEIVE_BUFFERS_PER_QUEUE;
+   dev_info->default_rxportconf.nb_queues = 1;
+
+   /* TX params */
+   dev_info->default_txportconf.burst_size = 1;
+   dev_info->default_txportconf.ring_size = MAX_SEND_BUFFERS_PER_QUEUE;
+   dev_info->default_txportconf.nb_queues = 1;
+
+   return 0;
+}
+
 static const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev 
__rte_unused)
 {
static const uint32_t ptypes[] = {
@@ -150,11 +230,13 @@ static int mana_dev_link_update(struct rte_eth_dev *dev,
 const struct eth_dev_ops mana_dev_ops = {
.dev_configure  = mana_dev_configure,
.dev_close  = mana_dev_close,
+   .dev_infos_get  = mana_dev_info_get,
.dev_supported_ptypes_get = mana_supported_ptypes,
.link_update= mana_dev_link_update,
 };
 
 const struct eth_dev_ops mana_dev_sec_ops = {
+   .dev_infos_get = mana_dev_info_get,
 };
 
 uint16_t
-- 
2.17.1
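
The limits reported here feed directly into application queue sizing. A
minimal sketch (hypothetical application code) that clamps a requested ring
size to the advertised descriptor limits:

    struct rte_eth_dev_info dev_info;
    uint16_t nb_rxd = 4096; /* requested ring size */

    if (rte_eth_dev_info_get(port_id, &dev_info) == 0)
        nb_rxd = RTE_MIN(nb_rxd, dev_info.rx_desc_lim.nb_max);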



[Patch v7 07/18] net/mana: add function to configure RSS

2022-09-02 Thread longli
From: Long Li 

Currently this PMD supports RSS configuration when the device is stopped.
Configuring RSS in the running state will be supported in the future.

Signed-off-by: Long Li 
---
 doc/guides/nics/features/mana.ini |  1 +
 drivers/net/mana/mana.c   | 61 +++
 drivers/net/mana/mana.h   |  1 +
 3 files changed, 63 insertions(+)

diff --git a/doc/guides/nics/features/mana.ini 
b/doc/guides/nics/features/mana.ini
index 566b3e8770..a59c21cc10 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -8,6 +8,7 @@ Link status  = P
 Linux= Y
 Multiprocess aware   = Y
 Removal event= Y
+RSS hash = Y
 Speed capabilities   = P
 Usage doc= Y
 x86-64   = Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index c7c8d8c4ec..2c189d371f 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -211,6 +211,65 @@ static const uint32_t *mana_supported_ptypes(struct 
rte_eth_dev *dev __rte_unuse
return ptypes;
 }
 
+static int mana_rss_hash_update(struct rte_eth_dev *dev,
+   struct rte_eth_rss_conf *rss_conf)
+{
+   struct mana_priv *priv = dev->data->dev_private;
+
+   /* Currently can only update RSS hash when device is stopped */
+   if (dev->data->dev_started) {
+   DRV_LOG(ERR, "Can't update RSS after device has started");
+   return -ENODEV;
+   }
+
+   if (rss_conf->rss_hf & ~BNIC_ETH_RSS_SUPPORT) {
+   DRV_LOG(ERR, "Port %u invalid RSS HF 0x%" PRIx64,
+   dev->data->port_id, rss_conf->rss_hf);
+   return -EINVAL;
+   }
+
+   if (rss_conf->rss_key && rss_conf->rss_key_len) {
+   if (rss_conf->rss_key_len != TOEPLITZ_HASH_KEY_SIZE_IN_BYTES) {
+   DRV_LOG(ERR, "Port %u key len must be %u long",
+   dev->data->port_id,
+   TOEPLITZ_HASH_KEY_SIZE_IN_BYTES);
+   return -EINVAL;
+   }
+
+   priv->rss_conf.rss_key_len = rss_conf->rss_key_len;
+   priv->rss_conf.rss_key =
+   rte_zmalloc("mana_rss", rss_conf->rss_key_len,
+   RTE_CACHE_LINE_SIZE);
+   if (!priv->rss_conf.rss_key)
+   return -ENOMEM;
+   memcpy(priv->rss_conf.rss_key, rss_conf->rss_key,
+  rss_conf->rss_key_len);
+   }
+   priv->rss_conf.rss_hf = rss_conf->rss_hf;
+
+   return 0;
+}
+
+static int mana_rss_hash_conf_get(struct rte_eth_dev *dev,
+ struct rte_eth_rss_conf *rss_conf)
+{
+   struct mana_priv *priv = dev->data->dev_private;
+
+   if (!rss_conf)
+   return -EINVAL;
+
+   if (rss_conf->rss_key &&
+   rss_conf->rss_key_len >= priv->rss_conf.rss_key_len) {
+   memcpy(rss_conf->rss_key, priv->rss_conf.rss_key,
+  priv->rss_conf.rss_key_len);
+   }
+
+   rss_conf->rss_key_len = priv->rss_conf.rss_key_len;
+   rss_conf->rss_hf = priv->rss_conf.rss_hf;
+
+   return 0;
+}
+
 static int mana_dev_link_update(struct rte_eth_dev *dev,
int wait_to_complete __rte_unused)
 {
@@ -232,6 +291,8 @@ const struct eth_dev_ops mana_dev_ops = {
.dev_close  = mana_dev_close,
.dev_infos_get  = mana_dev_info_get,
.dev_supported_ptypes_get = mana_supported_ptypes,
+   .rss_hash_update= mana_rss_hash_update,
+   .rss_hash_conf_get  = mana_rss_hash_conf_get,
.link_update= mana_dev_link_update,
 };
 
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index 1c5ea9b44d..0eeb86f8e4 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -71,6 +71,7 @@ struct mana_priv {
uint8_t ind_table_key[40];
struct ibv_qp *rwq_qp;
void *db_page;
+   struct rte_eth_rss_conf rss_conf;
struct rte_intr_handle *intr_handle;
int max_rx_queues;
int max_tx_queues;
-- 
2.17.1
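
Since the hash can only be updated while the port is stopped, an application
would typically provide the RSS configuration at configure time. A minimal
sketch (hypothetical application code; a NULL key selects the PMD default):

    struct rte_eth_conf conf = { 0 };

    conf.rxmode.mq_mode = RTE_ETH_MQ_RX_RSS;
    conf.rx_adv_conf.rss_conf.rss_key = NULL;
    conf.rx_adv_conf.rss_conf.rss_hf =
        RTE_ETH_RSS_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_TCP;

    ret = rte_eth_dev_configure(port_id, nb_rxq, nb_txq, &conf);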



[Patch v7 08/18] net/mana: add function to configure RX queues

2022-09-02 Thread longli
From: Long Li 

The RX hardware queue is allocated when the queue is started. This function
handles queue configuration before starting.

Signed-off-by: Long Li 
---
 drivers/net/mana/mana.c | 68 +
 1 file changed, 68 insertions(+)

diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 2c189d371f..173b668ba2 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -196,6 +196,16 @@ static int mana_dev_info_get(struct rte_eth_dev *dev,
return 0;
 }
 
+static void mana_dev_rx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
+  struct rte_eth_rxq_info *qinfo)
+{
+   struct mana_rxq *rxq = dev->data->rx_queues[queue_id];
+
+   qinfo->mp = rxq->mp;
+   qinfo->nb_desc = rxq->num_desc;
+   qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
+}
+
 static const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev 
__rte_unused)
 {
static const uint32_t ptypes[] = {
@@ -270,6 +280,61 @@ static int mana_rss_hash_conf_get(struct rte_eth_dev *dev,
return 0;
 }
 
+static int mana_dev_rx_queue_setup(struct rte_eth_dev *dev,
+  uint16_t queue_idx, uint16_t nb_desc,
+  unsigned int socket_id,
+  const struct rte_eth_rxconf *rx_conf 
__rte_unused,
+  struct rte_mempool *mp)
+{
+   struct mana_priv *priv = dev->data->dev_private;
+   struct mana_rxq *rxq;
+   int ret;
+
+   rxq = rte_zmalloc_socket("mana_rxq", sizeof(*rxq), 0, socket_id);
+   if (!rxq) {
+   DRV_LOG(ERR, "failed to allocate rxq");
+   return -ENOMEM;
+   }
+
+   DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u",
+   queue_idx, nb_desc, socket_id);
+
+   rxq->socket = socket_id;
+
+   rxq->desc_ring = rte_zmalloc_socket("mana_rx_mbuf_ring",
+   sizeof(struct mana_rxq_desc) *
+   nb_desc,
+   RTE_CACHE_LINE_SIZE, socket_id);
+
+   if (!rxq->desc_ring) {
+   DRV_LOG(ERR, "failed to allocate rxq desc_ring");
+   ret = -ENOMEM;
+   goto fail;
+   }
+
+   rxq->num_desc = nb_desc;
+
+   rxq->priv = priv;
+   rxq->num_desc = nb_desc;
+   rxq->mp = mp;
+   dev->data->rx_queues[queue_idx] = rxq;
+
+   return 0;
+
+fail:
+   rte_free(rxq->desc_ring);
+   rte_free(rxq);
+   return ret;
+}
+
+static void mana_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
+{
+   struct mana_rxq *rxq = dev->data->rx_queues[qid];
+
+   rte_free(rxq->desc_ring);
+   rte_free(rxq);
+}
+
 static int mana_dev_link_update(struct rte_eth_dev *dev,
int wait_to_complete __rte_unused)
 {
@@ -290,9 +355,12 @@ const struct eth_dev_ops mana_dev_ops = {
.dev_configure  = mana_dev_configure,
.dev_close  = mana_dev_close,
.dev_infos_get  = mana_dev_info_get,
+   .rxq_info_get   = mana_dev_rx_queue_info,
.dev_supported_ptypes_get = mana_supported_ptypes,
.rss_hash_update= mana_rss_hash_update,
.rss_hash_conf_get  = mana_rss_hash_conf_get,
+   .rx_queue_setup = mana_dev_rx_queue_setup,
+   .rx_queue_release   = mana_dev_rx_queue_release,
.link_update= mana_dev_link_update,
 };
 
-- 
2.17.1
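
The setup above is driven by the usual ethdev call sequence; the mempool
passed in is the one the queue later posts receive buffers from. A minimal
sketch (hypothetical application code):

    struct rte_mempool *mp;

    /* one mbuf pool shared by the RX queues on this socket */
    mp = rte_pktmbuf_pool_create("rx_pool", 8192, 256, 0,
                                 RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id());

    ret = rte_eth_rx_queue_setup(port_id, queue_id, nb_rxd, rte_socket_id(),
                                 NULL /* default rx_conf */, mp);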



[Patch v7 09/18] net/mana: add function to configure TX queues

2022-09-02 Thread longli
From: Long Li 

The TX hardware queue is allocated when the queue is started; this function
handles the pre-start configuration.

Signed-off-by: Long Li 
---
 drivers/net/mana/mana.c | 65 +
 1 file changed, 65 insertions(+)

diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 173b668ba2..6ca708d26f 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -196,6 +196,15 @@ static int mana_dev_info_get(struct rte_eth_dev *dev,
return 0;
 }
 
+static void mana_dev_tx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
+   struct rte_eth_txq_info *qinfo)
+{
+   struct mana_txq *txq = dev->data->tx_queues[queue_id];
+
+   qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads;
+   qinfo->nb_desc = txq->num_desc;
+}
+
 static void mana_dev_rx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
   struct rte_eth_rxq_info *qinfo)
 {
@@ -280,6 +289,59 @@ static int mana_rss_hash_conf_get(struct rte_eth_dev *dev,
return 0;
 }
 
+static int mana_dev_tx_queue_setup(struct rte_eth_dev *dev,
+  uint16_t queue_idx, uint16_t nb_desc,
+  unsigned int socket_id,
+  const struct rte_eth_txconf *tx_conf 
__rte_unused)
+
+{
+   struct mana_priv *priv = dev->data->dev_private;
+   struct mana_txq *txq;
+   int ret;
+
+   txq = rte_zmalloc_socket("mana_txq", sizeof(*txq), 0, socket_id);
+   if (!txq) {
+   DRV_LOG(ERR, "failed to allocate txq");
+   return -ENOMEM;
+   }
+
+   txq->socket = socket_id;
+
+   txq->desc_ring = rte_malloc_socket("mana_tx_desc_ring",
+  sizeof(struct mana_txq_desc) *
+   nb_desc,
+  RTE_CACHE_LINE_SIZE, socket_id);
+   if (!txq->desc_ring) {
+   DRV_LOG(ERR, "failed to allocate txq desc_ring");
+   ret = -ENOMEM;
+   goto fail;
+   }
+
+   DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u txq->desc_ring %p",
+   queue_idx, nb_desc, socket_id, txq->desc_ring);
+
+   txq->desc_ring_head = 0;
+   txq->desc_ring_tail = 0;
+   txq->priv = priv;
+   txq->num_desc = nb_desc;
+   dev->data->tx_queues[queue_idx] = txq;
+
+   return 0;
+
+fail:
+   rte_free(txq->desc_ring);
+   rte_free(txq);
+   return ret;
+}
+
+static void mana_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
+{
+   struct mana_txq *txq = dev->data->tx_queues[qid];
+
+   rte_free(txq->desc_ring);
+   rte_free(txq);
+}
+
 static int mana_dev_rx_queue_setup(struct rte_eth_dev *dev,
   uint16_t queue_idx, uint16_t nb_desc,
   unsigned int socket_id,
@@ -355,10 +417,13 @@ const struct eth_dev_ops mana_dev_ops = {
.dev_configure  = mana_dev_configure,
.dev_close  = mana_dev_close,
.dev_infos_get  = mana_dev_info_get,
+   .txq_info_get   = mana_dev_tx_queue_info,
.rxq_info_get   = mana_dev_rx_queue_info,
.dev_supported_ptypes_get = mana_supported_ptypes,
.rss_hash_update= mana_rss_hash_update,
.rss_hash_conf_get  = mana_rss_hash_conf_get,
+   .tx_queue_setup = mana_dev_tx_queue_setup,
+   .tx_queue_release   = mana_dev_tx_queue_release,
.rx_queue_setup = mana_dev_rx_queue_setup,
.rx_queue_release   = mana_dev_rx_queue_release,
.link_update= mana_dev_link_update,
-- 
2.17.1



[Patch v7 10/18] net/mana: implement memory registration

2022-09-02 Thread longli
From: Long Li 

MANA hardware has a built-in IOMMU that provides hardware-safe access to
user memory through memory registration. Since memory registration is an
expensive operation, this patch implements a two-level memory registration
cache mechanism for each queue and for each port.

Signed-off-by: Long Li 
---
Change log:
v2:
Change all header file functions to start with mana_.
Use spinlock in place of rwlock to memory cache access.
Remove unused header files.
v4:
Remove extra "\n" in logging function.

 drivers/net/mana/mana.c  |  20 +++
 drivers/net/mana/mana.h  |  39 +
 drivers/net/mana/meson.build |   1 +
 drivers/net/mana/mp.c|  85 +
 drivers/net/mana/mr.c| 324 +++
 5 files changed, 469 insertions(+)
 create mode 100644 drivers/net/mana/mr.c

diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 6ca708d26f..7a48fa02aa 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -103,6 +103,8 @@ mana_dev_close(struct rte_eth_dev *dev)
struct mana_priv *priv = dev->data->dev_private;
int ret;
 
+   mana_remove_all_mr(priv);
+
ret = mana_intr_uninstall(priv);
if (ret)
return ret;
@@ -317,6 +319,13 @@ static int mana_dev_tx_queue_setup(struct rte_eth_dev *dev,
goto fail;
}
 
+   ret = mana_mr_btree_init(&txq->mr_btree,
+MANA_MR_BTREE_PER_QUEUE_N, socket_id);
+   if (ret) {
+   DRV_LOG(ERR, "Failed to init TXQ MR btree");
+   goto fail;
+   }
+
DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u txq->desc_ring %p",
queue_idx, nb_desc, socket_id, txq->desc_ring);
 
@@ -338,6 +347,8 @@ static void mana_dev_tx_queue_release(struct rte_eth_dev 
*dev, uint16_t qid)
 {
struct mana_txq *txq = dev->data->tx_queues[qid];
 
+   mana_mr_btree_free(&txq->mr_btree);
+
rte_free(txq->desc_ring);
rte_free(txq);
 }
@@ -374,6 +385,13 @@ static int mana_dev_rx_queue_setup(struct rte_eth_dev *dev,
goto fail;
}
 
+   ret = mana_mr_btree_init(&rxq->mr_btree,
+MANA_MR_BTREE_PER_QUEUE_N, socket_id);
+   if (ret) {
+   DRV_LOG(ERR, "Failed to init RXQ MR btree");
+   goto fail;
+   }
+
rxq->num_desc = nb_desc;
 
rxq->priv = priv;
@@ -393,6 +411,8 @@ static void mana_dev_rx_queue_release(struct rte_eth_dev 
*dev, uint16_t qid)
 {
struct mana_rxq *rxq = dev->data->rx_queues[qid];
 
+   mana_mr_btree_free(&rxq->mr_btree);
+
rte_free(rxq->desc_ring);
rte_free(rxq);
 }
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index 0eeb86f8e4..adeae1d399 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -49,6 +49,22 @@ struct mana_shared_data {
 #define MAX_RECEIVE_BUFFERS_PER_QUEUE  256
 #define MAX_SEND_BUFFERS_PER_QUEUE 256
 
+struct mana_mr_cache {
+   uint32_tlkey;
+   uintptr_t   addr;
+   size_t  len;
+   void*verb_obj;
+};
+
+#define MANA_MR_BTREE_CACHE_N  512
+struct mana_mr_btree {
+   uint16_tlen;/* Used entries */
+   uint16_tsize;   /* Total entries */
+   int overflow;
+   int socket;
+   struct mana_mr_cache *table;
+};
+
 struct mana_process_priv {
void *db_page;
 };
@@ -81,6 +97,8 @@ struct mana_priv {
int max_recv_sge;
int max_mr;
uint64_t max_mr_size;
+   struct mana_mr_btree mr_btree;
+   rte_spinlock_t  mr_btree_lock;
 };
 
 struct mana_txq_desc {
@@ -130,6 +148,7 @@ struct mana_txq {
uint32_t desc_ring_head, desc_ring_tail;
 
struct mana_stats stats;
+   struct mana_mr_btree mr_btree;
unsigned int socket;
 };
 
@@ -152,6 +171,7 @@ struct mana_rxq {
struct mana_gdma_queue gdma_cq;
 
struct mana_stats stats;
+   struct mana_mr_btree mr_btree;
 
unsigned int socket;
 };
@@ -175,6 +195,24 @@ uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct 
rte_mbuf **pkts,
 uint16_t mana_tx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
   uint16_t pkts_n);
 
+struct mana_mr_cache *mana_find_pmd_mr(struct mana_mr_btree *local_tree,
+  struct mana_priv *priv,
+  struct rte_mbuf *mbuf);
+int mana_new_pmd_mr(struct mana_mr_btree *local_tree, struct mana_priv *priv,
+   struct rte_mempool *pool);
+void mana_remove_all_mr(struct mana_priv *priv);
+void mana_del_pmd_mr(struct mana_mr_cache *mr);
+
+void mana_mempool_chunk_cb(struct rte_mempool *mp, void *opaque,
+  struct rte_mempool_memhdr *memhdr, unsigned int idx);
+
+struct mana_mr_cache *mana_mr_btree_lookup(struct mana_mr_btree *bt,
+  uint16_t *idx,
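
The two-level scheme keeps a per-queue btree (accessed without locking from
the datapath) in front of a spinlock-protected per-port btree. The lookup
flow is roughly the sketch below (simplified pseudocode with hypothetical
helper names, not the driver's exact code):

    /* per-packet MR lookup, simplified */
    mr = btree_lookup(&txq->mr_btree, addr);           /* level 1: lock-free */
    if (mr == NULL) {
        rte_spinlock_lock(&priv->mr_btree_lock);
        mr = btree_lookup(&priv->mr_btree, addr);      /* level 2: per port */
        rte_spinlock_unlock(&priv->mr_btree_lock);
        if (mr == NULL)
            mr = register_new_mr(priv, pool);          /* ibv_reg_mr, slow */
        btree_insert(&txq->mr_btree, mr);              /* cache for next time */
    }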

[Patch v7 11/18] net/mana: implement the hardware layer operations

2022-09-02 Thread longli
From: Long Li 

The hardware layer of MANA understands the device queue and doorbell
formats. These functions are implemented for use by the packet RX/TX code.

Signed-off-by: Long Li 
---
Change log:
v2:
Remove unused header files.
Rename a camel case.
v5:
Use RTE_BIT32() instead of defining a new BIT()
v6:
add rte_rmb() after reading owner bits

 drivers/net/mana/gdma.c  | 289 +++
 drivers/net/mana/mana.h  | 183 ++
 drivers/net/mana/meson.build |   1 +
 3 files changed, 473 insertions(+)
 create mode 100644 drivers/net/mana/gdma.c

diff --git a/drivers/net/mana/gdma.c b/drivers/net/mana/gdma.c
new file mode 100644
index 00..7ad175651e
--- /dev/null
+++ b/drivers/net/mana/gdma.c
@@ -0,0 +1,289 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+
+#include 
+#include 
+
+#include "mana.h"
+
+uint8_t *gdma_get_wqe_pointer(struct mana_gdma_queue *queue)
+{
+   uint32_t offset_in_bytes =
+   (queue->head * GDMA_WQE_ALIGNMENT_UNIT_SIZE) &
+   (queue->size - 1);
+
+   DRV_LOG(DEBUG, "txq sq_head %u sq_size %u offset_in_bytes %u",
+   queue->head, queue->size, offset_in_bytes);
+
+   if (offset_in_bytes + GDMA_WQE_ALIGNMENT_UNIT_SIZE > queue->size)
+   DRV_LOG(ERR, "fatal error: offset_in_bytes %u too big",
+   offset_in_bytes);
+
+   return ((uint8_t *)queue->buffer) + offset_in_bytes;
+}
+
+static uint32_t
+write_dma_client_oob(uint8_t *work_queue_buffer_pointer,
+const struct gdma_work_request *work_request,
+uint32_t client_oob_size)
+{
+   uint8_t *p = work_queue_buffer_pointer;
+
+   struct gdma_wqe_dma_oob *header = (struct gdma_wqe_dma_oob *)p;
+
+   memset(header, 0, sizeof(struct gdma_wqe_dma_oob));
+   header->num_sgl_entries = work_request->num_sgl_elements;
+   header->inline_client_oob_size_in_dwords =
+   client_oob_size / sizeof(uint32_t);
+   header->client_data_unit = work_request->client_data_unit;
+
+   DRV_LOG(DEBUG, "queue buf %p sgl %u oob_h %u du %u oob_buf %p oob_b %u",
+   work_queue_buffer_pointer, header->num_sgl_entries,
+   header->inline_client_oob_size_in_dwords,
+   header->client_data_unit, work_request->inline_oob_data,
+   work_request->inline_oob_size_in_bytes);
+
+   p += sizeof(struct gdma_wqe_dma_oob);
+   if (work_request->inline_oob_data &&
+   work_request->inline_oob_size_in_bytes > 0) {
+   memcpy(p, work_request->inline_oob_data,
+  work_request->inline_oob_size_in_bytes);
+   if (client_oob_size > work_request->inline_oob_size_in_bytes)
+   memset(p + work_request->inline_oob_size_in_bytes, 0,
+  client_oob_size -
+  work_request->inline_oob_size_in_bytes);
+   }
+
+   return sizeof(struct gdma_wqe_dma_oob) + client_oob_size;
+}
+
+static uint32_t
+write_scatter_gather_list(uint8_t *work_queue_head_pointer,
+ uint8_t *work_queue_end_pointer,
+ uint8_t *work_queue_cur_pointer,
+ struct gdma_work_request *work_request)
+{
+   struct gdma_sgl_element *sge_list;
+   struct gdma_sgl_element dummy_sgl[1];
+   uint8_t *address;
+   uint32_t size;
+   uint32_t num_sge;
+   uint32_t size_to_queue_end;
+   uint32_t sge_list_size;
+
+   DRV_LOG(DEBUG, "work_queue_cur_pointer %p work_request->flags %x",
+   work_queue_cur_pointer, work_request->flags);
+
+   num_sge = work_request->num_sgl_elements;
+   sge_list = work_request->sgl;
+   size_to_queue_end = (uint32_t)(work_queue_end_pointer -
+  work_queue_cur_pointer);
+
+   if (num_sge == 0) {
+   /* Per spec, the case of an empty SGL should be handled as
+* follows to avoid corrupted WQE errors:
+* Write one dummy SGL entry
+* Set the address to 1, leave the rest as 0
+*/
+   dummy_sgl[num_sge].address = 1;
+   dummy_sgl[num_sge].size = 0;
+   dummy_sgl[num_sge].memory_key = 0;
+   num_sge++;
+   sge_list = dummy_sgl;
+   }
+
+   sge_list_size = 0;
+   {
+   address = (uint8_t *)sge_list;
+   size = sizeof(struct gdma_sgl_element) * num_sge;
+   if (size_to_queue_end < size) {
+   memcpy(work_queue_cur_pointer, address,
+  size_to_queue_end);
+   work_queue_cur_pointer = work_queue_head_pointer;
+   address += size_to_queue_end;
+   size -= size_to_queue_end;
+   }
+
+   memcpy(work_queue_cur_pointer

[Patch v7 12/18] net/mana: add function to start/stop TX queues

2022-09-02 Thread longli
From: Long Li 

MANA allocates device queues through the IB layer when starting TX queues.
When the device is stopped, all the queues are unmapped and freed.

Signed-off-by: Long Li 
---
Change log:
v2:
Add prefix mana_ to all function names.
Remove unused header files.

 doc/guides/nics/features/mana.ini |   1 +
 drivers/net/mana/mana.h   |   4 +
 drivers/net/mana/meson.build  |   1 +
 drivers/net/mana/tx.c | 163 ++
 4 files changed, 169 insertions(+)
 create mode 100644 drivers/net/mana/tx.c

diff --git a/doc/guides/nics/features/mana.ini 
b/doc/guides/nics/features/mana.ini
index a59c21cc10..821443b292 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -7,6 +7,7 @@
 Link status  = P
 Linux= Y
 Multiprocess aware   = Y
+Queue start/stop = Y
 Removal event= Y
 RSS hash = Y
 Speed capabilities   = P
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index 764087079f..5358bdcb77 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -378,6 +378,10 @@ uint16_t mana_tx_burst_removed(void *dpdk_rxq, struct 
rte_mbuf **pkts,
 int gdma_poll_completion_queue(struct mana_gdma_queue *cq,
   struct gdma_comp *comp);
 
+int mana_start_tx_queues(struct rte_eth_dev *dev);
+
+int mana_stop_tx_queues(struct rte_eth_dev *dev);
+
 struct mana_mr_cache *mana_find_pmd_mr(struct mana_mr_btree *local_tree,
   struct mana_priv *priv,
   struct rte_mbuf *mbuf);
diff --git a/drivers/net/mana/meson.build b/drivers/net/mana/meson.build
index 364d57a619..031f443d16 100644
--- a/drivers/net/mana/meson.build
+++ b/drivers/net/mana/meson.build
@@ -11,6 +11,7 @@ deps += ['pci', 'bus_pci', 'net', 'eal', 'kvargs']
 
 sources += files(
'mana.c',
+   'tx.c',
'mr.c',
'gdma.c',
'mp.c',
diff --git a/drivers/net/mana/tx.c b/drivers/net/mana/tx.c
new file mode 100644
index 00..fbeea40ef2
--- /dev/null
+++ b/drivers/net/mana/tx.c
@@ -0,0 +1,163 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+
+#include 
+
+#include 
+#include 
+
+#include "mana.h"
+
+int mana_stop_tx_queues(struct rte_eth_dev *dev)
+{
+   struct mana_priv *priv = dev->data->dev_private;
+   int i, ret;
+
+   for (i = 0; i < priv->num_queues; i++) {
+   struct mana_txq *txq = dev->data->tx_queues[i];
+
+   if (txq->qp) {
+   ret = ibv_destroy_qp(txq->qp);
+   if (ret)
+   DRV_LOG(ERR, "tx_queue destroy_qp failed %d",
+   ret);
+   txq->qp = NULL;
+   }
+
+   if (txq->cq) {
+   ret = ibv_destroy_cq(txq->cq);
+   if (ret)
+   DRV_LOG(ERR, "tx_queue destroy_cp failed %d",
+   ret);
+   txq->cq = NULL;
+   }
+
+   /* Drain and free posted WQEs */
+   while (txq->desc_ring_tail != txq->desc_ring_head) {
+   struct mana_txq_desc *desc =
+   &txq->desc_ring[txq->desc_ring_tail];
+
+   rte_pktmbuf_free(desc->pkt);
+
+   txq->desc_ring_tail =
+   (txq->desc_ring_tail + 1) % txq->num_desc;
+   }
+   txq->desc_ring_head = 0;
+   txq->desc_ring_tail = 0;
+
+   memset(&txq->gdma_sq, 0, sizeof(txq->gdma_sq));
+   memset(&txq->gdma_cq, 0, sizeof(txq->gdma_cq));
+   }
+
+   return 0;
+}
+
+int mana_start_tx_queues(struct rte_eth_dev *dev)
+{
+   struct mana_priv *priv = dev->data->dev_private;
+   int ret, i;
+
+   /* start TX queues */
+   for (i = 0; i < priv->num_queues; i++) {
+   struct mana_txq *txq;
+   struct ibv_qp_init_attr qp_attr = { 0 };
+   struct manadv_obj obj = {};
+   struct manadv_qp dv_qp;
+   struct manadv_cq dv_cq;
+
+   txq = dev->data->tx_queues[i];
+
+   manadv_set_context_attr(priv->ib_ctx,
+   MANADV_CTX_ATTR_BUF_ALLOCATORS,
+   (void *)((uintptr_t)&(struct manadv_ctx_allocators){
+   .alloc = &mana_alloc_verbs_buf,
+   .free = &mana_free_verbs_buf,
+   .data = (void *)(uintptr_t)txq->socket,
+   }));
+
+   txq->cq = ibv_create_cq(priv->ib_ctx, txq->num_desc,
+   NULL, NULL, 0);
+   if (!txq->cq) {
+   DRV_LOG(ERR, "failed to create cq queue index %d", i);
+   ret = -errno;

[Patch v7 13/18] net/mana: add function to start/stop RX queues

2022-09-02 Thread longli
From: Long Li 

MANA allocates device queues through the IB layer when starting RX queues.
When the device is stopped, all the queues are unmapped and freed.
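
For context, the RX queues started here are created by the application
beforehand through rte_eth_rx_queue_setup(). A minimal sketch of that
setup call (the descriptor count, socket id and helper name are
assumptions, not taken from this patch):

    #include <rte_ethdev.h>
    #include <rte_mempool.h>

    /* Hypothetical setup: one RX queue backed by an existing mempool. */
    static int setup_rxq(uint16_t port_id, uint16_t rxq_id,
                         struct rte_mempool *mb_pool)
    {
            /* 512 descriptors, default rxconf, NUMA socket 0 assumed */
            return rte_eth_rx_queue_setup(port_id, rxq_id, 512, 0,
                                          NULL, mb_pool);
    }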

Signed-off-by: Long Li 
---
Change log:
v2:
Add prefix mana_ to all function names.
Remove unused header files.
v4:
Move definition of "uint32_t i" from inside "for ()" to outside

 drivers/net/mana/mana.h  |   3 +
 drivers/net/mana/meson.build |   1 +
 drivers/net/mana/rx.c| 346 +++
 3 files changed, 350 insertions(+)
 create mode 100644 drivers/net/mana/rx.c

diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index 5358bdcb77..4c37cd7df4 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -363,6 +363,7 @@ extern int mana_logtype_init;
 
 int mana_ring_doorbell(void *db_page, enum gdma_queue_types queue_type,
   uint32_t queue_id, uint32_t tail);
+int mana_rq_ring_doorbell(struct mana_rxq *rxq);
 
 int gdma_post_work_request(struct mana_gdma_queue *queue,
   struct gdma_work_request *work_req,
@@ -378,8 +379,10 @@ uint16_t mana_tx_burst_removed(void *dpdk_rxq, struct 
rte_mbuf **pkts,
 int gdma_poll_completion_queue(struct mana_gdma_queue *cq,
   struct gdma_comp *comp);
 
+int mana_start_rx_queues(struct rte_eth_dev *dev);
 int mana_start_tx_queues(struct rte_eth_dev *dev);
 
+int mana_stop_rx_queues(struct rte_eth_dev *dev);
 int mana_stop_tx_queues(struct rte_eth_dev *dev);
 
 struct mana_mr_cache *mana_find_pmd_mr(struct mana_mr_btree *local_tree,
diff --git a/drivers/net/mana/meson.build b/drivers/net/mana/meson.build
index 031f443d16..62e103a510 100644
--- a/drivers/net/mana/meson.build
+++ b/drivers/net/mana/meson.build
@@ -11,6 +11,7 @@ deps += ['pci', 'bus_pci', 'net', 'eal', 'kvargs']
 
 sources += files(
'mana.c',
+   'rx.c',
'tx.c',
'mr.c',
'gdma.c',
diff --git a/drivers/net/mana/rx.c b/drivers/net/mana/rx.c
new file mode 100644
index 00..41d0fc9f11
--- /dev/null
+++ b/drivers/net/mana/rx.c
@@ -0,0 +1,346 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+#include 
+
+#include 
+#include 
+
+#include "mana.h"
+
+static uint8_t mana_rss_hash_key_default[TOEPLITZ_HASH_KEY_SIZE_IN_BYTES] = {
+   0x2c, 0xc6, 0x81, 0xd1,
+   0x5b, 0xdb, 0xf4, 0xf7,
+   0xfc, 0xa2, 0x83, 0x19,
+   0xdb, 0x1a, 0x3e, 0x94,
+   0x6b, 0x9e, 0x38, 0xd9,
+   0x2c, 0x9c, 0x03, 0xd1,
+   0xad, 0x99, 0x44, 0xa7,
+   0xd9, 0x56, 0x3d, 0x59,
+   0x06, 0x3c, 0x25, 0xf3,
+   0xfc, 0x1f, 0xdc, 0x2a,
+};
+
+int mana_rq_ring_doorbell(struct mana_rxq *rxq)
+{
+   struct mana_priv *priv = rxq->priv;
+   int ret;
+   void *db_page = priv->db_page;
+
+   if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+   struct rte_eth_dev *dev =
+   &rte_eth_devices[priv->dev_data->port_id];
+   struct mana_process_priv *process_priv = dev->process_private;
+
+   db_page = process_priv->db_page;
+   }
+
+   ret = mana_ring_doorbell(db_page, gdma_queue_receive,
+rxq->gdma_rq.id,
+rxq->gdma_rq.head *
+   GDMA_WQE_ALIGNMENT_UNIT_SIZE);
+
+   if (ret)
+   DRV_LOG(ERR, "failed to ring RX doorbell ret %d", ret);
+
+   return ret;
+}
+
+static int mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
+{
+   struct rte_mbuf *mbuf = NULL;
+   struct gdma_sgl_element sgl[1];
+   struct gdma_work_request request = {0};
+   struct gdma_posted_wqe_info wqe_info = {0};
+   struct mana_priv *priv = rxq->priv;
+   int ret;
+   struct mana_mr_cache *mr;
+
+   mbuf = rte_pktmbuf_alloc(rxq->mp);
+   if (!mbuf) {
+   rxq->stats.nombuf++;
+   return -ENOMEM;
+   }
+
+   mr = mana_find_pmd_mr(&rxq->mr_btree, priv, mbuf);
+   if (!mr) {
+   DRV_LOG(ERR, "failed to register RX MR");
+   rte_pktmbuf_free(mbuf);
+   return -ENOMEM;
+   }
+
+   request.gdma_header.struct_size = sizeof(request);
+   wqe_info.gdma_header.struct_size = sizeof(wqe_info);
+
+   sgl[0].address = rte_cpu_to_le_64(rte_pktmbuf_mtod(mbuf, uint64_t));
+   sgl[0].memory_key = mr->lkey;
+   sgl[0].size =
+   rte_pktmbuf_data_room_size(rxq->mp) -
+   RTE_PKTMBUF_HEADROOM;
+
+   request.sgl = sgl;
+   request.num_sgl_elements = 1;
+   request.inline_oob_data = NULL;
+   request.inline_oob_size_in_bytes = 0;
+   request.flags = 0;
+   request.client_data_unit = NOT_USING_CLIENT_DATA_UNIT;
+
+   ret = gdma_post_work_request(&rxq->gdma_rq, &request, &wqe_info);
+   if (!ret) {
+   struct mana_rxq_desc *desc =
+   &rxq->desc_ring[rxq->desc_ring_head];
+
+   /* update queue for 

[Patch v7 14/18] net/mana: add function to receive packets

2022-09-02 Thread longli
From: Long Li 

With all the RX queues created, MANA can use those queues to receive
packets.
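
Once the burst function is registered, packets reach the application
through the standard ethdev receive call, which dispatches to
mana_rx_burst for this PMD. A minimal polling sketch (port and queue ids
are assumptions):

    #include <rte_ethdev.h>
    #include <rte_mbuf.h>

    static void poll_rx_once(uint16_t port_id, uint16_t queue_id)
    {
            struct rte_mbuf *pkts[32];
            uint16_t nb, i;

            nb = rte_eth_rx_burst(port_id, queue_id, pkts, 32);
            for (i = 0; i < nb; i++) {
                    /* consume pkts[i] here, then return it to the pool */
                    rte_pktmbuf_free(pkts[i]);
            }
    }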

Signed-off-by: Long Li 
---
Change log:
v2:
Add mana_ to all function names.
Rename a camel case.

 doc/guides/nics/features/mana.ini |   2 +
 drivers/net/mana/mana.c   |   2 +
 drivers/net/mana/mana.h   |  37 +++
 drivers/net/mana/mp.c |   2 +
 drivers/net/mana/rx.c | 104 ++
 5 files changed, 147 insertions(+)

diff --git a/doc/guides/nics/features/mana.ini 
b/doc/guides/nics/features/mana.ini
index 821443b292..fdbf22d335 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -6,6 +6,8 @@
 [Features]
 Link status  = P
 Linux= Y
+L3 checksum offload  = Y
+L4 checksum offload  = Y
 Multiprocess aware   = Y
 Queue start/stop = Y
 Removal event= Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 7a48fa02aa..2fd8a05658 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -950,6 +950,8 @@ static int mana_pci_probe_mac(struct rte_pci_driver 
*pci_drv __rte_unused,
	/* fd is not used after mapping doorbell */
close(fd);
 
+   eth_dev->rx_pkt_burst = mana_rx_burst;
+
rte_spinlock_lock(&mana_shared_data->lock);
mana_shared_data->secondary_cnt++;
mana_local_data.secondary_cnt++;
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index 4c37cd7df4..ddc165e62f 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -177,6 +177,11 @@ struct gdma_work_request {
 
 enum mana_cqe_type {
CQE_INVALID = 0,
+
+   CQE_RX_OKAY = 1,
+   CQE_RX_COALESCED_4  = 2,
+   CQE_RX_OBJECT_FENCE = 3,
+   CQE_RX_TRUNCATED= 4,
 };
 
 struct mana_cqe_header {
@@ -202,6 +207,35 @@ struct mana_cqe_header {
(NDIS_HASH_TCP_IPV4 | NDIS_HASH_UDP_IPV4 | NDIS_HASH_TCP_IPV6 |  \
 NDIS_HASH_UDP_IPV6 | NDIS_HASH_TCP_IPV6_EX | NDIS_HASH_UDP_IPV6_EX)
 
+struct mana_rx_comp_per_packet_info {
+   uint32_t packet_length  : 16;
+   uint32_t reserved0  : 16;
+   uint32_t reserved1;
+   uint32_t packet_hash;
+}; /* HW DATA */
+#define RX_COM_OOB_NUM_PACKETINFO_SEGMENTS 4
+
+struct mana_rx_comp_oob {
+   struct mana_cqe_header cqe_hdr;
+
+   uint32_t rx_vlan_id : 12;
+   uint32_t rx_vlan_tag_present: 1;
+   uint32_t rx_outer_ip_header_checksum_succeeded  : 1;
+   uint32_t rx_outer_ip_header_checksum_failed : 1;
+   uint32_t reserved   : 1;
+   uint32_t rx_hash_type   : 9;
+   uint32_t rx_ip_header_checksum_succeeded: 1;
+   uint32_t rx_ip_header_checksum_failed   : 1;
+   uint32_t rx_tcp_checksum_succeeded  : 1;
+   uint32_t rx_tcp_checksum_failed : 1;
+   uint32_t rx_udp_checksum_succeeded  : 1;
+   uint32_t rx_udp_checksum_failed : 1;
+   uint32_t reserved1  : 1;
+   struct mana_rx_comp_per_packet_info
+   packet_info[RX_COM_OOB_NUM_PACKETINFO_SEGMENTS];
+   uint32_t received_wqe_offset;
+}; /* HW DATA */
+
 struct gdma_wqe_dma_oob {
uint32_t reserved:24;
uint32_t last_v_bytes:8;
@@ -370,6 +404,9 @@ int gdma_post_work_request(struct mana_gdma_queue *queue,
   struct gdma_posted_wqe_info *wqe_info);
 uint8_t *gdma_get_wqe_pointer(struct mana_gdma_queue *queue);
 
+uint16_t mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **rx_pkts,
+  uint16_t pkts_n);
+
 uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
   uint16_t pkts_n);
 
diff --git a/drivers/net/mana/mp.c b/drivers/net/mana/mp.c
index f4f78d2787..36a88c561a 100644
--- a/drivers/net/mana/mp.c
+++ b/drivers/net/mana/mp.c
@@ -138,6 +138,8 @@ static int mana_mp_secondary_handle(const struct rte_mp_msg 
*mp_msg,
case MANA_MP_REQ_START_RXTX:
DRV_LOG(INFO, "Port %u starting datapath", dev->data->port_id);
 
+   dev->rx_pkt_burst = mana_rx_burst;
+
rte_mb();
 
res->result = 0;
diff --git a/drivers/net/mana/rx.c b/drivers/net/mana/rx.c
index 41d0fc9f11..f2573a6d06 100644
--- a/drivers/net/mana/rx.c
+++ b/drivers/net/mana/rx.c
@@ -344,3 +344,107 @@ int mana_start_rx_queues(struct rte_eth_dev *dev)
mana_stop_rx_queues(dev);
return ret;
 }
+
+uint16_t mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+   uint16_t pkt_received = 0, cqe_processed = 0;
+   struct mana_rxq *rxq = dpdk_rxq;
+   struc

[Patch v7 15/18] net/mana: add function to send packets

2022-09-02 Thread longli
From: Long Li 

With all the TX queues created, MANA can send packets over those queues.
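
On the application side this surfaces through rte_eth_tx_burst(). A
hedged sketch (helper name and ids are illustrative) that also covers the
case where the driver accepts fewer packets than offered:

    #include <rte_ethdev.h>
    #include <rte_mbuf.h>

    static void send_burst(uint16_t port_id, uint16_t queue_id,
                           struct rte_mbuf **pkts, uint16_t n)
    {
            uint16_t sent = rte_eth_tx_burst(port_id, queue_id, pkts, n);

            /* mbufs the driver did not accept remain owned by the caller */
            while (sent < n)
                    rte_pktmbuf_free(pkts[sent++]);
    }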

Signed-off-by: Long Li 
---
Change log:
v2: rename all camel cases.
v7: return the correct number of packets sent

 doc/guides/nics/features/mana.ini |   1 +
 drivers/net/mana/mana.c   |   1 +
 drivers/net/mana/mana.h   |  65 
 drivers/net/mana/mp.c |   1 +
 drivers/net/mana/tx.c | 248 ++
 5 files changed, 316 insertions(+)

diff --git a/doc/guides/nics/features/mana.ini 
b/doc/guides/nics/features/mana.ini
index fdbf22d335..7922816d66 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -4,6 +4,7 @@
 ; Refer to default.ini for the full list of available PMD features.
 ;
 [Features]
+Free Tx mbuf on demand = Y
 Link status  = P
 Linux= Y
 L3 checksum offload  = Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 2fd8a05658..46e064b746 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -950,6 +950,7 @@ static int mana_pci_probe_mac(struct rte_pci_driver 
*pci_drv __rte_unused,
	/* fd is not used after mapping doorbell */
close(fd);
 
+   eth_dev->tx_pkt_burst = mana_tx_burst;
eth_dev->rx_pkt_burst = mana_rx_burst;
 
rte_spinlock_lock(&mana_shared_data->lock);
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index ddc165e62f..9c17c1e4da 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -61,6 +61,47 @@ struct mana_shared_data {
 
 #define NOT_USING_CLIENT_DATA_UNIT 0
 
+enum tx_packet_format_v2 {
+   short_packet_format = 0,
+   long_packet_format = 1
+};
+
+struct transmit_short_oob_v2 {
+   enum tx_packet_format_v2 packet_format : 2;
+   uint32_t tx_is_outer_ipv4 : 1;
+   uint32_t tx_is_outer_ipv6 : 1;
+   uint32_t tx_compute_IP_header_checksum : 1;
+   uint32_t tx_compute_TCP_checksum : 1;
+   uint32_t tx_compute_UDP_checksum : 1;
+   uint32_t suppress_tx_CQE_generation : 1;
+   uint32_t VCQ_number : 24;
+   uint32_t tx_transport_header_offset : 10;
+   uint32_t VSQ_frame_num : 14;
+   uint32_t short_vport_offset : 8;
+};
+
+struct transmit_long_oob_v2 {
+   uint32_t tx_is_encapsulated_packet : 1;
+   uint32_t tx_inner_is_ipv6 : 1;
+   uint32_t tx_inner_TCP_options_present : 1;
+   uint32_t inject_vlan_prior_tag : 1;
+   uint32_t reserved1 : 12;
+   uint32_t priority_code_point : 3;
+   uint32_t drop_eligible_indicator : 1;
+   uint32_t vlan_identifier : 12;
+   uint32_t tx_inner_frame_offset : 10;
+   uint32_t tx_inner_IP_header_relative_offset : 6;
+   uint32_t long_vport_offset : 12;
+   uint32_t reserved3 : 4;
+   uint32_t reserved4 : 32;
+   uint32_t reserved5 : 32;
+};
+
+struct transmit_oob_v2 {
+   struct transmit_short_oob_v2 short_oob;
+   struct transmit_long_oob_v2 long_oob;
+};
+
 enum gdma_queue_types {
gdma_queue_type_invalid = 0,
gdma_queue_send,
@@ -182,6 +223,17 @@ enum mana_cqe_type {
CQE_RX_COALESCED_4  = 2,
CQE_RX_OBJECT_FENCE = 3,
CQE_RX_TRUNCATED= 4,
+
+   CQE_TX_OKAY = 32,
+   CQE_TX_SA_DROP  = 33,
+   CQE_TX_MTU_DROP = 34,
+   CQE_TX_INVALID_OOB  = 35,
+   CQE_TX_INVALID_ETH_TYPE = 36,
+   CQE_TX_HDR_PROCESSING_ERROR = 37,
+   CQE_TX_VF_DISABLED  = 38,
+   CQE_TX_VPORT_IDX_OUT_OF_RANGE   = 39,
+   CQE_TX_VPORT_DISABLED   = 40,
+   CQE_TX_VLAN_TAGGING_VIOLATION   = 41,
 };
 
 struct mana_cqe_header {
@@ -190,6 +242,17 @@ struct mana_cqe_header {
uint32_t vendor_err  : 24;
 }; /* HW DATA */
 
+struct mana_tx_comp_oob {
+   struct mana_cqe_header cqe_hdr;
+
+   uint32_t tx_data_offset;
+
+   uint32_t tx_sgl_offset   : 5;
+   uint32_t tx_wqe_offset   : 27;
+
+   uint32_t reserved[12];
+}; /* HW DATA */
+
 /* NDIS HASH Types */
 #define BIT(nr)(1 << (nr))
 #define NDIS_HASH_IPV4  BIT(0)
@@ -406,6 +469,8 @@ uint8_t *gdma_get_wqe_pointer(struct mana_gdma_queue 
*queue);
 
 uint16_t mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **rx_pkts,
   uint16_t pkts_n);
+uint16_t mana_tx_burst(void *dpdk_txq, struct rte_mbuf **tx_pkts,
+  uint16_t pkts_n);
 
 uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
   uint16_t pkts_n);
diff --git a/drivers/net/mana/mp.c b/drivers/net/mana/mp.c
index 36a88c561a..da9c0f36a1 100644
--- a/drivers/net/mana/mp.c
+++ b/drivers/net/mana/mp.c
@@ -138,6 +138,7 @@ static int mana_mp_secondary_handle(const struct rte_mp_msg 
*mp_msg,
	case MANA_MP_REQ_START_RXTX:

[Patch v7 16/18] net/mana: add function to start/stop device

2022-09-02 Thread longli
From: Long Li 

Add support for starting/stopping the device.
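
These callbacks back the generic ethdev start/stop entry points. A
minimal application-side sketch (port id assumed, error handling
abbreviated):

    #include <rte_ethdev.h>

    static int cycle_port(uint16_t port_id)
    {
            int ret = rte_eth_dev_start(port_id);  /* -> mana_dev_start */

            if (ret != 0)
                    return ret;
            /* ... datapath runs ... */
            return rte_eth_dev_stop(port_id);      /* -> mana_dev_stop */
    }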

Signed-off-by: Long Li 
---
Change log:
v2:
Use spinlock for memory registration cache.
Add prefix mana_ to all function names.
v6:
Roll back device state on error in mana_dev_start()

 drivers/net/mana/mana.c | 77 +
 1 file changed, 77 insertions(+)

diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 46e064b746..856683b01c 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -97,6 +97,81 @@ static int mana_dev_configure(struct rte_eth_dev *dev)
 
 static int mana_intr_uninstall(struct mana_priv *priv);
 
+static int
+mana_dev_start(struct rte_eth_dev *dev)
+{
+   int ret;
+   struct mana_priv *priv = dev->data->dev_private;
+
+   rte_spinlock_init(&priv->mr_btree_lock);
+   ret = mana_mr_btree_init(&priv->mr_btree, MANA_MR_BTREE_CACHE_N,
+dev->device->numa_node);
+   if (ret) {
+   DRV_LOG(ERR, "Failed to init device MR btree %d", ret);
+   return ret;
+   }
+
+   ret = mana_start_tx_queues(dev);
+   if (ret) {
+   DRV_LOG(ERR, "failed to start tx queues %d", ret);
+   goto failed_tx;
+   }
+
+   ret = mana_start_rx_queues(dev);
+   if (ret) {
+   DRV_LOG(ERR, "failed to start rx queues %d", ret);
+   goto failed_rx;
+   }
+
+   rte_wmb();
+
+   dev->tx_pkt_burst = mana_tx_burst;
+   dev->rx_pkt_burst = mana_rx_burst;
+
+   DRV_LOG(INFO, "TX/RX queues have started");
+
+   /* Enable datapath for secondary processes */
+   mana_mp_req_on_rxtx(dev, MANA_MP_REQ_START_RXTX);
+
+   return 0;
+
+failed_rx:
+   mana_stop_tx_queues(dev);
+
+failed_tx:
+   mana_mr_btree_free(&priv->mr_btree);
+
+   return ret;
+}
+
+static int
+mana_dev_stop(struct rte_eth_dev *dev)
+{
+   int ret;
+
+   dev->tx_pkt_burst = mana_tx_burst_removed;
+   dev->rx_pkt_burst = mana_rx_burst_removed;
+
+   /* Stop datapath on secondary processes */
+   mana_mp_req_on_rxtx(dev, MANA_MP_REQ_STOP_RXTX);
+
+   rte_wmb();
+
+   ret = mana_stop_tx_queues(dev);
+   if (ret) {
+   DRV_LOG(ERR, "failed to stop tx queues");
+   return ret;
+   }
+
+   ret = mana_stop_rx_queues(dev);
+   if (ret) {
+   DRV_LOG(ERR, "failed to stop rx queues");
+   return ret;
+   }
+
+   return 0;
+}
+
 static int
 mana_dev_close(struct rte_eth_dev *dev)
 {
@@ -435,6 +510,8 @@ static int mana_dev_link_update(struct rte_eth_dev *dev,
 
 const struct eth_dev_ops mana_dev_ops = {
.dev_configure  = mana_dev_configure,
+   .dev_start  = mana_dev_start,
+   .dev_stop   = mana_dev_stop,
.dev_close  = mana_dev_close,
.dev_infos_get  = mana_dev_info_get,
.txq_info_get   = mana_dev_tx_queue_info,
-- 
2.17.1



[Patch v7 17/18] net/mana: add function to report queue stats

2022-09-02 Thread longli
From: Long Li 

Report packet statistics.
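
The new callbacks are reachable through the usual ethdev stats API. A
short usage sketch (the port id and helper name are assumptions):

    #include <stdio.h>
    #include <inttypes.h>
    #include <rte_ethdev.h>

    static void dump_stats(uint16_t port_id)
    {
            struct rte_eth_stats stats;

            if (rte_eth_stats_get(port_id, &stats) == 0)
                    printf("rx %" PRIu64 " tx %" PRIu64 " nombuf %" PRIu64 "\n",
                           stats.ipackets, stats.opackets, stats.rx_nombuf);
            /* zeroes the per-queue counters kept by the PMD */
            rte_eth_stats_reset(port_id);
    }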

Signed-off-by: Long Li 
---
Change log:
v5:
Fixed calculation of stats packets/bytes/errors by adding them over the queue 
stats.

 doc/guides/nics/features/mana.ini |  2 +
 drivers/net/mana/mana.c   | 77 +++
 2 files changed, 79 insertions(+)

diff --git a/doc/guides/nics/features/mana.ini 
b/doc/guides/nics/features/mana.ini
index 7922816d66..b2729aba3a 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -4,6 +4,7 @@
 ; Refer to default.ini for the full list of available PMD features.
 ;
 [Features]
+Basic stats  = Y
 Free Tx mbuf on demand = Y
 Link status  = P
 Linux= Y
@@ -14,5 +15,6 @@ Queue start/stop = Y
 Removal event= Y
 RSS hash = Y
 Speed capabilities   = P
+Stats per queue  = Y
 Usage doc= Y
 x86-64   = Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 856683b01c..e370cc58e3 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -508,6 +508,79 @@ static int mana_dev_link_update(struct rte_eth_dev *dev,
return rte_eth_linkstatus_set(dev, &link);
 }
 
+static int mana_dev_stats_get(struct rte_eth_dev *dev,
+ struct rte_eth_stats *stats)
+{
+   unsigned int i;
+
+   for (i = 0; i < dev->data->nb_tx_queues; i++) {
+   struct mana_txq *txq = dev->data->tx_queues[i];
+
+   if (!txq)
+   continue;
+
+   stats->opackets += txq->stats.packets;
+   stats->obytes += txq->stats.bytes;
+   stats->oerrors += txq->stats.errors;
+
+   if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
+   stats->q_opackets[i] = txq->stats.packets;
+   stats->q_obytes[i] = txq->stats.bytes;
+   }
+   }
+
+   stats->rx_nombuf = 0;
+   for (i = 0; i < dev->data->nb_rx_queues; i++) {
+   struct mana_rxq *rxq = dev->data->rx_queues[i];
+
+   if (!rxq)
+   continue;
+
+   stats->ipackets += rxq->stats.packets;
+   stats->ibytes += rxq->stats.bytes;
+   stats->ierrors += rxq->stats.errors;
+
+   /* There is no good way to get stats->imissed, not setting it */
+
+   if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
+   stats->q_ipackets[i] = rxq->stats.packets;
+   stats->q_ibytes[i] = rxq->stats.bytes;
+   }
+
+   stats->rx_nombuf += rxq->stats.nombuf;
+   }
+
+   return 0;
+}
+
+static int
+mana_dev_stats_reset(struct rte_eth_dev *dev)
+{
+   unsigned int i;
+
+   PMD_INIT_FUNC_TRACE();
+
+   for (i = 0; i < dev->data->nb_tx_queues; i++) {
+   struct mana_txq *txq = dev->data->tx_queues[i];
+
+   if (!txq)
+   continue;
+
+   memset(&txq->stats, 0, sizeof(txq->stats));
+   }
+
+   for (i = 0; i < dev->data->nb_rx_queues; i++) {
+   struct mana_rxq *rxq = dev->data->rx_queues[i];
+
+   if (!rxq)
+   continue;
+
+   memset(&rxq->stats, 0, sizeof(rxq->stats));
+   }
+
+   return 0;
+}
+
 const struct eth_dev_ops mana_dev_ops = {
.dev_configure  = mana_dev_configure,
.dev_start  = mana_dev_start,
@@ -524,9 +597,13 @@ const struct eth_dev_ops mana_dev_ops = {
.rx_queue_setup = mana_dev_rx_queue_setup,
.rx_queue_release   = mana_dev_rx_queue_release,
.link_update= mana_dev_link_update,
+   .stats_get  = mana_dev_stats_get,
+   .stats_reset= mana_dev_stats_reset,
 };
 
 const struct eth_dev_ops mana_dev_sec_ops = {
+   .stats_get = mana_dev_stats_get,
+   .stats_reset = mana_dev_stats_reset,
.dev_infos_get = mana_dev_info_get,
 };
 
-- 
2.17.1



[Patch v7 18/18] net/mana: add function to support RX interrupts

2022-09-02 Thread longli
From: Long Li 

MANA can receive RX interrupts from the kernel through the RDMA verbs interface.
Implement RX interrupts in the driver.
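
From an application, this path is driven through the ethdev RX interrupt
API. A hedged sketch, assuming the port was configured with
dev_conf.intr_conf.rxq = 1 before rte_eth_dev_start(); the helper name
and ids are illustrative:

    #include <rte_ethdev.h>
    #include <rte_interrupts.h>

    static int wait_for_rx(uint16_t port_id, uint16_t queue_id)
    {
            struct rte_epoll_event ev;
            int ret;

            /* register the queue's event fd with this thread's epoll set */
            ret = rte_eth_dev_rx_intr_ctl_q(port_id, queue_id,
                                            RTE_EPOLL_PER_THREAD,
                                            RTE_INTR_EVENT_ADD, NULL);
            if (ret != 0)
                    return ret;

            rte_eth_dev_rx_intr_enable(port_id, queue_id);  /* arm the CQ */
            rte_epoll_wait(RTE_EPOLL_PER_THREAD, &ev, 1, -1);
            rte_eth_dev_rx_intr_disable(port_id, queue_id);
            /* drain with rte_eth_rx_burst() before re-arming */
            return 0;
    }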

Signed-off-by: Long Li 
---
Change log:
v5:
New patch added to the series

 doc/guides/nics/features/mana.ini |   1 +
 drivers/net/mana/gdma.c   |  10 +--
 drivers/net/mana/mana.c   | 125 ++
 drivers/net/mana/mana.h   |  13 +++-
 drivers/net/mana/rx.c |  91 +++---
 drivers/net/mana/tx.c |   3 +-
 6 files changed, 207 insertions(+), 36 deletions(-)

diff --git a/doc/guides/nics/features/mana.ini 
b/doc/guides/nics/features/mana.ini
index b2729aba3a..42d78ac6b1 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -14,6 +14,7 @@ Multiprocess aware   = Y
 Queue start/stop = Y
 Removal event= Y
 RSS hash = Y
+Rx interrupt = Y
 Speed capabilities   = P
 Stats per queue  = Y
 Usage doc= Y
diff --git a/drivers/net/mana/gdma.c b/drivers/net/mana/gdma.c
index 7ad175651e..275520bff5 100644
--- a/drivers/net/mana/gdma.c
+++ b/drivers/net/mana/gdma.c
@@ -204,7 +204,7 @@ union gdma_doorbell_entry {
 #define DOORBELL_OFFSET_EQ  0xFF8
 
 int mana_ring_doorbell(void *db_page, enum gdma_queue_types queue_type,
-  uint32_t queue_id, uint32_t tail)
+  uint32_t queue_id, uint32_t tail, uint8_t arm)
 {
uint8_t *addr = db_page;
union gdma_doorbell_entry e = {};
@@ -219,14 +219,14 @@ int mana_ring_doorbell(void *db_page, enum 
gdma_queue_types queue_type,
case gdma_queue_receive:
e.rq.id = queue_id;
e.rq.tail_ptr = tail;
-   e.rq.wqe_cnt = 1;
+   e.rq.wqe_cnt = arm;
addr += DOORBELL_OFFSET_RQ;
break;
 
case gdma_queue_completion:
e.cq.id = queue_id;
e.cq.tail_ptr = tail;
-   e.cq.arm = 1;
+   e.cq.arm = arm;
addr += DOORBELL_OFFSET_CQ;
break;
 
@@ -238,8 +238,8 @@ int mana_ring_doorbell(void *db_page, enum gdma_queue_types 
queue_type,
/* Ensure all writes are done before ringing doorbell */
rte_wmb();
 
-   DRV_LOG(DEBUG, "db_page %p addr %p queue_id %u type %u tail %u",
-   db_page, addr, queue_id, queue_type, tail);
+   DRV_LOG(DEBUG, "db_page %p addr %p queue_id %u type %u tail %u arm %u",
+   db_page, addr, queue_id, queue_type, tail, arm);
 
rte_write64(e.as_uint64, addr);
return 0;
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index e370cc58e3..c80737fcbe 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -95,7 +95,68 @@ static int mana_dev_configure(struct rte_eth_dev *dev)
return 0;
 }
 
-static int mana_intr_uninstall(struct mana_priv *priv);
+static void rx_intr_vec_disable(struct mana_priv *priv)
+{
+   struct rte_intr_handle *intr_handle = priv->intr_handle;
+
+   rte_intr_free_epoll_fd(intr_handle);
+   rte_intr_vec_list_free(intr_handle);
+   rte_intr_nb_efd_set(intr_handle, 0);
+}
+
+static int rx_intr_vec_enable(struct mana_priv *priv)
+{
+   unsigned int i;
+   unsigned int rxqs_n = priv->dev_data->nb_rx_queues;
+   unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
+   struct rte_intr_handle *intr_handle = priv->intr_handle;
+   int ret;
+
+   rx_intr_vec_disable(priv);
+
+   if (rte_intr_vec_list_alloc(intr_handle, NULL, n)) {
+   DRV_LOG(ERR, "Failed to allocate memory for interrupt vector");
+   return -ENOMEM;
+   }
+
+   for (i = 0; i < n; i++) {
+   struct mana_rxq *rxq = priv->dev_data->rx_queues[i];
+
+   ret = rte_intr_vec_list_index_set(intr_handle, i,
+ RTE_INTR_VEC_RXTX_OFFSET + i);
+   if (ret) {
+   DRV_LOG(ERR, "Failed to set intr vec %u", i);
+   return ret;
+   }
+
+   ret = rte_intr_efds_index_set(intr_handle, i, rxq->channel->fd);
+   if (ret) {
+   DRV_LOG(ERR, "Failed to set FD at intr %u", i);
+   return ret;
+   }
+   }
+
+   return rte_intr_nb_efd_set(intr_handle, n);
+}
+
+static void rxq_intr_disable(struct mana_priv *priv)
+{
+   int err = rte_errno;
+
+   rx_intr_vec_disable(priv);
+   rte_errno = err;
+}
+
+static int rxq_intr_enable(struct mana_priv *priv)
+{
+   const struct rte_eth_intr_conf *const intr_conf =
+   &priv->dev_data->dev_conf.intr_conf;
+
+   if (!intr_conf->rxq)
+   return 0;
+
+   return rx_intr_vec_enable(priv);
+}
 
 static int
 mana_dev_start(struct rte_eth_dev *dev)
@@ -133,8 +194,17 @@ mana_dev_start(struct rte_eth_dev *dev)
	/* Enable datapath for secondary processes */