Re: [PATCH v5 2/7] bbdev: add device status info

2022-08-30 Thread Maxime Coquelin




On 8/29/22 18:10, Chautru, Nicolas wrote:

Hi Maxime,


-Original Message-
From: Maxime Coquelin 
Sent: Friday, August 26, 2022 3:13 AM
To: Chautru, Nicolas ; dev@dpdk.org;
tho...@monjalon.net; gak...@marvell.com; hemant.agra...@nxp.com
Cc: t...@redhat.com; m...@ashroe.eu; Richardson, Bruce
; david.march...@redhat.com;
step...@networkplumber.org
Subject: Re: [PATCH v5 2/7] bbdev: add device status info

Hi,

On 8/25/22 20:30, Chautru, Nicolas wrote:

Thanks Maxime,


-Original Message-
From: Maxime Coquelin 
Sent: Thursday, August 25, 2022 7:19 AM
To: Chautru, Nicolas ; dev@dpdk.org;
tho...@monjalon.net; gak...@marvell.com; hemant.agra...@nxp.com
Cc: t...@redhat.com; m...@ashroe.eu; Richardson, Bruce
; david.march...@redhat.com;
step...@networkplumber.org
Subject: Re: [PATCH v5 2/7] bbdev: add device status info



On 7/7/22 01:28, Nicolas Chautru wrote:

Added device status information, so that the PMD can expose
information related to the underlying accelerator device status.
Also a minor ordering change in the structure, to fit the new field into a
padding hole.

Signed-off-by: Nicolas Chautru 
---
drivers/baseband/acc100/rte_acc100_pmd.c   |  1 +
drivers/baseband/fpga_5gnr_fec/rte_fpga_5gnr_fec.c |  1 +
drivers/baseband/fpga_lte_fec/fpga_lte_fec.c   |  1 +
drivers/baseband/la12xx/bbdev_la12xx.c |  1 +
drivers/baseband/null/bbdev_null.c |  1 +
drivers/baseband/turbo_sw/bbdev_turbo_software.c   |  1 +
lib/bbdev/rte_bbdev.c  | 22 ++
lib/bbdev/rte_bbdev.h  | 35 
--
lib/bbdev/version.map  |  6 
9 files changed, 67 insertions(+), 2 deletions(-)

diff --git a/drivers/baseband/acc100/rte_acc100_pmd.c
b/drivers/baseband/acc100/rte_acc100_pmd.c
index de7e4bc..17ba798 100644
--- a/drivers/baseband/acc100/rte_acc100_pmd.c
+++ b/drivers/baseband/acc100/rte_acc100_pmd.c
@@ -1060,6 +1060,7 @@

/* Read and save the populated config from ACC100 registers */
fetch_acc100_config(dev);
+   dev_info->device_status = RTE_BBDEV_DEV_NOT_SUPPORTED;

/* This isn't ideal because it reports the maximum number of queues but
 * does not provide info on how many can be uplink/downlink or different
diff --git a/drivers/baseband/fpga_5gnr_fec/rte_fpga_5gnr_fec.c
b/drivers/baseband/fpga_5gnr_fec/rte_fpga_5gnr_fec.c
index 82ae6ba..57b12af 100644
--- a/drivers/baseband/fpga_5gnr_fec/rte_fpga_5gnr_fec.c
+++ b/drivers/baseband/fpga_5gnr_fec/rte_fpga_5gnr_fec.c
@@ -369,6 +369,7 @@
dev_info->capabilities = bbdev_capabilities;
dev_info->cpu_flag_reqs = NULL;
dev_info->data_endianness = RTE_LITTLE_ENDIAN;
+   dev_info->device_status = RTE_BBDEV_DEV_NOT_SUPPORTED;

/* Calculates number of queues assigned to device */
dev_info->max_num_queues = 0;
diff --git a/drivers/baseband/fpga_lte_fec/fpga_lte_fec.c
b/drivers/baseband/fpga_lte_fec/fpga_lte_fec.c
index 21d3529..2a330c4 100644
--- a/drivers/baseband/fpga_lte_fec/fpga_lte_fec.c
+++ b/drivers/baseband/fpga_lte_fec/fpga_lte_fec.c
@@ -645,6 +645,7 @@ struct __rte_cache_aligned fpga_queue {
dev_info->capabilities = bbdev_capabilities;
dev_info->cpu_flag_reqs = NULL;
dev_info->data_endianness = RTE_LITTLE_ENDIAN;
+   dev_info->device_status = RTE_BBDEV_DEV_NOT_SUPPORTED;

/* Calculates number of queues assigned to device */
dev_info->max_num_queues = 0;
diff --git a/drivers/baseband/la12xx/bbdev_la12xx.c
b/drivers/baseband/la12xx/bbdev_la12xx.c
index 4d1bd16..c1f88c6 100644
--- a/drivers/baseband/la12xx/bbdev_la12xx.c
+++ b/drivers/baseband/la12xx/bbdev_la12xx.c
@@ -100,6 +100,7 @@ struct bbdev_la12xx_params {
dev_info->capabilities = bbdev_capabilities;
dev_info->cpu_flag_reqs = NULL;
dev_info->min_alignment = 64;
+   dev_info->device_status = RTE_BBDEV_DEV_NOT_SUPPORTED;

rte_bbdev_log_debug("got device info from %u", dev->data->dev_id);
}
diff --git a/drivers/baseband/null/bbdev_null.c
b/drivers/baseband/null/bbdev_null.c
index 248e129..94a1976 100644
--- a/drivers/baseband/null/bbdev_null.c
+++ b/drivers/baseband/null/bbdev_null.c
@@ -82,6 +82,7 @@ struct bbdev_queue {
 * here for code completeness.
 */
dev_info->data_endianness = RTE_LITTLE_ENDIAN;
+   dev_info->device_status = RTE_BBDEV_DEV_NOT_SUPPORTED;

rte_bbdev_log_debug("got device info from %u", dev->data->dev_id);
}
diff --git a/drivers/baseband/turbo_sw/bbdev_turbo_software.c
b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
index af7bc41..dbc5524 100644
--- a/drivers/baseband/turbo_sw/bbdev_turbo_software.c
+++ b/drivers/baseband/turbo_sw/bbdev_turbo_software.c
@@ -254,6 +254,7 @@ struct turbo_sw_queue {
dev_info->min_alignment = 64;
dev_info->harq_buffer_size = 0;
dev_info->data_endianness = RTE_LITTLE_ENDIAN;
+   dev_info->device_status = RTE_BBDEV_DEV_NOT_SUPPORTED;
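
A minimal sketch (editor's illustration, not part of the patch) of how an
application could consume the new field; rte_bbdev_info_get() and the
RTE_BBDEV_DEV_NOT_SUPPORTED value are from the bbdev API and this series:

#include <stdio.h>
#include <rte_bbdev.h>

static void
show_device_status(uint16_t dev_id)
{
	struct rte_bbdev_info info;

	if (rte_bbdev_info_get(dev_id, &info) != 0)
		return;
	/* device_status is the field this patch adds to rte_bbdev_driver_info */
	if (info.drv.device_status == RTE_BBDEV_DEV_NOT_SUPPORTED)
		printf("bbdev %u: driver does not report accelerator status\n",
				dev_id);
}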

RE: [PATCH v2] net/octeon_ep: support CN10K SoC

2022-08-30 Thread Sathesh B Edara
This patch has the following dependency:
Depends-on: https://patches.dpdk.org/project/dpdk/list/?series=24103



Re: [PATCH v1 00/10] baseband/acc200

2022-08-30 Thread Maxime Coquelin

Hi Nicolas,

On 7/12/22 15:48, Maxime Coquelin wrote:

Hi Nicolas, Hernan,

(Adding Hernan in the recipients list)

On 7/8/22 02:01, Nicolas Chautru wrote:

This is targeting 22.11 and includes the PMD for the
integrated accelerator on Intel Xeon SPR-EEC.
There is a dependency on a parallel series still in flight,
which extends the bbdev API:
https://patches.dpdk.org/project/dpdk/list/?series=23894


I will be offline for a few weeks for the summer break but
Hernan will cover for me during that time if required.

Thanks
Nic

Nicolas Chautru (10):
   baseband/acc200: introduce PMD for ACC200
   baseband/acc200: add HW register definitions
   baseband/acc200: add info get function
   baseband/acc200: add queue configuration
   baseband/acc200: add LDPC processing functions
   baseband/acc200: add LTE processing functions
   baseband/acc200: add support for FFT operations
   baseband/acc200: support interrupt
   baseband/acc200: add device status and vf2pf comms
   baseband/acc200: add PF configure companion function

  MAINTAINERS  |    3 +
  app/test-bbdev/meson.build   |    3 +
  app/test-bbdev/test_bbdev_perf.c |   76 +
  doc/guides/bbdevs/acc200.rst |  244 ++
  doc/guides/bbdevs/index.rst  |    1 +
  drivers/baseband/acc200/acc200_pf_enum.h |  468 +++
  drivers/baseband/acc200/acc200_pmd.h |  690 
  drivers/baseband/acc200/acc200_vf_enum.h |   89 +
  drivers/baseband/acc200/meson.build  |    8 +
  drivers/baseband/acc200/rte_acc200_cfg.h |  115 +
  drivers/baseband/acc200/rte_acc200_pmd.c | 5403 
++

  drivers/baseband/acc200/version.map  |   10 +
  drivers/baseband/meson.build |    1 +
  13 files changed, 7111 insertions(+)
  create mode 100644 doc/guides/bbdevs/acc200.rst
  create mode 100644 drivers/baseband/acc200/acc200_pf_enum.h
  create mode 100644 drivers/baseband/acc200/acc200_pmd.h
  create mode 100644 drivers/baseband/acc200/acc200_vf_enum.h
  create mode 100644 drivers/baseband/acc200/meson.build
  create mode 100644 drivers/baseband/acc200/rte_acc200_cfg.h
  create mode 100644 drivers/baseband/acc200/rte_acc200_pmd.c
  create mode 100644 drivers/baseband/acc200/version.map



Comparing the ACC200 & ACC100 header files, I understand ACC200 is an
evolution of the ACC10x family. The FEC bits are really close; ACC200's
main addition seems to be FFT acceleration, which could be handled in the
ACC10x driver based on device ID.

I think both drivers have to be merged in order to avoid code
duplication. That's how other families of devices (e.g. i40e) are
handled.


I haven't seen your reply on this point.
Do you confirm you are working on a single driver for ACC family in
order to avoid code duplication?

Maxime


Thanks,
Maxime




RE: [PATCH 1/2] net/iavf: enable TSO offloading for tunnel cases

2022-08-30 Thread Yang, Qiming
Please retest with TCP/UDP/tunnel-TCP/tunnel-UDP packets.

> -Original Message-
> From: peng1x.zh...@intel.com 
> Sent: Saturday, August 13, 2022 12:52 AM
> To: dev@dpdk.org
> Cc: Xing, Beilei ; Wu, Jingjing 
> ;
> Zhang, Peng1X 
> Subject: [PATCH 1/2] net/iavf: enable TSO offloading for tunnel cases

This should be a bug-fix patch.

> 
> From: Peng Zhang 
> 
No need for this line.

> The hardware limits the max buffer size per Tx descriptor to (16K-1) bytes.
> So when TSO is enabled in the unencrypted scenario, the mbuf data size may
> exceed the limit and cause erroneous behavior on the NIC.

So this patch is actually fixing tunnel TSO not being enabled.

> 
> This patch supports Tx descriptors for this kind of large buffer.
> 
> Signed-off-by: Peng Zhang 
> ---
>  drivers/net/iavf/iavf_rxtx.c | 66 
>  1 file changed, 60 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c index
> dfd021889e..adec58e90a 100644
> --- a/drivers/net/iavf/iavf_rxtx.c
> +++ b/drivers/net/iavf/iavf_rxtx.c
> @@ -2642,6 +2642,47 @@ iavf_ipsec_crypto_get_pkt_metadata(const struct
> iavf_tx_queue *txq,
>   return NULL;
>  }
> 
> +/* HW requires that TX buffer size ranges from 1B up to (16K-1)B. */
> +#define IAVF_MAX_DATA_PER_TXD \
> + (IAVF_TXD_QW1_TX_BUF_SZ_MASK >>
> IAVF_TXD_QW1_TX_BUF_SZ_SHIFT)
> +
> +static inline void
> +iavf_fill_unencrypt_desc(volatile struct iavf_tx_desc *txd, struct rte_mbuf
> *m,
> +  volatile uint64_t desc_template, struct iavf_tx_entry
> *txe,
> +  volatile struct iavf_tx_desc *txr, struct iavf_tx_entry
> *txe_ring,
> +  int desc_idx_last)
> +{
> + /* Setup TX Descriptor */
> + int desc_idx;
> + uint16_t slen = m->data_len;
> + uint64_t buf_dma_addr = rte_mbuf_data_iova(m);
> + struct iavf_tx_entry *txn = &txe_ring[txe->next_id];
> +
> + while ((m->ol_flags & RTE_MBUF_F_TX_TCP_SEG) &&

??? Lacking UDP (RTE_MBUF_F_TX_UDP_SEG)?

> + unlikely(slen > IAVF_MAX_DATA_PER_TXD)) {
> + txd->buffer_addr =
> rte_cpu_to_le_64(buf_dma_addr);
> +
> + txd->cmd_type_offset_bsz =
> + rte_cpu_to_le_64(IAVF_TX_DESC_DTYPE_DATA |
> + (uint64_t)IAVF_MAX_DATA_PER_TXD <<
> + IAVF_TXD_DATA_QW1_TX_BUF_SZ_SHIFT) |
> desc_template;
> +
> + buf_dma_addr += IAVF_MAX_DATA_PER_TXD;
> + slen -= IAVF_MAX_DATA_PER_TXD;
> +
> + txe->last_id = desc_idx_last;
> + desc_idx = txe->next_id;
> + txe = txn;
> + txd = &txr[desc_idx];
> + txn = &txe_ring[txe->next_id];
> + }
> +
> + txd->buffer_addr = rte_cpu_to_le_64(buf_dma_addr);
> + txd->cmd_type_offset_bsz =
> + rte_cpu_to_le_64((uint64_t)slen <<
> IAVF_TXD_DATA_QW1_TX_BUF_SZ_SHIFT) |
> + desc_template;
> +}
> +
>  /* TX function */
>  uint16_t
>  iavf_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
> @@ -2650,6 +2691,7 @@ iavf_xmit_pkts(void *tx_queue, struct rte_mbuf
> **tx_pkts, uint16_t nb_pkts)
>   volatile struct iavf_tx_desc *txr = txq->tx_ring;
>   struct iavf_tx_entry *txe_ring = txq->sw_ring;
>   struct iavf_tx_entry *txe, *txn;
> + volatile struct iavf_tx_desc *txd;
>   struct rte_mbuf *mb, *mb_seg;
>   uint16_t desc_idx, desc_idx_last;
>   uint16_t idx;
> @@ -2781,6 +2823,7 @@ iavf_xmit_pkts(void *tx_queue, struct rte_mbuf
> **tx_pkts, uint16_t nb_pkts)
>   ddesc = (volatile struct iavf_tx_desc *)
>   &txr[desc_idx];
> 
> + txd = &txr[desc_idx];
>   txn = &txe_ring[txe->next_id];
>   RTE_MBUF_PREFETCH_TO_FREE(txn->mbuf);
> 
> @@ -2788,10 +2831,16 @@ iavf_xmit_pkts(void *tx_queue, struct rte_mbuf
> **tx_pkts, uint16_t nb_pkts)
>   rte_pktmbuf_free_seg(txe->mbuf);
> 
>   txe->mbuf = mb_seg;
> - iavf_fill_data_desc(ddesc, mb_seg,
> - ddesc_template, tlen, ipseclen);
> 
> - IAVF_DUMP_TX_DESC(txq, ddesc, desc_idx);
> + if (nb_desc_ipsec) {
> + iavf_fill_data_desc(ddesc, mb_seg,
> + ddesc_template, tlen, ipseclen);
> + IAVF_DUMP_TX_DESC(txq, ddesc, desc_idx);
> + } else {
> + iavf_fill_unencrypt_desc(txd, mb_seg,
> + ddesc_template, txe, txr, txe_ring,
> desc_idx_last);
> + IAVF_DUMP_TX_DESC(txq, txd, desc_idx);
> + }
> 
> 
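As a worked example of the split loop above (standalone editor's sketch; the
constant mirrors IAVF_MAX_DATA_PER_TXD = (16K-1)B from the patch):

#include <stdio.h>

#define MAX_DATA_PER_TXD 16383 /* (16K-1)B, mirrors IAVF_MAX_DATA_PER_TXD */

int main(void)
{
	unsigned int slen = 40000; /* one oversized TSO mbuf segment */
	unsigned int nb_txd = 1;   /* final descriptor for the remainder */

	while (slen > MAX_DATA_PER_TXD) {
		slen -= MAX_DATA_PER_TXD;
		nb_txd++;
	}
	/* prints: 3 descriptors, last one carries 7234 bytes */
	printf("%u descriptors, last one carries %u bytes\n", nb_txd, slen);
	return 0;
}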

RE: [PATCH 2/2] net/iavf: support inner and outer checksum offload

2022-08-30 Thread Yang, Qiming



> -Original Message-
> From: peng1x.zh...@intel.com 
> Sent: Saturday, August 13, 2022 12:52 AM
> To: dev@dpdk.org
> Cc: Xing, Beilei ; Wu, Jingjing 
> ;
> Zhang, Peng1X 
> Subject: [PATCH 2/2] net/iavf: support inner and outer checksum offload
> 
> From: Peng Zhang 

No need for this line.

> 
> Add the support of inner and outer Tx checksum offload for tunneling
> packets by configuring tunneling parameters in Tx descriptors, including
> outer L3/L4 checksum offload.

Enable inner and outer Tx checksum offload for tunnel packets by configuring
ol_flags.

> 
> Signed-off-by: Peng Zhang 
> ---
>  drivers/net/iavf/iavf_ethdev.c |  3 +-
>  drivers/net/iavf/iavf_rxtx.c   | 51 +++---
>  drivers/net/iavf/iavf_rxtx.h   |  8 +-
>  3 files changed, 56 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/net/iavf/iavf_ethdev.c b/drivers/net/iavf/iavf_ethdev.c
> index 506fcff6e3..59238ecceb 100644
> --- a/drivers/net/iavf/iavf_ethdev.c
> +++ b/drivers/net/iavf/iavf_ethdev.c
> @@ -1140,7 +1140,8 @@ iavf_dev_info_get(struct rte_eth_dev *dev, struct
> rte_eth_dev_info *dev_info)
>   RTE_ETH_TX_OFFLOAD_IPIP_TNL_TSO |
>   RTE_ETH_TX_OFFLOAD_GENEVE_TNL_TSO |
>   RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
> - RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
> + RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE |
> + RTE_ETH_TX_OFFLOAD_OUTER_UDP_CKSUM;

Add this line after OUTER_IPV4_CKSUM.

> 
>   if (vf->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_CRC)
>   dev_info->rx_offload_capa |=
> RTE_ETH_RX_OFFLOAD_KEEP_CRC; diff --git a/drivers/net/iavf/iavf_rxtx.c
> b/drivers/net/iavf/iavf_rxtx.c index adec58e90a..7cbebafc09 100644
> --- a/drivers/net/iavf/iavf_rxtx.c
> +++ b/drivers/net/iavf/iavf_rxtx.c
> @@ -2334,7 +2334,7 @@ static inline uint16_t
> iavf_calc_context_desc(uint64_t flags, uint8_t vlan_flag)  {
>   if (flags & (RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_UDP_SEG |
> - RTE_MBUF_F_TX_TUNNEL_MASK))
> + RTE_MBUF_F_TX_TUNNEL_MASK |
> RTE_MBUF_F_TX_OUTER_IP_CKSUM))

OUTER_UDP_CKSUM?

>   return 1;
>   if (flags & RTE_MBUF_F_TX_VLAN &&
>   vlan_flag & IAVF_TX_FLAGS_VLAN_TAG_LOC_L2TAG2)
> @@ -2399,6 +2399,44 @@ iavf_fill_ctx_desc_tunnelling_field(volatile
> uint64_t *qw0,
>   break;
>   }
> 
> + /* L4TUNT: L4 Tunneling Type */
> + switch (m->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) {
> + case RTE_MBUF_F_TX_TUNNEL_IPIP:
> + /* for non UDP / GRE tunneling, set to 00b */
> + break;
> + case RTE_MBUF_F_TX_TUNNEL_VXLAN:
> + case RTE_MBUF_F_TX_TUNNEL_GTP:
> + case RTE_MBUF_F_TX_TUNNEL_GENEVE:
> + eip_typ |= IAVF_TXD_CTX_UDP_TUNNELING;
> + break;
> + case RTE_MBUF_F_TX_TUNNEL_GRE:
> + eip_typ |= IAVF_TXD_CTX_GRE_TUNNELING;
> + break;
> + default:
> + PMD_TX_LOG(ERR, "Tunnel type not supported");
> + return;
> + }
> +
> + /* L4TUNLEN: L4 Tunneling Length, in Words
> +  *
> +  * We depend on app to set rte_mbuf.l2_len correctly.
> +  * For IP in GRE it should be set to the length of the GRE
> +  * header;
> +  * For MAC in GRE or MAC in UDP it should be set to the length
> +  * of the GRE or UDP headers plus the inner MAC up to including
> +  * its last Ethertype.
> +  * If MPLS labels exists, it should include them as well.
> +  */
> + eip_typ |= (m->l2_len >> 1) << IAVF_TXD_CTX_QW0_NATLEN_SHIFT;
> +
> + /**
> +  * Calculate the tunneling UDP checksum.
> +  * Shall be set only if L4TUNT = 01b and EIPT is not zero
> +  */
> + if (!(eip_typ & IAVF_TX_CTX_EXT_IP_NONE) &&
> + (eip_typ & IAVF_TXD_CTX_UDP_TUNNELING))
> + eip_typ |= IAVF_TXD_CTX_QW0_L4T_CS_MASK;
> +
>   *qw0 = eip_typ << IAVF_TXD_CTX_QW0_TUN_PARAMS_EIPT_SHIFT |
>   eip_len <<
> IAVF_TXD_CTX_QW0_TUN_PARAMS_EIPLEN_SHIFT |
>   eip_noinc <<
> IAVF_TXD_CTX_QW0_TUN_PARAMS_EIP_NOINC_SHIFT;
> @@ -2417,7 +2455,7 @@ iavf_fill_ctx_desc_segmentation_field(volatile
> uint64_t *field,
>   total_length = m->pkt_len - (m->l2_len + m->l3_len + m-
> >l4_len);
> 
>   if (m->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK)
> - total_length -= m->outer_l3_len;
> + total_length -= m->outer_l3_len +  m->outer_l2_len;

Not related, delete

>   }
> 
>  #ifdef RTE_LIBRTE_IAVF_DEBUG_TX
> @@ -2535,8 +2573,13 @@ iavf_build_data_desc_cmd_offset_fields(volatile
> uint64_t *qw1,
>   }
> 
>   /* Set MACLEN */
> - offset |= (m->l2_len >> 1) << IAVF_TX_DESC_LENGTH_MACLEN_SHIFT;
> -
> + if (m->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK)
> + offset |= (m->outer_l2_len >> 1)
> + << IAVF_TX_DESC_LENGTH_MACLEN_SHIFT;
> + else
> + offset |= (m->l2_len >> 1)
> + << IAVF_TX_DESC_LENGTH_MACLEN_SHIFT;
> 

Re: [PATCH v1 1/4] build: add meson option to configure IOVA mode as VA

2022-08-30 Thread Bruce Richardson
On Mon, Aug 29, 2022 at 08:18:56PM +0200, Morten Brørup wrote:
> > From: Shijith Thotton [mailto:sthot...@marvell.com]
> > Sent: Monday, 29 August 2022 17.16
> > 
> > IOVA mode in DPDK is either PA or VA. The new build option iova_as_va
> > configures the mode to VA at compile time and prevents setting it to PA
> > at runtime. For now, all drivers which are not always enabled are
> > disabled with this option. A supported driver can set the flag
> > pmd_iova_as_va in its build file to enable its build.
> > 
> > mbuf structure holds the physical (PA) and virtual address (VA) of a
> > buffer. If IOVA mode is set to VA, PA is redundant as it is the same as
> > VA, so the PA field need not be updated and is marked invalid if the build
> > is configured to use only VA.
> > 
> > Signed-off-by: Shijith Thotton 
> > ---
> 
> [...]
> 
> > diff --git a/app/test/test_mbuf.c b/app/test/test_mbuf.c
> > index e09b2549ca..992b8c64ab 100644
> > --- a/app/test/test_mbuf.c
> > +++ b/app/test/test_mbuf.c
> > @@ -1232,11 +1232,13 @@ test_failing_mbuf_sanity_check(struct
> > rte_mempool *pktmbuf_pool)
> > return -1;
> > }
> > 
> > -   badbuf = *buf;
> > -   badbuf.buf_iova = 0;
> > -   if (verify_mbuf_check_panics(&badbuf)) {
> > -   printf("Error with bad-physaddr mbuf test\n");
> > -   return -1;
> > +   if (!rte_is_iova_as_va_build()) {
> > +   badbuf = *buf;
> > +   rte_mbuf_iova_set(&badbuf, 0);
> > +   if (verify_mbuf_check_panics(&badbuf)) {
> > +   printf("Error with bad-physaddr mbuf test\n");
> > +   return -1;
> > +   }
> > }
> > 
> > badbuf = *buf;
> > diff --git a/config/meson.build b/config/meson.build
> > index 7f7b6c92fd..1ff1cd774b 100644
> > --- a/config/meson.build
> > +++ b/config/meson.build
> > @@ -309,6 +309,9 @@ endif
> >  if get_option('mbuf_refcnt_atomic')
> >  dpdk_conf.set('RTE_MBUF_REFCNT_ATOMIC', true)
> >  endif
> > +if get_option('iova_as_va')
> > +dpdk_conf.set('RTE_IOVA_AS_VA', true)
> > +endif
> > 
> >  compile_time_cpuflags = []
> >  subdir(arch_subdir)
> > diff --git a/drivers/meson.build b/drivers/meson.build
> > index b22c2adda7..469e60f1fa 100644
> > --- a/drivers/meson.build
> > +++ b/drivers/meson.build
> > @@ -103,6 +103,7 @@ foreach subpath:subdirs
> >  ext_deps = []
> >  pkgconfig_extra_libs = []
> >  testpmd_sources = []
> > +pmd_iova_as_va = false
> > 
> >  if not enable_drivers.contains(drv_path)
> >  build = false
> > @@ -120,6 +121,11 @@ foreach subpath:subdirs
> >  # pull in driver directory which should update all the
> > local variables
> >  subdir(drv_path)
> > 
> > +if dpdk_conf.has('RTE_IOVA_AS_VA') and not pmd_iova_as_va
> > and not always_enable.contains(drv_path)
> > +build = false
> > +reason = 'driver does not support IOVA as VA mode'
> > +endif
> > +
> >  # get dependency objs from strings
> >  shared_deps = ext_deps
> >  static_deps = ext_deps
> > diff --git a/lib/eal/include/rte_common.h
> > b/lib/eal/include/rte_common.h
> > index a96cc2a138..0010ad7c7d 100644
> > --- a/lib/eal/include/rte_common.h
> > +++ b/lib/eal/include/rte_common.h
> > @@ -921,6 +921,23 @@ __rte_noreturn void
> >  rte_exit(int exit_code, const char *format, ...)
> > __rte_format_printf(2, 3);
> > 
> > +/**
> > + * Check if build is configured to use IOVA as VA.
> > + *
> > + * @return
> > + *   1 if true, 0 otherwise
> > + *
> > + */
> > +static inline int
> > +rte_is_iova_as_va_build(void)
> > +{
> > +#ifdef RTE_IOVA_AS_VA
> > +   return 1;
> > +#else
> > +   return 0;
> > +#endif
> > +}
> 
> The rte_is_iova_as_va_build() function is effectively a shadow of the 
> RTE_IOVA_AS_VA definition. Why the need to camouflage RTE_IOVA_AS_VA through 
> a function, instead of just using RTE_IOVA_AS_VA everywhere?
> 
My reading is that it's not quite equivalent, and in the undef case it
can't be directly used in C code. You can't do "if (RTE_IOVA_AS_VA)", for
example. However, rather than adding a function, in meson you could also
use "dpdk_conf.set10()" to define a second macro that is 0 in
the undef case, and which therefore could be used in C conditionals.

/Bruce
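
To illustrate (editor's sketch, not from the thread; each helper assumes its
respective meson definition style):

/* With dpdk_conf.set(): the macro is defined or absent, so only the
 * preprocessor can test it. */
static inline int
iova_as_va_ifdef(void)
{
#ifdef RTE_IOVA_AS_VA
	return 1;
#else
	return 0;
#endif
}

/* With dpdk_conf.set10(): the macro is always defined, to 0 or 1, so a
 * plain C conditional works and the compiler still removes the dead
 * branch. */
static inline int
iova_as_va_plain(void)
{
	return RTE_IOVA_AS_VA;
}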


Re: [PATCH v1 2/4] mbuf: add second dynamic field member for VA only build

2022-08-30 Thread Bruce Richardson
On Mon, Aug 29, 2022 at 08:32:20PM +0200, Morten Brørup wrote:
> 
> > From: Shijith Thotton [mailto:sthot...@marvell.com]
> > Sent: Monday, 29 August 2022 17.16
> > 
> > mbuf physical address field is not used in builds which use only VA.
> > It is instead used to expand the dynamic field area.
> > 
> > Signed-off-by: Shijith Thotton 
> > ---
> >  lib/mbuf/rte_mbuf_core.h | 26 +-
> >  lib/mbuf/rte_mbuf_dyn.c  |  2 ++
> >  2 files changed, 19 insertions(+), 9 deletions(-)
> > 
> > diff --git a/lib/mbuf/rte_mbuf_core.h b/lib/mbuf/rte_mbuf_core.h
> > index 81cb07c2e4..98ce62fd6a 100644
> > --- a/lib/mbuf/rte_mbuf_core.h
> > +++ b/lib/mbuf/rte_mbuf_core.h
> > @@ -579,15 +579,23 @@ struct rte_mbuf {
> > RTE_MARKER cacheline0;
> > 
> > void *buf_addr;   /**< Virtual address of segment buffer.
> > */
> > -   /**
> > -* Physical address of segment buffer.
> > -* This field is invalid if the build is configured to use only
> > -* virtual address as IOVA (i.e. RTE_IOVA_AS_VA is defined).
> > -* Force alignment to 8-bytes, so as to ensure we have the exact
> > -* same mbuf cacheline0 layout for 32-bit and 64-bit. This makes
> > -* working on vector drivers easier.
> > -*/
> > -   rte_iova_t buf_iova __rte_aligned(sizeof(rte_iova_t));
> > +   RTE_STD_C11
> > +   union {
> > +   /**
> > +* Physical address of segment buffer.
> > +* This field is invalid if the build is configured to use
> > only
> > +* virtual address as IOVA (i.e. RTE_IOVA_AS_VA is
> > defined).
> > +* Force alignment to 8-bytes, so as to ensure we have the
> > exact
> > +* same mbuf cacheline0 layout for 32-bit and 64-bit. This
> > makes
> > +* working on vector drivers easier.
> > +*/
> > +   rte_iova_t buf_iova __rte_aligned(sizeof(rte_iova_t));
> > +   /**
> > +* Reserved for dynamic field in builds where physical
> > address
> > +* field is invalid.
> > +*/
> > +   uint64_t dynfield2;
> > +   };
> > 
> > /* next 8 bytes are initialised on RX descriptor rearm */
> > RTE_MARKER64 rearm_data;
> 
> I know that the intention here is to keep the rte_mbuf structure intact, 
> which will certainly improve the probability of getting this patch series 
> into DPDK.
> 
> So, I will add a comment for the benefit of the other participants in the 
> discussion:
> 
> With this patch, and in RTE_IOVA_AS_VA mode, it becomes possible to move 
> m->next into the first cache line, so rte_pktmbuf_prefree_seg() does not have 
> to touch the second cache line, thus potentially improving performance by 
> eliminating one cache miss per freed packet segment. (I also recall someone 
> mentioning that some PMDs set m->next on RX... If that is the case, a cache 
> miss per packet might also be avoidable in those PMDs.)
> 
> Obviously, moving m->next to the first cache line is not related to this 
> patch series, but would belong in a completely different patch.
>

+1 to that, with the exception that if it is decided to move the next
pointer rather than use this as dynamic space, I think it *should* be in
this patch series, rather than mucking about with mbuf twice. :-) 
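
As a side note on how the freed space would be consumed: dynamic fields are
handed out through the existing rte_mbuf_dyn API, e.g. (editor's sketch; the
field name is hypothetical):

#include <stdint.h>
#include <rte_mbuf_dyn.h>

static const struct rte_mbuf_dynfield example_dynfield_desc = {
	.name = "example_dynfield_u64", /* hypothetical field name */
	.size = sizeof(uint64_t),
	.align = __alignof__(uint64_t),
};

static int
register_example_field(void)
{
	/* returns a byte offset into struct rte_mbuf, or -1 on failure;
	 * with this series, dynfield2 is one more area the allocator can
	 * hand out in IOVA-as-VA builds */
	return rte_mbuf_dynfield_register(&example_dynfield_desc);
}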


Re: [PATCH] rcu: fix build failure with debug dp log level

2022-08-30 Thread Morrissey, Sean



On 29/08/2022 17:55, Honnappa Nagarahalli wrote:

-Original Message-
From: Anoob Joseph 
Sent: Monday, August 29, 2022 11:52 AM
To: Honnappa Nagarahalli 
Cc: jer...@marvell.com; dev@dpdk.org; sean.morris...@intel.com
Subject: [PATCH] rcu: fix build failure with debug dp log level

Build fails if RTE_LOG_DP_LEVEL is set to RTE_LOG_DEBUG. Fix the same by
including the required header when RTE_LOG_DP_LEVEL is set to
RTE_LOG_DEBUG.

../lib/rcu/rte_rcu_qsbr.h:678:40: error: expected ‘)’ before ‘PRIu64’
   678 |"%s: status: least acked token = %" PRIu64,
   |^~

Fixes: 30a1de105a5f ("lib: remove unneeded header includes")
Cc: sean.morris...@intel.com

Agree on the fix.
@sean.morris...@intel.com Does the process that removed this header file
inclusion need fixing?
If yes, should that fix be included in this patch?


@honnappa.nagaraha...@arm.com Yes, I believe the tool will need an
update; however, I believe that should be a separate patch.



Signed-off-by: Anoob Joseph 
---
  lib/rcu/rte_rcu_qsbr.h | 4 
  1 file changed, 4 insertions(+)

diff --git a/lib/rcu/rte_rcu_qsbr.h b/lib/rcu/rte_rcu_qsbr.h index
d81bf5e8db..b0f1720ca1 100644
--- a/lib/rcu/rte_rcu_qsbr.h
+++ b/lib/rcu/rte_rcu_qsbr.h
@@ -37,6 +37,10 @@ extern "C" {
  #include 
  #include 

+#if RTE_LOG_DP_LEVEL >= RTE_LOG_DEBUG
+#include 
+#endif
+
  extern int rte_rcu_log_type;

  #if RTE_LOG_DP_LEVEL >= RTE_LOG_DEBUG
--
2.25.1
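
For context (editor's illustration), PRIu64 is a format-string macro provided
only by <inttypes.h>, which is why the header must be included whenever the
debug-level log line is compiled in:

#include <inttypes.h> /* defines PRIu64 (and pulls in <stdint.h>) */
#include <stdio.h>

static void
log_token(uint64_t token)
{
	/* without <inttypes.h>, PRIu64 is an undeclared identifier and the
	 * string concatenation fails exactly as in the build error above */
	printf("least acked token = %" PRIu64 "\n", token);
}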


Re: [RFC PATCH 1/3] os: begin separating some OS compatibility from EAL

2022-08-30 Thread Bruce Richardson
On Mon, Aug 29, 2022 at 08:57:53PM +0200, Morten Brørup wrote:
> > From: Bruce Richardson [mailto:bruce.richard...@intel.com]
> > Sent: Monday, 29 August 2022 17.19
> > To: dev@dpdk.org
> > Cc: Bruce Richardson
> > Subject: [RFC PATCH 1/3] os: begin separating some OS compatibility
> > from EAL
> > 
> > Some library functionality we may want ahead of EAL build depends upon
> > some OS-specific functionality, so we create a new lib for that to be
> > built separately. For now, just includes fnmatch function for windows.
> 
> The description given in patch 0/3 mentions that this causes a circular 
> dependency between the EAL and Log libraries. You should mention that here 
> too. Until I re-read that, I didn't understand the need to move fnmatch() out 
> of the EAL library - I was even sidetracking wildly, considering if it had to 
> do with needing it on the host computer (for some host compiler checks).
> 

Sorry about that! :-)

> FYI, and not important: fnmatch() is a C library function (man 3), not a
> system call (man 2). But obviously still O/S specific, since it is not
> included with the C library for Windows.
> 
Interesting, but as you say, it doesn't actually affect this patch.

/Bruce


RE: [EXT] Re: [PATCH v1 2/4] mbuf: add second dynamic field member for VA only build

2022-08-30 Thread Pavan Nikhilesh Bhagavatula



> -Original Message-
> From: Bruce Richardson 
> Sent: Tuesday, August 30, 2022 2:06 PM
> To: Morten Brørup 
> Cc: Shijith Thotton ; dev@dpdk.org; Pavan
> Nikhilesh Bhagavatula ;
> honnappa.nagaraha...@arm.com; Jerin Jacob Kollanukkaran
> ; olivier.m...@6wind.com;
> step...@networkplumber.org; tho...@monjalon.net
> Subject: [EXT] Re: [PATCH v1 2/4] mbuf: add second dynamic field member
> for VA only build
> 
> External Email
> 
> --
> On Mon, Aug 29, 2022 at 08:32:20PM +0200, Morten Brørup wrote:
> >
> > > From: Shijith Thotton [mailto:sthot...@marvell.com]
> > > Sent: Monday, 29 August 2022 17.16
> > >
> > > mbuf physical address field is not used in builds which use only VA.
> > > It is instead used to expand the dynamic field area.
> > >
> > > Signed-off-by: Shijith Thotton 
> > > ---
> > >  lib/mbuf/rte_mbuf_core.h | 26 +-
> > >  lib/mbuf/rte_mbuf_dyn.c  |  2 ++
> > >  2 files changed, 19 insertions(+), 9 deletions(-)
> > >
> > > diff --git a/lib/mbuf/rte_mbuf_core.h b/lib/mbuf/rte_mbuf_core.h
> > > index 81cb07c2e4..98ce62fd6a 100644
> > > --- a/lib/mbuf/rte_mbuf_core.h
> > > +++ b/lib/mbuf/rte_mbuf_core.h
> > > @@ -579,15 +579,23 @@ struct rte_mbuf {
> > >   RTE_MARKER cacheline0;
> > >
> > >   void *buf_addr;   /**< Virtual address of segment buffer.
> > > */
> > > - /**
> > > -  * Physical address of segment buffer.
> > > -  * This field is invalid if the build is configured to use only
> > > -  * virtual address as IOVA (i.e. RTE_IOVA_AS_VA is defined).
> > > -  * Force alignment to 8-bytes, so as to ensure we have the exact
> > > -  * same mbuf cacheline0 layout for 32-bit and 64-bit. This makes
> > > -  * working on vector drivers easier.
> > > -  */
> > > - rte_iova_t buf_iova __rte_aligned(sizeof(rte_iova_t));
> > > + RTE_STD_C11
> > > + union {
> > > + /**
> > > +  * Physical address of segment buffer.
> > > +  * This field is invalid if the build is configured to use
> > > only
> > > +  * virtual address as IOVA (i.e. RTE_IOVA_AS_VA is
> > > defined).
> > > +  * Force alignment to 8-bytes, so as to ensure we have the
> > > exact
> > > +  * same mbuf cacheline0 layout for 32-bit and 64-bit. This
> > > makes
> > > +  * working on vector drivers easier.
> > > +  */
> > > + rte_iova_t buf_iova __rte_aligned(sizeof(rte_iova_t));
> > > + /**
> > > +  * Reserved for dynamic field in builds where physical
> > > address
> > > +  * field is invalid.
> > > +  */
> > > + uint64_t dynfield2;
> > > + };
> > >
> > >   /* next 8 bytes are initialised on RX descriptor rearm */
> > >   RTE_MARKER64 rearm_data;
> >
> > I know that the intention here is to keep the rte_mbuf structure intact,
> which will certainly improve the probability of getting this patch series into
> DPDK.
> >
> > So, I will add a comment for the benefit of the other participants in the
> discussion:
> >
> > With this patch, and in RTE_IOVA_AS_VA mode, it becomes possible to
> move m->next into the first cache line, so rte_pktmbuf_prefree_seg() does
> not have to touch the second cache line, thus potentially improving
> performance by eliminating one cache miss per freed packet segment. (I also
> recall someone mentioning that some PMDs set m->next on RX... If that is
> the case, a cache miss per packet might also be avoidable in those PMDs.)
> >
> > Obviously, moving m->next to the first cache line is not related to this 
> > patch
> series, but would belong in a completely different patch.
> >
> 
> +1 to that, with the exception that if it is decided to move the next
> pointer rather than use this as dynamic space, I think it *should* be in
> this patch series, rather than mucking about with mbuf twice. :-)

+1. When RTE_IOVA_AS_VA is set, we can make mbuf->next the dynamic field and
move it into mbuf->buf_iova.
The mbuf->next write is one of the prominent hotspots on Arm platforms.


TCP/IP stack recommendations

2022-08-30 Thread Morten Brørup
Hi all.

Can anyone in here recommend an actively maintained open source TCP/IP stack 
for DPDK?


Med venlig hilsen / Kind regards,
-Morten Brørup



Re: [RFC PATCH 1/3] os: begin separating some OS compatibility from EAL

2022-08-30 Thread David Marchand
On Mon, Aug 29, 2022 at 5:19 PM Bruce Richardson
 wrote:
>
> Some library functionality we may want ahead of EAL build depends upon
> some OS-specific functionality, so we create a new lib for that to be
> built separately. For now, just includes fnmatch function for windows.
>
> Signed-off-by: Bruce Richardson 
> ---
>  lib/eal/windows/meson.build   |  1 -
>  lib/meson.build   | 11 ++-
>  lib/os/freebsd/fnmatch.c  |  3 +++
>  lib/os/linux/fnmatch.c|  3 +++
>  lib/os/meson.build|  8 
>  lib/os/os.c   |  3 +++
>  lib/os/version.map|  7 +++
>  lib/{eal => os}/windows/fnmatch.c |  0
>  lib/{eal/windows/include => os/windows}/fnmatch.h |  0
>  9 files changed, 30 insertions(+), 6 deletions(-)
>  create mode 100644 lib/os/freebsd/fnmatch.c
>  create mode 100644 lib/os/linux/fnmatch.c
>  create mode 100644 lib/os/meson.build
>  create mode 100644 lib/os/os.c
>  create mode 100644 lib/os/version.map
>  rename lib/{eal => os}/windows/fnmatch.c (100%)
>  rename lib/{eal/windows/include => os/windows}/fnmatch.h (100%)
>
> diff --git a/lib/eal/windows/meson.build b/lib/eal/windows/meson.build
> index 845e406ca1..e4b2427610 100644
> --- a/lib/eal/windows/meson.build
> +++ b/lib/eal/windows/meson.build
> @@ -18,7 +18,6 @@ sources += files(
>  'eal_mp.c',
>  'eal_thread.c',
>  'eal_timer.c',
> -'fnmatch.c',
>  'getopt.c',
>  'rte_thread.c',
>  )
> diff --git a/lib/meson.build b/lib/meson.build
> index c648f7d800..7b61b2a5d7 100644
> --- a/lib/meson.build
> +++ b/lib/meson.build
> @@ -9,6 +9,7 @@
>  # given as a dep, no need to mention ring. This is especially true for the
>  # core libs which are widely reused, so their deps are kept to a minimum.
>  libraries = [
> +'os',   # load os compatibility material
>  'kvargs', # eal depends on kvargs
>  'telemetry', # basic info querying
>  'eal', # everything depends on eal
> @@ -106,6 +107,7 @@ if cc.has_argument('-Wno-format-truncation')
>  endif
>
>  enabled_libs = [] # used to print summary at the end
> +default_deps = []
>
>  foreach l:libraries
>  build = true
> @@ -124,11 +126,7 @@ foreach l:libraries
>  # use "deps" for internal DPDK dependencies, and "ext_deps" for
>  # external package/library requirements
>  ext_deps = []
> -deps = []
> -# eal is standard dependency once built
> -if dpdk_conf.has('RTE_LIB_EAL')
> -deps += ['eal']
> -endif
> +deps = default_deps
>
>  if disabled_libs.contains(l)
>  build = false
> @@ -271,4 +269,7 @@ foreach l:libraries
>  if developer_mode
>  message('lib/@0@: Defining dependency "@1@"'.format(l, name))
>  endif
> +if name == 'os' or name == 'eal'
> +default_deps = [name]
> +endif
>  endforeach
> diff --git a/lib/os/freebsd/fnmatch.c b/lib/os/freebsd/fnmatch.c
> new file mode 100644
> index 00..ca8a050fda
> --- /dev/null
> +++ b/lib/os/freebsd/fnmatch.c
> @@ -0,0 +1,3 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Intel Corporation
> + */
> diff --git a/lib/os/linux/fnmatch.c b/lib/os/linux/fnmatch.c
> new file mode 100644
> index 00..ca8a050fda
> --- /dev/null
> +++ b/lib/os/linux/fnmatch.c
> @@ -0,0 +1,3 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Intel Corporation
> + */
> diff --git a/lib/os/meson.build b/lib/os/meson.build
> new file mode 100644
> index 00..53949ca17e
> --- /dev/null
> +++ b/lib/os/meson.build
> @@ -0,0 +1,8 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(c) 2022 Intel Corporation
> +
> +includes += global_inc
> +includes += include_directories(exec_env)
> +sources += files(
> +exec_env / 'fnmatch.c',
> +)

Not really important (that's only an RFC), but os.c is not compiled anywhere.


> diff --git a/lib/os/os.c b/lib/os/os.c
> new file mode 100644
> index 00..ca8a050fda
> --- /dev/null
> +++ b/lib/os/os.c
> @@ -0,0 +1,3 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2022 Intel Corporation
> + */
> diff --git a/lib/os/version.map b/lib/os/version.map
> new file mode 100644
> index 00..e1dbd6051e
> --- /dev/null
> +++ b/lib/os/version.map
> @@ -0,0 +1,7 @@
> +DPDK_22 {
> +   global:
> +
> +   fnmatch;
> +
> +   local: *;
> +};

Could we perhaps consider a per-os version.map file or some kind of
inclusion of os specific symbols?
That would avoid odd exporting of a symbol that is provided by the C
library on other OSes.


> diff --git a/lib/eal/windows/fnmatch.c b/lib/os/windows/fnmatch.c
> similarity index 100%
> rename from lib/eal/windows/fnmatch.c
> rename to lib/os/windows/fnmatch.c
> diff --git a/lib/eal/windows/include/fnmatch.h b/lib/os/windows/fnmatch.h
> similarity 
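
For illustration only (editor's sketch, not part of the RFC): with a per-OS
split along those lines, the common map could export nothing and only the
Windows map would list the fnmatch fallback. A hypothetical lib/os/version.map:

DPDK_22 {
	local: *;
};

with a hypothetical lib/os/windows/version.map adding:

DPDK_22 {
	global:

	fnmatch;

	local: *;
};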

RE: [PATCH] gro: fix gro with tcp push flag

2022-08-30 Thread Hu, Jiayu
Sure, I will review it.

> -Original Message-
> From: Thomas Monjalon 
> Sent: Monday, August 29, 2022 6:36 PM
> To: Hu, Jiayu 
> Cc: kumaraparameshwaran rathinavel ;
> dev@dpdk.org; Jun Qiu 
> Subject: Re: [PATCH] gro: fix gro with tcp push flag
> 
> Jiayu, please could you help in this review?
> 
> 
> 27/07/2022 10:44, Jun Qiu:
> > I think this delay is tolerable.
> > Many TCP stacks do not take special care of PUSH packets when receiving
> them. All received packets with data will trigger Poll events.
> >
> > The patch is simple to implement and easy to understand, similar to how
> the kernel stack is handled.
> >
> > From: kumaraparameshwaran rathinavel 
> > Sent: Tuesday, July 26, 2022 3:08 PM
> > To: Jun Qiu 
> > Cc: dev@dpdk.org; jiayu...@intel.com; sta...@dpdk.org
> > Subject: Re: [PATCH] gro: fix gro with tcp push flag
> >
> > We should do it for rte_gro_reassemble as well; with timer mode it
> could lead to more duplicate ACKs. I had a proposal for the enhancement
> which would handle both rte_gro_reassemble and
> rte_gro_reassemble_burst, but have not received any response yet.
> >
> > I have a custom patch which is working fine for timer mode, where there is
> no packet reordering; earlier, without the patch, there were DUP-ACKs, and
> this could potentially affect the window scaling.
> >
> > On Tue, Jul 26, 2022 at 12:27 PM Jun Qiu
> <jun@jaguarmicro.com> wrote:
> > Maybe in rte_gro_reassemble_burst, where no delay is introduced, PUSH
> > packets can be merged
> >
> From: kumaraparameshwaran rathinavel
> <kumaraparames...@gmail.com>
> Sent: 26 July 2022 14:41
> To: Jun Qiu
> <jun@jaguarmicro.com>
> Cc: dev@dpdk.org;
> jiayu...@intel.com;
> sta...@dpdk.org
> Subject: Re: [PATCH] gro: fix gro with tcp push flag
> >
> >
> >
> On Tue, Jul 26, 2022 at 11:48 AM Jun Qiu
> <jun@jaguarmicro.com> wrote:
> > TCP data packets sometimes carry a PUSH flag. Currently, only the
> > packets that do not have the PUSH flag can be GROed.
> > The packets that have a PUSH flag cannot be GROed, and the packets that
> > cannot be processed by GRO are placed last.
> > In this case, the received packets may be out of order.
> > For example, there are two packets, mbuf1 and mbuf2. mbuf1 carries the
> > PUSH flag, mbuf2 does not.
> > After GRO processing, mbuf2 is sent for processing before mbuf1.
> > This reordering will affect TCP processing performance and lead to
> > unnecessary dup-ACKs.
> >
> > Referring to the Linux kernel implementation, packets with PUSH flag
> > can also perform GRO. And if one of the packets containing PUSH flag,
> > the packets after GRO will carry PUSH flag.
> >
> > In the case of smaller transfers, in which the TCP segment size is not more
> than one MTU, it is a single TCP packet with the PSH flag set, so in those
> cases we are introducing unwanted delay. I think the better approach would
> be: if there are previous packets in the flow and the current packet has the
> PSH flag, then coalesce it with the previous packet; if the lookup fails and
> the current packet has the PSH flag set, then deliver it immediately (see
> the sketch at the end of this thread).
> >
> > Fixes: 0d2cbe59b719 ("lib/gro: support TCP/IPv4")
> > Cc: sta...@dpdk.org
> >
> > Signed-off-by: Jun Qiu
> > <jun@jaguarmicro.com>
> > ---
> >  lib/gro/gro_tcp4.c   |  4 ++--
> >  lib/gro/gro_tcp4.h   | 16 +---
> >  lib/gro/gro_vxlan_tcp4.c |  4 ++--
> >  3 files changed, 17 insertions(+), 7 deletions(-)
> >
> > diff --git a/lib/gro/gro_tcp4.c b/lib/gro/gro_tcp4.c index
> > 7498c66141..7849a2bd1d 100644
> > --- a/lib/gro/gro_tcp4.c
> > +++ b/lib/gro/gro_tcp4.c
> > @@ -220,10 +220,10 @@ gro_tcp4_reassemble(struct rte_mbuf *pkt,
> > hdr_len = pkt->l2_len + pkt->l3_len + pkt->l4_len;
> >
> > /*
> > -* Don't process the packet which has FIN, SYN, RST, PSH, URG, ECE
> > +* Don't process the packet which has FIN, SYN, RST, URG, ECE
> >  * or CWR set.
> >  */
> > -   if (tcp_hdr->tcp_flags != RTE_TCP_ACK_FLAG)
> > +   if (tcp_hdr->tcp_flags & (~(RTE_TCP_ACK_FLAG |
> > + RTE_TCP_PSH_FLAG)))
> > return -1;
> > /*
> >  * Don't process the packet whose payload length is less than
> > or diff --git a/lib/gro/gro_tcp4.h b/lib/gro/gro_tcp4.h index
> > 212f97a042..2974faf228 100644
> > --- a/lib/gro/gro_tcp4.h
> > +++ b/lib/gro/gro_tcp4.h
> > @@ -210,7 +210,8 @@ merge_two_tcp4_packets(struct gro_tcp4_item
> *item,
> > uint16_t l2_offset)
> >  {
> > struct rte_mbuf *pkt_head, *pkt_tail, *lastseg;
> > -   uint16_t hdr_len, l2_len;
> > +   struct rte_tcp_hdr *head_tcp_hdr, *tail_tcp_hdr;
> > +   uint16_t hdr_len, l2_len, l3_offset;
> >
> > if (cmp > 0) {
> > pkt_head = item->firstseg; @@ -221,13 +222,22 @@
> > merge_two_tcp4_packets(struct gro_tcp4_item *item,
> >  
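
To make the flush-or-coalesce proposal above concrete, an editor's sketch
(lookup_flow/insert_new_flow/merge_packet are hypothetical helpers; only the
flag masks come from the patch):

#include <stdint.h>
#include <stddef.h>
#include <rte_mbuf.h>
#include <rte_tcp.h>

struct flow_item;                                            /* opaque here */
struct flow_item *lookup_flow(const void *key);              /* hypothetical */
void insert_new_flow(const void *key, struct rte_mbuf *pkt); /* hypothetical */
void merge_packet(struct flow_item *it, struct rte_mbuf *pkt,
		uint8_t psh);                                /* hypothetical */

static int
gro_tcp_psh_policy(struct rte_tcp_hdr *tcp_hdr, const void *key,
		struct rte_mbuf *pkt)
{
	/* ACK and PSH may be GROed; any other flag bypasses GRO */
	if (tcp_hdr->tcp_flags & ~(RTE_TCP_ACK_FLAG | RTE_TCP_PSH_FLAG))
		return -1;

	struct flow_item *item = lookup_flow(key);
	if (item == NULL) {
		/* no existing flow: deliver a PSH packet immediately */
		if (tcp_hdr->tcp_flags & RTE_TCP_PSH_FLAG)
			return -1;
		insert_new_flow(key, pkt);
		return 0;
	}
	/* coalesce; the merged packet inherits PSH, as in kernel GRO */
	merge_packet(item, pkt, tcp_hdr->tcp_flags & RTE_TCP_PSH_FLAG);
	return 0;
}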

Re: [PATCH] doc: add removal note for power empty poll API

2022-08-30 Thread Hunt, David



On 02/08/2022 16:22, Reshma Pattan wrote:

Add removal note for experimental empty poll API.

CC: David Hunt 

Signed-off-by: Reshma Pattan 
---
  doc/guides/prog_guide/power_man.rst | 6 ++
  1 file changed, 6 insertions(+)

diff --git a/doc/guides/prog_guide/power_man.rst 
b/doc/guides/prog_guide/power_man.rst
index 98cfd3c1f3..2e47d87cbb 100644
--- a/doc/guides/prog_guide/power_man.rst
+++ b/doc/guides/prog_guide/power_man.rst
@@ -192,6 +192,12 @@ User Cases
  ----------
  The mechanism can applied to any device which is based on polling. e.g. NIC, 
FPGA.
  
+Removal Note
+------------
+
+The experimental empty poll APIs will be removed from the library in a future DPDK release.
+Suggest to use new lcore poll busyness APIs added in 22.11.
+
+
  Ethernet PMD Power Management API
  -
  


Hi Reshma,

Yes, these APIs will be superseded by the newer poll busyness telemetry.

Acked-by: David Hunt 




Re: TCP/IP stack recommendations

2022-08-30 Thread Ray Kinsella
Hi Morten,

Reach out to Florin Coras over in VPP-land.

Morten Brørup  writes:

> Hi all.
>
> Can anyone in here recommend an actively maintained open source TCP/IP stack 
> for DPDK?
>
>
> Med venlig hilsen / Kind regards,
> -Morten Brørup


-- 
Regards, Ray K


Re: [PATCH] eal: zero out new added memory

2022-08-30 Thread lic121
On Tue, Aug 30, 2022 at 01:11:25AM +, lic121 wrote:
> On Mon, Aug 29, 2022 at 03:49:25PM +0300, Dmitry Kozlyuk wrote:
> > 2022-08-29 14:37 (UTC+0200), Morten Brørup:
> > > > From: David Marchand [mailto:david.march...@redhat.com]
> > > > Sent: Monday, 29 August 2022 13.58
> > > >
> > > > > > > > On Sat, Aug 27, 2022 at 12:57:50PM +0300, Dmitry Kozlyuk wrote: 
> > > > > > > >  
> > > > > > > > > The kernel ensures that the newly mapped memory is zeroed,
> > > > > > > > > and DPDK ensures that files in hugetlbfs are not re-mapped.  
> > > 
> > > David, are you suggesting that this invariant - guaranteeing that DPDK 
> > > memory is zeroed - was violated by SELinux in the SELinux/container issue 
> > > you were tracking?
> > > 
> > > If so, the method to ensure the invariant is faulty for SELinux. Assuming 
> > > DPDK supports SELinux, this bug should be fixed.
> > 
> > +1, I'd like to know more about that case.
> > 
> > EAL checks the unlink() result, so if it fails, the allocation should fail
> > and the invariant should not be broken.
> > Code from 20.11.5:
> > 
> > if (rte_eal_process_type() == RTE_PROC_PRIMARY &&
> > unlink(path) == -1 &&
> > errno != ENOENT) {
> > RTE_LOG(DEBUG, EAL, "%s(): could not remove '%s': %s\n",
> > __func__, path, strerror(errno));
> > return -1;
> > }
> > 
> > Can SELinux restriction result in errno == ENOENT?
> > I'd expect EPERM/EACCESS.
> 
> Thanks for your info; SELinux is disabled on my server. Also, I
> checked that the SELinux fix is already in my DPDK. Could any other
> settings cause dirty memory? If you can think of anything related,
> I can have a try.
> 
> BTW, this is my nic info:
> ```
> Intel Corporation Ethernet Controller E810-XXV for SFP (rev 02)
> 
> driver: ice
> version: 1.9.3
> firmware-version: 2.30 0x80005d22 1.2877.0
> expansion-rom-version:
> bus-info: :3b:00.1
> supports-statistics: yes
> supports-test: yes
> supports-eeprom-access: yes
> supports-register-dump: yes
> supports-priv-flags: yes
> ```


Update with more debug info:

Preparation:
1. Set up 1 GB hugepages and reserve two of them (2 GB total).
```
[root@gz15-compute-s3-55e247e16e22 huge]# grep -i huge /proc/meminfo
AnonHugePages:124928 kB
ShmemHugePages:0 kB
HugePages_Total:   2
HugePages_Free:2
HugePages_Rsvd:0
HugePages_Surp:0
Hugepagesize:1048576 kB
Hugetlb: 2097152 kB
```

2. Make a simple program to poison the memory
```c
#include <stdio.h>    /* printf */
#include <string.h>   /* memcmp, memset */
#include <sys/mman.h> /* mmap */

/* Return non-zero if the whole buffer is filled with 'val', using the
 * overlapping-memcmp trick. */
static int memvcmp(void *memory, unsigned char val, size_t size)
{
	unsigned char *mm = (unsigned char *)memory;
	return (*mm == val) && memcmp(mm, mm + 1, size - 1) == 0;
}

int main(int argc, char *argv[]){
	size_t size = 2UL * (1 << 30) - 1; /* just under 2 GB; 2UL avoids int overflow */
	void *ptr2 = mmap(NULL, size,
		PROT_READ | PROT_WRITE,
		MAP_PRIVATE | MAP_ANONYMOUS |
		MAP_HUGETLB, -1, 0);
	if (ptr2 == MAP_FAILED) { /* mmap returns MAP_FAILED, not NULL */
		printf("failed to allocate memory");
		return 0;
	}
	if (argc > 1) { /* any argument: poison the whole mapping */
		memset(ptr2, 0xff, size);
	}
	unsigned char *ss = ptr2;
	printf("ss: %x\n", *ss);
	if (memvcmp(ptr2, 0, size)) {
		printf("all zero\n");
	} else {
		printf("not all zero\n");
	}
	return 0;
}
```

3. Insert debug code to check whether the new memory is all zero
```
diff --git a/lib/librte_eal/common/malloc_heap.c
b/lib/librte_eal/common/malloc_heap.c
index 5a09247a6..026560333 100644
--- a/lib/librte_eal/common/malloc_heap.c
+++ b/lib/librte_eal/common/malloc_heap.c
@@ -91,16 +91,32 @@ malloc_socket_to_heap_id(unsigned int socket_id)
 /*
  * Expand the heap with a memory area.
  */
+static int memvcmp(void *memory, unsigned char val, size_t size)
+{
+unsigned char *mm = (unsigned char*)memory;
+return (*mm == val) && memcmp(mm, mm + 1, size - 1) == 0;
+}
 static struct malloc_elem *
 malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list
*msl,
void *start, size_t len)
 {
struct malloc_elem *elem = start;
+   void *ptr;
+   size_t data_len;
+

malloc_elem_init(elem, heap, msl, len, elem, len);

malloc_elem_insert(elem);

+   ptr = RTE_PTR_ADD(elem, MALLOC_ELEM_HEADER_LEN);
+   data_len = elem->size - MALLOC_ELEM_OVERHEAD;
+if (memvcmp(ptr, 0, data_len)){
+RTE_LOG(ERR, EAL, "liiilog: all zero\n");
+} else {
+RTE_LOG(ERR, EAL, "liiilog: not all zero\n");
+}
+
elem = malloc_elem_join_adjacent_free(elem);

malloc_elem_free_list_insert(elem);
```

Debug steps:
1. Poison the 2 GB of memory
```
[root@gz15-compute-s3-55e247e16e22 secure]# rm -rf
/dev/hugepages/rtemap_* ; huge/a.out 1
ss: ff
not all zero
```
2. Run testpmd (with no NIC bound to vfio-pci)
```
[root@gz15-compute-s3-55e247e16e22 secure]# dpdk-testpmd -l 0-3 -n 4 --
-i --nb-cores=3
EAL: Detected 64 lcore(s)
EAL: Detected 2 NUMA nodes
EAL: Multi-process socket /var/run/dpdk/rte/mp_socket
EAL: Selected IO

Re: [RFC PATCH 1/3] os: begin separating some OS compatibility from EAL

2022-08-30 Thread Bruce Richardson
On Tue, Aug 30, 2022 at 10:42:43AM +0200, David Marchand wrote:
> On Mon, Aug 29, 2022 at 5:19 PM Bruce Richardson
>  wrote:
> >
> > Some library functionality we may want ahead of EAL build depends upon
> > some OS-specific functionality, so we create a new lib for that to be
> > built separately. For now, just includes fnmatch function for windows.
> >
> > Signed-off-by: Bruce Richardson 
> > ---
> >  lib/eal/windows/meson.build   |  1 -
> >  lib/meson.build   | 11 ++-
> >  lib/os/freebsd/fnmatch.c  |  3 +++
> >  lib/os/linux/fnmatch.c|  3 +++
> >  lib/os/meson.build|  8 
> >  lib/os/os.c   |  3 +++
> >  lib/os/version.map|  7 +++
> >  lib/{eal => os}/windows/fnmatch.c |  0
> >  lib/{eal/windows/include => os/windows}/fnmatch.h |  0
> >  9 files changed, 30 insertions(+), 6 deletions(-)
> >  create mode 100644 lib/os/freebsd/fnmatch.c
> >  create mode 100644 lib/os/linux/fnmatch.c
> >  create mode 100644 lib/os/meson.build
> >  create mode 100644 lib/os/os.c
> >  create mode 100644 lib/os/version.map
> >  rename lib/{eal => os}/windows/fnmatch.c (100%)
> >  rename lib/{eal/windows/include => os/windows}/fnmatch.h (100%)
> >
> > diff --git a/lib/eal/windows/meson.build b/lib/eal/windows/meson.build
> > index 845e406ca1..e4b2427610 100644
> > --- a/lib/eal/windows/meson.build
> > +++ b/lib/eal/windows/meson.build
> > @@ -18,7 +18,6 @@ sources += files(
> >  'eal_mp.c',
> >  'eal_thread.c',
> >  'eal_timer.c',
> > -'fnmatch.c',
> >  'getopt.c',
> >  'rte_thread.c',
> >  )
> > diff --git a/lib/meson.build b/lib/meson.build
> > index c648f7d800..7b61b2a5d7 100644
> > --- a/lib/meson.build
> > +++ b/lib/meson.build
> > @@ -9,6 +9,7 @@
> >  # given as a dep, no need to mention ring. This is especially true for the
> >  # core libs which are widely reused, so their deps are kept to a minimum.
> >  libraries = [
> > +'os',   # load os compatibility material
> >  'kvargs', # eal depends on kvargs
> >  'telemetry', # basic info querying
> >  'eal', # everything depends on eal
> > @@ -106,6 +107,7 @@ if cc.has_argument('-Wno-format-truncation')
> >  endif
> >
> >  enabled_libs = [] # used to print summary at the end
> > +default_deps = []
> >
> >  foreach l:libraries
> >  build = true
> > @@ -124,11 +126,7 @@ foreach l:libraries
> >  # use "deps" for internal DPDK dependencies, and "ext_deps" for
> >  # external package/library requirements
> >  ext_deps = []
> > -deps = []
> > -# eal is standard dependency once built
> > -if dpdk_conf.has('RTE_LIB_EAL')
> > -deps += ['eal']
> > -endif
> > +deps = default_deps
> >
> >  if disabled_libs.contains(l)
> >  build = false
> > @@ -271,4 +269,7 @@ foreach l:libraries
> >  if developer_mode
> >  message('lib/@0@: Defining dependency "@1@"'.format(l, name))
> >  endif
> > +if name == 'os' or name == 'eal'
> > +default_deps = [name]
> > +endif
> >  endforeach
> > diff --git a/lib/os/freebsd/fnmatch.c b/lib/os/freebsd/fnmatch.c
> > new file mode 100644
> > index 00..ca8a050fda
> > --- /dev/null
> > +++ b/lib/os/freebsd/fnmatch.c
> > @@ -0,0 +1,3 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2022 Intel Corporation
> > + */
> > diff --git a/lib/os/linux/fnmatch.c b/lib/os/linux/fnmatch.c
> > new file mode 100644
> > index 00..ca8a050fda
> > --- /dev/null
> > +++ b/lib/os/linux/fnmatch.c
> > @@ -0,0 +1,3 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2022 Intel Corporation
> > + */
> > diff --git a/lib/os/meson.build b/lib/os/meson.build
> > new file mode 100644
> > index 00..53949ca17e
> > --- /dev/null
> > +++ b/lib/os/meson.build
> > @@ -0,0 +1,8 @@
> > +# SPDX-License-Identifier: BSD-3-Clause
> > +# Copyright(c) 2022 Intel Corporation
> > +
> > +includes += global_inc
> > +includes += include_directories(exec_env)
> > +sources += files(
> > +exec_env / 'fnmatch.c',
> > +)
> 
> Not really important (that's only an RFC), but os.c is not compiled anywhere.
> 
> 
> > diff --git a/lib/os/os.c b/lib/os/os.c
> > new file mode 100644
> > index 00..ca8a050fda
> > --- /dev/null
> > +++ b/lib/os/os.c
> > @@ -0,0 +1,3 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright(c) 2022 Intel Corporation
> > + */
> > diff --git a/lib/os/version.map b/lib/os/version.map
> > new file mode 100644
> > index 00..e1dbd6051e
> > --- /dev/null
> > +++ b/lib/os/version.map
> > @@ -0,0 +1,7 @@
> > +DPDK_22 {
> > +   global:
> > +
> > +   fnmatch;
> > +
> > +   local: *;
> > +};
> 
> Could we perhaps consider a per-os version.map file or some kind of
> inclusion of os specific symbols?
> Th

[PATCH v3 0/5] support flow subscription

2022-08-30 Thread Jie Wang
Add support so the AVF is able to subscribe a flow from the PF.

--
v3:
 * fix eth layer inputset.
 * rebase.
v2:
 * split v1 patch 2/2 to 4 small patches.
 * remove rule action RTE_FLOW_ACTION_TYPE_VF and add
   RTE_FLOW_ACTION_TYPE_REPRESENTED_PORT.

Jie Wang (5):
  common/iavf: support flow subscription
  net/iavf: add flow subscription to AVF
  net/iavf: support flow subscription pattern
  net/iavf: support flow subscription rule
  net/iavf: support priority of flow rule

 doc/guides/rel_notes/release_22_11.rst |   4 +
 drivers/common/iavf/virtchnl.h | 104 +++-
 drivers/net/iavf/iavf.h|  13 +
 drivers/net/iavf/iavf_fdir.c   |   4 +
 drivers/net/iavf/iavf_fsub.c   | 745 +
 drivers/net/iavf/iavf_generic_flow.c   |  40 +-
 drivers/net/iavf/iavf_generic_flow.h   |   2 +
 drivers/net/iavf/iavf_hash.c   |   5 +
 drivers/net/iavf/iavf_ipsec_crypto.c   |  16 +-
 drivers/net/iavf/iavf_vchnl.c  | 133 +
 drivers/net/iavf/meson.build   |   1 +
 11 files changed, 1046 insertions(+), 21 deletions(-)
 create mode 100644 drivers/net/iavf/iavf_fsub.c

-- 
2.25.1



[PATCH v3 1/5] common/iavf: support flow subscription

2022-08-30 Thread Jie Wang
VF is able to subscribe a flow from PF by VIRTCHNL_FLOW_SUBSCRIBE.

PF is expected to offload a rule to hardware which will redirect
packets matching the required pattern to this VF.

Only a flow with dst mac address as PF's mac address can be subscribed.

VIRTCHNL_VF_OFFLOAD_FSUB_PF is used for Flow subscription capability
negotiation and only a trusted VF can be granted with this capability.

A flow can be unsubscribed by VIRTCHNL_FLOW_UNSUBSCRIBE.

Signed-off-by: Jie Wang 
Signed-off-by: Qi Zhang 
---
 drivers/common/iavf/virtchnl.h | 104 +++--
 1 file changed, 100 insertions(+), 4 deletions(-)

diff --git a/drivers/common/iavf/virtchnl.h b/drivers/common/iavf/virtchnl.h
index f123daec8e..e02eec4935 100644
--- a/drivers/common/iavf/virtchnl.h
+++ b/drivers/common/iavf/virtchnl.h
@@ -168,6 +168,8 @@ enum virtchnl_ops {
VIRTCHNL_OP_MAP_QUEUE_VECTOR = 111,
VIRTCHNL_OP_CONFIG_QUEUE_BW = 112,
VIRTCHNL_OP_CONFIG_QUANTA = 113,
+   VIRTCHNL_OP_FLOW_SUBSCRIBE = 114,
+   VIRTCHNL_OP_FLOW_UNSUBSCRIBE = 115,
VIRTCHNL_OP_MAX,
 };
 
@@ -282,6 +284,10 @@ static inline const char *virtchnl_op_str(enum 
virtchnl_ops v_opcode)
return "VIRTCHNL_OP_1588_PTP_GET_CAPS";
case VIRTCHNL_OP_1588_PTP_GET_TIME:
return "VIRTCHNL_OP_1588_PTP_GET_TIME";
+   case VIRTCHNL_OP_FLOW_SUBSCRIBE:
+   return "VIRTCHNL_OP_FLOW_SUBSCRIBE";
+   case VIRTCHNL_OP_FLOW_UNSUBSCRIBE:
+   return "VIRTCHNL_OP_FLOW_UNSUBSCRIBE";
case VIRTCHNL_OP_MAX:
return "VIRTCHNL_OP_MAX";
default:
@@ -401,6 +407,7 @@ VIRTCHNL_CHECK_STRUCT_LEN(16, virtchnl_vsi_resource);
 #define VIRTCHNL_VF_OFFLOAD_INLINE_IPSEC_CRYPTOBIT(8)
 #define VIRTCHNL_VF_LARGE_NUM_QPAIRS   BIT(9)
 #define VIRTCHNL_VF_OFFLOAD_CRCBIT(10)
+#define VIRTCHNL_VF_OFFLOAD_FSUB_PFBIT(14)
 #define VIRTCHNL_VF_OFFLOAD_VLAN_V2BIT(15)
 #define VIRTCHNL_VF_OFFLOAD_VLAN   BIT(16)
 #define VIRTCHNL_VF_OFFLOAD_RX_POLLING BIT(17)
@@ -1503,6 +1510,7 @@ enum virtchnl_vfr_states {
 };
 
 #define VIRTCHNL_MAX_NUM_PROTO_HDRS32
+#define VIRTCHNL_MAX_NUM_PROTO_HDRS_W_MSK  16
 #define VIRTCHNL_MAX_SIZE_RAW_PACKET   1024
 #define PROTO_HDR_SHIFT5
 #define PROTO_HDR_FIELD_START(proto_hdr_type) \
@@ -1695,6 +1703,22 @@ struct virtchnl_proto_hdr {
 
 VIRTCHNL_CHECK_STRUCT_LEN(72, virtchnl_proto_hdr);
 
+struct virtchnl_proto_hdr_w_msk {
+   /* see enum virtchnl_proto_hdr_type */
+   s32 type;
+   u32 pad;
+   /**
+* binary buffer in network order for specific header type.
+* For example, if type = VIRTCHNL_PROTO_HDR_IPV4, a IPv4
+* header is expected to be copied into the buffer.
+*/
+   u8 buffer_spec[64];
+   /* binary buffer for bit-mask applied to specific header type */
+   u8 buffer_mask[64];
+};
+
+VIRTCHNL_CHECK_STRUCT_LEN(136, virtchnl_proto_hdr_w_msk);
+
 struct virtchnl_proto_hdrs {
u8 tunnel_level;
/**
@@ -1706,11 +1730,18 @@ struct virtchnl_proto_hdrs {
 */
int count;
/**
-* number of proto layers, must < VIRTCHNL_MAX_NUM_PROTO_HDRS
-* must be 0 for a raw packet request.
+* count must <=
+* VIRTCHNL_MAX_NUM_PROTO_HDRS + VIRTCHNL_MAX_NUM_PROTO_HDRS_W_MSK
+* count = 0 :  select raw
+* 1 < count <= VIRTCHNL_MAX_NUM_PROTO_HDRS :   select proto_hdr
+* count > VIRTCHNL_MAX_NUM_PROTO_HDRS :select proto_hdr_w_msk
+* last valid index = count - VIRTCHNL_MAX_NUM_PROTO_HDRS
 */
union {
-   struct virtchnl_proto_hdr 
proto_hdr[VIRTCHNL_MAX_NUM_PROTO_HDRS];
+   struct virtchnl_proto_hdr
+   proto_hdr[VIRTCHNL_MAX_NUM_PROTO_HDRS];
+   struct virtchnl_proto_hdr_w_msk
+   proto_hdr_w_msk[VIRTCHNL_MAX_NUM_PROTO_HDRS_W_MSK];
struct {
u16 pkt_len;
u8 spec[VIRTCHNL_MAX_SIZE_RAW_PACKET];
@@ -1731,7 +1762,7 @@ struct virtchnl_rss_cfg {
 
 VIRTCHNL_CHECK_STRUCT_LEN(2444, virtchnl_rss_cfg);
 
-/* action configuration for FDIR */
+/* action configuration for FDIR and FSUB */
 struct virtchnl_filter_action {
/* see enum virtchnl_action type */
s32 type;
@@ -1849,6 +1880,65 @@ struct virtchnl_fdir_del {
 
 VIRTCHNL_CHECK_STRUCT_LEN(12, virtchnl_fdir_del);
 
+/* Status returned to VF after VF requests FSUB commands
+ * VIRTCHNL_FSUB_SUCCESS
+ * VF FLOW related request is successfully done by PF
+ * The request can be OP_FLOW_SUBSCRIBE/UNSUBSCRIBE.
+ *
+ * VIRTCHNL_FSUB_FAILURE_RULE_NORESOURCE
+ * OP_FLOW_SUBSCRIBE request is failed due to no Hardware resource.
+ *
+ * VIRTCHNL_FSUB_FAILURE_RULE_EXIST
+ * OP_FLOW_SUBSCRIBE request is failed due to the rule is already existed.
+ 
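
A small sketch (editor's, not from the patch) of how a VF could select the
masked-header array via the count encoding documented above:

#include <string.h>
#include "virtchnl.h"

static void
fill_masked_eth_ipv4(struct virtchnl_proto_hdrs *hdrs)
{
	memset(hdrs, 0, sizeof(*hdrs));
	/* count > VIRTCHNL_MAX_NUM_PROTO_HDRS selects proto_hdr_w_msk[];
	 * two masked headers -> 32 + 2 */
	hdrs->count = VIRTCHNL_MAX_NUM_PROTO_HDRS + 2;
	hdrs->proto_hdr_w_msk[0].type = VIRTCHNL_PROTO_HDR_ETH;
	hdrs->proto_hdr_w_msk[1].type = VIRTCHNL_PROTO_HDR_IPV4;
	/* buffer_spec[]/buffer_mask[] then carry the header bytes and mask */
}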

[PATCH v3 2/5] net/iavf: add flow subscription to AVF

2022-08-30 Thread Jie Wang
Add the skeleton code of flow subscription to the AVF driver.

Signed-off-by: Jie Wang 
---
 doc/guides/rel_notes/release_22_11.rst |   4 +
 drivers/net/iavf/iavf_fsub.c   | 112 +
 drivers/net/iavf/iavf_generic_flow.c   |  17 +++-
 drivers/net/iavf/iavf_generic_flow.h   |   1 +
 drivers/net/iavf/iavf_vchnl.c  |   1 +
 drivers/net/iavf/meson.build   |   1 +
 6 files changed, 135 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/iavf/iavf_fsub.c

diff --git a/doc/guides/rel_notes/release_22_11.rst 
b/doc/guides/rel_notes/release_22_11.rst
index 8c021cf050..bb77a03e24 100644
--- a/doc/guides/rel_notes/release_22_11.rst
+++ b/doc/guides/rel_notes/release_22_11.rst
@@ -55,6 +55,10 @@ New Features
  Also, make sure to start the actual text at the margin.
  ===
 
+* **Updated Intel iavf driver.**
+
+  * Added flow subscription support.
+
 
 Removed Items
 -
diff --git a/drivers/net/iavf/iavf_fsub.c b/drivers/net/iavf/iavf_fsub.c
new file mode 100644
index 00..17f9bb2976
--- /dev/null
+++ b/drivers/net/iavf/iavf_fsub.c
@@ -0,0 +1,112 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2022 Intel Corporation
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "iavf_generic_flow.h"
+
+
+static struct iavf_flow_parser iavf_fsub_parser;
+
+static struct iavf_pattern_match_item iavf_fsub_pattern_list[] = {};
+
+static int
+iavf_fsub_create(__rte_unused struct iavf_adapter *ad,
+__rte_unused struct rte_flow *flow,
+__rte_unused void *meta,
+__rte_unused struct rte_flow_error *error)
+{
+   return -rte_errno;
+}
+
+static int
+iavf_fsub_destroy(__rte_unused struct iavf_adapter *ad,
+ __rte_unused struct rte_flow *flow,
+ __rte_unused struct rte_flow_error *error)
+{
+   return -rte_errno;
+}
+
+static int
+iavf_fsub_validation(__rte_unused struct iavf_adapter *ad,
+__rte_unused struct rte_flow *flow,
+__rte_unused void *meta,
+__rte_unused struct rte_flow_error *error)
+{
+   return -rte_errno;
+};
+
+static int
+iavf_fsub_parse(__rte_unused struct iavf_adapter *ad,
+   __rte_unused struct iavf_pattern_match_item *array,
+   __rte_unused uint32_t array_len,
+   __rte_unused const struct rte_flow_item pattern[],
+   __rte_unused const struct rte_flow_action actions[],
+   __rte_unused void **meta,
+   __rte_unused struct rte_flow_error *error)
+{
+   return -rte_errno;
+}
+
+static int
+iavf_fsub_init(struct iavf_adapter *ad)
+{
+   struct iavf_info *vf = IAVF_DEV_PRIVATE_TO_VF(ad);
+   struct iavf_flow_parser *parser;
+
+   if (!vf->vf_res)
+   return -EINVAL;
+
+   if (vf->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_FSUB_PF)
+   parser = &iavf_fsub_parser;
+   else
+   return -ENOTSUP;
+
+   return iavf_register_parser(parser, ad);
+}
+
+static void
+iavf_fsub_uninit(struct iavf_adapter *ad)
+{
+   iavf_unregister_parser(&iavf_fsub_parser, ad);
+}
+
+static struct
+iavf_flow_engine iavf_fsub_engine = {
+   .init = iavf_fsub_init,
+   .uninit = iavf_fsub_uninit,
+   .create = iavf_fsub_create,
+   .destroy = iavf_fsub_destroy,
+   .validation = iavf_fsub_validation,
+   .type = IAVF_FLOW_ENGINE_FSUB,
+};
+
+static struct
+iavf_flow_parser iavf_fsub_parser = {
+   .engine = &iavf_fsub_engine,
+   .array = iavf_fsub_pattern_list,
+   .array_len = RTE_DIM(iavf_fsub_pattern_list),
+   .parse_pattern_action = iavf_fsub_parse,
+   .stage = IAVF_FLOW_STAGE_DISTRIBUTOR,
+};
+
+RTE_INIT(iavf_fsub_engine_init)
+{
+   iavf_register_flow_engine(&iavf_fsub_engine);
+}
diff --git a/drivers/net/iavf/iavf_generic_flow.c 
b/drivers/net/iavf/iavf_generic_flow.c
index e1a611e319..b04614ba6e 100644
--- a/drivers/net/iavf/iavf_generic_flow.c
+++ b/drivers/net/iavf/iavf_generic_flow.c
@@ -1866,6 +1866,8 @@ iavf_register_parser(struct iavf_flow_parser *parser,
 {
struct iavf_parser_list *list = NULL;
struct iavf_flow_parser_node *parser_node;
+   struct iavf_flow_parser_node *existing_node;
+   void *temp;
struct iavf_info *vf = IAVF_DEV_PRIVATE_TO_VF(ad);
 
parser_node = rte_zmalloc("iavf_parser", sizeof(*parser_node), 0);
@@ -1880,14 +1882,26 @@ iavf_register_parser(struct iavf_flow_parser *parser,
TAILQ_INSERT_TAIL(list, parser_node, node);
} else if (parser->engine->type == IAVF_FLOW_ENGINE_FDIR) {
list = &vf->dist_parser_list;
+   RTE_TAILQ_FOREACH_SAFE(existing_node, list, node, temp) {
+   if (existing_

[PATCH v3 3/5] net/iavf: support flow subscription pattern

2022-08-30 Thread Jie Wang
Add flow subscription pattern support for AVF.

The supported patterns are listed below:
eth/vlan/ipv4
eth/ipv4(6)
eth/ipv4(6)/udp
eth/ipv4(6)/tcp

Signed-off-by: Jie Wang 
---
 drivers/net/iavf/iavf.h  |   7 +
 drivers/net/iavf/iavf_fsub.c | 598 ++-
 2 files changed, 597 insertions(+), 8 deletions(-)

diff --git a/drivers/net/iavf/iavf.h b/drivers/net/iavf/iavf.h
index 025ab3ff60..f79c7f9f6e 100644
--- a/drivers/net/iavf/iavf.h
+++ b/drivers/net/iavf/iavf.h
@@ -148,6 +148,13 @@ struct iavf_fdir_info {
struct iavf_fdir_conf conf;
 };
 
+struct iavf_fsub_conf {
+   struct virtchnl_flow_sub sub_fltr;
+   struct virtchnl_flow_unsub unsub_fltr;
+   uint64_t input_set;
+   uint32_t flow_id;
+};
+
 struct iavf_qv_map {
uint16_t queue_id;
uint16_t vector_id;
diff --git a/drivers/net/iavf/iavf_fsub.c b/drivers/net/iavf/iavf_fsub.c
index 17f9bb2976..4600d52b91 100644
--- a/drivers/net/iavf/iavf_fsub.c
+++ b/drivers/net/iavf/iavf_fsub.c
@@ -22,9 +22,51 @@
 #include "iavf_generic_flow.h"
 
 
+#define MAX_QGRP_NUM_TYPE  7
+#define IAVF_IPV6_ADDR_LENGTH  16
+#define MAX_INPUT_SET_BYTE 32
+
+#define IAVF_SW_INSET_ETHER ( \
+   IAVF_INSET_DMAC | IAVF_INSET_SMAC | IAVF_INSET_ETHERTYPE)
+#define IAVF_SW_INSET_MAC_IPV4 ( \
+   IAVF_INSET_DMAC | IAVF_INSET_IPV4_DST | IAVF_INSET_IPV4_SRC | \
+   IAVF_INSET_IPV4_PROTO | IAVF_INSET_IPV4_TTL | IAVF_INSET_IPV4_TOS)
+#define IAVF_SW_INSET_MAC_VLAN_IPV4 ( \
+   IAVF_SW_INSET_MAC_IPV4 | IAVF_INSET_VLAN_OUTER)
+#define IAVF_SW_INSET_MAC_IPV4_TCP ( \
+   IAVF_INSET_DMAC | IAVF_INSET_IPV4_DST | IAVF_INSET_IPV4_SRC | \
+   IAVF_INSET_IPV4_TTL | IAVF_INSET_IPV4_TOS | \
+   IAVF_INSET_TCP_DST_PORT | IAVF_INSET_TCP_SRC_PORT)
+#define IAVF_SW_INSET_MAC_IPV4_UDP ( \
+   IAVF_INSET_DMAC | IAVF_INSET_IPV4_DST | IAVF_INSET_IPV4_SRC | \
+   IAVF_INSET_IPV4_TTL | IAVF_INSET_IPV4_TOS | \
+   IAVF_INSET_UDP_DST_PORT | IAVF_INSET_UDP_SRC_PORT)
+#define IAVF_SW_INSET_MAC_IPV6 ( \
+   IAVF_INSET_DMAC | IAVF_INSET_IPV6_DST | IAVF_INSET_IPV6_SRC | \
+   IAVF_INSET_IPV6_TC | IAVF_INSET_IPV6_HOP_LIMIT | \
+   IAVF_INSET_IPV6_NEXT_HDR)
+#define IAVF_SW_INSET_MAC_IPV6_TCP ( \
+   IAVF_INSET_DMAC | IAVF_INSET_IPV6_DST | IAVF_INSET_IPV6_SRC | \
+   IAVF_INSET_IPV6_HOP_LIMIT | IAVF_INSET_IPV6_TC | \
+   IAVF_INSET_TCP_DST_PORT | IAVF_INSET_TCP_SRC_PORT)
+#define IAVF_SW_INSET_MAC_IPV6_UDP ( \
+   IAVF_INSET_DMAC | IAVF_INSET_IPV6_DST | IAVF_INSET_IPV6_SRC | \
+   IAVF_INSET_IPV6_HOP_LIMIT | IAVF_INSET_IPV6_TC | \
+   IAVF_INSET_UDP_DST_PORT | IAVF_INSET_UDP_SRC_PORT)
+
 static struct iavf_flow_parser iavf_fsub_parser;
 
-static struct iavf_pattern_match_item iavf_fsub_pattern_list[] = {};
+static struct
+iavf_pattern_match_item iavf_fsub_pattern_list[] = {
+   {iavf_pattern_ethertype,        IAVF_SW_INSET_ETHER,            IAVF_INSET_NONE},
+   {iavf_pattern_eth_ipv4,         IAVF_SW_INSET_MAC_IPV4,         IAVF_INSET_NONE},
+   {iavf_pattern_eth_vlan_ipv4,    IAVF_SW_INSET_MAC_VLAN_IPV4,    IAVF_INSET_NONE},
+   {iavf_pattern_eth_ipv4_udp,     IAVF_SW_INSET_MAC_IPV4_UDP,     IAVF_INSET_NONE},
+   {iavf_pattern_eth_ipv4_tcp,     IAVF_SW_INSET_MAC_IPV4_TCP,     IAVF_INSET_NONE},
+   {iavf_pattern_eth_ipv6,         IAVF_SW_INSET_MAC_IPV6,         IAVF_INSET_NONE},
+   {iavf_pattern_eth_ipv6_udp,     IAVF_SW_INSET_MAC_IPV6_UDP,     IAVF_INSET_NONE},
+   {iavf_pattern_eth_ipv6_tcp,     IAVF_SW_INSET_MAC_IPV6_TCP,     IAVF_INSET_NONE},
+};
 
 static int
 iavf_fsub_create(__rte_unused struct iavf_adapter *ad,
@@ -53,17 +95,557 @@ iavf_fsub_validation(__rte_unused struct iavf_adapter *ad,
 };
 
 static int
-iavf_fsub_parse(__rte_unused struct iavf_adapter *ad,
-   __rte_unused struct iavf_pattern_match_item *array,
-   __rte_unused uint32_t array_len,
-   __rte_unused const struct rte_flow_item pattern[],
-   __rte_unused const struct rte_flow_action actions[],
-   __rte_unused void **meta,
-   __rte_unused struct rte_flow_error *error)
+iavf_fsub_parse_pattern(const struct rte_flow_item pattern[],
+   const uint64_t input_set_mask,
+   struct rte_flow_error *error,
+   struct iavf_fsub_conf *filter)
+{
+   struct virtchnl_proto_hdrs *hdrs = &filter->sub_fltr.proto_hdrs;
+   enum rte_flow_item_type item_type;
+   const struct rte_flow_item_eth *eth_spec, *eth_mask;
+   const struct rte_flow_item_ipv4 *ipv4_spec, *ipv4_mask;
+   const struct rte_flow_item_ipv6 *ipv6_spec, *ipv6_mask;
+   const struct rte_flow_item_tcp *tcp_spec, *tcp_mask;
+   const struct rte_flow_item_udp *u

[PATCH v3 4/5] net/iavf: support flow subscription rule

2022-08-30 Thread Jie Wang
Support flow subscription create/destroy/validation of flow
rules for AVF.

For examples:
testpmd> flow create 0 ingress pattern eth / ipv4 / udp src is 11
  / end actions represented_port port_id 1 / end
testpmd> flow validate 1 ingress pattern eth / ipv4 / tcp src is 22
  / end actions represented_port port_id 1 / end
testpmd> flow destroy 1 rule 0

Signed-off-by: Jie Wang 
---
 drivers/net/iavf/iavf.h   |   6 ++
 drivers/net/iavf/iavf_fsub.c  |  75 +++
 drivers/net/iavf/iavf_vchnl.c | 132 ++
 3 files changed, 201 insertions(+), 12 deletions(-)

diff --git a/drivers/net/iavf/iavf.h b/drivers/net/iavf/iavf.h
index f79c7f9f6e..26b858f6f0 100644
--- a/drivers/net/iavf/iavf.h
+++ b/drivers/net/iavf/iavf.h
@@ -489,4 +489,10 @@ int iavf_ipsec_crypto_request(struct iavf_adapter *adapter,
 extern const struct rte_tm_ops iavf_tm_ops;
 int iavf_get_ptp_cap(struct iavf_adapter *adapter);
 int iavf_get_phc_time(struct iavf_rx_queue *rxq);
+int iavf_flow_sub(struct iavf_adapter *adapter,
+ struct iavf_fsub_conf *filter);
+int iavf_flow_unsub(struct iavf_adapter *adapter,
+   struct iavf_fsub_conf *filter);
+int iavf_flow_sub_check(struct iavf_adapter *adapter,
+   struct iavf_fsub_conf *filter);
 #endif /* _IAVF_ETHDEV_H_ */
diff --git a/drivers/net/iavf/iavf_fsub.c b/drivers/net/iavf/iavf_fsub.c
index 4600d52b91..b9ad3531ff 100644
--- a/drivers/net/iavf/iavf_fsub.c
+++ b/drivers/net/iavf/iavf_fsub.c
@@ -69,29 +69,80 @@ iavf_pattern_match_item iavf_fsub_pattern_list[] = {
 };
 
 static int
-iavf_fsub_create(__rte_unused struct iavf_adapter *ad,
-__rte_unused struct rte_flow *flow,
-__rte_unused void *meta,
-__rte_unused struct rte_flow_error *error)
+iavf_fsub_create(struct iavf_adapter *ad, struct rte_flow *flow,
+void *meta, struct rte_flow_error *error)
 {
+   struct iavf_fsub_conf *filter = meta;
+   struct iavf_fsub_conf *rule;
+   int ret;
+
+   rule = rte_zmalloc("fsub_entry", sizeof(*rule), 0);
+   if (!rule) {
+   rte_flow_error_set(error, ENOMEM,
+   RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+   "Failed to allocate memory for fsub rule");
+   return -rte_errno;
+   }
+
+   ret = iavf_flow_sub(ad, filter);
+   if (ret) {
+   rte_flow_error_set(error, -ret,
+  RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+  "Failed to subscribe flow rule.");
+   goto free_entry;
+   }
+
+   rte_memcpy(rule, filter, sizeof(*rule));
+   flow->rule = rule;
+
+   return ret;
+
+free_entry:
+   rte_free(rule);
return -rte_errno;
 }
 
 static int
-iavf_fsub_destroy(__rte_unused struct iavf_adapter *ad,
- __rte_unused struct rte_flow *flow,
- __rte_unused struct rte_flow_error *error)
+iavf_fsub_destroy(struct iavf_adapter *ad, struct rte_flow *flow,
+ struct rte_flow_error *error)
 {
-   return -rte_errno;
+   struct iavf_fsub_conf *filter;
+   int ret;
+
+   filter = (struct iavf_fsub_conf *)flow->rule;
+
+   ret = iavf_flow_unsub(ad, filter);
+   if (ret) {
+   rte_flow_error_set(error, -ret,
+  RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+  "Failed to unsubscribe flow rule.");
+   return -rte_errno;
+   }
+
+   flow->rule = NULL;
+   rte_free(filter);
+
+   return ret;
 }
 
 static int
-iavf_fsub_validation(__rte_unused struct iavf_adapter *ad,
+iavf_fsub_validation(struct iavf_adapter *ad,
 __rte_unused struct rte_flow *flow,
-__rte_unused void *meta,
-__rte_unused struct rte_flow_error *error)
+void *meta,
+struct rte_flow_error *error)
 {
-   return -rte_errno;
+   struct iavf_fsub_conf *filter = meta;
+   int ret;
+
+   ret = iavf_flow_sub_check(ad, filter);
+   if (ret) {
+   rte_flow_error_set(error, -ret,
+  RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+  "Failed to validate filter rule.");
+   return -rte_errno;
+   }
+
+   return ret;
 };
 
 static int
diff --git a/drivers/net/iavf/iavf_vchnl.c b/drivers/net/iavf/iavf_vchnl.c
index 6d84add423..cc0db8d093 100644
--- a/drivers/net/iavf/iavf_vchnl.c
+++ b/drivers/net/iavf/iavf_vchnl.c
@@ -1534,6 +1534,138 @@ iavf_fdir_check(struct iavf_adapter *adapter,
return 0;
 }
 
+int
+iavf_flow_sub(struct iavf_adapter *adapter, struct iavf_fsub_conf *filter)
+{
+   struct iavf_info *vf = IAVF_DEV_PRIVATE_TO_VF(adapter);
+   struct virtchnl_flow_sub *fsub_cfg;
+   struct iavf_cmd_info args;
+   int err;
+
+   f

[PATCH v3 5/5] net/iavf: support priority of flow rule

2022-08-30 Thread Jie Wang
Add flow rule attribute "priority" support for AVF.

Lower values denote higher priority; the highest priority for
a flow rule is 0.
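
For illustration, such a rule could be expressed in testpmd as below
(hypothetical command, following the examples in the previous patch of
this series):

testpmd> flow create 0 ingress priority 1 pattern eth / ipv4 / udp
  / end actions represented_port port_id 1 / end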

Signed-off-by: Jie Wang 
---
 drivers/net/iavf/iavf_fdir.c |  4 
 drivers/net/iavf/iavf_fsub.c |  2 +-
 drivers/net/iavf/iavf_generic_flow.c | 23 +--
 drivers/net/iavf/iavf_generic_flow.h |  1 +
 drivers/net/iavf/iavf_hash.c |  5 +
 drivers/net/iavf/iavf_ipsec_crypto.c | 16 ++--
 6 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/drivers/net/iavf/iavf_fdir.c b/drivers/net/iavf/iavf_fdir.c
index a397047fdb..8f80873925 100644
--- a/drivers/net/iavf/iavf_fdir.c
+++ b/drivers/net/iavf/iavf_fdir.c
@@ -1583,6 +1583,7 @@ iavf_fdir_parse(struct iavf_adapter *ad,
uint32_t array_len,
const struct rte_flow_item pattern[],
const struct rte_flow_action actions[],
+   uint32_t priority,
void **meta,
struct rte_flow_error *error)
 {
@@ -1593,6 +1594,9 @@ iavf_fdir_parse(struct iavf_adapter *ad,
 
memset(filter, 0, sizeof(*filter));
 
+   if (priority >= 1)
+   return -rte_errno;
+
item = iavf_search_pattern_match_item(pattern, array, array_len, error);
if (!item)
return -rte_errno;
diff --git a/drivers/net/iavf/iavf_fsub.c b/drivers/net/iavf/iavf_fsub.c
index b9ad3531ff..46effda9a0 100644
--- a/drivers/net/iavf/iavf_fsub.c
+++ b/drivers/net/iavf/iavf_fsub.c
@@ -649,13 +649,13 @@ iavf_fsub_parse(struct iavf_adapter *ad,
uint32_t array_len,
const struct rte_flow_item pattern[],
const struct rte_flow_action actions[],
+   uint32_t priority,
void **meta,
struct rte_flow_error *error)
 {
struct iavf_fsub_conf *filter;
struct iavf_pattern_match_item *pattern_match_item = NULL;
int ret = 0;
-   uint32_t priority = 0;
 
filter = rte_zmalloc(NULL, sizeof(*filter), 0);
if (!filter) {
diff --git a/drivers/net/iavf/iavf_generic_flow.c 
b/drivers/net/iavf/iavf_generic_flow.c
index b04614ba6e..f33c764764 100644
--- a/drivers/net/iavf/iavf_generic_flow.c
+++ b/drivers/net/iavf/iavf_generic_flow.c
@@ -1785,6 +1785,7 @@ enum rte_flow_item_type 
iavf_pattern_eth_ipv6_udp_l2tpv2_ppp_ipv6_tcp[] = {
 typedef struct iavf_flow_engine * (*parse_engine_t)(struct iavf_adapter *ad,
struct rte_flow *flow,
struct iavf_parser_list *parser_list,
+   uint32_t priority,
const struct rte_flow_item pattern[],
const struct rte_flow_action actions[],
struct rte_flow_error *error);
@@ -1951,11 +1952,11 @@ iavf_flow_valid_attr(const struct rte_flow_attr *attr,
return -rte_errno;
}
 
-   /* Not supported */
-   if (attr->priority) {
+   /* support priority for flow subscribe */
+   if (attr->priority > 1) {
rte_flow_error_set(error, EINVAL,
RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
-   attr, "Not support priority.");
+   attr, "Only support priority 0 and 1.");
return -rte_errno;
}
 
@@ -2098,6 +2099,7 @@ static struct iavf_flow_engine *
 iavf_parse_engine_create(struct iavf_adapter *ad,
struct rte_flow *flow,
struct iavf_parser_list *parser_list,
+   uint32_t priority,
const struct rte_flow_item pattern[],
const struct rte_flow_action actions[],
struct rte_flow_error *error)
@@ -2111,7 +2113,7 @@ iavf_parse_engine_create(struct iavf_adapter *ad,
if (parser_node->parser->parse_pattern_action(ad,
parser_node->parser->array,
parser_node->parser->array_len,
-   pattern, actions, &meta, error) < 0)
+   pattern, actions, priority, &meta, error) < 0)
continue;
 
engine = parser_node->parser->engine;
@@ -2127,6 +2129,7 @@ static struct iavf_flow_engine *
 iavf_parse_engine_validate(struct iavf_adapter *ad,
struct rte_flow *flow,
struct iavf_parser_list *parser_list,
+   uint32_t priority,
const struct rte_flow_item pattern[],
const struct rte_flow_action actions[],
struct rte_flow_error *error)
@@ -2140,7 +2143,7 @@ iavf_parse_engine_validate(struct iavf_adapter *ad,
if (parser_node->parser->parse_pattern_action(ad,
parser_node->parser->array,
parser_node->parser->array_len,
-   pattern, actions, &meta,  error) < 0)
+   pattern, actions, priority

Re: [PATCH v2] net/octeon_ep: support CN10K SoC

2022-08-30 Thread Jerin Jacob
On Mon, Aug 29, 2022 at 9:15 PM Sathesh Edara  wrote:
>
> This patch adds the required functionality in the Octeon endpoint
> driver to support the CN10K endpoint device. It adds the CN10K SoC
> specific routines to configure, enable, and disable input and output
> queues to establish basic data transfers.
>
> Signed-off-by: Sathesh Edara 

Applied to dpdk-next-net-mrvl/for-next-net. Thanks


> ---
>
> Changes in v2:
> - Rephrased the commit message and description.
> - Updated copyrights.
> - Aligned with max line length.
> - Added a timeout to avoid hangs in the while loops.
>
>  drivers/net/octeon_ep/cnxk_ep_vf.c| 375 ++
>  drivers/net/octeon_ep/cnxk_ep_vf.h| 161 +++
>  drivers/net/octeon_ep/meson.build |   1 +
>  drivers/net/octeon_ep/otx2_ep_vf.c| 129 +
>  drivers/net/octeon_ep/otx_ep_common.h |   4 +
>  drivers/net/octeon_ep/otx_ep_ethdev.c |   9 +
>  6 files changed, 622 insertions(+), 57 deletions(-)
>  create mode 100644 drivers/net/octeon_ep/cnxk_ep_vf.c
>  create mode 100644 drivers/net/octeon_ep/cnxk_ep_vf.h
>
> diff --git a/drivers/net/octeon_ep/cnxk_ep_vf.c 
> b/drivers/net/octeon_ep/cnxk_ep_vf.c
> new file mode 100644
> index 00..52f08c844b
> --- /dev/null
> +++ b/drivers/net/octeon_ep/cnxk_ep_vf.c
> @@ -0,0 +1,375 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(C) 2022 Marvell.
> + */
> +
> +#include 
> +#include 
> +#include "cnxk_ep_vf.h"
> +
> +static void
> +cnxk_ep_vf_setup_global_iq_reg(struct otx_ep_device *otx_ep, int q_no)
> +{
> +   volatile uint64_t reg_val = 0ull;
> +
> +   /* Select ES, RO, NS, RDSIZE,DPTR Format#0 for IQs
> +* IS_64B is by default enabled.
> +*/
> +   reg_val = oct_ep_read64(otx_ep->hw_addr + CNXK_EP_R_IN_CONTROL(q_no));
> +
> +   reg_val |= CNXK_EP_R_IN_CTL_RDSIZE;
> +   reg_val |= CNXK_EP_R_IN_CTL_IS_64B;
> +   reg_val |= CNXK_EP_R_IN_CTL_ESR;
> +
> +   oct_ep_write64(reg_val, otx_ep->hw_addr + CNXK_EP_R_IN_CONTROL(q_no));
> +}
> +
> +static void
> +cnxk_ep_vf_setup_global_oq_reg(struct otx_ep_device *otx_ep, int q_no)
> +{
> +   volatile uint64_t reg_val = 0ull;
> +
> +   reg_val = oct_ep_read64(otx_ep->hw_addr + 
> CNXK_EP_R_OUT_CONTROL(q_no));
> +
> +   reg_val &= ~(CNXK_EP_R_OUT_CTL_IMODE);
> +   reg_val &= ~(CNXK_EP_R_OUT_CTL_ROR_P);
> +   reg_val &= ~(CNXK_EP_R_OUT_CTL_NSR_P);
> +   reg_val &= ~(CNXK_EP_R_OUT_CTL_ROR_I);
> +   reg_val &= ~(CNXK_EP_R_OUT_CTL_NSR_I);
> +   reg_val &= ~(CNXK_EP_R_OUT_CTL_ROR_D);
> +   reg_val &= ~(CNXK_EP_R_OUT_CTL_NSR_D);
> +   reg_val &= ~(CNXK_EP_R_OUT_CTL_ES_I | CNXK_EP_R_OUT_CTL_ES_D);
> +
> +   /* INFO/DATA ptr swap is required  */
> +   reg_val |= (CNXK_EP_R_OUT_CTL_ES_P);
> +   oct_ep_write64(reg_val, otx_ep->hw_addr + 
> CNXK_EP_R_OUT_CONTROL(q_no));
> +}
> +
> +static void
> +cnxk_ep_vf_setup_global_input_regs(struct otx_ep_device *otx_ep)
> +{
> +   uint64_t q_no = 0ull;
> +
> +   for (q_no = 0; q_no < (otx_ep->sriov_info.rings_per_vf); q_no++)
> +   cnxk_ep_vf_setup_global_iq_reg(otx_ep, q_no);
> +}
> +
> +static void
> +cnxk_ep_vf_setup_global_output_regs(struct otx_ep_device *otx_ep)
> +{
> +   uint32_t q_no;
> +
> +   for (q_no = 0; q_no < (otx_ep->sriov_info.rings_per_vf); q_no++)
> +   cnxk_ep_vf_setup_global_oq_reg(otx_ep, q_no);
> +}
> +
> +static void
> +cnxk_ep_vf_setup_device_regs(struct otx_ep_device *otx_ep)
> +{
> +   cnxk_ep_vf_setup_global_input_regs(otx_ep);
> +   cnxk_ep_vf_setup_global_output_regs(otx_ep);
> +}
> +
> +static void
> +cnxk_ep_vf_setup_iq_regs(struct otx_ep_device *otx_ep, uint32_t iq_no)
> +{
> +   struct otx_ep_instr_queue *iq = otx_ep->instr_queue[iq_no];
> +   uint64_t loop = OTX_EP_BUSY_LOOP_COUNT;
> +   volatile uint64_t reg_val = 0ull;
> +
> +   reg_val = oct_ep_read64(otx_ep->hw_addr + 
> CNXK_EP_R_IN_CONTROL(iq_no));
> +
> +   /* Wait till IDLE to set to 1, not supposed to configure BADDR
> +* as long as IDLE is 0
> +*/
> +   if (!(reg_val & CNXK_EP_R_IN_CTL_IDLE)) {
> +   do {
> +   reg_val = oct_ep_read64(otx_ep->hw_addr + 
> CNXK_EP_R_IN_CONTROL(iq_no));
> +   rte_delay_ms(1);
> +   } while ((!(reg_val & CNXK_EP_R_IN_CTL_IDLE)) && loop--);
> +   }
> +
> +   if (!loop) {
> +   otx_ep_err("IDLE bit is not set\n");
> +   return;
> +   }
> +
> +   /* Write the start of the input queue's ring and its size  */
> +   oct_ep_write64(iq->base_addr_dma, otx_ep->hw_addr + 
> CNXK_EP_R_IN_INSTR_BADDR(iq_no));
> +   oct_ep_write64(iq->nb_desc, otx_ep->hw_addr + 
> CNXK_EP_R_IN_INSTR_RSIZE(iq_no));
> +
> +   /* Remember the doorbell & instruction count register addr
> +* for this queue
> +*/
> +   iq->doorbell_reg = (uint8_t *)otx_ep->hw_addr + 
> CNXK_EP_R_IN_INSTR_DBELL(iq_no);
> +  

Re: [PATCH v3 1/3] eal: add lcore poll busyness telemetry

2022-08-30 Thread Kevin Laatz

On 26/08/2022 23:06, Mattias Rönnblom wrote:

On 2022-08-25 17:28, Kevin Laatz wrote:

From: Anatoly Burakov 




To avoid performance impact from having lcore telemetry support, a 
global
variable is exported by EAL, and a call to timestamping function is 
wrapped

into a macro, so that whenever telemetry is disabled, it only takes one


Use a static inline function if you don't need the additional 
expressive power of a macro.


I suggest you also mention the performance implications, when this 
function is enabled.


Keeping the performance implications of having the feature enabled in 
mind, I think the expressive power of the macro is beneficial here.
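
For reference, the pattern being discussed is roughly the following (a
sketch only; the timestamping callee name is illustrative, not the actual
patch symbol):

extern int __rte_lcore_telemetry_enabled;

#ifdef RTE_LCORE_POLL_BUSYNESS
#define LCORE_TELEMETRY_TIMESTAMP(n) do {           \
        if (__rte_lcore_telemetry_enabled)          \
                __rte_lcore_telemetry_timestamp(n); \
} while (0)
#else
#define LCORE_TELEMETRY_TIMESTAMP(n) do { } while (0)
#endif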





diff --git a/lib/eal/common/eal_common_lcore_telemetry.c 
b/lib/eal/common/eal_common_lcore_telemetry.c

new file mode 100644
index 00..bba0afc26d
--- /dev/null
+++ b/lib/eal/common/eal_common_lcore_telemetry.c
@@ -0,0 +1,293 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+#include 
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+#ifdef RTE_LCORE_POLL_BUSYNESS
+#include 
+#endif
+
+int __rte_lcore_telemetry_enabled;


Is "telemetry" really the term to use here? Isn't this just another 
piece of statistics? It can be used for telemetry, or in some other 
fashion.


(Use bool not int.)


Will change to bool.

Looking at this again, the telemetry naming is more accurate here since 
'__rte_lcore_telemetry_enabled' is used to enable/disable the telemetry 
endpoints.


-Kevin



RE: [PATCH v5] net/ice: refactor proto_ext to remove global variable

2022-08-30 Thread Zhang, Qi Z



> -Original Message-
> From: Liu, KevinX 
> Sent: Tuesday, August 30, 2022 10:31 PM
> To: dev@dpdk.org
> Cc: Yang, Qiming ; Zhang, Qi Z
> ; Yang, SteveX ; Liu, KevinX
> ; Ling, Jin 
> Subject: [PATCH v5] net/ice: refactor proto_ext to remove global variable
> 
> The ice PMD has a feature to extract protocol fields into the flex
> descriptor by per-queue programming. However, the dynamic fields for
> proto_ext used to be allocated by the PMD, and it was the responsibility
> of the application to reserve the fields before starting DPDK.
> 
> The application now passes the offset and proto_ext name to the PMD via
> devargs. Remove the related private API from the 'rte_pmd_ice.h' file.
> 
> Signed-off-by: Kevin Liu 
> Tested-by: Jin Ling 

Acked-by: Qi Zhang 

Applied to dpdk-next-net-intel.

Thanks
Qi



Re: [PATCH] eal: zero out new added memory

2022-08-30 Thread Dmitry Kozlyuk
Thank you for the most detailed info!

1. If you run the poisoner program the second time,
   does it also see dirty memory immediately after mmap()?

2. Kernel 4.19.90-2102 patchlevel 2102 is very high,
   can there be any unusual patches applied?
   Your host has "compute" in its name,
   can it have patches that trade security for performance?


Re: [PATCH] net/failsafe: fix interrupt handle leak

2022-08-30 Thread Ferruh Yigit

On 8/29/2022 11:23 AM, David Marchand wrote:



On Fri, Apr 29, 2022 at 3:56 PM Ferruh Yigit  wrote:


On 3/24/2022 3:09 PM, David Marchand wrote:

A intr_handle is being allocated as a hack to get a (proxy) eventfd from
the Linux interrupt implementation.
But this handle is never freed.

Remove this convoluted hack and create an eventfd in Linux case.

Fixes: d61138d4f0e2 ("drivers: remove direct access to interrupt handle")
Cc: sta...@dpdk.org

Signed-off-by: David Marchand 
---
   drivers/net/failsafe/failsafe_ops.c | 32 ++---
   1 file changed, 11 insertions(+), 21 deletions(-)

diff --git a/drivers/net/failsafe/failsafe_ops.c 
b/drivers/net/failsafe/failsafe_ops.c
index 55e21d635c..2c23d0e70a 100644
--- a/drivers/net/failsafe/failsafe_ops.c
+++ b/drivers/net/failsafe/failsafe_ops.c
@@ -6,6 +6,9 @@
   #include 
   #include 
   #include 
+#ifdef RTE_EXEC_ENV_LINUX
+#include <sys/eventfd.h>
+#endif

   #include 
   #include 
@@ -387,28 +390,11 @@ fs_rx_queue_setup(struct rte_eth_dev *dev,
   const struct rte_eth_rxconf *rx_conf,
   struct rte_mempool *mb_pool)
   {
- /*
-  * FIXME: Add a proper interface in rte_eal_interrupts for
-  * allocating eventfd as an interrupt vector.
-  * For the time being, fake as if we are using MSIX interrupts,
-  * this will cause rte_intr_efd_enable to allocate an eventfd for us.
-  */
- struct rte_intr_handle *intr_handle;
   struct sub_device *sdev;
   struct rxq *rxq;
   uint8_t i;
   int ret;

- intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_PRIVATE);
- if (intr_handle == NULL)
- return -ENOMEM;
-
- if (rte_intr_type_set(intr_handle, RTE_INTR_HANDLE_VFIO_MSIX))
- return -rte_errno;
-
- if (rte_intr_efds_index_set(intr_handle, 0, -1))
- return -rte_errno;
-
   fs_lock(dev, 0);
   if (rx_conf->rx_deferred_start) {
   FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_PROBED) {
@@ -442,12 +428,16 @@ fs_rx_queue_setup(struct rte_eth_dev *dev,
   rxq->info.nb_desc = nb_rx_desc;
   rxq->priv = PRIV(dev);
   rxq->sdev = PRIV(dev)->subs;
- ret = rte_intr_efd_enable(intr_handle, 1);
- if (ret < 0) {
+#ifdef RTE_EXEC_ENV_LINUX
+ rxq->event_fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
+ if (rxq->event_fd < 0) {
+ ERROR("Failed to create an eventfd: %s", strerror(errno));
   fs_unlock(dev, 0);
- return ret;
+ return -errno;
   }
- rxq->event_fd = rte_intr_efds_index_get(intr_handle, 0);
+#else
+ rxq->event_fd = -1;
+#endif


How this impacts the BSD? I don't know if driver used on BSD but
technically it looks supported.

@Gaetan, any objection to the change?


There was no feedback for months, can we get this merged?
Thanks.



There is no comment from maintainer, but patch is out for a while, so 
agree to proceed:


Acked-by: Ferruh Yigit 

Applied to dpdk-next-net/main, thanks.


[PATCH] net/cnxk: multi-seg support for Inline IPsec

2022-08-30 Thread Rahul Bhansali
From: Nithin Dabilpuram 

Add multi-seg support for Inline IPsec.
Also, in reassembly, FI_PAD is not required to compute the pointer to
the fragment info, because it is located at CPT_PARSE_HDR_S + FI_OFFSET * 8
and is always 8B aligned.
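
For reference, the offset decode used below behaves as follows (a
restatement of the code in this patch, with the wrap-around case spelled
out):

   /* fi_offset is a 5-bit field: 0 encodes 32 units, i.e. 256 B */
   offset = (((offset - 1) & 0x1f) + 1) * 8;
   /* offset = 0 -> 32 * 8 = 256 B; offset = 5 -> 5 * 8 = 40 B */
   finfo = RTE_PTR_ADD(hdr, offset); /* already 8B aligned, no FI_PAD */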

Signed-off-by: Nithin Dabilpuram 
Signed-off-by: Rahul Bhansali 
---
 drivers/net/cnxk/cn10k_rx.h |  40 +---
 drivers/net/cnxk/cn10k_tx.h | 181 ++--
 2 files changed, 159 insertions(+), 62 deletions(-)

diff --git a/drivers/net/cnxk/cn10k_rx.h b/drivers/net/cnxk/cn10k_rx.h
index 5ecb20f038..8501ae9439 100644
--- a/drivers/net/cnxk/cn10k_rx.h
+++ b/drivers/net/cnxk/cn10k_rx.h
@@ -171,7 +171,7 @@ nix_sec_attach_frags(const struct cpt_parse_hdr_s *hdr,
 
/* offset of 0 implies 256B, otherwise it implies offset*8B */
offset = (((offset - 1) & 0x1f) + 1) * 8;
-   finfo = RTE_PTR_ADD(hdr, offset + hdr->w2.fi_pad);
+   finfo = RTE_PTR_ADD(hdr, offset);
 
/* Frag-0: */
wqe = (uint64_t *)(rte_be_to_cpu_64(hdr->wqe_ptr));
@@ -300,7 +300,7 @@ nix_sec_reassemble_frags(const struct cpt_parse_hdr_s *hdr, 
uint64_t cq_w1,
 
/* offset of 0 implies 256B, otherwise it implies offset*8B */
offset = (((offset - 1) & 0x1f) + 1) * 8;
-   finfo = RTE_PTR_ADD(hdr, offset + hdr->w2.fi_pad);
+   finfo = RTE_PTR_ADD(hdr, offset);
 
/* Frag-0: */
wqe = (uint64_t *)rte_be_to_cpu_64(hdr->wqe_ptr);
@@ -685,20 +685,32 @@ nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, 
struct rte_mbuf *mbuf,
struct rte_mbuf *head;
const rte_iova_t *eol;
uint8_t nb_segs;
+   uint64_t cq_w1;
+   int64_t len;
uint64_t sg;
 
+   cq_w1 = *(const uint64_t *)rx;
+   /* Use inner rx parse for meta pkts sg list */
+   if (cq_w1 & BIT(11) && flags & NIX_RX_OFFLOAD_SECURITY_F) {
+   const uint64_t *wqe = (const uint64_t *)(mbuf + 1);
+   rx = (const union nix_rx_parse_u *)(wqe + 1);
+   }
+
sg = *(const uint64_t *)(rx + 1);
nb_segs = (sg >> 48) & 0x3;
 
-   if (nb_segs == 1 && !(flags & NIX_RX_SEC_REASSEMBLY_F)) {
-   mbuf->next = NULL;
+   if (nb_segs == 1)
return;
-   }
 
-   mbuf->pkt_len = (rx->pkt_lenm1 + 1) - (flags & NIX_RX_OFFLOAD_TSTAMP_F ?
-  CNXK_NIX_TIMESYNC_RX_OFFSET : 0);
-   mbuf->data_len = (sg & 0xFFFF) - (flags & NIX_RX_OFFLOAD_TSTAMP_F ?
- CNXK_NIX_TIMESYNC_RX_OFFSET : 0);
+   /* For security we have already updated right pkt_len */
+   if (cq_w1 & BIT(11) && flags & NIX_RX_OFFLOAD_SECURITY_F)
+   len = mbuf->pkt_len;
+   else
+   len = rx->pkt_lenm1 + 1;
+   mbuf->pkt_len = len - (flags & NIX_RX_OFFLOAD_TSTAMP_F ? 
CNXK_NIX_TIMESYNC_RX_OFFSET : 0);
+   mbuf->data_len =
+   (sg & 0xFFFF) - (flags & NIX_RX_OFFLOAD_TSTAMP_F ? 
CNXK_NIX_TIMESYNC_RX_OFFSET : 0);
+   len -= mbuf->data_len;
mbuf->nb_segs = nb_segs;
sg = sg >> 16;
 
@@ -717,6 +729,7 @@ nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct 
rte_mbuf *mbuf,
RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool, (void **)&mbuf, 1, 1);
 
mbuf->data_len = sg & 0xFFFF;
+   len -= sg & 0XFFFF;
sg = sg >> 16;
*(uint64_t *)(&mbuf->rearm_data) = rearm;
nb_segs--;
@@ -729,7 +742,10 @@ nix_cqe_xtract_mseg(const union nix_rx_parse_u *rx, struct 
rte_mbuf *mbuf,
iova_list = (const rte_iova_t *)(iova_list + 1);
}
}
-   mbuf->next = NULL;
+
+   /* Adjust last mbuf data length with negative offset for security pkts 
if needed */
+   if (cq_w1 & BIT(11) && flags & NIX_RX_OFFLOAD_SECURITY_F && len < 0)
+   mbuf->data_len += len;
 }
 
 static __rte_always_inline void
@@ -787,9 +803,9 @@ cn10k_nix_cqe_to_mbuf(const struct nix_cqe_hdr_s *cq, const 
uint32_t tag,
 * For multi segment packets, mbuf length correction according
 * to Rx timestamp length will be handled later during
 * timestamp data process.
-* Hence, flag argument is not required.
+* Hence, timestamp flag argument is not required.
 */
-   nix_cqe_xtract_mseg(rx, mbuf, val, 0);
+   nix_cqe_xtract_mseg(rx, mbuf, val, flag & 
~NIX_RX_OFFLOAD_TSTAMP_F);
 }
 
 static inline uint16_t
diff --git a/drivers/net/cnxk/cn10k_tx.h b/drivers/net/cnxk/cn10k_tx.h
index ea13866b20..2be5ecdf5e 100644
--- a/drivers/net/cnxk/cn10k_tx.h
+++ b/drivers/net/cnxk/cn10k_tx.h
@@ -282,7 +282,7 @@ cn10k_nix_prep_sec_vec(struct rte_mbuf *m, uint64x2_t 
*cmd0, uint64x2_t *cmd1,
uint8_t l2_len, l3_len;
uintptr_t dptr, nixtx;
uint64_t ucode_cmd[4];
-   uint64_t *laddr;
+   uint64_t *laddr, w0;
uint16_t tag;
uint64_t sa;
 
@@ 

RE: [PATCH] IGC: Remove I225_I_PHY_ID checking

2022-08-30 Thread Zhang, Qi Z



> -Original Message-
> From: iotg.dpdk.ref@intel.com 
> Sent: Monday, August 29, 2022 4:15 PM
> To: dev@dpdk.org
> Subject: [PATCH] IGC: Remove I225_I_PHY_ID checking
> 
> From: NSWE SWS DPDK Dev 
> 
> i225 devices have only one PHY vendor. It is unnecessary to check
> _I_PHY_ID during the link establishment and auto-negotiation process;
> the check also caused devices like the i225-IT to fail. This patch
> removes the mentioned unnecessary check.
> 
> Cc: sta...@dpdk.org
> Signed-off-by: NSWE SWS DPDK Dev 

Is this the expected author name?

> ---
>  drivers/net/igc/base/igc_api.c  |  1 +
>  drivers/net/igc/base/igc_hw.h   |  1 +
>  drivers/net/igc/base/igc_i225.c | 15 ++-
> drivers/net/igc/base/igc_phy.c  |  6 ++
>  drivers/net/igc/igc_ethdev.c|  1 +
>  5 files changed, 7 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/net/igc/base/igc_api.c b/drivers/net/igc/base/igc_api.c
> index 9b791dc082..c9fc9ed4b0 100644
> --- a/drivers/net/igc/base/igc_api.c
> +++ b/drivers/net/igc/base/igc_api.c
> @@ -886,6 +886,7 @@ s32 igc_set_mac_type(struct igc_hw *hw)
>   case IGC_DEV_ID_I225_V:
>   case IGC_DEV_ID_I225_K:
>   case IGC_DEV_ID_I225_I:
> + case IGC_DEV_ID_I225_IT:
>   case IGC_DEV_ID_I220_V:
>   case IGC_DEV_ID_I225_BLANK_NVM:
>   case IGC_DEV_ID_I226_K:
> diff --git a/drivers/net/igc/base/igc_hw.h b/drivers/net/igc/base/igc_hw.h
> index 707a1883b4..e919a11c02 100644
> --- a/drivers/net/igc/base/igc_hw.h
> +++ b/drivers/net/igc/base/igc_hw.h
> @@ -164,6 +164,7 @@ struct igc_hw;
>  #define IGC_DEV_ID_I225_V    0x15F3
>  #define IGC_DEV_ID_I225_K    0x3100
>  #define IGC_DEV_ID_I225_I    0x15F8
> +#define IGC_DEV_ID_I225_IT   0x0D9F

The patch's commit log claims to remove something, but the code also
adds a new device ID. Could you clarify why we need this, or should it
be in a separate patch?




RE: [EXT] Re: [PATCH v2 1/3] ethdev: introduce pool sort capability

2022-08-30 Thread Hanumanth Reddy Pothula


> -Original Message-
> From: Ferruh Yigit 
> Sent: Wednesday, August 24, 2022 9:04 PM
> To: Ding, Xuan ; Hanumanth Reddy Pothula
> ; Thomas Monjalon ; Andrew
> Rybchenko 
> Cc: dev@dpdk.org; Wu, WenxuanX ; Li, Xiaoyun
> ; step...@networkplumber.org; Wang, YuanX
> ; m...@ashroe.eu; Zhang, Yuying
> ; Zhang, Qi Z ;
> viachesl...@nvidia.com; Jerin Jacob Kollanukkaran ;
> Nithin Kumar Dabilpuram 
> Subject: [EXT] Re: [PATCH v2 1/3] ethdev: introduce pool sort capability
> 
> External Email
> 
> --


Thanks Ding Xuan and Ferruh Yigit for reviewing the changes and for providing 
your valuable feedback.
Please find responses inline.

> On 8/23/2022 4:26 AM, Ding, Xuan wrote:
> > Hi Hanumanth,
> >
> >> -Original Message-
> >> From: Hanumanth Pothula 
> >> Sent: Saturday, August 13, 2022 1:25 AM
> >> To: Thomas Monjalon ; Ferruh Yigit
> >> ; Andrew Rybchenko
> >> 
> >> Cc: dev@dpdk.org; Ding, Xuan ; Wu, WenxuanX
> >> ; Li, Xiaoyun ;
> >> step...@networkplumber.org; Wang, YuanX ;
> >> m...@ashroe.eu; Zhang, Yuying ; Zhang, Qi Z
> >> ; viachesl...@nvidia.com; jer...@marvell.com;
> >> ndabilpu...@marvell.com; Hanumanth Pothula 
> >> Subject: [PATCH v2 1/3] ethdev: introduce pool sort capability
> >>
> >> Presently, the 'Buffer Split' feature supports sending multiple
> >> segments of the received packet to PMD, which programs the HW to
> >> receive the packet in segments from different pools.
> >>
> >> This patch extends the feature to support the pool sort capability.
> >> Some of the HW has support for choosing memory pools based on the
> >> packet's size. The pool sort capability allows PMD to choose a memory
> >> pool based on the packet's length.
> >>
> >> This is often useful for saving the memory where the application can
> >> create a different pool to steer the specific size of the packet,
> >> thus enabling effective use of memory.
> >>
> >> For example, let's say HW has a capability of three pools,
> >>   - pool-1 size is 2K
> >>   - pool-2 size is > 2K and < 4K
> >>   - pool-3 size is > 4K
> >> Here,
> >>  pool-1 can accommodate packets with sizes < 2K
> >>  pool-2 can accommodate packets with sizes > 2K and < 4K
> >>  pool-3 can accommodate packets with sizes > 4K
> >>
> >> With pool sort capability enabled in SW, an application may create
> >> three pools of different sizes and send them to PMD. Allowing PMD to
> >> program HW based on packet lengths. So that packets with less than 2K
> >> are received on pool-1, packets with lengths between 2K and 4K are
> >> received on pool-2 and finally packets greater than 4K are received on 
> >> pool-
> 3.
> >>
> >> The following two capabilities are added to the rte_eth_rxseg_capa
> >> structure, 1. pool_sort --> tells pool sort capability is supported by HW.
> >> 2. max_npool --> max number of pools supported by HW.
> >>
> >> Defined new structure rte_eth_rxseg_sort, to be used only when pool
> >> sort capability is present. If required this may be extended further
> >> to support more configurations.
> >>
> >> Signed-off-by: Hanumanth Pothula 
> >>
> >> v2:
> >>   - Along with spec changes, uploading testpmd and driver changes.
> >
> > Thanks for CCing. It's an interesting feature.
> >
> > But I have one question here:
> > Buffer split is for splitting received packets into multiple segments,
> > while pool sort allows the PMD to put received packets into different
> > pools according to packet size.
> > Every packet is still intact.
> >
> > So, at this level, pool sort does not belong to buffer split.
> > And you already use a different function to check pool sort rather than 
> > check
> buffer split.
> >
> > Should a new RX offload be introduced? like
> "RTE_ETH_RX_OFFLOAD_POOL_SORT".
> >
Please find my response below. 
> 
> Hi Hanumanth,
> 
> I had the similar concern with the feature. I assume you want to benefit from
> exiting config structure that gets multiple mempool as argument, since this
> feature also needs multiple mempools, but the feature is different.
> 
> It looks to me wrong to check 'OFFLOAD_BUFFER_SPLIT' offload to decide if to
> receive into multiple mempool or not, which doesn't have anything related 
> split.
> Also not sure about using the 'sort' keyword.
> What do you think to introduce new fetaure, instead of extending existing 
> split
> one?

Actually, we thought BUFFER_SPLIT and POOL_SORT were similar features, in
that RX pools are configured in a certain way, and we preferred not to use
up one more RX offload capability, as the existing software architecture
could be extended to support the pool_sort capability.
Yes, as part of pool sort there is no buffer split, but pools are picked
based on the buffer length.

Since you think it's better to use a new RX offload for POOL_SORT, I will
go ahead and implement the same.
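
For the sizes discussed above, the three pools themselves can be created
with the standard mbuf pool API (element counts and cache sizes below are
arbitrary), leaving only the per-queue steering setup to the new offload:

   struct rte_mempool *p2k = rte_pktmbuf_pool_create("pool_2k", 4096, 256,
           0, 2048 + RTE_PKTMBUF_HEADROOM, rte_socket_id());
   struct rte_mempool *p4k = rte_pktmbuf_pool_create("pool_4k", 2048, 256,
           0, 4096 + RTE_PKTMBUF_HEADROOM, rte_socket_id());
   struct rte_mempool *p9k = rte_pktmbuf_pool_create("pool_9k", 1024, 256,
           0, 9216 + RTE_PKTMBUF_HEADROOM, rte_socket_id());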

> This is optimisation, right? To enable us to use less memory for the packet
> buffer, does it qualify to a device off

RE: [PATCH v2 1/1] app/testpmd: add command line argument 'nic-to-pmd-rx-metadata'

2022-08-30 Thread Hanumanth Reddy Pothula
Ping

> -Original Message-
> From: Hanumanth Pothula 
> Sent: Tuesday, August 2, 2022 11:22 PM
> To: Aman Singh ; Yuying Zhang
> 
> Cc: dev@dpdk.org; Hanumanth Reddy Pothula 
> Subject: [PATCH v2 1/1] app/testpmd: add command line argument 'nic-to-pmd-
> rx-metadata'
> 
> Presently, rx metadata is sent to PMD by default, leading to a performance 
> drop
> as processing for the same in rx path takes extra cycles.
> 
> Hence, introducing command line argument, 'nic-to-pmd-rx-metadata'
> to control passing rx metadata to PMD. By default it’s disabled.
> 
> Signed-off-by: Hanumanth Pothula 
> 
> v2:
> - taken cared alignment issues
> - renamed command line argument from rx-metadata to nic-to-pmd-rx-
> metadata
> - renamed variable name from rx-metadata to nic_to_pmd_rx_metadata
> ---
>  app/test-pmd/parameters.c | 4 
>  app/test-pmd/testpmd.c| 6 +-
>  app/test-pmd/testpmd.h| 2 ++
>  3 files changed, 11 insertions(+), 1 deletion(-)
> 
> diff --git a/app/test-pmd/parameters.c b/app/test-pmd/parameters.c index
> e3c9757f3f..a381945492 100644
> --- a/app/test-pmd/parameters.c
> +++ b/app/test-pmd/parameters.c
> @@ -213,6 +213,7 @@ usage(char* progname)
>   printf("  --hairpin-mode=0xXX: bitmask set the hairpin port mode.\n"
>  "0x10 - explicit Tx rule, 0x02 - hairpin ports paired\n"
>  "0x01 - hairpin ports loop, 0x00 - hairpin port self\n");
> + printf("  --nic-to-pmd-rx-metadata: let the NIC deliver per-packet Rx
> +metadata to PMD\n");
>  }
> 
>  #ifdef RTE_LIB_CMDLINE
> @@ -710,6 +711,7 @@ launch_args_parse(int argc, char** argv)
>   { "record-burst-stats", 0, 0, 0 },
>   { PARAM_NUM_PROCS,  1, 0, 0 },
>   { PARAM_PROC_ID,1, 0, 0 },
> + { "nic-to-pmd-rx-metadata", 0, 0, 0 },
>   { 0, 0, 0, 0 },
>   };
> 
> @@ -1510,6 +1512,8 @@ launch_args_parse(int argc, char** argv)
>   num_procs = atoi(optarg);
>   if (!strcmp(lgopts[opt_idx].name, PARAM_PROC_ID))
>   proc_id = atoi(optarg);
> + if (!strcmp(lgopts[opt_idx].name, "nic-to-pmd-rx-
> metadata"))
> + nic_to_pmd_rx_metadata = 1;
>   break;
>   case 'h':
>   usage(argv[0]);
> diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c index
> addcbcac85..2b17d4f757 100644
> --- a/app/test-pmd/testpmd.c
> +++ b/app/test-pmd/testpmd.c
> @@ -411,6 +411,9 @@ uint8_t clear_ptypes = true;
>  /* Hairpin ports configuration mode. */
>  uint16_t hairpin_mode;
> 
> +/* Send Rx metadata */
> +uint8_t nic_to_pmd_rx_metadata;
> +
>  /* Pretty printing of ethdev events */
>  static const char * const eth_event_desc[] = {
>   [RTE_ETH_EVENT_UNKNOWN] = "unknown",
> @@ -1628,7 +1631,8 @@ init_config_port_offloads(portid_t pid, uint32_t
> socket_id)
>   int ret;
>   int i;
> 
> - eth_rx_metadata_negotiate_mp(pid);
> + if (nic_to_pmd_rx_metadata)
> + eth_rx_metadata_negotiate_mp(pid);
> 
>   port->dev_conf.txmode = tx_mode;
>   port->dev_conf.rxmode = rx_mode;
> diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h index
> fb2f5195d3..294a9c8cf4 100644
> --- a/app/test-pmd/testpmd.h
> +++ b/app/test-pmd/testpmd.h
> @@ -621,6 +621,8 @@ extern struct rte_ether_addr peer_eth_addrs[RTE_MAX_ETHPORTS];
>  extern uint32_t burst_tx_delay_time; /**< Burst tx delay time(us) for mac-retry. */
>  extern uint32_t burst_tx_retry_num;  /**< Burst tx retry number for mac-retry. */
> 
> +extern uint8_t nic_to_pmd_rx_metadata;
> +
>  #ifdef RTE_LIB_GRO
>  #define GRO_DEFAULT_ITEM_NUM_PER_FLOW 32  #define
> GRO_DEFAULT_FLOW_NUM (RTE_GRO_MAX_BURST_ITEM_NUM / \
> --
> 2.25.1
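
For reference, the option above would be passed after the EAL separator,
e.g. (core, memory and device options are illustrative):

   dpdk-testpmd -l 0-1 -n 4 -a 0000:03:00.0 -- -i --nic-to-pmd-rx-metadata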



Re: [RFC] memarea: introduce memory area library

2022-08-30 Thread Dmitry Kozlyuk
>
> Note:
> a) the memarea is oriented towards the application layer, which could
> provide a 'region-based memory management' [1] function.
>

Judging from the API, this library would rather provide
an interface to a generic allocator over a fixed memory extent,
because it offers freeing of specific elements, and thus must track them.
So it's more than RBMM. Is this intended?
It's a very interesting RFC anyway, just trying to understand the scope.

b) the eal library also provide memory zone/heap management, but these
> are tied to huge pages management.
>
[...]
> + * The memarea is a collection of allocated objects that can be efficiently
> + * allocated or freed all at once, the main features are as follows:
> + *   a) it facilitates alloc and free of memory with low overhead.
> + *   [...]
>
+*   c) it supports MT-safety as long as it's specified at creation time.
>

These two bullets seem to add the most value compared to DPDK heap API.
DPDK heap overhead is at least 64 bytes per allocation (sizeof malloc_elem),
so I assume memarea aims at a large number of small elements.


> +struct rte_memarea_param {
> +   char name[RTE_MEMAREA_NAMESIZE]; /**< Name of memarea */
> +   enum rte_memarea_source source;  /**< Memory source of memarea */
> +   uint64_t size;   /**< Size (byte) of memarea */
>

Is it an upper limit or a guaranteed size?
It probably depends on the source: guaranteed for USER_ADDR,
upper limit for SYSAPI (or it would be no different from USER_ADDR),
not sure about USER_MEMAREA.

Do you envision memarea as always limited?
Generic allocators usually have means of adding extents,
even if this one doesn't currently.

Nit: size is uint64_t here but uint32_t in rte_memarea_alloc().
Should be size_t in both places.


> +   uint32_t align;  /**< Align of allocated object */
>
+   /** Indicates whether the memarea should be MT-safe */
> +   bool mt_safe;
+   /** Indicates whether the memarea is visible to multiple processes.
+* If the memory source is RTE_MEMAREA_SOURCE_USER_ADDR, this field
+* depends on user settings and must be set.
> +* If the memory source is RTE_MEMAREA_SOURCE_SYSAPI or
+* RTE_MEMAREA_SOURCE_USER_MEMAREA, this field does not need to be
> set.
> +*/
> +   bool mp_visible;
> +   /** User provided address, this field is valid only when source
> +* is set to RTE_MEMAREA_SOURCE_USER_ADDR.
> +*/
> +   void *user_addr;
> +   /** User provided memarea, this field is valid only when source
> +* is set to RTE_MEMAREA_SOURCE_MEMAREA.
> +*/
> +   struct rte_memarea *user_memarea;
>

Jerin already suggested a union here.
I'll add another reason to do so: if in the future there will be new
memarea types
that require new options, one pointer-sized field can be used to pass
anything
without breaking the ABI once this structure becomes stable.


> +/**
> + * @warning
> + * @b EXPERIMENTAL: this API may change without prior notice.
> + *
> + * Update memory's refcnt.
> + *
> + * Update one memory region's refcnt.
> + *
> + * @param ma
> + *   The pointer of memarea.
> + * @param ptr
> + *   The pointer of memory region which need be updated refcnt.
> + * @param value
> + *   The value which need be updated.
> + *   Note: it could be negative.
> + *
> + * @return
> + *   0 on success. Otherwise negative value is returned.
> + */
> +__rte_experimental
> +int rte_memarea_refcnt_update(struct rte_memarea *ma, void *ptr, int16_t
> value);
>

If this function only updates the refcnt, an API to inspect the refcnt is
missing.
Furthermore, in this case refcnt is just a value attached to every object,
what is the benefit compared to simply storing it in the object?

If this function also frees "ptr" when refcnt reaches zero,
missing is a way for the user to know that it did.
What happens if refcnt > 1 on rte_memarea_free()?

I don't think refcnt belongs to this library.
A principal objection: memarea is for freeing all objects at once,
refcnt is for releasing objects one-by-one when they're not used.
Technical issues I foresee: refcnt can be atomic (and require alignment) or
not,
16 bits may be too few (rte_flow_action_handle ref'd by thousands of
rte_flow).
Refcnt could be optional per memarea, but it seems like another
complication.


Re: [PATCH] eal: zero out new added memory

2022-08-30 Thread lic121
On Tue, Aug 30, 2022 at 01:59:16PM +0300, Dmitry Kozlyuk wrote:
> Thank you for the most detailed info!
> 
> 1. If you run the poisoner program the second time,
>does it also see dirty memory immediately after mmap()?

If I run the poisoner program the second time, no dirty memory.

There could be some difference in how my poisoner program and testpmd
use mmap. I notice that testpmd leaves the hugepage files under
/dev/hugepages/rtemap_xxx even after testpmd exits, but my poisoner
program didn't create any file under /dev/hugepages.
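
That difference can matter here: a hugetlbfs file keeps its page contents
for as long as the file exists, while anonymous huge pages are zero-filled
on every fresh fault. A minimal sketch of the two mappings (path and size
are illustrative):

   #include <fcntl.h>
   #include <sys/mman.h>

   size_t sz = 1UL << 30; /* one 1G page, per the report */

   /* file-backed, as DPDK does for /dev/hugepages/rtemap_* */
   int fd = open("/dev/hugepages/probe", O_CREAT | O_RDWR, 0600);
   void *va = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

   /* anonymous huge pages, no hugetlbfs file left behind */
   void *va2 = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);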

> 
> 2. Kernel 4.19.90-2102 patchlevel 2102 is very high,
>can there be any unusual patches applied?
>Your host has "compute" in its name,
>can it have patches that trade security for performance?

I may need to talk with the kernel team on this. Indeed, I failed to
reproduce the issue on a virtual machine of kernel
4.18.0-305.12.1.el8_4.x86_64 (2M page size instead of 1G).


Re: [PATCH] eal: zero out new added memory

2022-08-30 Thread lic121
On Tue, Aug 30, 2022 at 01:59:16PM +0300, Dmitry Kozlyuk wrote:
> Thank you for the most detailed info!
> 
> 1. If you run the poisoner program the second time,
>does it also see dirty memory immediately after mmap()?

No, running the poisoner program again doesn't show dirty memory
immediately after mmap.

I assume that there are some differences in how my poisoner program
uses hugepages and how testpmd uses them. I notice that testpmd
leaves some files under /dev/hugepages/rtemap_xxx even after testpmd
process exits. But my program didn't create any file under
/dev/hugepages/.
> 
> 2. Kernel 4.19.90-2102 patchlevel 2102 is very high,
>can there be any unusual patches applied?
>Your host has "compute" in its name,
>can it have patches that trade security for performance?

I may need to talk to the kernel team. Indeed, I failed to reproduce the
issue on a virtual machine of kernel 4.18.0-305.12.1.el8_4.x86_64 (2M
page size instead of 1G).


Re: [PATCH] mbuf: add mbuf physical address field to dynamic field

2022-08-30 Thread Ferruh Yigit

On 7/1/2022 1:24 PM, Shijith Thotton wrote:

If all devices are configured to run in IOVA mode as VA, physical
address field of mbuf (buf_iova) won't be used. In such cases, buf_iova
space is free to use as a dynamic field. So a new dynamic field member
(dynfield2) is added in mbuf structure to make use of that space.

A new mbuf flag RTE_MBUF_F_DYNFIELD2 is introduced to help identify the
mbuf that can use dynfield2.

Signed-off-by: Shijith Thotton 


This seems like a complex and potentially error prone way to do this.
What is the use case?



PCI drivers with the flag RTE_PCI_DRV_NEED_IOVA_AS_VA only works in IOVA mode as
VA. buf_iova field of mbuf is not used by those PMDs and can be used as a
dynamic area to save space.



'RTE_PCI_DRV_NEED_IOVA_AS_VA' means device can *only* work in 
RTE_IOVA_VA mode, right?


Although there are many devices that support RTE_IOVA_VA mode, only a
few of them work *only* with RTE_IOVA_VA mode; the rest can use either
RTE_IOVA_PA or RTE_IOVA_VA.

Also using KNI forces to use RTE_IOVA_PA mode.
And moving the 'buf_iova' field out of the first cache line will impact
the performance for RTE_IOVA_PA mode.


Since KNI is going away and VFIO is the preferred way, it can be OK to
make 'buf_iova' a dynamic field in the long term, but I think it is
better to do this slowly; for example, should we wait for KNI to go away
first?




How much of a performance gain?


No change in performance.




RE: [PATCH] rcu: fix build failure with debug dp log level

2022-08-30 Thread Honnappa Nagarahalli


> >>
> >> Build fails if RTE_LOG_DP_LEVEL is set to RTE_LOG_DEBUG. Fix the same
> >> by including the required header when RTE_LOG_DP_LEVEL is set to
> >> RTE_LOG_DEBUG.
> >>
> >> ../lib/rcu/rte_rcu_qsbr.h:678:40: error: expected ‘)’ before ‘PRIu64’
> >>678 |"%s: status: least acked token = %" PRIu64,
> >>|^~
> >>
> >> Fixes: 30a1de105a5f ("lib: remove unneeded header includes")
> >> Cc: sean.morris...@intel.com
> > Agree on the fix.
> > @sean.morris...@intel.com Does the process that removed this header file
> inclusion needs fixing?
> > If yes, should that fix be included in this patch?
> 
> @honnappa.nagaraha...@arm.com Yes, I believe the tool will need an
> update, however, I believe it should be a separate patch for that.
Ok, as long as it is addressed, it should be fine.
 
> 
> >> Signed-off-by: Anoob Joseph 
> >> ---
> >>   lib/rcu/rte_rcu_qsbr.h | 4 
> >>   1 file changed, 4 insertions(+)
> >>
> >> diff --git a/lib/rcu/rte_rcu_qsbr.h b/lib/rcu/rte_rcu_qsbr.h index
> >> d81bf5e8db..b0f1720ca1 100644
> >> --- a/lib/rcu/rte_rcu_qsbr.h
> >> +++ b/lib/rcu/rte_rcu_qsbr.h
> >> @@ -37,6 +37,10 @@ extern "C" {
> >>   #include 
> >>   #include 
> >>
> >> +#if RTE_LOG_DP_LEVEL >= RTE_LOG_DEBUG
> >> +#include <inttypes.h>
> >> +#endif
> >> +
> >>   extern int rte_rcu_log_type;
> >>
> >>   #if RTE_LOG_DP_LEVEL >= RTE_LOG_DEBUG
> >> --
> >> 2.25.1


RE: [EXT] Re: [PATCH v1 2/4] mbuf: add second dynamic field member for VA only build

2022-08-30 Thread Honnappa Nagarahalli

> >
> > --
> > On Mon, Aug 29, 2022 at 08:32:20PM +0200, Morten Brørup wrote:
> > >
> > > > From: Shijith Thotton [mailto:sthot...@marvell.com]
> > > > Sent: Monday, 29 August 2022 17.16
> > > >
> > > > mbuf physical address field is not used in builds which only uses VA.
> > > > It is used to expand the dynamic field area.
> > > >
> > > > Signed-off-by: Shijith Thotton 
> > > > ---
> > > >  lib/mbuf/rte_mbuf_core.h | 26 +-
> > > > lib/mbuf/rte_mbuf_dyn.c  |  2 ++
> > > >  2 files changed, 19 insertions(+), 9 deletions(-)
> > > >
> > > > diff --git a/lib/mbuf/rte_mbuf_core.h b/lib/mbuf/rte_mbuf_core.h
> > > > index 81cb07c2e4..98ce62fd6a 100644
> > > > --- a/lib/mbuf/rte_mbuf_core.h
> > > > +++ b/lib/mbuf/rte_mbuf_core.h
> > > > @@ -579,15 +579,23 @@ struct rte_mbuf {
> > > > RTE_MARKER cacheline0;
> > > >
> > > > void *buf_addr;   /**< Virtual address of segment 
> > > > buffer.
> > > > */
> > > > -   /**
> > > > -* Physical address of segment buffer.
> > > > -* This field is invalid if the build is configured to use only
> > > > -* virtual address as IOVA (i.e. RTE_IOVA_AS_VA is defined).
> > > > -* Force alignment to 8-bytes, so as to ensure we have the exact
> > > > -* same mbuf cacheline0 layout for 32-bit and 64-bit. This makes
> > > > -* working on vector drivers easier.
> > > > -*/
> > > > -   rte_iova_t buf_iova __rte_aligned(sizeof(rte_iova_t));
> > > > +   RTE_STD_C11
> > > > +   union {
> > > > +   /**
> > > > +* Physical address of segment buffer.
> > > > +* This field is invalid if the build is configured to 
> > > > use
> > > > only
> > > > +* virtual address as IOVA (i.e. RTE_IOVA_AS_VA is
> > > > defined).
> > > > +* Force alignment to 8-bytes, so as to ensure we have 
> > > > the
> > > > exact
> > > > +* same mbuf cacheline0 layout for 32-bit and 64-bit. 
> > > > This
> > > > makes
> > > > +* working on vector drivers easier.
> > > > +*/
> > > > +   rte_iova_t buf_iova __rte_aligned(sizeof(rte_iova_t));
> > > > +   /**
> > > > +* Reserved for dynamic field in builds where physical
> > > > address
> > > > +* field is invalid.
> > > > +*/
> > > > +   uint64_t dynfield2;
> > > > +   };
> > > >
> > > > /* next 8 bytes are initialised on RX descriptor rearm */
> > > > RTE_MARKER64 rearm_data;
> > >
> > > I know that the intention here is to keep the rte_mbuf structure
> > > intact,
> > which will certainly improve the probability of getting this patch
> > series into DPDK.
> > >
> > > So, I will add a comment for the benefit of the other participants
> > > in the
> > discussion:
> > >
> > > With this patch, and in RTE_IOVA_AS_VA mode, it becomes possible to
> > move m->next into the first cache line, so rte_pktmbuf_prefree_seg()
> > does not have to touch the second cache line, thus potentially
> > improving performance by eliminating one cache miss per freed packet
> > segment. (I also recall someone mentioning that some PMDs set m->next
> > on RX... If that is the case, a cache miss per packet might also be
> > avoidable in those PMDs.)
> > >
> > > Obviously, moving m->next to the first cache line is not related to
> > > this patch
> > series, but would belong in a completely different patch.
> > >
> >
> > +1 to that, with the exception that if it is decided to move the next
> > pointer rather than use this as dynamic space, I think it *should* be
> > in this patch series, rather than mucking about with mbuf twice. :-)
> 
> +1 When RTE_IOVA_AS_VA is set we can set mbuf->next as the dynamic field
> and move it to mbuf->buf_iova.
> The mbuf->next write is one of the prominent hotspots on Arm platforms.
+1 for reducing the cachelines that need to be touched


Re: [RFC v3 00/26] Bus and device cleanup for 22.11

2022-08-30 Thread David Marchand
On Mon, Aug 29, 2022 at 7:12 PM Walker, Benjamin
 wrote:
> > > Can we keep rte_pci_register(), or a new variation of it that keeps
> > > the rte_pci_driver structure hidden?  Hiding rte_pci_register() would
> > > mean SPDK can no longer work with a packaged DPDK.  Or the DPDK
> > > packages would need to set enable_driver_sdk which I suspect is not the
> > intent.
> >
> > What do you think if SPDK maintains a copy of the internal headers?
> >
> > The internal API are not supposed to change that often, but we (DPDK) won't
> > guarantee it.
> > This would still put some maintenance burden on SPDK but I think it is a 
> > good
> > compromise.
> >
>
> Would these internal symbols be considered part of the public/official ABI? 
> When

What do you mean by "public/official"?
If you mean the "stable" ABI (as described in the ABI policy document
and for which compatibility is preserved across minor versions of the
ABI), the answer is no: internal symbols are not part of it.


> SPDK goes to dynamically load a shared DPDK library, how can we detect
> whether it's a version that we support linking against?

The runtime version of a DPDK library is available via rte_version().
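
For example (rte_version() is the stable EAL call; the printed format is
illustrative):

   #include <stdio.h>
   #include <rte_version.h>

   printf("runtime DPDK: %s\n", rte_version()); /* e.g. "DPDK 22.11.0" */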


As for the PCI drivers that SPDK wants to register in DPDK, what do
you think if SPDK people added and maintained a "generic" PCI driver
in DPDK.
This driver would expose a new API (which can not re-expose internal
structures, like rte_pci_driver and consorts) and ensure its ABI is
maintained in the long term.
This makes me think of pci-stub, but in DPDK.

I did not think too much about it and I don't understand what SPDK
requires, but is there something wrong with this approach?


-- 
David Marchand



RE: TCP/IP stack recommendations

2022-08-30 Thread Honnappa Nagarahalli
One key thing to look for is the interface between the stack and the 
application. If it is not zero-copy, it quickly becomes a bottleneck depending 
on the data size and the buffer size.

Thanks,
Honnappa

> -Original Message-
> From: Ray Kinsella 
> Sent: Tuesday, August 30, 2022 4:45 AM
> To: Morten Brørup 
> Cc: dev@dpdk.org
> Subject: Re: TCP/IP stack recommendations
> 
> Hi Morten,
> 
> Reach out to Florin Coras over in VPP-land.
> 
> Morten Brørup  writes:
> 
> > Hi all.
> >
> > Can anyone in here recommend an actively maintained open source TCP/IP
> stack for DPDK?
> >
> >
> > Med venlig hilsen / Kind regards,
> > -Morten Brørup
> 
> 
> --
> Regards, Ray K


Re: [PATCH] config: set pkgconfig for ppc64le

2022-08-30 Thread David Christensen

On 7/7/22 4:41 AM, Ali Alnubani wrote:

Meson fails to detect the dependencies that are included
in PKG_CONFIG_PATH and built for ppc64le if binaries.pkgconfig
is not set in the ppc64le cross-file for Ubuntu.

This fixes the issue by setting binaries.pkgconfig to the
binary provided by the package 'pkg-config-powerpc64le-linux-gnu'.

Signed-off-by: Ali Alnubani 
---
  config/ppc/ppc64le-power8-linux-gcc-ubuntu | 1 +
  1 file changed, 1 insertion(+)

diff --git a/config/ppc/ppc64le-power8-linux-gcc-ubuntu 
b/config/ppc/ppc64le-power8-linux-gcc-ubuntu
index 3027d66f8d..c2d5b1dc6a 100644
--- a/config/ppc/ppc64le-power8-linux-gcc-ubuntu
+++ b/config/ppc/ppc64le-power8-linux-gcc-ubuntu
@@ -3,6 +3,7 @@ c = ['ccache', 'powerpc64le-linux-gnu-gcc']
  cpp = ['ccache', 'powerpc64le-linux-gnu-g++']
  ar = 'powerpc64le-linux-gnu-ar'
  strip = 'powerpc64le-linux-gnu-strip'
+pkgconfig = 'powerpc64le-linux-gnu-pkg-config'

  [host_machine]
  system = 'linux'


What's the test environment that prompted this patch?  I'd like to try 
replicating before approving.


Dave


Re: [PATCH] config: set pkgconfig for ppc64le

2022-08-30 Thread David Christensen




On 8/29/22 3:30 AM, Thomas Monjalon wrote:

What is the conclusion on this patch?
It is good to go? Acked?


Not from me yet.

Just asked about the test environment so I can duplicate the issue.  My
understanding is that Ubuntu cross-compiles for CI/CD are still working,
so I'd like to understand the test case that drives this need.


Dave


RE: [PATCH] config: set pkgconfig for ppc64le

2022-08-30 Thread Ali Alnubani
> On 8/29/22 3:30 AM, Thomas Monjalon wrote:
> > What is the conclusion on this patch?
> > It is good to go? Acked?
> 
> Not from me yet.
> 
> Just asked about the test environment so I can duplicate the issue.  My
> understanding is that Ubuntu cross-compiles for CI/CD are still working,
> so I'd like to understand the test case that drives this need.
> 
> Dave

Trying to enable the mlx drivers when cross-building for ppc64le, I added the
directory containing the .pc files from an rdma-core
(https://github.com/linux-rdma/rdma-core) ppc64le cross build, but Meson
didn't detect the dependencies without installing
pkg-config-powerpc64le-linux-gnu and setting binaries.pkgconfig as
powerpc64le-linux-gnu-pkg-config.

I just tried to reproduce the issue with a cross build of numactl, but Meson
will not detect it, even with my change. Seems that
PKG_CONFIG_PATH=:/path/to/numactl/build/lib/pkgconfig gets ignored.

- Ali


[PATCH V3 0/7] pipeline: support direct registers and meters

2022-08-30 Thread Cristian Dumitrescu
This patch introduces support for direct registers and meters. The
difference between indirect (indexed) and direct registers and meters
is explained below [1][2][3].

1. Indirect (indexed) registers and meters.

The index into an array of registers or meters used on the data path
is typically read from the action data identified by a table lookup
operation.

This means that the control plane manages the mapping of array entries
to table entries and sets up this index explicitly in the table entry
action data.

These are called indirect or indexed registers and meters, and they
allow multiple table entries to share the same register/meter, as well
as a 1:1 mapping of table entries to register/meter array entries.
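
As a sketch of the indexed flavor (pipeline handle, array name and index
are all hypothetical), the controller supplies the array index explicitly:

#include <rte_swx_ctl.h>

/* Indirect (indexed) access: the control plane picks index 10 and must
 * have stored the same index into the table entry action data.
 */
static int
indexed_regarray_example(struct rte_swx_pipeline *p)
{
	uint64_t value;
	int status;

	status = rte_swx_ctl_pipeline_regarray_read(p, "pkt_cnt", 10, &value);
	if (status)
		return status;

	return rte_swx_ctl_pipeline_regarray_write(p, "pkt_cnt", 10, value + 1);
}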

2. Direct registers and meters.
In the case of 1:1 mapping of table entries to register/meter array
elements, it is sometimes desired to avoid the explicit allocation of
register/meter array index to each table entry by the control plane.

One way this can be done is by implementing a mechanism to associate
a unique ID to each table entry, including the default table entry as
well; once the entry ID is retrieved as result of the table lookup
operation, it is saved by the pipeline and used later on as the
index into the register/meter array.

These are called direct registers and meters, and have the advantage
that the index is auto-generated, which simplifies the controller
implementation; the disadvantage is that they do not allow multiple
table entries to share the same register or meter.
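
With the API added in this series, only the table key is needed — a sketch
with hypothetical array/table names, the key being resolved to the entry ID
internally:

#include <rte_swx_ctl.h>

/* Direct access: the entry ID behind the key is looked up internally and
 * used as the register array index.
 */
static int
direct_regarray_example(struct rte_swx_pipeline *p, uint8_t *flow_key)
{
	uint64_t value;

	return rte_swx_ctl_pipeline_regarray_read_with_key(p,
		"pkt_cnt",	/* register array name (hypothetical) */
		"flow_table",	/* regular or learner table (hypothetical) */
		flow_key,	/* key bytes of the target entry */
		&value);
}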

References:

[1] Indirect and direct counters:
https://p4.org/p4-spec/docs/PSA.html#sec-counters

[2] Indirect and direct registers:
https://p4.org/p4-spec/docs/PSA.html#sec-registers

[3] Indirect and direct meters:
https://p4.org/p4-spec/docs/PSA.html#sec-meters

Depends-on: series-24366 ("pipeline: make the hash function configurable per table")

Change log:

V3:
-Fixed issues related to CLI parsing
-Fixed issue related to the key offset

V2:
-Fixed minor style issues flagged by CI/CD

Cristian Dumitrescu (7):
  table: add entry ID for regular tables
  table: add entry ID for learner tables
  pipeline: add table entry ID read instruction
  pipeline: support direct registers on the control path
  pipeline: support direct meters on the control path
  examples/pipeline: add CLI commands for direct registers
  examples/pipeline: add CLI commands for direct meters

 examples/pipeline/cli.c  | 654 +--
 examples/pipeline/examples/meter.cli |   2 +-
 lib/pipeline/rte_swx_ctl.h   | 133 +
 lib/pipeline/rte_swx_pipeline.c  | 390 ++
 lib/pipeline/rte_swx_pipeline_internal.h |  21 +
 lib/pipeline/version.map |   5 +
 lib/table/rte_swx_table.h|  13 +
 lib/table/rte_swx_table_em.c |   5 +
 lib/table/rte_swx_table_learner.c|  13 +-
 lib/table/rte_swx_table_learner.h|  12 +
 lib/table/rte_swx_table_wm.c |   2 +
 11 files changed, 1097 insertions(+), 153 deletions(-)

-- 
2.34.1



[PATCH V3 1/7] table: add entry ID for regular tables

2022-08-30 Thread Cristian Dumitrescu
Add support for unique ID for each table entry. The entry ID is
retrieved as part of the table lookup operation and is saved by the
pipeline for later use.
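
For context, the lookup operation prototype gains a size_t *entry_id output
parameter. A miss-only stub under the extended contract — mirroring the
table_stub_lkp() change below, with a hypothetical name — looks like this:

#include <stddef.h>
#include <stdint.h>
#include <rte_common.h>

/* Lookup op that always misses: *entry_id is only meaningful on hit. */
static int
example_table_lkp(void *table __rte_unused,
		  void *mailbox __rte_unused,
		  uint8_t **key __rte_unused,
		  uint64_t *action_id __rte_unused,
		  uint8_t **action_data __rte_unused,
		  size_t *entry_id __rte_unused,
		  int *hit)
{
	*hit = 0;
	return 1; /* Lookup operation completed. */
}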

Signed-off-by: Cristian Dumitrescu 
---
 lib/pipeline/rte_swx_pipeline.c  |  9 +
 lib/pipeline/rte_swx_pipeline_internal.h |  1 +
 lib/table/rte_swx_table.h| 13 +
 lib/table/rte_swx_table_em.c |  5 +
 lib/table/rte_swx_table_wm.c |  2 ++
 5 files changed, 30 insertions(+)

diff --git a/lib/pipeline/rte_swx_pipeline.c b/lib/pipeline/rte_swx_pipeline.c
index 1c49622be7..e271cc50eb 100644
--- a/lib/pipeline/rte_swx_pipeline.c
+++ b/lib/pipeline/rte_swx_pipeline.c
@@ -2401,6 +2401,7 @@ instr_table_exec(struct rte_swx_pipeline *p)
struct table_statistics *stats = &p->table_stats[table_id];
uint64_t action_id, n_pkts_hit, n_pkts_action;
uint8_t *action_data;
+   size_t entry_id;
int done, hit;
 
/* Table. */
@@ -2409,6 +2410,7 @@ instr_table_exec(struct rte_swx_pipeline *p)
   table->key,
   &action_id,
   &action_data,
+  &entry_id,
   &hit);
if (!done) {
/* Thread. */
@@ -2422,6 +2424,7 @@ instr_table_exec(struct rte_swx_pipeline *p)
 
action_id = hit ? action_id : ts->default_action_id;
action_data = hit ? action_data : ts->default_action_data;
+   entry_id = hit ? (1 + entry_id) : 0;
n_pkts_hit = stats->n_pkts_hit[hit];
n_pkts_action = stats->n_pkts_action[action_id];
 
@@ -2433,6 +2436,7 @@ instr_table_exec(struct rte_swx_pipeline *p)
 
t->action_id = action_id;
t->structs[0] = action_data;
+   t->entry_id = entry_id;
t->hit = hit;
stats->n_pkts_hit[hit] = n_pkts_hit + 1;
stats->n_pkts_action[action_id] = n_pkts_action + 1;
@@ -2452,6 +2456,7 @@ instr_table_af_exec(struct rte_swx_pipeline *p)
struct table_statistics *stats = &p->table_stats[table_id];
uint64_t action_id, n_pkts_hit, n_pkts_action;
uint8_t *action_data;
+   size_t entry_id;
action_func_t action_func;
int done, hit;
 
@@ -2461,6 +2466,7 @@ instr_table_af_exec(struct rte_swx_pipeline *p)
   table->key,
   &action_id,
   &action_data,
+  &entry_id,
   &hit);
if (!done) {
/* Thread. */
@@ -2474,6 +2480,7 @@ instr_table_af_exec(struct rte_swx_pipeline *p)
 
action_id = hit ? action_id : ts->default_action_id;
action_data = hit ? action_data : ts->default_action_data;
+   entry_id = hit ? (1 + entry_id) : 0;
action_func = p->action_funcs[action_id];
n_pkts_hit = stats->n_pkts_hit[hit];
n_pkts_action = stats->n_pkts_action[action_id];
@@ -2486,6 +2493,7 @@ instr_table_af_exec(struct rte_swx_pipeline *p)
 
t->action_id = action_id;
t->structs[0] = action_data;
+   t->entry_id = entry_id;
t->hit = hit;
stats->n_pkts_hit[hit] = n_pkts_hit + 1;
stats->n_pkts_action[action_id] = n_pkts_action + 1;
@@ -8283,6 +8291,7 @@ table_stub_lkp(void *table __rte_unused,
   uint8_t **key __rte_unused,
   uint64_t *action_id __rte_unused,
   uint8_t **action_data __rte_unused,
+  size_t *entry_id __rte_unused,
   int *hit)
 {
*hit = 0;
diff --git a/lib/pipeline/rte_swx_pipeline_internal.h b/lib/pipeline/rte_swx_pipeline_internal.h
index ef60288dca..8f96b67d76 100644
--- a/lib/pipeline/rte_swx_pipeline_internal.h
+++ b/lib/pipeline/rte_swx_pipeline_internal.h
@@ -1009,6 +1009,7 @@ struct thread {
struct learner_runtime *learners;
struct rte_swx_table_state *table_state;
uint64_t action_id;
+   size_t entry_id;
int hit; /* 0 = Miss, 1 = Hit. */
uint32_t learner_id;
uint64_t time;
diff --git a/lib/table/rte_swx_table.h b/lib/table/rte_swx_table.h
index 4b8dc06798..ac01e19781 100644
--- a/lib/table/rte_swx_table.h
+++ b/lib/table/rte_swx_table.h
@@ -233,6 +233,15 @@ typedef int
  * data likely to be read from the CPU cache with no CPU pipeline stall, which
  * significantly improves the table lookup performance.
  *
+ * The table entry consists of the action ID and the action data. Each table
+ * entry is unique, although different table entries can have identical content,
+ * i.e. same values for the action ID and the action data. The table entry ID is
+ * also returned by the table lookup operation. It can be used to index into an
+ * external array of resources such as counters, registers or meters to identify
+ * the resource directly associated with the current table entry with no need to
+ * store the corresponding index into the table entry. The index of the external
+ * resource i

[PATCH V3 2/7] table: add entry ID for learner tables

2022-08-30 Thread Cristian Dumitrescu
Add support for unique ID for each learner table entry. The entry ID
is retrieved as part of the learner table lookup operation and is
saved by the pipeline for later use.

Signed-off-by: Cristian Dumitrescu 
---
 lib/pipeline/rte_swx_pipeline.c   |  8 
 lib/table/rte_swx_table_learner.c | 13 -
 lib/table/rte_swx_table_learner.h | 12 
 3 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/lib/pipeline/rte_swx_pipeline.c b/lib/pipeline/rte_swx_pipeline.c
index e271cc50eb..80108b8916 100644
--- a/lib/pipeline/rte_swx_pipeline.c
+++ b/lib/pipeline/rte_swx_pipeline.c
@@ -2556,6 +2556,7 @@ instr_learner_exec(struct rte_swx_pipeline *p)
struct learner_statistics *stats = &p->learner_stats[learner_id];
uint64_t action_id, n_pkts_hit, n_pkts_action, time;
uint8_t *action_data;
+   size_t entry_id;
int done, hit;
 
/* Table. */
@@ -2567,6 +2568,7 @@ instr_learner_exec(struct rte_swx_pipeline *p)
l->key,
&action_id,
&action_data,
+   &entry_id,
&hit);
if (!done) {
/* Thread. */
@@ -2580,6 +2582,7 @@ instr_learner_exec(struct rte_swx_pipeline *p)
 
action_id = hit ? action_id : ts->default_action_id;
action_data = hit ? action_data : ts->default_action_data;
+   entry_id = hit ? (1 + entry_id) : 0;
n_pkts_hit = stats->n_pkts_hit[hit];
n_pkts_action = stats->n_pkts_action[action_id];
 
@@ -2591,6 +2594,7 @@ instr_learner_exec(struct rte_swx_pipeline *p)
 
t->action_id = action_id;
t->structs[0] = action_data;
+   t->entry_id = entry_id;
t->hit = hit;
t->learner_id = learner_id;
t->time = time;
@@ -2613,6 +2617,7 @@ instr_learner_af_exec(struct rte_swx_pipeline *p)
struct learner_statistics *stats = &p->learner_stats[learner_id];
uint64_t action_id, n_pkts_hit, n_pkts_action, time;
uint8_t *action_data;
+   size_t entry_id;
action_func_t action_func;
int done, hit;
 
@@ -2625,6 +2630,7 @@ instr_learner_af_exec(struct rte_swx_pipeline *p)
l->key,
&action_id,
&action_data,
+   &entry_id,
&hit);
if (!done) {
/* Thread. */
@@ -2638,6 +2644,7 @@ instr_learner_af_exec(struct rte_swx_pipeline *p)
 
action_id = hit ? action_id : ts->default_action_id;
action_data = hit ? action_data : ts->default_action_data;
+   entry_id = hit ? (1 + entry_id) : 0;
action_func = p->action_funcs[action_id];
n_pkts_hit = stats->n_pkts_hit[hit];
n_pkts_action = stats->n_pkts_action[action_id];
@@ -2650,6 +2657,7 @@ instr_learner_af_exec(struct rte_swx_pipeline *p)
 
t->action_id = action_id;
t->structs[0] = action_data;
+   t->entry_id = entry_id;
t->hit = hit;
t->learner_id = learner_id;
t->time = time;
diff --git a/lib/table/rte_swx_table_learner.c b/lib/table/rte_swx_table_learner.c
index c1045a1082..996fd3de5b 100644
--- a/lib/table/rte_swx_table_learner.c
+++ b/lib/table/rte_swx_table_learner.c
@@ -72,6 +72,7 @@ table_keycpy(void *dst, void *src, uint32_t n_bytes)
 }
 
 #define TABLE_KEYS_PER_BUCKET 4
+#define TABLE_KEYS_PER_BUCKET_LOG2 2
 
 #define TABLE_BUCKET_USEFUL_SIZE \
(TABLE_KEYS_PER_BUCKET * (sizeof(uint32_t) + sizeof(uint32_t) + sizeof(uint8_t)))
@@ -263,6 +264,14 @@ table_bucket_data_get(struct table *t, struct table_bucket *b, size_t bucket_key
   (bucket_key_pos << t->params.data_size_log2)];
 }
 
+static inline size_t
+table_entry_id_get(struct table *t, struct table_bucket *b, size_t bucket_key_pos)
+{
+   size_t bucket_id = ((uint8_t *)b - t->buckets) >> t->params.bucket_size_log2;
+
+   return (bucket_id << TABLE_KEYS_PER_BUCKET_LOG2) + bucket_key_pos;
+}
+
 uint64_t
 rte_swx_table_learner_footprint_get(struct rte_swx_table_learner_params *params)
 {
@@ -332,7 +341,7 @@ struct mailbox {
/* Writer: lookup state 0. Reader(s): lookup state 1, add(). */
uint32_t input_sig;
 
-   /* Writer: lookup state 1. Reader(s): add(). */
+   /* Writer: lookup state 0. Reader(s): lookup state 1, add(). */
uint8_t *input_key;
 
/* Writer: lookup state 1. Reader(s): add(). Values: 0 = miss; 1 = hit. */
@@ -358,6 +367,7 @@ rte_swx_table_learner_lookup(void *table,
 uint8_t **key,
 uint64_t *action_id,
 uint8_t **action_data,
+size_t *entry_id,
 int *

[PATCH V3 4/7] pipeline: support direct registers on the control path

2022-08-30 Thread Cristian Dumitrescu
Add pipeline control path API to read/write direct registers. These
registers are identified by a table key, whose entry ID is used as the
index into the register array.

Signed-off-by: Cristian Dumitrescu 
---
 lib/pipeline/rte_swx_ctl.h  |  52 +++
 lib/pipeline/rte_swx_pipeline.c | 255 
 lib/pipeline/version.map|   2 +
 3 files changed, 309 insertions(+)

diff --git a/lib/pipeline/rte_swx_ctl.h b/lib/pipeline/rte_swx_ctl.h
index 0694df557a..1b47820441 100644
--- a/lib/pipeline/rte_swx_ctl.h
+++ b/lib/pipeline/rte_swx_ctl.h
@@ -1237,6 +1237,58 @@ rte_swx_ctl_pipeline_regarray_write(struct rte_swx_pipeline *p,
   uint32_t regarray_index,
   uint64_t value);
 
+/**
+ * Register read with table key lookup
+ *
+ * @param[in] p
+ *   Pipeline handle.
+ * @param[in] regarray_name
+ *   Register array name.
+ * @param[in] table_name
+ *   Regular or learner table name.
+ * @param[in] table_key
+ *   Table key.
+ * @param[out] value
+ *   Current register value.
+ * @return
+ *   0 on success or the following error codes otherwise:
+ *   -EINVAL: Invalid argument;
+ *   -ENOMEM: Not enough memory.
+ */
+__rte_experimental
+int
+rte_swx_ctl_pipeline_regarray_read_with_key(struct rte_swx_pipeline *p,
+   const char *regarray_name,
+   const char *table_name,
+   uint8_t *table_key,
+   uint64_t *value);
+
+/**
+ * Register write with table key lookup
+ *
+ * @param[in] p
+ *   Pipeline handle.
+ * @param[in] regarray_name
+ *   Register array name.
+ * @param[in] table_name
+ *   Regular or learner table name.
+ * @param[in] table_key
+ *   Table key.
+ * @param[in] value
+ *   Value to be written to the register.
+ * @return
+ *   0 on success or the following error codes otherwise:
+ *   -EINVAL: Invalid argument;
+ *   -ENOMEM: Not enough memory.
+ */
+__rte_experimental
+int
+rte_swx_ctl_pipeline_regarray_write_with_key(struct rte_swx_pipeline *p,
+const char *regarray_name,
+const char *table_name,
+uint8_t *table_key,
+uint64_t value);
+
 /*
  * Meter Array Query and Configuration API.
  */
diff --git a/lib/pipeline/rte_swx_pipeline.c b/lib/pipeline/rte_swx_pipeline.c
index ec8268b7f8..ab59e7ad79 100644
--- a/lib/pipeline/rte_swx_pipeline.c
+++ b/lib/pipeline/rte_swx_pipeline.c
@@ -8261,6 +8261,24 @@ rte_swx_pipeline_table_config(struct rte_swx_pipeline *p,
return status;
 }
 
+static uint32_t
+table_params_offset_get(struct table *table)
+{
+   struct field *first;
+   uint32_t i;
+
+   first = table->fields[0].field;
+
+   for (i = 1; i < table->n_fields; i++) {
+   struct field *f = table->fields[i].field;
+
+   if (f->offset < first->offset)
+   first = f;
+   }
+
+   return first->offset / 8;
+}
+
 static struct rte_swx_table_params *
 table_params_get(struct table *table)
 {
@@ -9217,6 +9235,24 @@ rte_swx_pipeline_learner_config(struct rte_swx_pipeline *p,
return status;
 }
 
+static uint32_t
+learner_params_offset_get(struct learner *l)
+{
+   struct field *first;
+   uint32_t i;
+
+   first = l->fields[0];
+
+   for (i = 1; i < l->n_fields; i++) {
+   struct field *f = l->fields[i];
+
+   if (f->offset < first->offset)
+   first = f;
+   }
+
+   return first->offset / 8;
+}
+
 static void
 learner_params_free(struct rte_swx_table_learner_params *params)
 {
@@ -11101,6 +11137,225 @@ rte_swx_ctl_pipeline_mirroring_session_set(struct rte_swx_pipeline *p,
return 0;
 }
 
+static int
+rte_swx_ctl_pipeline_table_lookup(struct rte_swx_pipeline *p,
+ const char *table_name,
+ uint8_t *key,
+ uint64_t *action_id,
+ uint8_t **action_data,
+ size_t *entry_id,
+ int *hit)
+{
+   struct table *t;
+   void *mailbox = NULL;
+
+   /* Check input arguments. */
+   if (!p ||
+   !p->build_done ||
+   !table_name ||
+   !table_name[0] ||
+   !key ||
+   !entry_id ||
+   !hit)
+   return -EINVAL;
+
+   /* Find the table. */
+   t = table_find(p, table_name);
+   if (!t)
+   return -EINVAL;
+
+   if (!t->type) {
+   *hit = 0;
+   return 0;
+   }
+
+   /* Setup mailbox.  */
+   if (t->type->ops.mailbox_size_get) {
+   uint64_t mailbox_size;
+
+   mailbox_size = t->type->ops.m

[PATCH V3 3/7] pipeline: add table entry ID read instruction

2022-08-30 Thread Cristian Dumitrescu
Add the entry ID instruction that reads the entry ID of the latest
table lookup operation from the pipeline into the meta-data. The entry
ID is then used by the register and meter instructions as the index
into the register or meter array.

Signed-off-by: Cristian Dumitrescu 
---
 lib/pipeline/rte_swx_pipeline.c  | 68 
 lib/pipeline/rte_swx_pipeline_internal.h | 20 +++
 2 files changed, 88 insertions(+)

diff --git a/lib/pipeline/rte_swx_pipeline.c b/lib/pipeline/rte_swx_pipeline.c
index 80108b8916..ec8268b7f8 100644
--- a/lib/pipeline/rte_swx_pipeline.c
+++ b/lib/pipeline/rte_swx_pipeline.c
@@ -2834,6 +2834,43 @@ instr_forget_exec(struct rte_swx_pipeline *p)
thread_ip_inc(p);
 }
 
+/*
+ * entryid.
+ */
+static int
+instr_entryid_translate(struct rte_swx_pipeline *p,
+   struct action *action __rte_unused,
+   char **tokens,
+   int n_tokens,
+   struct instruction *instr,
+   struct instruction_data *data __rte_unused)
+{
+   struct field *f;
+
+   CHECK(n_tokens == 2, EINVAL);
+
+   f = metadata_field_parse(p, tokens[1]);
+   CHECK(f, EINVAL);
+   CHECK(f->n_bits <= 64, EINVAL);
+
+   instr->type = INSTR_ENTRYID;
+   instr->mov.dst.n_bits = f->n_bits;
+   instr->mov.dst.offset = f->offset / 8;
+   return 0;
+}
+
+static inline void
+instr_entryid_exec(struct rte_swx_pipeline *p)
+{
+   struct thread *t = &p->threads[p->thread_id];
+   struct instruction *ip = t->ip;
+
+   __instr_entryid_exec(p, t, ip);
+
+   /* Thread. */
+   thread_ip_inc(p);
+}
+
 /*
  * extern.
  */
@@ -6336,6 +6373,14 @@ instr_translate(struct rte_swx_pipeline *p,
  instr,
  data);
 
+   if (!strcmp(tokens[tpos], "entryid"))
+   return instr_entryid_translate(p,
+  action,
+  &tokens[tpos],
+  n_tokens - tpos,
+  instr,
+  data);
+
if (!strcmp(tokens[tpos], "extern"))
return instr_extern_translate(p,
  action,
@@ -7321,6 +7366,8 @@ static instr_exec_t instruction_table[] = {
[INSTR_LEARNER_REARM] = instr_rearm_exec,
[INSTR_LEARNER_REARM_NEW] = instr_rearm_new_exec,
[INSTR_LEARNER_FORGET] = instr_forget_exec,
+   [INSTR_ENTRYID] = instr_entryid_exec,
+
[INSTR_EXTERN_OBJ] = instr_extern_obj_exec,
[INSTR_EXTERN_FUNC] = instr_extern_func_exec,
[INSTR_HASH_FUNC] = instr_hash_func_exec,
@@ -11222,6 +11269,7 @@ instr_type_to_name(struct instruction *instr)
case INSTR_LEARNER_REARM: return "INSTR_LEARNER_REARM";
case INSTR_LEARNER_REARM_NEW: return "INSTR_LEARNER_REARM_NEW";
case INSTR_LEARNER_FORGET: return "INSTR_LEARNER_FORGET";
+   case INSTR_ENTRYID: return "INSTR_ENTRYID";
 
case INSTR_EXTERN_OBJ: return "INSTR_EXTERN_OBJ";
case INSTR_EXTERN_FUNC: return "INSTR_EXTERN_FUNC";
@@ -11922,6 +11970,24 @@ instr_forget_export(struct instruction *instr, FILE *f)
instr_type_to_name(instr));
 }
 
+static void
+instr_entryid_export(struct instruction *instr, FILE *f)
+{
+   fprintf(f,
+   "\t{\n"
+   "\t\t.type = %s,\n"
+   "\t\t.mov = {\n"
+   "\t\t\t.dst = {\n"
+   "\t\t\t\t.n_bits = %u,\n"
+   "\t\t\t\t.offset = %u,\n"
+   "\t\t\t},\n"
+   "\t\t},\n"
+   "\t},\n",
+   instr_type_to_name(instr),
+   instr->mov.dst.n_bits,
+   instr->mov.dst.offset);
+}
+
 static void
 instr_extern_export(struct instruction *instr, FILE *f)
 {
@@ -12212,6 +12278,7 @@ static instruction_export_t export_table[] = {
[INSTR_LEARNER_REARM] = instr_rearm_export,
[INSTR_LEARNER_REARM_NEW] = instr_rearm_export,
[INSTR_LEARNER_FORGET] = instr_forget_export,
+   [INSTR_ENTRYID] = instr_entryid_export,
 
[INSTR_EXTERN_OBJ] = instr_extern_export,
[INSTR_EXTERN_FUNC] = instr_extern_export,
@@ -12438,6 +12505,7 @@ instr_type_to_func(struct instruction *instr)
case INSTR_LEARNER_REARM: return "__instr_rearm_exec";
case INSTR_LEARNER_REARM_NEW: return "__instr_rearm_new_exec";
case INSTR_LEARNER_FORGET: return "__instr_forget_exec";
+   case INSTR_ENTRYID: return "__instr_entryid_exec";
 
case INSTR_EXTERN_OBJ: return NULL;
case INSTR_EXTERN_FUNC: return NULL;
diff --git a/lib/pipeline/rte_swx_pipeline_internal.h b/lib/pipeline/rte_swx_pipeline_internal.h
index 8f96b67d76..335506039b 100644
--- a/lib/pipeline/rte_swx_pipeline_internal.h
+++ b/lib/pipelin

[PATCH V3 7/7] examples/pipeline: add CLI commands for direct meters

2022-08-30 Thread Cristian Dumitrescu
Add the CLI command support for managing direct meters.

Signed-off-by: Cristian Dumitrescu 
---
 examples/pipeline/cli.c  | 426 +++
 examples/pipeline/examples/meter.cli |   2 +-
 2 files changed, 312 insertions(+), 116 deletions(-)

diff --git a/examples/pipeline/cli.c b/examples/pipeline/cli.c
index 115147adfc..1426faf1f9 100644
--- a/examples/pipeline/cli.c
+++ b/examples/pipeline/cli.c
@@ -2105,8 +2105,9 @@ cmd_pipeline_meter_profile_delete(char **tokens,
 }
 
 static const char cmd_pipeline_meter_reset_help[] =
-"pipeline  meter  from  to  "
-   "reset\n";
+"pipeline  meter  reset\n"
+   "index from  to \n"
+   " | table  match  ...\n";
 
 static void
 cmd_pipeline_meter_reset(char **tokens,
@@ -2116,16 +2117,18 @@ cmd_pipeline_meter_reset(char **tokens,
void *obj __rte_unused)
 {
struct rte_swx_pipeline *p;
-   const char *name;
-   uint32_t idx0 = 0, idx1 = 0;
+   struct rte_swx_ctl_pipeline *ctl;
+   const char *pipeline_name, *name;
 
-   if (n_tokens != 9) {
+   if (n_tokens < 6) {
snprintf(out, out_size, MSG_ARG_MISMATCH, tokens[0]);
return;
}
 
-   p = rte_swx_pipeline_find(tokens[1]);
-   if (!p) {
+   pipeline_name = tokens[1];
+   p = rte_swx_pipeline_find(pipeline_name);
+   ctl = rte_swx_ctl_pipeline_find(pipeline_name);
+   if (!p || !ctl) {
snprintf(out, out_size, MSG_ARG_INVALID, "pipeline_name");
return;
}
@@ -2137,45 +2140,96 @@ cmd_pipeline_meter_reset(char **tokens,
 
name = tokens[3];
 
-   if (strcmp(tokens[4], "from")) {
-   snprintf(out, out_size, MSG_ARG_NOT_FOUND, "from");
+   if (strcmp(tokens[4], "reset")) {
+   snprintf(out, out_size, MSG_ARG_NOT_FOUND, "reset");
return;
}
 
-   if (parser_read_uint32(&idx0, tokens[5])) {
-   snprintf(out, out_size, MSG_ARG_INVALID, "index0");
-   return;
-   }
+   /* index. */
+   if (!strcmp(tokens[5], "index")) {
+   uint32_t idx0 = 0, idx1 = 0;
 
-   if (strcmp(tokens[6], "to")) {
-   snprintf(out, out_size, MSG_ARG_NOT_FOUND, "to");
-   return;
-   }
+   if (n_tokens != 10) {
+   snprintf(out, out_size, MSG_ARG_MISMATCH, tokens[0]);
+   return;
+   }
 
-   if (parser_read_uint32(&idx1, tokens[7]) || (idx1 < idx0)) {
-   snprintf(out, out_size, MSG_ARG_INVALID, "index1");
-   return;
-   }
+   if (strcmp(tokens[6], "from")) {
+   snprintf(out, out_size, MSG_ARG_NOT_FOUND, "from");
+   return;
+   }
+
+   if (parser_read_uint32(&idx0, tokens[7])) {
+   snprintf(out, out_size, MSG_ARG_INVALID, "index0");
+   return;
+   }
+
+   if (strcmp(tokens[8], "to")) {
+   snprintf(out, out_size, MSG_ARG_NOT_FOUND, "to");
+   return;
+   }
+
+   if (parser_read_uint32(&idx1, tokens[9]) || (idx1 < idx0)) {
+   snprintf(out, out_size, MSG_ARG_INVALID, "index1");
+   return;
+   }
+
+   for ( ; idx0 <= idx1; idx0++) {
+   int status;
+
+   status = rte_swx_ctl_meter_reset(p, name, idx0);
+   if (status) {
+   snprintf(out, out_size, "Command failed for index %u.\n", idx0);
+   return;
+   }
+   }
 
-   if (strcmp(tokens[8], "reset")) {
-   snprintf(out, out_size, MSG_ARG_NOT_FOUND, "reset");
return;
}
 
-   for ( ; idx0 <= idx1; idx0++) {
+   /* table. */
+   if (!strcmp(tokens[5], "table")) {
+   struct rte_swx_table_entry *entry;
+   char *table_name;
int status;
 
-   status = rte_swx_ctl_meter_reset(p, name, idx0);
+   if (n_tokens < 9) {
+   snprintf(out, out_size, MSG_ARG_MISMATCH, tokens[0]);
+   return;
+   }
+
+   table_name = tokens[6];
+
+   if (strcmp(tokens[7], "match")) {
+   snprintf(out, out_size, MSG_ARG_NOT_FOUND, "match");
+   return;
+   }
+
+   entry = parse_table_entry(ctl, table_name, &tokens[7], n_tokens - 7);
+   if (!entry) {
+   snprintf(out, out_size, "Invalid match tokens.\n");
+   return;
+   }
+
+   status = rte_swx_ctl_meter_reset_with_key(p, name, table_name, entry->key);
+   table_entry_free(entry);
if (status) {
-

[PATCH V3 5/7] pipeline: support direct meters on the control path

2022-08-30 Thread Cristian Dumitrescu
Add pipeline control path API to manage direct meters. These meters
are identified by a table key, whose entry ID is used as the index
into the meter array.
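
A usage sketch of the new calls, with hypothetical pipeline/array/table/
profile names (the key is resolved to the entry ID internally):

#include <rte_swx_ctl.h>

/* Bind the meter behind a flow entry to an existing profile, then read
 * back its statistics counters.
 */
static int
direct_meter_example(struct rte_swx_pipeline *p, uint8_t *flow_key)
{
	struct rte_swx_ctl_meter_stats stats;
	int status;

	status = rte_swx_ctl_meter_set_with_key(p, "meters", "flow_table",
						flow_key, "platinum");
	if (status)
		return status;

	return rte_swx_ctl_meter_stats_read_with_key(p, "meters", "flow_table",
						     flow_key, &stats);
}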

Signed-off-by: Cristian Dumitrescu 
---
 lib/pipeline/rte_swx_ctl.h  | 81 +
 lib/pipeline/rte_swx_pipeline.c | 50 
 lib/pipeline/version.map|  3 ++
 3 files changed, 134 insertions(+)

diff --git a/lib/pipeline/rte_swx_ctl.h b/lib/pipeline/rte_swx_ctl.h
index 1b47820441..2eb51b2c76 100644
--- a/lib/pipeline/rte_swx_ctl.h
+++ b/lib/pipeline/rte_swx_ctl.h
@@ -1440,6 +1440,87 @@ rte_swx_ctl_meter_stats_read(struct rte_swx_pipeline *p,
 uint32_t metarray_index,
 struct rte_swx_ctl_meter_stats *stats);
 
+/**
+ * Meter reset with table key lookup
+ *
+ * Reset a meter within a given meter array to use the default profile that
+ * causes all the input packets to be colored as green. It is the responsibility
+ * of the control plane to make sure this meter is not used by the data plane
+ * pipeline before calling this function.
+ *
+ * @param[in] p
+ *   Pipeline handle.
+ * @param[in] metarray_name
+ *   Meter array name.
+ * @param[in] table_name
+ *   Regular or learner table name.
+ * @param[in] table_key
+ *   Table key.
+ * @return
+ *   0 on success or the following error codes otherwise:
+ *   -EINVAL: Invalid argument.
+ */
+__rte_experimental
+int
+rte_swx_ctl_meter_reset_with_key(struct rte_swx_pipeline *p,
+const char *metarray_name,
+const char *table_name,
+uint8_t *table_key);
+
+/**
+ * Meter set with table key lookup
+ *
+ * Set a meter within a given meter array to use a specific profile. It is the
+ * responsibility of the control plane to make sure this meter is not used by
+ * the data plane pipeline before calling this function.
+ *
+ * @param[in] p
+ *   Pipeline handle.
+ * @param[in] metarray_name
+ *   Meter array name.
+ * @param[in] table_name
+ *   Regular or learner table name.
+ * @param[in] table_key
+ *   Table key.
+ * @param[in] profile_name
+ *   Existing meter profile name.
+ * @return
+ *   0 on success or the following error codes otherwise:
+ *   -EINVAL: Invalid argument.
+ */
+__rte_experimental
+int
+rte_swx_ctl_meter_set_with_key(struct rte_swx_pipeline *p,
+  const char *metarray_name,
+  const char *table_name,
+  uint8_t *table_key,
+  const char *profile_name);
+
+/**
+ * Meter statistics counters read with table key lookup
+ *
+ * @param[in] p
+ *   Pipeline handle.
+ * @param[in] metarray_name
+ *   Meter array name.
+ * @param[in] table_name
+ *   Regular or learner table name.
+ * @param[in] table_key
+ *   Table key.
+ * @param[out] stats
+ *   Meter statistics counters.
+ * @return
+ *   0 on success or the following error codes otherwise:
+ *   -EINVAL: Invalid argument.
+ */
+__rte_experimental
+int
+rte_swx_ctl_meter_stats_read_with_key(struct rte_swx_pipeline *p,
+ const char *metarray_name,
+ const char *table_name,
+ uint8_t *table_key,
+ struct rte_swx_ctl_meter_stats *stats);
+
 /**
  * Pipeline control free
  *
diff --git a/lib/pipeline/rte_swx_pipeline.c b/lib/pipeline/rte_swx_pipeline.c
index ab59e7ad79..232dafb95e 100644
--- a/lib/pipeline/rte_swx_pipeline.c
+++ b/lib/pipeline/rte_swx_pipeline.c
@@ -11356,6 +11356,56 @@ rte_swx_ctl_pipeline_regarray_write_with_key(struct rte_swx_pipeline *p,
	return rte_swx_ctl_pipeline_regarray_write(p, regarray_name, entry_id, value);
 }
 
+int
+rte_swx_ctl_meter_reset_with_key(struct rte_swx_pipeline *p,
+const char *metarray_name,
+const char *table_name,
+uint8_t *table_key)
+{
+   size_t entry_id = 0;
+   int status;
+
+   status = rte_swx_ctl_pipeline_table_entry_id_get(p, table_name, table_key, &entry_id);
+   if (status)
+   return status;
+
+   return rte_swx_ctl_meter_reset(p, metarray_name, entry_id);
+}
+
+int
+rte_swx_ctl_meter_set_with_key(struct rte_swx_pipeline *p,
+  const char *metarray_name,
+  const char *table_name,
+  uint8_t *table_key,
+  const char *profile_name)
+{
+   size_t entry_id = 0;
+   int status;
+
+   status = rte_swx_ctl_pipeline_table_entry_id_get(p, table_name, table_key, &entry_id);
+   if (status)
+   return status;
+
+   return rte_swx_ctl_meter_set(p, metarray_name, entry_id, profile_name);
+}
+
+int
+rte_swx_ctl_meter_stats_read_with_key(struct rte_swx_pipeline *p,
+

[PATCH V3 6/7] examples/pipeline: add CLI commands for direct registers

2022-08-30 Thread Cristian Dumitrescu
Add the CLI command support for reading/writing direct registers.

Signed-off-by: Cristian Dumitrescu 
---
 examples/pipeline/cli.c | 228 +---
 1 file changed, 192 insertions(+), 36 deletions(-)

diff --git a/examples/pipeline/cli.c b/examples/pipeline/cli.c
index 2e69698031..115147adfc 100644
--- a/examples/pipeline/cli.c
+++ b/examples/pipeline/cli.c
@@ -142,6 +142,54 @@ is_comment(char *in)
return 0;
 }
 
+static void
+table_entry_free(struct rte_swx_table_entry *entry)
+{
+   if (!entry)
+   return;
+
+   free(entry->key);
+   free(entry->key_mask);
+   free(entry->action_data);
+   free(entry);
+}
+
+static struct rte_swx_table_entry *
+parse_table_entry(struct rte_swx_ctl_pipeline *p,
+ char *table_name,
+ char **tokens,
+ uint32_t n_tokens)
+{
+   struct rte_swx_table_entry *entry;
+   char *line;
+   uint32_t i;
+
+   /* Buffer allocation. */
+   line = malloc(MAX_LINE_SIZE);
+   if (!line)
+   return NULL;
+
+   /* Copy tokens to buffer. Since the tokens were initially part of a buffer of size
+    * MAX_LINE_LENGTH, it is guaranteed that putting back some of them into a buffer of the
+    * same size separated by a single space will not result in buffer overrun.
+    */
+   line[0] = 0;
+   for (i = 0; i < n_tokens; i++) {
+   if (i)
+   strcat(line, " ");
+
+   strcat(line, tokens[i]);
+   }
+
+   /* Read the table entry from the input buffer. */
+   entry = rte_swx_ctl_pipeline_table_entry_read(p, table_name, line, NULL);
+
+   /* Buffer free. */
+   free(line);
+
+   return entry;
+}
+
 static const char cmd_mempool_help[] =
 "mempool \n"
 "   buffer \n"
@@ -732,18 +780,6 @@ cmd_pipeline_build(char **tokens,
fclose(iospec_file);
 }
 
-static void
-table_entry_free(struct rte_swx_table_entry *entry)
-{
-   if (!entry)
-   return;
-
-   free(entry->key);
-   free(entry->key_mask);
-   free(entry->action_data);
-   free(entry);
-}
-
 static int
 pipeline_table_entries_add(struct rte_swx_ctl_pipeline *p,
   const char *table_name,
@@ -1710,7 +1746,9 @@ cmd_pipeline_abort(char **tokens,
 }
 
 static const char cmd_pipeline_regrd_help[] =
-"pipeline  regrd  \n";
+"pipeline  regrd \n"
+   "index \n"
+   " | table  match  ...\n";
 
 static void
 cmd_pipeline_regrd(char **tokens,
@@ -1720,18 +1758,20 @@ cmd_pipeline_regrd(char **tokens,
void *obj __rte_unused)
 {
struct rte_swx_pipeline *p;
-   const char *name;
+   struct rte_swx_ctl_pipeline *ctl;
+   const char *pipeline_name, *name;
uint64_t value;
-   uint32_t idx;
int status;
 
-   if (n_tokens != 5) {
+   if (n_tokens < 5) {
snprintf(out, out_size, MSG_ARG_MISMATCH, tokens[0]);
return;
}
 
-   p = rte_swx_pipeline_find(tokens[1]);
-   if (!p) {
+   pipeline_name = tokens[1];
+   p = rte_swx_pipeline_find(pipeline_name);
+   ctl = rte_swx_ctl_pipeline_find(pipeline_name);
+   if (!p || !ctl) {
snprintf(out, out_size, MSG_ARG_INVALID, "pipeline_name");
return;
}
@@ -1743,22 +1783,77 @@ cmd_pipeline_regrd(char **tokens,
 
name = tokens[3];
 
-   if (parser_read_uint32(&idx, tokens[4])) {
-   snprintf(out, out_size, MSG_ARG_INVALID, "index");
+   /* index. */
+   if (!strcmp(tokens[4], "index")) {
+   uint32_t idx;
+
+   if (n_tokens != 6) {
+   snprintf(out, out_size, MSG_ARG_MISMATCH, tokens[0]);
+   return;
+   }
+
+   if (parser_read_uint32(&idx, tokens[5])) {
+   snprintf(out, out_size, MSG_ARG_INVALID, "index");
+   return;
+   }
+
+   status = rte_swx_ctl_pipeline_regarray_read(p, name, idx, &value);
+   if (status) {
+   snprintf(out, out_size, "Command failed.\n");
+   return;
+   }
+
+   snprintf(out, out_size, "0x%" PRIx64 "\n", value);
return;
}
 
-   status = rte_swx_ctl_pipeline_regarray_read(p, name, idx, &value);
-   if (status) {
-   snprintf(out, out_size, "Command failed.\n");
+   /* table. */
+   if (!strcmp(tokens[4], "table")) {
+   struct rte_swx_table_entry *entry;
+   char *table_name;
+
+   if (n_tokens < 8) {
+   snprintf(out, out_size, MSG_ARG_MISMATCH, tokens[0]);
+   return;
+   }
+
+   table_name = tokens[5];
+
+   if (strcmp(tokens[6], "match")) {
+   snprintf(out, out_size, MSG_ARG_NOT_FOUND, "

RE: [PATCH v5 2/7] bbdev: add device status info

2022-08-30 Thread Chautru, Nicolas
Hi Maxime, 

> -Original Message-
> From: Maxime Coquelin 
> Sent: Tuesday, August 30, 2022 12:09 AM
> To: Chautru, Nicolas ; dev@dpdk.org;
> tho...@monjalon.net; gak...@marvell.com; hemant.agra...@nxp.com
> Cc: t...@redhat.com; m...@ashroe.eu; Richardson, Bruce
> ; david.march...@redhat.com;
> step...@networkplumber.org
> Subject: Re: [PATCH v5 2/7] bbdev: add device status info
> 
> 
> 
> On 8/29/22 18:10, Chautru, Nicolas wrote:
> > Hi Maxime,
> >
> >> -Original Message-
> >> From: Maxime Coquelin 
> >> Sent: Friday, August 26, 2022 3:13 AM
> >> To: Chautru, Nicolas ; dev@dpdk.org;
> >> tho...@monjalon.net; gak...@marvell.com; hemant.agra...@nxp.com
> >> Cc: t...@redhat.com; m...@ashroe.eu; Richardson, Bruce
> >> ; david.march...@redhat.com;
> >> step...@networkplumber.org
> >> Subject: Re: [PATCH v5 2/7] bbdev: add device status info
> >>
> >> Hi,
> >>
> >> On 8/25/22 20:30, Chautru, Nicolas wrote:
> >>> Thanks Maxime,
> >>>
>  -Original Message-
>  From: Maxime Coquelin 
>  Sent: Thursday, August 25, 2022 7:19 AM
>  To: Chautru, Nicolas ; dev@dpdk.org;
>  tho...@monjalon.net; gak...@marvell.com;
> hemant.agra...@nxp.com
>  Cc: t...@redhat.com; m...@ashroe.eu; Richardson, Bruce
>  ; david.march...@redhat.com;
>  step...@networkplumber.org
>  Subject: Re: [PATCH v5 2/7] bbdev: add device status info
> 
> 
> 
>  On 7/7/22 01:28, Nicolas Chautru wrote:
> > Added device status information, so that the PMD can expose
> > information related to the underlying accelerator device status.
> > Minor order change in structure to fit into padding hole.
> >
> > Signed-off-by: Nicolas Chautru 
> > ---
> > drivers/baseband/acc100/rte_acc100_pmd.c   |  1 +
> > drivers/baseband/fpga_5gnr_fec/rte_fpga_5gnr_fec.c |  1 +
> > drivers/baseband/fpga_lte_fec/fpga_lte_fec.c   |  1 +
> > drivers/baseband/la12xx/bbdev_la12xx.c |  1 +
> > drivers/baseband/null/bbdev_null.c |  1 +
> > drivers/baseband/turbo_sw/bbdev_turbo_software.c   |  1 +
> > lib/bbdev/rte_bbdev.c  | 22 
> > ++
> > lib/bbdev/rte_bbdev.h  | 35
> --
> > lib/bbdev/version.map  |  6 
> > 9 files changed, 67 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/baseband/acc100/rte_acc100_pmd.c
> > b/drivers/baseband/acc100/rte_acc100_pmd.c
> > index de7e4bc..17ba798 100644
> > --- a/drivers/baseband/acc100/rte_acc100_pmd.c
> > +++ b/drivers/baseband/acc100/rte_acc100_pmd.c
> > @@ -1060,6 +1060,7 @@
> >
> > /* Read and save the populated config from ACC100
> registers */
> > fetch_acc100_config(dev);
> > +   dev_info->device_status =
> RTE_BBDEV_DEV_NOT_SUPPORTED;
> >
> > /* This isn't ideal because it reports the maximum number of
> > queues
>  but
> >  * does not provide info on how many can be
> uplink/downlink
> > or different diff --git
> > a/drivers/baseband/fpga_5gnr_fec/rte_fpga_5gnr_fec.c
> > b/drivers/baseband/fpga_5gnr_fec/rte_fpga_5gnr_fec.c
> > index 82ae6ba..57b12af 100644
> > --- a/drivers/baseband/fpga_5gnr_fec/rte_fpga_5gnr_fec.c
> > +++ b/drivers/baseband/fpga_5gnr_fec/rte_fpga_5gnr_fec.c
> > @@ -369,6 +369,7 @@
> > dev_info->capabilities = bbdev_capabilities;
> > dev_info->cpu_flag_reqs = NULL;
> > dev_info->data_endianness = RTE_LITTLE_ENDIAN;
> > +   dev_info->device_status =
> RTE_BBDEV_DEV_NOT_SUPPORTED;
> >
> > /* Calculates number of queues assigned to device */
> > dev_info->max_num_queues = 0; diff --git
> > a/drivers/baseband/fpga_lte_fec/fpga_lte_fec.c
> > b/drivers/baseband/fpga_lte_fec/fpga_lte_fec.c
> > index 21d3529..2a330c4 100644
> > --- a/drivers/baseband/fpga_lte_fec/fpga_lte_fec.c
> > +++ b/drivers/baseband/fpga_lte_fec/fpga_lte_fec.c
> > @@ -645,6 +645,7 @@ struct __rte_cache_aligned fpga_queue {
> > dev_info->capabilities = bbdev_capabilities;
> > dev_info->cpu_flag_reqs = NULL;
> > dev_info->data_endianness = RTE_LITTLE_ENDIAN;
> > +   dev_info->device_status =
> RTE_BBDEV_DEV_NOT_SUPPORTED;
> >
> > /* Calculates number of queues assigned to device */
> > dev_info->max_num_queues = 0; diff --git
> > a/drivers/baseband/la12xx/bbdev_la12xx.c
> > b/drivers/baseband/la12xx/bbdev_la12xx.c
> > index 4d1bd16..c1f88c6 100644
> > --- a/drivers/baseband/la12xx/bbdev_la12xx.c
> > +++ b/drivers/baseband/la12xx/bbdev_la12xx.c
> > @@ -100,6 +100,7 @@ struct bbdev_la12xx_params {
> > dev_info->capabilities = bbdev_capabilities;
> > dev_info->cpu_flag_reqs = NULL

RE: [PATCH v1 00/10] baseband/acc200

2022-08-30 Thread Chautru, Nicolas
Hi Maxime, 

> -Original Message-
> From: Maxime Coquelin 
> Sent: Tuesday, August 30, 2022 12:45 AM
> To: Chautru, Nicolas ; dev@dpdk.org;
> tho...@monjalon.net; gak...@marvell.com; hemant.agra...@nxp.com;
> t...@redhat.com; Vargas, Hernan 
> Cc: m...@ashroe.eu; Richardson, Bruce ;
> david.march...@redhat.com; step...@networkplumber.org
> Subject: Re: [PATCH v1 00/10] baseband/acc200
> 
> Hi Nicolas,
> 
> On 7/12/22 15:48, Maxime Coquelin wrote:
> > Hi Nicolas, Hernan,
> >
> > (Adding Hernan in the recipients list)
> >
> > On 7/8/22 02:01, Nicolas Chautru wrote:
> >> This is targeting 22.11 and includes the PMD for the integrated
> >> accelerator on Intel Xeon SPR-EEC.
> >> There is a dependency on that parallel serie still in-flight which
> >> extends the bbdev api
> >> https://patches.dpdk.org/project/dpdk/list/?series=23894
> >>
> >> I will be offline for a few weeks for the summer break but Hernan
> >> will cover for me during that time if required.
> >>
> >> Thanks
> >> Nic
> >>
> >> Nicolas Chautru (10):
> >>    baseband/acc200: introduce PMD for ACC200
> >>    baseband/acc200: add HW register definitions
> >>    baseband/acc200: add info get function
> >>    baseband/acc200: add queue configuration
> >>    baseband/acc200: add LDPC processing functions
> >>    baseband/acc200: add LTE processing functions
> >>    baseband/acc200: add support for FFT operations
> >>    baseband/acc200: support interrupt
> >>    baseband/acc200: add device status and vf2pf comms
> >>    baseband/acc200: add PF configure companion function
> >>
> >>   MAINTAINERS  |    3 +
> >>   app/test-bbdev/meson.build   |    3 +
> >>   app/test-bbdev/test_bbdev_perf.c |   76 +
> >>   doc/guides/bbdevs/acc200.rst |  244 ++
> >>   doc/guides/bbdevs/index.rst  |    1 +
> >>   drivers/baseband/acc200/acc200_pf_enum.h |  468 +++
> >>   drivers/baseband/acc200/acc200_pmd.h |  690 
> >>   drivers/baseband/acc200/acc200_vf_enum.h |   89 +
> >>   drivers/baseband/acc200/meson.build  |    8 +
> >>   drivers/baseband/acc200/rte_acc200_cfg.h |  115 +
> >>   drivers/baseband/acc200/rte_acc200_pmd.c | 5403
> >> ++
> >>   drivers/baseband/acc200/version.map  |   10 +
> >>   drivers/baseband/meson.build |    1 +
> >>   13 files changed, 7111 insertions(+)
> >>   create mode 100644 doc/guides/bbdevs/acc200.rst
> >>   create mode 100644 drivers/baseband/acc200/acc200_pf_enum.h
> >>   create mode 100644 drivers/baseband/acc200/acc200_pmd.h
> >>   create mode 100644 drivers/baseband/acc200/acc200_vf_enum.h
> >>   create mode 100644 drivers/baseband/acc200/meson.build
> >>   create mode 100644 drivers/baseband/acc200/rte_acc200_cfg.h
> >>   create mode 100644 drivers/baseband/acc200/rte_acc200_pmd.c
> >>   create mode 100644 drivers/baseband/acc200/version.map
> >>
> >
> > Comparing ACC200 & ACC100 header files, I understand ACC200 is an
> > evolution of the ACC10x family. The FEC bits are really close, ACC200
> > main addition seems to be FFT acceleration which could be handled in
> > ACC10x driver based on device ID.
> >
> > I think both drivers have to be merged in order to avoid code
> > duplication. That's how other families of devices (e.g. i40e) are
> > handled.
> 
> I haven't seen your reply on this point.
> Do you confirm you are working on a single driver for ACC family in order to
> avoid code duplication?
> 

The implementation is based on distinct ACC100 and ACC200 drivers. The two
devices are fundamentally different generations, processes and IP.
MountBryce is an eASIC device over PCIe, while ACC200 is an integrated
accelerator on the Xeon CPU.
The actual implementations are not the same; the underlying IP is entirely
distinct even if many of the descriptor formats have similarities.
The actual capabilities of the acceleration are different and/or new.
The workarounds and silicon errata are also different, causing different
limitations and implementations in the driver (see the series with ongoing
changes for ACC100 in parallel).
This is fundamentally distinct from ACC101, which was a derivative product
from ACC100 and where it made sense to share the implementation between
ACC100 and ACC101.
So in a nutshell these two devices and drivers are two different beasts, and
the intention is to keep them intentionally separate as in this series.
Let me know if unclear, thanks!

Thanks
Nic


> Maxime
> 
> > Thanks,
> > Maxime



[PATCH v2] net/mlx5: use just sufficient barrier for Arm platforms

2022-08-30 Thread Honnappa Nagarahalli
cqe->op_own indicates if the CQE is owned by the NIC. The rest of
the fields in CQE should be read only after op_own is read. On Arm
platforms using "dmb ishld" is sufficient to enforce this.
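
In minimal form, the pattern being enforced looks like this — a sketch only,
with byte_cnt standing in for any CQE field read after the ownership check:

#include <rte_atomic.h>
#include <rte_byteorder.h>

/* Read op_own first, then fence with acquire semantics so that later CQE
 * field reads cannot be speculated ahead of the ownership check. On Arm
 * the fence compiles to "dmb ishld"; rte_io_rmb() was stronger than needed.
 */
static inline uint32_t
cqe_byte_cnt_read(volatile struct mlx5_cqe *cqe)
{
	uint8_t op_own = cqe->op_own;	/* ownership/opcode byte */

	RTE_SET_USED(op_own);		/* checked by the caller in practice */
	rte_atomic_thread_fence(__ATOMIC_ACQUIRE);

	return rte_be_to_cpu_32(cqe->byte_cnt);	/* safe to read now */
}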

Fixes: 88c0733535d6 ("net/mlx5: extend Rx completion with error handling")
Cc: ma...@mellanox.com
Cc: sta...@dpdk.org

Signed-off-by: Honnappa Nagarahalli 
Reviewed-by: Ruifeng Wang 
---
 drivers/common/mlx5/mlx5_common.h | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/common/mlx5/mlx5_common.h b/drivers/common/mlx5/mlx5_common.h
index 5028a05b49..ac2e85b15f 100644
--- a/drivers/common/mlx5/mlx5_common.h
+++ b/drivers/common/mlx5/mlx5_common.h
@@ -195,7 +195,11 @@ check_cqe(volatile struct mlx5_cqe *cqe, const uint16_t cqes_n,
 
if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID)))
return MLX5_CQE_STATUS_HW_OWN;
-   rte_io_rmb();
+   /* Prevent speculative reading of other fields in CQE until
+* CQE is valid.
+*/
+   rte_atomic_thread_fence(__ATOMIC_ACQUIRE);
+
if (unlikely(op_code == MLX5_CQE_RESP_ERR ||
 op_code == MLX5_CQE_REQ_ERR))
return MLX5_CQE_STATUS_ERR;
-- 
2.17.1



[Patch v6 00/18] Introduce Microsoft Azure Network Adapter (MANA) PMD

2022-08-30 Thread longli
From: Long Li 

MANA is a network interface card to be used in the Azure cloud environment.
MANA provides safe access to user memory through memory registration. It has
IOMMU built into the hardware.

MANA uses IB verbs and RDMA layer to configure hardware resources. It
requires the corresponding RDMA kernel-mode and user-mode drivers.

The MANA RDMA kernel-mode driver is being reviewed at:
https://patchwork.kernel.org/project/netdevbpf/cover/1655345240-26411-1-git-send-email-lon...@linuxonhyperv.com/

The MANA RDMA user-mode driver is being reviewed at:
https://github.com/linux-rdma/rdma-core/pull/1177


Long Li (18):
  net/mana: add basic driver, build environment and doc
  net/mana: add device configuration and stop
  net/mana: add function to report support ptypes
  net/mana: add link update
  net/mana: add function for device removal interrupts
  net/mana: add device info
  net/mana: add function to configure RSS
  net/mana: add function to configure RX queues
  net/mana: add function to configure TX queues
  net/mana: implement memory registration
  net/mana: implement the hardware layer operations
  net/mana: add function to start/stop TX queues
  net/mana: add function to start/stop RX queues
  net/mana: add function to receive packets
  net/mana: add function to send packets
  net/mana: add function to start/stop device
  net/mana: add function to report queue stats
  net/mana: add function to support RX interrupts

 MAINTAINERS   |6 +
 doc/guides/nics/features/mana.ini |   21 +
 doc/guides/nics/index.rst |1 +
 doc/guides/nics/mana.rst  |   66 ++
 drivers/net/mana/gdma.c   |  289 ++
 drivers/net/mana/mana.c   | 1449 +
 drivers/net/mana/mana.h   |  553 +++
 drivers/net/mana/meson.build  |   48 +
 drivers/net/mana/mp.c |  323 +++
 drivers/net/mana/mr.c |  324 +++
 drivers/net/mana/rx.c |  519 +++
 drivers/net/mana/tx.c |  405 
 drivers/net/mana/version.map  |3 +
 drivers/net/meson.build   |1 +
 14 files changed, 4008 insertions(+)
 create mode 100644 doc/guides/nics/features/mana.ini
 create mode 100644 doc/guides/nics/mana.rst
 create mode 100644 drivers/net/mana/gdma.c
 create mode 100644 drivers/net/mana/mana.c
 create mode 100644 drivers/net/mana/mana.h
 create mode 100644 drivers/net/mana/meson.build
 create mode 100644 drivers/net/mana/mp.c
 create mode 100644 drivers/net/mana/mr.c
 create mode 100644 drivers/net/mana/rx.c
 create mode 100644 drivers/net/mana/tx.c
 create mode 100644 drivers/net/mana/version.map

-- 
2.17.1



[Patch v6 01/18] net/mana: add basic driver, build environment and doc

2022-08-30 Thread longli
From: Long Li 

MANA is a PCI device. It uses IB verbs to access hardware through the
kernel RDMA layer. This patch introduces build environment and basic
device probe functions.

Signed-off-by: Long Li 
---
Change log:
v2:
Fix typos.
Make the driver build only on x86-64 and Linux.
Remove unused header files.
Change port definition to uint16_t or uint8_t (for IB).
Use getline() in place of fgets() to read and truncate a line.
v3:
Add meson build check for required functions from RDMA direct verb header file
v4:
Remove extra "\n" in logging code.
Use "r" in place of "rb" in fopen() to read text files.

 MAINTAINERS   |   6 +
 doc/guides/nics/features/mana.ini |  10 +
 doc/guides/nics/index.rst |   1 +
 doc/guides/nics/mana.rst  |  66 +++
 drivers/net/mana/mana.c   | 704 ++
 drivers/net/mana/mana.h   | 210 +
 drivers/net/mana/meson.build  |  44 ++
 drivers/net/mana/mp.c | 235 ++
 drivers/net/mana/version.map  |   3 +
 drivers/net/meson.build   |   1 +
 10 files changed, 1280 insertions(+)
 create mode 100644 doc/guides/nics/features/mana.ini
 create mode 100644 doc/guides/nics/mana.rst
 create mode 100644 drivers/net/mana/mana.c
 create mode 100644 drivers/net/mana/mana.h
 create mode 100644 drivers/net/mana/meson.build
 create mode 100644 drivers/net/mana/mp.c
 create mode 100644 drivers/net/mana/version.map

diff --git a/MAINTAINERS b/MAINTAINERS
index 18d9edaf88..b8bda48a33 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -837,6 +837,12 @@ F: buildtools/options-ibverbs-static.sh
 F: doc/guides/nics/mlx5.rst
 F: doc/guides/nics/features/mlx5.ini
 
+Microsoft mana
+M: Long Li 
+F: drivers/net/mana
+F: doc/guides/nics/mana.rst
+F: doc/guides/nics/features/mana.ini
+
 Microsoft vdev_netvsc - EXPERIMENTAL
 M: Matan Azrad 
 F: drivers/net/vdev_netvsc/
diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
new file mode 100644
index 00..b92a27374c
--- /dev/null
+++ b/doc/guides/nics/features/mana.ini
@@ -0,0 +1,10 @@
+;
+; Supported features of the 'mana' network poll mode driver.
+;
+; Refer to default.ini for the full list of available PMD features.
+;
+[Features]
+Linux= Y
+Multiprocess aware   = Y
+Usage doc= Y
+x86-64   = Y
diff --git a/doc/guides/nics/index.rst b/doc/guides/nics/index.rst
index 1c94caccea..2725d1d9f0 100644
--- a/doc/guides/nics/index.rst
+++ b/doc/guides/nics/index.rst
@@ -41,6 +41,7 @@ Network Interface Controller Drivers
 intel_vf
 kni
 liquidio
+mana
 memif
 mlx4
 mlx5
diff --git a/doc/guides/nics/mana.rst b/doc/guides/nics/mana.rst
new file mode 100644
index 00..40e18fe810
--- /dev/null
+++ b/doc/guides/nics/mana.rst
@@ -0,0 +1,66 @@
+..  SPDX-License-Identifier: BSD-3-Clause
+Copyright 2022 Microsoft Corporation
+
+MANA poll mode driver library
+=============================
+
+The MANA poll mode driver library (**librte_net_mana**) implements support
+for Microsoft Azure Network Adapter VF in SR-IOV context.
+
+Features
+--------
+
+Features of the MANA Ethdev PMD are:
+
+Prerequisites
+-------------
+
+This driver relies on external libraries and kernel drivers for resources
+allocations and initialization. The following dependencies are not part of
+DPDK and must be installed separately:
+
+- **libibverbs** (provided by rdma-core package)
+
+  User space verbs framework used by librte_net_mana. This library provides
+  a generic interface between the kernel and low-level user space drivers
+  such as libmana.
+
+  It allows slow and privileged operations (context initialization, hardware
+  resources allocations) to be managed by the kernel and fast operations to
+  never leave user space.
+
+- **libmana** (provided by rdma-core package)
+
+  Low-level user space driver library for Microsoft Azure Network Adapter
+  devices, it is automatically loaded by libibverbs.
+
+- **Kernel modules**
+
+  They provide the kernel-side verbs API and low level device drivers that
+  manage actual hardware initialization and resources sharing with user
+  space processes.
+
+  Unlike most other PMDs, these modules must remain loaded and bound to
+  their devices:
+
+  - mana: Ethernet device driver that provides kernel network interfaces.
+  - mana_ib: InifiniBand device driver.
+  - ib_uverbs: user space driver for verbs (entry point for libibverbs).
+
+Driver compilation and testing
+--
+
+Refer to the document :ref:`compiling and testing a PMD for a NIC `
+for details.
+
+MANA PMD arguments
+------------------
+
+The user can specify the below argument in devargs.
+
+#.  ``mac``:
+
+Specify the MAC address for this device. If it is set, the driver
+probes and loads the NIC with a matching mac address. If it is not
+set, the driver probes on all the NICs on the PCI device. The default
+value is not set, meaning all 

[Patch v6 02/18] net/mana: add device configuration and stop

2022-08-30 Thread longli
From: Long Li 

MANA defines its memory allocation functions to override IB layer default
functions to allocate device queues. This patch adds the code for device
configuration and stop.

Signed-off-by: Long Li 
---
Change log:
v2:
Removed validation for offload settings in mana_dev_configure().

 drivers/net/mana/mana.c | 75 +++--
 drivers/net/mana/mana.h |  3 ++
 2 files changed, 76 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index cb59eb6882..147ab144d5 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
 static rte_spinlock_t mana_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
 int mana_logtype_driver;
 int mana_logtype_init;
 
+void *mana_alloc_verbs_buf(size_t size, void *data)
+{
+   void *ret;
+   size_t alignment = rte_mem_page_size();
+   int socket = (int)(uintptr_t)data;
+
+   DRV_LOG(DEBUG, "size=%zu socket=%d", size, socket);
+
+   if (alignment == (size_t)-1) {
+   DRV_LOG(ERR, "Failed to get mem page size");
+   rte_errno = ENOMEM;
+   return NULL;
+   }
+
+   ret = rte_zmalloc_socket("mana_verb_buf", size, alignment, socket);
+   if (!ret && size)
+   rte_errno = ENOMEM;
+   return ret;
+}
+
+void mana_free_verbs_buf(void *ptr, void *data __rte_unused)
+{
+   rte_free(ptr);
+}
+
+static int mana_dev_configure(struct rte_eth_dev *dev)
+{
+   struct mana_priv *priv = dev->data->dev_private;
+   struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
+
+   if (dev_conf->rxmode.mq_mode & ETH_MQ_RX_RSS_FLAG)
+   dev_conf->rxmode.offloads |= DEV_RX_OFFLOAD_RSS_HASH;
+
+   if (dev->data->nb_rx_queues != dev->data->nb_tx_queues) {
+   DRV_LOG(ERR, "Only support equal number of rx/tx queues");
+   return -EINVAL;
+   }
+
+   if (!rte_is_power_of_2(dev->data->nb_rx_queues)) {
+   DRV_LOG(ERR, "number of TX/RX queues must be power of 2");
+   return -EINVAL;
+   }
+
+   priv->num_queues = dev->data->nb_rx_queues;
+
+   manadv_set_context_attr(priv->ib_ctx, MANADV_CTX_ATTR_BUF_ALLOCATORS,
+   (void *)((uintptr_t)&(struct manadv_ctx_allocators){
+   .alloc = &mana_alloc_verbs_buf,
+   .free = &mana_free_verbs_buf,
+   .data = 0,
+   }));
+
+   return 0;
+}
+
+static int
+mana_dev_close(struct rte_eth_dev *dev)
+{
+   struct mana_priv *priv = dev->data->dev_private;
+   int ret;
+
+   ret = ibv_close_device(priv->ib_ctx);
+   if (ret) {
+   ret = errno;
+   return ret;
+   }
+
+   return 0;
+}
+
 const struct eth_dev_ops mana_dev_ops = {
+   .dev_configure  = mana_dev_configure,
+   .dev_close  = mana_dev_close,
 };
 
 const struct eth_dev_ops mana_dev_sec_ops = {
@@ -627,8 +699,7 @@ static int mana_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 
 static int mana_dev_uninit(struct rte_eth_dev *dev)
 {
-   RTE_SET_USED(dev);
-   return 0;
+   return mana_dev_close(dev);
 }
 
 static int mana_pci_remove(struct rte_pci_device *pci_dev)
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index e30c030b4e..66873394b9 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -207,4 +207,7 @@ int mana_mp_req_verbs_cmd_fd(struct rte_eth_dev *dev);
 
 void mana_mp_req_on_rxtx(struct rte_eth_dev *dev, enum mana_mp_req_type type);
 
+void *mana_alloc_verbs_buf(size_t size, void *data);
+void mana_free_verbs_buf(void *ptr, void *data __rte_unused);
+
 #endif
-- 
2.17.1



[Patch v6 03/18] net/mana: add function to report support ptypes

2022-08-30 Thread longli
From: Long Li 

Report supported protocol types.

Signed-off-by: Long Li 
---
 drivers/net/mana/mana.c | 16 
 drivers/net/mana/mana.h |  2 --
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 147ab144d5..4559632056 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -110,9 +110,25 @@ mana_dev_close(struct rte_eth_dev *dev)
return 0;
 }
 
+static const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev __rte_unused)
+{
+   static const uint32_t ptypes[] = {
+   RTE_PTYPE_L2_ETHER,
+   RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
+   RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
+   RTE_PTYPE_L4_FRAG,
+   RTE_PTYPE_L4_TCP,
+   RTE_PTYPE_L4_UDP,
+   RTE_PTYPE_UNKNOWN
+   };
+
+   return ptypes;
+}
+
 const struct eth_dev_ops mana_dev_ops = {
.dev_configure  = mana_dev_configure,
.dev_close  = mana_dev_close,
+   .dev_supported_ptypes_get = mana_supported_ptypes,
 };
 
 const struct eth_dev_ops mana_dev_sec_ops = {
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index 66873394b9..c433940022 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -168,8 +168,6 @@ extern int mana_logtype_init;
 
 #define PMD_INIT_FUNC_TRACE() PMD_INIT_LOG(DEBUG, " >>")
 
-const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev);
-
 uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
   uint16_t pkts_n);
 
-- 
2.17.1
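
As a usage sketch (the port id is a placeholder), an application can query
the list advertised above with the standard ethdev call:

#include <stdio.h>
#include <rte_common.h>
#include <rte_ethdev.h>
#include <rte_mbuf_ptype.h>

static void dump_ptypes(uint16_t port_id)
{
	uint32_t ptypes[16];
	int i, n;

	n = rte_eth_dev_get_supported_ptypes(port_id, RTE_PTYPE_ALL_MASK,
					     ptypes, RTE_DIM(ptypes));
	if (n > (int)RTE_DIM(ptypes))
		n = RTE_DIM(ptypes);	/* more types than the buffer holds */
	for (i = 0; i < n; i++)
		printf("ptype 0x%08x\n", ptypes[i]);
}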



[Patch v6 04/18] net/mana: add link update

2022-08-30 Thread longli
From: Long Li 

The carrier state is managed by the Azure host. MANA runs as a VF and
always reports "up".

Signed-off-by: Long Li 
---
 doc/guides/nics/features/mana.ini |  1 +
 drivers/net/mana/mana.c   | 17 +
 2 files changed, 18 insertions(+)

diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
index b92a27374c..62554b0a0a 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -4,6 +4,7 @@
 ; Refer to default.ini for the full list of available PMD features.
 ;
 [Features]
+Link status  = P
 Linux= Y
 Multiprocess aware   = Y
 Usage doc= Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 4559632056..b77d0c29b0 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -125,10 +125,27 @@ static const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev __rte_unused)
return ptypes;
 }
 
+static int mana_dev_link_update(struct rte_eth_dev *dev,
+   int wait_to_complete __rte_unused)
+{
+   struct rte_eth_link link;
+
+   /* MANA has no concept of carrier state, always reporting UP */
+   link = (struct rte_eth_link) {
+   .link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
+   .link_autoneg = RTE_ETH_LINK_SPEED_FIXED,
+   .link_speed = RTE_ETH_SPEED_NUM_200G,
+   .link_status = RTE_ETH_LINK_UP,
+   };
+
+   return rte_eth_linkstatus_set(dev, &link);
+}
+
 const struct eth_dev_ops mana_dev_ops = {
.dev_configure  = mana_dev_configure,
.dev_close  = mana_dev_close,
.dev_supported_ptypes_get = mana_supported_ptypes,
+   .link_update= mana_dev_link_update,
 };
 
 const struct eth_dev_ops mana_dev_sec_ops = {
-- 
2.17.1
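
A short sketch of reading this back from an application (the port id is a
placeholder); the PMD above always reports the link as up at a fixed speed:

#include <stdio.h>
#include <rte_ethdev.h>

static void show_link(uint16_t port_id)
{
	struct rte_eth_link link;

	if (rte_eth_link_get_nowait(port_id, &link) == 0 &&
	    link.link_status == RTE_ETH_LINK_UP)
		printf("port %u up at %u Mbps\n", port_id, link.link_speed);
}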



[Patch v6 05/18] net/mana: add function for device removal interrupts

2022-08-30 Thread longli
From: Long Li 

MANA supports PCI hot plug events. Add this interrupt to the DPDK core so
the PMD can detect device removal during Azure servicing or live migration.

Signed-off-by: Long Li 
---
 doc/guides/nics/features/mana.ini |  1 +
 drivers/net/mana/mana.c   | 97 +++
 drivers/net/mana/mana.h   |  1 +
 3 files changed, 99 insertions(+)

diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
index 62554b0a0a..8043e11f99 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -7,5 +7,6 @@
 Link status  = P
 Linux= Y
 Multiprocess aware   = Y
+Removal event= Y
 Usage doc= Y
 x86-64   = Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index b77d0c29b0..c9591035ac 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -95,12 +95,18 @@ static int mana_dev_configure(struct rte_eth_dev *dev)
return 0;
 }
 
+static int mana_intr_uninstall(struct mana_priv *priv);
+
 static int
 mana_dev_close(struct rte_eth_dev *dev)
 {
struct mana_priv *priv = dev->data->dev_private;
int ret;
 
+   ret = mana_intr_uninstall(priv);
+   if (ret)
+   return ret;
+
ret = ibv_close_device(priv->ib_ctx);
if (ret) {
ret = errno;
@@ -327,6 +333,90 @@ static int mana_ibv_device_to_pci_addr(const struct ibv_device *device,
return 0;
 }
 
+static void mana_intr_handler(void *arg)
+{
+   struct mana_priv *priv = arg;
+   struct ibv_context *ctx = priv->ib_ctx;
+   struct ibv_async_event event;
+
+   /* Read and ack all messages from IB device */
+   while (true) {
+   if (ibv_get_async_event(ctx, &event))
+   break;
+
+   if (event.event_type == IBV_EVENT_DEVICE_FATAL) {
+   struct rte_eth_dev *dev;
+
+   dev = &rte_eth_devices[priv->port_id];
+   if (dev->data->dev_conf.intr_conf.rmv)
+   rte_eth_dev_callback_process(dev,
+   RTE_ETH_EVENT_INTR_RMV, NULL);
+   }
+
+   ibv_ack_async_event(&event);
+   }
+}
+
+static int mana_intr_uninstall(struct mana_priv *priv)
+{
+   int ret;
+
+   ret = rte_intr_callback_unregister(priv->intr_handle,
+  mana_intr_handler, priv);
+   if (ret <= 0) {
+   DRV_LOG(ERR, "Failed to unregister intr callback ret %d", ret);
+   return ret;
+   }
+
+   rte_intr_instance_free(priv->intr_handle);
+
+   return 0;
+}
+
+static int mana_intr_install(struct mana_priv *priv)
+{
+   int ret, flags;
+   struct ibv_context *ctx = priv->ib_ctx;
+
+   priv->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_SHARED);
+   if (!priv->intr_handle) {
+   DRV_LOG(ERR, "Failed to allocate intr_handle");
+   rte_errno = ENOMEM;
+   return -ENOMEM;
+   }
+
+   rte_intr_fd_set(priv->intr_handle, -1);
+
+   flags = fcntl(ctx->async_fd, F_GETFL);
+   ret = fcntl(ctx->async_fd, F_SETFL, flags | O_NONBLOCK);
+   if (ret) {
+   DRV_LOG(ERR, "Failed to change async_fd to NONBLOCK");
+   goto free_intr;
+   }
+
+   rte_intr_fd_set(priv->intr_handle, ctx->async_fd);
+   rte_intr_type_set(priv->intr_handle, RTE_INTR_HANDLE_EXT);
+
+   ret = rte_intr_callback_register(priv->intr_handle,
+mana_intr_handler, priv);
+   if (ret) {
+   DRV_LOG(ERR, "Failed to register intr callback");
+   rte_intr_fd_set(priv->intr_handle, -1);
+   goto restore_fd;
+   }
+
+   return 0;
+
+restore_fd:
+   fcntl(ctx->async_fd, F_SETFL, flags);
+
+free_intr:
+   rte_intr_instance_free(priv->intr_handle);
+   priv->intr_handle = NULL;
+
+   return ret;
+}
+
 static int mana_proc_priv_init(struct rte_eth_dev *dev)
 {
struct mana_process_priv *priv;
@@ -640,6 +730,13 @@ static int mana_pci_probe_mac(struct rte_pci_driver *pci_drv __rte_unused,
name, priv->max_rx_queues, priv->max_rx_desc,
priv->max_send_sge);
 
+   /* Create async interrupt handler */
+   ret = mana_intr_install(priv);
+   if (ret) {
+   DRV_LOG(ERR, "Failed to install intr handler");
+   goto failed;
+   }
+
rte_spinlock_lock(&mana_shared_data->lock);
mana_shared_data->primary_cnt++;
rte_spinlock_unlock(&mana_shared_data->lock);
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index c433940022..f97eed2e81 100644
--- a
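
On the application side, consuming this event might look like the sketch
below: register a callback for RTE_ETH_EVENT_INTR_RMV, which
mana_intr_handler() above raises on IBV_EVENT_DEVICE_FATAL. The callback
body and port id are illustrative, and dev_conf.intr_conf.rmv must be set
when configuring the port:

#include <stdio.h>
#include <rte_ethdev.h>

static int on_port_removal(uint16_t port_id, enum rte_eth_event_type event,
			   void *cb_arg, void *ret_param)
{
	RTE_SET_USED(event);
	RTE_SET_USED(cb_arg);
	RTE_SET_USED(ret_param);
	printf("port %u removed\n", port_id);
	return 0;
}

static int register_removal_cb(uint16_t port_id)
{
	return rte_eth_dev_callback_register(port_id, RTE_ETH_EVENT_INTR_RMV,
					     on_port_removal, NULL);
}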

[Patch v6 06/18] net/mana: add device info

2022-08-30 Thread longli
From: Long Li 

Add the function to get device info.

Signed-off-by: Long Li 
---
 doc/guides/nics/features/mana.ini |  1 +
 drivers/net/mana/mana.c   | 82 +++
 2 files changed, 83 insertions(+)

diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
index 8043e11f99..566b3e8770 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -8,5 +8,6 @@ Link status  = P
 Linux= Y
 Multiprocess aware   = Y
 Removal event= Y
+Speed capabilities   = P
 Usage doc= Y
 x86-64   = Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index c9591035ac..e1550b3c08 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -116,6 +116,86 @@ mana_dev_close(struct rte_eth_dev *dev)
return 0;
 }
 
+static int mana_dev_info_get(struct rte_eth_dev *dev,
+struct rte_eth_dev_info *dev_info)
+{
+   struct mana_priv *priv = dev->data->dev_private;
+
+   dev_info->max_mtu = RTE_ETHER_MTU;
+
+   /* RX params */
+   dev_info->min_rx_bufsize = MIN_RX_BUF_SIZE;
+   dev_info->max_rx_pktlen = MAX_FRAME_SIZE;
+
+   dev_info->max_rx_queues = priv->max_rx_queues;
+   dev_info->max_tx_queues = priv->max_tx_queues;
+
+   dev_info->max_mac_addrs = BNIC_MAX_MAC_ADDR;
+   dev_info->max_hash_mac_addrs = 0;
+
+   dev_info->max_vfs = 1;
+
+   /* Offload params */
+   dev_info->rx_offload_capa = BNIC_DEV_RX_OFFLOAD_SUPPORT;
+
+   dev_info->tx_offload_capa = BNIC_DEV_TX_OFFLOAD_SUPPORT;
+
+   /* RSS */
+   dev_info->reta_size = INDIRECTION_TABLE_NUM_ELEMENTS;
+   dev_info->hash_key_size = TOEPLITZ_HASH_KEY_SIZE_IN_BYTES;
+   dev_info->flow_type_rss_offloads = BNIC_ETH_RSS_SUPPORT;
+
+   /* Thresholds */
+   dev_info->default_rxconf = (struct rte_eth_rxconf){
+   .rx_thresh = {
+   .pthresh = 8,
+   .hthresh = 8,
+   .wthresh = 0,
+   },
+   .rx_free_thresh = 32,
+   /* If no descriptors available, pkts are dropped by default */
+   .rx_drop_en = 1,
+   };
+
+   dev_info->default_txconf = (struct rte_eth_txconf){
+   .tx_thresh = {
+   .pthresh = 32,
+   .hthresh = 0,
+   .wthresh = 0,
+   },
+   .tx_rs_thresh = 32,
+   .tx_free_thresh = 32,
+   };
+
+   /* Buffer limits */
+   dev_info->rx_desc_lim.nb_min = MIN_BUFFERS_PER_QUEUE;
+   dev_info->rx_desc_lim.nb_max = priv->max_rx_desc;
+   dev_info->rx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
+   dev_info->rx_desc_lim.nb_seg_max = priv->max_recv_sge;
+   dev_info->rx_desc_lim.nb_mtu_seg_max = priv->max_recv_sge;
+
+   dev_info->tx_desc_lim.nb_min = MIN_BUFFERS_PER_QUEUE;
+   dev_info->tx_desc_lim.nb_max = priv->max_tx_desc;
+   dev_info->tx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
+   dev_info->tx_desc_lim.nb_seg_max = priv->max_send_sge;
+   dev_info->tx_desc_lim.nb_mtu_seg_max = priv->max_send_sge;
+
+   /* Speed */
+   dev_info->speed_capa = RTE_ETH_LINK_SPEED_100G;
+
+   /* RX params */
+   dev_info->default_rxportconf.burst_size = 1;
+   dev_info->default_rxportconf.ring_size = MAX_RECEIVE_BUFFERS_PER_QUEUE;
+   dev_info->default_rxportconf.nb_queues = 1;
+
+   /* TX params */
+   dev_info->default_txportconf.burst_size = 1;
+   dev_info->default_txportconf.ring_size = MAX_SEND_BUFFERS_PER_QUEUE;
+   dev_info->default_txportconf.nb_queues = 1;
+
+   return 0;
+}
+
 static const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev __rte_unused)
 {
static const uint32_t ptypes[] = {
@@ -150,11 +230,13 @@ static int mana_dev_link_update(struct rte_eth_dev *dev,
 const struct eth_dev_ops mana_dev_ops = {
.dev_configure  = mana_dev_configure,
.dev_close  = mana_dev_close,
+   .dev_infos_get  = mana_dev_info_get,
.dev_supported_ptypes_get = mana_supported_ptypes,
.link_update= mana_dev_link_update,
 };
 
 const struct eth_dev_ops mana_dev_sec_ops = {
+   .dev_infos_get = mana_dev_info_get,
 };
 
 uint16_t
-- 
2.17.1



[Patch v6 07/18] net/mana: add function to configure RSS

2022-08-30 Thread longli
From: Long Li 

Currently this PMD supports RSS configuration when the device is stopped.
Configuring RSS while the device is running will be supported in the future.

Signed-off-by: Long Li 
---
 doc/guides/nics/features/mana.ini |  1 +
 drivers/net/mana/mana.c   | 61 +++
 drivers/net/mana/mana.h   |  1 +
 3 files changed, 63 insertions(+)

diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
index 566b3e8770..a59c21cc10 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -8,6 +8,7 @@ Link status  = P
 Linux= Y
 Multiprocess aware   = Y
 Removal event= Y
+RSS hash = Y
 Speed capabilities   = P
 Usage doc= Y
 x86-64   = Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index e1550b3c08..cb136f24c1 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -211,6 +211,65 @@ static const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev __rte_unused)
return ptypes;
 }
 
+static int mana_rss_hash_update(struct rte_eth_dev *dev,
+   struct rte_eth_rss_conf *rss_conf)
+{
+   struct mana_priv *priv = dev->data->dev_private;
+
+   /* Currently can only update RSS hash when device is stopped */
+   if (dev->data->dev_started) {
+   DRV_LOG(ERR, "Can't update RSS after device has started");
+   return -ENODEV;
+   }
+
+   if (rss_conf->rss_hf & ~BNIC_ETH_RSS_SUPPORT) {
+   DRV_LOG(ERR, "Port %u invalid RSS HF 0x%" PRIx64,
+   dev->data->port_id, rss_conf->rss_hf);
+   return -EINVAL;
+   }
+
+   if (rss_conf->rss_key && rss_conf->rss_key_len) {
+   if (rss_conf->rss_key_len != TOEPLITZ_HASH_KEY_SIZE_IN_BYTES) {
+   DRV_LOG(ERR, "Port %u key len must be %u long",
+   dev->data->port_id,
+   TOEPLITZ_HASH_KEY_SIZE_IN_BYTES);
+   return -EINVAL;
+   }
+
+   priv->rss_conf.rss_key_len = rss_conf->rss_key_len;
+   priv->rss_conf.rss_key =
+   rte_zmalloc("mana_rss", rss_conf->rss_key_len,
+   RTE_CACHE_LINE_SIZE);
+   if (!priv->rss_conf.rss_key)
+   return -ENOMEM;
+   memcpy(priv->rss_conf.rss_key, rss_conf->rss_key,
+  rss_conf->rss_key_len);
+   }
+   priv->rss_conf.rss_hf = rss_conf->rss_hf;
+
+   return 0;
+}
+
+static int mana_rss_hash_conf_get(struct rte_eth_dev *dev,
+ struct rte_eth_rss_conf *rss_conf)
+{
+   struct mana_priv *priv = dev->data->dev_private;
+
+   if (!rss_conf)
+   return -EINVAL;
+
+   if (rss_conf->rss_key &&
+   rss_conf->rss_key_len >= priv->rss_conf.rss_key_len) {
+   memcpy(rss_conf->rss_key, priv->rss_conf.rss_key,
+  priv->rss_conf.rss_key_len);
+   }
+
+   rss_conf->rss_key_len = priv->rss_conf.rss_key_len;
+   rss_conf->rss_hf = priv->rss_conf.rss_hf;
+
+   return 0;
+}
+
 static int mana_dev_link_update(struct rte_eth_dev *dev,
int wait_to_complete __rte_unused)
 {
@@ -232,6 +291,8 @@ const struct eth_dev_ops mana_dev_ops = {
.dev_close  = mana_dev_close,
.dev_infos_get  = mana_dev_info_get,
.dev_supported_ptypes_get = mana_supported_ptypes,
+   .rss_hash_update= mana_rss_hash_update,
+   .rss_hash_conf_get  = mana_rss_hash_conf_get,
.link_update= mana_dev_link_update,
 };
 
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index f97eed2e81..33f68b3d1b 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -72,6 +72,7 @@ struct mana_priv {
uint8_t ind_table_key[40];
struct ibv_qp *rwq_qp;
void *db_page;
+   struct rte_eth_rss_conf rss_conf;
struct rte_intr_handle *intr_handle;
int max_rx_queues;
int max_tx_queues;
-- 
2.17.1
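
A usage sketch matching the checks in mana_rss_hash_update() above: program
a 40-byte Toeplitz key while the port is stopped. The key bytes and hash
fields here are placeholders:

#include <rte_ethdev.h>

/* first bytes shown only; a real key fills all 40 bytes */
static uint8_t toeplitz_key[40] = { 0x2c, 0xc6, 0x81, 0xd1 };

static int program_rss(uint16_t port_id)
{
	struct rte_eth_rss_conf conf = {
		.rss_key = toeplitz_key,
		.rss_key_len = sizeof(toeplitz_key),
		.rss_hf = RTE_ETH_RSS_IP | RTE_ETH_RSS_TCP | RTE_ETH_RSS_UDP,
	};

	/* must run before rte_eth_dev_start() for this PMD */
	return rte_eth_dev_rss_hash_update(port_id, &conf);
}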



[Patch v6 08/18] net/mana: add function to configure RX queues

2022-08-30 Thread longli
From: Long Li 

The RX hardware queue is allocated when the queue is started. This function
handles queue configuration before the queue is started.

Signed-off-by: Long Li 
---
 drivers/net/mana/mana.c | 68 +
 1 file changed, 68 insertions(+)

diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index cb136f24c1..d03adab041 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -196,6 +196,16 @@ static int mana_dev_info_get(struct rte_eth_dev *dev,
return 0;
 }
 
+static void mana_dev_rx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
+  struct rte_eth_rxq_info *qinfo)
+{
+   struct mana_rxq *rxq = dev->data->rx_queues[queue_id];
+
+   qinfo->mp = rxq->mp;
+   qinfo->nb_desc = rxq->num_desc;
+   qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
+}
+
 static const uint32_t *mana_supported_ptypes(struct rte_eth_dev *dev __rte_unused)
 {
static const uint32_t ptypes[] = {
@@ -270,6 +280,61 @@ static int mana_rss_hash_conf_get(struct rte_eth_dev *dev,
return 0;
 }
 
+static int mana_dev_rx_queue_setup(struct rte_eth_dev *dev,
+  uint16_t queue_idx, uint16_t nb_desc,
+  unsigned int socket_id,
+  const struct rte_eth_rxconf *rx_conf __rte_unused,
+  struct rte_mempool *mp)
+{
+   struct mana_priv *priv = dev->data->dev_private;
+   struct mana_rxq *rxq;
+   int ret;
+
+   rxq = rte_zmalloc_socket("mana_rxq", sizeof(*rxq), 0, socket_id);
+   if (!rxq) {
+   DRV_LOG(ERR, "failed to allocate rxq");
+   return -ENOMEM;
+   }
+
+   DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u",
+   queue_idx, nb_desc, socket_id);
+
+   rxq->socket = socket_id;
+
+   rxq->desc_ring = rte_zmalloc_socket("mana_rx_mbuf_ring",
+   sizeof(struct mana_rxq_desc) *
+   nb_desc,
+   RTE_CACHE_LINE_SIZE, socket_id);
+
+   if (!rxq->desc_ring) {
+   DRV_LOG(ERR, "failed to allocate rxq desc_ring");
+   ret = -ENOMEM;
+   goto fail;
+   }
+
+   rxq->num_desc = nb_desc;
+
+   rxq->priv = priv;
+   rxq->mp = mp;
+   dev->data->rx_queues[queue_idx] = rxq;
+
+   return 0;
+
+fail:
+   rte_free(rxq->desc_ring);
+   rte_free(rxq);
+   return ret;
+}
+
+static void mana_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
+{
+   struct mana_rxq *rxq = dev->data->rx_queues[qid];
+
+   rte_free(rxq->desc_ring);
+   rte_free(rxq);
+}
+
 static int mana_dev_link_update(struct rte_eth_dev *dev,
int wait_to_complete __rte_unused)
 {
@@ -290,9 +355,12 @@ const struct eth_dev_ops mana_dev_ops = {
.dev_configure  = mana_dev_configure,
.dev_close  = mana_dev_close,
.dev_infos_get  = mana_dev_info_get,
+   .rxq_info_get   = mana_dev_rx_queue_info,
.dev_supported_ptypes_get = mana_supported_ptypes,
.rss_hash_update= mana_rss_hash_update,
.rss_hash_conf_get  = mana_rss_hash_conf_get,
+   .rx_queue_setup = mana_dev_rx_queue_setup,
+   .rx_queue_release   = mana_dev_rx_queue_release,
.link_update= mana_dev_link_update,
 };
 
-- 
2.17.1



[Patch v6 09/18] net/mana: add function to configure TX queues

2022-08-30 Thread longli
From: Long Li 

The TX hardware queue is allocated when the queue is started; this function
handles configuration beforehand.

Signed-off-by: Long Li 
---
 drivers/net/mana/mana.c | 65 +
 1 file changed, 65 insertions(+)

diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index d03adab041..490686f404 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -196,6 +196,15 @@ static int mana_dev_info_get(struct rte_eth_dev *dev,
return 0;
 }
 
+static void mana_dev_tx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
+   struct rte_eth_txq_info *qinfo)
+{
+   struct mana_txq *txq = dev->data->tx_queues[queue_id];
+
+   qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads;
+   qinfo->nb_desc = txq->num_desc;
+}
+
 static void mana_dev_rx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
   struct rte_eth_rxq_info *qinfo)
 {
@@ -280,6 +289,59 @@ static int mana_rss_hash_conf_get(struct rte_eth_dev *dev,
return 0;
 }
 
+static int mana_dev_tx_queue_setup(struct rte_eth_dev *dev,
+  uint16_t queue_idx, uint16_t nb_desc,
+  unsigned int socket_id,
+  const struct rte_eth_txconf *tx_conf __rte_unused)
+
+{
+   struct mana_priv *priv = dev->data->dev_private;
+   struct mana_txq *txq;
+   int ret;
+
+   txq = rte_zmalloc_socket("mana_txq", sizeof(*txq), 0, socket_id);
+   if (!txq) {
+   DRV_LOG(ERR, "failed to allocate txq");
+   return -ENOMEM;
+   }
+
+   txq->socket = socket_id;
+
+   txq->desc_ring = rte_malloc_socket("mana_tx_desc_ring",
+  sizeof(struct mana_txq_desc) *
+   nb_desc,
+  RTE_CACHE_LINE_SIZE, socket_id);
+   if (!txq->desc_ring) {
+   DRV_LOG(ERR, "failed to allocate txq desc_ring");
+   ret = -ENOMEM;
+   goto fail;
+   }
+
+   DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u txq->desc_ring %p",
+   queue_idx, nb_desc, socket_id, txq->desc_ring);
+
+   txq->desc_ring_head = 0;
+   txq->desc_ring_tail = 0;
+   txq->priv = priv;
+   txq->num_desc = nb_desc;
+   dev->data->tx_queues[queue_idx] = txq;
+
+   return 0;
+
+fail:
+   rte_free(txq->desc_ring);
+   rte_free(txq);
+   return ret;
+}
+
+static void mana_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
+{
+   struct mana_txq *txq = dev->data->tx_queues[qid];
+
+   rte_free(txq->desc_ring);
+   rte_free(txq);
+}
+
 static int mana_dev_rx_queue_setup(struct rte_eth_dev *dev,
   uint16_t queue_idx, uint16_t nb_desc,
   unsigned int socket_id,
@@ -355,10 +417,13 @@ const struct eth_dev_ops mana_dev_ops = {
.dev_configure  = mana_dev_configure,
.dev_close  = mana_dev_close,
.dev_infos_get  = mana_dev_info_get,
+   .txq_info_get   = mana_dev_tx_queue_info,
.rxq_info_get   = mana_dev_rx_queue_info,
.dev_supported_ptypes_get = mana_supported_ptypes,
.rss_hash_update= mana_rss_hash_update,
.rss_hash_conf_get  = mana_rss_hash_conf_get,
+   .tx_queue_setup = mana_dev_tx_queue_setup,
+   .tx_queue_release   = mana_dev_tx_queue_release,
.rx_queue_setup = mana_dev_rx_queue_setup,
.rx_queue_release   = mana_dev_rx_queue_release,
.link_update= mana_dev_link_update,
-- 
2.17.1



[Patch v6 10/18] net/mana: implement memory registration

2022-08-30 Thread longli
From: Long Li 

MANA hardware has a built-in IOMMU that provides hardware-safe access to
user memory through memory registration. Since memory registration is an
expensive operation, this patch implements a two-level memory registration
cache mechanism for each queue and for each port.

Signed-off-by: Long Li 
---
Change log:
v2:
Change all header file functions to start with mana_.
Use spinlock in place of rwlock to memory cache access.
Remove unused header files.
v4:
Remove extra "\n" in logging function.

 drivers/net/mana/mana.c  |  20 +++
 drivers/net/mana/mana.h  |  39 +
 drivers/net/mana/meson.build |   1 +
 drivers/net/mana/mp.c|  85 +
 drivers/net/mana/mr.c| 324 +++
 5 files changed, 469 insertions(+)
 create mode 100644 drivers/net/mana/mr.c

diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 490686f404..d18cc4ab0e 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -103,6 +103,8 @@ mana_dev_close(struct rte_eth_dev *dev)
struct mana_priv *priv = dev->data->dev_private;
int ret;
 
+   mana_remove_all_mr(priv);
+
ret = mana_intr_uninstall(priv);
if (ret)
return ret;
@@ -317,6 +319,13 @@ static int mana_dev_tx_queue_setup(struct rte_eth_dev *dev,
goto fail;
}
 
+   ret = mana_mr_btree_init(&txq->mr_btree,
+MANA_MR_BTREE_PER_QUEUE_N, socket_id);
+   if (ret) {
+   DRV_LOG(ERR, "Failed to init TXQ MR btree");
+   goto fail;
+   }
+
DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u txq->desc_ring %p",
queue_idx, nb_desc, socket_id, txq->desc_ring);
 
@@ -338,6 +347,8 @@ static void mana_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
 {
struct mana_txq *txq = dev->data->tx_queues[qid];
 
+   mana_mr_btree_free(&txq->mr_btree);
+
rte_free(txq->desc_ring);
rte_free(txq);
 }
@@ -374,6 +385,13 @@ static int mana_dev_rx_queue_setup(struct rte_eth_dev *dev,
goto fail;
}
 
+   ret = mana_mr_btree_init(&rxq->mr_btree,
+MANA_MR_BTREE_PER_QUEUE_N, socket_id);
+   if (ret) {
+   DRV_LOG(ERR, "Failed to init RXQ MR btree");
+   goto fail;
+   }
+
rxq->num_desc = nb_desc;
 
rxq->priv = priv;
@@ -393,6 +411,8 @@ static void mana_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
 {
struct mana_rxq *rxq = dev->data->rx_queues[qid];
 
+   mana_mr_btree_free(&rxq->mr_btree);
+
rte_free(rxq->desc_ring);
rte_free(rxq);
 }
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index 33f68b3d1b..9e15b43275 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -50,6 +50,22 @@ struct mana_shared_data {
 #define MAX_RECEIVE_BUFFERS_PER_QUEUE  256
 #define MAX_SEND_BUFFERS_PER_QUEUE 256
 
+struct mana_mr_cache {
+   uint32_tlkey;
+   uintptr_t   addr;
+   size_t  len;
+   void*verb_obj;
+};
+
+#define MANA_MR_BTREE_CACHE_N  512
+struct mana_mr_btree {
+   uint16_tlen;/* Used entries */
+   uint16_tsize;   /* Total entries */
+   int overflow;
+   int socket;
+   struct mana_mr_cache *table;
+};
+
 struct mana_process_priv {
void *db_page;
 };
@@ -82,6 +98,8 @@ struct mana_priv {
int max_recv_sge;
int max_mr;
uint64_t max_mr_size;
+   struct mana_mr_btree mr_btree;
+   rte_spinlock_t  mr_btree_lock;
 };
 
 struct mana_txq_desc {
@@ -131,6 +149,7 @@ struct mana_txq {
uint32_t desc_ring_head, desc_ring_tail;
 
struct mana_stats stats;
+   struct mana_mr_btree mr_btree;
unsigned int socket;
 };
 
@@ -153,6 +172,7 @@ struct mana_rxq {
struct mana_gdma_queue gdma_cq;
 
struct mana_stats stats;
+   struct mana_mr_btree mr_btree;
 
unsigned int socket;
 };
@@ -176,6 +196,24 @@ uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 uint16_t mana_tx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
   uint16_t pkts_n);
 
+struct mana_mr_cache *mana_find_pmd_mr(struct mana_mr_btree *local_tree,
+  struct mana_priv *priv,
+  struct rte_mbuf *mbuf);
+int mana_new_pmd_mr(struct mana_mr_btree *local_tree, struct mana_priv *priv,
+   struct rte_mempool *pool);
+void mana_remove_all_mr(struct mana_priv *priv);
+void mana_del_pmd_mr(struct mana_mr_cache *mr);
+
+void mana_mempool_chunk_cb(struct rte_mempool *mp, void *opaque,
+  struct rte_mempool_memhdr *memhdr, unsigned int idx);
+
+struct mana_mr_cache *mana_mr_btree_lookup(struct mana_mr_btree *bt,
+  uint16_t *idx,
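
The hot-path pattern these declarations imply is roughly the sketch below
(illustrative only; the real lookup and fallback logic lives in mr.c, which
is elided here):

/* assumes mana.h is included; resolve the lkey for an mbuf */
static inline uint32_t example_get_lkey(struct mana_mr_btree *qtree,
					struct mana_priv *priv,
					struct rte_mbuf *mbuf)
{
	struct mana_mr_cache *mr;

	/* per-queue btree first; on a miss, mana_find_pmd_mr() is
	 * expected to fall back to the per-port btree and register
	 * new memory under the mr_btree_lock spinlock
	 */
	mr = mana_find_pmd_mr(qtree, priv, mbuf);
	if (!mr)
		return UINT32_MAX;	/* registration failed */

	return mr->lkey;	/* used as the SGE memory key */
}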

[Patch v6 11/18] net/mana: implement the hardware layer operations

2022-08-30 Thread longli
From: Long Li 

The hardware layer of MANA understands the device queue and doorbell
formats. Those functions are implemented for use by packet RX/TX code.

Signed-off-by: Long Li 
---
Change log:
v2:
Remove unused header files.
Rename a camel case.
v5:
Use RTE_BIT32() instead of defining a new BIT()
v6:
add rte_rmb() after reading owner bits

 drivers/net/mana/gdma.c  | 289 +++
 drivers/net/mana/mana.h  | 183 ++
 drivers/net/mana/meson.build |   1 +
 3 files changed, 473 insertions(+)
 create mode 100644 drivers/net/mana/gdma.c

diff --git a/drivers/net/mana/gdma.c b/drivers/net/mana/gdma.c
new file mode 100644
index 00..7ad175651e
--- /dev/null
+++ b/drivers/net/mana/gdma.c
@@ -0,0 +1,289 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+
+#include 
+#include 
+
+#include "mana.h"
+
+uint8_t *gdma_get_wqe_pointer(struct mana_gdma_queue *queue)
+{
+   uint32_t offset_in_bytes =
+   (queue->head * GDMA_WQE_ALIGNMENT_UNIT_SIZE) &
+   (queue->size - 1);
+
+   DRV_LOG(DEBUG, "txq sq_head %u sq_size %u offset_in_bytes %u",
+   queue->head, queue->size, offset_in_bytes);
+
+   if (offset_in_bytes + GDMA_WQE_ALIGNMENT_UNIT_SIZE > queue->size)
+   DRV_LOG(ERR, "fatal error: offset_in_bytes %u too big",
+   offset_in_bytes);
+
+   return ((uint8_t *)queue->buffer) + offset_in_bytes;
+}
+
+static uint32_t
+write_dma_client_oob(uint8_t *work_queue_buffer_pointer,
+const struct gdma_work_request *work_request,
+uint32_t client_oob_size)
+{
+   uint8_t *p = work_queue_buffer_pointer;
+
+   struct gdma_wqe_dma_oob *header = (struct gdma_wqe_dma_oob *)p;
+
+   memset(header, 0, sizeof(struct gdma_wqe_dma_oob));
+   header->num_sgl_entries = work_request->num_sgl_elements;
+   header->inline_client_oob_size_in_dwords =
+   client_oob_size / sizeof(uint32_t);
+   header->client_data_unit = work_request->client_data_unit;
+
+   DRV_LOG(DEBUG, "queue buf %p sgl %u oob_h %u du %u oob_buf %p oob_b %u",
+   work_queue_buffer_pointer, header->num_sgl_entries,
+   header->inline_client_oob_size_in_dwords,
+   header->client_data_unit, work_request->inline_oob_data,
+   work_request->inline_oob_size_in_bytes);
+
+   p += sizeof(struct gdma_wqe_dma_oob);
+   if (work_request->inline_oob_data &&
+   work_request->inline_oob_size_in_bytes > 0) {
+   memcpy(p, work_request->inline_oob_data,
+  work_request->inline_oob_size_in_bytes);
+   if (client_oob_size > work_request->inline_oob_size_in_bytes)
+   memset(p + work_request->inline_oob_size_in_bytes, 0,
+  client_oob_size -
+  work_request->inline_oob_size_in_bytes);
+   }
+
+   return sizeof(struct gdma_wqe_dma_oob) + client_oob_size;
+}
+
+static uint32_t
+write_scatter_gather_list(uint8_t *work_queue_head_pointer,
+ uint8_t *work_queue_end_pointer,
+ uint8_t *work_queue_cur_pointer,
+ struct gdma_work_request *work_request)
+{
+   struct gdma_sgl_element *sge_list;
+   struct gdma_sgl_element dummy_sgl[1];
+   uint8_t *address;
+   uint32_t size;
+   uint32_t num_sge;
+   uint32_t size_to_queue_end;
+   uint32_t sge_list_size;
+
+   DRV_LOG(DEBUG, "work_queue_cur_pointer %p work_request->flags %x",
+   work_queue_cur_pointer, work_request->flags);
+
+   num_sge = work_request->num_sgl_elements;
+   sge_list = work_request->sgl;
+   size_to_queue_end = (uint32_t)(work_queue_end_pointer -
+  work_queue_cur_pointer);
+
+   if (num_sge == 0) {
+   /* Per spec, the case of an empty SGL should be handled as
+* follows to avoid corrupted WQE errors:
+* Write one dummy SGL entry
+* Set the address to 1, leave the rest as 0
+*/
+   dummy_sgl[num_sge].address = 1;
+   dummy_sgl[num_sge].size = 0;
+   dummy_sgl[num_sge].memory_key = 0;
+   num_sge++;
+   sge_list = dummy_sgl;
+   }
+
+   sge_list_size = 0;
+   {
+   address = (uint8_t *)sge_list;
+   size = sizeof(struct gdma_sgl_element) * num_sge;
+   if (size_to_queue_end < size) {
+   memcpy(work_queue_cur_pointer, address,
+  size_to_queue_end);
+   work_queue_cur_pointer = work_queue_head_pointer;
+   address += size_to_queue_end;
+   size -= size_to_queue_end;
+   }
+
+   memcpy(work_queue_cur_pointer

[Patch v6 12/18] net/mana: add function to start/stop TX queues

2022-08-30 Thread longli
From: Long Li 

MANA allocates device queues through the IB layer when starting TX queues.
When the device is stopped, all the queues are unmapped and freed.

Signed-off-by: Long Li 
---
Change log:
v2:
Add prefix mana_ to all function names.
Remove unused header files.

 doc/guides/nics/features/mana.ini |   1 +
 drivers/net/mana/mana.h   |   4 +
 drivers/net/mana/meson.build  |   1 +
 drivers/net/mana/tx.c | 163 ++
 4 files changed, 169 insertions(+)
 create mode 100644 drivers/net/mana/tx.c

diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
index a59c21cc10..821443b292 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -7,6 +7,7 @@
 Link status  = P
 Linux= Y
 Multiprocess aware   = Y
+Queue start/stop = Y
 Removal event= Y
 RSS hash = Y
 Speed capabilities   = P
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index d87358ab15..3613ba7ca2 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -379,6 +379,10 @@ uint16_t mana_tx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 int gdma_poll_completion_queue(struct mana_gdma_queue *cq,
   struct gdma_comp *comp);
 
+int mana_start_tx_queues(struct rte_eth_dev *dev);
+
+int mana_stop_tx_queues(struct rte_eth_dev *dev);
+
 struct mana_mr_cache *mana_find_pmd_mr(struct mana_mr_btree *local_tree,
   struct mana_priv *priv,
   struct rte_mbuf *mbuf);
diff --git a/drivers/net/mana/meson.build b/drivers/net/mana/meson.build
index 364d57a619..031f443d16 100644
--- a/drivers/net/mana/meson.build
+++ b/drivers/net/mana/meson.build
@@ -11,6 +11,7 @@ deps += ['pci', 'bus_pci', 'net', 'eal', 'kvargs']
 
 sources += files(
'mana.c',
+   'tx.c',
'mr.c',
'gdma.c',
'mp.c',
diff --git a/drivers/net/mana/tx.c b/drivers/net/mana/tx.c
new file mode 100644
index 00..fbeea40ef2
--- /dev/null
+++ b/drivers/net/mana/tx.c
@@ -0,0 +1,163 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+
+#include 
+
+#include 
+#include 
+
+#include "mana.h"
+
+int mana_stop_tx_queues(struct rte_eth_dev *dev)
+{
+   struct mana_priv *priv = dev->data->dev_private;
+   int i, ret;
+
+   for (i = 0; i < priv->num_queues; i++) {
+   struct mana_txq *txq = dev->data->tx_queues[i];
+
+   if (txq->qp) {
+   ret = ibv_destroy_qp(txq->qp);
+   if (ret)
+   DRV_LOG(ERR, "tx_queue destroy_qp failed %d",
+   ret);
+   txq->qp = NULL;
+   }
+
+   if (txq->cq) {
+   ret = ibv_destroy_cq(txq->cq);
+   if (ret)
+   DRV_LOG(ERR, "tx_queue destroy_cq failed %d",
+   ret);
+   txq->cq = NULL;
+   }
+
+   /* Drain and free posted WQEs */
+   while (txq->desc_ring_tail != txq->desc_ring_head) {
+   struct mana_txq_desc *desc =
+   &txq->desc_ring[txq->desc_ring_tail];
+
+   rte_pktmbuf_free(desc->pkt);
+
+   txq->desc_ring_tail =
+   (txq->desc_ring_tail + 1) % txq->num_desc;
+   }
+   txq->desc_ring_head = 0;
+   txq->desc_ring_tail = 0;
+
+   memset(&txq->gdma_sq, 0, sizeof(txq->gdma_sq));
+   memset(&txq->gdma_cq, 0, sizeof(txq->gdma_cq));
+   }
+
+   return 0;
+}
+
+int mana_start_tx_queues(struct rte_eth_dev *dev)
+{
+   struct mana_priv *priv = dev->data->dev_private;
+   int ret, i;
+
+   /* start TX queues */
+   for (i = 0; i < priv->num_queues; i++) {
+   struct mana_txq *txq;
+   struct ibv_qp_init_attr qp_attr = { 0 };
+   struct manadv_obj obj = {};
+   struct manadv_qp dv_qp;
+   struct manadv_cq dv_cq;
+
+   txq = dev->data->tx_queues[i];
+
+   manadv_set_context_attr(priv->ib_ctx,
+   MANADV_CTX_ATTR_BUF_ALLOCATORS,
+   (void *)((uintptr_t)&(struct manadv_ctx_allocators){
+   .alloc = &mana_alloc_verbs_buf,
+   .free = &mana_free_verbs_buf,
+   .data = (void *)(uintptr_t)txq->socket,
+   }));
+
+   txq->cq = ibv_create_cq(priv->ib_ctx, txq->num_desc,
+   NULL, NULL, 0);
+   if (!txq->cq) {
+   DRV_LOG(ERR, "failed to create cq queue index %d", i);
+   ret = -errno;

[Patch v6 13/18] net/mana: add function to start/stop RX queues

2022-08-30 Thread longli
From: Long Li 

MANA allocates device queues through the IB layer when starting RX queues.
When the device is stopped, all the queues are unmapped and freed.

Signed-off-by: Long Li 
---
Change log:
v2:
Add prefix mana_ to all function names.
Remove unused header files.
v4:
Move definition "uint32_t i" from inside "for ()" to outside

 drivers/net/mana/mana.h  |   3 +
 drivers/net/mana/meson.build |   1 +
 drivers/net/mana/rx.c| 346 +++
 3 files changed, 350 insertions(+)
 create mode 100644 drivers/net/mana/rx.c

diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index 3613ba7ca2..dc808d363f 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -364,6 +364,7 @@ extern int mana_logtype_init;
 
 int mana_ring_doorbell(void *db_page, enum gdma_queue_types queue_type,
   uint32_t queue_id, uint32_t tail);
+int mana_rq_ring_doorbell(struct mana_rxq *rxq);
 
 int gdma_post_work_request(struct mana_gdma_queue *queue,
   struct gdma_work_request *work_req,
@@ -379,8 +380,10 @@ uint16_t mana_tx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 int gdma_poll_completion_queue(struct mana_gdma_queue *cq,
   struct gdma_comp *comp);
 
+int mana_start_rx_queues(struct rte_eth_dev *dev);
 int mana_start_tx_queues(struct rte_eth_dev *dev);
 
+int mana_stop_rx_queues(struct rte_eth_dev *dev);
 int mana_stop_tx_queues(struct rte_eth_dev *dev);
 
 struct mana_mr_cache *mana_find_pmd_mr(struct mana_mr_btree *local_tree,
diff --git a/drivers/net/mana/meson.build b/drivers/net/mana/meson.build
index 031f443d16..62e103a510 100644
--- a/drivers/net/mana/meson.build
+++ b/drivers/net/mana/meson.build
@@ -11,6 +11,7 @@ deps += ['pci', 'bus_pci', 'net', 'eal', 'kvargs']
 
 sources += files(
'mana.c',
+   'rx.c',
'tx.c',
'mr.c',
'gdma.c',
diff --git a/drivers/net/mana/rx.c b/drivers/net/mana/rx.c
new file mode 100644
index 00..41d0fc9f11
--- /dev/null
+++ b/drivers/net/mana/rx.c
@@ -0,0 +1,346 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2022 Microsoft Corporation
+ */
+#include 
+
+#include 
+#include 
+
+#include "mana.h"
+
+static uint8_t mana_rss_hash_key_default[TOEPLITZ_HASH_KEY_SIZE_IN_BYTES] = {
+   0x2c, 0xc6, 0x81, 0xd1,
+   0x5b, 0xdb, 0xf4, 0xf7,
+   0xfc, 0xa2, 0x83, 0x19,
+   0xdb, 0x1a, 0x3e, 0x94,
+   0x6b, 0x9e, 0x38, 0xd9,
+   0x2c, 0x9c, 0x03, 0xd1,
+   0xad, 0x99, 0x44, 0xa7,
+   0xd9, 0x56, 0x3d, 0x59,
+   0x06, 0x3c, 0x25, 0xf3,
+   0xfc, 0x1f, 0xdc, 0x2a,
+};
+
+int mana_rq_ring_doorbell(struct mana_rxq *rxq)
+{
+   struct mana_priv *priv = rxq->priv;
+   int ret;
+   void *db_page = priv->db_page;
+
+   if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+   struct rte_eth_dev *dev =
+   &rte_eth_devices[priv->dev_data->port_id];
+   struct mana_process_priv *process_priv = dev->process_private;
+
+   db_page = process_priv->db_page;
+   }
+
+   ret = mana_ring_doorbell(db_page, gdma_queue_receive,
+rxq->gdma_rq.id,
+rxq->gdma_rq.head *
+   GDMA_WQE_ALIGNMENT_UNIT_SIZE);
+
+   if (ret)
+   DRV_LOG(ERR, "failed to ring RX doorbell ret %d", ret);
+
+   return ret;
+}
+
+static int mana_alloc_and_post_rx_wqe(struct mana_rxq *rxq)
+{
+   struct rte_mbuf *mbuf = NULL;
+   struct gdma_sgl_element sgl[1];
+   struct gdma_work_request request = {0};
+   struct gdma_posted_wqe_info wqe_info = {0};
+   struct mana_priv *priv = rxq->priv;
+   int ret;
+   struct mana_mr_cache *mr;
+
+   mbuf = rte_pktmbuf_alloc(rxq->mp);
+   if (!mbuf) {
+   rxq->stats.nombuf++;
+   return -ENOMEM;
+   }
+
+   mr = mana_find_pmd_mr(&rxq->mr_btree, priv, mbuf);
+   if (!mr) {
+   DRV_LOG(ERR, "failed to register RX MR");
+   rte_pktmbuf_free(mbuf);
+   return -ENOMEM;
+   }
+
+   request.gdma_header.struct_size = sizeof(request);
+   wqe_info.gdma_header.struct_size = sizeof(wqe_info);
+
+   sgl[0].address = rte_cpu_to_le_64(rte_pktmbuf_mtod(mbuf, uint64_t));
+   sgl[0].memory_key = mr->lkey;
+   sgl[0].size =
+   rte_pktmbuf_data_room_size(rxq->mp) -
+   RTE_PKTMBUF_HEADROOM;
+
+   request.sgl = sgl;
+   request.num_sgl_elements = 1;
+   request.inline_oob_data = NULL;
+   request.inline_oob_size_in_bytes = 0;
+   request.flags = 0;
+   request.client_data_unit = NOT_USING_CLIENT_DATA_UNIT;
+
+   ret = gdma_post_work_request(&rxq->gdma_rq, &request, &wqe_info);
+   if (!ret) {
+   struct mana_rxq_desc *desc =
+   &rxq->desc_ring[rxq->desc_ring_head];
+
+   /* update queue for 

[Patch v6 14/18] net/mana: add function to receive packets

2022-08-30 Thread longli
From: Long Li 

With all the RX queues created, MANA can use those queues to receive
packets.

Signed-off-by: Long Li 
---
Change log:
v2:
Add mana_ to all function names.
Rename a camel case.

 doc/guides/nics/features/mana.ini |   2 +
 drivers/net/mana/mana.c   |   2 +
 drivers/net/mana/mana.h   |  37 +++
 drivers/net/mana/mp.c |   2 +
 drivers/net/mana/rx.c | 104 ++
 5 files changed, 147 insertions(+)

diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
index 821443b292..fdbf22d335 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -6,6 +6,8 @@
 [Features]
 Link status  = P
 Linux= Y
+L3 checksum offload  = Y
+L4 checksum offload  = Y
 Multiprocess aware   = Y
 Queue start/stop = Y
 Removal event= Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index d18cc4ab0e..c349822991 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -950,6 +950,8 @@ static int mana_pci_probe_mac(struct rte_pci_driver *pci_drv __rte_unused,
/* fd is not used after mapping doorbell */
close(fd);
 
+   eth_dev->rx_pkt_burst = mana_rx_burst;
+
rte_spinlock_lock(&mana_shared_data->lock);
mana_shared_data->secondary_cnt++;
mana_local_data.secondary_cnt++;
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index dc808d363f..bafc4d6082 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -178,6 +178,11 @@ struct gdma_work_request {
 
 enum mana_cqe_type {
CQE_INVALID = 0,
+
+   CQE_RX_OKAY = 1,
+   CQE_RX_COALESCED_4  = 2,
+   CQE_RX_OBJECT_FENCE = 3,
+   CQE_RX_TRUNCATED= 4,
 };
 
 struct mana_cqe_header {
@@ -203,6 +208,35 @@ struct mana_cqe_header {
(NDIS_HASH_TCP_IPV4 | NDIS_HASH_UDP_IPV4 | NDIS_HASH_TCP_IPV6 |  \
 NDIS_HASH_UDP_IPV6 | NDIS_HASH_TCP_IPV6_EX | NDIS_HASH_UDP_IPV6_EX)
 
+struct mana_rx_comp_per_packet_info {
+   uint32_t packet_length  : 16;
+   uint32_t reserved0  : 16;
+   uint32_t reserved1;
+   uint32_t packet_hash;
+}; /* HW DATA */
+#define RX_COM_OOB_NUM_PACKETINFO_SEGMENTS 4
+
+struct mana_rx_comp_oob {
+   struct mana_cqe_header cqe_hdr;
+
+   uint32_t rx_vlan_id : 12;
+   uint32_t rx_vlan_tag_present: 1;
+   uint32_t rx_outer_ip_header_checksum_succeeded  : 1;
+   uint32_t rx_outer_ip_header_checksum_failed : 1;
+   uint32_t reserved   : 1;
+   uint32_t rx_hash_type   : 9;
+   uint32_t rx_ip_header_checksum_succeeded: 1;
+   uint32_t rx_ip_header_checksum_failed   : 1;
+   uint32_t rx_tcp_checksum_succeeded  : 1;
+   uint32_t rx_tcp_checksum_failed : 1;
+   uint32_t rx_udp_checksum_succeeded  : 1;
+   uint32_t rx_udp_checksum_failed : 1;
+   uint32_t reserved1  : 1;
+   struct mana_rx_comp_per_packet_info
+   packet_info[RX_COM_OOB_NUM_PACKETINFO_SEGMENTS];
+   uint32_t received_wqe_offset;
+}; /* HW DATA */
+
 struct gdma_wqe_dma_oob {
uint32_t reserved:24;
uint32_t last_v_bytes:8;
@@ -371,6 +405,9 @@ int gdma_post_work_request(struct mana_gdma_queue *queue,
   struct gdma_posted_wqe_info *wqe_info);
 uint8_t *gdma_get_wqe_pointer(struct mana_gdma_queue *queue);
 
+uint16_t mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **rx_pkts,
+  uint16_t pkts_n);
+
 uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
   uint16_t pkts_n);
 
diff --git a/drivers/net/mana/mp.c b/drivers/net/mana/mp.c
index f4f78d2787..36a88c561a 100644
--- a/drivers/net/mana/mp.c
+++ b/drivers/net/mana/mp.c
@@ -138,6 +138,8 @@ static int mana_mp_secondary_handle(const struct rte_mp_msg *mp_msg,
case MANA_MP_REQ_START_RXTX:
DRV_LOG(INFO, "Port %u starting datapath", dev->data->port_id);
 
+   dev->rx_pkt_burst = mana_rx_burst;
+
rte_mb();
 
res->result = 0;
diff --git a/drivers/net/mana/rx.c b/drivers/net/mana/rx.c
index 41d0fc9f11..f2573a6d06 100644
--- a/drivers/net/mana/rx.c
+++ b/drivers/net/mana/rx.c
@@ -344,3 +344,107 @@ int mana_start_rx_queues(struct rte_eth_dev *dev)
mana_stop_rx_queues(dev);
return ret;
 }
+
+uint16_t mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+   uint16_t pkt_received = 0, cqe_processed = 0;
+   struct mana_rxq *rxq = dpdk_rxq;
+   struc
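
From the application, the usual polling loop applies (a generic sketch;
port and queue ids are placeholders):

#include <rte_common.h>
#include <rte_ethdev.h>
#include <rte_mbuf.h>

static void rx_poll(uint16_t port_id, uint16_t queue_id)
{
	struct rte_mbuf *pkts[32];
	uint16_t i, n;

	n = rte_eth_rx_burst(port_id, queue_id, pkts, RTE_DIM(pkts));
	for (i = 0; i < n; i++) {
		/* packet_type and ol_flags carry the ptype and checksum
		 * results translated from the CQE bits above */
		rte_pktmbuf_free(pkts[i]);
	}
}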

[Patch v6 15/18] net/mana: add function to send packets

2022-08-30 Thread longli
From: Long Li 

With all the TX queues created, MANA can send packets over those queues.

Signed-off-by: Long Li 
---
Change log:
v2:
Rename all camel cases.

 doc/guides/nics/features/mana.ini |   1 +
 drivers/net/mana/mana.c   |   1 +
 drivers/net/mana/mana.h   |  65 
 drivers/net/mana/mp.c |   1 +
 drivers/net/mana/tx.c | 241 ++
 5 files changed, 309 insertions(+)

diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
index fdbf22d335..7922816d66 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -4,6 +4,7 @@
 ; Refer to default.ini for the full list of available PMD features.
 ;
 [Features]
+Free Tx mbuf on demand = Y
 Link status  = P
 Linux= Y
 L3 checksum offload  = Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index c349822991..0dcd3f3124 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -950,6 +950,7 @@ static int mana_pci_probe_mac(struct rte_pci_driver *pci_drv __rte_unused,
/* fd is not used after mapping doorbell */
close(fd);
 
+   eth_dev->tx_pkt_burst = mana_tx_burst;
eth_dev->rx_pkt_burst = mana_rx_burst;
 
rte_spinlock_lock(&mana_shared_data->lock);
diff --git a/drivers/net/mana/mana.h b/drivers/net/mana/mana.h
index bafc4d6082..b4056bd50b 100644
--- a/drivers/net/mana/mana.h
+++ b/drivers/net/mana/mana.h
@@ -62,6 +62,47 @@ struct mana_shared_data {
 
 #define NOT_USING_CLIENT_DATA_UNIT 0
 
+enum tx_packet_format_v2 {
+   short_packet_format = 0,
+   long_packet_format = 1
+};
+
+struct transmit_short_oob_v2 {
+   enum tx_packet_format_v2 packet_format : 2;
+   uint32_t tx_is_outer_ipv4 : 1;
+   uint32_t tx_is_outer_ipv6 : 1;
+   uint32_t tx_compute_IP_header_checksum : 1;
+   uint32_t tx_compute_TCP_checksum : 1;
+   uint32_t tx_compute_UDP_checksum : 1;
+   uint32_t suppress_tx_CQE_generation : 1;
+   uint32_t VCQ_number : 24;
+   uint32_t tx_transport_header_offset : 10;
+   uint32_t VSQ_frame_num : 14;
+   uint32_t short_vport_offset : 8;
+};
+
+struct transmit_long_oob_v2 {
+   uint32_t tx_is_encapsulated_packet : 1;
+   uint32_t tx_inner_is_ipv6 : 1;
+   uint32_t tx_inner_TCP_options_present : 1;
+   uint32_t inject_vlan_prior_tag : 1;
+   uint32_t reserved1 : 12;
+   uint32_t priority_code_point : 3;
+   uint32_t drop_eligible_indicator : 1;
+   uint32_t vlan_identifier : 12;
+   uint32_t tx_inner_frame_offset : 10;
+   uint32_t tx_inner_IP_header_relative_offset : 6;
+   uint32_t long_vport_offset : 12;
+   uint32_t reserved3 : 4;
+   uint32_t reserved4 : 32;
+   uint32_t reserved5 : 32;
+};
+
+struct transmit_oob_v2 {
+   struct transmit_short_oob_v2 short_oob;
+   struct transmit_long_oob_v2 long_oob;
+};
+
 enum gdma_queue_types {
gdma_queue_type_invalid = 0,
gdma_queue_send,
@@ -183,6 +224,17 @@ enum mana_cqe_type {
CQE_RX_COALESCED_4  = 2,
CQE_RX_OBJECT_FENCE = 3,
CQE_RX_TRUNCATED= 4,
+
+   CQE_TX_OKAY = 32,
+   CQE_TX_SA_DROP  = 33,
+   CQE_TX_MTU_DROP = 34,
+   CQE_TX_INVALID_OOB  = 35,
+   CQE_TX_INVALID_ETH_TYPE = 36,
+   CQE_TX_HDR_PROCESSING_ERROR = 37,
+   CQE_TX_VF_DISABLED  = 38,
+   CQE_TX_VPORT_IDX_OUT_OF_RANGE   = 39,
+   CQE_TX_VPORT_DISABLED   = 40,
+   CQE_TX_VLAN_TAGGING_VIOLATION   = 41,
 };
 
 struct mana_cqe_header {
@@ -191,6 +243,17 @@ struct mana_cqe_header {
uint32_t vendor_err  : 24;
 }; /* HW DATA */
 
+struct mana_tx_comp_oob {
+   struct mana_cqe_header cqe_hdr;
+
+   uint32_t tx_data_offset;
+
+   uint32_t tx_sgl_offset   : 5;
+   uint32_t tx_wqe_offset   : 27;
+
+   uint32_t reserved[12];
+}; /* HW DATA */
+
 /* NDIS HASH Types */
 #define BIT(nr)(1 << (nr))
 #define NDIS_HASH_IPV4  BIT(0)
@@ -407,6 +470,8 @@ uint8_t *gdma_get_wqe_pointer(struct mana_gdma_queue 
*queue);
 
 uint16_t mana_rx_burst(void *dpdk_rxq, struct rte_mbuf **rx_pkts,
   uint16_t pkts_n);
+uint16_t mana_tx_burst(void *dpdk_txq, struct rte_mbuf **tx_pkts,
+  uint16_t pkts_n);
 
 uint16_t mana_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
   uint16_t pkts_n);
diff --git a/drivers/net/mana/mp.c b/drivers/net/mana/mp.c
index 36a88c561a..da9c0f36a1 100644
--- a/drivers/net/mana/mp.c
+++ b/drivers/net/mana/mp.c
@@ -138,6 +138,7 @@ static int mana_mp_secondary_handle(const struct rte_mp_msg *mp_msg,
case MANA_MP_REQ_START_RXTX:
DRV_LOG(
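
On the transmit side, a plain IPv4/TCP send with checksum offload reduces
to the sketch below; the short OOB flags above are what the PMD derives
from these mbuf fields. Queue id 0 and the header sizes are illustrative:

#include <rte_ethdev.h>
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_mbuf.h>

static uint16_t tx_one(uint16_t port_id, struct rte_mbuf *m)
{
	m->l2_len = RTE_ETHER_HDR_LEN;
	m->l3_len = sizeof(struct rte_ipv4_hdr);
	m->ol_flags |= RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
		       RTE_MBUF_F_TX_TCP_CKSUM;

	return rte_eth_tx_burst(port_id, 0, &m, 1);
}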

[Patch v6 16/18] net/mana: add function to start/stop device

2022-08-30 Thread longli
From: Long Li 

Add support for starting/stopping the device.

Signed-off-by: Long Li 
---
Change log:
v2:
Use spinlock for memory registration cache.
Add prefix mana_ to all function names.
v6:
Roll back device state on error in mana_dev_start()

 drivers/net/mana/mana.c | 77 +
 1 file changed, 77 insertions(+)

diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index 0dcd3f3124..eb37f359db 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -97,6 +97,81 @@ static int mana_dev_configure(struct rte_eth_dev *dev)
 
 static int mana_intr_uninstall(struct mana_priv *priv);
 
+static int
+mana_dev_start(struct rte_eth_dev *dev)
+{
+   int ret;
+   struct mana_priv *priv = dev->data->dev_private;
+
+   rte_spinlock_init(&priv->mr_btree_lock);
+   ret = mana_mr_btree_init(&priv->mr_btree, MANA_MR_BTREE_CACHE_N,
+dev->device->numa_node);
+   if (ret) {
+   DRV_LOG(ERR, "Failed to init device MR btree %d", ret);
+   return ret;
+   }
+
+   ret = mana_start_tx_queues(dev);
+   if (ret) {
+   DRV_LOG(ERR, "failed to start tx queues %d", ret);
+   goto failed_tx;
+   }
+
+   ret = mana_start_rx_queues(dev);
+   if (ret) {
+   DRV_LOG(ERR, "failed to start rx queues %d", ret);
+   goto failed_rx;
+   }
+
+   rte_wmb();
+
+   dev->tx_pkt_burst = mana_tx_burst;
+   dev->rx_pkt_burst = mana_rx_burst;
+
+   DRV_LOG(INFO, "TX/RX queues have started");
+
+   /* Enable datapath for secondary processes */
+   mana_mp_req_on_rxtx(dev, MANA_MP_REQ_START_RXTX);
+
+   return 0;
+
+failed_rx:
+   mana_stop_tx_queues(dev);
+
+failed_tx:
+   mana_mr_btree_free(&priv->mr_btree);
+
+   return ret;
+}
+
+static int
+mana_dev_stop(struct rte_eth_dev *dev)
+{
+   int ret;
+
+   dev->tx_pkt_burst = mana_tx_burst_removed;
+   dev->rx_pkt_burst = mana_rx_burst_removed;
+
+   /* Stop datapath on secondary processes */
+   mana_mp_req_on_rxtx(dev, MANA_MP_REQ_STOP_RXTX);
+
+   rte_wmb();
+
+   ret = mana_stop_tx_queues(dev);
+   if (ret) {
+   DRV_LOG(ERR, "failed to stop tx queues");
+   return ret;
+   }
+
+   ret = mana_stop_rx_queues(dev);
+   if (ret) {
+   DRV_LOG(ERR, "failed to stop rx queues");
+   return ret;
+   }
+
+   return 0;
+}
+
 static int
 mana_dev_close(struct rte_eth_dev *dev)
 {
@@ -435,6 +510,8 @@ static int mana_dev_link_update(struct rte_eth_dev *dev,
 
 const struct eth_dev_ops mana_dev_ops = {
.dev_configure  = mana_dev_configure,
+   .dev_start  = mana_dev_start,
+   .dev_stop   = mana_dev_stop,
.dev_close  = mana_dev_close,
.dev_infos_get  = mana_dev_info_get,
.txq_info_get   = mana_dev_tx_queue_info,
-- 
2.17.1



[Patch v6 17/18] net/mana: add function to report queue stats

2022-08-30 Thread longli
From: Long Li 

Report packet statistics.

Signed-off-by: Long Li 
---
Change log:
v5:
Fixed calculation of stats packets/bytes/errors by adding them over the queue 
stats.

 doc/guides/nics/features/mana.ini |  2 +
 drivers/net/mana/mana.c   | 77 +++
 2 files changed, 79 insertions(+)

diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
index 7922816d66..b2729aba3a 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -4,6 +4,7 @@
 ; Refer to default.ini for the full list of available PMD features.
 ;
 [Features]
+Basic stats  = Y
 Free Tx mbuf on demand = Y
 Link status  = P
 Linux= Y
@@ -14,5 +15,6 @@ Queue start/stop = Y
 Removal event= Y
 RSS hash = Y
 Speed capabilities   = P
+Stats per queue  = Y
 Usage doc= Y
 x86-64   = Y
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index eb37f359db..bb8ef652bf 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -508,6 +508,79 @@ static int mana_dev_link_update(struct rte_eth_dev *dev,
return rte_eth_linkstatus_set(dev, &link);
 }
 
+static int mana_dev_stats_get(struct rte_eth_dev *dev,
+ struct rte_eth_stats *stats)
+{
+   unsigned int i;
+
+   for (i = 0; i < dev->data->nb_tx_queues; i++) {
+   struct mana_txq *txq = dev->data->tx_queues[i];
+
+   if (!txq)
+   continue;
+
+   stats->opackets += txq->stats.packets;
+   stats->obytes += txq->stats.bytes;
+   stats->oerrors += txq->stats.errors;
+
+   if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
+   stats->q_opackets[i] = txq->stats.packets;
+   stats->q_obytes[i] = txq->stats.bytes;
+   }
+   }
+
+   stats->rx_nombuf = 0;
+   for (i = 0; i < dev->data->nb_rx_queues; i++) {
+   struct mana_rxq *rxq = dev->data->rx_queues[i];
+
+   if (!rxq)
+   continue;
+
+   stats->ipackets += rxq->stats.packets;
+   stats->ibytes += rxq->stats.bytes;
+   stats->ierrors += rxq->stats.errors;
+
+   /* There is no good way to get stats->imissed, so it is not set */
+
+   if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
+   stats->q_ipackets[i] = rxq->stats.packets;
+   stats->q_ibytes[i] = rxq->stats.bytes;
+   }
+
+   stats->rx_nombuf += rxq->stats.nombuf;
+   }
+
+   return 0;
+}
+
+static int
+mana_dev_stats_reset(struct rte_eth_dev *dev)
+{
+   unsigned int i;
+
+   PMD_INIT_FUNC_TRACE();
+
+   for (i = 0; i < dev->data->nb_tx_queues; i++) {
+   struct mana_txq *txq = dev->data->tx_queues[i];
+
+   if (!txq)
+   continue;
+
+   memset(&txq->stats, 0, sizeof(txq->stats));
+   }
+
+   for (i = 0; i < dev->data->nb_rx_queues; i++) {
+   struct mana_rxq *rxq = dev->data->rx_queues[i];
+
+   if (!rxq)
+   continue;
+
+   memset(&rxq->stats, 0, sizeof(rxq->stats));
+   }
+
+   return 0;
+}
+
 const struct eth_dev_ops mana_dev_ops = {
.dev_configure  = mana_dev_configure,
.dev_start  = mana_dev_start,
@@ -524,9 +597,13 @@ const struct eth_dev_ops mana_dev_ops = {
.rx_queue_setup = mana_dev_rx_queue_setup,
.rx_queue_release   = mana_dev_rx_queue_release,
.link_update= mana_dev_link_update,
+   .stats_get  = mana_dev_stats_get,
+   .stats_reset= mana_dev_stats_reset,
 };
 
 const struct eth_dev_ops mana_dev_sec_ops = {
+   .stats_get = mana_dev_stats_get,
+   .stats_reset = mana_dev_stats_reset,
.dev_infos_get = mana_dev_info_get,
 };
 
-- 
2.17.1
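
Application-side usage is the standard ethdev stats calls (a sketch; the
port id is a placeholder):

#include <inttypes.h>
#include <stdio.h>
#include <rte_ethdev.h>

static void show_stats(uint16_t port_id)
{
	struct rte_eth_stats stats;

	if (rte_eth_stats_get(port_id, &stats) == 0)
		printf("rx %" PRIu64 " pkts, %" PRIu64 " rx_nombuf\n",
		       stats.ipackets, stats.rx_nombuf);
	rte_eth_stats_reset(port_id);
}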



[Patch v6 18/18] net/mana: add function to support RX interrupts

2022-08-30 Thread longli
From: Long Li 

MANA can receive RX interrupts from the kernel through the RDMA verbs
interface. Implement RX interrupts in the driver.

Signed-off-by: Long Li 
---
Change log:
v5:
New patch added to the series

 doc/guides/nics/features/mana.ini |   1 +
 drivers/net/mana/gdma.c   |  10 +--
 drivers/net/mana/mana.c   | 125 ++
 drivers/net/mana/mana.h   |  13 +++-
 drivers/net/mana/rx.c |  91 +++---
 drivers/net/mana/tx.c |   3 +-
 6 files changed, 207 insertions(+), 36 deletions(-)

diff --git a/doc/guides/nics/features/mana.ini b/doc/guides/nics/features/mana.ini
index b2729aba3a..42d78ac6b1 100644
--- a/doc/guides/nics/features/mana.ini
+++ b/doc/guides/nics/features/mana.ini
@@ -14,6 +14,7 @@ Multiprocess aware   = Y
 Queue start/stop = Y
 Removal event= Y
 RSS hash = Y
+Rx interrupt = Y
 Speed capabilities   = P
 Stats per queue  = Y
 Usage doc= Y
diff --git a/drivers/net/mana/gdma.c b/drivers/net/mana/gdma.c
index 7ad175651e..275520bff5 100644
--- a/drivers/net/mana/gdma.c
+++ b/drivers/net/mana/gdma.c
@@ -204,7 +204,7 @@ union gdma_doorbell_entry {
 #define DOORBELL_OFFSET_EQ  0xFF8
 
 int mana_ring_doorbell(void *db_page, enum gdma_queue_types queue_type,
-  uint32_t queue_id, uint32_t tail)
+  uint32_t queue_id, uint32_t tail, uint8_t arm)
 {
uint8_t *addr = db_page;
union gdma_doorbell_entry e = {};
@@ -219,14 +219,14 @@ int mana_ring_doorbell(void *db_page, enum gdma_queue_types queue_type,
case gdma_queue_receive:
e.rq.id = queue_id;
e.rq.tail_ptr = tail;
-   e.rq.wqe_cnt = 1;
+   e.rq.wqe_cnt = arm;
addr += DOORBELL_OFFSET_RQ;
break;
 
case gdma_queue_completion:
e.cq.id = queue_id;
e.cq.tail_ptr = tail;
-   e.cq.arm = 1;
+   e.cq.arm = arm;
addr += DOORBELL_OFFSET_CQ;
break;
 
@@ -238,8 +238,8 @@ int mana_ring_doorbell(void *db_page, enum gdma_queue_types queue_type,
/* Ensure all writes are done before ringing doorbell */
rte_wmb();
 
-   DRV_LOG(DEBUG, "db_page %p addr %p queue_id %u type %u tail %u",
-   db_page, addr, queue_id, queue_type, tail);
+   DRV_LOG(DEBUG, "db_page %p addr %p queue_id %u type %u tail %u arm %u",
+   db_page, addr, queue_id, queue_type, tail, arm);
 
rte_write64(e.as_uint64, addr);
return 0;
diff --git a/drivers/net/mana/mana.c b/drivers/net/mana/mana.c
index bb8ef652bf..935811ea01 100644
--- a/drivers/net/mana/mana.c
+++ b/drivers/net/mana/mana.c
@@ -95,7 +95,68 @@ static int mana_dev_configure(struct rte_eth_dev *dev)
return 0;
 }
 
-static int mana_intr_uninstall(struct mana_priv *priv);
+static void rx_intr_vec_disable(struct mana_priv *priv)
+{
+   struct rte_intr_handle *intr_handle = priv->intr_handle;
+
+   rte_intr_free_epoll_fd(intr_handle);
+   rte_intr_vec_list_free(intr_handle);
+   rte_intr_nb_efd_set(intr_handle, 0);
+}
+
+static int rx_intr_vec_enable(struct mana_priv *priv)
+{
+   unsigned int i;
+   unsigned int rxqs_n = priv->dev_data->nb_rx_queues;
+   unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
+   struct rte_intr_handle *intr_handle = priv->intr_handle;
+   int ret;
+
+   rx_intr_vec_disable(priv);
+
+   if (rte_intr_vec_list_alloc(intr_handle, NULL, n)) {
+   DRV_LOG(ERR, "Failed to allocate memory for interrupt vector");
+   return -ENOMEM;
+   }
+
+   for (i = 0; i < n; i++) {
+   struct mana_rxq *rxq = priv->dev_data->rx_queues[i];
+
+   ret = rte_intr_vec_list_index_set(intr_handle, i,
+ RTE_INTR_VEC_RXTX_OFFSET + i);
+   if (ret) {
+   DRV_LOG(ERR, "Failed to set intr vec %u", i);
+   return ret;
+   }
+
+   ret = rte_intr_efds_index_set(intr_handle, i, rxq->channel->fd);
+   if (ret) {
+   DRV_LOG(ERR, "Failed to set FD at intr %u", i);
+   return ret;
+   }
+   }
+
+   return rte_intr_nb_efd_set(intr_handle, n);
+}
+
+static void rxq_intr_disable(struct mana_priv *priv)
+{
+   int err = rte_errno;
+
+   rx_intr_vec_disable(priv);
+   rte_errno = err;
+}
+
+static int rxq_intr_enable(struct mana_priv *priv)
+{
+   const struct rte_eth_intr_conf *const intr_conf =
+   &priv->dev_data->dev_conf.intr_conf;
+
+   if (!intr_conf->rxq)
+   return 0;
+
+   return rx_intr_vec_enable(priv);
+}
 
 static int
 mana_dev_start(struct rte_eth_dev *dev)
@@ -133,8 +194,17 @@ mana_dev_start(struct rte_eth_dev *dev)
/* Enable datapat
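
The application-side flow is roughly the sketch below (port/queue ids are
placeholders): dev_conf.intr_conf.rxq must be set before configuring the
port, then each queue is added to an epoll set, armed, and waited on:

#include <rte_epoll.h>
#include <rte_ethdev.h>

static int wait_for_rx(uint16_t port_id, uint16_t queue_id)
{
	struct rte_epoll_event ev;
	int ret;

	ret = rte_eth_dev_rx_intr_ctl_q(port_id, queue_id,
					RTE_EPOLL_PER_THREAD,
					RTE_INTR_EVENT_ADD, NULL);
	if (ret)
		return ret;

	ret = rte_eth_dev_rx_intr_enable(port_id, queue_id);	/* arm CQ */
	if (ret)
		return ret;

	/* block until the queue's interrupt fires */
	ret = rte_epoll_wait(RTE_EPOLL_PER_THREAD, &ev, 1, -1);
	if (ret > 0)
		rte_eth_dev_rx_intr_disable(port_id, queue_id);
	return ret < 0 ? ret : 0;
}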

Re: [PATCH v3 1/5] mbuf: clarify meta data needed for Outbound Inline

2022-08-30 Thread Nithin Kumar Dabilpuram

Ping. Any thoughts on this?

This is just a clarification of existing assumptions that are being made
for Inline Outbound.



On 2022-08-22 8:08 PM, Nithin Dabilpuram wrote:

Clarify the mbuf meta data needed for Outbound Inline processing.
The application needs to provide mbuf.l3_len and the L3 type in
mbuf.ol_flags so that, just as tunnel mode uses mbuf.l2_len,
transport mode can make use of l3_len and l3_type to perform
proper transport mode IPsec processing.

Signed-off-by: Nithin Dabilpuram 
---
v3:
- Addressed comments on patch 4/5 and added acks.
v2:
- Modified ipsec-secgw to do ether type update for outbound path.

  doc/guides/nics/features.rst | 2 +-
  lib/mbuf/rte_mbuf_core.h | 3 ++-
  2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/doc/guides/nics/features.rst b/doc/guides/nics/features.rst
index 7f6cb91..b4a8e98 100644
--- a/doc/guides/nics/features.rst
+++ b/doc/guides/nics/features.rst
@@ -431,7 +431,7 @@ protocol operations. See security library and PMD documentation for more details
  
  * **[uses]   rte_eth_rxconf,rte_eth_rxmode**: ``offloads:RTE_ETH_RX_OFFLOAD_SECURITY``,

  * **[uses]   rte_eth_txconf,rte_eth_txmode**: ``offloads:RTE_ETH_TX_OFFLOAD_SECURITY``.
-* **[uses]   mbuf**: ``mbuf.l2_len``.
+* **[uses]   mbuf**: ``mbuf.l2_len``, ``mbuf.l3_len``, ``mbuf.ol_flags``.
  * **[implements] rte_security_ops**: ``session_create``, ``session_update``,
``session_stats_get``, ``session_destroy``, ``set_pkt_metadata``, 
``get_userdata``,
``capabilities_get``.
diff --git a/lib/mbuf/rte_mbuf_core.h b/lib/mbuf/rte_mbuf_core.h
index 3d6ddd6..b62a7c6 100644
--- a/lib/mbuf/rte_mbuf_core.h
+++ b/lib/mbuf/rte_mbuf_core.h
@@ -267,7 +267,8 @@ extern "C" {
  /**
   * Request security offload processing on the TX packet.
   * To use Tx security offload, the user needs to fill l2_len in mbuf
- * indicating L2 header size and where L3 header starts.
+ * indicating L2 header size and where L3 header starts. Similarly,
+ * l3_len should also be filled along with ol_flags reflecting current L3 type.
   */
  #define RTE_MBUF_F_TX_SEC_OFFLOAD (1ULL << 43)
  #define PKT_TX_SEC_OFFLOAD RTE_DEPRECATED(PKT_TX_SEC_OFFLOAD) \

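To make the requirement concrete, the snippet below is a minimal sketch of
the metadata an application would set on an outbound mbuf before handing it
to a PMD with inline IPsec enabled. It assumes an IPv4 transport mode packet
whose security session has already been attached (for example via
rte_security_set_pkt_metadata()); the header lengths are examples only.

    #include <rte_mbuf.h>
    #include <rte_ether.h>
    #include <rte_ip.h>

    static void
    prepare_inline_outbound(struct rte_mbuf *m)
    {
        m->l2_len = RTE_ETHER_HDR_LEN;           /* where the L3 header starts */
        m->l3_len = sizeof(struct rte_ipv4_hdr); /* needed for transport mode */
        /* ol_flags must carry the L3 type along with the offload request. */
        m->ol_flags |= RTE_MBUF_F_TX_SEC_OFFLOAD | RTE_MBUF_F_TX_IPV4;
    }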

[PATCH v2] net/ice: support original represented_port action

2022-08-30 Thread Zhichao Zeng
Add support for sending matching traffic to the original DCF port
with the represented_port action, using the DCF port id as the ethdev_port_id.

Signed-off-by: Zhichao Zeng 

---
v2: use the port id instead of the port name for the comparison
---
 drivers/net/ice/ice_switch_filter.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ice/ice_switch_filter.c b/drivers/net/ice/ice_switch_filter.c
index da81e49bfa..240fa10750 100644
--- a/drivers/net/ice/ice_switch_filter.c
+++ b/drivers/net/ice/ice_switch_filter.c
@@ -1639,7 +1639,8 @@ ice_switch_parse_dcf_action(struct ice_dcf_adapter *ad,
rule_port_id = ad->parent.pf.dev_data->port_id;
backer_port_id = repr_dev->data->backer_port_id;
 
-   if (backer_port_id != rule_port_id)
+   if (backer_port_id != rule_port_id &&
+   act_ethdev->port_id != rule_port_id)
goto invalid;
 
rule_info->sw_act.vsi_handle = repr_dev->data->representor_id;
-- 
2.25.1
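
As a usage illustration of the relaxed check, the snippet below is a minimal
sketch that creates a transfer rule on the DCF port whose fate action targets
the DCF port itself through REPRESENTED_PORT, which the change above now
accepts. dcf_port_id and the (elided) match pattern are placeholders.

    #include <rte_flow.h>

    static struct rte_flow *
    redirect_to_dcf(uint16_t dcf_port_id, const struct rte_flow_item *pattern,
            struct rte_flow_error *error)
    {
        const struct rte_flow_attr attr = { .ingress = 1, .transfer = 1 };
        const struct rte_flow_action_ethdev port = { .port_id = dcf_port_id };
        const struct rte_flow_action actions[] = {
            { .type = RTE_FLOW_ACTION_TYPE_REPRESENTED_PORT, .conf = &port },
            { .type = RTE_FLOW_ACTION_TYPE_END },
        };

        return rte_flow_create(dcf_port_id, &attr, pattern, actions, error);
    }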

[PATCH v2 0/2] introduce NitroSketch Mode into membership library

2022-08-30 Thread Leyi Rong
This patchset introduces a brand new NitroSketch mode into the membership
library. The algorithm provides high-fidelity approximate measurements
and appears as a promising alternative to traditional approaches such as
packet sampling.

---
v2:
- attach paper link to commit log.
- fix potential memory leaks in test_member.c.
- build error fix according to CI build fail log.

Leyi Rong (2):
  member: implement NitroSketch mode
  test/member: add functional and perf tests for sketch

 app/test/test_member.c| 272 
 app/test/test_member_perf.c   | 153 ++-
 lib/member/meson.build|  38 +-
 lib/member/rte_member.c   |  75 
 lib/member/rte_member.h   | 151 ++-
 lib/member/rte_member_heap.h  | 424 ++
 lib/member/rte_member_sketch.c| 594 ++
 lib/member/rte_member_sketch.h|  97 +
 lib/member/rte_member_sketch_avx512.c |  69 +++
 lib/member/rte_member_sketch_avx512.h |  36 ++
 lib/member/rte_xxh64_avx512.h | 117 +
 lib/member/version.map|   3 +
 12 files changed, 2021 insertions(+), 8 deletions(-)
 create mode 100644 lib/member/rte_member_heap.h
 create mode 100644 lib/member/rte_member_sketch.c
 create mode 100644 lib/member/rte_member_sketch.h
 create mode 100644 lib/member/rte_member_sketch_avx512.c
 create mode 100644 lib/member/rte_member_sketch_avx512.h
 create mode 100644 lib/member/rte_xxh64_avx512.h

-- 
2.25.1



[PATCH v2 1/2] member: implement NitroSketch mode

2022-08-30 Thread Leyi Rong
Sketching algorithms provide high-fidelity approximate measurements and
appear as a promising alternative to traditional approaches such as
packet sampling.

NitroSketch [1] is a software sketching framework that optimizes
performance, provides accuracy guarantees, and supports a variety of
sketches.

This commit adds a new data structure called sketch to the
membership library. This data structure is an efficient
way to profile the traffic for heavy hitters. A min-heap
structure is also used to maintain the top-k flow keys.

[1] Zaoxing Liu, Ran Ben-Basat, Gil Einziger, Yaron Kassner, Vladimir
Braverman, Roy Friedman, Vyas Sekar, "NitroSketch: Robust and General
Sketch-based Monitoring in Software Switches", in ACM SIGCOMM 2019.
https://dl.acm.org/doi/pdf/10.1145/3341302.3342076
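
As a rough usage sketch of the new mode, the snippet below creates a
sketch-type setsummary, feeds it keys, and queries estimated counts and heavy
hitters, mirroring the calls exercised by the tests in patch 2/2. The
error_rate, sample_rate and top_k field names are assumptions for
illustration only; see rte_member.h in this series for the exact parameters.

    #include <rte_member.h>

    static void
    sketch_demo(const uint32_t *flows, uint32_t n)
    {
        struct rte_member_parameters params = {
            .name = "sketch_demo",
            .type = RTE_MEMBER_TYPE_SKETCH,
            .key_len = sizeof(uint32_t),
            .error_rate = 0.05,   /* assumed field name: accuracy bound */
            .sample_rate = 0.001, /* assumed field name: update sampling rate */
            .top_k = 10,          /* assumed field name: heavy hitters tracked */
            .socket_id = 0,
        };
        struct rte_member_setsum *ss = rte_member_create(&params);
        void *hh[10];
        uint64_t hh_count[10];
        uint64_t count;
        uint32_t i;

        if (ss == NULL)
            return;

        for (i = 0; i < n; i++)
            rte_member_add(ss, &flows[i], 1);  /* count one unit per key */

        rte_member_query_count(ss, &flows[0], &count);   /* estimated count */
        rte_member_report_heavyhitter(ss, hh, hh_count); /* top-k report */
        rte_member_free(ss);
    }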

Signed-off-by: Alan Liu 
Signed-off-by: Yipeng Wang 
Signed-off-by: Leyi Rong 
---
 lib/member/meson.build|  38 +-
 lib/member/rte_member.c   |  75 
 lib/member/rte_member.h   | 151 ++-
 lib/member/rte_member_heap.h  | 424 ++
 lib/member/rte_member_sketch.c| 594 ++
 lib/member/rte_member_sketch.h|  97 +
 lib/member/rte_member_sketch_avx512.c |  69 +++
 lib/member/rte_member_sketch_avx512.h |  36 ++
 lib/member/rte_xxh64_avx512.h | 117 +
 lib/member/version.map|   3 +
 10 files changed, 1600 insertions(+), 4 deletions(-)
 create mode 100644 lib/member/rte_member_heap.h
 create mode 100644 lib/member/rte_member_sketch.c
 create mode 100644 lib/member/rte_member_sketch.h
 create mode 100644 lib/member/rte_member_sketch_avx512.c
 create mode 100644 lib/member/rte_member_sketch_avx512.h
 create mode 100644 lib/member/rte_xxh64_avx512.h

diff --git a/lib/member/meson.build b/lib/member/meson.build
index e06fddc240..9b3418c25c 100644
--- a/lib/member/meson.build
+++ b/lib/member/meson.build
@@ -7,6 +7,42 @@ if is_windows
 subdir_done()
 endif
 
-sources = files('rte_member.c', 'rte_member_ht.c', 'rte_member_vbf.c')
+sources = files('rte_member.c', 'rte_member_ht.c', 'rte_member_vbf.c', 'rte_member_sketch.c')
 headers = files('rte_member.h')
 deps += ['hash']
+includes += include_directories('../hash', '../ring')
+
+# compile AVX512 version if:
+# we are building 64-bit binary AND binutils can generate proper code
+if dpdk_conf.has('RTE_ARCH_X86_64') and binutils_ok
+# compile AVX512 version if either:
+# a. we have AVX512 supported in minimum instruction set
+#baseline
+# b. it's not minimum instruction set, but supported by
+#compiler
+#
+# in former case, just add avx512 C file to files list
+# in latter case, compile c file to static lib, using correct
+# compiler flags, and then have the .o file from static lib
+# linked into main lib.
+sketch_avx512_cpu_support = (
+cc.get_define('__AVX512F__', args: machine_args) != ''
+)
+
+if sketch_avx512_cpu_support == true
+   cflags += ['-DCC_AVX512_SUPPORT']
+   if cc.has_multi_arguments('-mavx512f', '-mavx512dq', '-mavx512ifma')
+   cflags += ['-mavx512f', '-mavx512dq', '-mavx512ifma']
+   endif
+   sources += files('rte_member_sketch_avx512.c')
+elif cc.has_multi_arguments('-mavx512f', '-mavx512dq', '-mavx512ifma')
+   cflags += ['-DCC_AVX512_SUPPORT']
+   cflags += ['-mavx512f', '-mavx512dq', '-mavx512ifma']
+   sketch_avx512_tmp = static_library('sketch_avx512_tmp',
+   'rte_member_sketch_avx512.c',
+   include_directories: includes,
+   dependencies: static_rte_eal,
+   c_args: cflags)
+   objs += sketch_avx512_tmp.extract_objects('rte_member_sketch_avx512.c')
+endif
+endif
diff --git a/lib/member/rte_member.c b/lib/member/rte_member.c
index 7e1632e6b5..8f859f7fbd 100644
--- a/lib/member/rte_member.c
+++ b/lib/member/rte_member.c
@@ -9,10 +9,12 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "rte_member.h"
 #include "rte_member_ht.h"
 #include "rte_member_vbf.h"
+#include "rte_member_sketch.h"
 
 TAILQ_HEAD(rte_member_list, rte_tailq_entry);
 static struct rte_tailq_elem rte_member_tailq = {
@@ -72,6 +74,9 @@ rte_member_free(struct rte_member_setsum *setsum)
case RTE_MEMBER_TYPE_VBF:
rte_member_free_vbf(setsum);
break;
+   case RTE_MEMBER_TYPE_SKETCH:
+   rte_member_free_sketch(setsum);
+   break;
default:
break;
}
@@ -86,6 +91,8 @@ rte_member_create(const struct rte_member_parameters *params)
struct rte_member_list *member_list;
struct rte_member_setsum *setsum;
int ret;
+   char ring_name[RTE_RING_NAMESIZE];
+   struct rte_ring *sketch_key_ring = NULL;
 
if (params == NULL) {
rte_errno = EINVAL;
@@ -100,6 +107,16 @@ rte_member_create(const struct rte_member_parameters *params)
return NULL;
}
 
+   if (pa

[PATCH v2 2/2] test/member: add functional and perf tests for sketch

2022-08-30 Thread Leyi Rong
This patch adds functional and performance tests for sketch mode of
membership library.

Signed-off-by: Yipeng Wang 
Signed-off-by: Leyi Rong 
---
 app/test/test_member.c  | 272 
 app/test/test_member_perf.c | 153 +++-
 2 files changed, 421 insertions(+), 4 deletions(-)

diff --git a/app/test/test_member.c b/app/test/test_member.c
index 26a712439f..8266e6437b 100644
--- a/app/test/test_member.c
+++ b/app/test/test_member.c
@@ -4,6 +4,7 @@
 
 /* This test is for membership library's simple feature test */
 
+#include 
 #include "test.h"
 
 #include 
@@ -28,6 +29,7 @@ test_member(void)
 struct rte_member_setsum *setsum_ht;
 struct rte_member_setsum *setsum_cache;
 struct rte_member_setsum *setsum_vbf;
+struct rte_member_setsum *setsum_sketch;
 
 /* 5-tuple key type */
 struct flow_key {
@@ -108,6 +110,21 @@ static struct rte_member_parameters params = {
.socket_id = 0  /* NUMA Socket ID for memory. */
 };
 
+/* for sketch definitions */
+#define TOP_K 10
+#define HH_PKT_SIZE 16
+#define SKETCH_ERROR_RATE 0.05
+#define SKETCH_SAMPLE_RATE 0.001
+#define PRINT_OUT_COUNT 20
+
+#define SKETCH_LARGEST_KEY_SIZE 100
+#define SKETCH_TOTAL_KEY 500
+#define NUM_OF_KEY(key) (\
+   (unsigned int)ceil(SKETCH_LARGEST_KEY_SIZE / (key + 1)) \
+)
+
+void *heavy_hitters[TOP_K];
+
 /*
  * Sequence of operations for find existing setsummary
  *
@@ -684,6 +701,257 @@ perform_free(void)
rte_member_free(setsum_vbf);
 }
 
+static void
+print_out_sketch_results(uint64_t *count_result, member_set_t *heavy_set,
+uint32_t print_num, bool count_byte)
+{
+   uint32_t i;
+
+   for (i = 0; i < print_num; i++) {
+   if (count_byte)
+   printf("key %2u, count %8"PRIu64", real count %8u, "
+   "heavy_set %u, deviation rate [%.04f]\n",
+   i, count_result[i],
+   (unsigned int)ceil(SKETCH_LARGEST_KEY_SIZE / (i 
+ 1)) *
+   HH_PKT_SIZE,
+   heavy_set[i],
+   fabs((float)count_result[i] - 
(float)NUM_OF_KEY(i) * HH_PKT_SIZE) /
+   ((float)NUM_OF_KEY(i) * HH_PKT_SIZE));
+   else
+   printf("key %2u, count %8"PRIu64", real count %8u, "
+   "heavy_set %u, deviation rate [%.04f]\n",
+   i, count_result[i],
+   (unsigned int)ceil(SKETCH_LARGEST_KEY_SIZE / (i 
+ 1)),
+   heavy_set[i],
+   fabs((float)count_result[i] - 
(float)NUM_OF_KEY(i)) /
+   (float)NUM_OF_KEY(i));
+   }
+}
+
+static int
+sketch_test(uint32_t *keys, uint32_t total_pkt, int count_byte, int reset_test)
+{
+   uint32_t i;
+   uint64_t result_count[SKETCH_TOTAL_KEY];
+   member_set_t heavy_set[SKETCH_TOTAL_KEY];
+   uint64_t count[TOP_K];
+   int ret;
+   int hh_cnt;
+
+   setsum_sketch = rte_member_create(¶ms);
+   if (setsum_sketch == NULL) {
+   printf("Creation of setsums fail\n");
+   return -1;
+   }
+
+   for (i = 0; i < total_pkt; i++) {
+   if (count_byte)
+   ret = rte_member_add_byte_count(setsum_sketch, &keys[i], HH_PKT_SIZE);
+   else
+   ret = rte_member_add(setsum_sketch, &keys[i], 1);
+
+   if (ret < 0) {
+   printf("rte_member_add Failed! Error [%d]\n", ret);
+   rte_member_free(setsum_sketch);
+
+   return -1;
+   }
+   }
+
+   for (i = 0; i < SKETCH_TOTAL_KEY; i++) {
+   uint32_t tmp_key = i;
+
+   rte_member_query_count(setsum_sketch, (void *)&tmp_key, &result_count[i]);
+   rte_member_lookup(setsum_sketch, (void *)&tmp_key, &heavy_set[i]);
+   }
+
+   print_out_sketch_results(result_count, heavy_set, PRINT_OUT_COUNT, count_byte);
+
+   hh_cnt = rte_member_report_heavyhitter(setsum_sketch, heavy_hitters, count);
+   if (hh_cnt < 0) {
+   printf("sketch report heavy hitter error!");
+   rte_member_free(setsum_sketch);
+
+   return -1;
+   }
+
+   printf("Report heavy hitters:");
+   for (i = 0; i < (unsigned int)hh_cnt; i++) {
+   printf("%u: %"PRIu64"\t",
+   *((uint32_t *)heavy_hitters[i]), count[i]);
+   }
+   printf("\n");
+
+   if (reset_test) {
+   printf("\nEntering Sketch Reset Test Process!\n");
+   rte_member_reset(setsum_sketch);
+
+   /* after reset, check some key's count */
+   for (i = 0; i < SKETCH_TOTAL_KEY; i++) {
+   uint32_t tmp_key = i;
+
+ 

[PATCH] examples/ptpclient: add signal handler for cleanup

2022-08-30 Thread Rahul Bhansali
This adds a signal handler for SIGINT and SIGTERM.
The application now breaks out of the infinite loop and performs cleanup
once it receives either of the registered signals.

Signed-off-by: Rahul Bhansali 
---
 examples/ptpclient/ptpclient.c | 32 ++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/examples/ptpclient/ptpclient.c b/examples/ptpclient/ptpclient.c
index 1f1c9c9c52..8b69716be1 100644
--- a/examples/ptpclient/ptpclient.c
+++ b/examples/ptpclient/ptpclient.c
@@ -19,6 +19,9 @@
 #include 
 #include 
 #include 
+#include 
+
+static volatile bool force_quit;
 
 #define RX_RING_SIZE 1024
 #define TX_RING_SIZE 1024
@@ -609,7 +612,7 @@ parse_ptp_frames(uint16_t portid, struct rte_mbuf *m) {
  * The lcore main. This is the main thread that does the work, reading from an
  * input port and writing to an output port.
  */
-static __rte_noreturn void
+static void
 lcore_main(void)
 {
uint16_t portid;
@@ -621,7 +624,7 @@ lcore_main(void)
 
/* Run until the application is quit or killed. */
 
-   while (1) {
+   while (!force_quit) {
/* Read packet from RX queues. 8< */
for (portid = 0; portid < ptp_enabled_port_nb; portid++) {
 
@@ -734,6 +737,13 @@ ptp_parse_args(int argc, char **argv)
return 0;
 }
 
+static void
+signal_handler(int signum)
+{
+   if (signum == SIGINT || signum == SIGTERM)
+   force_quit = true;
+}
+
 /*
  * The main function, which does initialization and calls the per-lcore
  * functions.
@@ -758,6 +768,10 @@ main(int argc, char *argv[])
argc -= ret;
argv += ret;
 
+   force_quit = false;
+   signal(SIGINT, signal_handler);
+   signal(SIGTERM, signal_handler);
+
ret = ptp_parse_args(argc, argv);
if (ret < 0)
rte_exit(EXIT_FAILURE, "Error with PTP initialization\n");
@@ -802,6 +816,20 @@ main(int argc, char *argv[])
/* Call lcore_main on the main core only. */
lcore_main();
 
+   RTE_ETH_FOREACH_DEV(portid) {
+   if ((ptp_enabled_port_mask & (1 << portid)) == 0)
+   continue;
+
+   /* Disable timesync timestamping for the Ethernet device */
+   rte_eth_timesync_disable(portid);
+
+   ret = rte_eth_dev_stop(portid);
+   if (ret != 0)
+   printf("rte_eth_dev_stop: err=%d, port=%d\n", ret, 
portid);
+
+   rte_eth_dev_close(portid);
+   }
+
/* clean up the EAL */
rte_eal_cleanup();
 
-- 
2.25.1