> 
> On 2/4/2022 12:54 PM, Ciara Loftus wrote:
> > Secondary process support had been disabled for the AF_XDP PMD
> > because there was no logic in place to share the AF_XDP socket
> > file descriptors between the processes. This commit introduces
> > this logic using the IPC APIs.
> >
> > Since AF_XDP rings are single-producer single-consumer, rx/tx
> > in the secondary process is disabled. However other operations
> > including retrieval of stats are permitted.
> >
> > Signed-off-by: Ciara Loftus <ciara.lof...@intel.com>
> >
> > ---
> > v1 -> v2:
> > * Rebase to next-net
> >
> > RFC -> v1:
> > * Added newline to af_xdp.rst
> > * Fixed spelling errors
> > * Fixed potential NULL dereference in init_internals
> > * Fixed potential free of address-of expression in afxdp_mp_request_fds
> > ---
> >   doc/guides/nics/af_xdp.rst             |   9 ++
> >   doc/guides/nics/features/af_xdp.ini    |   1 +
> >   doc/guides/rel_notes/release_22_03.rst |   1 +
> >   drivers/net/af_xdp/rte_eth_af_xdp.c    | 210
> +++++++++++++++++++++++--
> >   4 files changed, 207 insertions(+), 14 deletions(-)
> >
> > diff --git a/doc/guides/nics/af_xdp.rst b/doc/guides/nics/af_xdp.rst
> > index db02ea1984..eb4eab28a8 100644
> > --- a/doc/guides/nics/af_xdp.rst
> > +++ b/doc/guides/nics/af_xdp.rst
> > @@ -141,4 +141,13 @@ Limitations
> >     NAPI context from a watchdog timer instead of from softirqs. More
> information
> >     on this feature can be found at [1].
> >
> > +- **Secondary Processes**
> > +
> > +  Rx and Tx are not supported for secondary processes due to the single-
> producer
> > +  single-consumer nature of the AF_XDP rings. However other operations
> including
> > +  statistics retrieval are permitted.
> 
> Hi Ciara,
> 
> Isn't this limitation same for all PMDs, like not both primary & secondary can
> Rx/Tx
> from same queue at the same time.
> But primary can initiallize the PMD and secondary can do the datapath,
> or isn't af_xdp supports multiple queue, if so some queues can be used by
> primary and some by secondary for datapath.
> 
> Is there anyhing special for af_xdp that prevents it?

Hi Ferruh,

Thanks for the review.
Each queue of the PMD corresponds to a new AF_XDP socket.
Each socket has an RX and TX ring that is mmapped from the kernel to userspace 
and this mapping is only valid for the primary process.
I did not figure out a way to share that mapping with the secondary process 
successfully. Can you think of anything that might work?

> 
> > +  The maximum number of queues permitted for PMDs operating in this
> model is 8
> > +  as this is the maximum number of fds that can be sent through the IPC
> APIs as
> > +  defined by RTE_MP_MAX_FD_NUM.
> > +
> >     [1] https://lwn.net/Articles/837010/
> > diff --git a/doc/guides/nics/features/af_xdp.ini
> b/doc/guides/nics/features/af_xdp.ini
> > index 54b738e616..8e7e075aaf 100644
> > --- a/doc/guides/nics/features/af_xdp.ini
> > +++ b/doc/guides/nics/features/af_xdp.ini
> > @@ -9,4 +9,5 @@ Power mgmt address monitor = Y
> >   MTU update           = Y
> >   Promiscuous mode     = Y
> >   Stats per queue      = Y
> > +Multiprocess aware   = Y
> >   x86-64               = Y
> > diff --git a/doc/guides/rel_notes/release_22_03.rst
> b/doc/guides/rel_notes/release_22_03.rst
> > index bf2e3f78a9..dfd2cbbccf 100644
> > --- a/doc/guides/rel_notes/release_22_03.rst
> > +++ b/doc/guides/rel_notes/release_22_03.rst
> > @@ -58,6 +58,7 @@ New Features
> >   * **Updated AF_XDP PMD**
> >
> >     * Added support for libxdp >=v1.2.2.
> > +  * Re-enabled secondary process support. RX/TX is not supported.
> >
> >   * **Updated Cisco enic driver.**
> >
> > diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c
> b/drivers/net/af_xdp/rte_eth_af_xdp.c
> > index 1b6192fa44..407f6d8dbe 100644
> > --- a/drivers/net/af_xdp/rte_eth_af_xdp.c
> > +++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
> > @@ -80,6 +80,18 @@ RTE_LOG_REGISTER_DEFAULT(af_xdp_logtype,
> NOTICE);
> >
> >   #define ETH_AF_XDP_ETH_OVERHEAD
>       (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN)
> >
> > +#define ETH_AF_XDP_MP_KEY "afxdp_mp_send_fds"
> > +
> > +static int afxdp_dev_count;
> > +
> > +/* Message header to synchronize fds via IPC */
> > +struct ipc_hdr {
> > +   char port_name[RTE_DEV_NAME_MAX_LEN];
> > +   /* The file descriptors are in the dedicated part
> > +    * of the Unix message to be translated by the kernel.
> > +    */
> > +};
> > +
> >   struct xsk_umem_info {
> >     struct xsk_umem *umem;
> >     struct rte_ring *buf_ring;
> > @@ -147,6 +159,10 @@ struct pmd_internals {
> >     struct pkt_tx_queue *tx_queues;
> >   };
> >
> > +struct pmd_process_private {
> > +   int rxq_xsk_fds[RTE_MAX_QUEUES_PER_PORT];
> > +};
> > +
> >   #define ETH_AF_XDP_IFACE_ARG                      "iface"
> >   #define ETH_AF_XDP_START_QUEUE_ARG                "start_queue"
> >   #define ETH_AF_XDP_QUEUE_COUNT_ARG
>       "queue_count"
> > @@ -795,11 +811,12 @@ static int
> >   eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
> >   {
> >     struct pmd_internals *internals = dev->data->dev_private;
> > +   struct pmd_process_private *process_private = dev-
> >process_private;
> >     struct xdp_statistics xdp_stats;
> >     struct pkt_rx_queue *rxq;
> >     struct pkt_tx_queue *txq;
> >     socklen_t optlen;
> > -   int i, ret;
> > +   int i, ret, fd;
> >
> >     for (i = 0; i < dev->data->nb_rx_queues; i++) {
> >             optlen = sizeof(struct xdp_statistics);
> > @@ -815,8 +832,9 @@ eth_stats_get(struct rte_eth_dev *dev, struct
> rte_eth_stats *stats)
> >             stats->ibytes += stats->q_ibytes[i];
> >             stats->imissed += rxq->stats.rx_dropped;
> >             stats->oerrors += txq->stats.tx_dropped;
> > -           ret = getsockopt(xsk_socket__fd(rxq->xsk), SOL_XDP,
> > -                           XDP_STATISTICS, &xdp_stats, &optlen);
> > +           fd = process_private->rxq_xsk_fds[i];
> > +           ret = fd >= 0 ? getsockopt(fd, SOL_XDP, XDP_STATISTICS,
> > +                                      &xdp_stats, &optlen) : -1;
> >             if (ret != 0) {
> >                     AF_XDP_LOG(ERR, "getsockopt() failed for
> XDP_STATISTICS.\n");
> >                     return -1;
> > @@ -883,8 +901,10 @@ eth_dev_close(struct rte_eth_dev *dev)
> >     struct pkt_rx_queue *rxq;
> >     int i;
> >
> > -   if (rte_eal_process_type() != RTE_PROC_PRIMARY)
> > +   if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
> > +           rte_free(dev->process_private);
> >             return 0;
> > +   }
> >
> >     AF_XDP_LOG(INFO, "Closing AF_XDP ethdev on numa socket %u\n",
> >             rte_socket_id());
> > @@ -1349,6 +1369,7 @@ eth_rx_queue_setup(struct rte_eth_dev *dev,
> >                struct rte_mempool *mb_pool)
> >   {
> >     struct pmd_internals *internals = dev->data->dev_private;
> > +   struct pmd_process_private *process_private = dev-
> >process_private;
> >     struct pkt_rx_queue *rxq;
> >     int ret;
> >
> > @@ -1387,6 +1408,8 @@ eth_rx_queue_setup(struct rte_eth_dev *dev,
> >     rxq->fds[0].fd = xsk_socket__fd(rxq->xsk);
> >     rxq->fds[0].events = POLLIN;
> >
> > +   process_private->rxq_xsk_fds[rx_queue_id] = rxq->fds[0].fd;
> > +
> >     dev->data->rx_queues[rx_queue_id] = rxq;
> >     return 0;
> >
> > @@ -1688,6 +1711,7 @@ init_internals(struct rte_vdev_device *dev, const
> char *if_name,
> >   {
> >     const char *name = rte_vdev_device_name(dev);
> >     const unsigned int numa_node = dev->device.numa_node;
> > +   struct pmd_process_private *process_private;
> >     struct pmd_internals *internals;
> >     struct rte_eth_dev *eth_dev;
> >     int ret;
> > @@ -1753,9 +1777,17 @@ init_internals(struct rte_vdev_device *dev,
> const char *if_name,
> >     if (ret)
> >             goto err_free_tx;
> >
> > +   process_private = (struct pmd_process_private *)
> > +           rte_zmalloc_socket(name, sizeof(struct
> pmd_process_private),
> > +                              RTE_CACHE_LINE_SIZE, numa_node);
> > +   if (process_private == NULL) {
> > +           AF_XDP_LOG(ERR, "Failed to alloc memory for process
> private\n");
> > +           goto err_free_tx;
> > +   }
> 
> Need to free 'process_private' in the PMD, in 'close()' and 'remove()' paths.

+1

> 
> > +
> >     eth_dev = rte_eth_vdev_allocate(dev, 0);
> >     if (eth_dev == NULL)
> > -           goto err_free_tx;
> > +           goto err_free_pp;
> >
> >     eth_dev->data->dev_private = internals;
> >     eth_dev->data->dev_link = pmd_link;
> > @@ -1764,6 +1796,10 @@ init_internals(struct rte_vdev_device *dev,
> const char *if_name,
> >     eth_dev->dev_ops = &ops;
> >     eth_dev->rx_pkt_burst = eth_af_xdp_rx;
> >     eth_dev->tx_pkt_burst = eth_af_xdp_tx;
> > +   eth_dev->process_private = process_private;
> > +
> > +   for (i = 0; i < queue_cnt; i++)
> > +           process_private->rxq_xsk_fds[i] = -1;
> >
> >   #if defined(XDP_UMEM_UNALIGNED_CHUNK_FLAG)
> >     AF_XDP_LOG(INFO, "Zero copy between umem and mbuf
> enabled.\n");
> > @@ -1771,6 +1807,8 @@ init_internals(struct rte_vdev_device *dev, const
> char *if_name,
> >
> >     return eth_dev;
> >
> > +err_free_pp:
> > +   rte_free(process_private);
> >   err_free_tx:
> >     rte_free(internals->tx_queues);
> >   err_free_rx:
> > @@ -1780,6 +1818,115 @@ init_internals(struct rte_vdev_device *dev,
> const char *if_name,
> >     return NULL;
> >   }
> >
> > +/* Secondary process requests rxq fds from primary. */
> > +static int
> > +afxdp_mp_request_fds(const char *name, struct rte_eth_dev *dev)
> > +{
> > +   struct pmd_process_private *process_private = dev-
> >process_private;
> > +   struct timespec timeout = {.tv_sec = 1, .tv_nsec = 0};
> > +   struct rte_mp_msg request, *reply;
> > +   struct rte_mp_reply replies;
> > +   struct ipc_hdr *request_param = (struct ipc_hdr *)request.param;
> > +   int i, ret;
> > +
> > +   /* Prepare the request */
> > +   memset(&request, 0, sizeof(request));
> > +   strlcpy(request.name, ETH_AF_XDP_MP_KEY,
> sizeof(request.name));
> > +   strlcpy(request_param->port_name, name,
> > +           sizeof(request_param->port_name));
> > +   request.len_param = sizeof(*request_param);
> > +
> > +   /* Send the request and receive the reply */
> > +   AF_XDP_LOG(DEBUG, "Sending IPC request for %s\n", name);
> > +   ret = rte_mp_request_sync(&request, &replies, &timeout);
> > +   if (ret < 0 || replies.nb_received != 1) {
> > +           AF_XDP_LOG(ERR, "Failed to request fds from primary: %d",
> > +                      rte_errno);
> > +           return -1;
> > +   }
> > +   reply = replies.msgs;
> > +   AF_XDP_LOG(DEBUG, "Received IPC reply for %s\n", name);
> 
> I think message can mention "multi-process IPC" for clarification.

+1

> 
> > +   if (dev->data->nb_rx_queues != reply->num_fds) {
> > +           AF_XDP_LOG(ERR, "Incorrect number of fds received: %d !=
> %d\n",
> > +                      reply->num_fds, dev->data->nb_rx_queues);
> > +           return -EINVAL;
> > +   }
> > +
> > +   for (i = 0; i < reply->num_fds; i++)
> > +           process_private->rxq_xsk_fds[i] = reply->fds[i];
> > +
> > +   free(reply);
> > +   return 0;
> > +}
> > +
> > +/* Primary process sends rxq fds to secondary. */
> > +static int
> > +afxdp_mp_send_fds(const struct rte_mp_msg *request, const void
> *peer)
> > +{
> > +   struct rte_eth_dev *dev;
> > +   struct pmd_process_private *process_private;
> > +   struct rte_mp_msg reply;
> > +   const struct ipc_hdr *request_param =
> > +           (const struct ipc_hdr *)request->param;
> > +   struct ipc_hdr *reply_param =
> > +           (struct ipc_hdr *)reply.param;
> > +   const char *request_name = request_param->port_name;
> > +   uint16_t port_id;
> > +   int i, ret;
> > +
> > +   AF_XDP_LOG(DEBUG, "Received IPC request for %s\n",
> request_name);
> > +
> > +   /* Find the requested port */
> > +   ret = rte_eth_dev_get_port_by_name(request_name, &port_id);
> > +   if (ret) {
> > +           AF_XDP_LOG(ERR, "Failed to get port id for %s\n",
> request_name);
> > +           return -1;
> > +   }
> > +   dev = &rte_eth_devices[port_id];
> 
> Better to not access the global array, there is a new API and a cleanup
> already done [1] in other PMDs, can you please apply the same here.
> 
> [1]
> https://patches.dpdk.org/project/dpdk/patch/20220203082412.79028-1-
> kumaraparames...@gmail.com/

Thanks for sharing, I wasn't aware of this API. I'll add it in the v3.

> 
> > +   process_private = dev->process_private;
> > +
> > +   /* Populate the reply with the xsk fd for each queue */
> > +   reply.num_fds = 0;
> > +   if (dev->data->nb_rx_queues > RTE_MP_MAX_FD_NUM) {
> > +           AF_XDP_LOG(ERR, "Number of rx queues (%d) exceeds max
> number of fds (%d)\n",
> > +                      dev->data->nb_rx_queues,
> RTE_MP_MAX_FD_NUM);
> > +           return -EINVAL;
> > +   }
> > +
> > +   for (i = 0; i < dev->data->nb_rx_queues; i++)
> > +           reply.fds[reply.num_fds++] = process_private-
> >rxq_xsk_fds[i];
> > +
> > +   /* Send the reply */
> > +   strlcpy(reply.name, request->name, sizeof(reply.name));
> > +   strlcpy(reply_param->port_name, request_name,
> > +           sizeof(reply_param->port_name));
> > +   reply.len_param = sizeof(*reply_param);
> > +   AF_XDP_LOG(DEBUG, "Sending IPC reply for %s\n", reply_param-
> >port_name);
> > +   if (rte_mp_reply(&reply, peer) < 0) {
> > +           AF_XDP_LOG(ERR, "Failed to reply to IPC request\n");
> > +           return -1;
> > +   }
> > +   return 0;
> > +}
> > +
> > +/* Secondary process rx function. RX is disabled because rings are SPSC */
> > +static uint16_t
> > +eth_af_xdp_rx_noop(void *queue __rte_unused,
> > +           struct rte_mbuf **bufs __rte_unused,
> > +           uint16_t nb_pkts __rte_unused)
> > +{
> > +   return 0;
> > +}
> > +
> > +/* Secondary process tx function. TX is disabled because rings are SPSC */
> > +static uint16_t
> > +eth_af_xdp_tx_noop(void *queue __rte_unused,
> > +                   struct rte_mbuf **bufs __rte_unused,
> > +                   uint16_t nb_pkts __rte_unused)
> > +{
> > +   return 0;
> > +}
> > +
> 
> Now there are multiple PMDs using noop/dummy Rx/Tx burst functions,
> what do you think to add static inline functions in the 'ethdev_driver.h'
> (and clean existing drivers to use it) with a separate patch, later in this
> patch use those functions?

+1

> 
> 
> >   static int
> >   rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
> >   {
> > @@ -1789,19 +1936,39 @@ rte_pmd_af_xdp_probe(struct
> rte_vdev_device *dev)
> >     int xsk_queue_cnt = ETH_AF_XDP_DFLT_QUEUE_COUNT;
> >     int shared_umem = 0;
> >     char prog_path[PATH_MAX] = {'\0'};
> > -   int busy_budget = -1;
> > +   int busy_budget = -1, ret;
> >     struct rte_eth_dev *eth_dev = NULL;
> > -   const char *name;
> > +   const char *name = rte_vdev_device_name(dev);
> >
> > -   AF_XDP_LOG(INFO, "Initializing pmd_af_xdp for %s\n",
> > -           rte_vdev_device_name(dev));
> > +   AF_XDP_LOG(INFO, "Initializing pmd_af_xdp for %s\n", name);
> >
> > -   name = rte_vdev_device_name(dev);
> >     if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
> > -           AF_XDP_LOG(ERR, "Failed to probe %s. "
> > -                           "AF_XDP PMD does not support secondary
> processes.\n",
> > -                           name);
> > -           return -ENOTSUP;
> > +           eth_dev = rte_eth_dev_attach_secondary(name);
> > +           if (eth_dev == NULL) {
> > +                   AF_XDP_LOG(ERR, "Failed to probe %s\n", name);
> > +                   return -EINVAL;
> > +           }
> > +           eth_dev->dev_ops = &ops;
> > +           eth_dev->device = &dev->device;
> > +           eth_dev->rx_pkt_burst = eth_af_xdp_rx_noop;
> > +           eth_dev->tx_pkt_burst = eth_af_xdp_tx_noop;
> > +           eth_dev->process_private = (struct pmd_process_private *)
> > +                   rte_zmalloc_socket(name,
> > +                                      sizeof(struct
> pmd_process_private),
> > +                                      RTE_CACHE_LINE_SIZE,
> > +                                      eth_dev->device->numa_node);
> > +           if (eth_dev->process_private == NULL) {
> > +                   AF_XDP_LOG(ERR,
> > +                           "Failed to alloc memory for process
> private\n");
> > +                   return -ENOMEM;
> > +           }
> > +
> > +           /* Obtain the xsk fds from the primary process. */
> > +           if (afxdp_mp_request_fds(name, eth_dev))
> > +                   return -1;
> > +
> > +           rte_eth_dev_probing_finish(eth_dev);
> > +           return 0;
> >     }
> >
> >     kvlist = rte_kvargs_parse(rte_vdev_device_args(dev),
> valid_arguments);
> > @@ -1836,6 +2003,17 @@ rte_pmd_af_xdp_probe(struct rte_vdev_device
> *dev)
> >             return -1;
> >     }
> >
> > +   /* Register IPC callback which shares xsk fds from primary to
> secondary */
> > +   if (!afxdp_dev_count) {
> > +           ret = rte_mp_action_register(ETH_AF_XDP_MP_KEY,
> afxdp_mp_send_fds);
> > +           if (ret < 0) {
> > +                   AF_XDP_LOG(ERR, "%s: Failed to register IPC
> callback: %s",
> > +                              name, strerror(rte_errno));
> > +                   return -1;
> > +           }
> > +   }
> > +   afxdp_dev_count++;
> > +
> >     rte_eth_dev_probing_finish(eth_dev);
> >
> >     return 0;
> > @@ -1858,6 +2036,10 @@ rte_pmd_af_xdp_remove(struct
> rte_vdev_device *dev)
> >             return 0;
> >
> >     eth_dev_close(eth_dev);
> > +   rte_free(eth_dev->process_private);
> > +   if (afxdp_dev_count == 1)
> > +           rte_mp_action_unregister(ETH_AF_XDP_MP_KEY);
> > +   afxdp_dev_count--;
> >     rte_eth_dev_release_port(eth_dev);
> >
> >

Reply via email to