This is the vanilla version.
Packet data is copied between the AF_XDP memory buffer and the mbuf mempool.
Indexes of the memory buffers are simply managed by a FIFO ring.

Signed-off-by: Qi Zhang <qi.z.zh...@intel.com>
---
 config/common_base                            |   5 +
 config/common_linuxapp                        |   1 +
 drivers/net/Makefile                          |   1 +
 drivers/net/af_xdp/Makefile                   |  56 ++
 drivers/net/af_xdp/meson.build                |   7 +
 drivers/net/af_xdp/rte_eth_af_xdp.c           | 763 ++++++++++++++++++++++++++
 drivers/net/af_xdp/rte_pmd_af_xdp_version.map |   4 +
 drivers/net/af_xdp/xdpsock_queue.h            |  62 +++
 mk/rte.app.mk                                 |   1 +
 9 files changed, 900 insertions(+)
 create mode 100644 drivers/net/af_xdp/Makefile
 create mode 100644 drivers/net/af_xdp/meson.build
 create mode 100644 drivers/net/af_xdp/rte_eth_af_xdp.c
 create mode 100644 drivers/net/af_xdp/rte_pmd_af_xdp_version.map
 create mode 100644 drivers/net/af_xdp/xdpsock_queue.h

diff --git a/config/common_base b/config/common_base
index ad03cf433..84b7b3b7e 100644
--- a/config/common_base
+++ b/config/common_base
@@ -368,6 +368,11 @@ CONFIG_RTE_LIBRTE_VMXNET3_DEBUG_TX_FREE=n
 CONFIG_RTE_LIBRTE_PMD_AF_PACKET=n
 
 #
+# Compile software PMD backed by AF_XDP sockets (Linux only)
+#
+CONFIG_RTE_LIBRTE_PMD_AF_XDP=n
+
+#
 # Compile link bonding PMD library
 #
 CONFIG_RTE_LIBRTE_PMD_BOND=y
diff --git a/config/common_linuxapp b/config/common_linuxapp
index ff98f2355..3b10695b6 100644
--- a/config/common_linuxapp
+++ b/config/common_linuxapp
@@ -16,6 +16,7 @@ CONFIG_RTE_LIBRTE_VHOST=y
 CONFIG_RTE_LIBRTE_VHOST_NUMA=y
 CONFIG_RTE_LIBRTE_PMD_VHOST=y
 CONFIG_RTE_LIBRTE_PMD_AF_PACKET=y
+CONFIG_RTE_LIBRTE_PMD_AF_XDP=y
 CONFIG_RTE_LIBRTE_PMD_TAP=y
 CONFIG_RTE_LIBRTE_AVP_PMD=y
 CONFIG_RTE_LIBRTE_VDEV_NETVSC_PMD=y
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index e1127326b..409234ac3 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -9,6 +9,7 @@ ifeq ($(CONFIG_RTE_LIBRTE_THUNDERX_NICVF_PMD),d)
 endif
 
 DIRS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET) += af_packet
+DIRS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP) += af_xdp
 DIRS-$(CONFIG_RTE_LIBRTE_ARK_PMD) += ark
 DIRS-$(CONFIG_RTE_LIBRTE_AVF_PMD) += avf
 DIRS-$(CONFIG_RTE_LIBRTE_AVP_PMD) += avp
diff --git a/drivers/net/af_xdp/Makefile b/drivers/net/af_xdp/Makefile
new file mode 100644
index 000000000..ac38e20bf
--- /dev/null
+++ b/drivers/net/af_xdp/Makefile
@@ -0,0 +1,56 @@
+#   BSD LICENSE
+#
+#   Copyright(c) 2014 John W. Linville <linvi...@redhat.com>
+#   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
+#   Copyright(c) 2014 6WIND S.A.
+#   All rights reserved.
+#
+#   Redistribution and use in source and binary forms, with or without
+#   modification, are permitted provided that the following conditions
+#   are met:
+#
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in
+#       the documentation and/or other materials provided with the
+#       distribution.
+#     * Neither the name of Intel Corporation nor the names of its
+#       contributors may be used to endorse or promote products derived
+#       from this software without specific prior written permission.
+#
+#   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+#
+# library name
+#
+LIB = librte_pmd_af_xdp.a
+
+EXPORT_MAP := rte_pmd_af_xdp_version.map
+
+LIBABIVER := 1
+
+# NOTE(review): the include path below is a hard-coded absolute location;
+# presumably it points at kernel headers providing linux/if_xdp.h - confirm
+# and make it configurable before merging.
+CFLAGS += -O3 -I/opt/af_xdp/linux_headers/include
+CFLAGS += $(WERROR_FLAGS)
+# DPDK libraries this PMD links against.
+LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
+LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs
+LDLIBS += -lrte_bus_vdev
+
+#
+# all source are stored in SRCS-y
+#
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP) += rte_eth_af_xdp.c
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/drivers/net/af_xdp/meson.build b/drivers/net/af_xdp/meson.build
new file mode 100644
index 000000000..4b5299c8e
--- /dev/null
+++ b/drivers/net/af_xdp/meson.build
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(c) 2017 Intel Corporation
+
+if host_machine.system() != 'linux'
+       build = false
+endif
+sources = files('rte_eth_af_xdp.c')
diff --git a/drivers/net/af_xdp/rte_eth_af_xdp.c b/drivers/net/af_xdp/rte_eth_af_xdp.c
new file mode 100644
index 000000000..4eb8a2c28
--- /dev/null
+++ b/drivers/net/af_xdp/rte_eth_af_xdp.c
@@ -0,0 +1,763 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2014 John W. Linville <linvi...@tuxdriver.com>
+ * Originally based upon librte_pmd_pcap code:
+ * Copyright(c) 2010-2015 Intel Corporation.
+ * Copyright(c) 2014 6WIND S.A.
+ * All rights reserved.
+ */
+
+#include <rte_mbuf.h>
+#include <rte_ethdev_driver.h>
+#include <rte_ethdev_vdev.h>
+#include <rte_malloc.h>
+#include <rte_kvargs.h>
+#include <rte_bus_vdev.h>
+
+#include <linux/if_ether.h>
+#include <linux/if_xdp.h>
+#include <arpa/inet.h>
+#include <net/if.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <poll.h>
+#include "xdpsock_queue.h"
+
+#ifndef SOL_XDP
+#define SOL_XDP 283
+#endif
+
+#ifndef AF_XDP
+#define AF_XDP 44
+#endif
+
+#ifndef PF_XDP
+#define PF_XDP AF_XDP
+#endif
+
+#define ETH_AF_XDP_IFACE_ARG           "iface"
+#define ETH_AF_XDP_QUEUE_IDX_ARG       "queue"
+#define ETH_AF_XDP_RING_SIZE_ARG       "ringsz"
+
+#define ETH_AF_XDP_FRAME_SIZE          2048
+#define ETH_AF_XDP_NUM_BUFFERS         131072
+#define ETH_AF_XDP_DATA_HEADROOM       0
+#define ETH_AF_XDP_DFLT_RING_SIZE      1024
+#define ETH_AF_XDP_DFLT_QUEUE_IDX      0
+
+#define ETH_AF_XDP_RX_BATCH_SIZE       32
+#define ETH_AF_XDP_TX_BATCH_SIZE       32
+
+/* Packet buffer area registered with the AF_XDP socket via XDP_MEM_REG. */
+struct xdp_umem {
+       char *buffer;                   /* start of the page-aligned frame area */
+       size_t size;                    /* total bytes: nframes * frame_size */
+       unsigned int frame_size;        /* bytes per frame */
+       unsigned int frame_size_log2;   /* shift used to turn a frame index into a byte offset */
+       unsigned int nframes;           /* number of frames in buffer */
+       int mr_fd;                      /* socket fd the memory was registered on */
+};
+
+/*
+ * Per-port private data. The same object is also installed as the RX and
+ * TX queue handle, since the device exposes exactly one queue pair.
+ */
+struct pmd_internals {
+       int sfd;                        /* AF_XDP socket */
+       int if_index;                   /* kernel ifindex of the backing netdev */
+       char if_name[0x100];            /* name of the backing netdev */
+       struct ether_addr eth_addr;     /* MAC address read from the netdev */
+       struct xdp_queue rx;            /* RX descriptor ring state */
+       struct xdp_queue tx;            /* TX descriptor ring state */
+       struct xdp_umem *umem;          /* registered packet buffer area */
+       struct rte_mempool *mb_pool;    /* mbuf pool given at RX queue setup */
+
+       /* RX statistics */
+       unsigned long rx_pkts;
+       unsigned long rx_bytes;
+       unsigned long rx_dropped;       /* frames dropped for lack of an mbuf */
+
+       /* TX statistics */
+       unsigned long tx_pkts;
+       unsigned long err_pkts;         /* packets too large for a umem frame */
+       unsigned long tx_bytes;
+
+       uint16_t port_id;
+       uint16_t queue_idx;             /* netdev queue the socket is bound to */
+       int ring_size;                  /* descriptors per RX/TX ring */
+       struct rte_ring *buf_ring;      /* FIFO of free umem frame indexes */
+};
+
+/* NULL-terminated list of devargs keys accepted by rte_kvargs_parse(). */
+static const char * const valid_arguments[] = {
+       ETH_AF_XDP_IFACE_ARG,
+       ETH_AF_XDP_QUEUE_IDX_ARG,
+       ETH_AF_XDP_RING_SIZE_ARG,
+       NULL
+};
+
+/*
+ * Initial link state copied into every new port; the speed is a fixed
+ * value and is not read from the backing netdev.
+ */
+static struct rte_eth_link pmd_link = {
+       .link_speed = ETH_SPEED_NUM_10G,
+       .link_duplex = ETH_LINK_FULL_DUPLEX,
+       .link_status = ETH_LINK_DOWN,
+       .link_autoneg = ETH_LINK_AUTONEG
+};
+
+/*
+ * Translate a umem frame index plus a byte offset inside that frame into
+ * a pointer within the umem buffer.
+ */
+static void *get_pkt_data(struct pmd_internals *internals,
+                         uint32_t index,
+                         uint32_t offset)
+{
+       struct xdp_umem *umem = internals->umem;
+       char *frame = umem->buffer + (index << umem->frame_size_log2);
+
+       return (uint8_t *)(frame + offset);
+}
+
+/*
+ * RX burst callback.
+ *
+ * When at least one batch of RX descriptors is free, the RX ring is
+ * refilled with frame indexes taken from buf_ring. Then up to nb_pkts
+ * (capped at ETH_AF_XDP_RX_BATCH_SIZE) completed descriptors are dequeued,
+ * each frame is copied into a freshly allocated mbuf, and every frame
+ * index is handed back to buf_ring.
+ *
+ * Returns the number of mbufs stored in bufs. Frames for which no mbuf
+ * could be allocated are counted in rx_dropped.
+ */
+static uint16_t
+eth_af_xdp_rx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+       struct pmd_internals *internals = queue;
+       struct xdp_queue *rxq = &internals->rx;
+       struct xdp_desc descs[ETH_AF_XDP_RX_BATCH_SIZE];
+       void *indexes[ETH_AF_XDP_RX_BATCH_SIZE];
+       struct rte_mbuf *mbuf;
+       unsigned long dropped = 0;
+       unsigned long rx_bytes = 0;
+       uint16_t count = 0;
+       int rcvd, i;
+
+       nb_pkts = nb_pkts < ETH_AF_XDP_RX_BATCH_SIZE ?
+                 nb_pkts : ETH_AF_XDP_RX_BATCH_SIZE;
+
+       /* fill rx ring */
+       if (rxq->num_free >= ETH_AF_XDP_RX_BATCH_SIZE) {
+               int n = rte_ring_dequeue_bulk(internals->buf_ring,
+                                             indexes,
+                                             ETH_AF_XDP_RX_BATCH_SIZE,
+                                             NULL);
+               for (i = 0; i < n; i++) {
+                       /*
+                        * Clear the whole descriptor first so that no
+                        * uninitialised stack bytes (len/offset/flags)
+                        * are published to the ring.
+                        */
+                       memset(&descs[i], 0, sizeof(descs[i]));
+                       descs[i].idx = (uint32_t)((long int)indexes[i]);
+               }
+               xq_enq(rxq, descs, n);
+       }
+
+       /* read data */
+       rcvd = xq_deq(rxq, descs, nb_pkts);
+       if (rcvd == 0)
+               return 0;
+
+       for (i = 0; i < rcvd; i++) {
+               char *pkt;
+               uint32_t idx = descs[i].idx;
+
+               mbuf = rte_pktmbuf_alloc(internals->mb_pool);
+               if (mbuf) {
+                       /* only touch the mbuf once allocation succeeded */
+                       rte_pktmbuf_pkt_len(mbuf) =
+                               rte_pktmbuf_data_len(mbuf) =
+                               descs[i].len;
+                       pkt = get_pkt_data(internals, idx, descs[i].offset);
+                       memcpy(rte_pktmbuf_mtod(mbuf, void *),
+                              pkt, descs[i].len);
+                       rx_bytes += descs[i].len;
+                       bufs[count++] = mbuf;
+               } else {
+                       dropped++;
+               }
+               /* the frame is recycled whether or not it was delivered */
+               indexes[i] = (void *)((long int)idx);
+       }
+
+       rte_ring_enqueue_bulk(internals->buf_ring, indexes, rcvd, NULL);
+
+       internals->rx_pkts += (rcvd - dropped);
+       internals->rx_bytes += rx_bytes;
+       internals->rx_dropped += dropped;
+
+       return count;
+}
+
+/*
+ * Notify the kernel that TX descriptors are pending by issuing an empty
+ * non-blocking sendto() on the AF_XDP socket.
+ *
+ * The call is retried only while it fails with EAGAIN. Success, ENOBUFS
+ * and every other error end the loop, so an unexpected errno (for example
+ * EBADF) can no longer make the data path spin forever.
+ */
+static void kick_tx(int fd)
+{
+       int ret;
+
+       do {
+               ret = sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0);
+       } while (ret < 0 && errno == EAGAIN);
+}
+
+/*
+ * TX burst callback.
+ *
+ * Reclaims completed TX descriptors when the ring is getting full, takes
+ * one free umem frame per packet from buf_ring, copies each packet that
+ * fits into a frame, posts the descriptors and kicks the kernel.
+ *
+ * Every mbuf in bufs[0..n) is freed here, where n is the number of
+ * packets a frame could be obtained for. Packets larger than a frame are
+ * dropped and counted in err_pkts. The return value is n, the number of
+ * mbufs consumed, so the caller never touches an mbuf already freed.
+ */
+static uint16_t
+eth_af_xdp_tx(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+       struct pmd_internals *internals = queue;
+       struct xdp_queue *txq = &internals->tx;
+       const unsigned int buf_len =
+               internals->umem->frame_size - ETH_AF_XDP_DATA_HEADROOM;
+       struct xdp_desc descs[ETH_AF_XDP_TX_BATCH_SIZE];
+       void *indexes[ETH_AF_XDP_TX_BATCH_SIZE];
+       struct rte_mbuf *mbuf;
+       unsigned long tx_bytes = 0;
+       uint16_t i, valid = 0;
+
+       nb_pkts = nb_pkts < ETH_AF_XDP_TX_BATCH_SIZE ?
+                 nb_pkts : ETH_AF_XDP_TX_BATCH_SIZE;
+
+       /* reclaim frames of descriptors the kernel has finished with */
+       if (txq->num_free < ETH_AF_XDP_TX_BATCH_SIZE * 2) {
+               int n = xq_deq(txq, descs, ETH_AF_XDP_TX_BATCH_SIZE);
+
+               for (i = 0; i < n; i++)
+                       indexes[i] = (void *)((long int)descs[i].idx);
+               rte_ring_enqueue_bulk(internals->buf_ring, indexes, n, NULL);
+       }
+
+       nb_pkts = nb_pkts > txq->num_free ? txq->num_free : nb_pkts;
+       nb_pkts = rte_ring_dequeue_bulk(internals->buf_ring, indexes,
+                                       nb_pkts, NULL);
+
+       for (i = 0; i < nb_pkts; i++) {
+               mbuf = bufs[i];
+               if (mbuf->pkt_len <= buf_len) {
+                       /*
+                        * Descriptors are packed densely: slot 'valid', not
+                        * slot 'i', is the one being built, so skipped
+                        * packets leave no gap.
+                        */
+                       struct xdp_desc *desc = &descs[valid];
+                       char *pkt;
+
+                       desc->idx = (uint32_t)((long int)indexes[valid]);
+                       desc->offset = ETH_AF_XDP_DATA_HEADROOM;
+                       desc->flags = 0;
+                       desc->len = mbuf->pkt_len;
+                       pkt = get_pkt_data(internals, desc->idx,
+                                          desc->offset);
+                       memcpy(pkt, rte_pktmbuf_mtod(mbuf, void *),
+                              desc->len);
+                       valid++;
+                       tx_bytes += mbuf->pkt_len;
+               }
+               rte_pktmbuf_free(mbuf);
+       }
+
+       xq_enq(txq, descs, valid);
+       kick_tx(internals->sfd);
+
+       /* give back the frames reserved for packets that were dropped */
+       if (valid < nb_pkts)
+               rte_ring_enqueue_bulk(internals->buf_ring, &indexes[valid],
+                                     nb_pkts - valid, NULL);
+
+       internals->err_pkts += (nb_pkts - valid);
+       internals->tx_pkts += valid;
+       internals->tx_bytes += tx_bytes;
+
+       /* all nb_pkts mbufs were consumed (sent or dropped) and freed */
+       return nb_pkts;
+}
+
+/*
+ * Post one RX descriptor for every currently free RX ring slot, each
+ * pointing at a umem frame index taken from buf_ring. Stops early if
+ * buf_ring has no more free frames, instead of posting a stale index.
+ */
+static void
+fill_rx_desc(struct pmd_internals *internals)
+{
+       int num_free = internals->rx.num_free;
+       void *p = NULL;
+       int i;
+
+       for (i = 0; i < num_free; i++) {
+               struct xdp_desc desc = {};
+
+               if (rte_ring_dequeue(internals->buf_ring, &p) != 0)
+                       break;
+               desc.idx = (uint32_t)((long int)p);
+               xq_enq(&internals->rx, &desc, 1);
+       }
+}
+
+/* Start the port: report link up and pre-post RX descriptors. */
+static int
+eth_dev_start(struct rte_eth_dev *dev)
+{
+       struct rte_eth_dev_data *data = dev->data;
+       struct pmd_internals *priv = data->dev_private;
+
+       data->dev_link.link_status = ETH_LINK_UP;
+       fill_rx_desc(priv);
+
+       return 0;
+}
+
+/* Stop callback for the port: only the reported link status changes. */
+static void
+eth_dev_stop(struct rte_eth_dev *dev)
+{
+       struct rte_eth_dev_data *data = dev->data;
+
+       data->dev_link.link_status = ETH_LINK_DOWN;
+}
+
+/* Configure callback: nothing to do, always reports success. */
+static int
+eth_dev_configure(struct rte_eth_dev *dev __rte_unused)
+{
+       return 0;
+}
+
+/* Report the fixed capabilities of the port: one RX/TX queue pair, one MAC. */
+static void
+eth_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
+{
+       const struct pmd_internals *priv = dev->data->dev_private;
+
+       dev_info->if_index = priv->if_index;
+
+       /* queue and address limits */
+       dev_info->max_rx_queues = 1;
+       dev_info->max_tx_queues = 1;
+       dev_info->max_mac_addrs = 1;
+
+       /* buffer limits */
+       dev_info->max_rx_pktlen = (uint32_t)ETH_FRAME_LEN;
+       dev_info->min_rx_bufsize = 0;
+}
+
+/*
+ * Copy the software counters kept in pmd_internals into the ethdev
+ * statistics structure; the per-queue slots use queue 0 only.
+ */
+static int
+eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
+{
+       const struct pmd_internals *priv = dev->data->dev_private;
+
+       /* receive side */
+       stats->q_ipackets[0] = priv->rx_pkts;
+       stats->ipackets = stats->q_ipackets[0];
+       stats->q_ibytes[0] = priv->rx_bytes;
+       stats->ibytes = stats->q_ibytes[0];
+       stats->imissed = priv->rx_dropped;
+
+       /* transmit side */
+       stats->q_opackets[0] = priv->tx_pkts;
+       stats->opackets = stats->q_opackets[0];
+       stats->q_errors[0] = priv->err_pkts;
+       stats->oerrors = stats->q_errors[0];
+       stats->q_obytes[0] = priv->tx_bytes;
+       stats->obytes = stats->q_obytes[0];
+
+       return 0;
+}
+
+/* Zero every software RX and TX counter of the port. */
+static void
+eth_stats_reset(struct rte_eth_dev *dev)
+{
+       struct pmd_internals *priv = dev->data->dev_private;
+
+       priv->rx_pkts = priv->rx_bytes = priv->rx_dropped = 0;
+       priv->tx_pkts = priv->tx_bytes = priv->err_pkts = 0;
+}
+
+/* Close callback: intentionally empty. */
+static void
+eth_dev_close(struct rte_eth_dev *dev __rte_unused)
+{
+}
+
+/* Queue release callback shared by RX and TX: intentionally empty. */
+static void
+eth_queue_release(void *q __rte_unused)
+{
+}
+
+/* Link update callback: no state is queried, always returns 0. */
+static int
+eth_link_update(struct rte_eth_dev *dev __rte_unused,
+               int wait_to_complete __rte_unused)
+{
+       return 0;
+}
+
+/*
+ * Allocate a page-aligned area of nbuffers frames and register it with
+ * the AF_XDP socket sfd through the XDP_MEM_REG socket option.
+ *
+ * Returns a heap-allocated (calloc) descriptor of the area, or NULL when
+ * allocation or registration fails. Registration failure is now handled
+ * at run time instead of through RTE_ASSERT, which is compiled out unless
+ * assertions are enabled. The caller owns both the descriptor and
+ * umem->buffer and must release them with free().
+ */
+static struct xdp_umem *xsk_alloc_and_mem_reg_buffers(int sfd, size_t nbuffers)
+{
+       struct xdp_mr_req req = { .frame_size = ETH_AF_XDP_FRAME_SIZE,
+                                 .data_headroom = ETH_AF_XDP_DATA_HEADROOM };
+       struct xdp_umem *umem;
+       void *bufs;
+       int ret;
+
+       ret = posix_memalign(&bufs, getpagesize(),
+                            nbuffers * req.frame_size);
+       if (ret)
+               return NULL;
+
+       umem = calloc(1, sizeof(*umem));
+       if (!umem) {
+               free(bufs);
+               return NULL;
+       }
+
+       req.addr = (unsigned long)bufs;
+       req.len = nbuffers * req.frame_size;
+       ret = setsockopt(sfd, SOL_XDP, XDP_MEM_REG, &req, sizeof(req));
+       if (ret) {
+               free(umem);
+               free(bufs);
+               return NULL;
+       }
+
+       umem->frame_size = ETH_AF_XDP_FRAME_SIZE;
+       /* log2(ETH_AF_XDP_FRAME_SIZE); must be kept in sync with it */
+       umem->frame_size_log2 = 11;
+       umem->buffer = bufs;
+       umem->size = nbuffers * req.frame_size;
+       umem->nframes = nbuffers;
+       umem->mr_fd = sfd;
+
+       return umem;
+}
+
+/*
+ * Set up everything the AF_XDP socket needs for the data path: the FIFO
+ * of free frame indexes, the registered umem, the RX and TX descriptor
+ * rings (created with setsockopt and mapped with mmap) and the bind to
+ * the configured interface and queue.
+ *
+ * Returns 0 on success and -1 on failure. Each step is checked at run
+ * time and everything acquired so far is released on error; the original
+ * relied on RTE_ASSERT, which is a no-op in default builds.
+ */
+static int
+xdp_configure(struct pmd_internals *internals)
+{
+       struct sockaddr_xdp sxdp;
+       struct xdp_ring_req req;
+       char ring_name[0x100];
+       size_t ring_bytes;
+       long int i;
+
+       snprintf(ring_name, sizeof(ring_name), "%s_%s_%d", "af_xdp_ring",
+                internals->if_name, internals->queue_idx);
+       internals->buf_ring = rte_ring_create(ring_name,
+                                             ETH_AF_XDP_NUM_BUFFERS,
+                                             SOCKET_ID_ANY,
+                                             0x0);
+       if (!internals->buf_ring)
+               return -1;
+
+       /*
+        * NOTE(review): without RING_F_EXACT_SZ an rte_ring presumably
+        * holds count - 1 entries, so the last enqueue may fail and leave
+        * one frame unused - confirm against the rte_ring documentation.
+        */
+       for (i = 0; i < ETH_AF_XDP_NUM_BUFFERS; i++)
+               rte_ring_enqueue(internals->buf_ring, (void *)i);
+
+       internals->umem = xsk_alloc_and_mem_reg_buffers(internals->sfd,
+                                                       ETH_AF_XDP_NUM_BUFFERS);
+       if (!internals->umem)
+               goto err_ring;
+
+       memset(&req, 0, sizeof(req));
+       req.mr_fd = internals->umem->mr_fd;
+       req.desc_nr = internals->ring_size;
+
+       if (setsockopt(internals->sfd, SOL_XDP, XDP_RX_RING,
+                      &req, sizeof(req)))
+               goto err_umem;
+
+       if (setsockopt(internals->sfd, SOL_XDP, XDP_TX_RING,
+                      &req, sizeof(req)))
+               goto err_umem;
+
+       ring_bytes = req.desc_nr * sizeof(struct xdp_desc);
+
+       internals->rx.ring = mmap(0, ring_bytes,
+                                 PROT_READ | PROT_WRITE,
+                                 MAP_SHARED | MAP_LOCKED | MAP_POPULATE,
+                                 internals->sfd,
+                                 XDP_PGOFF_RX_RING);
+       if (internals->rx.ring == MAP_FAILED) {
+               internals->rx.ring = NULL;
+               goto err_umem;
+       }
+
+       internals->rx.num_free = req.desc_nr;
+       internals->rx.ring_mask = req.desc_nr - 1;
+
+       internals->tx.ring = mmap(0, ring_bytes,
+                                 PROT_READ | PROT_WRITE,
+                                 MAP_SHARED | MAP_LOCKED | MAP_POPULATE,
+                                 internals->sfd,
+                                 XDP_PGOFF_TX_RING);
+       if (internals->tx.ring == MAP_FAILED) {
+               internals->tx.ring = NULL;
+               goto err_rx_map;
+       }
+
+       internals->tx.num_free = req.desc_nr;
+       internals->tx.ring_mask = req.desc_nr - 1;
+
+       /* zero first so no uninitialised field is passed to bind() */
+       memset(&sxdp, 0, sizeof(sxdp));
+       sxdp.sxdp_family = PF_XDP;
+       sxdp.sxdp_ifindex = internals->if_index;
+       sxdp.sxdp_queue_id = internals->queue_idx;
+
+       if (bind(internals->sfd, (struct sockaddr *)&sxdp, sizeof(sxdp)))
+               goto err_tx_map;
+
+       return 0;
+
+err_tx_map:
+       munmap(internals->tx.ring, ring_bytes);
+       internals->tx.ring = NULL;
+err_rx_map:
+       munmap(internals->rx.ring, ring_bytes);
+       internals->rx.ring = NULL;
+err_umem:
+       free(internals->umem->buffer);
+       free(internals->umem);
+       internals->umem = NULL;
+err_ring:
+       rte_ring_free(internals->buf_ring);
+       internals->buf_ring = NULL;
+       return -1;
+}
+
+/*
+ * RX queue setup callback (queue 0 only).
+ *
+ * Checks that a whole umem frame fits in the data room of mb_pool, then
+ * configures the AF_XDP socket. The size check runs first so that nothing
+ * has been allocated when it fails, and a failure of xdp_configure() is
+ * reported instead of being followed by a NULL umem dereference.
+ *
+ * Returns 0 on success, -ENOMEM if the mbufs are too small, -EINVAL if
+ * the socket could not be configured.
+ */
+static int
+eth_rx_queue_setup(struct rte_eth_dev *dev,
+                  uint16_t rx_queue_id,
+                  uint16_t nb_rx_desc __rte_unused,
+                  unsigned int socket_id __rte_unused,
+                  const struct rte_eth_rxconf *rx_conf __rte_unused,
+                  struct rte_mempool *mb_pool)
+{
+       struct pmd_internals *internals = dev->data->dev_private;
+       unsigned int buf_size, data_size;
+
+       RTE_ASSERT(rx_queue_id == 0);
+
+       /* Now get the space available for data in the mbuf */
+       buf_size = rte_pktmbuf_data_room_size(mb_pool) -
+               RTE_PKTMBUF_HEADROOM;
+       data_size = ETH_AF_XDP_FRAME_SIZE;
+
+       if (data_size > buf_size) {
+               RTE_LOG(ERR, PMD,
+                       "%s: %u bytes will not fit in mbuf (%u bytes)\n",
+                       dev->device->name, data_size, buf_size);
+               return -ENOMEM;
+       }
+
+       internals->mb_pool = mb_pool;
+       if (xdp_configure(internals) != 0) {
+               RTE_LOG(ERR, PMD,
+                       "%s: failed to configure AF_XDP socket\n",
+                       dev->device->name);
+               return -EINVAL;
+       }
+
+       dev->data->rx_queues[rx_queue_id] = internals;
+       return 0;
+}
+
+/*
+ * TX queue setup callback (queue 0 only): the per-port private data is
+ * installed as the queue handle.
+ */
+static int
+eth_tx_queue_setup(struct rte_eth_dev *dev,
+                  uint16_t tx_queue_id,
+                  uint16_t nb_tx_desc __rte_unused,
+                  unsigned int socket_id __rte_unused,
+                  const struct rte_eth_txconf *tx_conf __rte_unused)
+{
+       struct rte_eth_dev_data *data = dev->data;
+
+       RTE_ASSERT(tx_queue_id == 0);
+       data->tx_queues[tx_queue_id] = data->dev_private;
+       return 0;
+}
+
+/*
+ * Set the MTU of the backing netdev with the SIOCSIFMTU ioctl issued on
+ * a temporary datagram socket. Returns 0 on success, -EINVAL otherwise.
+ */
+static int
+eth_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
+{
+       struct pmd_internals *internals = dev->data->dev_private;
+       struct ifreq ifr = { .ifr_mtu = mtu };
+       int fd, rc;
+
+       fd = socket(PF_INET, SOCK_DGRAM, 0);
+       if (fd < 0)
+               return -EINVAL;
+
+       snprintf(ifr.ifr_name, IFNAMSIZ, "%s", internals->if_name);
+       rc = ioctl(fd, SIOCSIFMTU, &ifr);
+       close(fd);
+
+       return rc < 0 ? -EINVAL : 0;
+}
+
+/*
+ * Read-modify-write the interface flags of if_name: the current flags
+ * are ANDed with mask, ORed with flags and written back. Failures are
+ * silently ignored.
+ */
+static void
+eth_dev_change_flags(char *if_name, uint32_t flags, uint32_t mask)
+{
+       struct ifreq ifr;
+       int fd = socket(PF_INET, SOCK_DGRAM, 0);
+
+       if (fd < 0)
+               return;
+
+       snprintf(ifr.ifr_name, IFNAMSIZ, "%s", if_name);
+       if (ioctl(fd, SIOCGIFFLAGS, &ifr) >= 0) {
+               ifr.ifr_flags &= mask;
+               ifr.ifr_flags |= flags;
+               (void)ioctl(fd, SIOCSIFFLAGS, &ifr);
+       }
+
+       close(fd);
+}
+
+/* Set IFF_PROMISC on the backing netdev, leaving all other flags as is. */
+static void
+eth_dev_promiscuous_enable(struct rte_eth_dev *dev)
+{
+       struct pmd_internals *priv = dev->data->dev_private;
+       char *ifname = priv->if_name;
+
+       eth_dev_change_flags(ifname, IFF_PROMISC, ~0);
+}
+
+/* Clear IFF_PROMISC on the backing netdev, leaving all other flags as is. */
+static void
+eth_dev_promiscuous_disable(struct rte_eth_dev *dev)
+{
+       struct pmd_internals *priv = dev->data->dev_private;
+       char *ifname = priv->if_name;
+
+       eth_dev_change_flags(ifname, 0, ~IFF_PROMISC);
+}
+
+/* ethdev callback table installed on every port created by this driver. */
+static const struct eth_dev_ops ops = {
+       .dev_start = eth_dev_start,
+       .dev_stop = eth_dev_stop,
+       .dev_close = eth_dev_close,
+       .dev_configure = eth_dev_configure,
+       .dev_infos_get = eth_dev_info,
+       .mtu_set = eth_dev_mtu_set,
+       .promiscuous_enable = eth_dev_promiscuous_enable,
+       .promiscuous_disable = eth_dev_promiscuous_disable,
+       .rx_queue_setup = eth_rx_queue_setup,
+       .tx_queue_setup = eth_tx_queue_setup,
+       /* RX and TX share one (empty) release callback */
+       .rx_queue_release = eth_queue_release,
+       .tx_queue_release = eth_queue_release,
+       .link_update = eth_link_update,
+       .stats_get = eth_stats_get,
+       .stats_reset = eth_stats_reset,
+};
+
+/* Forward declaration; the driver object is defined at the end of the file. */
+static struct rte_vdev_driver pmd_af_xdp_drv;
+
+/*
+ * Walk the parsed devargs and store the interface name, queue index and
+ * ring size into the output parameters. Outputs whose key is absent are
+ * left untouched, so the caller must pre-load defaults.
+ */
+static void
+parse_parameters(struct rte_kvargs *kvlist,
+                char **if_name,
+                int *queue_idx,
+                int *ring_size)
+{
+       unsigned int n;
+
+       for (n = 0; n < kvlist->count; n++) {
+               struct rte_kvargs_pair *kv = &kvlist->pairs[n];
+
+               if (strstr(kv->key, ETH_AF_XDP_IFACE_ARG) != NULL) {
+                       *if_name = kv->value;
+                       continue;
+               }
+               if (strstr(kv->key, ETH_AF_XDP_QUEUE_IDX_ARG) != NULL) {
+                       *queue_idx = atoi(kv->value);
+                       continue;
+               }
+               if (strstr(kv->key, ETH_AF_XDP_RING_SIZE_ARG) != NULL)
+                       *ring_size = atoi(kv->value);
+       }
+}
+
+/*
+ * Look up the kernel ifindex and the hardware (MAC) address of the
+ * network interface if_name.
+ *
+ * Returns 0 on success and -1 on failure, including when if_name is NULL
+ * or does not fit in ifr_name (IFNAMSIZ bytes including the terminator).
+ * The original copied the name with an unbounded strcpy().
+ */
+static int
+get_iface_info(const char *if_name,
+              struct ether_addr *eth_addr,
+              int *if_index)
+{
+       struct ifreq ifr;
+       int sock;
+
+       if (!if_name || strlen(if_name) >= IFNAMSIZ)
+               return -1;
+
+       sock = socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
+       if (sock < 0)
+               return -1;
+
+       memset(&ifr, 0, sizeof(ifr));
+       snprintf(ifr.ifr_name, IFNAMSIZ, "%s", if_name);
+       if (ioctl(sock, SIOCGIFINDEX, &ifr))
+               goto error;
+       *if_index = ifr.ifr_ifindex;
+
+       if (ioctl(sock, SIOCGIFHWADDR, &ifr))
+               goto error;
+
+       memcpy(eth_addr, ifr.ifr_hwaddr.sa_data, ETH_ALEN);
+
+       close(sock);
+       return 0;
+
+error:
+       close(sock);
+       return -1;
+}
+
+/*
+ * Allocate and initialise the ethdev port backing one AF_XDP socket.
+ *
+ * Opens the AF_XDP socket, reads ifindex and MAC address of if_name,
+ * allocates the ethdev and replaces its data with a private copy that
+ * points at the new pmd_internals.
+ *
+ * Returns 0 on success and -1 on failure, with everything acquired here
+ * released again. Fixes over the original: the rte_eth_dev_data copy is
+ * allocated with its own size (it was sized as pmd_internals while
+ * sizeof(*data) bytes were copied into it), and a NULL or over-long
+ * if_name is rejected instead of being passed to strcpy().
+ */
+static int
+init_internals(struct rte_vdev_device *dev,
+              const char *if_name,
+              int queue_idx,
+              int ring_size)
+{
+       const char *name = rte_vdev_device_name(dev);
+       struct rte_eth_dev *eth_dev = NULL;
+       struct rte_eth_dev_data *data = NULL;
+       const unsigned int numa_node = dev->device.numa_node;
+       struct pmd_internals *internals = NULL;
+       int ret;
+
+       if (!if_name || strlen(if_name) >= sizeof(internals->if_name))
+               return -1;
+
+       data = rte_zmalloc_socket(name, sizeof(*data), 0, numa_node);
+       if (!data)
+               return -1;
+
+       internals = rte_zmalloc_socket(name, sizeof(*internals), 0, numa_node);
+       if (!internals)
+               goto error_1;
+
+       internals->queue_idx = queue_idx;
+       internals->ring_size = ring_size;
+       snprintf(internals->if_name, sizeof(internals->if_name), "%s",
+                if_name);
+       internals->sfd = socket(PF_XDP, SOCK_RAW, 0);
+       if (internals->sfd < 0)
+               goto error_2;
+
+       ret = get_iface_info(if_name, &internals->eth_addr,
+                            &internals->if_index);
+       if (ret)
+               goto error_3;
+
+       eth_dev = rte_eth_vdev_allocate(dev, 0);
+       if (!eth_dev)
+               goto error_3;
+
+       /* start from the ethdev-provided data, then override our fields */
+       rte_memcpy(data, eth_dev->data, sizeof(*data));
+       internals->port_id = eth_dev->data->port_id;
+       data->dev_private = internals;
+       data->nb_rx_queues = 1;
+       data->nb_tx_queues = 1;
+       data->dev_link = pmd_link;
+       data->mac_addrs = &internals->eth_addr;
+
+       eth_dev->data = data;
+       eth_dev->dev_ops = &ops;
+
+       eth_dev->rx_pkt_burst = eth_af_xdp_rx;
+       eth_dev->tx_pkt_burst = eth_af_xdp_tx;
+
+       return 0;
+
+error_3:
+       close(internals->sfd);
+
+error_2:
+       rte_free(internals);
+
+error_1:
+       rte_free(data);
+       return -1;
+}
+
+/*
+ * vdev probe callback: parse the device arguments and create the port.
+ *
+ * The "iface" argument is mandatory; probing fails with -1 when it is
+ * missing rather than passing a NULL name further down. "queue" and
+ * "ringsz" fall back to their ETH_AF_XDP_DFLT_* defaults.
+ */
+static int
+rte_pmd_af_xdp_probe(struct rte_vdev_device *dev)
+{
+       struct rte_kvargs *kvlist;
+       char *if_name = NULL;
+       int ring_size = ETH_AF_XDP_DFLT_RING_SIZE;
+       int queue_idx = ETH_AF_XDP_DFLT_QUEUE_IDX;
+       int ret;
+
+       RTE_LOG(INFO, PMD, "Initializing pmd_af_xdp for %s\n",
+               rte_vdev_device_name(dev));
+
+       kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
+       if (!kvlist) {
+               RTE_LOG(ERR, PMD,
+                       "Invalid kvargs\n");
+               return -1;
+       }
+
+       if (dev->device.numa_node == SOCKET_ID_ANY)
+               dev->device.numa_node = rte_socket_id();
+
+       parse_parameters(kvlist, &if_name, &queue_idx, &ring_size);
+
+       if (!if_name) {
+               RTE_LOG(ERR, PMD,
+                       "Missing mandatory argument: %s\n",
+                       ETH_AF_XDP_IFACE_ARG);
+               rte_kvargs_free(kvlist);
+               return -1;
+       }
+
+       /* if_name points into kvlist, so free the list only afterwards */
+       ret = init_internals(dev, if_name, queue_idx, ring_size);
+       rte_kvargs_free(kvlist);
+
+       return ret;
+}
+
+/*
+ * vdev remove callback: release every resource owned by the port.
+ *
+ * Fixes over the original: the socket is closed before the private data
+ * holding its fd is freed (the fd was read after rte_free()), the umem
+ * descriptor and its buffer come from calloc()/posix_memalign() and are
+ * therefore released with free() rather than rte_free(), and the mapped
+ * descriptor rings are unmapped.
+ *
+ * Returns 0 on success, -1 if dev is NULL or no matching ethdev exists.
+ */
+static int
+rte_pmd_af_xdp_remove(struct rte_vdev_device *dev)
+{
+       struct rte_eth_dev *eth_dev = NULL;
+       struct pmd_internals *internals;
+       size_t ring_bytes;
+
+       RTE_LOG(INFO, PMD, "Closing AF_XDP ethdev on numa socket %u\n",
+               rte_socket_id());
+
+       if (!dev)
+               return -1;
+
+       /* find the ethdev entry */
+       eth_dev = rte_eth_dev_allocated(rte_vdev_device_name(dev));
+       if (!eth_dev)
+               return -1;
+
+       internals = eth_dev->data->dev_private;
+
+       /* rings exist only if the RX queue was set up */
+       ring_bytes = internals->ring_size * sizeof(struct xdp_desc);
+       if (internals->rx.ring != NULL &&
+           (void *)internals->rx.ring != MAP_FAILED)
+               munmap(internals->rx.ring, ring_bytes);
+       if (internals->tx.ring != NULL &&
+           (void *)internals->tx.ring != MAP_FAILED)
+               munmap(internals->tx.ring, ring_bytes);
+
+       rte_ring_free(internals->buf_ring);
+
+       if (internals->umem) {
+               free(internals->umem->buffer);
+               free(internals->umem);
+       }
+
+       /* last use of internals: close the socket, then free it */
+       close(internals->sfd);
+       rte_free(internals);
+       rte_free(eth_dev->data);
+
+       rte_eth_dev_release_port(eth_dev);
+
+       return 0;
+}
+
+/* Virtual device driver object: entry points for probe and remove. */
+static struct rte_vdev_driver pmd_af_xdp_drv = {
+       .probe = rte_pmd_af_xdp_probe,
+       .remove = rte_pmd_af_xdp_remove,
+};
+
+/*
+ * Register the driver as "net_af_xdp" with the alias "eth_af_xdp" and
+ * publish the devargs it accepts.
+ */
+RTE_PMD_REGISTER_VDEV(net_af_xdp, pmd_af_xdp_drv);
+RTE_PMD_REGISTER_ALIAS(net_af_xdp, eth_af_xdp);
+RTE_PMD_REGISTER_PARAM_STRING(net_af_xdp,
+                             "iface=<string> "
+                             "queue=<int> "
+                             "ringsz=<int> ");
diff --git a/drivers/net/af_xdp/rte_pmd_af_xdp_version.map b/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
new file mode 100644
index 000000000..ef3539840
--- /dev/null
+++ b/drivers/net/af_xdp/rte_pmd_af_xdp_version.map
@@ -0,0 +1,4 @@
+DPDK_2.0 {
+
+       local: *;
+};
diff --git a/drivers/net/af_xdp/xdpsock_queue.h b/drivers/net/af_xdp/xdpsock_queue.h
new file mode 100644
index 000000000..0dc666a08
--- /dev/null
+++ b/drivers/net/af_xdp/xdpsock_queue.h
@@ -0,0 +1,62 @@
+#ifndef __XDPSOCK_QUEUE_H
+#define __XDPSOCK_QUEUE_H
+
+/*
+ * Post ndescs descriptors from descs onto ring q.
+ *
+ * Returns 0 on success, or -ENOSPC (without touching the ring) when fewer
+ * than ndescs slots are free.
+ *
+ * The payload fields of every slot are written first; after a write
+ * barrier the flags are set with XDP_DESC_KERNEL, walking the batch from
+ * its last slot back to its first.
+ * NOTE(review): the flag presumably hands slot ownership to the kernel,
+ * and the reverse order presumably keeps the consumer from seeing a
+ * partial batch - confirm against the kernel side.
+ */
+static inline int xq_enq(struct xdp_queue *q,
+                        const struct xdp_desc *descs,
+                        unsigned int ndescs)
+{
+       unsigned int avail_idx = q->avail_idx;
+       unsigned int i;
+       int j;
+
+       if (q->num_free < ndescs)
+               return -ENOSPC;
+
+       q->num_free -= ndescs;
+
+       /* pass 1: fill in everything except the flags */
+       for (i = 0; i < ndescs; i++) {
+               unsigned int idx = avail_idx++ & q->ring_mask;
+
+               q->ring[idx].idx        = descs[i].idx;
+               q->ring[idx].len        = descs[i].len;
+               q->ring[idx].offset     = descs[i].offset;
+               q->ring[idx].error      = 0;
+       }
+       /* make the payload visible before any flag is written */
+       rte_smp_wmb();
+
+       /* pass 2: set the flags, last slot of the batch first */
+       for (j = ndescs - 1; j >= 0; j--) {
+               unsigned int idx = (q->avail_idx + j) & q->ring_mask;
+
+               q->ring[idx].flags = descs[j].flags | XDP_DESC_KERNEL;
+       }
+       q->avail_idx += ndescs;
+
+       return 0;
+}
+
+/*
+ * Collect up to ndescs descriptors from ring q into descs.
+ *
+ * Starting at last_used_idx, counts consecutive slots whose flags no
+ * longer carry XDP_DESC_KERNEL, credits them to num_free, and after a
+ * read barrier copies them out while advancing last_used_idx.
+ *
+ * Returns the number of descriptors copied, which may be 0.
+ */
+static inline int xq_deq(struct xdp_queue *q,
+                        struct xdp_desc *descs,
+                        int ndescs)
+{
+       unsigned int idx, last_used_idx = q->last_used_idx;
+       int i, entries = 0;
+
+       /* pass 1: count slots without XDP_DESC_KERNEL, using a local cursor */
+       for (i = 0; i < ndescs; i++) {
+               idx = (last_used_idx++) & q->ring_mask;
+               if (q->ring[idx].flags & XDP_DESC_KERNEL)
+                       break;
+               entries++;
+       }
+       q->num_free += entries;
+
+       /* read the flags before reading the descriptor contents */
+       rte_smp_rmb();
+
+       /* pass 2: copy the counted slots and advance the real cursor */
+       for (i = 0; i < entries; i++) {
+               idx = q->last_used_idx++ & q->ring_mask;
+               descs[i] = q->ring[idx];
+       }
+
+       return entries;
+}
+
+#endif /* __XDPSOCK_QUEUE_H */
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 3eb41d176..bc26e1457 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -120,6 +120,7 @@ ifeq ($(CONFIG_RTE_BUILD_SHARED_LIB),n)
 _LDLIBS-$(CONFIG_RTE_DRIVER_MEMPOOL_STACK)  += -lrte_mempool_stack
 
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET)  += -lrte_pmd_af_packet
+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_XDP)     += -lrte_pmd_af_xdp
 _LDLIBS-$(CONFIG_RTE_LIBRTE_ARK_PMD)        += -lrte_pmd_ark
 _LDLIBS-$(CONFIG_RTE_LIBRTE_AVF_PMD)        += -lrte_pmd_avf
 _LDLIBS-$(CONFIG_RTE_LIBRTE_AVP_PMD)        += -lrte_pmd_avp
-- 
2.13.6

Reply via email to