> -----Original Message-----
> From: Bie, Tiwei
> Sent: Friday, November 24, 2017 2:05 PM
> To: Wang, Xiao W <xiao.w.w...@intel.com>
> Cc: dev@dpdk.org; y...@fridaylinux.org
> Subject: Re: [dpdk-dev] [PATCH 2/2] net/virtio: support GUEST ANNOUNCE
>
> Hi,
>
> Some quick comments. Will go through the whole patch later.
>
> On Fri, Nov 24, 2017 at 03:04:00AM -0800, Xiao Wang wrote:
> > When live migration is done, for the backup VM, either the virtio
> > frontend or the vhost backend needs to send out a gratuitous RARP
> > packet to announce its new network location.
> >
> > This patch enables the VIRTIO_NET_F_GUEST_ANNOUNCE feature to support
> > the live migration scenario where the vhost backend doesn't have the
> > ability to generate a RARP packet.
> >
> > Brief introduction of the work flow:
> > 1. QEMU finishes live migration and pokes the backup VM with an
> >    interrupt.
> > 2. The virtio interrupt handler reads out the interrupt status value
> >    and realizes it needs to send out a RARP packet to announce its
> >    location.
> > 3. Pause the device to stop the worker thread from touching the queues.
> > 4. Inject a RARP packet into a Tx queue.
> > 5. Ack the interrupt via the control queue.
> > 6. Resume the device to continue packet processing.
> >
> > Signed-off-by: Xiao Wang <xiao.w.w...@intel.com>
> > ---
> >  drivers/net/virtio/virtio_ethdev.c | 131 ++++++++++++++++++++++++++++++++++++-
> >  drivers/net/virtio/virtio_ethdev.h |   4 ++
> >  drivers/net/virtio/virtio_pci.h    |   1 +
> >  drivers/net/virtio/virtio_rxtx.c   |  81 +++++++++++++++++++++++
> >  drivers/net/virtio/virtqueue.h     |  11 ++++
> >  5 files changed, 226 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
> > index 1959b11..6eaea0e 100644
> > --- a/drivers/net/virtio/virtio_ethdev.c
> > +++ b/drivers/net/virtio/virtio_ethdev.c
> > @@ -48,6 +48,8 @@
> >  #include <rte_pci.h>
> >  #include <rte_bus_pci.h>
> >  #include <rte_ether.h>
> > +#include <rte_ip.h>
> > +#include <rte_arp.h>
> >  #include <rte_common.h>
> >  #include <rte_errno.h>
> >  #include <rte_cpuflags.h>
> > @@ -55,6 +57,7 @@
> >  #include <rte_memory.h>
> >  #include <rte_eal.h>
> >  #include <rte_dev.h>
> > +#include <rte_cycles.h>
> >
> >  #include "virtio_ethdev.h"
> >  #include "virtio_pci.h"
> > @@ -106,6 +109,13 @@ static int virtio_dev_queue_stats_mapping_set(
> >  		uint8_t stat_idx,
> >  		uint8_t is_rx);
> >
> > +static int make_rarp_packet(struct rte_mbuf *rarp_mbuf,
> > +		const struct ether_addr *mac);
> > +static int virtio_dev_pause(struct rte_eth_dev *dev);
> > +static void virtio_dev_resume(struct rte_eth_dev *dev);
> > +static void generate_rarp(struct rte_eth_dev *dev);
> > +static void virtnet_ack_link_announce(struct rte_eth_dev *dev);
> > +
> >  /*
> >   * The set of PCI devices this driver supports
> >   */
> > @@ -1249,9 +1259,116 @@ static int virtio_dev_xstats_get_names(struct rte_eth_dev *dev,
> >  	return 0;
> >  }
> >
> > +#define RARP_PKT_SIZE	64
> > +
> > +static int
> > +make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
> > +{
> > +	struct ether_hdr *eth_hdr;
> > +	struct arp_hdr *rarp;
> > +
> > +	if (rarp_mbuf->buf_len < RARP_PKT_SIZE) {
> > +		PMD_DRV_LOG(ERR, "mbuf size too small %u (< %d)",
> > +				rarp_mbuf->buf_len, RARP_PKT_SIZE);
> > +		return -1;
> > +	}
> > +
> > +	/* Ethernet header. */
> > +	eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
>
> You can use rte_pktmbuf_mtod() directly.

Looks better. Will change it in v2.
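For reference, a minimal sketch of how that line might read in v2 (assuming
nothing else in make_rarp_packet() changes):

	/* rte_pktmbuf_mtod() is rte_pktmbuf_mtod_offset() with offset 0 */
	eth_hdr = rte_pktmbuf_mtod(rarp_mbuf, struct ether_hdr *);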
> > +	memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
> > +	ether_addr_copy(mac, &eth_hdr->s_addr);
> > +	eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
> > +
> > +	/* RARP header. */
> > +	rarp = (struct arp_hdr *)(eth_hdr + 1);
> > +	rarp->arp_hrd = htons(ARP_HRD_ETHER);
> > +	rarp->arp_pro = htons(ETHER_TYPE_IPv4);
> > +	rarp->arp_hln = ETHER_ADDR_LEN;
> > +	rarp->arp_pln = 4;
> > +	rarp->arp_op = htons(ARP_OP_REVREQUEST);
> > +
> > +	ether_addr_copy(mac, &rarp->arp_data.arp_sha);
> > +	ether_addr_copy(mac, &rarp->arp_data.arp_tha);
> > +	memset(&rarp->arp_data.arp_sip, 0x00, 4);
> > +	memset(&rarp->arp_data.arp_tip, 0x00, 4);
> > +
> > +	rarp_mbuf->data_len = RARP_PKT_SIZE;
> > +	rarp_mbuf->pkt_len = RARP_PKT_SIZE;
> > +
> > +	return 0;
> > +}
> > +
> > +static int
> > +virtio_dev_pause(struct rte_eth_dev *dev)
> > +{
> > +	struct virtio_hw *hw = dev->data->dev_private;
> > +
> > +	if (hw->started == 0)
> > +		return -1;
> > +	hw->started = 0;
> > +	/*
> > +	 * Prevent the worker thread from touching queues to avoid contention,
> > +	 * 1 ms should be enough for the ongoing Tx function to finish.
> > +	 */
> > +	rte_delay_ms(1);
> > +	return 0;
> > +}
> > +
> > +static void
> > +virtio_dev_resume(struct rte_eth_dev *dev)
> > +{
> > +	struct virtio_hw *hw = dev->data->dev_private;
> > +
> > +	hw->started = 1;
> > +}
> > +
> > +static void
> > +generate_rarp(struct rte_eth_dev *dev)
> > +{
> > +	struct virtio_hw *hw = dev->data->dev_private;
> > +	struct rte_mbuf *rarp_mbuf = NULL;
> > +	struct virtnet_tx *txvq = dev->data->tx_queues[0];
> > +	struct virtnet_rx *rxvq = dev->data->rx_queues[0];
> > +
> > +	rarp_mbuf = rte_mbuf_raw_alloc(rxvq->mpool);
> > +	if (rarp_mbuf == NULL) {
> > +		PMD_DRV_LOG(ERR, "mbuf allocate failed");
> > +		return;
> > +	}
> > +
> > +	if (make_rarp_packet(rarp_mbuf, (struct ether_addr *)hw->mac_addr)) {
> > +		rte_pktmbuf_free(rarp_mbuf);
> > +		rarp_mbuf = NULL;
> > +		return;
> > +	}
> > +
> > +	/* If virtio port just stopped, no need to send RARP */
> > +	if (virtio_dev_pause(dev) < -1)
>
> You mean < 0?

Yes, will fix it in v2.

> > +		return;
> > +
> > +	virtio_inject_pkts(txvq, &rarp_mbuf, 1);
> > +	/* Recover the stored hw status to let the worker thread continue */
> > +	virtio_dev_resume(dev);
> > +}
> > +
> > +static void
> > +virtnet_ack_link_announce(struct rte_eth_dev *dev)
> > +{
> > +	struct virtio_hw *hw = dev->data->dev_private;
> > +	struct virtio_pmd_ctrl ctrl;
> > +	int len;
> > +
> > +	ctrl.hdr.class = VIRTIO_NET_CTRL_ANNOUNCE;
> > +	ctrl.hdr.cmd = VIRTIO_NET_CTRL_ANNOUNCE_ACK;
> > +	len = 0;
> > +
> > +	virtio_send_command(hw->cvq, &ctrl, &len, 0);
> > +}
> > +
> >  /*
> > - * Process Virtio Config changed interrupt and call the callback
> > - * if link state changed.
> > + * Process virtio config changed interrupt. Call the callback
> > + * if link state changed; generate a gratuitous RARP packet if
> > + * the status indicates an ANNOUNCE.
> >   */
> >  void
> >  virtio_interrupt_handler(void *param)
> > @@ -1274,6 +1391,12 @@ static int virtio_dev_xstats_get_names(struct rte_eth_dev *dev,
> >  					     NULL, NULL);
> >  	}
> >
> > +	if (isr & VIRTIO_NET_S_ANNOUNCE) {
> > +		rte_spinlock_lock(&hw->sl);
> > +		generate_rarp(dev);
> > +		virtnet_ack_link_announce(dev);
> > +		rte_spinlock_unlock(&hw->sl);
> > +	}
> >  }
> >
> >  /* set rx and tx handlers according to what is supported */
> > @@ -1786,6 +1909,8 @@ static int eth_virtio_pci_remove(struct rte_pci_device *pci_dev)
> >  		return -EBUSY;
> >  	}
> >
> > +	rte_spinlock_init(&hw->sl);
> > +
> >  	hw->use_simple_rx = 1;
> >  	hw->use_simple_tx = 1;
> >
> > @@ -1952,12 +2077,14 @@ static void virtio_dev_free_mbufs(struct rte_eth_dev *dev)
> >
> >  	PMD_INIT_LOG(DEBUG, "stop");
> >
> > +	rte_spinlock_lock(&hw->sl);
> >  	if (intr_conf->lsc || intr_conf->rxq)
> >  		virtio_intr_disable(dev);
> >
> >  	hw->started = 0;
> >  	memset(&link, 0, sizeof(link));
> >  	virtio_dev_atomic_write_link_status(dev, &link);
> > +	rte_spinlock_unlock(&hw->sl);
> >  }
> >
> >  static int
> > diff --git a/drivers/net/virtio/virtio_ethdev.h b/drivers/net/virtio/virtio_ethdev.h
> > index 2039bc5..24271cb 100644
> > --- a/drivers/net/virtio/virtio_ethdev.h
> > +++ b/drivers/net/virtio/virtio_ethdev.h
> > @@ -67,6 +67,7 @@
> >  	 1u << VIRTIO_NET_F_HOST_TSO6 |	\
> >  	 1u << VIRTIO_NET_F_MRG_RXBUF |	\
> >  	 1u << VIRTIO_NET_F_MTU |	\
> > +	 1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE |	\
> >  	 1u << VIRTIO_RING_F_INDIRECT_DESC |	\
> >  	 1ULL << VIRTIO_F_VERSION_1 |	\
> >  	 1ULL << VIRTIO_F_IOMMU_PLATFORM)
> > @@ -111,6 +112,9 @@ uint16_t virtio_recv_mergeable_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
> >  uint16_t virtio_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
> >  		uint16_t nb_pkts);
> >
> > +uint16_t virtio_inject_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
> > +		uint16_t nb_pkts);
> > +
> >  uint16_t virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
> >  		uint16_t nb_pkts);
> >
> > diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h
> > index 3c5ce66..3cd367e 100644
> > --- a/drivers/net/virtio/virtio_pci.h
> > +++ b/drivers/net/virtio/virtio_pci.h
> > @@ -270,6 +270,7 @@ struct virtio_hw {
> >  	struct virtio_pci_common_cfg *common_cfg;
> >  	struct virtio_net_config *dev_cfg;
> >  	void	    *virtio_user_dev;
> > +	rte_spinlock_t sl;
>
> Need to add some detailed comments to describe what's
> protected by this lock.

With this feature, the hw->started flag can be changed by two threads: the
app management thread and the interrupt handler thread. This lock prevents
a case like the following:

1. Live migration is done; the config change interrupt triggers, and the
   handler pauses the device: hw->started = 0;
2. The app stops the virtio port: hw->started = 0;
3. The RARP packet is injected into the Tx queue, the ack is sent on the
   control queue, and the handler resumes the device: hw->started = 1;
   (but the port is stopped already)
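To make that interleaving concrete, here is a minimal standalone sketch
(hypothetical toy_hw/announce_path/stop_path names, not code from the patch)
of the serialization hw->sl provides; both writers take the same lock, so
the stop can never land between the pause and the resume:

	#include <rte_spinlock.h>

	struct toy_hw {
		rte_spinlock_t sl;
		volatile int started;
	};

	/* Interrupt handler thread: pause/inject/resume is one critical
	 * section, so it executes atomically w.r.t. stop_path(). */
	static void announce_path(struct toy_hw *hw)
	{
		rte_spinlock_lock(&hw->sl);
		hw->started = 0;	/* virtio_dev_pause() */
		/* ... inject RARP, ack the announce ... */
		hw->started = 1;	/* virtio_dev_resume() */
		rte_spinlock_unlock(&hw->sl);
	}

	/* App management thread: takes the same lock, so it runs either
	 * entirely before or entirely after the sequence above, and a
	 * stopped port stays stopped. */
	static void stop_path(struct toy_hw *hw)
	{
		rte_spinlock_lock(&hw->sl);
		hw->started = 0;
		rte_spinlock_unlock(&hw->sl);
	}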
> >
> >  	struct virtqueue **vqs;
> >  };
> >
> > diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
> > index 6a24fde..7313bdd 100644
> > --- a/drivers/net/virtio/virtio_rxtx.c
> > +++ b/drivers/net/virtio/virtio_rxtx.c
> > @@ -1100,3 +1100,84 @@
> >
> >  	return nb_tx;
> >  }
> > +
> > +uint16_t
> > +virtio_inject_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
> > +{
> > +	struct virtnet_tx *txvq = tx_queue;
> > +	struct virtqueue *vq = txvq->vq;
> > +	struct virtio_hw *hw = vq->hw;
> > +	uint16_t hdr_size = hw->vtnet_hdr_size;
> > +	uint16_t nb_used, nb_tx = 0;
> > +
> > +	if (unlikely(nb_pkts < 1))
> > +		return nb_pkts;
> > +
> > +	PMD_TX_LOG(DEBUG, "%d packets to xmit", nb_pkts);
> > +	nb_used = VIRTQUEUE_NUSED(vq);
> > +
> > +	virtio_rmb();
> > +	if (likely(nb_used > vq->vq_nentries - vq->vq_free_thresh))
> > +		virtio_xmit_cleanup(vq, nb_used);
> > +
> > +	for (nb_tx = 0; nb_tx < nb_pkts; nb_tx++) {
> > +		struct rte_mbuf *txm = tx_pkts[nb_tx];
> > +		int can_push = 0, use_indirect = 0, slots, need;
> > +
> > +		/* optimize ring usage */
> > +		if ((vtpci_with_feature(hw, VIRTIO_F_ANY_LAYOUT) ||
> > +		     vtpci_with_feature(hw, VIRTIO_F_VERSION_1)) &&
> > +		    rte_mbuf_refcnt_read(txm) == 1 &&
> > +		    RTE_MBUF_DIRECT(txm) &&
> > +		    txm->nb_segs == 1 &&
> > +		    rte_pktmbuf_headroom(txm) >= hdr_size &&
> > +		    rte_is_aligned(rte_pktmbuf_mtod(txm, char *),
> > +				   __alignof__(struct virtio_net_hdr_mrg_rxbuf)))
> > +			can_push = 1;
> > +		else if (vtpci_with_feature(hw, VIRTIO_RING_F_INDIRECT_DESC) &&
> > +			 txm->nb_segs < VIRTIO_MAX_TX_INDIRECT)
> > +			use_indirect = 1;
> > +
> > +		/* How many main ring entries are needed for this Tx?
> > +		 * any_layout => number of segments
> > +		 * indirect   => 1
> > +		 * default    => number of segments + 1
> > +		 */
> > +		slots = use_indirect ? 1 : (txm->nb_segs + !can_push);
> > +		need = slots - vq->vq_free_cnt;
> > +
> > +		/* Positive value indicates it needs free vring descriptors */
> > +		if (unlikely(need > 0)) {
> > +			nb_used = VIRTQUEUE_NUSED(vq);
> > +			virtio_rmb();
> > +			need = RTE_MIN(need, (int)nb_used);
> > +
> > +			virtio_xmit_cleanup(vq, need);
> > +			need = slots - vq->vq_free_cnt;
> > +			if (unlikely(need > 0)) {
> > +				PMD_TX_LOG(ERR,
> > +					   "No free tx descriptors to transmit");
> > +				break;
> > +			}
> > +		}
> > +
> > +		/* Enqueue Packet buffers */
> > +		virtqueue_enqueue_xmit(txvq, txm, slots, use_indirect, can_push);
> > +
> > +		txvq->stats.bytes += txm->pkt_len;
> > +		virtio_update_packet_stats(&txvq->stats, txm);
> > +	}
> > +
> > +	txvq->stats.packets += nb_tx;
> > +
> > +	if (likely(nb_tx)) {
> > +		vq_update_avail_idx(vq);
> > +
> > +		if (unlikely(virtqueue_kick_prepare(vq))) {
> > +			virtqueue_notify(vq);
> > +			PMD_TX_LOG(DEBUG, "Notified backend after xmit");
> > +		}
> > +	}
> > +
> > +	return nb_tx;
> > +}
>
> What's the difference between virtio_inject_pkts() and
> virtio_xmit_pkts() except the latter will check hw->started?

No VLAN tag insertion. Other than that, they both use
virtqueue_enqueue_xmit() to enqueue packets.
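For context, this is roughly the per-packet block at the top of
virtio_xmit_pkts()'s Tx loop that virtio_inject_pkts() leaves out (a sketch
based on the existing virtio Tx path, illustrative rather than a verbatim
quote):

	/* Do VLAN tag insertion */
	if (unlikely(txm->ol_flags & PKT_TX_VLAN_PKT)) {
		error = rte_vlan_insert(&txm);
		if (unlikely(error)) {
			rte_pktmbuf_free(txm);
			continue;
		}
	}

The injected RARP mbuf is built by make_rarp_packet() and never carries
PKT_TX_VLAN_PKT, so skipping this block is safe for the announce path.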
Thanks for your comments.

Xiao