On 02.06.2016 04:33, Daniele Di Proietto wrote:
> I wanted to do this for a long time now, thanks for posting this patch.
>
> I didn't notice a drop in throughput with this patch, for phy-ovs-phy
> tests, even when we call rte_eth_tx_burst() for every single packet.
That's good news.

> How about 'dpdk_tx_burst()' instead of 'netdev_dpdk_eth_instant_send()'?
> The "instant" part makes sense compared to the current code, but that
> code is removed.

Yes, I also wanted to change it. Maybe 'netdev_dpdk_eth_tx_burst()'? That
stays closer to the coding style and also points out that the function
works with 'eth' devices. But I think you may choose any name you like and
just replace it while pushing.

> Acked-by: Daniele Di Proietto <diproiet...@vmware.com>
>
> If there are no objections I can push this separately from the rest of
> the series.

OK. Thanks.

Best regards, Ilya Maximets.

> On 24/05/2016 06:34, "Ilya Maximets" <i.maxim...@samsung.com> wrote:
>
>> The current implementation of TX packet queueing is broken in several ways:
>>
>>   * TX queue flushing, which is performed on receive, assumes that all
>>     core ids are sequential and start from zero. This may leave packets
>>     stuck in a queue forever, and it also hurts latency.
>>
>>   * For a long time the flushing logic has depended on the uninitialized
>>     'txq_needs_locking', because it is usually calculated after
>>     'netdev_dpdk_alloc_txq' but is used inside that function to
>>     initialize 'flush_tx'.
>>
>> According to the current flushing logic, constant flushing is required if
>> TX queues are shared among different CPUs. Further patches will introduce
>> mechanisms for manipulating TX queues at runtime; in that case PMD threads
>> will not know whether the current queue is shared or not, so constant
>> flushing will be required anyway.
>>
>> Conclusion: let's remove the queueing entirely, because it doesn't work
>> properly now and constant flushing would be required anyway.
>>
>> Testing on basic PHY-OVS-PHY and PHY-OVS-VM-OVS-PHY scenarios shows an
>> insignificant performance drop (less than 0.5 percent) compared to the
>> gains we can achieve in the future using XPS or other features.
>>
>> Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
>> ---
>>  lib/netdev-dpdk.c | 102 ++++++++----------------------------------------------
>>  1 file changed, 14 insertions(+), 88 deletions(-)
>>
>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>> index 0d1b8c9..66e33df 100644
>> --- a/lib/netdev-dpdk.c
>> +++ b/lib/netdev-dpdk.c
>> @@ -167,7 +167,6 @@ static const struct rte_eth_conf port_conf = {
>>      },
>>  };
>>
>> -enum { MAX_TX_QUEUE_LEN = 384 };
>>  enum { DPDK_RING_SIZE = 256 };
>>  BUILD_ASSERT_DECL(IS_POW2(DPDK_RING_SIZE));
>>  enum { DRAIN_TSC = 200000ULL };
>> @@ -284,8 +283,7 @@ static struct ovs_list dpdk_mp_list OVS_GUARDED_BY(dpdk_mutex)
>>      = OVS_LIST_INITIALIZER(&dpdk_mp_list);
>>
>>  /* This mutex must be used by non pmd threads when allocating or freeing
>> - * mbufs through mempools. Since dpdk_queue_pkts() and dpdk_queue_flush() may
>> - * use mempools, a non pmd thread should hold this mutex while calling them */
>> + * mbufs through mempools. */
>>  static struct ovs_mutex nonpmd_mempool_mutex = OVS_MUTEX_INITIALIZER;
>>
>>  struct dpdk_mp {
>> @@ -299,17 +297,12 @@ struct dpdk_mp {
>>  /* There should be one 'struct dpdk_tx_queue' created for
>>   * each cpu core. */
>>  struct dpdk_tx_queue {
>> -    bool flush_tx;                 /* Set to true to flush queue everytime */
>> -                                   /* pkts are queued. */
>> -    int count;
>>      rte_spinlock_t tx_lock;        /* Protects the members and the NIC queue
>>                                      * from concurrent access.  It is used only
>>                                      * if the queue is shared among different
>>                                      * pmd threads (see 'txq_needs_locking'). */
>>      int map;                       /* Mapping of configured vhost-user queues
>>                                      * to enabled by guest. */
>> -    uint64_t tsc;
>> -    struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN];
>>  };
>>
>>  /* dpdk has no way to remove dpdk ring ethernet devices
>> @@ -703,19 +696,6 @@ netdev_dpdk_alloc_txq(struct netdev_dpdk *dev, unsigned int n_txqs)
>>
>>      dev->tx_q = dpdk_rte_mzalloc(n_txqs * sizeof *dev->tx_q);
>>      for (i = 0; i < n_txqs; i++) {
>> -        int numa_id = ovs_numa_get_numa_id(i);
>> -
>> -        if (!dev->txq_needs_locking) {
>> -            /* Each index is considered as a cpu core id, since there should
>> -             * be one tx queue for each cpu core. If the corresponding core
>> -             * is not on the same numa node as 'dev', flags the
>> -             * 'flush_tx'. */
>> -            dev->tx_q[i].flush_tx = dev->socket_id == numa_id;
>> -        } else {
>> -            /* Queues are shared among CPUs. Always flush */
>> -            dev->tx_q[i].flush_tx = true;
>> -        }
>> -
>>          /* Initialize map for vhost devices. */
>>          dev->tx_q[i].map = OVS_VHOST_QUEUE_MAP_UNKNOWN;
>>          rte_spinlock_init(&dev->tx_q[i].tx_lock);
>> @@ -1056,16 +1036,15 @@ netdev_dpdk_rxq_dealloc(struct netdev_rxq *rxq)
>>  }
>>
>>  static inline void
>> -dpdk_queue_flush__(struct netdev_dpdk *dev, int qid)
>> +netdev_dpdk_eth_instant_send(struct netdev_dpdk *dev, int qid,
>> +                             struct rte_mbuf **pkts, int cnt)
>>  {
>> -    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
>>      uint32_t nb_tx = 0;
>>
>> -    while (nb_tx != txq->count) {
>> +    while (nb_tx != cnt) {
>>          uint32_t ret;
>>
>> -        ret = rte_eth_tx_burst(dev->port_id, qid, txq->burst_pkts + nb_tx,
>> -                               txq->count - nb_tx);
>> +        ret = rte_eth_tx_burst(dev->port_id, qid, pkts + nb_tx, cnt - nb_tx);
>>          if (!ret) {
>>              break;
>>          }
>> @@ -1073,32 +1052,18 @@ dpdk_queue_flush__(struct netdev_dpdk *dev, int qid)
>>          nb_tx += ret;
>>      }
>>
>> -    if (OVS_UNLIKELY(nb_tx != txq->count)) {
>> +    if (OVS_UNLIKELY(nb_tx != cnt)) {
>>          /* free buffers, which we couldn't transmit, one at a time (each
>>           * packet could come from a different mempool) */
>>          int i;
>>
>> -        for (i = nb_tx; i < txq->count; i++) {
>> -            rte_pktmbuf_free(txq->burst_pkts[i]);
>> +        for (i = nb_tx; i < cnt; i++) {
>> +            rte_pktmbuf_free(pkts[i]);
>>          }
>>          rte_spinlock_lock(&dev->stats_lock);
>> -        dev->stats.tx_dropped += txq->count-nb_tx;
>> +        dev->stats.tx_dropped += cnt - nb_tx;
>>          rte_spinlock_unlock(&dev->stats_lock);
>>      }
>> -
>> -    txq->count = 0;
>> -    txq->tsc = rte_get_timer_cycles();
>> -}
>> -
>> -static inline void
>> -dpdk_queue_flush(struct netdev_dpdk *dev, int qid)
>> -{
>> -    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
>> -
>> -    if (txq->count == 0) {
>> -        return;
>> -    }
>> -    dpdk_queue_flush__(dev, qid);
>>  }
>>
>>  static bool
>> @@ -1207,18 +1172,8 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq, struct dp_packet **packets,
>>                       int *c)
>>  {
>>      struct netdev_rxq_dpdk *rx = netdev_rxq_dpdk_cast(rxq);
>> -    struct netdev_dpdk *dev = netdev_dpdk_cast(rxq->netdev);
>>      int nb_rx;
>>
>> -    /* There is only one tx queue for this core.  Do not flush other
>> -     * queues.
>> -     * Do not flush tx queue which is shared among CPUs
>> -     * since it is always flushed */
>> -    if (rxq->queue_id == rte_lcore_id() &&
>> -        OVS_LIKELY(!dev->txq_needs_locking)) {
>> -        dpdk_queue_flush(dev, rxq->queue_id);
>> -    }
>> -
>>      nb_rx = rte_eth_rx_burst(rx->port_id, rxq->queue_id,
>>                               (struct rte_mbuf **) packets,
>>                               NETDEV_MAX_BURST);
>> @@ -1345,35 +1300,6 @@ out:
>>      }
>>  }
>>
>> -inline static void
>> -dpdk_queue_pkts(struct netdev_dpdk *dev, int qid,
>> -                struct rte_mbuf **pkts, int cnt)
>> -{
>> -    struct dpdk_tx_queue *txq = &dev->tx_q[qid];
>> -    uint64_t diff_tsc;
>> -
>> -    int i = 0;
>> -
>> -    while (i < cnt) {
>> -        int freeslots = MAX_TX_QUEUE_LEN - txq->count;
>> -        int tocopy = MIN(freeslots, cnt-i);
>> -
>> -        memcpy(&txq->burst_pkts[txq->count], &pkts[i],
>> -               tocopy * sizeof (struct rte_mbuf *));
>> -
>> -        txq->count += tocopy;
>> -        i += tocopy;
>> -
>> -        if (txq->count == MAX_TX_QUEUE_LEN || txq->flush_tx) {
>> -            dpdk_queue_flush__(dev, qid);
>> -        }
>> -        diff_tsc = rte_get_timer_cycles() - txq->tsc;
>> -        if (diff_tsc >= DRAIN_TSC) {
>> -            dpdk_queue_flush__(dev, qid);
>> -        }
>> -    }
>> -}
>> -
>>  /* Tx function. Transmit packets indefinitely */
>>  static void
>>  dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
>> @@ -1435,8 +1361,7 @@ dpdk_do_tx_copy(struct netdev *netdev, int qid, struct dp_packet **pkts,
>>          newcnt = netdev_dpdk_qos_run__(dev, mbufs, newcnt);
>>
>>          dropped += qos_pkts - newcnt;
>> -        dpdk_queue_pkts(dev, qid, mbufs, newcnt);
>> -        dpdk_queue_flush(dev, qid);
>> +        netdev_dpdk_eth_instant_send(dev, qid, mbufs, newcnt);
>>      }
>>
>>      if (OVS_UNLIKELY(dropped)) {
>> @@ -1508,7 +1433,7 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
>>                      temp_cnt = netdev_dpdk_qos_run__(dev, (struct rte_mbuf**)pkts,
>>                                                       temp_cnt);
>>                      dropped += qos_pkts - temp_cnt;
>> -                    dpdk_queue_pkts(dev, qid,
>> +                    netdev_dpdk_eth_instant_send(dev, qid,
>>                                      (struct rte_mbuf **)&pkts[next_tx_idx],
>>                                      temp_cnt);
>>
>> @@ -1528,8 +1453,9 @@ netdev_dpdk_send__(struct netdev_dpdk *dev, int qid,
>>
>>              cnt = netdev_dpdk_qos_run__(dev, (struct rte_mbuf**)pkts, cnt);
>>              dropped += qos_pkts - cnt;
>> -            dpdk_queue_pkts(dev, qid, (struct rte_mbuf **)&pkts[next_tx_idx],
>> -                            cnt);
>> +            netdev_dpdk_eth_instant_send(dev, qid,
>> +                                         (struct rte_mbuf **)&pkts[next_tx_idx],
>> +                                         cnt);
>>          }
>>
>>      if (OVS_UNLIKELY(dropped)) {
>> --
>> 2.5.0

_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev
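The send path that remains after this patch reduces to a simple retry loop:
call rte_eth_tx_burst() until every mbuf has been handed to the NIC or the
driver stops accepting packets, then free whatever is left and let the caller
count it as dropped. A minimal standalone sketch of that pattern follows. It
is an illustration only, not the OVS code itself; the name 'tx_burst_retry'
and its parameters are placeholders, and the caller is assumed to do its own
stats accounting with the returned count.

    #include <stdint.h>

    #include <rte_ethdev.h>
    #include <rte_mbuf.h>

    /* Hand 'cnt' mbufs to TX queue 'queue_id' of port 'port_id', retrying
     * until the driver either accepts everything or refuses to take more.
     * Packets the driver never accepted are freed here one by one, since
     * each mbuf may come from a different mempool.  Returns the number of
     * packets actually sent. */
    static inline uint16_t
    tx_burst_retry(uint16_t port_id, uint16_t queue_id,
                   struct rte_mbuf **pkts, uint16_t cnt)
    {
        uint16_t nb_tx = 0;

        while (nb_tx != cnt) {
            uint16_t ret = rte_eth_tx_burst(port_id, queue_id,
                                            pkts + nb_tx, cnt - nb_tx);
            if (!ret) {
                break;    /* TX descriptor ring is full; stop retrying. */
            }
            nb_tx += ret;
        }

        /* Free the packets the driver never accepted. */
        for (uint16_t i = nb_tx; i < cnt; i++) {
            rte_pktmbuf_free(pkts[i]);
        }

        return nb_tx;
    }

Breaking out of the loop on a zero return is what keeps a congested NIC queue
from stalling a PMD thread: rather than spinning, the leftover packets are
dropped and, in the patch above, accounted in 'dev->stats.tx_dropped' under
'stats_lock'.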