Hi Tomasz,

> Added API for `rte_eth_tx_prep`
>
>     uint16_t rte_eth_tx_prep(uint8_t port_id, uint16_t queue_id,
>             struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
>
> Added fields to the `struct rte_eth_desc_lim`:
>
>     uint16_t nb_seg_max;
>         /**< Max number of segments per whole packet. */
>
>     uint16_t nb_mtu_seg_max;
>         /**< Max number of segments per one MTU. */
>
> Created `rte_pkt.h` header with commonly used functions:
>
>     int rte_validate_tx_offload(struct rte_mbuf *m)
>         to validate general requirements for tx offload in a packet,
>         such as flag completeness. In the current implementation this
>         function is called optionally, when RTE_LIBRTE_ETHDEV_DEBUG
>         is enabled.
>
>     int rte_phdr_cksum_fix(struct rte_mbuf *m)
>         to fix the pseudo-header checksum for TSO and non-TSO tcp/udp
>         packets before hardware tx checksum offload.
>         - for non-TSO tcp/udp packets the full pseudo-header checksum
>           is calculated and set.
>         - for TSO the IP payload length is not included.
>
> Signed-off-by: Tomasz Kulasek <tomaszx.kulasek at intel.com>
> ---
>  config/common_base            |    1 +
>  lib/librte_ether/rte_ethdev.h |   85 ++++++++++++++++++++++++++
>  lib/librte_mbuf/rte_mbuf.h    |    8 +++
>  lib/librte_net/Makefile       |    2 +-
>  lib/librte_net/rte_pkt.h      |  132 +++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 227 insertions(+), 1 deletion(-)
>  create mode 100644 lib/librte_net/rte_pkt.h
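BTW, just to check that I understand the intended usage model correctly:
I assume an application is expected to call it right before tx_burst,
roughly as below? (Just a sketch - the send_burst() wrapper and its error
handling are mine, and I am guessing that the first nb_prep packets are
the ones that passed the checks.)

#include <rte_ethdev.h>
#include <rte_errno.h>
#include <rte_mbuf.h>

static uint16_t
send_burst(uint8_t port_id, uint16_t queue_id,
        struct rte_mbuf **pkts, uint16_t nb_pkts)
{
        /* Validate the burst and fix pseudo-header checksums first. */
        uint16_t nb_prep = rte_eth_tx_prep(port_id, queue_id, pkts, nb_pkts);

        if (nb_prep < nb_pkts) {
                /* Some packet didn't meet the device requirements;
                 * rte_errno should explain why. Drop it here; a real
                 * application would also deal with the packets after it. */
                rte_pktmbuf_free(pkts[nb_prep]);
        }

        /* Transmit whatever passed the checks. */
        return rte_eth_tx_burst(port_id, queue_id, pkts, nb_prep);
}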
>
> diff --git a/config/common_base b/config/common_base
> index 7830535..7ada9e0 100644
> --- a/config/common_base
> +++ b/config/common_base
> @@ -120,6 +120,7 @@ CONFIG_RTE_MAX_QUEUES_PER_PORT=1024
>  CONFIG_RTE_LIBRTE_IEEE1588=n
>  CONFIG_RTE_ETHDEV_QUEUE_STAT_CNTRS=16
>  CONFIG_RTE_ETHDEV_RXTX_CALLBACKS=y
> +CONFIG_RTE_ETHDEV_TX_PREP=y
>
>  #
>  # Support NIC bypass logic
> diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
> index b0fe033..4fa674d 100644
> --- a/lib/librte_ether/rte_ethdev.h
> +++ b/lib/librte_ether/rte_ethdev.h
> @@ -182,6 +182,7 @@ extern "C" {
>  #include <rte_pci.h>
>  #include <rte_dev.h>
>  #include <rte_devargs.h>
> +#include <rte_errno.h>
>  #include "rte_ether.h"
>  #include "rte_eth_ctrl.h"
>  #include "rte_dev_info.h"
> @@ -696,6 +697,8 @@ struct rte_eth_desc_lim {
>  	uint16_t nb_max;   /**< Max allowed number of descriptors. */
>  	uint16_t nb_min;   /**< Min allowed number of descriptors. */
>  	uint16_t nb_align; /**< Number of descriptors should be aligned to. */
> +	uint16_t nb_seg_max;     /**< Max number of segments per whole packet. */
> +	uint16_t nb_mtu_seg_max; /**< Max number of segments per one MTU. */
>  };
>
>  /**
> @@ -1181,6 +1184,11 @@ typedef uint16_t (*eth_tx_burst_t)(void *txq,
>  				   uint16_t nb_pkts);
>  /**< @internal Send output packets on a transmit queue of an Ethernet device. */
>
> +typedef uint16_t (*eth_tx_prep_t)(void *txq,
> +				  struct rte_mbuf **tx_pkts,
> +				  uint16_t nb_pkts);
> +/**< @internal Prepare output packets on a transmit queue of an Ethernet device. */
> +
>  typedef int (*flow_ctrl_get_t)(struct rte_eth_dev *dev,
>  			       struct rte_eth_fc_conf *fc_conf);
>  /**< @internal Get current flow control parameter on an Ethernet device */
> @@ -1626,6 +1634,7 @@ enum rte_eth_dev_type {
>  struct rte_eth_dev {
>  	eth_rx_burst_t rx_pkt_burst; /**< Pointer to PMD receive function. */
>  	eth_tx_burst_t tx_pkt_burst; /**< Pointer to PMD transmit function. */
> +	eth_tx_prep_t tx_pkt_prep;   /**< Pointer to PMD transmit prepare function. */
>  	struct rte_eth_dev_data *data; /**< Pointer to device data */
>  	const struct eth_driver *driver; /**< Driver for this device */
>  	const struct eth_dev_ops *dev_ops; /**< Functions exported by PMD */
> @@ -2833,6 +2842,82 @@ rte_eth_tx_burst(uint8_t port_id, uint16_t queue_id,
>  	return (*dev->tx_pkt_burst)(dev->data->tx_queues[queue_id], tx_pkts,
>  			nb_pkts);
>  }
>
> +/**
> + * Process a burst of output packets on a transmit queue of an Ethernet device.
> + *
> + * The rte_eth_tx_prep() function is invoked to prepare output packets to be
> + * transmitted on the output queue *queue_id* of the Ethernet device designated
> + * by its *port_id*.
> + * The *nb_pkts* parameter is the number of packets to be prepared which are
> + * supplied in the *tx_pkts* array of *rte_mbuf* structures, each of them
> + * allocated from a pool created with rte_pktmbuf_pool_create().
> + * For each packet to send, the rte_eth_tx_prep() function performs
> + * the following operations:
> + *
> + * - Check if the packet meets the device's requirements for tx offloads.
> + *
> + * - Check limitations on the number of segments.
> + *
> + * - Check additional requirements when debug is enabled.
> + *
> + * - Update and/or reset required checksums when tx offload is set for the packet.
> + *
> + * The rte_eth_tx_prep() function returns the number of packets ready to be
> + * sent. A return value equal to *nb_pkts* means that all packets are valid and
> + * ready to be sent.
> + *
> + * @param port_id
> + *   The port identifier of the Ethernet device.
> + * @param queue_id
> + *   The index of the transmit queue through which output packets must be
> + *   sent.
> + *   The value must be in the range [0, nb_tx_queue - 1] previously supplied
> + *   to rte_eth_dev_configure().
> + * @param tx_pkts
> + *   The address of an array of *nb_pkts* pointers to *rte_mbuf* structures
> + *   which contain the output packets.
> + * @param nb_pkts
> + *   The maximum number of packets to process.
> + * @return
> + *   The number of packets correct and ready to be sent. The return value can
> + *   be less than the value of the *nb_pkts* parameter when some packet doesn't
> + *   meet the device's requirements, with rte_errno set appropriately.
> + */
> +
> +#ifdef RTE_ETHDEV_TX_PREP

Sorry for being a bit late in that discussion, but what is the point of
having that config macro (RTE_ETHDEV_TX_PREP) at all?
As far as I can see, if a driver doesn't set up tx_pkt_prep, then nb_pkts
is returned anyway...
BTW, that raises another question of mine - should it behave that way?
Shouldn't we return 0 (and set rte_errno=ENOTSUP) here if
dev->tx_pkt_prep == NULL?
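I.e. something like that instead (just a sketch to illustrate the idea,
not tested):

static inline uint16_t
rte_eth_tx_prep(uint8_t port_id, uint16_t queue_id,
        struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
        struct rte_eth_dev *dev = &rte_eth_devices[port_id];

        /* Report "not supported" explicitly instead of silently
         * pretending that all packets are already valid. */
        if (dev->tx_pkt_prep == NULL) {
                rte_errno = ENOTSUP;
                return 0;
        }

        return (*dev->tx_pkt_prep)(dev->data->tx_queues[queue_id],
                        tx_pkts, nb_pkts);
}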
> +
> +static inline uint16_t
> +rte_eth_tx_prep(uint8_t port_id, uint16_t queue_id, struct rte_mbuf **tx_pkts,
> +		uint16_t nb_pkts)
> +{
> +	struct rte_eth_dev *dev = &rte_eth_devices[port_id];
> +
> +	if (!dev->tx_pkt_prep)
> +		return nb_pkts;
> +
> +#ifdef RTE_LIBRTE_ETHDEV_DEBUG
> +	if (queue_id >= dev->data->nb_tx_queues) {
> +		RTE_PMD_DEBUG_TRACE("Invalid TX queue_id=%d\n", queue_id);
> +		rte_errno = -EINVAL;
> +		return 0;
> +	}
> +#endif
> +
> +	return (*dev->tx_pkt_prep)(dev->data->tx_queues[queue_id],
> +			tx_pkts, nb_pkts);
> +}
> +
> +#else
> +
> +static inline uint16_t
> +rte_eth_tx_prep(uint8_t port_id __rte_unused, uint16_t queue_id __rte_unused,
> +		struct rte_mbuf **tx_pkts __rte_unused, uint16_t nb_pkts)
> +{
> +	return nb_pkts;
> +}
> +
> +#endif
> +
>  typedef void (*buffer_tx_error_fn)(struct rte_mbuf **unsent, uint16_t count,
>  		void *userdata);
>
> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index 7ea66ed..72fd352 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -211,6 +211,14 @@ extern "C" {
>   */
>  #define PKT_TX_OUTER_IPV4   (1ULL << 59)
>
> +#define PKT_TX_OFFLOAD_MASK (    \
> +		PKT_TX_IP_CKSUM |        \
> +		PKT_TX_L4_MASK |         \
> +		PKT_TX_OUTER_IP_CKSUM |  \
> +		PKT_TX_TCP_SEG |         \
> +		PKT_TX_QINQ_PKT |        \
> +		PKT_TX_VLAN_PKT)
> +
>  /**
>   * Packet outer header is IPv6. This flag must be set when using any
>   * outer offload feature (L4 checksum) to tell the NIC that the outer
> diff --git a/lib/librte_net/Makefile b/lib/librte_net/Makefile
> index ad2e482..b5abe84 100644
> --- a/lib/librte_net/Makefile
> +++ b/lib/librte_net/Makefile
> @@ -34,7 +34,7 @@ include $(RTE_SDK)/mk/rte.vars.mk
>  CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3
>
>  # install includes
> -SYMLINK-$(CONFIG_RTE_LIBRTE_NET)-include := rte_ip.h rte_tcp.h rte_udp.h rte_sctp.h rte_icmp.h rte_arp.h
> +SYMLINK-$(CONFIG_RTE_LIBRTE_NET)-include := rte_ip.h rte_tcp.h rte_udp.h rte_sctp.h rte_icmp.h rte_arp.h rte_pkt.h
>
>
>  include $(RTE_SDK)/mk/rte.install.mk
> diff --git a/lib/librte_net/rte_pkt.h b/lib/librte_net/rte_pkt.h
> new file mode 100644
> index 0000000..a3c3e3c
> --- /dev/null
> +++ b/lib/librte_net/rte_pkt.h
> @@ -0,0 +1,132 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2016 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _RTE_PKT_H_
> +#define _RTE_PKT_H_
> +
> +#include <rte_ip.h>
> +#include <rte_udp.h>
> +#include <rte_tcp.h>
> +#include <rte_sctp.h>
> +
> +/**
> + * Validate general requirements for tx offload in packet.
> + */
> +static inline int
> +rte_validate_tx_offload(struct rte_mbuf *m)
> +{
> +	uint64_t ol_flags = m->ol_flags;
> +
> +	/* Does packet set any of available offloads? */
> +	if (!(ol_flags & PKT_TX_OFFLOAD_MASK))
> +		return 0;
> +
> +	/* IP checksum can be counted only for IPv4 packet */
> +	if ((ol_flags & PKT_TX_IP_CKSUM) && (ol_flags & PKT_TX_IPV6))
> +		return -EINVAL;
> +
> +	if (ol_flags & (PKT_TX_L4_MASK | PKT_TX_TCP_SEG))
> +		/* IP type not set */
> +		if (!(ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6)))
> +			return -EINVAL;
> +
> +	if (ol_flags & PKT_TX_TCP_SEG) {
> +
> +		/* PKT_TX_IP_CKSUM offload not set for IPv4 TSO packet */
> +		if ((ol_flags & PKT_TX_IPV4) && !(ol_flags & PKT_TX_IP_CKSUM))
> +			return -EINVAL;
> +
> +		if (m->tso_segsz == 0)
> +			return -EINVAL;
> +
> +	}

I suppose these two checks on (ol_flags & PKT_TX_TCP_SEG) above could be
united into one.
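E.g. something like that inside rte_validate_tx_offload() (untested, just
to illustrate):

        if (ol_flags & (PKT_TX_L4_MASK | PKT_TX_TCP_SEG)) {
                /* IP type not set */
                if (!(ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6)))
                        return -EINVAL;

                /* TSO-specific requirements */
                if (ol_flags & PKT_TX_TCP_SEG) {
                        /* PKT_TX_IP_CKSUM offload not set for IPv4 TSO */
                        if ((ol_flags & PKT_TX_IPV4) &&
                                        !(ol_flags & PKT_TX_IP_CKSUM))
                                return -EINVAL;

                        if (m->tso_segsz == 0)
                                return -EINVAL;
                }
        }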
> +
> +	/* PKT_TX_OUTER_IP_CKSUM set for non outer IPv4 packet. */
> +	if ((ol_flags & PKT_TX_OUTER_IP_CKSUM) && !(ol_flags & PKT_TX_OUTER_IPV4))
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +/**
> + * Fix pseudo header checksum for TSO and non-TSO tcp/udp packets before
> + * hardware tx checksum.
> + * For non-TSO tcp/udp packets the full pseudo-header checksum is
> + * calculated and set.
> + * For TSO the IP payload length is not included.
> + */
> +static inline int
> +rte_phdr_cksum_fix(struct rte_mbuf *m)
> +{
> +	struct ipv4_hdr *ipv4_hdr;
> +	struct ipv6_hdr *ipv6_hdr;
> +	struct tcp_hdr *tcp_hdr;
> +	struct udp_hdr *udp_hdr;
> +
> +	if (m->ol_flags & PKT_TX_IPV4) {
> +		ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, m->l2_len);
> +
> +		if (m->ol_flags & PKT_TX_IP_CKSUM)
> +			ipv4_hdr->hdr_checksum = 0;
> +
> +		if ((m->ol_flags & PKT_TX_UDP_CKSUM) == PKT_TX_UDP_CKSUM) {
> +			/* non-TSO udp */
> +			udp_hdr = rte_pktmbuf_mtod_offset(m, struct udp_hdr *,
> +					m->l2_len + m->l3_len);
> +			udp_hdr->dgram_cksum = rte_ipv4_phdr_cksum(ipv4_hdr, m->ol_flags);
> +		} else if ((m->ol_flags & PKT_TX_TCP_CKSUM) ||
> +				(m->ol_flags & PKT_TX_TCP_SEG)) {
> +			/* non-TSO tcp or TSO */
> +			tcp_hdr = rte_pktmbuf_mtod_offset(m, struct tcp_hdr *,
> +					m->l2_len + m->l3_len);
> +			tcp_hdr->cksum = rte_ipv4_phdr_cksum(ipv4_hdr, m->ol_flags);
> +		}
> +	} else if (m->ol_flags & PKT_TX_IPV6) {
> +		ipv6_hdr = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr *, m->l2_len);
> +
> +		if ((m->ol_flags & PKT_TX_UDP_CKSUM) == PKT_TX_UDP_CKSUM) {
> +			/* non-TSO udp */
> +			udp_hdr = rte_pktmbuf_mtod_offset(m, struct udp_hdr *,
> +					m->l2_len + m->l3_len);
> +			udp_hdr->dgram_cksum = rte_ipv6_phdr_cksum(ipv6_hdr, m->ol_flags);
> +		} else if ((m->ol_flags & PKT_TX_TCP_CKSUM) ||
> +				(m->ol_flags & PKT_TX_TCP_SEG)) {
> +			/* non-TSO tcp or TSO */
> +			tcp_hdr = rte_pktmbuf_mtod_offset(m, struct tcp_hdr *,
> +					m->l2_len + m->l3_len);
> +			tcp_hdr->cksum = rte_ipv6_phdr_cksum(ipv6_hdr, m->ol_flags);
> +		}
> +	}
> +	return 0;
> +}

We probably need to take into account that in some cases outer_l*_len
could be set up here too (in case of tunneled packets) - see the rough
sketch below the quoted patch.

Konstantin

> +
> +#endif /* _RTE_PKT_H_ */
> --
> 1.7.9.5
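As for the outer_l*_len remark above, here is a rough sketch of what I
mean for rte_phdr_cksum_fix() (untested; it only recomputes where the
inner L3 header starts, using the existing mbuf outer_l2_len/outer_l3_len
fields):

        uint64_t inner_l3_offset = m->l2_len;

        /* For tunneled packets the inner IP header follows the outer
         * L2 and L3 headers, so m->l2_len alone is not enough. */
        if (m->ol_flags & PKT_TX_OUTER_IP_CKSUM)
                inner_l3_offset += m->outer_l2_len + m->outer_l3_len;

        if (m->ol_flags & PKT_TX_IPV4) {
                ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *,
                                inner_l3_offset);
                /* ... and the L4 header would then sit at
                 * inner_l3_offset + m->l3_len instead of
                 * m->l2_len + m->l3_len, same for the IPv6 branch. */
        }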