Hi Yi, Some comments are inline.
In addition, have you tested UDP GRO function and measure performance? Thanks, Jiayu > -----Original Message----- > From: yang_y...@163.com <yang_y...@163.com> > Sent: Wednesday, September 2, 2020 5:27 PM > To: dev@dpdk.org > Cc: Hu, Jiayu <jiayu...@intel.com>; tho...@monjalon.net; > yangy...@inspur.com; yang_y...@163.com > Subject: [PATCH V3 1/2] gro: add UDP GRO support > > From: Yi Yang <yangy...@inspur.com> > > UDP GRO can help improve VM-to-VM UDP performance when > VM is enabled UFO or GSO, GRO must be supported if GSO > or UFO is enabled, otherwise, performance gain will be > hurt. > > With this enabled in DPDK, OVS DPDK can leverage it > to improve VM-to-VM UDP performance, this will make > sure IP fragments will be reassembled once it is > received from physical NIC. It is very helpful in OVS > DPDK VLAN TSO case. > > Signed-off-by: Yi Yang <yangy...@inspur.com> > --- > > # install this header file > diff --git a/lib/librte_gro/gro_udp4.c b/lib/librte_gro/gro_udp4.c > new file mode 100644 > index 0000000..d6beece > --- /dev/null > +++ b/lib/librte_gro/gro_udp4.c > +static inline void > +update_header(struct gro_udp4_item *item) > +{ > + struct rte_ipv4_hdr *ipv4_hdr; > + struct rte_mbuf *pkt = item->firstseg; > + uint16_t frag_offset; > + > + ipv4_hdr = (struct rte_ipv4_hdr *)(rte_pktmbuf_mtod(pkt, char *) + > + pkt->l2_len); > + ipv4_hdr->total_length = rte_cpu_to_be_16(pkt->pkt_len - > + pkt->l2_len); > + > + /* Clear MF bit if it is last fragment */ > + if (item->is_last_frag) { > + frag_offset = rte_be_to_cpu_16(ipv4_hdr->fragment_offset); > + ipv4_hdr->fragment_offset = > + rte_cpu_to_be_16(frag_offset & > ~RTE_IPV4_HDR_MF_FLAG); > + } I think we need to clear MF bit and offset both, since either MF bit or offset is non-zero indicates that the packet is an IP fragment. Once the packet is reassembled successfully, the two fields should be zero. You can reference IP fragment library in DPDK. > +} > + > +int32_t > +gro_udp4_reassemble(struct rte_mbuf *pkt, > + struct gro_udp4_tbl *tbl, > + uint64_t start_time) > +{ > + struct rte_ether_hdr *eth_hdr; > + struct rte_ipv4_hdr *ipv4_hdr; > + uint16_t ip_dl; > + uint16_t ip_id, hdr_len; > + uint16_t frag_offset = 0; > + uint8_t is_last_frag; > + > + struct udp4_flow_key key; > + uint32_t cur_idx, prev_idx, item_idx; > + uint32_t i, max_flow_num, remaining_flow_num; > + int cmp; > + uint8_t find; > + > + eth_hdr = rte_pktmbuf_mtod(pkt, struct rte_ether_hdr *); > + ipv4_hdr = (struct rte_ipv4_hdr *)((char *)eth_hdr + pkt->l2_len); > + hdr_len = pkt->l2_len + pkt->l3_len + pkt->l4_len; > + > + /* > + * Don't process non-fragment packet. > + */ > + if (!is_ipv4_fragment(ipv4_hdr)) > + return -1; > + > + /* > + * Don't process the packet whose payload length is less than or > + * equal to 0. > + */ > + if (pkt->pkt_len - hdr_len <= 0) > + return -1; Input packets are IP fragments, so the header length shouldn't include l4_len. > + > + ip_dl = rte_be_to_cpu_16(ipv4_hdr->total_length) - pkt->l3_len; > + ip_id = rte_be_to_cpu_16(ipv4_hdr->packet_id); > + frag_offset = rte_be_to_cpu_16(ipv4_hdr->fragment_offset); > + is_last_frag = ((frag_offset & RTE_IPV4_HDR_MF_FLAG) == 0) ? 1 : 0; > + frag_offset = (uint16_t)(frag_offset & RTE_IPV4_HDR_OFFSET_MASK) > << 3; > + > + return 0; > +} > + > + > +uint16_t > +gro_udp4_tbl_timeout_flush(struct gro_udp4_tbl *tbl, > + uint64_t flush_timestamp, > + struct rte_mbuf **out, > + uint16_t nb_out) > +{ > + uint16_t k = 0; > + uint32_t i, j; > + uint32_t max_flow_num = tbl->max_flow_num; > + > + for (i = 0; i < max_flow_num; i++) { > + if (unlikely(tbl->flow_num == 0)) > + return k; > + > + j = tbl->flows[i].start_index; > + while (j != INVALID_ARRAY_INDEX) { > + if (tbl->items[j].start_time <= flush_timestamp) { > + gro_udp4_merge_items(tbl, j); Why need to merge packets again when flush the table? > + out[k++] = tbl->items[j].firstseg; > + if (tbl->items[j].nb_merged > 1) > + update_header(&(tbl->items[j])); > + /* > + * Delete the packet and get the next > + * packet in the flow. > + */ > + j = delete_item(tbl, j, INVALID_ARRAY_INDEX); > + tbl->flows[i].start_index = j; > + if (j == INVALID_ARRAY_INDEX) > + tbl->flow_num--; > + > + if (unlikely(k == nb_out)) > + return k; > + } else > + /* > + * The left packets in this flow won't be > + * timeout. Go to check other flows. > + */ > + break; > + } > + } > + return k; > +} > + > diff --git a/lib/librte_gro/gro_udp4.h b/lib/librte_gro/gro_udp4.h > new file mode 100644 > index 0000000..e1002c6 > --- /dev/null > +++ b/lib/librte_gro/gro_udp4.h > @@ -0,0 +1,294 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(c) 2020 Inspur Corporation > + */ > + > +#ifndef _GRO_UDP4_H_ > +#define _GRO_UDP4_H_ > + > +#include <rte_ip.h> > +#include <rte_udp.h> > +#include <rte_vxlan.h> rte_vxlan.h is used in VxLAN/UDP GRO. We don't need it in this patch. > + > +struct gro_udp4_item { > + /* > + * The first MBUF segment of the packet. If the value > + * is NULL, it means the item is empty. > + */ > + struct rte_mbuf *firstseg; > + /* The last MBUF segment of the packet */ > + struct rte_mbuf *lastseg; > + /* > + * The time when the first packet is inserted into the table. > + * This value won't be updated, even if the packet is merged > + * with other packets. > + */ > + uint64_t start_time; > + /* > + * next_pkt_idx is used to chain the packets that > + * are in the same flow but can't be merged together > + * (e.g. caused by packet reordering). > + */ > + uint32_t next_pkt_idx; > + /* offset of IP fragment packet */ > + uint16_t frag_offset; > + /* is last IP fragment? */ > + uint8_t is_last_frag; > + /* IPv4 ID of the packet */ > + uint16_t ip_id; Fragments of a UDP packet have the same IP ID. We only need to match it in udp4_flow_key structure, and gro_udp4_item doesn't need it. > + /* the number of merged packets */ > + uint16_t nb_merged; > + /* Indicate if IPv4 ID can be ignored */ > + uint8_t is_atomic; Is is_atomic used? > +}; > + > + > +/* > + * Check if two UDP/IPv4 packets belong to the same flow. > + */ > +static inline int > +is_same_udp4_flow(struct udp4_flow_key k1, struct udp4_flow_key k2) > +{ > + return (rte_is_same_ether_addr(&k1.eth_saddr, &k2.eth_saddr) && > + rte_is_same_ether_addr(&k1.eth_daddr, > &k2.eth_daddr) && > + (k1.ip_src_addr == k2.ip_src_addr) && > + (k1.ip_dst_addr == k2.ip_dst_addr) && > + (k1.ip_id == k2.ip_id)); > +} > + > +/* > + * Merge two UDP/IPv4 packets without updating checksums. > + * If cmp is larger than 0, append the new packet to the > + * original packet. Otherwise, pre-pend the new packet to > + * the original packet. > + */ > +static inline int > +merge_two_udp4_packets(struct gro_udp4_item *item, > + struct rte_mbuf *pkt, > + int cmp, > + uint16_t frag_offset, > + uint8_t is_last_frag, > + uint16_t ip_id, > + uint16_t l2_offset) > +{ > + struct rte_mbuf *pkt_head, *pkt_tail, *lastseg; > + uint16_t hdr_len, l2_len; > + uint32_t ip_len; > + > + if (cmp > 0) { > + pkt_head = item->firstseg; > + pkt_tail = pkt; > + } else { > + pkt_head = pkt; > + pkt_tail = item->firstseg; > + } > + > + /* check if the IPv4 packet length is greater than the max value */ > + hdr_len = l2_offset + pkt_head->l2_len + pkt_head->l3_len; > + l2_len = l2_offset > 0 ? pkt_head->outer_l2_len : pkt_head->l2_len; > + ip_len = pkt_head->pkt_len - l2_len > + + pkt_tail->pkt_len - hdr_len; > + if (unlikely(ip_len > MAX_IPV4_PKT_LENGTH)) > + return 0; > + > + /* remove the packet header for the tail packet */ > + rte_pktmbuf_adj(pkt_tail, hdr_len); > + > + /* chain two packets together */ > + if (cmp > 0) { > + item->lastseg->next = pkt; > + item->lastseg = rte_pktmbuf_lastseg(pkt); > + /* update IP ID to the larger value */ > + item->ip_id = ip_id; IP ID is the same for all fragments of a packet. I don't think we need to update it. > + } else { > + lastseg = rte_pktmbuf_lastseg(pkt); > + lastseg->next = item->firstseg; > + item->firstseg = pkt; > + /* update sent_seq to the smaller value */ > + item->frag_offset = frag_offset; > + item->ip_id = ip_id; > + } > + item->nb_merged++; > + if (is_last_frag) > + item->is_last_frag = is_last_frag; > + > + /* update MBUF metadata for the merged packet */ > + pkt_head->nb_segs += pkt_tail->nb_segs; > + pkt_head->pkt_len += pkt_tail->pkt_len; > + > + return 1; > +} > + > +/* > + * Check if two UDP/IPv4 packets are neighbors. > + */ > +static inline int > +udp_check_neighbor(struct gro_udp4_item *item, > + uint16_t frag_offset, > + uint16_t ip_id, > + uint16_t ip_dl, > + uint16_t l2_offset) > +{ > + struct rte_mbuf *pkt_orig = item->firstseg; > + uint16_t len; > + > + /* check if the two packets are neighbors */ > + len = pkt_orig->pkt_len - l2_offset - pkt_orig->l2_len - > + pkt_orig->l3_len; > + if ((frag_offset == item->frag_offset + len) && > + (ip_id == item->ip_id)) > + /* append the new packet */ > + return 1; > + else if ((frag_offset + ip_dl == item->frag_offset) && > + (ip_id == item->ip_id)) Is_same_udp4_flow() checks ip_id. No need to check again. > + /* pre-pend the new packet */ > + return -1; > + > + return 0; > +} > + > +static inline int > +is_ipv4_fragment(const struct rte_ipv4_hdr *hdr) > +{ > + uint16_t flag_offset, ip_flag, ip_ofs; > + > + flag_offset = rte_be_to_cpu_16(hdr->fragment_offset); > + ip_ofs = (uint16_t)(flag_offset & RTE_IPV4_HDR_OFFSET_MASK); > + ip_flag = (uint16_t)(flag_offset & RTE_IPV4_HDR_MF_FLAG); > + > + return ip_flag != 0 || ip_ofs != 0; If DF bit is set, the packet is not fragmented, which shouldn't be processed by UDP GRO. So we also need to make sure that DF bit is not set. > +} > +#endif > diff --git a/lib/librte_gro/meson.build b/lib/librte_gro/meson.build > index 501668c..0d18dc2 100644 > --- a/lib/librte_gro/rte_gro.c > +++ b/lib/librte_gro/rte_gro.c > @@ -9,6 +9,7 @@ > > #include "rte_gro.h" > #include "gro_tcp4.h" > +#include "gro_udp4.h" > #include "gro_vxlan_tcp4.h" > > typedef void *(*gro_tbl_create_fn)(uint16_t socket_id, > @@ -18,17 +19,23 @@ > typedef uint32_t (*gro_tbl_pkt_count_fn)(void *tbl); > > static gro_tbl_create_fn tbl_create_fn[RTE_GRO_TYPE_MAX_NUM] = { > - gro_tcp4_tbl_create, gro_vxlan_tcp4_tbl_create, NULL}; > + gro_tcp4_tbl_create, gro_vxlan_tcp4_tbl_create, > + gro_udp4_tbl_create, NULL}; > static gro_tbl_destroy_fn tbl_destroy_fn[RTE_GRO_TYPE_MAX_NUM] = { > gro_tcp4_tbl_destroy, gro_vxlan_tcp4_tbl_destroy, > + gro_udp4_tbl_destroy, > NULL}; > static gro_tbl_pkt_count_fn tbl_pkt_count_fn[RTE_GRO_TYPE_MAX_NUM] = > { > gro_tcp4_tbl_pkt_count, > gro_vxlan_tcp4_tbl_pkt_count, > + gro_udp4_tbl_pkt_count, > NULL}; > > #define IS_IPV4_TCP_PKT(ptype) (RTE_ETH_IS_IPV4_HDR(ptype) && \ > ((ptype & RTE_PTYPE_L4_TCP) == RTE_PTYPE_L4_TCP)) > > +#define IS_IPV4_UDP_PKT(ptype) (RTE_ETH_IS_IPV4_HDR(ptype) && \ > + ((ptype & RTE_PTYPE_L4_UDP) == RTE_PTYPE_L4_UDP)) > + > #define IS_IPV4_VXLAN_TCP4_PKT(ptype) (RTE_ETH_IS_IPV4_HDR(ptype) > && \ > ((ptype & RTE_PTYPE_L4_UDP) == RTE_PTYPE_L4_UDP) && \ > ((ptype & RTE_PTYPE_TUNNEL_VXLAN) == \ > @@ -40,6 +47,7 @@ > RTE_PTYPE_INNER_L3_IPV4_EXT | \ > RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN)) != 0)) > > + > /* > * GRO context structure. It keeps the table structures, which are > * used to merge packets, for different GRO types. Before using > @@ -123,20 +131,26 @@ struct gro_ctx { > struct gro_tcp4_flow tcp_flows[RTE_GRO_MAX_BURST_ITEM_NUM]; > struct gro_tcp4_item tcp_items[RTE_GRO_MAX_BURST_ITEM_NUM] > = {{0} }; > > - /* Allocate a reassembly table for VXLAN GRO */ > - struct gro_vxlan_tcp4_tbl vxlan_tbl; > - struct gro_vxlan_tcp4_flow > vxlan_flows[RTE_GRO_MAX_BURST_ITEM_NUM]; > - struct gro_vxlan_tcp4_item > vxlan_items[RTE_GRO_MAX_BURST_ITEM_NUM] = { > - {{0}, 0, 0} }; > + /* allocate a reassembly table for UDP/IPv4 GRO */ > + struct gro_udp4_tbl udp_tbl; > + struct gro_udp4_flow > udp_flows[RTE_GRO_MAX_BURST_ITEM_NUM]; > + struct gro_udp4_item udp_items[RTE_GRO_MAX_BURST_ITEM_NUM] > = {{0} }; > + > + /* Allocate a reassembly table for VXLAN TCP GRO */ > + struct gro_vxlan_tcp4_tbl vxlan_tcp_tbl; > + struct gro_vxlan_tcp4_flow > vxlan_tcp_flows[RTE_GRO_MAX_BURST_ITEM_NUM]; > + struct gro_vxlan_tcp4_item > vxlan_tcp_items[RTE_GRO_MAX_BURST_ITEM_NUM] > + = {{{0}, 0, 0} }; Renaming vxlan_items/_flows should be in the second patch, as this patch just supports UDP GRO. > > struct rte_mbuf *unprocess_pkts[nb_pkts]; > uint32_t item_num; > int32_t ret; > uint16_t i, unprocess_num = 0, nb_after_gro = nb_pkts; > - uint8_t do_tcp4_gro = 0, do_vxlan_gro = 0; > + uint8_t do_tcp4_gro = 0, do_vxlan_tcp_gro = 0, do_udp4_gro = 0; Renaming do_vxlan_gro should be in the second patch. > > /* Get the maximum number of packets */ > @@ -146,15 +160,15 @@ struct gro_ctx { > > if (param->gro_types & RTE_GRO_TCP_IPV4) { > @@ -170,14 +184,29 @@ struct gro_ctx { > do_tcp4_gro = 1; > } > + > > return nb_after_gro; > @@ -224,29 +269,33 @@ struct gro_ctx { > { > struct rte_mbuf *unprocess_pkts[nb_pkts]; > struct gro_ctx *gro_ctx = ctx; > - void *tcp_tbl, *vxlan_tbl; > + void *tcp_tbl, *udp_tbl, *vxlan_tcp_tbl; > uint64_t current_time; > uint16_t i, unprocess_num = 0; > - uint8_t do_tcp4_gro, do_vxlan_gro; > + uint8_t do_tcp4_gro, do_vxlan_tcp_gro, do_udp4_gro; > > if (unlikely((gro_ctx->gro_types & (RTE_GRO_IPV4_VXLAN_TCP_IPV4 > | > - RTE_GRO_TCP_IPV4)) == 0)) > + RTE_GRO_TCP_IPV4 | > + RTE_GRO_UDP_IPV4)) == 0)) > return nb_pkts; > > tcp_tbl = gro_ctx->tbls[RTE_GRO_TCP_IPV4_INDEX]; > - vxlan_tbl = gro_ctx->tbls[RTE_GRO_IPV4_VXLAN_TCP_IPV4_INDEX]; > + vxlan_tcp_tbl = gro_ctx- > >tbls[RTE_GRO_IPV4_VXLAN_TCP_IPV4_INDEX]; > + udp_tbl = gro_ctx->tbls[RTE_GRO_UDP_IPV4_INDEX]; > > do_tcp4_gro = (gro_ctx->gro_types & RTE_GRO_TCP_IPV4) == > RTE_GRO_TCP_IPV4; > - do_vxlan_gro = (gro_ctx->gro_types & > RTE_GRO_IPV4_VXLAN_TCP_IPV4) == > + do_vxlan_tcp_gro = (gro_ctx->gro_types & > RTE_GRO_IPV4_VXLAN_TCP_IPV4) == > RTE_GRO_IPV4_VXLAN_TCP_IPV4; > + do_udp4_gro = (gro_ctx->gro_types & RTE_GRO_UDP_IPV4) == > + RTE_GRO_UDP_IPV4; > > current_time = rte_rdtsc(); > > for (i = 0; i < nb_pkts; i++) { > if (IS_IPV4_VXLAN_TCP4_PKT(pkts[i]->packet_type) && > - do_vxlan_gro) { > - if (gro_vxlan_tcp4_reassemble(pkts[i], vxlan_tbl, > + do_vxlan_tcp_gro) { > + if (gro_vxlan_tcp4_reassemble(pkts[i], vxlan_tcp_tbl, > current_time) < 0) > unprocess_pkts[unprocess_num++] = pkts[i]; > } else if (IS_IPV4_TCP_PKT(pkts[i]->packet_type) && > @@ -254,6 +303,11 @@ struct gro_ctx { > if (gro_tcp4_reassemble(pkts[i], tcp_tbl, > current_time) < 0) > unprocess_pkts[unprocess_num++] = pkts[i]; > + } else if (IS_IPV4_UDP_PKT(pkts[i]->packet_type) && > + do_udp4_gro) { > + if (gro_udp4_reassemble(pkts[i], udp_tbl, > + current_time) < 0) > + unprocess_pkts[unprocess_num++] = pkts[i]; > } else > unprocess_pkts[unprocess_num++] = pkts[i]; > } > @@ -275,6 +329,7 @@ struct gro_ctx { > struct gro_ctx *gro_ctx = ctx; > uint64_t flush_timestamp; > uint16_t num = 0; > + uint16_t left_nb_out = max_nb_out; > > gro_types = gro_types & gro_ctx->gro_types; > flush_timestamp = rte_rdtsc() - timeout_cycles; > @@ -282,8 +337,8 @@ struct gro_ctx { > if (gro_types & RTE_GRO_IPV4_VXLAN_TCP_IPV4) { > num = gro_vxlan_tcp4_tbl_timeout_flush(gro_ctx->tbls[ > RTE_GRO_IPV4_VXLAN_TCP_IPV4_INDEX], > - flush_timestamp, out, max_nb_out); > - max_nb_out -= num; > + flush_timestamp, out, left_nb_out); > + left_nb_out = max_nb_out - num; > } > > /* If no available space in 'out', stop flushing. */ > @@ -291,7 +346,17 @@ struct gro_ctx { > num += gro_tcp4_tbl_timeout_flush( > gro_ctx->tbls[RTE_GRO_TCP_IPV4_INDEX], > flush_timestamp, > - &out[num], max_nb_out); > + &out[num], left_nb_out); > + left_nb_out = max_nb_out - num; > + } > + > + /* If no available space in 'out', stop flushing. */ > + if ((gro_types & RTE_GRO_UDP_IPV4) && max_nb_out > 0) { > + num += gro_udp4_tbl_timeout_flush( > + gro_ctx->tbls[RTE_GRO_UDP_IPV4_INDEX], > + flush_timestamp, > + &out[num], left_nb_out); > + left_nb_out = max_nb_out - num; Don't need to update left_nb_out here. > } > > return num; > diff --git a/lib/librte_gro/rte_gro.h b/lib/librte_gro/rte_gro.h > index 8d781b5..470f3ed 100644 > --- a/lib/librte_gro/rte_gro.h > +++ b/lib/librte_gro/rte_gro.h > @@ -31,7 +31,10 @@ > /**< TCP/IPv4 GRO flag */ > #define RTE_GRO_IPV4_VXLAN_TCP_IPV4_INDEX 1 > #define RTE_GRO_IPV4_VXLAN_TCP_IPV4 (1ULL << > RTE_GRO_IPV4_VXLAN_TCP_IPV4_INDEX) > -/**< VxLAN GRO flag. */ > +/**< VxLAN TCP/IPv4 GRO flag. */ > +#define RTE_GRO_UDP_IPV4_INDEX 2 > +#define RTE_GRO_UDP_IPV4 (1ULL << RTE_GRO_UDP_IPV4_INDEX) > +/**< UDP/IPv4 GRO flag */ > > /** > * Structure used to create GRO context objects or used to pass > -- > 1.8.3.1