Introduce three new functions to support TCP/IPv4 GRO.
- rte_gro_tcp4_tbl_create: create a TCP/IPv4 hashing table;
- rte_gro_tcp4_reassemble: try to reassemble an incoming TCP/IPv4 packet
    with existing TCP/IPv4 packets;
- rte_gro_tcp4_cksum_update: update TCP and IPv4 checksums.

rte_gro_tcp4_reassemble uses a TCP/IPv4 hashing table to implement packet
reassembly. The TCP/IPv4 hashing table is a cuckoo hashing table, whose
keys are rules of merging TCP/IPv4 packets, and whose values point to
item-lists. The item-list contains items, which point to packets with
the same keys.

rte_gro_tcp4_reassemble processes an incoming packet in four
steps:
a. check if the packet should be processed. TCP/IPv4 GRO doesn't process
packets of the following types:
        - packets without data;
        - packets with wrong checksums;
        - fragmented packets.
b. look up the hashing table to find an item-list, which stores packets
that may be able to merge with the incoming one;
c. if the item-list is found, check all of its packets. If one of them
is the neighbor of the incoming packet, chain them together and update
the packet length and mbuf metadata; if none is, allocate a new item for
the incoming packet and insert it into the item-list;
d. if no item-list is found, allocate a new item-list for the incoming
packet and insert it into the hash table.

Signed-off-by: Jiayu Hu <jiayu...@intel.com>
---
 lib/librte_gro/Makefile         |   1 +
 lib/librte_gro/rte_gro.c        |  81 +++++++++++-
 lib/librte_gro/rte_gro_common.h |   4 +-
 lib/librte_gro/rte_gro_tcp.c    | 270 ++++++++++++++++++++++++++++++++++++++++
 lib/librte_gro/rte_gro_tcp.h    |  95 ++++++++++++++
 5 files changed, 448 insertions(+), 3 deletions(-)
 create mode 100644 lib/librte_gro/rte_gro_tcp.c
 create mode 100644 lib/librte_gro/rte_gro_tcp.h

diff --git a/lib/librte_gro/Makefile b/lib/librte_gro/Makefile
index fb3a36c..c45f4f2 100644
--- a/lib/librte_gro/Makefile
+++ b/lib/librte_gro/Makefile
@@ -43,6 +43,7 @@ LIBABIVER := 1
 
 #source files
 SRCS-$(CONFIG_RTE_LIBRTE_GRO) += rte_gro.c
+SRCS-$(CONFIG_RTE_LIBRTE_GRO) += rte_gro_tcp.c
 
 # install this header file
 SYMLINK-$(CONFIG_RTE_LIBRTE_GRO)-include += rte_gro.h
diff --git a/lib/librte_gro/rte_gro.c b/lib/librte_gro/rte_gro.c
index 9b1df53..e1ac062 100644
--- a/lib/librte_gro/rte_gro.c
+++ b/lib/librte_gro/rte_gro.c
@@ -6,9 +6,12 @@
 
 #include "rte_gro.h"
 #include "rte_gro_common.h"
+#include "rte_gro_tcp.h"
 
-gro_reassemble_fn reassemble_functions[GRO_TYPE_MAX_NB] = {NULL};
-gro_tbl_create_fn tbl_create_functions[GRO_TYPE_MAX_NB] = {NULL};
+gro_reassemble_fn reassemble_functions[GRO_TYPE_MAX_NB] = {
+       rte_gro_tcp4_reassemble, NULL}; /* slot 0 == GRO_TCP_IPV4 */
+gro_tbl_create_fn tbl_create_functions[GRO_TYPE_MAX_NB] = {
+       rte_gro_tcp4_tbl_create, NULL}; /* slot 0 == GRO_TCP_IPV4 */
 
 struct rte_gro_status *gro_status;
 
@@ -102,7 +105,81 @@ rte_gro_reassemble_burst(uint8_t port __rte_unused,
                printf("invalid parameters for GRO.\n");
                return 0;
        }
+       struct ether_hdr *eth_hdr;
+       struct ipv4_hdr *ipv4_hdr;
+       uint16_t l3proc_type;
+
+       /* record packet GRO info */
+       struct gro_info gro_infos[nb_pkts];
+       struct rte_gro_lkp_tbl *lkp_tbls = ((struct rte_gro_tbl *)
+                       gro_tbl)->lkp_tbls;
+       int32_t ret;
        uint16_t nb_after_gro = nb_pkts;
+       /* dirty_tbls[t] is set once any packet was recorded in table t */
+       uint8_t dirty_tbls[GRO_SUPPORT_TYPE_NB] = {0};
+
+       /**
+        * pre-allocate tcp items for TCP GRO; packet i owns the nb_pkts
+        * items starting at tcp_items[i * nb_pkts].
+        * NOTE(review): this stack array grows with nb_pkts squared;
+        * confirm that callers bound the burst size.
+        */
+       struct gro_tcp_item tcp_items[nb_pkts * nb_pkts];
+
+       for (uint16_t i = 0; i < nb_pkts; i++) {
+               gro_infos[i].nb_merged_packets = 1;     /* initial value */
+               eth_hdr = rte_pktmbuf_mtod(pkts[i], struct ether_hdr *);
+               l3proc_type = rte_be_to_cpu_16(eth_hdr->ether_type);
+               if (l3proc_type == ETHER_TYPE_IPv4) {
+                       ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+                       if (ipv4_hdr->next_proto_id == IPPROTO_TCP) {
+                               gro_infos[i].gro_type = GRO_TCP_IPV4;
+                               /* allocate an item-list for the packet */
+                               gro_infos[i].item_list.items =
+                                       &tcp_items[i * nb_pkts];
+                               gro_infos[i].item_list.nb_item = 1;
+                               /**
+                                * fill the packet information into the first
+                                * item of the item-list
+                                */
+                               tcp_items[i * nb_pkts].segment = pkts[i];
+                               tcp_items[i * nb_pkts].segment_idx = i;
+
+                               ret = rte_gro_tcp4_reassemble(
+                                               lkp_tbls[GRO_TCP_IPV4].hash_tbl,
+                                               &gro_infos[i].item_list);
+                               /**
+                                * ret >= 0 means the hash table may now refer
+                                * to the on-stack item-lists above. The flag
+                                * must be sticky: a later packet returning -1
+                                * must not clear it, or the table would not be
+                                * reset and would keep dangling pointers.
+                                */
+                               if (ret >= 0)
+                                       dirty_tbls[GRO_TCP_IPV4] = 1;
+                               if (ret > 0) {
+                                       /* ret is the target packet index + 1 */
+                                       gro_infos[i].nb_merged_packets = 0;
+                                       gro_infos[ret - 1].nb_merged_packets++;
+                                       nb_after_gro--;
+                               }
+                       }
+               }
+       }
+       /**
+        * if any packets have been merged, update their headers,
+        * and remove the addresses of the absorbed packets from the
+        * input packet array.
+        */
+       if (nb_after_gro < nb_pkts) {
+               struct rte_mbuf *tmp[nb_pkts];
+
+               memset(tmp, 0,
+                               sizeof(struct rte_mbuf *) * nb_pkts);
+               for (uint16_t i = 0, j = 0; i < nb_pkts; i++) {
+                       if (gro_infos[i].nb_merged_packets > 1) {
+                               switch (gro_infos[i].gro_type) {
+                               case GRO_TCP_IPV4:
+                                       rte_gro_tcp4_cksum_update(pkts[i]);
+                                       break;
+                               default:
+                                       break;
+                               }
+                       }
+                       /* absorbed packets (count 0) are dropped from pkts */
+                       if (gro_infos[i].nb_merged_packets != 0)
+                               tmp[j++] = pkts[i];
+               }
+               rte_memcpy(pkts, tmp,
+                               nb_pkts * sizeof(struct rte_mbuf *));
+       }
+
+       /* if GRO is performed, reset the hash table */
+       for (uint16_t i = 0; i < GRO_SUPPORT_TYPE_NB; i++)
+               if (dirty_tbls[i])
+                       rte_hash_reset(lkp_tbls[i].hash_tbl);
 
        return nb_after_gro;
 }
diff --git a/lib/librte_gro/rte_gro_common.h b/lib/librte_gro/rte_gro_common.h
index 611d833..7b5d9ec 100644
--- a/lib/librte_gro/rte_gro_common.h
+++ b/lib/librte_gro/rte_gro_common.h
@@ -12,7 +12,9 @@
 /**
  * current supported GRO types number
  */
-#define GRO_SUPPORT_TYPE_NB 0
+#define GRO_SUPPORT_TYPE_NB 1
+
+#define GRO_TCP_IPV4 0 /**< TCP/IPv4 GRO; index into per-type tables */
 
 /**
  * default element number of the hashing table
diff --git a/lib/librte_gro/rte_gro_tcp.c b/lib/librte_gro/rte_gro_tcp.c
new file mode 100644
index 0000000..f17d9f5
--- /dev/null
+++ b/lib/librte_gro/rte_gro_tcp.c
@@ -0,0 +1,270 @@
+#include "rte_gro_tcp.h"
+
+/**
+ * Create the hash table used for TCP/IPv4 GRO lookups.
+ * The new table (or NULL on failure) is stored in *hash_tbl; the
+ * function returns 0 on success and -1 if rte_hash_create() fails.
+ */
+int
+rte_gro_tcp4_tbl_create(char *name,
+               uint32_t nb_entries, uint16_t socket_id,
+               struct rte_hash **hash_tbl)
+{
+       /* the whole struct gro_tcp4_pre_rules is the hash key */
+       struct rte_hash_parameters params = {
+               .name = name,
+               .entries = nb_entries,
+               .key_len = sizeof(struct gro_tcp4_pre_rules),
+               .hash_func = rte_jhash,
+               .hash_func_init_val = 0,
+               .socket_id = socket_id,
+       };
+       struct rte_hash *tbl = rte_hash_create(&params);
+
+       *hash_tbl = tbl;
+       if (unlikely(tbl == NULL))
+               return -1;
+       return 0;
+}
+
+/**
+ * Recompute the TCP checksum and the IPv4 header checksum of a packet
+ * and write both back into its headers. Does nothing if pkt is NULL.
+ */
+void
+rte_gro_tcp4_cksum_update(struct rte_mbuf *pkt)
+{
+       struct ipv4_hdr *ip;
+       struct tcp_hdr *tcp;
+       uint32_t l4_off, l4_len, sum;
+       uint16_t ip_hl, raw_sum;
+
+       if (pkt == NULL)
+               return;
+
+       ip = (struct ipv4_hdr *)
+               (rte_pktmbuf_mtod(pkt, struct ether_hdr *) + 1);
+       ip_hl = IPv4_HDR_LEN(ip);
+       tcp = (struct tcp_hdr *)((char *)ip + ip_hl);
+
+       /* the TCP segment starts right after the Ethernet and IPv4 headers */
+       l4_off = sizeof(struct ether_hdr) + ip_hl;
+       l4_len = pkt->pkt_len - l4_off;
+
+       /* both checksum fields are cleared before anything is summed */
+       ip->hdr_checksum = 0;
+       tcp->cksum = 0;
+
+       /* raw sum over TCP header and payload, without the pseudo header */
+       if (rte_raw_cksum_mbuf(pkt, l4_off, l4_len, &raw_sum) < 0) {
+               printf("invalid param for raw_cksum_mbuf\n");
+               return;
+       }
+
+       /* add the IP pseudo header sum, fold to 16 bits and complement */
+       sum = (uint32_t)raw_sum + rte_ipv4_phdr_cksum(ip, 0);
+       sum = (sum >> 16) + (sum & 0xffff);
+       sum = (~sum) & 0xffff;
+       if (sum == 0)
+               sum = 0xffff;
+       tcp->cksum = sum;
+
+       /* the IPv4 header checksum is computed last */
+       ip->hdr_checksum = rte_ipv4_cksum(ip);
+}
+
+/**
+ * Try to merge the incoming packet into one item of the item-list.
+ *
+ * An item is a merge candidate when its next expected sequence number
+ * equals pkt_sent_seq and both packets have the same TCP header length
+ * and identical TCP options. On success the Ethernet/IPv4/TCP headers
+ * of the incoming packet are stripped and its mbuf is chained after the
+ * candidate; otherwise the packet is appended to the item-list as a new
+ * item.
+ *
+ * @param pkt
+ *  Incoming packet.
+ * @param pkt_idx
+ *  Index of the incoming packet; recorded if it becomes a new item.
+ * @param pkt_sent_seq
+ *  TCP sequence number of the incoming packet.
+ * @param list
+ *  Item-list to search; it must have room for one more item.
+ * @return
+ *  segment_idx + 1 of the item the packet was merged into, or 0 if the
+ *  packet was inserted into the item-list without being merged.
+ */
+static int32_t
+gro_tcp4_reassemble(struct rte_mbuf *pkt,
+               uint16_t pkt_idx,
+               uint32_t pkt_sent_seq,
+               struct gro_item_list *list)
+{
+       struct gro_tcp_item *items;
+       struct ipv4_hdr *ipv4_hdr1;
+       struct tcp_hdr *tcp_hdr1;
+       uint16_t ipv4_ihl1, tcp_hl1, tcp_dl1;
+
+       items = (struct gro_tcp_item *)list->items;
+       ipv4_hdr1 = (struct ipv4_hdr *)(rte_pktmbuf_mtod(pkt, struct
+                               ether_hdr *) + 1);
+       ipv4_ihl1 = IPv4_HDR_LEN(ipv4_hdr1);
+       tcp_hdr1 = (struct tcp_hdr *)((char *)ipv4_hdr1 + ipv4_ihl1);
+       tcp_hl1 = TCP_HDR_LEN(tcp_hdr1);
+       tcp_dl1 = rte_be_to_cpu_16(ipv4_hdr1->total_length) - ipv4_ihl1
+               - tcp_hl1;
+
+       for (uint16_t i = 0; i < list->nb_item; i++) {
+               struct ipv4_hdr *ipv4_hdr2;
+               struct tcp_hdr *tcp_hdr2;
+               uint16_t ipv4_ihl2, tcp_hl2;
+               struct rte_mbuf *tail;
+
+               /* check if the two packets are neighbor */
+               if (pkt_sent_seq != items[i].next_sent_seq)
+                       continue;
+
+               ipv4_hdr2 = (struct ipv4_hdr *)(rte_pktmbuf_mtod(
+                                       items[i].segment,
+                                       struct ether_hdr *) + 1);
+               ipv4_ihl2 = IPv4_HDR_LEN(ipv4_hdr2);
+               tcp_hdr2 = (struct tcp_hdr *)((char *)ipv4_hdr2 + ipv4_ihl2);
+               tcp_hl2 = TCP_HDR_LEN(tcp_hdr2);
+
+               /**
+                * check if the option fields equal. The header lengths
+                * are compared unconditionally so that a packet without
+                * options is never merged into one that carries options.
+                */
+               if (tcp_hl1 != tcp_hl2)
+                       continue;
+               if (tcp_hl1 > sizeof(struct tcp_hdr) &&
+                               memcmp(tcp_hdr1 + 1, tcp_hdr2 + 1,
+                                       tcp_hl1 - sizeof(struct tcp_hdr))
+                               != 0)
+                       continue;
+
+               /* check if the packet length will be beyond 64K */
+               if (items[i].segment->pkt_len + tcp_dl1 > UINT16_MAX)
+                       goto merge_fail;
+
+               /* remove the header of the incoming packet */
+               rte_pktmbuf_adj(pkt, sizeof(struct ether_hdr) +
+                               ipv4_ihl1 + tcp_hl1);
+               /* chain the two packet together */
+               tail = rte_pktmbuf_lastseg(items[i].segment);
+               tail->next = pkt;
+
+               /* update IP header for the merged packet */
+               ipv4_hdr2->total_length = rte_cpu_to_be_16(
+                               rte_be_to_cpu_16(
+                                       ipv4_hdr2->total_length)
+                               + tcp_dl1);
+
+               /* update the next expected sequence number */
+               items[i].next_sent_seq += tcp_dl1;
+
+               /**
+                * update mbuf metadata for the merged packet; the
+                * incoming packet may itself consist of several segments
+                */
+               items[i].segment->nb_segs += pkt->nb_segs;
+               items[i].segment->pkt_len += pkt->pkt_len;
+
+               return items[i].segment_idx + 1;
+       }
+
+merge_fail:
+       /* fail to merge. Insert the incoming packet into the item-list */
+       items[list->nb_item].next_sent_seq = pkt_sent_seq + tcp_dl1;
+       items[list->nb_item].segment = pkt;
+       items[list->nb_item].segment_idx = pkt_idx;
+       list->nb_item++;
+
+       return 0;
+}
+
+/**
+ * Try to merge one TCP/IPv4 packet with the packets already recorded
+ * in the lookup table.
+ * @param hash_tbl
+ *  TCP IPv4 lookup table
+ * @param item_list
+ *  Pre-allocated item-list, in which the first item stores the packet
+ *  to process. When no item-list with the same key exists, this one is
+ *  inserted into hash_tbl, so it must stay valid until hash_tbl is
+ *  reset.
+ * @return
+ *  If the incoming packet merges with one packet successfully, return
+ *  the index + 1 of the merged packet; if the incoming packet hasn't
+ *  been performed GRO, return -1; if the incoming packet is performed
+ *  GRO but fail to merge, return 0.
+ */
+int32_t
+rte_gro_tcp4_reassemble(struct rte_hash *hash_tbl,
+               struct gro_item_list *item_list)
+{
+       struct ether_hdr *eth_hdr;
+       struct ipv4_hdr *ipv4_hdr;
+       struct tcp_hdr *tcp_hdr;
+       uint16_t ipv4_ihl, tcp_hl, tcp_dl, ip_len;
+       uint16_t tcp_cksum, ip_cksum, calc_cksum;
+       struct gro_tcp4_pre_rules key = {0};
+       struct gro_item_list *list;
+       uint64_t ol_flags;
+       uint32_t sent_seq;
+       int32_t ret = -1;
+
+       /* get the packet to process */
+       struct gro_tcp_item *items = item_list->items;
+       struct rte_mbuf *pkt = items[0].segment;
+       uint16_t pkt_idx = items[0].segment_idx;
+
+       eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+       ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+       ipv4_ihl = IPv4_HDR_LEN(ipv4_hdr);
+
+       /* 1. check if the packet should be processed */
+       if (ipv4_ihl < sizeof(struct ipv4_hdr))
+               goto end;
+       if (ipv4_hdr->next_proto_id != IPPROTO_TCP)
+               goto end;
+       /**
+        * only packets with the DF bit set are processed.
+        * NOTE(review): this also skips unfragmented packets that merely
+        * allow fragmentation; confirm this is intended.
+        */
+       if ((ipv4_hdr->fragment_offset &
+                               rte_cpu_to_be_16(IPV4_HDR_DF_MASK))
+                       == 0)
+               goto end;
+
+       tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr + ipv4_ihl);
+       tcp_hl = TCP_HDR_LEN(tcp_hdr);
+       ip_len = rte_be_to_cpu_16(ipv4_hdr->total_length);
+       /**
+        * skip packets without data, and malformed packets whose header
+        * lengths would make the payload length underflow
+        */
+       if (tcp_hl < sizeof(struct tcp_hdr) ||
+                       ip_len <= ipv4_ihl + tcp_hl)
+               goto end;
+       tcp_dl = ip_len - ipv4_ihl - tcp_hl;
+
+       ol_flags = pkt->ol_flags;
+       /**
+        * 2. if HW rx checksum offload isn't enabled, recalculate the
+        * checksum in SW. Then, check if the checksum is correct.
+        * ol_flags carries many unrelated bits, so only the masked
+        * checksum field is compared. The SW path restores the original
+        * checksum field so a rejected packet is left unmodified.
+        */
+       if ((ol_flags & PKT_RX_IP_CKSUM_MASK) !=
+                       PKT_RX_IP_CKSUM_UNKNOWN) {
+               if ((ol_flags & PKT_RX_IP_CKSUM_MASK) ==
+                               PKT_RX_IP_CKSUM_BAD)
+                       goto end;
+       } else {
+               ip_cksum = ipv4_hdr->hdr_checksum;
+               ipv4_hdr->hdr_checksum = 0;
+               calc_cksum = rte_ipv4_cksum(ipv4_hdr);
+               ipv4_hdr->hdr_checksum = ip_cksum;
+               if (calc_cksum != ip_cksum)
+                       goto end;
+       }
+
+       if ((ol_flags & PKT_RX_L4_CKSUM_MASK) !=
+                       PKT_RX_L4_CKSUM_UNKNOWN) {
+               if ((ol_flags & PKT_RX_L4_CKSUM_MASK) ==
+                               PKT_RX_L4_CKSUM_BAD)
+                       goto end;
+       } else {
+               tcp_cksum = tcp_hdr->cksum;
+               tcp_hdr->cksum = 0;
+               calc_cksum = rte_ipv4_udptcp_cksum(ipv4_hdr, tcp_hdr);
+               tcp_hdr->cksum = tcp_cksum;
+               if (calc_cksum != tcp_cksum)
+                       goto end;
+       }
+
+       /* 3. search for the corresponding item-list for the packet */
+       key.eth_saddr = eth_hdr->s_addr;
+       key.eth_daddr = eth_hdr->d_addr;
+       key.ip_src_addr = rte_be_to_cpu_32(ipv4_hdr->src_addr);
+       key.ip_dst_addr = rte_be_to_cpu_32(ipv4_hdr->dst_addr);
+       key.src_port = rte_be_to_cpu_16(tcp_hdr->src_port);
+       key.dst_port = rte_be_to_cpu_16(tcp_hdr->dst_port);
+       key.recv_ack = rte_be_to_cpu_32(tcp_hdr->recv_ack);
+       key.tcp_flags = tcp_hdr->tcp_flags;
+
+       sent_seq = rte_be_to_cpu_32(tcp_hdr->sent_seq);
+
+       if (rte_hash_lookup_data(hash_tbl, &key, (void **)&list) >= 0) {
+               ret = gro_tcp4_reassemble(pkt, pkt_idx, sent_seq, list);
+       } else {
+               /**
+                * fail to find a item-list. Record the sequence number of the
+                * incoming packet's neighbor into its item_list, and insert it
+                * into the hash table.
+                */
+               items[0].next_sent_seq = sent_seq + tcp_dl;
+               if (unlikely(rte_hash_add_key_data(hash_tbl, &key, item_list)
+                                       != 0))
+                       printf("GRO TCP hash insert fail.\n");
+               else
+                       ret = 0;
+       }
+end:
+       return ret;
+}
diff --git a/lib/librte_gro/rte_gro_tcp.h b/lib/librte_gro/rte_gro_tcp.h
new file mode 100644
index 0000000..52be9cd
--- /dev/null
+++ b/lib/librte_gro/rte_gro_tcp.h
@@ -0,0 +1,95 @@
+#ifndef _RTE_GRO_TCP_H_
+#define _RTE_GRO_TCP_H_
+
+#include <rte_ethdev.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+#include <rte_hash.h>
+#include <rte_jhash.h>
+
+#include "rte_gro_common.h"
+
+/**
+ * TCP and IPv4 header lengths in bytes. Both length nibbles live in
+ * single-byte fields (the IHL is the low nibble of version_ihl, the
+ * data offset is the high nibble of data_off), so host byte order does
+ * not change how they are extracted.
+ */
+#define TCP_HDR_LEN(tcph) \
+       ((((tcph)->data_off) >> 4) * 4)
+#define IPv4_HDR_LEN(iph) \
+       ((((iph)->version_ihl) & 0x0f) * 4)
+
+/* position of the DF (don't fragment) bit in fragment_offset */
+#define IPV4_HDR_DF_SHIFT 14
+#define IPV4_HDR_DF_MASK (1 << IPV4_HDR_DF_SHIFT)
+
+
+/**
+ * key structure of TCP ipv4 hash table. It describes the prerequisite
+ * rules of merging packets: two packets can only be merged if all of
+ * these fields are equal. rte_gro_tcp4_reassemble fills the IP
+ * addresses, ports and recv_ack in host byte order.
+ */
+struct gro_tcp4_pre_rules {
+       struct ether_addr eth_saddr;    /**< source MAC address. */
+       struct ether_addr eth_daddr;    /**< destination MAC address. */
+       uint32_t ip_src_addr;   /**< IPv4 source address. */
+       uint32_t ip_dst_addr;   /**< IPv4 destination address. */
+
+       uint32_t recv_ack;      /**< acknowledgment sequence number. */
+       uint16_t src_port;      /**< TCP source port. */
+       uint16_t dst_port;      /**< TCP destination port. */
+       uint8_t tcp_flags;      /**< TCP flags. */
+
+       uint8_t padding[3];     /**< explicit padding; part of the hashed key. */
+};
+
+/**
+ * TCP item structure: one packet tracked in an item-list, together
+ * with the sequence number a following packet must have to be merged
+ * into it.
+ */
+struct gro_tcp_item {
+       struct rte_mbuf *segment;       /**< packet address. */
+       uint32_t next_sent_seq; /**< sequence number of the next packet. */
+       uint16_t segment_idx;   /**< packet index in the input burst. */
+};
+
+void
+rte_gro_tcp4_cksum_update(struct rte_mbuf *pkt); /**< recompute the TCP and
+                                                  *   IPv4 header checksums
+                                                  *   of pkt; does nothing
+                                                  *   if pkt is NULL.
+                                                  */
+
+/**
+ * Create a new TCP ipv4 GRO lookup table.
+ *
+ * @param name
+ *  Lookup table name
+ * @param nb_entries
+ *  Lookup table elements number, whose value should be larger than or
+ *  equal to RTE_GRO_HASH_ENTRIES_MIN, and less than or equal to
+ *  RTE_GRO_HASH_ENTRIES_MAX, and should be power of two.
+ * @param socket_id
+ *  socket id
+ * @param hash_tbl
+ *  Output parameter: receives the address of the created lookup table,
+ *  or NULL if the creation failed.
+ * @return
+ *  0 on success, -1 if the lookup table could not be created.
+ */
+int
+rte_gro_tcp4_tbl_create(char *name, uint32_t nb_entries,
+               uint16_t socket_id, struct rte_hash **hash_tbl);
+/**
+ * This function tries to merge one TCP IPv4 packet with the packets
+ * already recorded in the lookup table. Packets that are not eligible
+ * for TCP IPv4 GRO are left unprocessed.
+ *
+ * @param hash_tbl
+ *     Lookup table used to reassemble packets. It stores key-value pairs.
+ *     The key describes the prerequisite rules to merge two TCP IPv4 packets;
+ *     the value is a pointer pointing to a item-list, which contains
+ *     packets that have the same prerequisite TCP IPv4 rules. Note that
+ *     applications need to guarantee the hash_tbl is clean when first call
+ *     this function.
+ * @param item_list
+ *     Pre-allocated item-list whose first item holds the packet to
+ *     process. If no item-list with the same key exists yet, this one is
+ *     inserted into hash_tbl, so it must stay valid until hash_tbl is
+ *     reset.
+ * @return
+ *     The index + 1 of the packet that the incoming packet was merged
+ *     into; 0 if the packet was recorded but not merged; -1 if the packet
+ *     was not processed or could not be inserted into hash_tbl.
+ */
+int32_t
+rte_gro_tcp4_reassemble(struct rte_hash *hash_tbl,
+               struct gro_item_list *item_list);
+
+#endif
-- 
2.7.4

Reply via email to