Large Receive Offload (tcp) Signed-off-by: Jan-Bernd Themann <[EMAIL PROTECTED]> --- include/linux/inet_lro.h | 107 ++++++++++++++++ net/ipv4/inet_lro.c | 311 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 418 insertions(+), 0 deletions(-) create mode 100644 include/linux/inet_lro.h create mode 100644 net/ipv4/inet_lro.c
diff --git a/include/linux/inet_lro.h b/include/linux/inet_lro.h new file mode 100644 index 0000000..6df444a --- /dev/null +++ b/include/linux/inet_lro.h @@ -0,0 +1,107 @@ +/* + * linux/include/linux/inet_lro.h + * + * Large Receive Offload (ipv4 / tcp) + * + * (C) Copyright IBM Corp. 2007 + * + * Authors: + * Jan-Bernd Themann <[EMAIL PROTECTED]> + * Christoph Raisch <[EMAIL PROTECTED]> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef __INET_LRO_H_ +#define __INET_LRO_H_ + +#include <net/ip.h> +#include <net/tcp.h> + +/* + * LRO descriptor for a tcp session + */ +struct net_lro_desc { + struct sk_buff *parent; + struct sk_buff *last_skb; + struct iphdr *iph; + struct tcphdr *tcph; + struct vlan_group *vgrp; + u32 tcp_rcv_tsecr; + u32 tcp_rcv_tsval; + u32 tcp_ack; + u32 tcp_next_seq; + u32 skb_tot_frags_len; + u16 ip_tot_len; + u16 tcp_saw_tstamp; /* timestamps enabled */ + u16 tcp_window; + u16 vlan_tag; + int skb_sg_cnt; /* counts aggregated skbs */ + int vlan_packet; + int active; +}; + +/* + * Large Receive Offload (LRO) Manager + * + * Fields must be set by driver + */ + +struct net_lro_mgr { + int max_desc; /* Max number of LRO descriptors */ + int max_aggr; /* Max number of LRO packets to be aggregated */ + + struct net_lro_desc *lro_arr; /* Array of LRO descriptors */ + + /* + * Optimized driver functions + * + * get_tcp_ip_hdr: returns tcp and ip header for packet in SKB + */ + int (*get_ip_tcp_hdr)(struct sk_buff *skb, struct iphdr **iphdr, + struct tcphdr **tcph, void *priv); +}; + +/* + * Processes a SKB + * + * @lro_mgr: LRO manager to use + * @skb: SKB to aggregate + * @priv: Private data that may be used by driver functions + * (for example get_tcp_ip_hdr) + */ + +void lro_receive_skb(struct net_lro_mgr *lro_mgr, + struct sk_buff *skb, + void *priv); + +/* + * Processes a SKB with VLAN HW acceleration support + */ + +void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr, + struct sk_buff *skb, + struct vlan_group *vgrp, + u16 vlan_tag, + void *priv); + +/* + * Forward all aggregated SKBs held by lro_mgr to network stack + */ + +void lro_flush_all(struct net_lro_mgr *lro_mgr); + +#endif diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c new file mode 100644 index 0000000..2b9d871 --- /dev/null +++ b/net/ipv4/inet_lro.c @@ -0,0 +1,311 @@ +/* + * linux/net/ipv4/inet_lro.c + * + * Large Receive Offload (ipv4 / tcp) + * + * (C) Copyright IBM Corp. 2007 + * + * Authors: + * Jan-Bernd Themann <[EMAIL PROTECTED]> + * Christoph Raisch <[EMAIL PROTECTED]> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + + +#include <linux/module.h> +#include <linux/if_vlan.h> +#include <linux/inet_lro.h> + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jan-Bernd Themann <[EMAIL PROTECTED]>"); +MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)"); + +#define TCP_PAYLOAD_LENGTH(iph, tcph) \ +(ntohs(iph->tot_len) - (iph->ihl << 2) - (tcph->doff << 2)) + +#define IPH_LEN_WO_OPTIONS 5 +#define TCPH_LEN_WO_OPTIONS 5 +#define TCPH_LEN_W_TIMESTAMP 8 + +/* + * Basic tcp checks whether packet is suitable for LRO + */ + +static int lro_tcp_ip_check(struct sk_buff *skb, struct iphdr *iph, + struct tcphdr *tcph, struct net_lro_desc *lro_desc) +{ + /* check ip header: packet length */ + if (ntohs(iph->tot_len) > skb->len) + return -1; + + if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0) + return -1; + + if (iph->ihl != IPH_LEN_WO_OPTIONS) + return -1; + + if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack || tcph->psh + || tcph->rst || tcph->syn || tcph->fin) + return -1; + + if (INET_ECN_is_ce(ipv4_get_dsfield(iph))) + return -1; + + if (tcph->doff != TCPH_LEN_WO_OPTIONS + && tcph->doff != TCPH_LEN_W_TIMESTAMP) + return -1; + + /* check tcp options (only timestamp allowed) */ + if (tcph->doff == TCPH_LEN_W_TIMESTAMP) { + u32 *topt = (u32 *)(tcph + 1); + + if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) + | TCPOLEN_TIMESTAMP)) + return -1; + + /* timestamp should be in right order */ + topt++; + if (lro_desc && (ntohl(lro_desc->tcp_rcv_tsval) > ntohl(*topt))) + return -1; + + /* timestamp reply should not be zero */ + topt++; + if (*topt == 0) + return -1; + } + + return 0; +} + +static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc) +{ + struct iphdr *iph = lro_desc->iph; + struct tcphdr *tcph = lro_desc->tcph; + u32 *p; + + tcph->ack_seq = lro_desc->tcp_ack; + tcph->window = lro_desc->tcp_window; + + if (lro_desc->tcp_saw_tstamp) { + p = (u32 *)(tcph + 1); + *(p+2) = lro_desc->tcp_rcv_tsecr; + } + + iph->tot_len = htons(lro_desc->ip_tot_len); + iph->check = 0; + iph->check = ip_fast_csum((u8 *)lro_desc->iph, iph->ihl); +} + +static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb, + struct iphdr *iph, struct tcphdr *tcph, + u16 vlan_tag, struct vlan_group *vgrp) +{ + u32 *ptr; + u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); + + lro_desc->parent = skb; + lro_desc->iph = iph; + lro_desc->tcph = tcph; + lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len; + lro_desc->tcp_ack = ntohl(tcph->ack_seq); + + lro_desc->skb_sg_cnt = 1; + lro_desc->ip_tot_len = ntohs(iph->tot_len); + + if (tcph->doff == 8) { + ptr = (u32 *)(tcph+1); + lro_desc->tcp_saw_tstamp = 1; + lro_desc->tcp_rcv_tsval = *(ptr+1); + lro_desc->tcp_rcv_tsecr = *(ptr+2); + } + + lro_desc->vgrp = vgrp; + lro_desc->vlan_tag = vlan_tag; + lro_desc->active = 1; +} + +static inline void lro_clear_desc(struct net_lro_desc *lro_desc) +{ + memset(lro_desc, 0, sizeof(struct net_lro_desc)); +} + +static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb, + struct iphdr *iph, struct tcphdr *tcph) +{ + struct sk_buff *parent = lro_desc->parent; + u32 *topt; + u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph); + + lro_desc->skb_sg_cnt++; + + lro_desc->ip_tot_len += tcp_data_len; + lro_desc->tcp_next_seq += tcp_data_len; + lro_desc->tcp_window = lro_desc->tcph->window; + lro_desc->tcp_ack = lro_desc->tcph->ack_seq; + + /* don't update tcp_rcv_tsval, would not work with PAWS */ + if (lro_desc->tcp_saw_tstamp) { + topt = (u32 *) (tcph + 1); + lro_desc->tcp_rcv_tsecr = *(topt + 2); + } + + parent->len += tcp_data_len; + parent->data_len += tcp_data_len; + + skb_pull(skb, (skb->len - tcp_data_len)); + parent->truesize += skb->truesize; + + if (lro_desc->last_skb) + lro_desc->last_skb->next = skb; + else + skb_shinfo(parent)->frag_list = skb; + + lro_desc->last_skb = skb; + + return; +} + +static int lro_check_tcp_conn(struct net_lro_desc *lro_desc, + struct iphdr *iph, + struct tcphdr *tcph) +{ + if ((lro_desc->iph->saddr != iph->saddr) + || (lro_desc->iph->daddr != iph->daddr) + || (lro_desc->tcph->source != tcph->source) + || (lro_desc->tcph->dest != tcph->dest)) + return -1; + return 0; +} + +static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *mgr, + struct net_lro_desc *lro_arr, + struct iphdr *iph, + struct tcphdr *tcph) +{ + struct net_lro_desc *lro_desc = NULL; + struct net_lro_desc *tmp; + int max_desc = mgr->max_desc; + int i; + + for (i = 0; i < max_desc; i++) { + tmp = &lro_arr[i]; + if (tmp->active) + if (!lro_check_tcp_conn(tmp, iph, tcph)) { + lro_desc = tmp; + goto out; + } + } + + for (i = 0; i < max_desc; i++) { + if(!lro_arr[i].active) { + lro_desc = &lro_arr[i]; + goto out; + } + } + +out: + return lro_desc; +} + +static void lro_flush(struct net_lro_desc *lro_desc) +{ + lro_update_tcp_ip_header(lro_desc); + + if (lro_desc->vgrp) + vlan_hwaccel_receive_skb(lro_desc->parent, lro_desc->vgrp, + lro_desc->vlan_tag); + else + netif_receive_skb(lro_desc->parent); + + lro_clear_desc(lro_desc); +} + +void lro_flush_all(struct net_lro_mgr *lro_mgr) +{ + int i; + struct net_lro_desc *lro_desc = lro_mgr->lro_arr; + + for (i = 0; i < lro_mgr->max_desc; i++) { + if (lro_desc[i].active) + lro_flush(&lro_desc[i]); + } +} +EXPORT_SYMBOL(lro_flush_all); + +int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, + struct vlan_group *vgrp, u16 vlan_tag, void *priv) +{ + struct net_lro_desc *lro_desc; + struct iphdr *iph; + struct tcphdr *tcph; + + if (!lro_mgr->get_ip_tcp_hdr + || lro_mgr->get_ip_tcp_hdr(skb, &iph, &tcph, priv)) + goto out; + + lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph); + if (!lro_desc) + goto out; + + if (!lro_desc->active) { /* start new lro session */ + if (lro_tcp_ip_check(skb, iph, tcph, NULL)) + goto out; + + lro_init_desc(lro_desc, skb, iph, tcph, vlan_tag, vgrp); + return 0; + } + + if (lro_desc->tcp_next_seq != ntohl(tcph->seq)) + goto out2; + + if (lro_tcp_ip_check(skb, iph, tcph, lro_desc)) + goto out2; + + lro_add_packet(lro_desc, skb, iph, tcph); + + if (lro_desc->skb_sg_cnt >= lro_mgr->max_aggr) + lro_flush(lro_desc); + + return 0; + +out2: /* send aggregated SKBs to stack */ + lro_flush(lro_desc); + +out: /* Original SKB has to be posted to stack */ + return 1; +} + +void lro_receive_skb(struct net_lro_mgr *lro_mgr, + struct sk_buff *skb, + void *priv) +{ + if (__lro_proc_skb(lro_mgr, skb, NULL, 0, priv)) + netif_receive_skb(skb); +} +EXPORT_SYMBOL(lro_receive_skb); + +void lro_vlan_hwaccel_receive_skb(struct net_lro_mgr *lro_mgr, + struct sk_buff *skb, + struct vlan_group *vgrp, + u16 vlan_tag, + void *priv) +{ + if (__lro_proc_skb(lro_mgr, skb, vgrp, vlan_tag, priv)) + vlan_hwaccel_receive_skb(skb, vgrp, vlan_tag); +} +EXPORT_SYMBOL(lro_vlan_hwaccel_receive_skb); -- 1.5.2 - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/