Add Large Receive Offload implemented in software.

Signed-off-by: Brice Goglin <[EMAIL PROTECTED]>
---
 drivers/net/myri10ge/myri10ge.c |  422 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 422 insertions(+)
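A note for reviewers (illustrative only, not part of the patch): the sketch below is a minimal standalone model of the merge test that myri10ge_lro_rx() in the diff applies before appending a received segment to an aggregated super-packet: the segment must carry only ACK (and possibly PSH), match the tracked stream's addresses, ports and VLAN tag, and start at the next expected sequence number. The struct and function names here (lro_flow_model, seg_model, lro_can_merge) are invented for this example and do not exist in the driver.

#include <stdint.h>
#include <stdbool.h>

struct lro_flow_model {			/* invented; loosely mirrors myri10ge_lro_packet */
	uint32_t saddr, daddr;
	uint16_t sport, dport;
	uint16_t vlan_tci;
	uint32_t next_seq;		/* sequence number expected next */
};

struct seg_model {			/* invented; a parsed incoming TCP segment */
	uint32_t saddr, daddr;
	uint16_t sport, dport;
	uint16_t vlan_tci;
	uint32_t seq;
	bool ack, fin, syn, rst, urg, ece, cwr;
};

/* Return true when the segment may be appended to the aggregated flow. */
static bool lro_can_merge(const struct lro_flow_model *f, const struct seg_model *s)
{
	/* only plain data segments: ACK set, no FIN/SYN/RST/URG/ECE/CWR */
	if (s->fin || s->syn || s->rst || s->urg || s->ece || s->cwr || !s->ack)
		return false;
	/* must belong to the stream being tracked */
	if (s->saddr != f->saddr || s->daddr != f->daddr ||
	    s->sport != f->sport || s->dport != f->dport ||
	    s->vlan_tci != f->vlan_tci)
		return false;
	/* must be the next in-order segment; in the driver a mismatch flushes the flow */
	return s->seq == f->next_seq;
}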
Index: linux-rc/drivers/net/myri10ge/myri10ge.c
===================================================================
--- linux-rc.orig/drivers/net/myri10ge/myri10ge.c	2007-02-21 17:42:22.000000000 +0100
+++ linux-rc/drivers/net/myri10ge/myri10ge.c	2007-02-21 17:55:22.000000000 +0100
@@ -61,6 +61,8 @@
 #include <linux/moduleparam.h>
 #include <linux/io.h>
 #include <net/checksum.h>
+#include <net/ip.h>
+#include <net/tcp.h>
 #include <asm/byteorder.h>
 #include <asm/io.h>
 #include <asm/processor.h>
@@ -145,11 +147,32 @@
 	int pkt_done;		/* packets completed */
 };
 
+struct myri10ge_lro_packet {
+	struct hlist_node lro_node;
+	struct sk_buff *skb;
+	int timestamp;
+	__u32 tsval;
+	__u32 tsecr;
+	__u32 source_ip;
+	__u32 dest_ip;
+	__u32 next_seq;
+	__u32 ack_seq;
+	__wsum data_csum;
+	__u16 window;
+	__u16 source_port;
+	__u16 dest_port;
+	__u16 append_cnt;
+	__u16 mss;
+	__u16 vlan_tci;
+};
+
 struct myri10ge_rx_done {
 	struct mcp_slot *entry;
 	dma_addr_t bus;
 	int cnt;
 	int idx;
+	struct hlist_head lro_active;
+	struct hlist_head lro_free;
 };
 
 struct myri10ge_priv {
@@ -161,6 +184,10 @@
 	struct myri10ge_rx_done rx_done;
 	int small_bytes;
 	int big_bytes;
+	int lro_flushed;
+	int lro_queued;
+	int lro_too_many_streams;
+	int lro_bad_csum;
 	struct net_device *dev;
 	struct net_device_stats stats;
 	u8 __iomem *sram;
@@ -274,6 +301,10 @@
 module_param(myri10ge_debug, int, 0);
 MODULE_PARM_DESC(myri10ge_debug, "Debug level (0=none,...,16=all)");
 
+static int myri10ge_lro = 8;
+module_param(myri10ge_lro, int, S_IRUGO);
+MODULE_PARM_DESC(myri10ge_lro, "Number of large receive offload queues\n");
+
 static int myri10ge_fill_thresh = 256;
 module_param(myri10ge_fill_thresh, int, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(myri10ge_fill_thresh, "Number of empty rx slots allowed\n");
@@ -808,6 +839,9 @@
 	mgp->rx_done.idx = 0;
 	mgp->rx_done.cnt = 0;
 	mgp->link_changes = 0;
+	mgp->lro_queued = 0;
+	mgp->lro_flushed = 0;
+	mgp->lro_too_many_streams = 0;
 	status = myri10ge_update_mac_address(mgp, mgp->dev->dev_addr);
 	myri10ge_change_promisc(mgp, 0, 0);
 	myri10ge_change_pause(mgp, mgp->pause);
@@ -876,6 +910,357 @@
 	skb_pull(skb, MXGEFW_PAD);
 }
 
+/* debug aid to check for "bad" hardware */
+
+static void
+myri10ge_frag_trim(struct skb_frag_struct *rx_frags, int old_len, int trim)
+{
+	struct skb_frag_struct *frag;
+	int offset = 0;
+	int new_len = old_len - trim;
+	int old_size;
+
+	/* find the frag where the IP payload ends.  This
+	 * should almost always be the 1st fragment */
+	frag = rx_frags;
+	while (offset + frag->size < new_len) {
+		offset += frag->size;
+		frag++;
+	}
+	/* adjust its length */
+	old_size = frag->size;
+	frag->size = new_len - offset;
+
+	/* release any excess pages */
+	offset += old_size;
+	while (offset < old_len) {
+		frag++;
+		offset += frag->size;
+		put_page(frag->page);
+	}
+}
+
+static inline int myri10ge_lro_csum(int tcplen, struct iphdr *iph, __wsum csum)
+{
+	if (unlikely(ip_fast_csum((u8 *) iph, iph->ihl)))
+		return -1;
+
+	if (unlikely(csum_tcpudp_magic(iph->saddr, iph->daddr,
+				       tcplen, IPPROTO_TCP, csum)))
+		return -1;
+	return 0;
+}
+
+static inline void
+myri10ge_lro_flush(struct myri10ge_priv *mgp, struct myri10ge_lro_packet *lro)
+{
+	struct iphdr *iph;
+	struct tcphdr *th;
+	struct sk_buff *skb;
+	u32 *ts_ptr;
+	u32 tcplen;
+
+	skb = lro->skb;
+
+	if (lro->append_cnt) {
+		/* incorporate the new len into the ip header and
+		 * re-calculate the checksum, Note that
+		 * eth_type_trans() left skb->data at the start of
+		 * the vlan header, so we need to skip past it to
+		 * get to the IP header */
+		if (lro->vlan_tci) {
+			iph = (struct iphdr *)(skb->data + VLAN_HLEN);
+			iph->tot_len = htons(skb->len - VLAN_HLEN);
+		} else {
+			iph = (struct iphdr *)skb->data;
+			iph->tot_len = htons(skb->len);
+		}
+		iph->check = 0;
+		iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+
+		/* incorporate the latest ack into the tcp header */
+		th = (struct tcphdr *)(iph + 1);
+		th->ack_seq = lro->ack_seq;
+		th->window = lro->window;
+
+		/* incorporate latest timestamp into the tcp header */
+		if (lro->timestamp) {
+			ts_ptr = (u32 *) (th + 1);
+			ts_ptr[1] = htonl(lro->tsval);
+			ts_ptr[2] = lro->tsecr;
+		}
+
+		/*
+		 * update checksum in tcp header by re-calculating the
+		 * tcp pseudoheader checksum, and adding it to the checksum
+		 * of the tcp payload data
+		 */
+		th->check = 0;
+		tcplen = ntohs(iph->tot_len) - sizeof(*iph);
+		th->check = tcp_v4_check(tcplen, iph->saddr, iph->daddr,
+					 csum_partial((char *)th,
+						      th->doff << 2,
+						      lro->data_csum));
+
+		skb->truesize = skb->len + sizeof(struct sk_buff);
+	}
+
+	skb_shinfo(skb)->gso_size = lro->mss;
+	netif_receive_skb(skb);
+	mgp->dev->last_rx = jiffies;
+	mgp->lro_queued += lro->append_cnt + 1;
+	mgp->lro_flushed++;
+	lro->skb = NULL;
+	lro->timestamp = 0;
+	lro->append_cnt = 0;
+	hlist_add_head(&lro->lro_node, &mgp->rx_done.lro_free);
+}
+
+static int
+myri10ge_lro_rx(struct myri10ge_priv *mgp, u8 * va,
+		struct skb_frag_struct *rx_frags, int *len, __wsum csum)
+{
+	struct ethhdr *eh;
+	struct vlan_ethhdr *vh;
+	struct iphdr *iph;
+	struct tcphdr *th;
+	struct myri10ge_lro_packet *lro;
+	u32 *ts_ptr = NULL;	/* XXX -Wuninitialized */
+	struct sk_buff *skb;
+	struct skb_frag_struct *skb_frags;
+	struct hlist_node *node;
+	int opt_bytes, tcp_data_len, tcp_hdr_len, hlen, trim, llhlen;
+	__u32 seq;
+	__u16 ip_len, vlan_tci;
+
+	/* check to see that it is IP */
+	eh = (struct ethhdr *)(va + MXGEFW_PAD);
+	if (eh->h_proto == ntohs(ETH_P_IP)) {
+		llhlen = ETH_HLEN;
+		vlan_tci = 0;
+	} else if (eh->h_proto == ntohs(ETH_P_8021Q)) {
+		vh = (struct vlan_ethhdr *)(va + MXGEFW_PAD);
+		if (vh->h_vlan_encapsulated_proto != ntohs(ETH_P_IP))
+			return -1;
+		llhlen = VLAN_ETH_HLEN;
+		vlan_tci = vh->h_vlan_TCI;
+		/* HW checksum starts after the ethernet header, we
+		 * must subtract off the VLAN header's checksum before
+		 * csum can be used */
+		csum = csum_sub(csum,
+				csum_partial(va + MXGEFW_PAD + ETH_HLEN,
+					     VLAN_HLEN, 0));
+	} else {
+		return -1;
+	}
+
+	/* now check to see if it is TCP */
+	iph = (struct iphdr *)(va + llhlen + MXGEFW_PAD);
+	if (iph->protocol != IPPROTO_TCP)
+		return -1;
+
+	/* ensure there are no options */
+	if ((iph->ihl << 2) != sizeof(*iph))
+		return -1;
+
+	/* .. and the packet is not fragmented */
+	if (iph->frag_off & htons(IP_MF | IP_OFFSET))
+		return -1;
+
+	/* find the TCP header */
+	th = (struct tcphdr *)(iph + 1);
+
+	/* ensure no bits set besides ack or psh */
+	if (th->fin || th->syn || th->rst || th->urg || th->ece
+	    || th->cwr || !th->ack)
+		return -1;
+
+	/* check for timestamps. Since the only option we handle are
+	 * timestamps, we only have to handle the simple case of
+	 * aligned timestamps */
+
+	opt_bytes = (th->doff << 2) - sizeof(*th);
+	tcp_hdr_len = sizeof(*th) + opt_bytes;
+	if (opt_bytes != 0) {
+		ts_ptr = (u32 *) (th + 1);
+		if (unlikely(opt_bytes != TCPOLEN_TSTAMP_ALIGNED) ||
+		    (*ts_ptr != ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+				      | (TCPOPT_TIMESTAMP << 8)
+				      | TCPOLEN_TIMESTAMP))) {
+			return -1;
+		}
+	}
+
+	ip_len = ntohs(iph->tot_len);
+	tcp_data_len = ip_len - (th->doff << 2) - sizeof(*iph);
+
+	/*
+	 * If frame is padded beyond the end of the IP packet,
+	 * then we must trim the extra bytes off the end.
+	 */
+	trim = *len - (ip_len + llhlen + MXGEFW_PAD);
+	if (trim != 0) {
+		/* ensure we received the full frame */
+		if (unlikely(trim < 0))
+			return -1;
+		/* trim off any padding */
+		myri10ge_frag_trim(rx_frags, *len, trim);
+		*len -= trim;
+	}
+
+	hlen = ip_len + llhlen - tcp_data_len;
+
+	seq = ntohl(th->seq);
+
+	if (unlikely(myri10ge_lro_csum(tcp_hdr_len + tcp_data_len, iph, csum))) {
+		mgp->lro_bad_csum++;
+		return -1;
+	}
+
+	/* now we have a packet that might be eligible for LRO,
+	 * so see if it matches anything we might expect */
+
+	hlist_for_each_entry(lro, node, &mgp->rx_done.lro_active, lro_node) {
+		if (lro->source_port == th->source &&
+		    lro->dest_port == th->dest &&
+		    lro->source_ip == iph->saddr &&
+		    lro->dest_ip == iph->daddr && lro->vlan_tci == vlan_tci) {
+			/* Try to append it */
+
+			if (unlikely(seq != lro->next_seq)) {
+				/* out of order packet */
+				hlist_del(&lro->lro_node);
+				myri10ge_lro_flush(mgp, lro);
+				return -1;
+			}
+			if (lro->timestamp) {
+				__u32 tsval = ntohl(*(ts_ptr + 1));
+				/* make sure timestamp values are increasing */
+				if (unlikely(lro->tsval > tsval ||
+					     *(ts_ptr + 2) == 0)) {
+					return -1;
+				}
+				lro->tsval = tsval;
+				lro->tsecr = *(ts_ptr + 2);
+			}
+			lro->next_seq += tcp_data_len;
+			lro->ack_seq = th->ack_seq;
+			skb = lro->skb;
+
+			/* subtract off the checksum of the tcp header
+			 * from the hardware checksum, and add it to the
+			 * stored tcp data checksum.  csum_block_add()
+			 * is used, as the total length so far may be
+			 * odd
+			 */
+			lro->data_csum =
+			    csum_block_add(lro->data_csum,
+					   csum_sub(csum,
+						    csum_partial((u8 *) th,
+								 tcp_hdr_len,
+								 0)),
+					   skb->data_len);
+			lro->window = th->window;
+			skb->data_len += tcp_data_len;
+			skb->len += tcp_data_len;
+			if (tcp_data_len > lro->mss)
+				lro->mss = tcp_data_len;
+
+			/* pull off the header and firmware pad
+			 * before we copy the data */
+
+			hlen += MXGEFW_PAD;
+			rx_frags[0].page_offset += hlen;
+			rx_frags[0].size -= hlen;
+			*len -= hlen;
+			skb_frags =
+			    &skb_shinfo(skb)->frags[skb_shinfo(skb)->nr_frags];
+			/* if it was just header (like a TCP ack with
+			 * no data), release the page */
+			if (*len <= 0) {
+				put_page(rx_frags[0].page);
+			} else {
+				while (*len > 0) {
+					memcpy(skb_frags, rx_frags,
+					       sizeof(*skb_frags));
+					*len -= rx_frags->size;
+					rx_frags++;
+					skb_frags++;
+					skb_shinfo(skb)->nr_frags++;
+				}
+			}
+
+			lro->append_cnt++;
+
+			/* cheap, conservative test. We may waste
+			 * some slots with a 1500 byte mtu */
+			if (skb_shinfo(skb)->nr_frags +
+			    MYRI10GE_MAX_FRAGS_PER_FRAME > MAX_SKB_FRAGS
+			    || mgp->dev->mtu + skb->len > 65535) {
+				hlist_del(&lro->lro_node);
+				myri10ge_lro_flush(mgp, lro);
+			}
+			return 0;
+		}
+	}
+
+	/* start a new packet */
+	if (!hlist_empty(&mgp->rx_done.lro_free)) {
+		lro = hlist_entry(mgp->rx_done.lro_free.first,
+				  struct myri10ge_lro_packet, lro_node);
+		/* allocate an skb to attach the page(s) to */
+
+		skb = netdev_alloc_skb(mgp->dev, hlen + 16);
+		if (unlikely(skb == NULL))
+			return -1;
+
+		myri10ge_rx_skb_build(skb, va, rx_frags, *len,
+				      hlen + MXGEFW_PAD);
+		skb->protocol = eth_type_trans(skb, mgp->dev);
+		skb->dev = mgp->dev;
+		skb->ip_summed = CHECKSUM_UNNECESSARY;
+		lro->skb = skb;
+		lro->source_ip = iph->saddr;
+		lro->dest_ip = iph->daddr;
+		lro->source_port = th->source;
+		lro->dest_port = th->dest;
+		lro->next_seq = seq + tcp_data_len;
+		lro->mss = tcp_data_len;
+		lro->ack_seq = th->ack_seq;
+
+		/* save the checksum of just the TCP payload by
+		 * subtracting off the checksum of the TCP header from
+		 * the entire hardware checksum
+		 */
+		lro->data_csum = csum_sub(csum,
+					  csum_partial((u8 *) th,
+						       tcp_hdr_len, 0));
+		lro->window = th->window;
+		lro->vlan_tci = vlan_tci;
+		/* record timestamp if it is present */
+		if (opt_bytes) {
+			lro->timestamp = 1;
+			lro->tsval = ntohl(*(ts_ptr + 1));
+			lro->tsecr = *(ts_ptr + 2);
+		}
+		/* remove first packet from freelist.. */
+		hlist_del(&lro->lro_node);
+		/* .. and insert at the front of the active list */
+		hlist_add_head(&lro->lro_node, &mgp->rx_done.lro_active);
+
+		/* release the page if there was no data. We do it
+		 * down here since the code above refers to the
+		 * contents of the page */
+		if (skb_shinfo(skb)->frags[0].size <= 0) {
+			put_page(skb_shinfo(skb)->frags[0].page);
+			skb_shinfo(skb)->nr_frags = 0;
+		}
+		return 0;
+	}
+	mgp->lro_too_many_streams++;
+	return -1;
+}
+
 static void
 myri10ge_alloc_rx_pages(struct myri10ge_priv *mgp, struct myri10ge_rx_buf *rx,
			int bytes, int watchdog)
@@ -983,9 +1368,14 @@
 		remainder -= MYRI10GE_ALLOC_SIZE;
 	}
+	if (mgp->csum_flag && myri10ge_lro &&
+	    (0 == myri10ge_lro_rx(mgp, va, rx_frags, &len, csum)))
+		return 1;
 
 	hlen = MYRI10GE_HLEN > len ? len : MYRI10GE_HLEN;
 
 	/* allocate an skb to attach the page(s) to. */
+	/* This is done
+	 * after trying LRO, so as to avoid skb allocation overheads */
 
 	skb = netdev_alloc_skb(dev, MYRI10GE_HLEN + 16);
 	if (unlikely(skb == NULL)) {
@@ -1073,6 +1463,8 @@
 static inline void myri10ge_clean_rx_done(struct myri10ge_priv *mgp, int *limit)
 {
 	struct myri10ge_rx_done *rx_done = &mgp->rx_done;
+	struct hlist_node *node, *node2;
+	struct myri10ge_lro_packet *lro;
 	unsigned long rx_bytes = 0;
 	unsigned long rx_packets = 0;
 	unsigned long rx_ok;
@@ -1105,6 +1497,11 @@
 	}
 	rx_done->idx = idx;
 	rx_done->cnt = cnt;
+	hlist_for_each_entry_safe(lro, node, node2, &mgp->rx_done.lro_active,
+				  lro_node) {
+		hlist_del(&lro->lro_node);
+		myri10ge_lro_flush(mgp, lro);
+	}
 	mgp->stats.rx_packets += rx_packets;
 	mgp->stats.rx_bytes += rx_bytes;
 
@@ -1338,6 +1735,7 @@
 	"read_dma_bw_MBs", "write_dma_bw_MBs", "read_write_dma_bw_MBs",
 	"serial_number", "tx_pkt_start", "tx_pkt_done",
 	"tx_req", "tx_done", "rx_small_cnt", "rx_big_cnt",
+	"lro_queued", "lro_flushed", "lro_too_many_streams", "lro_bad_csum",
 	"wake_queue", "stop_queue", "watchdog_resets", "tx_linearized",
 	"link_changes", "link_up", "dropped_link_overflow",
 	"dropped_link_error_or_filtered", "dropped_multicast_filtered",
@@ -1388,6 +1786,10 @@
 	data[i++] = (unsigned int)mgp->tx.done;
 	data[i++] = (unsigned int)mgp->rx_small.cnt;
 	data[i++] = (unsigned int)mgp->rx_big.cnt;
+	data[i++] = (unsigned int)mgp->lro_queued;
+	data[i++] = (unsigned int)mgp->lro_flushed;
+	data[i++] = (unsigned int)mgp->lro_too_many_streams;
+	data[i++] = (unsigned int)mgp->lro_bad_csum;
 	data[i++] = (unsigned int)mgp->wake_queue;
 	data[i++] = (unsigned int)mgp->stop_queue;
 	data[i++] = (unsigned int)mgp->watchdog_resets;
@@ -1527,6 +1929,18 @@
 		goto abort_with_rx_big_ring;
 	}
 
+	bytes = sizeof(struct myri10ge_lro_packet);
+	INIT_HLIST_HEAD(&mgp->rx_done.lro_free);
+	INIT_HLIST_HEAD(&mgp->rx_done.lro_active);
+	for (i = 0; i < myri10ge_lro; i++) {
+		struct myri10ge_lro_packet *lro;
+		lro = kzalloc(bytes, GFP_KERNEL);
+		if (lro != NULL) {
+			INIT_HLIST_NODE(&lro->lro_node);
+			hlist_add_head(&lro->lro_node, &mgp->rx_done.lro_free);
+		}
+	}
+
 	return 0;
 
 abort_with_rx_big_ring:
@@ -1573,10 +1987,18 @@
 	struct myri10ge_priv *mgp;
 	struct sk_buff *skb;
 	struct myri10ge_tx_buf *tx;
+	struct hlist_node *node, *node2;
+	struct myri10ge_lro_packet *lro;
 	int i, len, idx;
 
 	mgp = netdev_priv(dev);
 
+	hlist_for_each_entry_safe(lro, node, node2, &mgp->rx_done.lro_active,
+				  lro_node) {
+		hlist_del(&lro->lro_node);
+		kfree(lro);
+	}
+
 	for (i = mgp->rx_big.cnt; i < mgp->rx_big.fill_cnt; i++) {
 		idx = i & mgp->rx_big.mask;
 		if (i == mgp->rx_big.fill_cnt - 1)
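
One more illustrative aside, not part of the patch: the subtlest piece above is the checksum bookkeeping. The hardware checksum covers everything after the Ethernet header, so myri10ge_lro_rx() peels the TCP header's contribution off with csum_sub() and keeps only the payload sum, and myri10ge_lro_flush() later rebuilds th->check from the pseudo-header plus that stored payload sum. The standalone program below is a simplified userspace model of the one's-complement arithmetic (RFC 1071 style) behind the subtraction step only; it is not the kernel implementation, csum16() and csum16_sub() are invented names, and the odd-offset handling that csum_block_add() provides on top is not modelled here.

#include <stdint.h>
#include <stdio.h>

/* Sum a buffer as 16-bit big-endian words, folding carries (RFC 1071). */
static uint16_t csum16(const uint8_t *buf, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)(buf[i] << 8 | buf[i + 1]);
	if (len & 1)			/* pad an odd trailing byte with zero */
		sum += (uint32_t)(buf[len - 1] << 8);
	while (sum >> 16)		/* end-around carry */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

/* One's-complement subtraction: remove a sub-block's contribution. */
static uint16_t csum16_sub(uint16_t whole, uint16_t part)
{
	uint32_t sum = whole + (uint16_t)~part;

	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)sum;
}

int main(void)
{
	uint8_t pkt[64];
	size_t hdrlen = 20;		/* e.g. a bare TCP header */
	size_t i;
	uint16_t whole, hdr, payload;

	for (i = 0; i < sizeof(pkt); i++)
		pkt[i] = (uint8_t)(i * 7 + 3);

	whole = csum16(pkt, sizeof(pkt));
	hdr = csum16(pkt, hdrlen);
	payload = csum16(pkt + hdrlen, sizeof(pkt) - hdrlen);

	/* Because the header length is even (16-bit aligned), the payload
	 * checksum is recovered by subtracting the header checksum from
	 * the checksum of the whole buffer. */
	printf("whole=%04x hdr=%04x payload=%04x whole-hdr=%04x\n",
	       whole, hdr, payload, csum16_sub(whole, hdr));
	return 0;
}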