Hi,

This version corrects a couple of bugs previously noted and ties up some loose ends in the e1000 driver. Some versions of this driver support packet splitting into multiple pages, with just the protocol header in the skb itself. This is a very good thing, because it avoids the high-order page fragmentation problem. Though this probably needs to be pushed down into generic skb allocation eventually, for now the driver handles it explicitly with the help of a new memalloc_page function that simply takes a page from the memalloc reserve if normal allocation fails. This needs no separate reserve accounting, because such pages are allocated per skb.
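Before the diff, a quick restatement of the mechanism: the new helpers try a normal atomic allocation first and dip into the emergency reserve only if that fails. The sketch below is just the memalloc_page() inline from the netdevice.h hunk in the patch, with comments added; the per-device rx_reserve quota check lives in the companion dev_memalloc_skb() helper, while packet-split pages ride on the accounting of the skb that owns them.

/* Sketch only; the real helper is in the netdevice.h hunk below */
static inline struct page *memalloc_page(void)
{
        /* Normal atomic allocation first... */
        struct page *page = alloc_page(GFP_ATOMIC);

        /* ...fall back to the emergency reserve only if that failed */
        return page ? : alloc_page(GFP_ATOMIC|__GFP_MEMALLOC);
}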
While I was in there, I could not resist cleaning up some non-orthogonality in the 64K overlap handling (e1000 people, please check my work). The result is that, with the new memalloc handling added, the source stayed the same size. Code size is another question: I have added a number of new inlines, and I suspect that inlining the skb allocation functions in general doesn't buy anything, but this needs to be checked. Anyway, we now have a pretty good picture of the full per-driver damage of closing this hole, and it is not much.

I still haven't looked at the various hooks in the packet delivery path, but that is coming up pretty soon. There are other wrinkles too, like the fact that there can actually be many block devices mapped over the same network interface. I have to ponder what is best to do so they can't wedge each other, and so that the SOCK_MEMALLOC bit doesn't get cleared prematurely (a rough sketch of how a protocol would set that bit in the first place follows the patch). Anyway, progress marches on.

diff -up --recursive 2.6.12.3.clean/drivers/net/e1000/e1000_main.c 2.6.12.3/drivers/net/e1000/e1000_main.c
--- 2.6.12.3.clean/drivers/net/e1000/e1000_main.c	2005-07-15 17:18:57.000000000 -0400
+++ 2.6.12.3/drivers/net/e1000/e1000_main.c	2005-08-11 17:42:12.000000000 -0400
@@ -309,6 +309,16 @@ e1000_up(struct e1000_adapter *adapter)
                 e1000_phy_reset(&adapter->hw);
         }
 
+        netdev->memalloc_pages = estimate_skb_pages(netdev->rx_reserve,
+                adapter->rx_buffer_len + NET_IP_ALIGN);
+        if (adapter->rx_ps)
+                netdev->memalloc_pages += PS_PAGE_BUFFERS * netdev->rx_reserve;
+        if ((err = adjust_memalloc_reserve(netdev->memalloc_pages))) {
+                DPRINTK(PROBE, ERR,
+                        "Unable to allocate rx reserve Error: %d\n", err);
+                return err;
+        }
+
         e1000_set_multi(netdev);
 
         e1000_restore_vlan(adapter);
@@ -386,6 +396,7 @@ e1000_down(struct e1000_adapter *adapter
                 mii_reg |= MII_CR_POWER_DOWN;
                 e1000_write_phy_reg(&adapter->hw, PHY_CTRL, mii_reg);
                 mdelay(1);
+                adjust_memalloc_reserve(-netdev->memalloc_pages);
         }
 }
 
@@ -3116,34 +3127,29 @@ e1000_alloc_rx_buffers(struct e1000_adap
         buffer_info = &rx_ring->buffer_info[i];
 
         while(!buffer_info->skb) {
-                skb = dev_alloc_skb(bufsz);
+                skb = dev_memalloc_skb(netdev, bufsz);
 
-                if(unlikely(!skb)) {
+                if(unlikely(!skb))
                         /* Better luck next round */
                         break;
-                }
 
                 /* Fix for errata 23, can't cross 64kB boundary */
                 if (!e1000_check_64k_bound(adapter, skb->data, bufsz)) {
                         struct sk_buff *oldskb = skb;
                         DPRINTK(RX_ERR, ERR, "skb align check failed: %u bytes "
                                              "at %p\n", bufsz, skb->data);
-                        /* Try again, without freeing the previous */
-                        skb = dev_alloc_skb(bufsz);
+                        /* Try again, then free previous */
+                        skb = dev_memalloc_skb(netdev, bufsz);
+                        dev_memfree_skb(oldskb);
+
                         /* Failed allocation, critical failure */
-                        if (!skb) {
-                                dev_kfree_skb(oldskb);
+                        if (!skb)
                                 break;
-                        }
 
+                        /* give up */
                         if (!e1000_check_64k_bound(adapter, skb->data, bufsz)) {
-                                /* give up */
-                                dev_kfree_skb(skb);
-                                dev_kfree_skb(oldskb);
+                                dev_memfree_skb(skb);
                                 break; /* while !buffer_info->skb */
-                        } else {
-                                /* Use new allocation */
-                                dev_kfree_skb(oldskb);
                         }
                 }
                 /* Make buffer alignment 2 beyond a 16 byte boundary
@@ -3152,8 +3158,6 @@ e1000_alloc_rx_buffers(struct e1000_adap
                  */
                 skb_reserve(skb, NET_IP_ALIGN);
 
-                skb->dev = netdev;
-
                 buffer_info->skb = skb;
                 buffer_info->length = adapter->rx_buffer_len;
                 buffer_info->dma = pci_map_single(pdev,
@@ -3169,8 +3173,8 @@ e1000_alloc_rx_buffers(struct e1000_adap
                                 "dma align check failed: %u bytes at %p\n",
                                 adapter->rx_buffer_len,
                                 (void *)(unsigned long)buffer_info->dma);
-                        dev_kfree_skb(skb);
                         buffer_info->skb = NULL;
+                        dev_memfree_skb(skb);
 
                         pci_unmap_single(pdev, buffer_info->dma,
                                          adapter->rx_buffer_len,
@@ -3225,8 +3229,7 @@ e1000_alloc_rx_buffers_ps(struct e1000_a
                 for(j = 0; j < PS_PAGE_BUFFERS; j++) {
                         if(unlikely(!ps_page->ps_page[j])) {
-                                ps_page->ps_page[j] =
-                                        alloc_page(GFP_ATOMIC);
+                                ps_page->ps_page[j] = memalloc_page();
                                 if(unlikely(!ps_page->ps_page[j]))
                                         goto no_buffers;
                                 ps_page_dma->ps_page_dma[j] =
@@ -3242,7 +3245,7 @@ e1000_alloc_rx_buffers_ps(struct e1000_a
                                         cpu_to_le64(ps_page_dma->ps_page_dma[j]);
                 }
 
-                skb = dev_alloc_skb(adapter->rx_ps_bsize0 + NET_IP_ALIGN);
+                skb = dev_memalloc_skb(netdev, adapter->rx_ps_bsize0 + NET_IP_ALIGN);
 
                 if(unlikely(!skb))
                         break;
@@ -3253,8 +3256,6 @@ e1000_alloc_rx_buffers_ps(struct e1000_a
                  */
                 skb_reserve(skb, NET_IP_ALIGN);
 
-                skb->dev = netdev;
-
                 buffer_info->skb = skb;
                 buffer_info->length = adapter->rx_ps_bsize0;
                 buffer_info->dma = pci_map_single(pdev, skb->data,
diff -up --recursive 2.6.12.3.clean/include/linux/gfp.h 2.6.12.3/include/linux/gfp.h
--- 2.6.12.3.clean/include/linux/gfp.h	2005-07-15 17:18:57.000000000 -0400
+++ 2.6.12.3/include/linux/gfp.h	2005-08-05 21:53:09.000000000 -0400
@@ -39,6 +39,7 @@ struct vm_area_struct;
 #define __GFP_COMP       0x4000u /* Add compound page metadata */
 #define __GFP_ZERO       0x8000u /* Return zeroed page on success */
 #define __GFP_NOMEMALLOC 0x10000u /* Don't use emergency reserves */
+#define __GFP_MEMALLOC   0x20000u /* Use emergency reserves */
 
 #define __GFP_BITS_SHIFT 20	/* Room for 20 __GFP_FOO bits */
 #define __GFP_BITS_MASK ((1 << __GFP_BITS_SHIFT) - 1)
diff -up --recursive 2.6.12.3.clean/include/linux/mmzone.h 2.6.12.3/include/linux/mmzone.h
--- 2.6.12.3.clean/include/linux/mmzone.h	2005-07-15 17:18:57.000000000 -0400
+++ 2.6.12.3/include/linux/mmzone.h	2005-08-08 04:32:21.000000000 -0400
@@ -378,6 +378,7 @@ int min_free_kbytes_sysctl_handler(struc
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *,
                                         void __user *, size_t *, loff_t *);
+int adjust_memalloc_reserve(int bytes);
 
 #include <linux/topology.h>
 /* Returns the number of the current Node. */
diff -up --recursive 2.6.12.3.clean/include/linux/netdevice.h 2.6.12.3/include/linux/netdevice.h
--- 2.6.12.3.clean/include/linux/netdevice.h	2005-07-15 17:18:57.000000000 -0400
+++ 2.6.12.3/include/linux/netdevice.h	2005-08-11 17:40:41.000000000 -0400
@@ -371,6 +371,9 @@ struct net_device
         struct Qdisc            *qdisc_ingress;
         struct list_head        qdisc_list;
         unsigned long           tx_queue_len;   /* Max frames per queue allowed */
+        int                     rx_reserve;
+        atomic_t                rx_reserve_used;
+        int                     memalloc_pages;
 
         /* ingress path synchronizer */
         spinlock_t              ingress_lock;
@@ -662,6 +665,60 @@ static inline void dev_kfree_skb_any(str
                 dev_kfree_skb(skb);
 }
 
+/*
+ * Support for critical network IO under low memory conditions
+ */
+static inline int dev_reserve_used(struct net_device *dev)
+{
+        return atomic_read(&dev->rx_reserve_used);
+}
+
+static inline struct sk_buff *__dev_memalloc_skb(struct net_device *dev,
+                unsigned length, int gfp_mask)
+{
+        struct sk_buff *skb = __dev_alloc_skb(length, gfp_mask);
+        if (skb)
+                goto done;
+        if (dev_reserve_used(dev) >= dev->rx_reserve)
+                return NULL;
+        if (!(skb = __dev_alloc_skb(length, gfp_mask|__GFP_MEMALLOC)))
+                return NULL;
+        atomic_inc(&dev->rx_reserve_used);
+done:
+        skb->dev = dev;
+        return skb;
+}
+
+static inline struct sk_buff *dev_memalloc_skb(struct net_device *dev,
+                unsigned length)
+{
+        return __dev_memalloc_skb(dev, length, GFP_ATOMIC);
+}
+
+/*
+ * This is a stopgap to be used only until reserve accounting is changed
+ * from page to skb granularity.  It depends on no more than a fixed maximum
+ * number of pages being allocated each time an skb is allocated.
+ */
+static inline struct page *memalloc_page(void)
+{
+        struct page *page = alloc_page(GFP_ATOMIC);
+        return page ? : alloc_page(GFP_ATOMIC|__GFP_MEMALLOC);
+}
+
+static inline void dev_unreserve_skb(struct net_device *dev)
+{
+        if (atomic_dec_return(&dev->rx_reserve_used) < 0)
+                atomic_inc(&dev->rx_reserve_used);
+}
+
+static inline void dev_memfree_skb(struct sk_buff *skb)
+{
+        struct net_device *dev = skb->dev;
+        __kfree_skb(skb);
+        dev_unreserve_skb(dev);
+}
+
 #define HAVE_NETIF_RX 1
 extern int netif_rx(struct sk_buff *skb);
 extern int netif_rx_ni(struct sk_buff *skb);
diff -up --recursive 2.6.12.3.clean/include/linux/skbuff.h 2.6.12.3/include/linux/skbuff.h
--- 2.6.12.3.clean/include/linux/skbuff.h	2005-07-15 17:18:57.000000000 -0400
+++ 2.6.12.3/include/linux/skbuff.h	2005-08-08 04:25:31.000000000 -0400
@@ -994,6 +994,8 @@ static inline struct sk_buff *__dev_allo
 extern struct sk_buff *__dev_alloc_skb(unsigned int length, int gfp_mask);
 #endif
 
+unsigned estimate_skb_pages(unsigned howmany, unsigned size);
+
 /**
  *	dev_alloc_skb - allocate an skbuff for sending
  *	@length: length to allocate
diff -up --recursive 2.6.12.3.clean/include/linux/slab.h 2.6.12.3/include/linux/slab.h
--- 2.6.12.3.clean/include/linux/slab.h	2005-07-15 17:18:57.000000000 -0400
+++ 2.6.12.3/include/linux/slab.h	2005-08-08 05:02:07.000000000 -0400
@@ -65,6 +65,7 @@ extern void *kmem_cache_alloc(kmem_cache
 extern void kmem_cache_free(kmem_cache_t *, void *);
 extern unsigned int kmem_cache_size(kmem_cache_t *);
 extern kmem_cache_t *kmem_find_general_cachep(size_t size, int gfpflags);
+unsigned kmem_estimate_pages(kmem_cache_t *cache, unsigned num);
 
 /* Size description struct for general caches. */
 struct cache_sizes {
diff -up --recursive 2.6.12.3.clean/include/net/sock.h 2.6.12.3/include/net/sock.h
--- 2.6.12.3.clean/include/net/sock.h	2005-07-15 17:18:57.000000000 -0400
+++ 2.6.12.3/include/net/sock.h	2005-08-05 21:53:09.000000000 -0400
@@ -382,6 +382,7 @@ enum sock_flags {
         SOCK_NO_LARGESEND, /* whether to sent large segments or not */
         SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */
         SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */
+        SOCK_MEMALLOC, /* protocol can use memalloc reserve */
 };
 
 static inline void sock_set_flag(struct sock *sk, enum sock_flags flag)
@@ -399,6 +400,11 @@ static inline int sock_flag(struct sock
         return test_bit(flag, &sk->sk_flags);
 }
 
+static inline int is_memalloc_sock(struct sock *sk)
+{
+        return sock_flag(sk, SOCK_MEMALLOC);
+}
+
 static inline void sk_acceptq_removed(struct sock *sk)
 {
         sk->sk_ack_backlog--;
diff -up --recursive 2.6.12.3.clean/mm/page_alloc.c 2.6.12.3/mm/page_alloc.c
--- 2.6.12.3.clean/mm/page_alloc.c	2005-07-15 17:18:57.000000000 -0400
+++ 2.6.12.3/mm/page_alloc.c	2005-08-08 21:20:15.000000000 -0400
@@ -73,6 +73,7 @@ EXPORT_SYMBOL(zone_table);
 
 static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
 int min_free_kbytes = 1024;
+int var_free_kbytes;
 
 unsigned long __initdata nr_kernel_pages;
 unsigned long __initdata nr_all_pages;
@@ -802,8 +803,8 @@ __alloc_pages(unsigned int __nocast gfp_
 
         /* This allocation should allow future memory freeing. */
 
-        if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
-                        && !in_interrupt()) {
+        if ((((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
+                        && !in_interrupt()) || (gfp_mask & __GFP_MEMALLOC)) {
                 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
                         /* go through the zonelist yet again, ignoring mins */
                         for (i = 0; (z = zones[i]) != NULL; i++) {
@@ -2029,7 +2030,8 @@ static void setup_per_zone_lowmem_reserv
  */
 static void setup_per_zone_pages_min(void)
 {
-        unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
+        unsigned pages_min = (min_free_kbytes + var_free_kbytes)
+                >> (PAGE_SHIFT - 10);
         unsigned long lowmem_pages = 0;
         struct zone *zone;
         unsigned long flags;
@@ -2075,6 +2077,18 @@ static void setup_per_zone_pages_min(voi
         }
 }
 
+int adjust_memalloc_reserve(int pages)
+{
+        int kbytes = var_free_kbytes + (pages << (PAGE_SHIFT - 10));
+        if (kbytes < 0)
+                return -EINVAL;
+        var_free_kbytes = kbytes;
+        setup_per_zone_pages_min();
+        return 0;
+}
+
+EXPORT_SYMBOL_GPL(adjust_memalloc_reserve);
+
 /*
  * Initialise min_free_kbytes.
  *
diff -up --recursive 2.6.12.3.clean/mm/slab.c 2.6.12.3/mm/slab.c
--- 2.6.12.3.clean/mm/slab.c	2005-07-15 17:18:57.000000000 -0400
+++ 2.6.12.3/mm/slab.c	2005-08-08 05:00:38.000000000 -0400
@@ -2353,6 +2353,11 @@ out:
         return 0;
 }
 
+unsigned kmem_estimate_pages(kmem_cache_t *cache, unsigned num)
+{
+        return ((num + cache->num - 1) / cache->num) << cache->gfporder;
+}
+
 #ifdef CONFIG_NUMA
 /**
  * kmem_cache_alloc_node - Allocate an object on the specified node
diff -up --recursive 2.6.12.3.clean/net/core/skbuff.c 2.6.12.3/net/core/skbuff.c
--- 2.6.12.3.clean/net/core/skbuff.c	2005-07-15 17:18:57.000000000 -0400
+++ 2.6.12.3/net/core/skbuff.c	2005-08-08 23:16:23.000000000 -0400
@@ -44,6 +44,7 @@
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
+#include <linux/pagemap.h>
 #include <linux/interrupt.h>
 #include <linux/in.h>
 #include <linux/inet.h>
@@ -167,6 +168,15 @@ nodata:
         goto out;
 }
 
+#define ceiling_log2(x) fls(x - 1)
+unsigned estimate_skb_pages(unsigned num, unsigned size)
+{
+        int slab_pages = kmem_estimate_pages(skbuff_head_cache, num);
+        int data_space = num * (1 << ceiling_log2(size + 16));
+        int data_pages = (data_space + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+        return slab_pages + data_pages;
+}
+
 /**
  *	alloc_skb_from_cache	-	allocate a network buffer
  *	@cp: kmem_cache from which to allocate the data area
diff -up --recursive 2.6.12.3.clean/net/ethernet/eth.c 2.6.12.3/net/ethernet/eth.c
--- 2.6.12.3.clean/net/ethernet/eth.c	2005-07-15 17:18:57.000000000 -0400
+++ 2.6.12.3/net/ethernet/eth.c	2005-08-06 02:32:02.000000000 -0400
@@ -281,6 +281,7 @@ void ether_setup(struct net_device *dev)
         dev->mtu                = 1500; /* eth_mtu */
         dev->addr_len           = ETH_ALEN;
         dev->tx_queue_len       = 1000; /* Ethernet wants good queues */
+        dev->rx_reserve         = 50;
         dev->flags              = IFF_BROADCAST|IFF_MULTICAST;
 
         memset(dev->broadcast,0xFF, ETH_ALEN);
diff -up --recursive 2.6.12.3.clean/net/ipv4/icmp.c 2.6.12.3/net/ipv4/icmp.c
--- 2.6.12.3.clean/net/ipv4/icmp.c	2005-07-15 17:18:57.000000000 -0400
+++ 2.6.12.3/net/ipv4/icmp.c	2005-08-11 16:28:46.000000000 -0400
@@ -944,6 +944,11 @@ int icmp_rcv(struct sk_buff *skb)
                 default:;
         }
 
+        if (unlikely(dev_reserve_used(skb->dev))) {
+                dev_unreserve_skb(skb->dev);
+                goto drop;
+        }
+
         if (!pskb_pull(skb, sizeof(struct icmphdr)))
                 goto error;
 
diff -up --recursive 2.6.12.3.clean/net/ipv4/tcp_ipv4.c 2.6.12.3/net/ipv4/tcp_ipv4.c
--- 2.6.12.3.clean/net/ipv4/tcp_ipv4.c	2005-07-15 17:18:57.000000000 -0400
+++ 2.6.12.3/net/ipv4/tcp_ipv4.c	2005-08-11 16:29:14.000000000 -0400
@@ -1766,6 +1766,12 @@ int tcp_v4_rcv(struct sk_buff *skb)
         if (!sk)
                 goto no_tcp_socket;
 
+        if (unlikely(dev_reserve_used(skb->dev))) {
+                dev_unreserve_skb(skb->dev);
+                if (!is_memalloc_sock(sk))
+                        goto discard_and_relse;
+        }
+
 process:
         if (sk->sk_state == TCP_TIME_WAIT)
                 goto do_time_wait;
diff -up --recursive 2.6.12.3.clean/net/ipv4/udp.c 2.6.12.3/net/ipv4/udp.c
--- 2.6.12.3.clean/net/ipv4/udp.c	2005-07-15 17:18:57.000000000 -0400
+++ 2.6.12.3/net/ipv4/udp.c	2005-08-11 16:28:56.000000000 -0400
@@ -1152,6 +1152,12 @@ int udp_rcv(struct sk_buff *skb)
         sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex);
 
         if (sk != NULL) {
+                if (unlikely(dev_reserve_used(skb->dev))) {
+                        dev_unreserve_skb(skb->dev);
+                        if (!is_memalloc_sock(sk))
+                                goto drop_noncritical;
+                }
+
                 int ret = udp_queue_rcv_skb(sk, skb);
                 sock_put(sk);
 
@@ -1163,6 +1169,7 @@ int udp_rcv(struct sk_buff *skb)
                 return 0;
         }
 
+drop_noncritical:
         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
                 goto drop;
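One thing the patch itself does not show is who sets SOCK_MEMALLOC. The intent is that a protocol which must keep making progress under memory pressure (the network block device client, for instance) marks its own socket, so that the is_memalloc_sock() tests above keep delivering to it while the interface is drawing on the reserve. Roughly like this; the helper name is made up and nothing of the sort is in this patch:

/* Illustrative fragment, not part of the patch */
static void mark_sock_memalloc(struct sock *sk)
{
        /* Let tcp_v4_rcv()/udp_rcv() accept packets for this socket even
         * while dev_reserve_used() is nonzero on the receiving interface. */
        sock_set_flag(sk, SOCK_MEMALLOC);
}

When the flag should be cleared again is exactly the premature-clearing question mentioned above.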
line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html