On Tue, 30 Oct 2007 17:04:24 +0100
Peter Zijlstra <[EMAIL PROTECTED]> wrote:

> In order to make sure emergency packets receive all memory needed to proceed
> ensure processing of emergency SKBs happens under PF_MEMALLOC.
> 
> Use the (new) sk_backlog_rcv() wrapper to ensure this for backlog processing.
> 
> Skip taps, since those are user-space again.
> 
> Signed-off-by: Peter Zijlstra <[EMAIL PROTECTED]>
> ---
>  include/net/sock.h |    5 +++++
>  net/core/dev.c     |   44 ++++++++++++++++++++++++++++++++++++++------
>  net/core/sock.c    |   18 ++++++++++++++++++
>  3 files changed, 61 insertions(+), 6 deletions(-)
> 
> Index: linux-2.6/net/core/dev.c
> ===================================================================
> --- linux-2.6.orig/net/core/dev.c
> +++ linux-2.6/net/core/dev.c
> @@ -1976,10 +1976,23 @@ int netif_receive_skb(struct sk_buff *sk
>       struct net_device *orig_dev;
>       int ret = NET_RX_DROP;
>       __be16 type;
> +     unsigned long pflags = current->flags;
> +
> +     /* Emergency skb are special, they should
> +      *  - be delivered to SOCK_MEMALLOC sockets only
> +      *  - stay away from userspace
> +      *  - have bounded memory usage
> +      *
> +      * Use PF_MEMALLOC as a poor mans memory pool - the grouping kind.
> +      * This saves us from propagating the allocation context down to all
> +      * allocation sites.
> +      */
> +     if (skb_emergency(skb))
> +             current->flags |= PF_MEMALLOC;
>  
>       /* if we've gotten here through NAPI, check netpoll */
>       if (netpoll_receive_skb(skb))
> -             return NET_RX_DROP;
> +             goto out;

Why the change? doesn't gcc optimize the common exit case anyway?

>  
>       if (!skb->tstamp.tv64)
>               net_timestamp(skb);
> @@ -1990,7 +2003,7 @@ int netif_receive_skb(struct sk_buff *sk
>       orig_dev = skb_bond(skb);
>  
>       if (!orig_dev)
> -             return NET_RX_DROP;
> +             goto out;
>  
>       __get_cpu_var(netdev_rx_stat).total++;
>  
> @@ -2009,6 +2022,9 @@ int netif_receive_skb(struct sk_buff *sk
>       }
>  #endif
>  
> +     if (skb_emergency(skb))
> +             goto skip_taps;
> +
>       list_for_each_entry_rcu(ptype, &ptype_all, list) {
>               if (!ptype->dev || ptype->dev == skb->dev) {
>                       if (pt_prev)
> @@ -2017,6 +2033,7 @@ int netif_receive_skb(struct sk_buff *sk
>               }
>       }
>  
> +skip_taps:
>  #ifdef CONFIG_NET_CLS_ACT
>       if (pt_prev) {
>               ret = deliver_skb(skb, pt_prev, orig_dev);
> @@ -2029,19 +2046,31 @@ int netif_receive_skb(struct sk_buff *sk
>  
>       if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
>               kfree_skb(skb);
> -             goto out;
> +             goto unlock;
>       }
>  
>       skb->tc_verd = 0;
>  ncls:
>  #endif
>  
> +     if (skb_emergency(skb))
> +             switch(skb->protocol) {
> +                     case __constant_htons(ETH_P_ARP):
> +                     case __constant_htons(ETH_P_IP):
> +                     case __constant_htons(ETH_P_IPV6):
> +                     case __constant_htons(ETH_P_8021Q):
> +                             break;

Indentation is wrong, and hard coding protocol values as spcial case
seems bad here. What about vlan's, etc?

> +                     default:
> +                             goto drop;
> +             }
> +
>       skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
>       if (!skb)
> -             goto out;
> +             goto unlock;
>       skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
>       if (!skb)
> -             goto out;
> +             goto unlock;
>  
>       type = skb->protocol;
>       list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
> @@ -2056,6 +2085,7 @@ ncls:
>       if (pt_prev) {
>               ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
>       } else {
> +drop:
>               kfree_skb(skb);
>               /* Jamal, now you will not able to escape explaining
>                * me how you were going to use this. :-)
> @@ -2063,8 +2093,10 @@ ncls:
>               ret = NET_RX_DROP;
>       }
>  
> -out:
> +unlock:
>       rcu_read_unlock();
> +out:
> +     tsk_restore_flags(current, pflags, PF_MEMALLOC);
>       return ret;
>  }
>  
> Index: linux-2.6/include/net/sock.h
> ===================================================================
> --- linux-2.6.orig/include/net/sock.h
> +++ linux-2.6/include/net/sock.h
> @@ -523,8 +523,13 @@ static inline void sk_add_backlog(struct
>       skb->next = NULL;
>  }
>  
> +extern int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb);
> +
>  static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
>  {
> +     if (skb_emergency(skb))
> +             return __sk_backlog_rcv(sk, skb);
> +
>       return sk->sk_backlog_rcv(sk, skb);
>  }
>  
> Index: linux-2.6/net/core/sock.c
> ===================================================================
> --- linux-2.6.orig/net/core/sock.c
> +++ linux-2.6/net/core/sock.c
> @@ -319,6 +319,24 @@ int sk_clear_memalloc(struct sock *sk)
>  }
>  EXPORT_SYMBOL_GPL(sk_clear_memalloc);
>  
> +#ifdef CONFIG_NETVM
> +int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
> +{
> +     int ret;
> +     unsigned long pflags = current->flags;
> +
> +     /* these should have been dropped before queueing */
> +     BUG_ON(!sk_has_memalloc(sk));
> +
> +     current->flags |= PF_MEMALLOC;
> +     ret = sk->sk_backlog_rcv(sk, skb);
> +     tsk_restore_flags(current, pflags, PF_MEMALLOC);
> +
> +     return ret;
> +}
> +EXPORT_SYMBOL(__sk_backlog_rcv);
> +#endif
> +
>  static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
>  {
>       struct timeval tv;


I am still not convinced that this solves the problem well enough
to be useful.  Can you really survive a heavy memory overcommit?
In other words, can you prove that the added complexity causes the system
to survive a real test where otherwise it would not?


-- 
Stephen Hemminger <[EMAIL PROTECTED]>
-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to