Patrick McHardy wrote:
> Mattia Dongili wrote:
> 
>>On Mon, Aug 01, 2005 at 04:27:53PM +0200, Patrick McHardy wrote:
>>
>>>>--- include/linux/netfilter_ipv4/ip_conntrack.h.clean	2005-08-01 15:09:49.000000000 +0200
>>>>+++ include/linux/netfilter_ipv4/ip_conntrack.h	2005-08-01 15:08:52.000000000 +0200
>>>>@@ -298,6 +298,7 @@
>>>> static inline struct ip_conntrack *
>>>> ip_conntrack_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
>>>> {
>>>> 	*ctinfo = skb->nfctinfo;
>>>>+	nf_conntrack_get(skb->nfct);
>>>> 	return (struct ip_conntrack *)skb->nfct;
>>>> }
>>>
>>>This creates lots of refcnt leaks, which is probably why it makes the
>>>underflow go away :) Please try this patch instead.
>>
>>this doesn't fix it actually, see dmesg below:
> 
> It looks like ip_ct_iterate_cleanup and ip_conntrack_event_cache_init
> race against each other with assigning pointers and grabbing/putting
> the refcounts if called from different contexts.
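To make that concrete: the per-cpu event cache holds a reference on the
conntrack it points to, so if the two paths interleave, the same cached
reference can end up being put twice (or a freshly grabbed one leaked).
A rough userspace analogue of the double-put case (just an illustration;
none of these names are real kernel symbols):

#include <stdio.h>

/* fake_ct stands in for struct ip_conntrack; use_count models the
 * reference the per-cpu cache slot is holding on it. */
struct fake_ct { int use_count; };

int main(void)
{
	struct fake_ct ct = { .use_count = 1 };	/* ref held by the cache */
	struct fake_ct *cached = &ct;		/* ecache->ct */

	/* softirq path: delivers cached events and puts the reference */
	cached->use_count--;

	/* process context (ip_ct_iterate_cleanup) read the cached
	 * pointer before it was cleared and puts it a second time: */
	cached->use_count--;

	if (ct.use_count < 0)
		printf("refcount underflow: %d\n", ct.use_count);
	return 0;
}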
This should be a first step towards fixing it. It's probably incomplete
(I'm too tired to check it now), but it should fix the problem you're
seeing. Could you give it a spin?

BTW, ip_ct_iterate_cleanup can only be called from ipt_MASQUERADE when a
device goes down. It seems a bit odd that this is happening on boot, is
there anything special about your setup?

Regards
Patrick
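(For reference, the caller in question is the netdevice notifier in
ipt_MASQUERADE; from memory it looks roughly like this -- simplified,
excerpt only, and not part of the patch below:)

static int device_cmp(struct ip_conntrack *ct, void *ifindex)
{
	return ct->nat.masq_index == (int)(long)ifindex;
}

static int masq_device_event(struct notifier_block *this,
			     unsigned long event, void *ptr)
{
	const struct net_device *dev = ptr;

	if (event == NETDEV_DOWN)
		/* runs in process context, so it can interleave with
		 * the softirq packet path on the same cpu */
		ip_ct_iterate_cleanup(device_cmp,
				      (void *)(long)dev->ifindex);
	return NOTIFY_DONE;
}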
diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -411,6 +411,7 @@ struct ip_conntrack_stat
 
 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
 #include <linux/notifier.h>
+#include <linux/interrupt.h>
 
 struct ip_conntrack_ecache {
 	struct ip_conntrack *ct;
@@ -445,26 +446,25 @@ ip_conntrack_expect_unregister_notifier(
 	return notifier_chain_unregister(&ip_conntrack_expect_chain, nb);
 }
 
+extern void ip_conntrack_event_cache_init(struct ip_conntrack *ct);
+extern void
+ip_conntrack_deliver_cached_events_for(const struct ip_conntrack *ct);
+
 static inline void
 ip_conntrack_event_cache(enum ip_conntrack_events event,
			 const struct sk_buff *skb)
 {
-	struct ip_conntrack_ecache *ecache =
-			&__get_cpu_var(ip_conntrack_ecache);
-
-	if (unlikely((struct ip_conntrack *) skb->nfct != ecache->ct)) {
-		if (net_ratelimit()) {
-			printk(KERN_ERR "ctevent: skb->ct != ecache->ct !!!\n");
-			dump_stack();
-		}
-	}
+	struct ip_conntrack *ct = (struct ip_conntrack *)skb->nfct;
+	struct ip_conntrack_ecache *ecache;
+
+	local_bh_disable();
+	ecache = &__get_cpu_var(ip_conntrack_ecache);
+	if (unlikely(ct != ecache->ct))
+		ip_conntrack_event_cache_init(ct);
 	ecache->events |= event;
+	local_bh_enable();
 }
 
-extern void
-ip_conntrack_deliver_cached_events_for(const struct ip_conntrack *ct);
-extern void ip_conntrack_event_cache_init(const struct sk_buff *skb);
-
 static inline void ip_conntrack_event(enum ip_conntrack_events event,
				      struct ip_conntrack *ct)
 {
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -100,56 +100,45 @@ void __ip_ct_deliver_cached_events(struc
 
 /* Deliver all cached events for a particular conntrack. This is called
  * by code prior to async packet handling or freeing the skb */
-void
-ip_conntrack_deliver_cached_events_for(const struct ip_conntrack *ct)
+void ip_conntrack_deliver_cached_events_for(const struct ip_conntrack *ct)
 {
-	struct ip_conntrack_ecache *ecache =
-			&__get_cpu_var(ip_conntrack_ecache);
-
-	if (!ct)
-		return;
-
+	struct ip_conntrack_ecache *ecache;
+
+	local_bh_disable();
+	ecache = &__get_cpu_var(ip_conntrack_ecache);
 	if (ecache->ct == ct) {
 		DEBUGP("ecache: delivering event for %p\n", ct);
 		__deliver_cached_events(ecache);
 	} else {
-		if (net_ratelimit())
-			printk(KERN_WARNING "ecache: want to deliver for %p, "
-			       "but cache has %p\n", ct, ecache->ct);
+		DEBUGP("ecache: already delivered for %p, cache %p",
+		       ct, ecache->ct);
 	}
-
-	/* signalize that events have already been delivered */
 	ecache->ct = NULL;
+	local_bh_enable();
 }
 
 /* Deliver cached events for old pending events, if current conntrack != old */
-void ip_conntrack_event_cache_init(const struct sk_buff *skb)
+void ip_conntrack_event_cache_init(struct ip_conntrack *ct)
 {
-	struct ip_conntrack *ct = (struct ip_conntrack *) skb->nfct;
-	struct ip_conntrack_ecache *ecache =
-			&__get_cpu_var(ip_conntrack_ecache);
+	struct ip_conntrack_ecache *ecache;
 
 	/* take care of delivering potentially old events */
-	if (ecache->ct != ct) {
-		enum ip_conntrack_info ctinfo;
-		/* we have to check, since at startup the cache is NULL */
-		if (likely(ecache->ct)) {
-			DEBUGP("ecache: entered for different conntrack: "
-			       "ecache->ct=%p, skb->nfct=%p. delivering "
-			       "events\n", ecache->ct, ct);
-			__deliver_cached_events(ecache);
-			ip_conntrack_put(ecache->ct);
-		} else {
-			DEBUGP("ecache: entered for conntrack %p, "
-			       "cache was clean before\n", ct);
-		}
-
-		/* initialize for this conntrack/packet */
-		ecache->ct = ip_conntrack_get(skb, &ctinfo);
-		/* ecache->events cleared by __deliver_cached_devents() */
+	ecache = &__get_cpu_var(ip_conntrack_ecache);
+	BUG_ON(ecache->ct == ct);
+	if (ecache->ct) {
+		DEBUGP("ecache: entered for different conntrack: "
+		       "ecache->ct=%p, skb->nfct=%p. delivering "
+		       "events\n", ecache->ct, ct);
+		__deliver_cached_events(ecache);
+		ip_conntrack_put(ecache->ct);
 	} else {
-		DEBUGP("ecache: re-entered for conntrack %p.\n", ct);
+		DEBUGP("ecache: entered for conntrack %p, "
+		       "cache was clean before\n", ct);
 	}
+	/* initialize for this conntrack/packet */
+	ecache->ct = ct;
+	nf_conntrack_get(&ct->ct_general);
 }
 
 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
@@ -873,8 +862,6 @@ unsigned int ip_conntrack_in(unsigned in
 
 	IP_NF_ASSERT((*pskb)->nfct);
 
-	ip_conntrack_event_cache_init(*pskb);
-
 	ret = proto->packet(ct, *pskb, ctinfo);
 	if (ret < 0) {
 		/* Invalid: inverse of the return code tells
@@ -1279,15 +1266,18 @@ ip_ct_iterate_cleanup(int (*iter)(struct
 		/* we need to deliver all cached events in order to drop
 		 * the reference counts */
 		int cpu;
+
+		local_bh_disable();
 		for_each_cpu(cpu) {
 			struct ip_conntrack_ecache *ecache =
 				&per_cpu(ip_conntrack_ecache, cpu);
 			if (ecache->ct) {
-				__ip_ct_deliver_cached_events(ecache);
+				__deliver_cached_events(ecache);
 				ip_conntrack_put(ecache->ct);
 				ecache->ct = NULL;
 			}
 		}
+		local_bh_enable();
 	}
 #endif
 }
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -401,9 +401,13 @@ static unsigned int ip_confirm(unsigned
 			       const struct net_device *out,
 			       int (*okfn)(struct sk_buff *))
 {
-	ip_conntrack_event_cache_init(*pskb);
 	/* We've seen it coming out the other side: confirm it */
-	return ip_conntrack_confirm(pskb);
+	struct ip_conntrack *ct = (struct ip_conntrack *)(*pskb)->nfct;
+	unsigned int ret;
+
+	ret = ip_conntrack_confirm(pskb);
+	ip_conntrack_deliver_cached_events_for(ct);
+	return ret;
 }
 
 static unsigned int ip_conntrack_help(unsigned int hooknum,
@@ -419,7 +423,7 @@ static unsigned int ip_conntrack_help(un
 	ct = ip_conntrack_get(*pskb, &ctinfo);
 	if (ct && ct->helper) {
 		unsigned int ret;
-		ip_conntrack_event_cache_init(*pskb);
+		ip_conntrack_event_cache_init(ct);
 		ret = ct->helper->help(pskb, ct, ctinfo);
 		if (ret != NF_ACCEPT)
 			return ret;
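The pattern repeated in all three files is the usual one for per-cpu
data that is shared between process context and softirqs: disable
bottom halves around the whole read-modify sequence so the packet path
cannot interleave on the same cpu. Schematically (a sketch of the
pattern only; touch_ecache is a made-up name and not in the patch):

#include <linux/interrupt.h>
#include <linux/percpu.h>
#include <linux/netfilter_ipv4/ip_conntrack.h>

static void touch_ecache(void)
{
	struct ip_conntrack_ecache *ecache;

	local_bh_disable();	/* keep softirqs off this cpu */
	ecache = &__get_cpu_var(ip_conntrack_ecache);
	/* ecache->ct and ecache->events can now be read and updated
	 * without the packet path sneaking in between */
	local_bh_enable();	/* may immediately run pending softirqs */
}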