As many of you have noticed I've been working on cleanups
and simplifications to net/ipv6/addrconf.c so that we can
move over to RCU locking for the inet6_dev management of
the IPV6 stack.

The following patch implements a major hurdle in implementing such
changes.  Unlike ipv4, ipv6 can have device addresses added or removed
in response to packet reception.  So, ipv6 currently adds and removes
device addresses in software interrupt context, which is racy and
error prone.  That's without also considering the possible bad
interactions and races with userspace initiated RTNL messages.

So what I've done is move these operations into base context using
workqueues.  Packet initiated events simply schedule a workqueue
event, and the actual processing occurs in the workqueue.

With this in place, moving over to RCU and removing in6_dev->lock
is a pretty straightforward task.

Patch is against the current net-2.6.16 tree.

Comments?

diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 8bb373e..5f18824 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -147,9 +147,10 @@ static DEFINE_RWLOCK(addrconf_hash_lock)
 /* Protects inet6 devices */
 DEFINE_RWLOCK(addrconf_lock);
 
-static void addrconf_verify(unsigned long);
+static void addrconf_verify(void);
+static void addrconf_verify_timer(unsigned long);
 
-static DEFINE_TIMER(addr_chk_timer, addrconf_verify, 0, 0);
+static DEFINE_TIMER(addr_chk_timer, addrconf_verify_timer, 0, 0);
 static DEFINE_SPINLOCK(addrconf_verify_lock);
 
 static void addrconf_join_anycast(struct inet6_ifaddr *ifp);
@@ -606,6 +607,8 @@ ipv6_add_addr(struct inet6_dev *idev, co
        struct inet6_ifaddr *ifa;
        int err;
 
+       ASSERT_RTNL();
+
        read_lock_bh(&addrconf_lock);
 
        err = -ENODEV;
@@ -786,6 +789,8 @@ static void ipv6_del_addr(struct inet6_i
        unsigned long expires;
        int onlink;
 
+       ASSERT_RTNL();
+
        ifp->dead = 1;
 
        purge_from_hashes(ifp);
@@ -1309,18 +1314,22 @@ int ipv6_rcv_saddr_equal(const struct so
 
 /* Gets referenced address, destroys ifaddr */
 
-void addrconf_dad_failure(struct inet6_ifaddr *ifp)
+static void do_addrconf_dad_failure(struct inet6_ifaddr *ifp)
 {
+       ASSERT_RTNL();
+
        if (net_ratelimit())
-               printk(KERN_INFO "%s: duplicate address detected!\n", ifp->idev->dev->name);
-       if (ifp->flags&IFA_F_PERMANENT) {
+               printk(KERN_INFO "%s: duplicate address detected!\n",
+                      ifp->idev->dev->name);
+
+       if (ifp->flags & IFA_F_PERMANENT) {
                spin_lock_bh(&ifp->lock);
                addrconf_del_timer(ifp);
                ifp->flags |= IFA_F_TENTATIVE;
                spin_unlock_bh(&ifp->lock);
                in6_ifa_put(ifp);
 #ifdef CONFIG_IPV6_PRIVACY
-       } else if (ifp->flags&IFA_F_TEMPORARY) {
+       } else if (ifp->flags & IFA_F_TEMPORARY) {
                struct inet6_ifaddr *ifpub;
                spin_lock_bh(&ifp->lock);
                ifpub = ifp->ifpub;
@@ -1338,7 +1347,6 @@ void addrconf_dad_failure(struct inet6_i
                ipv6_del_addr(ifp);
 }
 
-
 /* Join to solicited addr multicast group. */
 
 void addrconf_join_solict(struct net_device *dev, struct in6_addr *addr)
@@ -1823,32 +1831,17 @@ static void prefix_update_temp_lifetimes
 #endif
 }
 
-void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
+static void prefix_rcv(struct inet6_dev *in6_dev, struct ipv6_prefix_info *pinfo)
 {
-       struct ipv6_prefix_info *pinfo;
+       struct net_device *dev = in6_dev->dev;
        __u32 valid_lft;
        __u32 prefered_lft;
-       int addr_type;
-       struct inet6_dev *in6_dev;
 
-       if (unlikely(len < sizeof(struct ipv6_prefix_info)))
-               return;
-       
-       pinfo = (struct ipv6_prefix_info *) opt;
-       addr_type = ipv6_addr_type(&pinfo->prefix);
-       if (unlikely(addr_type & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)))
-               return;
-       
+       ASSERT_RTNL();
+
        valid_lft = ntohl(pinfo->valid);
        prefered_lft = ntohl(pinfo->prefered);
 
-       if (unlikely(prefered_lft > valid_lft))
-               goto out_invalid_lifetime;
-
-       in6_dev = in6_dev_get(dev);
-       if (unlikely(!in6_dev))
-               goto out_nodev;
-
        prefix_add_route(pinfo, dev, valid_lft);
 
        /* Try to figure out our local address for this prefix */
@@ -1872,26 +1865,193 @@ void addrconf_prefix_rcv(struct net_devi
                                                     prefered_lft, create);
 
                        in6_ifa_put(ifp);
-                       addrconf_verify(0);
+                       addrconf_verify();
                }
        }
        inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo);
 
 out_put_in6_dev:
        in6_dev_put(in6_dev);
-       return;
+}
 
-out_invalid_lifetime:
-       if (net_ratelimit())
-               printk(KERN_WARNING "addrconf: prefix option "
-                      "has invalid lifetime\n");
-       return;
+/* Many operations execute asynchronously, during packet receive,
+ * and end up performing major device state changes (addition and
+ * removal of interface addresses, for example).  This is extremely
+ * race prone.  The simplest thing to do is to get the work out of
+ * software interrupt context, via a workqueue.  This way, we can
+ * hold the RTNL semaphore and thus properly synchronize with the
+ * rest of the system.
+ *
+ * When async jobs are queued, references must be held to the
+ * relevant objects (inet6_ifaddr, inet6_dev, etc.) and released
+ * by the job implementation.  The caller of queue_async_job() must
+ * also release these references on failure.
+ */
 
-out_nodev:
-       if (net_ratelimit())
-               printk(KERN_DEBUG "addrconf: device %s not "
-                      "configured\n", dev->name);
-       return;
+enum async_job_type {
+       addrconf_vrfy,
+       dad_failure,
+       prefix_rx,
+};
+
+struct addrconf_async_job {
+       struct list_head        list;
+       enum async_job_type     type;
+       union {
+               struct {
+                       struct inet6_ifaddr *ifp;
+               } dad;
+               struct {
+                       struct inet6_dev *idev;
+                       struct ipv6_prefix_info pinfo;
+               } prefix;
+       } data;
+};
+
+static void run_async_jobs(void *dummy);
+static DECLARE_WORK(async_job_work, run_async_jobs, NULL);
+
+static DEFINE_SPINLOCK(async_job_lock);
+static LIST_HEAD(async_job_list);      /* pending jobs, guarded by async_job_lock */
+
+static int queue_async_job(enum async_job_type type, struct inet6_ifaddr *ifp,
+                          struct inet6_dev *idev,
+                          struct ipv6_prefix_info *pinfo)
+{
+       struct addrconf_async_job *job;
+       int err;
+
+       err = -ENOMEM;
+       job = kmalloc(sizeof(struct addrconf_async_job), GFP_ATOMIC);
+       if (unlikely(!job))
+               goto out_err;
+
+       memset(job, 0, sizeof(struct addrconf_async_job));
+
+       job->type = type;
+
+       err = -EINVAL;
+       switch (type) {
+       case addrconf_vrfy:
+               if (unlikely(ifp || idev || pinfo))
+                       goto out_err_free;
+               break;
+
+       case dad_failure:
+               if (unlikely(!ifp || idev || pinfo))
+                       goto out_err_free;
+               job->data.dad.ifp = ifp;
+               break;
+
+       case prefix_rx:
+               if (unlikely(ifp || !idev || !pinfo))
+                       goto out_err_free;
+               job->data.prefix.idev = idev;
+               memcpy(&job->data.prefix.pinfo, pinfo, sizeof(*pinfo));
+               break;
+       }
+
+       spin_lock_bh(&async_job_lock);
+       list_add_tail(&job->list, &async_job_list);
+       spin_unlock_bh(&async_job_lock);
+
+       schedule_work(&async_job_work);
+
+       return 0;
+
+out_err_free:
+       kfree(job);
+
+out_err:
+       return err;
+}
+
+static void process_one_async_job(struct addrconf_async_job *job)
+{
+       ASSERT_RTNL();
+
+       switch (job->type) {
+       case addrconf_vrfy:
+               addrconf_verify();
+               break;
+
+       case dad_failure:
+               do_addrconf_dad_failure(job->data.dad.ifp);
+               break;
+
+       case prefix_rx:
+               prefix_rcv(job->data.prefix.idev,
+                          &job->data.prefix.pinfo);
+               break;
+       }
+}
+
+static void run_async_jobs(void *dummy)
+{
+       struct addrconf_async_job *job, *tmp;
+       LIST_HEAD(pending);
+
+       /* Detach the entire pending queue in one shot so the
+        * job lock is held only briefly. */
+       spin_lock_bh(&async_job_lock);
+       list_splice_init(&async_job_list, &pending);
+       spin_unlock_bh(&async_job_lock);
+
+       /* Process in base context under the RTNL semaphore. */
+       rtnl_shlock();
+
+       list_for_each_entry_safe(job, tmp, &pending, list) {
+               process_one_async_job(job);
+               kfree(job);
+       }
+
+       rtnl_shunlock();
+}
+
+static void addrconf_verify_timer(unsigned long dummy)
+{
+       if (queue_async_job(addrconf_vrfy, NULL, NULL, NULL) < 0) {
+               spin_lock_bh(&addrconf_verify_lock);
+               mod_timer(&addr_chk_timer, jiffies + HZ); /* OOM: retry soon */
+               spin_unlock_bh(&addrconf_verify_lock);
+       }
+}
+
+void addrconf_dad_failure(struct inet6_ifaddr *ifp)
+{
+       /* The caller's 'ifp' reference travels with the queued job. */
+       if (queue_async_job(dad_failure, ifp, NULL, NULL) != 0)
+               in6_ifa_put(ifp);
+}
+
+void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len)
+{
+       struct ipv6_prefix_info *pinfo;
+       __u32 valid_lft;
+       __u32 prefered_lft;
+       int addr_type;
+       struct inet6_dev *in6_dev;
+
+       if (unlikely(len < sizeof(struct ipv6_prefix_info)))
+               return;
+
+       pinfo = (struct ipv6_prefix_info *) opt;
+       addr_type = ipv6_addr_type(&pinfo->prefix);
+       if (unlikely(addr_type & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)))
+               return;
+
+       valid_lft = ntohl(pinfo->valid);
+       prefered_lft = ntohl(pinfo->prefered);
+
+       if (unlikely(prefered_lft > valid_lft))
+               return;
+
+       in6_dev = in6_dev_get(dev);
+       if (unlikely(!in6_dev))
+               return;
+
+       if (queue_async_job(prefix_rx, NULL, in6_dev, pinfo) < 0)
+               in6_dev_put(in6_dev); /* job not queued; drop our ref */
+}
 
 /*
@@ -2742,18 +2902,19 @@ void if6_proc_exit(void)
  *     Periodic address status verification
  */
 
-static void addrconf_verify(unsigned long foo)
+static void addrconf_verify(void)
 {
        struct inet6_ifaddr *ifp;
        unsigned long now, next;
        int i;
 
+       ASSERT_RTNL();
+
        spin_lock_bh(&addrconf_verify_lock);
+
        now = jiffies;
        next = now + ADDR_CHECK_FREQUENCY;
 
-       del_timer(&addr_chk_timer);
-
        for (i=0; i < IN6_ADDR_HSIZE; i++) {
 
 restart:
@@ -2836,8 +2997,9 @@ restart:
                read_unlock(&addrconf_hash_lock);
        }
 
-       addr_chk_timer.expires = time_before(next, jiffies + HZ) ? jiffies + HZ : next;
-       add_timer(&addr_chk_timer);
+       mod_timer(&addr_chk_timer,
+                 time_before(next, jiffies + HZ) ? jiffies + HZ : next);
+
        spin_unlock_bh(&addrconf_verify_lock);
 }
 
@@ -3798,7 +3960,10 @@ int __init addrconf_init(void)
                        "failed to load transform for md5\n");
 #endif
 
-       addrconf_verify(0);
+       rtnl_shlock();
+       addrconf_verify();
+       rtnl_shunlock();
+
        rtnetlink_links[PF_INET6] = inet6_rtnetlink_table;
 #ifdef CONFIG_SYSCTL
        addrconf_sysctl.sysctl_header =
@@ -3856,9 +4021,15 @@ void __exit addrconf_cleanup(void)
        }
        write_unlock_bh(&addrconf_hash_lock);
 
+       rtnl_unlock();
+
+       /* flush_scheduled_work() sleeps: run it before taking the lock. */
+       flush_scheduled_work();
+
+       spin_lock_bh(&addrconf_verify_lock);
        del_timer(&addr_chk_timer);
 
-       rtnl_unlock();
+       spin_unlock_bh(&addrconf_verify_lock);
 
 #ifdef CONFIG_IPV6_PRIVACY
        crypto_free_tfm(md5_tfm);

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to