On Thu, May 31, 2007 at 03:22:27PM -0700, David Miller wrote: > From: Frederik Deweerdt <[EMAIL PROTECTED]> > Date: Fri, 11 May 2007 17:00:14 +0200 > > > I'm seeing an Oops[1] with a 2.6.19.2 kernel: > > Frederik, I finally was able to spend some quality time on > this issue today. Sorry for taking so long.
Thank you for handling this David. I going to be offline for two weeks, but I'll be able to report tests results after that. Regards, Frederik > I came up with a series of two patches, the first one makes > the locking easier to understand by fixing the unix_sock_*() > lock macro names. > > The second patch fixes this race conditon by properly holding > both socket locks, being careful to avoid deadlocks, and > checking for possible SOCK_DEAD state. > > -------------------- > commit d410b81b4eef2e4409f9c38ef201253fbbcc7d94 > Author: David S. Miller <[EMAIL PROTECTED]> > Date: Thu May 31 13:24:26 2007 -0700 > > [AF_UNIX]: Make socket locking much less confusing. > > The unix_state_*() locking macros imply that there is some > rwlock kind of thing going on, but the implementation is > actually a spinlock which makes the code more confusing than > it needs to be. > > So use plain unix_state_lock and unix_state_unlock. > > Signed-off-by: David S. Miller <[EMAIL PROTECTED]> > > diff --git a/include/net/af_unix.h b/include/net/af_unix.h > index c0398f5..65f49fd 100644 > --- a/include/net/af_unix.h > +++ b/include/net/af_unix.h > @@ -62,13 +62,11 @@ struct unix_skb_parms { > #define UNIXCREDS(skb) (&UNIXCB((skb)).creds) > #define UNIXSID(skb) (&UNIXCB((skb)).secid) > > -#define unix_state_rlock(s) spin_lock(&unix_sk(s)->lock) > -#define unix_state_runlock(s) spin_unlock(&unix_sk(s)->lock) > -#define unix_state_wlock(s) spin_lock(&unix_sk(s)->lock) > -#define unix_state_wlock_nested(s) \ > +#define unix_state_lock(s) spin_lock(&unix_sk(s)->lock) > +#define unix_state_unlock(s) spin_unlock(&unix_sk(s)->lock) > +#define unix_state_lock_nested(s) \ > spin_lock_nested(&unix_sk(s)->lock, \ > SINGLE_DEPTH_NESTING) > -#define unix_state_wunlock(s) spin_unlock(&unix_sk(s)->lock) > > #ifdef __KERNEL__ > /* The AF_UNIX socket */ > diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c > index fc12ba5..453ede8 100644 > --- a/net/unix/af_unix.c > +++ b/net/unix/af_unix.c > @@ -174,11 +174,11 @@ static struct sock *unix_peer_get(struct sock *s) > { > struct sock *peer; > > - unix_state_rlock(s); > + unix_state_lock(s); > peer = unix_peer(s); > if (peer) > sock_hold(peer); > - unix_state_runlock(s); > + unix_state_unlock(s); > return peer; > } > > @@ -369,7 +369,7 @@ static int unix_release_sock (struct sock *sk, int > embrion) > unix_remove_socket(sk); > > /* Clear state */ > - unix_state_wlock(sk); > + unix_state_lock(sk); > sock_orphan(sk); > sk->sk_shutdown = SHUTDOWN_MASK; > dentry = u->dentry; > @@ -378,7 +378,7 @@ static int unix_release_sock (struct sock *sk, int > embrion) > u->mnt = NULL; > state = sk->sk_state; > sk->sk_state = TCP_CLOSE; > - unix_state_wunlock(sk); > + unix_state_unlock(sk); > > wake_up_interruptible_all(&u->peer_wait); > > @@ -386,12 +386,12 @@ static int unix_release_sock (struct sock *sk, int > embrion) > > if (skpair!=NULL) { > if (sk->sk_type == SOCK_STREAM || sk->sk_type == > SOCK_SEQPACKET) { > - unix_state_wlock(skpair); > + unix_state_lock(skpair); > /* No more writes */ > skpair->sk_shutdown = SHUTDOWN_MASK; > if (!skb_queue_empty(&sk->sk_receive_queue) || embrion) > skpair->sk_err = ECONNRESET; > - unix_state_wunlock(skpair); > + unix_state_unlock(skpair); > skpair->sk_state_change(skpair); > read_lock(&skpair->sk_callback_lock); > sk_wake_async(skpair,1,POLL_HUP); > @@ -448,7 +448,7 @@ static int unix_listen(struct socket *sock, int backlog) > err = -EINVAL; > if (!u->addr) > goto out; /* No listens on an unbound > socket */ > - unix_state_wlock(sk); > + unix_state_lock(sk); > if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) > goto out_unlock; > if (backlog > sk->sk_max_ack_backlog) > @@ -462,7 +462,7 @@ static int unix_listen(struct socket *sock, int backlog) > err = 0; > > out_unlock: > - unix_state_wunlock(sk); > + unix_state_unlock(sk); > out: > return err; > } > @@ -881,7 +881,7 @@ static int unix_dgram_connect(struct socket *sock, struct > sockaddr *addr, > if (!other) > goto out; > > - unix_state_wlock(sk); > + unix_state_lock(sk); > > err = -EPERM; > if (!unix_may_send(sk, other)) > @@ -896,7 +896,7 @@ static int unix_dgram_connect(struct socket *sock, struct > sockaddr *addr, > * 1003.1g breaking connected state with AF_UNSPEC > */ > other = NULL; > - unix_state_wlock(sk); > + unix_state_lock(sk); > } > > /* > @@ -905,19 +905,19 @@ static int unix_dgram_connect(struct socket *sock, > struct sockaddr *addr, > if (unix_peer(sk)) { > struct sock *old_peer = unix_peer(sk); > unix_peer(sk)=other; > - unix_state_wunlock(sk); > + unix_state_unlock(sk); > > if (other != old_peer) > unix_dgram_disconnected(sk, old_peer); > sock_put(old_peer); > } else { > unix_peer(sk)=other; > - unix_state_wunlock(sk); > + unix_state_unlock(sk); > } > return 0; > > out_unlock: > - unix_state_wunlock(sk); > + unix_state_unlock(sk); > sock_put(other); > out: > return err; > @@ -936,7 +936,7 @@ static long unix_wait_for_peer(struct sock *other, long > timeo) > (skb_queue_len(&other->sk_receive_queue) > > other->sk_max_ack_backlog); > > - unix_state_runlock(other); > + unix_state_unlock(other); > > if (sched) > timeo = schedule_timeout(timeo); > @@ -994,11 +994,11 @@ restart: > goto out; > > /* Latch state of peer */ > - unix_state_rlock(other); > + unix_state_lock(other); > > /* Apparently VFS overslept socket death. Retry. */ > if (sock_flag(other, SOCK_DEAD)) { > - unix_state_runlock(other); > + unix_state_unlock(other); > sock_put(other); > goto restart; > } > @@ -1048,18 +1048,18 @@ restart: > goto out_unlock; > } > > - unix_state_wlock_nested(sk); > + unix_state_lock_nested(sk); > > if (sk->sk_state != st) { > - unix_state_wunlock(sk); > - unix_state_runlock(other); > + unix_state_unlock(sk); > + unix_state_unlock(other); > sock_put(other); > goto restart; > } > > err = security_unix_stream_connect(sock, other->sk_socket, newsk); > if (err) { > - unix_state_wunlock(sk); > + unix_state_unlock(sk); > goto out_unlock; > } > > @@ -1096,7 +1096,7 @@ restart: > smp_mb__after_atomic_inc(); /* sock_hold() does an atomic_inc() */ > unix_peer(sk) = newsk; > > - unix_state_wunlock(sk); > + unix_state_unlock(sk); > > /* take ten and and send info to listening sock */ > spin_lock(&other->sk_receive_queue.lock); > @@ -1105,14 +1105,14 @@ restart: > * is installed to listening socket. */ > atomic_inc(&newu->inflight); > spin_unlock(&other->sk_receive_queue.lock); > - unix_state_runlock(other); > + unix_state_unlock(other); > other->sk_data_ready(other, 0); > sock_put(other); > return 0; > > out_unlock: > if (other) > - unix_state_runlock(other); > + unix_state_unlock(other); > > out: > if (skb) > @@ -1178,10 +1178,10 @@ static int unix_accept(struct socket *sock, struct > socket *newsock, int flags) > wake_up_interruptible(&unix_sk(sk)->peer_wait); > > /* attach accepted sock to socket */ > - unix_state_wlock(tsk); > + unix_state_lock(tsk); > newsock->state = SS_CONNECTED; > sock_graft(tsk, newsock); > - unix_state_wunlock(tsk); > + unix_state_unlock(tsk); > return 0; > > out: > @@ -1208,7 +1208,7 @@ static int unix_getname(struct socket *sock, struct > sockaddr *uaddr, int *uaddr_ > } > > u = unix_sk(sk); > - unix_state_rlock(sk); > + unix_state_lock(sk); > if (!u->addr) { > sunaddr->sun_family = AF_UNIX; > sunaddr->sun_path[0] = 0; > @@ -1219,7 +1219,7 @@ static int unix_getname(struct socket *sock, struct > sockaddr *uaddr, int *uaddr_ > *uaddr_len = addr->len; > memcpy(sunaddr, addr->name, *uaddr_len); > } > - unix_state_runlock(sk); > + unix_state_unlock(sk); > sock_put(sk); > out: > return err; > @@ -1337,7 +1337,7 @@ restart: > goto out_free; > } > > - unix_state_rlock(other); > + unix_state_lock(other); > err = -EPERM; > if (!unix_may_send(sk, other)) > goto out_unlock; > @@ -1347,20 +1347,20 @@ restart: > * Check with 1003.1g - what should > * datagram error > */ > - unix_state_runlock(other); > + unix_state_unlock(other); > sock_put(other); > > err = 0; > - unix_state_wlock(sk); > + unix_state_lock(sk); > if (unix_peer(sk) == other) { > unix_peer(sk)=NULL; > - unix_state_wunlock(sk); > + unix_state_unlock(sk); > > unix_dgram_disconnected(sk, other); > sock_put(other); > err = -ECONNREFUSED; > } else { > - unix_state_wunlock(sk); > + unix_state_unlock(sk); > } > > other = NULL; > @@ -1397,14 +1397,14 @@ restart: > } > > skb_queue_tail(&other->sk_receive_queue, skb); > - unix_state_runlock(other); > + unix_state_unlock(other); > other->sk_data_ready(other, len); > sock_put(other); > scm_destroy(siocb->scm); > return len; > > out_unlock: > - unix_state_runlock(other); > + unix_state_unlock(other); > out_free: > kfree_skb(skb); > out: > @@ -1494,14 +1494,14 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, > struct socket *sock, > goto out_err; > } > > - unix_state_rlock(other); > + unix_state_lock(other); > > if (sock_flag(other, SOCK_DEAD) || > (other->sk_shutdown & RCV_SHUTDOWN)) > goto pipe_err_free; > > skb_queue_tail(&other->sk_receive_queue, skb); > - unix_state_runlock(other); > + unix_state_unlock(other); > other->sk_data_ready(other, size); > sent+=size; > } > @@ -1512,7 +1512,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, > struct socket *sock, > return sent; > > pipe_err_free: > - unix_state_runlock(other); > + unix_state_unlock(other); > kfree_skb(skb); > pipe_err: > if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL)) > @@ -1641,7 +1641,7 @@ static long unix_stream_data_wait(struct sock * sk, > long timeo) > { > DEFINE_WAIT(wait); > > - unix_state_rlock(sk); > + unix_state_lock(sk); > > for (;;) { > prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); > @@ -1654,14 +1654,14 @@ static long unix_stream_data_wait(struct sock * sk, > long timeo) > break; > > set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); > - unix_state_runlock(sk); > + unix_state_unlock(sk); > timeo = schedule_timeout(timeo); > - unix_state_rlock(sk); > + unix_state_lock(sk); > clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); > } > > finish_wait(sk->sk_sleep, &wait); > - unix_state_runlock(sk); > + unix_state_unlock(sk); > return timeo; > } > > @@ -1816,12 +1816,12 @@ static int unix_shutdown(struct socket *sock, int > mode) > mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN); > > if (mode) { > - unix_state_wlock(sk); > + unix_state_lock(sk); > sk->sk_shutdown |= mode; > other=unix_peer(sk); > if (other) > sock_hold(other); > - unix_state_wunlock(sk); > + unix_state_unlock(sk); > sk->sk_state_change(sk); > > if (other && > @@ -1833,9 +1833,9 @@ static int unix_shutdown(struct socket *sock, int mode) > peer_mode |= SEND_SHUTDOWN; > if (mode&SEND_SHUTDOWN) > peer_mode |= RCV_SHUTDOWN; > - unix_state_wlock(other); > + unix_state_lock(other); > other->sk_shutdown |= peer_mode; > - unix_state_wunlock(other); > + unix_state_unlock(other); > other->sk_state_change(other); > read_lock(&other->sk_callback_lock); > if (peer_mode == SHUTDOWN_MASK) > @@ -1973,7 +1973,7 @@ static int unix_seq_show(struct seq_file *seq, void *v) > else { > struct sock *s = v; > struct unix_sock *u = unix_sk(s); > - unix_state_rlock(s); > + unix_state_lock(s); > > seq_printf(seq, "%p: %08X %08X %08X %04X %02X %5lu", > s, > @@ -2001,7 +2001,7 @@ static int unix_seq_show(struct seq_file *seq, void *v) > for ( ; i < len; i++) > seq_putc(seq, u->addr->name->sun_path[i]); > } > - unix_state_runlock(s); > + unix_state_unlock(s); > seq_putc(seq, '\n'); > } > > -------------------- > commit 19fec3e807a487415e77113cb9dbdaa2da739836 > Author: David S. Miller <[EMAIL PROTECTED]> > Date: Thu May 31 15:19:20 2007 -0700 > > [AF_UNIX]: Fix datagram connect race causing an OOPS. > > Based upon an excellent bug report and initial patch by > Frederik Deweerdt. > > The UNIX datagram connect code blindly dereferences other->sk_socket > via the call down to the security_unix_may_send() function. > > Without locking 'other' that pointer can go NULL via unix_release_sock() > which does sock_orphan() which also marks the socket SOCK_DEAD. > > So we have to lock both 'sk' and 'other' yet avoid all kinds of > potential deadlocks (connect to self is OK for datagram sockets and it > is possible for two datagram sockets to perform a simultaneous connect > to each other). So what we do is have a "double lock" function similar > to how we handle this situation in other areas of the kernel. We take > the lock of the socket pointer with the smallest address first in > order to avoid ABBA style deadlocks. > > Once we have them both locked, we check to see if SOCK_DEAD is set > for 'other' and if so, drop everything and retry the lookup. > > Signed-off-by: David S. Miller <[EMAIL PROTECTED]> > > diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c > index 453ede8..87c794d 100644 > --- a/net/unix/af_unix.c > +++ b/net/unix/af_unix.c > @@ -858,6 +858,31 @@ out_mknod_parent: > goto out_up; > } > > +static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) > +{ > + if (unlikely(sk1 == sk2) || !sk2) { > + unix_state_lock(sk1); > + return; > + } > + if (sk1 < sk2) { > + unix_state_lock(sk1); > + unix_state_lock_nested(sk2); > + } else { > + unix_state_lock(sk2); > + unix_state_lock_nested(sk1); > + } > +} > + > +static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) > +{ > + if (unlikely(sk1 == sk2) || !sk2) { > + unix_state_unlock(sk1); > + return; > + } > + unix_state_unlock(sk1); > + unix_state_unlock(sk2); > +} > + > static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, > int alen, int flags) > { > @@ -877,11 +902,19 @@ static int unix_dgram_connect(struct socket *sock, > struct sockaddr *addr, > !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0) > goto out; > > +restart: > other=unix_find_other(sunaddr, alen, sock->type, hash, &err); > if (!other) > goto out; > > - unix_state_lock(sk); > + unix_state_double_lock(sk, other); > + > + /* Apparently VFS overslept socket death. Retry. */ > + if (sock_flag(other, SOCK_DEAD)) { > + unix_state_double_unlock(sk, other); > + sock_put(other); > + goto restart; > + } > > err = -EPERM; > if (!unix_may_send(sk, other)) > @@ -896,7 +929,7 @@ static int unix_dgram_connect(struct socket *sock, struct > sockaddr *addr, > * 1003.1g breaking connected state with AF_UNSPEC > */ > other = NULL; > - unix_state_lock(sk); > + unix_state_double_lock(sk, other); > } > > /* > @@ -905,19 +938,19 @@ static int unix_dgram_connect(struct socket *sock, > struct sockaddr *addr, > if (unix_peer(sk)) { > struct sock *old_peer = unix_peer(sk); > unix_peer(sk)=other; > - unix_state_unlock(sk); > + unix_state_double_unlock(sk, other); > > if (other != old_peer) > unix_dgram_disconnected(sk, old_peer); > sock_put(old_peer); > } else { > unix_peer(sk)=other; > - unix_state_unlock(sk); > + unix_state_double_unlock(sk, other); > } > return 0; > > out_unlock: > - unix_state_unlock(sk); > + unix_state_double_unlock(sk, other); > sock_put(other); > out: > return err; > -------------------- > - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html