Looking at this bug: http://bugzilla.kernel.org/show_bug.cgi?id=9149
It exposes some rather deep issues in the filesystem/socket/inet/tcp layering. It seems that sys_close() zaps the file table entry, but since each thread has a separate reference, the actual tcp_close() doesn't happen until the last thread calls close() or exits. I am no VFS expert. The semantically correct fix appears to be complex:
* add a flush handler to the socket_file_ops
* propagate the flush through socket to inet and tcp
* split tcp_close into two parts:
  1) tcp_flush - flush buffers and wake up all other threads on the socket
  2) tcp_release - release the last reference and clean up.
Bogus patch for explanation purposes:
--- a/include/linux/net.h 2007-10-26 12:44:41.000000000 -0700 +++ b/include/linux/net.h 2007-10-26 13:56:39.000000000 -0700 @@ -128,6 +128,7 @@ struct proto_ops { int family; struct module *owner; int (*release) (struct socket *sock); + void (*flush) (struct socket *sock); int (*bind) (struct socket *sock, struct sockaddr *myaddr, int sockaddr_len); --- a/include/net/tcp.h 2007-10-26 12:44:42.000000000 -0700 +++ b/include/net/tcp.h 2007-10-26 13:06:53.000000000 -0700 @@ -367,6 +367,7 @@ extern void tcp_enter_loss(struct sock extern void tcp_clear_retrans(struct tcp_sock *tp); extern void tcp_update_metrics(struct sock *sk); +extern void tcp_flush(struct sock *sk); extern void tcp_close(struct sock *sk, long timeout); extern unsigned int tcp_poll(struct file * file, struct socket *sock, struct poll_table_struct *wait); --- a/net/ipv4/af_inet.c 2007-10-26 12:44:46.000000000 -0700 +++ b/net/ipv4/af_inet.c 2007-10-26 13:56:00.000000000 -0700 @@ -386,6 +386,14 @@ out_rcu_unlock: goto out; } +/* The file handle is closed, but not all threads maybe gone */ +void inet_flush(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (sk && sk->sk_prot->flush) + sk->sk_prot->flush(sk); +} /* * The peer socket should always be NULL (or else). 
When we call this @@ -822,6 +830,7 @@ int inet_ioctl(struct socket *sock, unsi const struct proto_ops inet_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, + .flush = inet_flush, .release = inet_release, .bind = inet_bind, .connect = inet_stream_connect, --- a/net/ipv4/tcp.c 2007-10-26 12:44:46.000000000 -0700 +++ b/net/ipv4/tcp.c 2007-10-26 14:00:42.000000000 -0700 @@ -1557,11 +1557,10 @@ void tcp_shutdown(struct sock *sk, int h } } -void tcp_close(struct sock *sk, long timeout) +void tcp_flush(struct sock *sk) { struct sk_buff *skb; int data_was_unread = 0; - int state; lock_sock(sk); sk->sk_shutdown = SHUTDOWN_MASK; @@ -1572,7 +1571,7 @@ void tcp_close(struct sock *sk, long tim /* Special case. */ inet_csk_listen_stop(sk); - goto adjudge_to_death; + return; } /* We need to flush the recv. buffs. We do this only on the @@ -1632,10 +1631,16 @@ void tcp_close(struct sock *sk, long tim */ tcp_send_fin(sk); } + release_sock(sk); +} +void tcp_close(struct sock *sk, long timeout) +{ + int state; + + lock_sock(sk); sk_stream_wait_close(sk, timeout); -adjudge_to_death: state = sk->sk_state; sock_hold(sk); sock_orphan(sk); @@ -2524,6 +2529,7 @@ void __init tcp_init(void) tcp_register_congestion_control(&tcp_reno); } +EXPORT_SYMBOL(tcp_flush); EXPORT_SYMBOL(tcp_close); EXPORT_SYMBOL(tcp_disconnect); EXPORT_SYMBOL(tcp_getsockopt); --- a/net/ipv4/tcp_ipv4.c 2007-10-26 12:44:46.000000000 -0700 +++ b/net/ipv4/tcp_ipv4.c 2007-10-26 13:06:04.000000000 -0700 @@ -2420,6 +2420,7 @@ void tcp4_proc_exit(void) struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, + .flush = tcp_flush, .close = tcp_close, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, --- a/net/ipv6/tcp_ipv6.c 2007-10-26 12:44:47.000000000 -0700 +++ b/net/ipv6/tcp_ipv6.c 2007-10-26 13:06:25.000000000 -0700 @@ -2109,6 +2109,7 @@ void tcp6_proc_exit(void) struct proto tcpv6_prot = { .name = "TCPv6", .owner = THIS_MODULE, + .flush = tcp_flush, .close = tcp_close, .connect = tcp_v6_connect, 
.disconnect = tcp_disconnect, --- a/net/socket.c 2007-10-26 12:44:48.000000000 -0700 +++ b/net/socket.c 2007-10-26 13:47:48.000000000 -0700 @@ -100,7 +100,7 @@ static ssize_t sock_aio_read(struct kioc static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos); static int sock_mmap(struct file *file, struct vm_area_struct *vma); - +static int sock_flush(struct file *file, fl_owner_t id); static int sock_close(struct inode *inode, struct file *file); static unsigned int sock_poll(struct file *file, struct poll_table_struct *wait); @@ -130,6 +130,7 @@ static const struct file_operations sock #endif .mmap = sock_mmap, .open = sock_no_open, /* special open code to disallow open via /proc */ + .flush = sock_flush, .release = sock_close, .fasync = sock_fasync, .sendpage = sock_sendpage, @@ -961,6 +962,15 @@ static int sock_mmap(struct file *file, return sock->ops->mmap(file, sock, vma); } +static int sock_flush(struct file *file, fl_owner_t id) +{ + struct socket *sock = file->private_data; + + if (sock->ops && sock->ops->flush) + sock->ops->flush(sock); + return 0; +} + static int sock_close(struct inode *inode, struct file *filp) { /* --- a/include/net/sock.h 2007-10-26 12:44:42.000000000 -0700 +++ b/include/net/sock.h 2007-10-26 13:50:03.000000000 -0700 @@ -515,6 +515,7 @@ struct timewait_sock_ops; struct proto { void (*close)(struct sock *sk, long timeout); + void (*flush)(struct sock *sk); int (*connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len); - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html