On Monday 01 October 2007 04:15, Davide Libenzi wrote: > On Mon, 1 Oct 2007, Denys Vlasenko wrote: > > > My use case is: I want to do a nonblocking read on descriptor 0 (stdin). > > It may be a pipe or a socket. > > > > There may be other processes which share this descriptor with me, > > I simply cannot know that. And they, too, may want to do reads on it. > > > > I want to do nonblocking read in such a way that neither those other > > processes will ever see fd switching to O_NONBLOCK and back, and > > I also want to be safe from other processes doing the same. > > > > I don't see how this can be done using standard unix primitives. > > Indeed. You could simulate non-blocking using poll with zero timeout, but > if another task may read/write on it, your following read/write may end up > blocking even after a poll returned the required events. > One way to solve this would be some sort of readx/writex where you pass an > extra flags parameter
We have that already. They are called send and recv. ;) > (this could be done with sys_indirect, assuming > we'll ever get that mainline) where you specify the non-blocking > requirement for-this-call, and not as global per-file flag. Then, of > course, you'll have to modify all the "file->f_flags & O_NONBLOCK" tests > (and there are many of them) to check for that flag too (that can be a > per task_struct flag). Attached patch detects send/recv(fd, buf, size, MSG_DONTWAIT) on non-sockets and turns them into non-blocking write/read. Since filp->f_flags appear to be read and modified without any locking, I cannot modify it without potentially affecting other processes accessing the same file through shared struct file. Therefore I simply make a temporary copy of struct file, set O_NONBLOCK in it and pass it to vfs_read/write. Is this heresy? ;) I see only one spinlock in struct file: #ifdef CONFIG_EPOLL spinlock_t f_ep_lock; #endif /* #ifdef CONFIG_EPOLL */ Do I need to take it? Also attached is ndelaytest.c which can be used to test that send(MSG_DONTWAIT) indeed is failing with EAGAIN if write would block and that other processes never see O_NONBLOCK set. Comments? -- vda
#include <sys/types.h> #include <sys/socket.h> #include <errno.h> #include <stdio.h> #include <unistd.h> #include <fcntl.h> #include <time.h> #include <signal.h> #define SECONDS 10 #define STR "." //#define STR "123456789 123456789 123456789 123456789 " /* To see send() resulting in EAGAIN: * strace -ff -o log ndelaytest | while sleep 11; do break; done * log.$PID: * send(1, "123456789 123456789 123456789 12"..., 40, MSG_DONTWAIT) * = -1 EAGAIN (Resource temporarily unavailable) */ int main() { pid_t pid; time_t t; int fl; puts("starting"); t = time(0); pid = fork(); if (pid == 0) { /* child */ while ((time(0) - t) < SECONDS-1) { #if 0 /* Uncomment this part and simply run the executable * to see race detection code in action */ #define OP "write" fcntl(1, F_SETFL, fcntl(1, F_GETFL) | O_NONBLOCK); fl = write(1, STR, sizeof(STR) - 1); fcntl(1, F_SETFL, fcntl(1, F_GETFL) & ~O_NONBLOCK); #else /* This part tests whether send(MSG_DONTWAIT) * is racy or not */ #define OP "send" fl = send(1, STR, sizeof(STR) - 1, MSG_DONTWAIT); #endif if (fl < 0) { perror(OP); kill(getppid(), SIGKILL); return 1; } } return 0; } while ((time(0) - t) < SECONDS) { fl = fcntl(1, F_GETFL); if (fl & O_NONBLOCK) { fprintf(stderr, "NONBLOCK:1\n"); kill(pid, SIGKILL); fcntl(1, F_SETFL, fl & ~O_NONBLOCK); return 1; } } fprintf(stderr, "NONBLOCK:0\n"); return 0; }
--- linux-2.6.22-rc6.src/fs/read_write.c Fri Jun 15 19:30:05 2007 +++ linux-2.6.22-rc6_ndelay/fs/read_write.c Sun Aug 19 10:43:24 2007 @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/syscalls.h> #include <linux/pagemap.h> +#include <linux/socket.h> #include "read_write.h" #include <asm/uaccess.h> @@ -351,6 +352,36 @@ static inline void file_pos_write(struct file *file, loff_t pos) { file->f_pos = pos; +} + +/* Helper for send/recv on non-sockets */ +ssize_t rw_with_flags(struct file *file, int fput_needed, void __user *buf, size_t count, unsigned flags) +{ + int err; + loff_t pos; + struct file *file_copy; + + file_copy = file; + if (flags & MSG_DONTWAIT) { + /* We make copy even if O_NONBLOCK is already set. */ + /* We don't want it to change under our feet. */ + file_copy = kmalloc(sizeof(*file_copy), GFP_KERNEL); + memcpy(file_copy, file, sizeof(*file_copy)); + file_copy->f_flags |= O_NONBLOCK; + } + + pos = file_pos_read(file); + if (flags & MSG_OOB) /* MSG_OOB is reused to mean 'write' */ + err = vfs_write(file_copy, buf, count, &pos); + else + err = vfs_read(file_copy, buf, count, &pos); + file_pos_write(file, pos); + + if (flags & MSG_DONTWAIT) { + kfree(file_copy); + } + fput_light(file, fput_needed); + return err; } asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count) --- linux-2.6.22-rc6.src/include/linux/fs.h Wed Jun 27 21:24:18 2007 +++ linux-2.6.22-rc6_ndelay/include/linux/fs.h Sun Aug 19 10:32:20 2007 @@ -1154,6 +1154,9 @@ extern ssize_t vfs_writev(struct file *, const struct iovec __user *, unsigned long, loff_t *); +extern ssize_t rw_with_flags(struct file *, int, void __user *, size_t, + unsigned); + /* * NOTE: write_inode, delete_inode, clear_inode, put_inode can be called * without the big kernel lock held in all filesystems. --- linux-2.6.22-rc6.src/net/socket.c Fri Jun 15 19:30:08 2007 +++ linux-2.6.22-rc6_ndelay/net/socket.c Sun Aug 19 11:34:07 2007 @@ -1585,8 +1585,17 @@ goto out; sock = sock_from_file(sock_file, &err); - if (!sock) - goto out_put; + if (!sock) { + if (addr) + goto out_put; + if (flags & ~MSG_DONTWAIT) + goto out_put; + /* it's not a socket, but we support a special case: + * send(fd, buf, count, MSG_DONTWAIT) + * (MSG_OOB is reused to mean 'write') */ + return rw_with_flags(sock_file, fput_needed, buff, len, flags | MSG_OOB); + } + iov.iov_base = buff; iov.iov_len = len; msg.msg_name = NULL; @@ -1646,8 +1655,15 @@ goto out; sock = sock_from_file(sock_file, &err); - if (!sock) - goto out_put; + if (!sock) { + if (addr) + goto out_put; + if (flags & ~MSG_DONTWAIT) + goto out_put; + /* it's not a socket, but we support a special case: + * recv(fd, ubuf, size, MSG_DONTWAIT) */ + return rw_with_flags(sock_file, fput_needed, ubuf, size, flags); + } msg.msg_control = NULL; msg.msg_controllen = 0;