diff -urN -x *.[oa] -x .config -x .version -x .depend -x .hdepend -x *.flags -x autoconf.h -x modversions.h -x version.h -x asm -x modules -x config -x soundmodem linux-2.4.0-test9/fs/select.c linux-2.4.0-test9+/fs/select.c
--- linux-2.4.0-test9/fs/select.c	Mon Jul 24 06:39:44 2000
+++ linux-2.4.0-test9+/fs/select.c	Sun Oct 29 20:53:15 2000
@@ -9,238 +9,325 @@
  *     flag set in its personality we do *not* modify the given timeout
  *     parameter to reflect time remaining.
  *
- *  24 January 2000
- *     Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation 
- *     of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
+ *  August 1999 - February 2000
+ *     Rewritten to use a wake-up callback that queues events for
+ *     processing after sleeping, plus general performance enhancements.
+ *     -- Mike Jagdis <jaggy@purplet.demon.co.uk>
  */
 
 #include <linux/malloc.h>
 #include <linux/smp_lock.h>
+#include <linux/list.h>
 #include <linux/poll.h>
 #include <linux/file.h>
 
 #include <asm/uaccess.h>
 
-#define ROUND_UP(x,y) (((x)+(y)-1)/(y))
-#define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
+/* Take the files->file_lock read lock around the whole fdset scan
+ * loop rather than around each fdset bit within the loop (the latter
+ * is what fget()/fput() do). This holds a read lock on the descriptor
+ * table for longer but reduces overhead in the loop.
+ *   Note that poll accesses user space while holding the file lock.
+ * This may be an issue for threaded programs.
+ */
+#define WIDE_FILE_LOCK
 
-struct poll_table_entry {
-	struct file * filp;
-	wait_queue_t wait;
-	wait_queue_head_t * wait_address;
-};
-
-struct poll_table_page {
-	struct poll_table_page * next;
-	struct poll_table_entry * entry;
-	struct poll_table_entry entries[0];
-};
 
-#define POLL_TABLE_FULL(table) \
-	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))
+#define ROUND_UP(x,y) (((x)+(y)-1)/(y))
+#define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
 
-/*
- * Ok, Peter made a complicated, but straightforward multiple_wait() function.
- * I have rewritten this, taking some shortcuts: This code may not be easy to
- * follow, but it should be free of race-conditions, and it's practical. If you
- * understand what I'm doing here, then you understand how the linux
- * sleep/wakeup mechanism works.
- *
- * Two very simple procedures, poll_wait() and poll_freewait() make all the
- * work.  poll_wait() is an inline-function defined in <linux/poll.h>,
- * as all select/poll functions have to call it to add an entry to the
- * poll table.
- */
 
-void poll_freewait(poll_table* pt)
+void poll_freewait(poll_table * ptab)
 {
-	struct poll_table_page * p = pt->table;
-	while (p) {
-		struct poll_table_entry * entry;
-		struct poll_table_page *old;
+	struct poll_table_head *p;
 
-		entry = p->entry;
-		do {
-			entry--;
-			remove_wait_queue(entry->wait_address,&entry->wait);
+	p = ptab->head;
+	while (p) {
+		struct poll_table_entry *entry;
+		entry = (struct poll_table_entry *)(p + 1);
+		while (entry < p->entry) {
+			remove_wait_queue(entry->wait_address, &entry->wait);
 			fput(entry->filp);
-		} while (entry > p->entries);
-		old = p;
+			entry++;
+		}
+		p = p->next;
+	}
+	p = ptab->head;
+	while (p) {
+		struct poll_table_head *old = p;
 		p = p->next;
 		free_page((unsigned long) old);
 	}
 }
 
-void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
+static int
+__pollwake(wait_queue_t *wq, unsigned int mode, const int sync)
 {
-	struct poll_table_page *table = p->table;
-
-	if (!table || POLL_TABLE_FULL(table)) {
-		struct poll_table_page *new_table;
-
-		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
-		if (!new_table) {
-			p->error = -ENOMEM;
-			__set_current_state(TASK_RUNNING);
-			return;
+	struct poll_table_entry *p;
+	struct poll_table_head *head;
+	unsigned long flags;
+
+	wq->onwake = NULL;
+	p = (void *)wq - offsetof(struct poll_table_entry, wait);
+	head = (struct poll_table_head *)((unsigned long)p & PAGE_MASK);
+
+	spin_lock_irqsave(&head->poll_table->woken_lock, flags);
+	p->woken_list = head->poll_table->woken_list;
+	head->poll_table->woken_list = p;
+	spin_unlock_irqrestore(&head->poll_table->woken_lock, flags);
+
+	if (wq->task->state & (mode & ~TASK_EXCLUSIVE)) {
+		if (!sync) {
+			wake_up_process(wq->task);
+			goto out;
 		}
-		new_table->entry = new_table->entries;
-		new_table->next = table;
-		p->table = new_table;
-		table = new_table;
+		wake_up_process_synchronous(wq->task);
 	}
+out:
+	return 0;
+}
 
-	/* Add a new entry */
-	{
-		struct poll_table_entry * entry = table->entry;
-		table->entry = entry+1;
+void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *ptab)
+{
+	struct poll_table_head * tmp, * p = ptab->last;
+	struct poll_table_entry *last_entry;
+
+	last_entry = (void *)p + PAGE_SIZE - sizeof(struct poll_table_entry);
+	if (p->entry <= last_entry) {
+		struct poll_table_entry *entry;
+ok_table:
+		entry = p->entry++;
 	 	get_file(filp);
 	 	entry->filp = filp;
+		entry->n = ptab->n;
+		init_waitqueue_entry_onwake(&entry->wait, current, __pollwake);
 		entry->wait_address = wait_address;
-		init_waitqueue_entry(&entry->wait, current);
-		add_wait_queue(wait_address,&entry->wait);
+		add_wait_queue(wait_address, &entry->wait);
+		return;
 	}
-}
 
-#define __IN(fds, n)		(fds->in + n)
-#define __OUT(fds, n)		(fds->out + n)
-#define __EX(fds, n)		(fds->ex + n)
-#define __RES_IN(fds, n)	(fds->res_in + n)
-#define __RES_OUT(fds, n)	(fds->res_out + n)
-#define __RES_EX(fds, n)	(fds->res_ex + n)
-
-#define BITS(fds, n)		(*__IN(fds, n)|*__OUT(fds, n)|*__EX(fds, n))
-
-static int max_select_fd(unsigned long n, fd_set_bits *fds)
-{
-	unsigned long *open_fds;
-	unsigned long set;
-	int max;
-
-	/* handle last in-complete long-word first */
-	set = ~(~0UL << (n & (__NFDBITS-1)));
-	n /= __NFDBITS;
-	open_fds = current->files->open_fds->fds_bits+n;
-	max = 0;
-	if (set) {
-		set &= BITS(fds, n);
-		if (set) {
-			if (!(set & ~*open_fds))
-				goto get_max;
-			return -EBADF;
-		}
-	}
-	while (n) {
-		open_fds--;
-		n--;
-		set = BITS(fds, n);
-		if (!set)
-			continue;
-		if (set & ~*open_fds)
-			return -EBADF;
-		if (max)
-			continue;
-get_max:
-		do {
-			max++;
-			set >>= 1;
-		} while (set);
-		max += n * __NFDBITS;
+	tmp = (void *) __get_free_page(GFP_KERNEL);
+	if (tmp) {
+		tmp->entry = (struct poll_table_entry *)(tmp + 1);
+		tmp->next = NULL;
+		tmp->poll_table = ptab;
+		p->next = tmp;
+		ptab->last = p = tmp;
+		goto ok_table;
 	}
-
-	return max;
+	ptab->error = -ENOMEM;
 }
 
+
 #define BIT(i)		(1UL << ((i)&(__NFDBITS-1)))
-#define MEM(i,m)	((m)+(unsigned)(i)/__NFDBITS)
-#define ISSET(i,m)	(((i)&*(m)) != 0)
-#define SET(i,m)	(*(m) |= (i))
 
 #define POLLIN_SET (POLLRDNORM | POLLRDBAND | POLLIN | POLLHUP | POLLERR)
 #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
 #define POLLEX_SET (POLLPRI)
 
-int do_select(int n, fd_set_bits *fds, long *timeout)
+int do_select(int n, unsigned long *bits, const int size, long *timeout)
 {
-	poll_table table, *wait;
-	int retval, i, off;
-	long __timeout = *timeout;
-
- 	read_lock(&current->files->file_lock);
-	retval = max_select_fd(n, fds);
-	read_unlock(&current->files->file_lock);
-
-	if (retval < 0)
-		return retval;
-	n = retval;
-
-	poll_initwait(&table);
-	wait = &table;
-	if (!__timeout)
-		wait = NULL;
-	retval = 0;
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		for (i = 0 ; i < n; i++) {
-			unsigned long bit = BIT(i);
-			unsigned long mask;
-			struct file *file;
-
-			off = i / __NFDBITS;
-			if (!(bit & BITS(fds, off)))
-				continue;
-			file = fget(i);
-			mask = POLLNVAL;
-			if (file) {
+	int retval;
+	poll_table *wait, wait_table;
+	long __timeout;
+	unsigned long bit;
+	unsigned long fds, *fdl;
+	unsigned long mask;
+	struct file *file;
+
+	wait = NULL;
+	__timeout = *timeout;
+	if (__timeout) {
+		wait_table.head = (void *) __get_free_page(GFP_KERNEL);
+		if (!wait_table.head)
+			return -ENOMEM;
+
+		poll_initwait(&wait_table);
+		wait = &wait_table;
+	}
+
+	wait_table.error = retval = 0;
+
+	if (!n)
+		goto no_setup;
+
+	fdl = bits + (n - 1) / __NFDBITS;
+
+	fds = (*fdl | fdl[size] | fdl[2*size]);
+
+#ifdef WIDE_FILE_LOCK
+	read_lock(&current->files->file_lock);
+#endif
+	if (!fds)
+		goto next_long;
+
+	mask = (n & (__NFDBITS-1));
+	if (mask) {
+		fds &= ~(~0UL << mask);
+		if (!fds)
+			goto next_long;
+	}
+
+	do {
+		/* At this point we know some bit is set in fds. */
+		wait_table.n = (fdl - bits) * 8*sizeof(unsigned long);
+		bit = 1;
+		do {
+			/* If a long has any bits set, do we expect
+			 * most of them to be set?
+			 */
+			if ((fds & bit)) {
+				fds ^= bit;
+#ifdef WIDE_FILE_LOCK
+				file = fcheck(wait_table.n);
+#else
+				file = fget(wait_table.n);
+#endif
+				if (!file)
+					goto out_badf;
 				mask = DEFAULT_POLLMASK;
 				if (file->f_op && file->f_op->poll)
 					mask = file->f_op->poll(file, wait);
+#ifndef WIDE_FILE_LOCK
 				fput(file);
+#endif
+				/* Branch prediction says that forward
+				 * conditional branches are not taken. So...
+				 */
+				/* Normally have to wait for input */
+				if (!(mask & POLLIN_SET))
+					goto no_in_check;
+				if ((*fdl & bit)) {
+					fdl[3*size] |= bit;
+					retval++;
+					wait = NULL;
+				}
+no_in_check:
+				/* Normally can output */
+				if (!(fdl[size] & bit))
+					goto no_out_check;
+				if ((mask & POLLOUT_SET)) {
+					fdl[4*size] |= bit;
+					retval++;
+					wait = NULL;
+				}
+no_out_check:
+				/* Normally no exception */
+				if (!(mask & POLLEX_SET))
+					goto no_ex_check;
+				if ((fdl[2*size] & bit)) {
+					fdl[5*size] |= bit;
+					retval++;
+					wait = NULL;
+				}
+no_ex_check:
 			}
-			if ((mask & POLLIN_SET) && ISSET(bit, __IN(fds,off))) {
-				SET(bit, __RES_IN(fds,off));
+			bit += bit;
+			wait_table.n++;
+		} while (fds);
+
+next_long:
+		/* Long runs of zero bits tend to be common, especially
+		 * at the end of overly large sets, so we use a tight
+		 * loop to skip them.
+		 * N.B. We allocated an extra non-zero long in front
+		 * of bits in sys_select specifically so we can access
+		 * below bits and avoid an extra test and branch.
+		 */
+		do {
+			fdl--;
+			fds = *fdl | fdl[size] | fdl[2*size];
+		} while (!fds);
+	} while (fdl >= bits);
+
+#ifdef WIDE_FILE_LOCK
+	read_unlock(&current->files->file_lock);
+#endif
+
+	if (!retval)
+		retval = wait_table.error;
+
+no_setup:
+	while (!retval && __timeout && !signal_pending(current)) {
+		unsigned long flags;
+		struct poll_table_entry *item, *next;
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (wait_table.woken_list == NULL)
+			__timeout = schedule_timeout(__timeout);
+		set_current_state(TASK_RUNNING);
+
+		/* Lift the queued events and reset the queue, ready
+		 * for anything that happens while we "think".
+		 */
+		spin_lock_irqsave(&wait_table.woken_lock, flags);
+		item = wait_table.woken_list;
+		wait_table.woken_list = NULL;
+		spin_unlock_irqrestore(&wait_table.woken_lock, flags);
+
+		for (; item; item=next) {
+			struct poll_table_entry * p = item;
+
+			/* Reset the wait_queue's event pointer before
+			 * checking in case it is triggered again.
+			 */
+			next = item->woken_list;
+			set_waitqueue_entry_onwake(&p->wait, __pollwake);
+
+			file = p->filp;
+#if 0
+			/* This has to be true or we would never have
+			 * done a wait on the file!
+			 */
+			if (file->f_op && file->f_op->poll)
+#endif
+				mask = file->f_op->poll(file, NULL);
+
+			fdl = bits + (p->n / __NFDBITS);
+			bit = BIT(p->n);
+
+			/* Branch prediction says that forward
+			 * conditional branches are not taken.
+			 * Wake-ups are dumb: we get woken every time
+			 * more space becomes available, even if we
+			 * are not interested.
+			 */
+			if (!(mask & POLLIN_SET))
+				goto ev_no_in_check;
+			if ((*fdl & bit)) {
+				fdl[3*size] |= bit;
 				retval++;
-				wait = NULL;
 			}
-			if ((mask & POLLOUT_SET) && ISSET(bit, __OUT(fds,off))) {
-				SET(bit, __RES_OUT(fds,off));
+ev_no_in_check:
+			if (!(fdl[size] & bit))
+				goto ev_no_out_check;
+			if ((mask & POLLOUT_SET)) {
+				fdl[4*size] |= bit;
 				retval++;
-				wait = NULL;
 			}
-			if ((mask & POLLEX_SET) && ISSET(bit, __EX(fds,off))) {
-				SET(bit, __RES_EX(fds,off));
+ev_no_out_check:
+			/* Normally no exception */
+			if (!(mask & POLLEX_SET))
+				goto ev_no_ex_check;
+			if ((fdl[2*size] & bit)) {
+				fdl[5*size] |= bit;
 				retval++;
-				wait = NULL;
 			}
+ev_no_ex_check:
 		}
-		wait = NULL;
-		if (retval || !__timeout || signal_pending(current))
-			break;
-		if(table.error) {
-			retval = table.error;
-			break;
-		}
-		__timeout = schedule_timeout(__timeout);
 	}
-	current->state = TASK_RUNNING;
-
-	poll_freewait(&table);
-
-	/*
-	 * Up-to-date the caller timeout.
-	 */
+out:
+	if (*timeout)
+		poll_freewait(&wait_table);
 	*timeout = __timeout;
 	return retval;
-}
-
-static void *select_bits_alloc(int size)
-{
-	return kmalloc(6 * size, GFP_KERNEL);
-}
 
-static void select_bits_free(void *bits, int size)
-{
-	kfree(bits);
+out_badf:
+#ifdef WIDE_FILE_LOCK
+	read_unlock(&current->files->file_lock);
+#endif
+	retval = -EBADF;
+	goto out;
 }
 
 /*
@@ -254,13 +341,22 @@
 #define MAX_SELECT_SECONDS \
 	((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1)
 
+/* The largest set for which we use the stack rather than going out to
+ * kmalloc. Assuming 32-bit longs: 512 fds = 388 bytes, 1024 fds = 772
+ * bytes, 2048 fds = 1540 bytes, 4096 fds = 3076 bytes.
+ * Remember: the call depth from here on is fairly shallow but we need
+ * to leave space for interrupts!
+ */
+#define MAX_LOC_FDS	1024
+#define MAX_LOC_LONGS	(1 + 6 * FDS_LONGS(MAX_LOC_FDS))
+
 asmlinkage long
 sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
 {
-	fd_set_bits fds;
-	char *bits;
+	unsigned long *data, *bits;
 	long timeout;
 	int ret, size;
+	unsigned long ldata[MAX_LOC_LONGS];
 
 	timeout = MAX_SCHEDULE_TIMEOUT;
 	if (tvp) {
@@ -291,29 +387,27 @@
 	/*
 	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
 	 * since we used fdset we need to allocate memory in units of
-	 * long-words. 
+	 * long-words. We add an extra non-zero long at the beginning
+	 * because that lets us avoid an extra test-and-branch within
+	 * the loop in do_select.
 	 */
 	ret = -ENOMEM;
-	size = FDS_BYTES(n);
-	bits = select_bits_alloc(size);
-	if (!bits)
-		goto out_nofds;
-	fds.in      = (unsigned long *)  bits;
-	fds.out     = (unsigned long *) (bits +   size);
-	fds.ex      = (unsigned long *) (bits + 2*size);
-	fds.res_in  = (unsigned long *) (bits + 3*size);
-	fds.res_out = (unsigned long *) (bits + 4*size);
-	fds.res_ex  = (unsigned long *) (bits + 5*size);
-
-	if ((ret = get_fd_set(n, inp, fds.in)) ||
-	    (ret = get_fd_set(n, outp, fds.out)) ||
-	    (ret = get_fd_set(n, exp, fds.ex)))
+	size = FDS_LONGS(n);
+	data = ldata;
+	if (n > MAX_LOC_FDS) {
+		data = kmalloc((1 + 6 * size) * sizeof(unsigned long), GFP_KERNEL);
+		if (!data)
+			goto out_nofds;
+	}
+	*data = 1;
+	bits = data + 1;
+	if ((ret = get_fd_set(n, inp, bits)) ||
+	    (ret = get_fd_set(n, outp, bits+size)) ||
+	    (ret = get_fd_set(n, exp, bits+2*size)))
 		goto out;
-	zero_fd_set(n, fds.res_in);
-	zero_fd_set(n, fds.res_out);
-	zero_fd_set(n, fds.res_ex);
+	memset(bits+3*size, 0, 3*size*sizeof(unsigned long));
 
-	ret = do_select(n, &fds, &timeout);
+	ret = do_select(n, bits, size, &timeout);
 
 	if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
 		time_t sec = 0, usec = 0;
@@ -322,8 +416,8 @@
 			usec = timeout % HZ;
 			usec *= (1000000/HZ);
 		}
-		put_user(sec, &tvp->tv_sec);
-		put_user(usec, &tvp->tv_usec);
+		__put_user(sec, &tvp->tv_sec);
+		__put_user(usec, &tvp->tv_usec);
 	}
 
 	if (ret < 0)
@@ -335,86 +429,148 @@
 		ret = 0;
 	}
 
-	set_fd_set(n, inp, fds.res_in);
-	set_fd_set(n, outp, fds.res_out);
-	set_fd_set(n, exp, fds.res_ex);
+	if (inp)
+		__copy_to_user(inp, bits+3*size, size*sizeof(unsigned long));
+	if (outp)
+		__copy_to_user(outp, bits+4*size, size*sizeof(unsigned long));
+	if (exp)
+		__copy_to_user(exp, bits+5*size, size*sizeof(unsigned long));
 
 out:
-	select_bits_free(bits, size);
+	if (n > MAX_LOC_FDS)
+		kfree(data);
 out_nofds:
 	return ret;
 }
 
-#define POLLFD_PER_PAGE  ((PAGE_SIZE) / sizeof(struct pollfd))
-
-static void do_pollfd(unsigned int num, struct pollfd * fdpage,
-	poll_table ** pwait, int *count)
+static int do_poll(unsigned int nfds, struct pollfd *ufds, long timeout)
 {
-	int i;
+	int retval;
+	poll_table *wait, wait_table;
+
+	wait = NULL;
+	if (timeout) {
+		wait_table.head = (void *) __get_free_page(GFP_KERNEL);
+		if (!wait_table.head)
+			return -ENOMEM;
 
-	for (i = 0; i < num; i++) {
+		poll_initwait(&wait_table);
+		wait = &wait_table;
+	}
+
+	wait_table.error = retval = 0;
+
+#ifdef WIDE_FILE_LOCK
+	read_lock(&current->files->file_lock);
+#endif
+	for (wait_table.n = 0; wait_table.n < nfds; wait_table.n++) {
 		int fd;
-		unsigned int mask;
-		struct pollfd *fdp;
+		unsigned int mask, events;
 
 		mask = 0;
-		fdp = fdpage+i;
-		fd = fdp->fd;
+		if (__get_user(fd, &ufds[wait_table.n].fd))
+			goto out_fault;
 		if (fd >= 0) {
-			struct file * file = fget(fd);
+			struct file * file;
+#ifdef WIDE_FILE_LOCK
+			file = fcheck(fd);
+#else
+			file = fget(fd);
+#endif
 			mask = POLLNVAL;
 			if (file != NULL) {
 				mask = DEFAULT_POLLMASK;
 				if (file->f_op && file->f_op->poll)
-					mask = file->f_op->poll(file, *pwait);
-				mask &= fdp->events | POLLERR | POLLHUP;
+					mask = file->f_op->poll(file, wait);
+#ifndef WIDE_FILE_LOCK
 				fput(file);
+#endif
+				if (__get_user(events, &ufds[wait_table.n].events))
+					goto out_fault;
+				mask &= events | POLLERR | POLLHUP;
 			}
 			if (mask) {
-				*pwait = NULL;
-				(*count)++;
+				wait = NULL;
+				retval++;
 			}
 		}
-		fdp->revents = mask;
+		__put_user(mask, &ufds[wait_table.n].revents);
 	}
-}
 
-static int do_poll(unsigned int nfds, unsigned int nchunks, unsigned int nleft, 
-	struct pollfd *fds[], poll_table *wait, long timeout)
-{
-	int count = 0;
-	poll_table* pt = wait;
+#ifdef WIDE_FILE_LOCK
+	read_unlock(&current->files->file_lock);
+#endif
 
-	for (;;) {
-		unsigned int i;
+	if (!retval)
+		retval = wait_table.error;
+
+	while (!retval && timeout && !signal_pending(current)) {
+		unsigned long flags;
+		struct poll_table_entry *item, *next;
 
 		set_current_state(TASK_INTERRUPTIBLE);
-		for (i=0; i < nchunks; i++)
-			do_pollfd(POLLFD_PER_PAGE, fds[i], &pt, &count);
-		if (nleft)
-			do_pollfd(nleft, fds[nchunks], &pt, &count);
-		pt = NULL;
-		if (count || !timeout || signal_pending(current))
-			break;
-		if(wait->error) {
-			return wait->error;
+		if (wait_table.woken_list == NULL)
+			timeout = schedule_timeout(timeout);
+		set_current_state(TASK_RUNNING);
+
+		/* Lift the queued events and reset the queue, ready
+		 * for anything that happens while we "think".
+		 */
+		spin_lock_irqsave(&wait_table.woken_lock, flags);
+		item = wait_table.woken_list;
+		wait_table.woken_list = NULL;
+		spin_unlock_irqrestore(&wait_table.woken_lock, flags);
+
+		for (; item; item=next) {
+			struct poll_table_entry * p = item;
+			unsigned long mask, events;
+			struct file * file;
+
+			/* Reset the wait_queue's event pointer before
+			 * checking in case it is triggered again.
+			 */
+			next = item->woken_list;
+			set_waitqueue_entry_onwake(&p->wait, __pollwake);
+
+			file = p->filp;
+#if 0
+			/* This has to be true or we would never have
+			 * done a wait on the file!
+			 */
+			if (file->f_op && file->f_op->poll)
+#endif
+				mask = file->f_op->poll(file, NULL);
+			if (__get_user(events, &ufds[p->n].events))
+				goto out_fault;
+			mask &= events | POLLERR | POLLHUP;
+			if (mask) {
+				retval++;
+				__put_user(mask, &ufds[p->n].revents);
+			}
 		}
-		timeout = schedule_timeout(timeout);
 	}
-	current->state = TASK_RUNNING;
-	return count;
+out:
+	if (timeout)
+		poll_freewait(&wait_table);
+	return retval;
+
+out_fault:
+#ifdef WIDE_FILE_LOCK
+	read_unlock(&current->files->file_lock);
+#endif
+	retval = -EFAULT;
+	goto out;
 }
 
-asmlinkage long sys_poll(struct pollfd * ufds, unsigned int nfds, long timeout)
+asmlinkage long
+sys_poll(struct pollfd * ufds, unsigned int nfds, long timeout)
 {
-	int i, j, fdcount, err;
-	struct pollfd **fds;
-	poll_table table, *wait;
-	int nchunks, nleft;
+	int err;
 
 	/* Do a sanity check on nfds ... */
+	err = -EINVAL;
 	if (nfds > current->files->max_fds)
-		return -EINVAL;
+		goto out;
 
 	if (timeout) {
 		/* Careful about overflow in the intermediate values */
@@ -424,69 +580,13 @@
 			timeout = MAX_SCHEDULE_TIMEOUT;
 	}
 
-	poll_initwait(&table);
-	wait = &table;
-	if (!timeout)
-		wait = NULL;
-
-	err = -ENOMEM;
-	fds = NULL;
-	if (nfds != 0) {
-		fds = (struct pollfd **)kmalloc(
-			(1 + (nfds - 1) / POLLFD_PER_PAGE) * sizeof(struct pollfd *),
-			GFP_KERNEL);
-		if (fds == NULL)
-			goto out;
-	}
+	err = verify_area(VERIFY_WRITE, ufds, nfds * sizeof(struct pollfd));
+	if (!err) {
+		err = do_poll(nfds, ufds, timeout);
 
-	nchunks = 0;
-	nleft = nfds;
-	while (nleft > POLLFD_PER_PAGE) { /* allocate complete PAGE_SIZE chunks */
-		fds[nchunks] = (struct pollfd *)__get_free_page(GFP_KERNEL);
-		if (fds[nchunks] == NULL)
-			goto out_fds;
-		nchunks++;
-		nleft -= POLLFD_PER_PAGE;
-	}
-	if (nleft) { /* allocate last PAGE_SIZE chunk, only nleft elements used */
-		fds[nchunks] = (struct pollfd *)__get_free_page(GFP_KERNEL);
-		if (fds[nchunks] == NULL)
-			goto out_fds;
-	}
-
-	err = -EFAULT;
-	for (i=0; i < nchunks; i++)
-		if (copy_from_user(fds[i], ufds + i*POLLFD_PER_PAGE, PAGE_SIZE))
-			goto out_fds1;
-	if (nleft) {
-		if (copy_from_user(fds[nchunks], ufds + nchunks*POLLFD_PER_PAGE, 
-				nleft * sizeof(struct pollfd)))
-			goto out_fds1;
-	}
-
-	fdcount = do_poll(nfds, nchunks, nleft, fds, wait, timeout);
-
-	/* OK, now copy the revents fields back to user space. */
-	for(i=0; i < nchunks; i++)
-		for (j=0; j < POLLFD_PER_PAGE; j++, ufds++)
-			__put_user((fds[i] + j)->revents, &ufds->revents);
-	if (nleft)
-		for (j=0; j < nleft; j++, ufds++)
-			__put_user((fds[nchunks] + j)->revents, &ufds->revents);
-
-	err = fdcount;
-	if (!fdcount && signal_pending(current))
-		err = -EINTR;
-
-out_fds1:
-	if (nleft)
-		free_page((unsigned long)(fds[nchunks]));
-out_fds:
-	for (i=0; i < nchunks; i++)
-		free_page((unsigned long)(fds[i]));
-	if (nfds != 0)
-		kfree(fds);
+		if (!err && signal_pending(current))
+			err = -EINTR;
+	}
 out:
-	poll_freewait(&table);
 	return err;
 }
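
The backward skip loop in do_select() above deserves a note: it has no
bounds test, and is safe only because sys_select() plants one extra
non-zero long directly in front of the six bitmaps. A minimal
user-space sketch of the trick, with illustrative names throughout
(this is not kernel code):

#include <stdio.h>

#define SIZE 4				/* longs per fd_set in this toy */

int main(void)
{
	unsigned long data[1 + 3 * SIZE] = { 0 };
	unsigned long *bits = data + 1;		/* in/out/ex bitmaps follow */
	unsigned long *fdl = bits + SIZE - 1;	/* last long of the "in" set */
	unsigned long fds;

	data[0] = 1;			/* the sentinel: always non-zero */
	bits[1] = 0x4;			/* pretend one "in" bit is set */

	/* The skip loop from do_select(): no bounds test needed,
	 * because the scan can never run past the non-zero data[0].
	 */
	fds = *fdl | fdl[SIZE] | fdl[2 * SIZE];
	while (!fds) {
		fdl--;
		fds = *fdl | fdl[SIZE] | fdl[2 * SIZE];
	}

	if (fdl < bits)
		printf("all three bitmaps empty (stopped on the sentinel)\n");
	else
		printf("first non-empty long at index %ld\n", (long)(fdl - bits));
	return 0;
}

The kernel version masks off the bits above n in the partial last long
before entering this loop, so the sentinel is the only long it ever
reads outside the caller's sets.
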
diff -urN -x *.[oa] -x .config -x .version -x .depend -x .hdepend -x *.flags -x autoconf.h -x modversions.h -x version.h -x asm -x modules -x config -x soundmodem linux-2.4.0-test9/include/linux/poll.h linux-2.4.0-test9+/include/linux/poll.h
--- linux-2.4.0-test9/include/linux/poll.h	Mon Oct  2 19:01:39 2000
+++ linux-2.4.0-test9+/include/linux/poll.h	Sun Oct 29 21:01:35 2000
@@ -8,15 +8,33 @@
 #include <linux/wait.h>
 #include <linux/string.h>
 #include <linux/mm.h>
+#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
 
-struct poll_table_page;
+
+struct poll_table_entry {
+	int n;
+	struct file * filp;
+	struct poll_table_entry *woken_list;
+	wait_queue_t wait;
+	wait_queue_head_t * wait_address;
+};
 
 typedef struct poll_table_struct {
+	int n;
 	int error;
-	struct poll_table_page * table;
+	spinlock_t woken_lock;
+	struct poll_table_entry *woken_list;
+	struct poll_table_head * head, * last;
 } poll_table;
 
+struct poll_table_head {
+	poll_table *poll_table;
+	struct poll_table_head * next;
+	struct poll_table_entry * entry;
+};
+
+
 extern void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p);
 
 extern inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
@@ -27,20 +45,16 @@
 
 static inline void poll_initwait(poll_table* pt)
 {
+	spin_lock_init(&pt->woken_lock);
+	pt->head->entry = (struct poll_table_entry *)(pt->head + 1);
+	pt->head->poll_table = pt;
+	pt->head->next = NULL;
+	pt->woken_list = NULL;
 	pt->error = 0;
-	pt->table = NULL;
+	pt->last = pt->head;
 }
-extern void poll_freewait(poll_table* pt);
-
 
-/*
- * Scaleable version of the fd_set.
- */
-
-typedef struct {
-	unsigned long *in, *out, *ex;
-	unsigned long *res_in, *res_out, *res_ex;
-} fd_set_bits;
+extern void poll_freewait(poll_table* pt);
 
 /*
  * How many longwords for "nr" bits?
@@ -69,21 +83,6 @@
 	memset(fdset, 0, nr);
 	return 0;
 }
-
-static inline
-void set_fd_set(unsigned long nr, void *ufdset, unsigned long *fdset)
-{
-	if (ufdset)
-		__copy_to_user(ufdset, fdset, FDS_BYTES(nr));
-}
-
-static inline
-void zero_fd_set(unsigned long nr, unsigned long *fdset)
-{
-	memset(fdset, 0, FDS_BYTES(nr));
-}
-
-extern int do_select(int n, fd_set_bits *fds, long *timeout);
 
 #endif /* KERNEL */
 
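
A note on the layout declared above: each poll_table_head is a whole
page from __get_free_page(), with its poll_table_entry slots packed
immediately behind it. That is what lets __pollwake() in fs/select.c
recover the entry from a bare wait_queue_t with offsetof(), and the
page header by masking with PAGE_MASK. A stand-alone user-space sketch
of the same recovery (the struct names here are illustrative, not the
kernel's):

#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

struct waiter { int unused; };	/* stands in for wait_queue_t */

struct entry {			/* stands in for poll_table_entry */
	int n;
	struct waiter wait;	/* embedded, like wait_queue_t wait */
};

struct head {			/* stands in for poll_table_head */
	struct entry *next_free;
};

int main(void)
{
	void *page;

	/* Model __get_free_page(): one page-sized, page-aligned block. */
	if (posix_memalign(&page, PAGE_SIZE, PAGE_SIZE))
		return 1;

	struct head *h = page;
	struct entry *e = (struct entry *)(h + 1);	/* slots follow the head */
	e->n = 42;

	/* Given only the embedded waiter, as __pollwake() is: */
	struct waiter *wq = &e->wait;
	struct entry *p = (struct entry *)((char *)wq -
					   offsetof(struct entry, wait));
	struct head *hd = (struct head *)((unsigned long)p & PAGE_MASK);

	printf("entry->n = %d, head recovered: %s\n",
	       p->n, hd == h ? "yes" : "no");
	free(page);
	return 0;
}

This is also why poll_initwait() can assume the first entry slot sits
at pt->head + 1: a head structure and its entries always share one
page.
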
diff -urN -x *.[oa] -x .config -x .version -x .depend -x .hdepend -x *.flags -x autoconf.h -x modversions.h -x version.h -x asm -x modules -x config -x soundmodem linux-2.4.0-test9/include/linux/sched.h linux-2.4.0-test9+/include/linux/sched.h
--- linux-2.4.0-test9/include/linux/sched.h	Mon Oct  2 19:01:19 2000
+++ linux-2.4.0-test9+/include/linux/sched.h	Sun Oct 29 18:01:27 2000
@@ -541,6 +541,7 @@
 extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q,
 						    signed long timeout));
 extern void FASTCALL(wake_up_process(struct task_struct * tsk));
+extern void FASTCALL(wake_up_process_synchronous(struct task_struct * tsk));
 
 #define wake_up(x)			__wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE | TASK_EXCLUSIVE)
 #define wake_up_all(x)			__wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE)
diff -urN -x *.[oa] -x .config -x .version -x .depend -x .hdepend -x *.flags -x autoconf.h -x modversions.h -x version.h -x asm -x modules -x config -x soundmodem linux-2.4.0-test9/include/linux/wait.h linux-2.4.0-test9+/include/linux/wait.h
--- linux-2.4.0-test9/include/linux/wait.h	Mon Oct  2 19:01:17 2000
+++ linux-2.4.0-test9+/include/linux/wait.h	Sun Oct 29 17:10:05 2000
@@ -16,6 +16,7 @@
 #include <linux/spinlock.h>
 
 #include <asm/page.h>
+#include <asm/system.h>
 #include <asm/processor.h>
 
 /*
@@ -31,7 +32,7 @@
 	BUG(); \
 } while (0)
 
-#define CHECK_MAGIC(x) if (x != (long)&(x)) \
+#define CHECK_MAGIC(x) if ((long)x != (long)&(x)) \
 	{ printk("bad magic %lx (should be %lx), ", (long)x, (long)&(x)); WQ_BUG(); }
 
 #define CHECK_MAGIC_WQHEAD(x) do { \
@@ -43,16 +44,19 @@
 } while (0)
 #endif
 
+typedef struct __wait_queue wait_queue_t;
+typedef int (*onwake_func_t)(wait_queue_t *, unsigned int, const int);
+
 struct __wait_queue {
 	unsigned int compiler_warning;
 	struct task_struct * task;
 	struct list_head task_list;
+	onwake_func_t onwake;
 #if WAITQUEUE_DEBUG
 	long __magic;
 	long __waker;
 #endif
 };
-typedef struct __wait_queue wait_queue_t;
 
 /*
  * 'dual' spinlock architecture. Can be switched between spinlock_t and
@@ -109,7 +113,7 @@
 #endif
 
 #define __WAITQUEUE_INITIALIZER(name,task) \
-	{ 0x1234567, task, { NULL, NULL } __WAITQUEUE_DEBUG_INIT(name)}
+	{ 0x1234567, task, { NULL, NULL }, (void *)-1 __WAITQUEUE_DEBUG_INIT(name)}
 #define DECLARE_WAITQUEUE(name,task) \
 	wait_queue_t name = __WAITQUEUE_INITIALIZER(name,task)
 
@@ -134,17 +138,25 @@
 #endif
 }
 
-static inline void init_waitqueue_entry(wait_queue_t *q,
-				 struct task_struct *p)
+static inline void init_waitqueue_entry_onwake(wait_queue_t *q,
+				 struct task_struct *p, onwake_func_t onwake)
 {
 #if WAITQUEUE_DEBUG
 	if (!q || !p)
 		WQ_BUG();
 #endif
 	q->task = p;
+	q->onwake = onwake;
 #if WAITQUEUE_DEBUG
 	q->__magic = (long)&q->__magic;
 #endif
+}
+
+#define init_waitqueue_entry(q, p) init_waitqueue_entry_onwake(q, p, (void *)-1)
+
+static inline void set_waitqueue_entry_onwake(wait_queue_t *wq, onwake_func_t onwake)
+{
+	wq->onwake = onwake;
 }
 
 static inline int waitqueue_active(wait_queue_head_t *q)
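
The onwake convention established here, as consumed by __wake_up() in
kernel/sched.c below: (void *)-1 selects the classic direct wake-up,
NULL marks a callback that has already fired and is skipped, and any
other value is called instead of waking the task. A hypothetical
one-shot callback in the style of __pollwake(), using only interfaces
added by this patch (my_onwake and my_wait_event are made-up names):

static int my_onwake(wait_queue_t *wq, unsigned int mode, const int sync)
{
	wq->onwake = NULL;		/* disarm: fire at most once */
	if (wq->task->state & (mode & ~TASK_EXCLUSIVE)) {
		if (sync)
			wake_up_process_synchronous(wq->task);
		else
			wake_up_process(wq->task);
	}
	return 0;			/* non-zero would stop the queue scan */
}

static void my_wait_event(wait_queue_head_t *q)
{
	wait_queue_t wait;

	init_waitqueue_entry_onwake(&wait, current, my_onwake);
	add_wait_queue(q, &wait);
	set_current_state(TASK_INTERRUPTIBLE);
	/* A real caller re-checks its wake condition here, as
	 * do_select() does with woken_list, before sleeping.
	 */
	schedule();
	set_current_state(TASK_RUNNING);
	remove_wait_queue(q, &wait);
}

Because a fired callback leaves onwake NULL, the entry generates no
further wake-ups until re-armed; that is why do_select() and do_poll()
call set_waitqueue_entry_onwake() on each drained entry before they
re-poll the file.
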
diff -urN -x *.[oa] -x .config -x .version -x .depend -x .hdepend -x *.flags -x autoconf.h -x modversions.h -x version.h -x asm -x modules -x config -x soundmodem linux-2.4.0-test9/kernel/sched.c linux-2.4.0-test9+/kernel/sched.c
--- linux-2.4.0-test9/kernel/sched.c	Mon Oct  2 19:45:01 2000
+++ linux-2.4.0-test9+/kernel/sched.c	Tue Oct 24 11:49:07 2000
@@ -361,7 +361,7 @@
 	spin_unlock_irqrestore(&runqueue_lock, flags);
 }
 
-static inline void wake_up_process_synchronous(struct task_struct * p)
+inline void wake_up_process_synchronous(struct task_struct * p)
 {
 	unsigned long flags;
 
@@ -726,38 +726,45 @@
 	while (tmp != head) {
 		unsigned int state;
                 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
+		onwake_func_t onwake = curr->onwake;
 
 		tmp = tmp->next;
 
 #if WAITQUEUE_DEBUG
 		CHECK_MAGIC(curr->__magic);
 #endif
-		p = curr->task;
-		state = p->state;
-		if (state & (mode & ~TASK_EXCLUSIVE)) {
+		if (onwake == (void *)-1) {
+			p = curr->task;
+			state = p->state;
+			if (state & (mode & ~TASK_EXCLUSIVE)) {
 #if WAITQUEUE_DEBUG
-			curr->__waker = (long)__builtin_return_address(0);
+				curr->__waker = (long)__builtin_return_address(0);
 #endif
-			/*
-			 * If waking up from an interrupt context then
-			 * prefer processes which are affine to this
-			 * CPU.
-			 */
-			if (irq && (state & mode & TASK_EXCLUSIVE)) {
-				if (!best_exclusive)
-					best_exclusive = p;
-				else if ((p->processor == best_cpu) &&
-					(best_exclusive->processor != best_cpu))
+				/*
+				 * If waking up from an interrupt context then
+				 * prefer processes which are affine to this
+				 * CPU.
+				 */
+				if (irq && (state & mode & TASK_EXCLUSIVE)) {
+					if (!best_exclusive)
 						best_exclusive = p;
-			} else {
-				if (sync)
-					wake_up_process_synchronous(p);
-				else
-					wake_up_process(p);
-				if (state & mode & TASK_EXCLUSIVE)
-					break;
+					else if ((p->processor == best_cpu) &&
+						(best_exclusive->processor != best_cpu))
+							best_exclusive = p;
+				} else {
+					if (sync)
+						wake_up_process_synchronous(p);
+					else
+						wake_up_process(p);
+					if (state & mode & TASK_EXCLUSIVE)
+						break;
+				}
 			}
+			continue;
 		}
+
+		if (onwake && onwake(curr, mode, sync))
+			break;
 	}
 	if (best_exclusive)
 		best_exclusive->state = TASK_RUNNING;

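
To summarise the per-entry dispatch that the rewritten loop above now
performs, here is a stand-alone model in plain C (illustrative names;
the exclusive/CPU-affinity handling is elided):

#include <stdio.h>

struct waiter;
typedef int (*onwake_func_t)(struct waiter *, unsigned int, int);

struct waiter {
	const char *name;
	onwake_func_t onwake;
};

#define ONWAKE_DEFAULT ((onwake_func_t)-1)	/* classic wake-up path */

static int hooked(struct waiter *w, unsigned int mode, int sync)
{
	printf("callback runs for %s\n", w->name);
	return 0;			/* non-zero stops the scan */
}

int main(void)
{
	struct waiter queue[] = {
		{ "legacy",   ONWAKE_DEFAULT },	/* wake the task directly */
		{ "disarmed", NULL },		/* already fired: skipped */
		{ "hooked",   hooked },		/* callback replaces wake-up */
	};
	unsigned int i;

	for (i = 0; i < sizeof(queue) / sizeof(queue[0]); i++) {
		onwake_func_t onwake = queue[i].onwake;

		if (onwake == ONWAKE_DEFAULT)
			printf("direct wake of %s\n", queue[i].name);
		else if (onwake && onwake(&queue[i], 0, 0))
			break;
	}
	return 0;
}
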