On Sat, Nov 06, 2021 at 06:23:27PM +0100, Martin Pieuchot wrote:
> On 06/11/21(Sat) 15:53, Visa Hankala wrote:
> > On Fri, Nov 05, 2021 at 10:04:50AM +0100, Martin Pieuchot wrote:
> > > New poll/select(2) implementation convert 'struct pollfd' and 'fdset' to
> > > knotes (kqueue event descriptors) then pass them to the kqueue subsystem.
> > > A knote is allocated, with kqueue_register(), for every read, write and
> > > except condition watched on a given FD. That means at most 3 allocations
> > > might be necessary per FD.
> > >
> > > The diff below reduce the overhead of per-syscall allocation/free of those
> > > descriptors by leaving those which didn't trigger on the kqueue across
> > > syscall. Leaving knotes on the kqueue allows kqueue_register() to re-use
> > > existing descriptor instead of re-allocating a new one.
> > >
> > > With this knotes are now lazily removed. The mechanism uses a serial
> > > number which is incremented for every syscall that indicates if a knote
> > > sitting in the kqueue is still valid or should be freed.
> > >
> > > Note that performance improvements might not be visible with this diff
> > > alone because kqueue_register() still pre-allocate a descriptor then drop
> > > it.
> > >
> > > visa@ already pointed out that the lazy removal logic could be integrated
> > > in kqueue_scan() which would reduce the complexity of those two syscalls.
> > > I'm arguing for doing this in a next step in-tree.
> >
> > I think it would make more sense to add the removal logic to the scan
> > function first as doing so would keep the code modifications more
> > logical and simpler. This would also avoid the need to go through
> > a temporary removal approach.
>
> I totally support your effort and your design however I don't have the
> time to do another round of test/debugging. So please, can you take
> care of doing these cleanups afterward? If not, please send a full diff
> and take over this feature, it's too much effort for me to work out of
> tree.
Here is a full diff that does lazy removal in kqueue_scan().
The implementation clears all kqpoll knotes when p_kq_serial is about
to wrap. This keeps the code relatively simple because any knote with
a serial less than p_kq_serial is expired.
With current speeds of hardware, serial wraparound should not happen
when p_kq_serial is 64 bits wide. However, long-running and busy
processes might experience wraparounds on 32-bit hardware.
p_kq_serial is still initialized with a 32-bit random number. It is not
clear to me whether this gives any real benefit over a constant initializer.
With native kqueue, kev->udata values are user-controlled. With poll and
select, the user cannot directly manipulate the knotes, but the udata
values are observable through ktrace. Going for 64-bit initial random
values on 64-bit systems is an option as well, though at the cost of
increased noise in ktrace output.
Index: kern/kern_event.c
===================================================================
RCS file: src/sys/kern/kern_event.c,v
retrieving revision 1.170
diff -u -p -r1.170 kern_event.c
--- kern/kern_event.c 6 Nov 2021 05:48:47 -0000 1.170
+++ kern/kern_event.c 7 Nov 2021 12:31:12 -0000
@@ -73,6 +73,7 @@ void kqueue_terminate(struct proc *p, st
void KQREF(struct kqueue *);
void KQRELE(struct kqueue *);
+void kqueue_purge(struct proc *, struct kqueue *);
int kqueue_sleep(struct kqueue *, struct timespec *);
int kqueue_read(struct file *, struct uio *, int);
@@ -763,29 +764,55 @@ filter_process(struct knote *kn, struct
return (active);
}
+/*
+ * Initialize the current thread for poll/select system call.
+ * num indicates the number of serials that the system call may utilize.
+ * After this function, the valid range of serials is
+ * p_kq_serial <= x < p_kq_serial + num.
+ */
void
-kqpoll_init(void)
+kqpoll_init(unsigned int num)
{
struct proc *p = curproc;
struct filedesc *fdp;
- if (p->p_kq != NULL) {
- /*
- * Discard any badfd knotes that have been enqueued after
- * previous scan.
- * This prevents them from accumulating in case
- * scan does not make progress for some reason.
- */
- kqpoll_dequeue(p, 0);
- return;
+ if (p->p_kq == NULL) {
+ p->p_kq = kqueue_alloc(p->p_fd);
+ p->p_kq_serial = arc4random();
+ fdp = p->p_fd;
+ fdplock(fdp);
+ LIST_INSERT_HEAD(&fdp->fd_kqlist, p->p_kq, kq_next);
+ fdpunlock(fdp);
}
- p->p_kq = kqueue_alloc(p->p_fd);
- p->p_kq_serial = arc4random();
- fdp = p->p_fd;
- fdplock(fdp);
- LIST_INSERT_HEAD(&fdp->fd_kqlist, p->p_kq, kq_next);
- fdpunlock(fdp);
+ if (p->p_kq_serial + num < p->p_kq_serial) {
+ /* Serial is about to wrap. Clear all attached knotes. */
+ kqueue_purge(p, p->p_kq);
+ p->p_kq_serial = 0;
+ }
+
+ /*
+ * Discard any detached knotes that have been enqueued after
+ * previous scan.
+ * This prevents them from accumulating in case
+ * scan does not make progress for some reason.
+ */
+ kqpoll_dequeue(p, 0);
+}
+
+/*
+ * Finish poll/select system call.
+ * num must have the same value that was used with kqpoll_init().
+ */
+void
+kqpoll_done(unsigned int num)
+{
+ struct proc *p = curproc;
+
+ KASSERT(p->p_kq != NULL);
+ KASSERT(p->p_kq_serial + num >= p->p_kq_serial);
+
+ p->p_kq_serial += num;
}
void
@@ -1383,6 +1410,15 @@ retry:
mtx_leave(&kq->kq_lock);
+ /* Drop expired kqpoll knotes. */
+ if (p->p_kq == kq &&
+ p->p_kq_serial > (unsigned long)kn->kn_udata) {
+ filter_detach(kn);
+ knote_drop(kn, p);
+ mtx_enter(&kq->kq_lock);
+ continue;
+ }
+
memset(kevp, 0, sizeof(*kevp));
if (filter_process(kn, kevp) == 0) {
mtx_enter(&kq->kq_lock);
Index: kern/sys_generic.c
===================================================================
RCS file: src/sys/kern/sys_generic.c,v
retrieving revision 1.139
diff -u -p -r1.139 sys_generic.c
--- kern/sys_generic.c 29 Oct 2021 15:52:44 -0000 1.139
+++ kern/sys_generic.c 7 Nov 2021 12:31:12 -0000
@@ -633,7 +633,7 @@ dopselect(struct proc *p, int nd, fd_set
pobits[2] = (fd_set *)&bits[5];
}
- kqpoll_init();
+ kqpoll_init(nd);
#define getbits(name, x) \
if (name && (error = copyin(name, pibits[x], ni))) \
@@ -730,8 +730,7 @@ done:
if (pibits[0] != (fd_set *)&bits[0])
free(pibits[0], M_TEMP, 6 * ni);
- kqueue_purge(p, p->p_kq);
- p->p_kq_serial += nd;
+ kqpoll_done(nd);
return (error);
}
@@ -759,7 +758,7 @@ pselregister(struct proc *p, fd_set *pib
DPRINTFN(2, "select fd %d mask %d serial %lu\n",
fd, msk, p->p_kq_serial);
EV_SET(&kev, fd, evf[msk],
- EV_ADD|EV_ENABLE|EV_ONESHOT|__EV_POLL,
+ EV_ADD|EV_ENABLE|__EV_POLL,
evff[msk], 0, (void *)(p->p_kq_serial));
#ifdef KTRACE
if (KTRPOINT(p, KTR_STRUCT))
@@ -804,13 +803,10 @@ int
pselcollect(struct proc *p, struct kevent *kevp, fd_set *pobits[3],
int *ncollected)
{
- /* Filter out and lazily delete spurious events */
if ((unsigned long)kevp->udata != p->p_kq_serial) {
- DPRINTFN(0, "select fd %u mismatched serial %lu\n",
- (int)kevp->ident, p->p_kq_serial);
- kevp->flags = EV_DISABLE|EV_DELETE;
- kqueue_register(p->p_kq, kevp, p);
- return (0);
+ panic("%s: spurious kevp %p fd %d udata 0x%lx serial 0x%lx",
+ __func__, kevp, (int)kevp->ident,
+ (unsigned long)kevp->udata, p->p_kq_serial);
}
if (kevp->flags & EV_ERROR) {
@@ -1001,14 +997,14 @@ ppollregister(struct proc *p, struct pol
kevp = kev;
if (pl[i].events & (POLLIN | POLLRDNORM)) {
EV_SET(kevp, pl[i].fd, EVFILT_READ,
- EV_ADD|EV_ENABLE|EV_ONESHOT|__EV_POLL, 0, 0,
+ EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
(void *)(p->p_kq_serial + i));
nkev++;
kevp++;
}
if (pl[i].events & (POLLOUT | POLLWRNORM)) {
EV_SET(kevp, pl[i].fd, EVFILT_WRITE,
- EV_ADD|EV_ENABLE|EV_ONESHOT|__EV_POLL, 0, 0,
+ EV_ADD|EV_ENABLE|__EV_POLL, 0, 0,
(void *)(p->p_kq_serial + i));
nkev++;
kevp++;
@@ -1017,7 +1013,7 @@ ppollregister(struct proc *p, struct pol
int evff = forcehup ? 0 : NOTE_OOB;
EV_SET(kevp, pl[i].fd, EVFILT_EXCEPT,
- EV_ADD|EV_ENABLE|EV_ONESHOT|__EV_POLL, evff, 0,
+ EV_ADD|EV_ENABLE|__EV_POLL, evff, 0,
(void *)(p->p_kq_serial + i));
nkev++;
kevp++;
@@ -1143,7 +1139,7 @@ doppoll(struct proc *p, struct pollfd *f
return (EINVAL);
}
- kqpoll_init();
+ kqpoll_init(nfds);
sz = nfds * sizeof(*pl);
@@ -1230,8 +1226,7 @@ bad:
if (pl != pfds)
free(pl, M_TEMP, sz);
- kqueue_purge(p, p->p_kq);
- p->p_kq_serial += nfds;
+ kqpoll_done(nfds);
return (error);
}
@@ -1248,25 +1243,15 @@ ppollcollect(struct proc *p, struct keve
/* Extract poll array index */
i = (unsigned long)kevp->udata - p->p_kq_serial;
- /*
- * Lazily delete spurious events.
- *
- * This should not happen as long as kqueue_purge() is called
- * at the end of every syscall. It migh be interesting to do
- * like DragonFlyBSD and not always allocated a new knote in
- * kqueue_register() with that lazy removal makes sense.
- */
if (i >= nfds) {
- DPRINTFN(0, "poll get out of range udata %lu vs serial %lu\n",
+ panic("%s: spurious kevp %p nfds %u udata 0x%lx serial 0x%lx",
+ __func__, kevp, nfds,
(unsigned long)kevp->udata, p->p_kq_serial);
- kevp->flags = EV_DISABLE|EV_DELETE;
- kqueue_register(p->p_kq, kevp, p);
- return (0);
}
if ((int)kevp->ident != pl[i].fd) {
- DPRINTFN(0, "poll get %lu/%d mismatch fd %u!=%d serial %lu\n",
- i+1, nfds, (int)kevp->ident, pl[i].fd, p->p_kq_serial);
- return (0);
+ panic("%s: kevp %p %lu/%d mismatch fd %d!=%d serial 0x%lx",
+ __func__, kevp, i + 1, nfds, (int)kevp->ident, pl[i].fd,
+ p->p_kq_serial);
}
/*
Index: sys/event.h
===================================================================
RCS file: src/sys/sys/event.h,v
retrieving revision 1.57
diff -u -p -r1.57 event.h
--- sys/event.h 24 Oct 2021 07:02:47 -0000 1.57
+++ sys/event.h 7 Nov 2021 12:31:12 -0000
@@ -287,7 +287,8 @@ extern const struct filterops sig_filtop
extern const struct filterops dead_filtops;
extern const struct klistops socket_klistops;
-extern void kqpoll_init(void);
+extern void kqpoll_init(unsigned int);
+extern void kqpoll_done(unsigned int);
extern void kqpoll_exit(void);
extern void knote(struct klist *list, long hint);
extern void knote_fdclose(struct proc *p, int fd);
@@ -302,7 +303,6 @@ extern int kqueue_scan(struct kqueue_sca
struct timespec *, struct proc *, int *);
extern void kqueue_scan_setup(struct kqueue_scan_state *, struct kqueue *);
extern void kqueue_scan_finish(struct kqueue_scan_state *);
-extern void kqueue_purge(struct proc *, struct kqueue *);
extern int filt_seltrue(struct knote *kn, long hint);
extern int seltrue_kqfilter(dev_t, struct knote *);
extern void klist_init(struct klist *, const struct klistops *, void *);