The branch main has been updated by jamie:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=1bd74d201a534540614663686890ab96a3bbe2c7

commit 1bd74d201a534540614663686890ab96a3bbe2c7
Author:     Jamie Gritton <[email protected]>
AuthorDate: 2025-09-04 18:56:56 +0000
Commit:     Jamie Gritton <[email protected]>
CommitDate: 2025-09-04 18:56:56 +0000

    jail: add kqueue(2) support for jails
    
    Add kqueue tracking to jails, inspired by how it's done with processes.
    EVFILT_JAIL takes a jail ID, and tracks with NOTE_JAIL_SET,
    NOTE_JAIL_ATTACH, NOTE_JAIL_REMOVE, and NOTE_JAIL_CHILD. It also uses
    the NOTE_TRACK mechanism that EVFILT_PROC uses, using the same result
    flags (NOTE_CHILD and NOTE_TRACKERR).
    
    Relnotes:       yes
    Differential Revision:  https://reviews.freebsd.org/D51940
---
 lib/libsys/kqueue.2   |  58 ++++++++++++++++++++-
 sys/kern/kern_event.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++--
 sys/kern/kern_jail.c  |  64 ++++++++++++++++++++---
 sys/sys/event.h       |  19 +++++--
 sys/sys/jail.h        |   7 ++-
 5 files changed, 270 insertions(+), 17 deletions(-)

diff --git a/lib/libsys/kqueue.2 b/lib/libsys/kqueue.2
index d6e949baa24c..e413f7d4fbca 100644
--- a/lib/libsys/kqueue.2
+++ b/lib/libsys/kqueue.2
@@ -22,7 +22,7 @@
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
-.Dd March 26, 2023
+.Dd September 4, 2025
 .Dt KQUEUE 2
 .Os
 .Sh NAME
@@ -593,6 +593,62 @@ returns the number of times the signal has occurred since the last call to
 This filter automatically sets the
 .Dv EV_CLEAR
 flag internally.
+.It Dv EVFILT_JAIL
+Takes the jail ID to monitor as the identifier and the events to watch for
+in
+.Va fflags ,
+and returns when the jail performs one or more of the requested events.
+If a process can normally see a jail, it can attach an event to it.
+An identifier of zero will watch the process's own jail.
+The events to monitor are:
+.Bl -tag -width "Dv NOTE_JAIL_ATTACH"
+.It Dv NOTE_JAIL_SET
+The jail has been changed via
+.Xr jail_set 2 .
+.It Dv NOTE_JAIL_ATTACH
+A process has attached to the jail via
+.Xr jail_attach 2
+or a similar call.
+The process ID will be stored in
+.Va data .
+If more than one process has attached since the last call to
+.Fn kevent ,
+.Va data
+will contain the most recently attached process ID,
+with
+.Dv NOTE_JAIL_ATTACH_MULTI
+set in
+.Va fflags .
+.It Dv NOTE_JAIL_REMOVE
+The jail has been removed.
+.It Dv NOTE_JAIL_CHILD
+A child of the watched jail has been created.
+.It Dv NOTE_TRACK
+Follow child jails created under this jail.
+Register a new kevent to monitor the child jail using the same
+.Va fflags
+as the original event.
+The child jail will signal an event with
+.Dv NOTE_CHILD
+set in
+.Va fflags
+and the parent JID in
+.Va data .
+.Pp
+If registering a new kevent fails
+.Pq usually due to resource limitations ,
+it will signal an event with
+.Dv NOTE_TRACKERR
+set in
+.Va fflags ,
+and the child jail will not signal a
+.Dv NOTE_CHILD
+event.
+.El
+.Pp
+On return,
+.Va fflags
+contains the events which triggered the filter.
 .It Dv EVFILT_TIMER
 Establishes an arbitrary timer identified by
 .Va ident .
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index eb77a5064113..501adc151d44 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
@@ -50,6 +50,7 @@
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/fcntl.h>
+#include <sys/jail.h>
 #include <sys/kthread.h>
 #include <sys/selinfo.h>
 #include <sys/queue.h>
@@ -163,6 +164,9 @@ static int  filt_kqueue(struct knote *kn, long hint);
 static int     filt_procattach(struct knote *kn);
 static void    filt_procdetach(struct knote *kn);
 static int     filt_proc(struct knote *kn, long hint);
+static int     filt_jailattach(struct knote *kn);
+static void    filt_jaildetach(struct knote *kn);
+static int     filt_jail(struct knote *kn, long hint);
 static int     filt_fileattach(struct knote *kn);
 static void    filt_timerexpire(void *knx);
 static void    filt_timerexpire_l(struct knote *kn, bool proc_locked);
@@ -195,6 +199,12 @@ static const struct filterops proc_filtops = {
        .f_detach = filt_procdetach,
        .f_event = filt_proc,
 };
+static const struct filterops jail_filtops = { /* EVFILT_JAIL filter ops */
+       .f_isfd = 0,                    /* ident is a jail ID, not an fd */
+       .f_attach = filt_jailattach,    /* hook a knote onto a prison */
+       .f_detach = filt_jaildetach,    /* unhook it again */
+       .f_event = filt_jail,           /* record jail events in the knote */
+};
 static const struct filterops timer_filtops = {
        .f_isfd = 0,
        .f_attach = filt_timerattach,
@@ -365,6 +375,7 @@ static struct {
        [~EVFILT_USER] = { &user_filtops, 1 },
        [~EVFILT_SENDFILE] = { &null_filtops },
        [~EVFILT_EMPTY] = { &file_filtops, 1 },
+       [~EVFILT_JAIL] = { &jail_filtops, 1 },
 };
 
 /*
@@ -528,7 +539,8 @@ filt_proc(struct knote *kn, long hint)
  * process forked. Additionally, for each knote attached to the
  * parent, check whether user wants to track the new process. If so
  * attach a new knote to it, and immediately report an event with the
- * child's pid.
+ * child's pid. This is also called on jail creation, which is treated
+ * the same way by jail events.
  */
 void
 knote_fork(struct knlist *list, int pid)
@@ -555,6 +567,8 @@ knote_fork(struct knlist *list, int pid)
                /*
                 * The same as knote(), activate the event.
                 */
+               _Static_assert(NOTE_JAIL_CHILD == NOTE_FORK,
+                   "NOTE_JAIL_CHILD should be the same as NOTE_FORK");
                if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
                        if (kn->kn_fop->f_event(kn, NOTE_FORK))
                                KNOTE_ACTIVATE(kn, 1);
@@ -614,6 +628,124 @@ knote_fork(struct knlist *list, int pid)
        }
 }
 
+int
+filt_jailattach(struct knote *kn)
+{      /* Attach kn to the prison named by kn_id; 0 on success, else ENOENT. */
+       struct prison *pr;
+       bool immediate;
+
+       immediate = false;
+       if (kn->kn_id == 0) {
+               /* Let jid=0 watch the current prison (including prison0). */
+               pr = curthread->td_ucred->cr_prison;
+               mtx_lock(&pr->pr_mtx);
+       } else if (kn->kn_flags & (EV_FLAG1 | EV_FLAG2)) {
+               /*
+                * The kernel registers prisons before they are valid, so
+                * prison_find_child will fail; scan allprison by ID instead.
+                */
+               TAILQ_FOREACH(pr, &allprison, pr_list) {
+                       if (pr->pr_id < kn->kn_id)
+                               continue;
+                       if (pr->pr_id > kn->kn_id) { /* presumably ID-sorted */
+                               pr = NULL;
+                               break;
+                       }
+                       mtx_lock(&pr->pr_mtx);
+                       break;
+               }
+               if (pr == NULL)
+                       return (ENOENT);
+       } else {
+               sx_slock(&allprison_lock);
+               pr = prison_find_child(curthread->td_ucred->cr_prison,
+                   kn->kn_id);
+               sx_sunlock(&allprison_lock);
+               if (pr == NULL)
+                       return (ENOENT);
+               if (!prison_isalive(pr)) {
+                       mtx_unlock(&pr->pr_mtx); /* pr evidently came back locked */
+                       return (ENOENT);
+               }
+       }
+       kn->kn_ptr.p_prison = pr;
+       kn->kn_flags |= EV_CLEAR;       /* forced on for every jail knote */
+
+       /*
+        * Internal flag indicating registration done by kernel for the
+        * purposes of getting a NOTE_CHILD notification.
+        */
+       if (kn->kn_flags & EV_FLAG2) {
+               kn->kn_flags &= ~EV_FLAG2;
+               kn->kn_data = kn->kn_sdata;             /* parent id */
+               kn->kn_fflags = NOTE_CHILD;
+               kn->kn_sfflags &= ~NOTE_JAIL_CTRLMASK;  /* watches no events */
+               immediate = true; /* Force immediate activation of child note. */
+       }
+       /*
+        * Internal flag indicating registration done by kernel (for other than
+        * NOTE_CHILD).
+        */
+       if (kn->kn_flags & EV_FLAG1) {
+               kn->kn_flags &= ~EV_FLAG1;
+       }
+
+       knlist_add(pr->pr_klist, kn, 1);        /* pr_mtx is held on every path */
+
+       /* Immediately activate any child notes. */
+       if (immediate)
+               KNOTE_ACTIVATE(kn, 0);
+
+       mtx_unlock(&pr->pr_mtx);
+       return (0);
+}
+
+void
+filt_jaildetach(struct knote *kn)
+{      /* Take kn off its knote list if it still points at a prison. */
+       if (kn->kn_ptr.p_prison != NULL) {
+               knlist_remove(kn->kn_knlist, kn, 0);
+               kn->kn_ptr.p_prison = NULL;
+       } else  /* filt_jail() cleared p_prison on NOTE_JAIL_REMOVE */
+               kn->kn_status |= KN_DETACHED;
+}
+
+int
+filt_jail(struct knote *kn, long hint)
+{      /* Fold one jail event (hint) into kn; nonzero return means activate. */
+       struct prison *pr;
+       u_int event;
+
+       pr = kn->kn_ptr.p_prison;
+       if (pr == NULL) /* already activated, from attach filter */
+               return (0);
+
+       /* Mask off extra data. */
+       event = (u_int)hint & NOTE_JAIL_CTRLMASK;
+
+       /* If the user is interested in this event, record it. */
+       if (kn->kn_sfflags & event) {
+               kn->kn_fflags |= event;
+               /* Report the attached pid, but only for watched attaches. */
+               if (event == NOTE_JAIL_ATTACH) {
+                       if (kn->kn_data != 0)
+                               kn->kn_fflags |= NOTE_JAIL_ATTACH_MULTI;
+                       kn->kn_data = hint & NOTE_JAIL_DATAMASK;
+               }
+       }
+
+       /* Prison is gone, so flag the event as finished. */
+       if (event == NOTE_JAIL_REMOVE) {
+               kn->kn_flags |= EV_EOF | EV_ONESHOT;
+               kn->kn_ptr.p_prison = NULL;     /* filt_jaildetach keys on this */
+               if (kn->kn_fflags == 0)
+                       kn->kn_flags |= EV_DROP;
+               return (1);
+       }
+
+       return (kn->kn_fflags != 0);
+}
+
 /*
  * XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
  * interval timer support code.
@@ -1597,8 +1729,8 @@ findkn:
                /*
                 * If possible, find an existing knote to use for this kevent.
                 */
-               if (kev->filter == EVFILT_PROC &&
-                   (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
+               if ((kev->filter == EVFILT_PROC || kev->filter == EVFILT_JAIL)
+                   && (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
                        /* This is an internal creation of a process tracking
                         * note. Don't attempt to coalesce this with an
                         * existing note.
@@ -2800,6 +2932,7 @@ knote_init(void)
        knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
            NULL, NULL, UMA_ALIGN_PTR, 0);
        ast_register(TDA_KQUEUE, ASTR_ASTF_REQUIRED, 0, ast_kqueue);
+       prison0.pr_klist = knlist_alloc(&prison0.pr_mtx);
 }
 SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
 
diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
index 7c9a15ae18f3..52210553016b 100644
--- a/sys/kern/kern_jail.c
+++ b/sys/kern/kern_jail.c
@@ -45,6 +45,7 @@
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/epoch.h>
+#include <sys/event.h>
 #include <sys/taskqueue.h>
 #include <sys/fcntl.h>
 #include <sys/jail.h>
@@ -154,7 +155,8 @@ static void prison_complete(void *context, int pending);
 static void prison_deref(struct prison *pr, int flags);
 static void prison_deref_kill(struct prison *pr, struct prisonlist 
*freeprison);
 static int prison_lock_xlock(struct prison *pr, int flags);
-static void prison_cleanup(struct prison *pr);
+static void prison_cleanup_locked(struct prison *pr);
+static void prison_cleanup_unlocked(struct prison *pr);
 static void prison_free_not_last(struct prison *pr);
 static void prison_proc_free_not_last(struct prison *pr);
 static void prison_proc_relink(struct prison *opr, struct prison *npr,
@@ -167,6 +169,7 @@ static void prison_racct_attach(struct prison *pr);
 static void prison_racct_modify(struct prison *pr);
 static void prison_racct_detach(struct prison *pr);
 #endif
+static void prison_knote(struct prison *pr, long hint);
 
 /* Flags for prison_deref */
 #define        PD_DEREF        0x01    /* Decrement pr_ref */
@@ -1018,6 +1021,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
        int ip6s;
        bool redo_ip6;
 #endif
+       bool maybe_changed;
        uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
        uint64_t pr_allow_diff;
        unsigned tallow;
@@ -1422,6 +1426,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
        pr = NULL;
        inspr = NULL;
        deadpr = NULL;
+       maybe_changed = false;
        if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
                namelc = strrchr(name, '.');
                jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
@@ -1643,6 +1648,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
                LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
                for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
                        tpr->pr_childcount++;
+               pr->pr_klist = knlist_alloc(&pr->pr_mtx);
 
                /* Set some default values, and inherit some from the parent. */
                if (namelc == NULL)
@@ -1880,6 +1886,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
                        goto done_deref;
                }
        }
+       maybe_changed = true;
 
        /* Set the parameters of the prison. */
 #ifdef INET
@@ -2112,7 +2119,12 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
         * reference via persistence, or is about to gain one via attachment.
         */
        if (created) {
-               drflags = prison_lock_xlock(pr, drflags);
+               sx_assert(&allprison_lock, SX_XLOCKED);
+               mtx_lock(&ppr->pr_mtx);
+               knote_fork(ppr->pr_klist, pr->pr_id);
+               mtx_unlock(&ppr->pr_mtx);
+               mtx_lock(&pr->pr_mtx);
+               drflags |= PD_LOCKED;
                pr->pr_state = PRISON_STATE_ALIVE;
        }
 
@@ -2150,6 +2162,13 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
        td->td_retval[0] = pr->pr_id;
 
  done_deref:
+       /*
+        * Report changes to kevent.  This can happen even if the
+        * system call fails, as changes might have been made before
+        * the failure.
+        */
+       if (maybe_changed && !created)
+               prison_knote(pr, NOTE_JAIL_SET);
        /* Release any temporary prison holds and/or locks. */
        if (pr != NULL)
                prison_deref(pr, drflags);
@@ -2755,6 +2774,7 @@ do_jail_attach(struct thread *td, struct prison *pr, int drflags)
        prison_proc_relink(oldcred->cr_prison, pr, p);
        prison_deref(oldcred->cr_prison, drflags);
        crfree(oldcred);
+       prison_knote(pr, NOTE_JAIL_ATTACH | td->td_proc->p_pid);
 
        /*
         * If the prison was killed while changing credentials, die along
@@ -3182,9 +3202,10 @@ prison_deref(struct prison *pr, int flags)
                                            refcount_load(&prison0.pr_uref) > 0,
                                            ("prison0 pr_uref=0"));
                                        pr->pr_state = PRISON_STATE_DYING;
+                                       prison_cleanup_locked(pr);
                                        mtx_unlock(&pr->pr_mtx);
                                        flags &= ~PD_LOCKED;
-                                       prison_cleanup(pr);
+                                       prison_cleanup_unlocked(pr);
                                }
                        }
                }
@@ -3327,8 +3348,9 @@ prison_deref_kill(struct prison *pr, struct prisonlist *freeprison)
                }
                if (!(cpr->pr_flags & PR_REMOVE))
                        continue;
-               prison_cleanup(cpr);
+               prison_cleanup_unlocked(cpr);
                mtx_lock(&cpr->pr_mtx);
+               prison_cleanup_locked(cpr);
                cpr->pr_flags &= ~PR_REMOVE;
                if (cpr->pr_flags & PR_PERSIST) {
                        cpr->pr_flags &= ~PR_PERSIST;
@@ -3363,8 +3385,9 @@ prison_deref_kill(struct prison *pr, struct prisonlist *freeprison)
        if (rpr != NULL)
                LIST_REMOVE(rpr, pr_sibling);
 
-       prison_cleanup(pr);
+       prison_cleanup_unlocked(pr);
        mtx_lock(&pr->pr_mtx);
+       prison_cleanup_locked(pr);
        if (pr->pr_flags & PR_PERSIST) {
                pr->pr_flags &= ~PR_PERSIST;
                prison_proc_free_not_last(pr);
@@ -3411,10 +3434,21 @@ prison_lock_xlock(struct prison *pr, int flags)
 
 /*
  * Release a prison's resources when it starts dying (when the last user
- * reference is dropped, or when it is killed).
+ * reference is dropped, or when it is killed).  Two functions are called,
+ * for work that requires a locked prison or an unlocked one.
  */
 static void
-prison_cleanup(struct prison *pr)
+prison_cleanup_locked(struct prison *pr)
+{      /* Dying-prison cleanup that needs pr_mtx held: knote teardown. */
+       sx_assert(&allprison_lock, SA_XLOCKED);
+       mtx_assert(&pr->pr_mtx, MA_OWNED);
+       prison_knote(pr, NOTE_JAIL_REMOVE);     /* tell watchers first */
+       knlist_detach(pr->pr_klist);    /* NOTE(review): unchecked; confirm never NULL here */
+       pr->pr_klist = NULL;    /* later prison_knote() calls pass a NULL list */
+}
+
+static void
+prison_cleanup_unlocked(struct prison *pr)
 {
        sx_assert(&allprison_lock, SA_XLOCKED);
        mtx_assert(&pr->pr_mtx, MA_NOTOWNED);
@@ -5039,6 +5073,22 @@ prison_racct_detach(struct prison *pr)
 }
 #endif /* RACCT */
 
+/*
+ * Submit a knote for a prison, taking pr_mtx unless the caller holds it.
+ */
+static void
+prison_knote(struct prison *pr, long hint)
+{
+       int locked;
+
+       locked = mtx_owned(&pr->pr_mtx);        /* remember caller's lock state */
+       if (!locked)
+               mtx_lock(&pr->pr_mtx);
+       KNOTE_LOCKED(pr->pr_klist, hint);       /* hint: NOTE_JAIL_* bit, plus pid on attach */
+       if (!locked)
+               mtx_unlock(&pr->pr_mtx);        /* leave the lock as we found it */
+}
+
 #ifdef DDB
 
 static void
diff --git a/sys/sys/event.h b/sys/sys/event.h
index 1b30e4292de8..f161d2c938c1 100644
--- a/sys/sys/event.h
+++ b/sys/sys/event.h
@@ -45,7 +45,8 @@
 #define EVFILT_USER            (-11)   /* User events */
 #define EVFILT_SENDFILE                (-12)   /* attached to sendfile requests */
 #define EVFILT_EMPTY           (-13)   /* empty send socket buf */
-#define EVFILT_SYSCOUNT                13
+#define EVFILT_JAIL            (-14)   /* attached to struct prison */
+#define EVFILT_SYSCOUNT                14
 
 #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
 #define        EV_SET(kevp_, a, b, c, d, e, f) do {    \
@@ -204,10 +205,19 @@ struct freebsd11_kevent32 {
 #define        NOTE_PCTRLMASK  0xf0000000              /* mask for hint bits */
 #define        NOTE_PDATAMASK  0x000fffff              /* mask for pid */
 
-/* additional flags for EVFILT_PROC */
-#define        NOTE_TRACK      0x00000001              /* follow across forks */
+/* data/hint flags for EVFILT_JAIL */
+#define        NOTE_JAIL_SET           0x80000000      /* jail was modified */
+#define        NOTE_JAIL_CHILD         0x40000000      /* child jail was created */
+#define        NOTE_JAIL_ATTACH        0x20000000      /* jail was attached to */
+#define        NOTE_JAIL_REMOVE        0x10000000      /* jail was removed */
+#define NOTE_JAIL_ATTACH_MULTI 0x08000000      /* multiple procs attached */
+#define        NOTE_JAIL_CTRLMASK      0xf0000000      /* mask for hint bits */
+#define        NOTE_JAIL_DATAMASK      0x000fffff      /* mask for pid */
+
+/* additional flags for EVFILT_PROC and EVFILT_JAIL */
+#define        NOTE_TRACK      0x00000001              /* follow across fork/create */
 #define        NOTE_TRACKERR   0x00000002              /* could not track child */
-#define        NOTE_CHILD      0x00000004              /* am a child process */
+#define        NOTE_CHILD      0x00000004              /* am a child process/jail */
 
 /* additional flags for EVFILT_TIMER */
 #define NOTE_SECONDS           0x00000001      /* data is seconds */
@@ -309,6 +319,7 @@ struct knote {
                struct          proc *p_proc;   /* proc pointer */
                struct          kaiocb *p_aio;  /* AIO job pointer */
                struct          aioliojob *p_lio;       /* LIO job pointer */
+               struct          prison *p_prison;       /* prison pointer */
                void            *p_v;           /* generic other pointer */
        } kn_ptr;
        const struct            filterops *kn_fop;
diff --git a/sys/sys/jail.h b/sys/sys/jail.h
index d2655c52e832..4512e43a2866 100644
--- a/sys/sys/jail.h
+++ b/sys/sys/jail.h
@@ -144,6 +144,7 @@ MALLOC_DECLARE(M_PRISON);
 #define        JAIL_META_PRIVATE       "meta"
 #define        JAIL_META_SHARED        "env"
 
+struct knlist;
 struct racct;
 struct prison_racct;
 
@@ -189,7 +190,8 @@ struct prison {
        struct vnode    *pr_root;                       /* (c) vnode to rdir */
        struct prison_ip  *pr_addrs[PR_FAMILY_MAX];     /* (p,n) IPs of jail */
        struct prison_racct *pr_prison_racct;           /* (c) racct jail proxy */
-       void            *pr_sparep[3];
+       struct knlist   *pr_klist;                      /* (m) attached knotes */
+       void            *pr_sparep[2];
        int              pr_childcount;                 /* (a) number of child jails */
        int              pr_childmax;                   /* (p) maximum child jails */
        unsigned         pr_allow;                      /* (p) PR_ALLOW_* flags */
@@ -425,10 +427,11 @@ SYSCTL_DECL(_security_jail_param);
 /*
  * Kernel support functions for jail().
  */
-struct ucred;
+struct knote;
 struct mount;
 struct sockaddr;
 struct statfs;
+struct ucred;
 struct vfsconf;
 
 /*

Reply via email to