The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=f1f230439fa48581f40a57f095627f667a9713c3

commit f1f230439fa48581f40a57f095627f667a9713c3
Author:     Mark Johnston <ma...@freebsd.org>
AuthorDate: 2025-07-03 20:07:45 +0000
Commit:     Mark Johnston <ma...@freebsd.org>
CommitDate: 2025-07-04 14:42:33 +0000

    vfs: Initial revision of inotify
    
    Add an implementation of inotify_init(), inotify_add_watch(), and
    inotify_rm_watch(), source-compatible with Linux.  This provides
    functionality similar to kevent(2)'s EVFILT_VNODE, i.e., it lets
    applications monitor filesystem files for accesses.  Compared to
    inotify, however, EVFILT_VNODE has the limitation of requiring the
    application to open the file to be monitored.  This means that activity
    on a newly created file cannot be monitored reliably, and that a file
    descriptor per file in the hierarchy is required.
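
    As an illustration of that limitation, a minimal EVFILT_VNODE sketch
    using the standard kevent(2) interface (not part of this change) is
    shown below; the target file must already exist and must be held open
    for as long as it is monitored:

        #include <sys/types.h>
        #include <sys/event.h>
        #include <err.h>
        #include <fcntl.h>

        /* Watch one already-existing file for writes via kevent(2). */
        int
        watch_one_file(const char *path)
        {
                struct kevent kev;
                int fd, kq;

                kq = kqueue();
                if (kq < 0)
                        err(1, "kqueue");
                /* One open descriptor is needed per monitored file. */
                fd = open(path, O_RDONLY);
                if (fd < 0)
                        err(1, "open");
                EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
                    NOTE_WRITE, 0, NULL);
                if (kevent(kq, &kev, 1, NULL, 0, NULL) < 0)
                        err(1, "kevent");
                return (kq);
        }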
    
    inotify on the other hand allows a directory and its entries to be
    monitored at once.  It introduces a new file descriptor type to which
    "watches" can be attached; a watch is a pseudo-file descriptor
    associated with a file or directory and a set of events to watch for.
    When a watched vnode is accessed, a description of the event is queued
    to the inotify descriptor, readable with read(2).  Events for files in a
    watched directory include the file name.
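
    For illustration, a minimal userspace sketch of the new interface (not
    part of this change) is shown below; the event-flag names and the
    struct inotify_event layout follow the Linux-compatible API, and the
    watched path is only a placeholder:

        #include <sys/inotify.h>
        #include <err.h>
        #include <stdio.h>
        #include <unistd.h>

        int
        main(void)
        {
                /* Keep the buffer aligned for struct inotify_event. */
                char buf[4096] __attribute__((aligned(8)));
                ssize_t n;
                int fd, wd;

                fd = inotify_init();
                if (fd < 0)
                        err(1, "inotify_init");
                wd = inotify_add_watch(fd, "/tmp", IN_CREATE | IN_MODIFY);
                if (wd < 0)
                        err(1, "inotify_add_watch");

                /* Each read(2) returns one or more variable-length records. */
                while ((n = read(fd, buf, sizeof(buf))) > 0) {
                        for (char *p = buf; p < buf + n;) {
                                struct inotify_event *ev;

                                ev = (struct inotify_event *)p;
                                printf("wd %d mask %#x name %s\n", ev->wd,
                                    (unsigned)ev->mask,
                                    ev->len > 0 ? ev->name : "");
                                p += sizeof(*ev) + ev->len;
                        }
                }
                return (0);
        }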
    
    A watched vnode has its usecount bumped, so name cache entries
    originating from a watched directory are not evicted.  Name cache
    entries are used to populate inotify events for files with a link in a
    watched directory.  In particular, if a file is accessed with, say,
    read(2), an IN_ACCESS event will be generated for any watched hard link
    of the file.
    
    The inotify_add_watch_at() variant is included so that this
    functionality is available in capability mode; plain inotify_add_watch()
    is disallowed in capability mode.
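
    A hypothetical capability-mode flow might look like the sketch below,
    assuming a libc wrapper whose signature matches the
    inotify_add_watch_at() entry added to syscalls.master in this change;
    directory descriptors are opened before entering the sandbox:

        #include <sys/capsicum.h>
        #include <sys/inotify.h>
        #include <err.h>
        #include <fcntl.h>

        static int
        sandboxed_watch(void)
        {
                int dfd, ifd, wd;

                ifd = inotify_init();
                if (ifd < 0)
                        err(1, "inotify_init");
                /* "/var/log" and "messages" are placeholder names. */
                dfd = open("/var/log", O_RDONLY | O_DIRECTORY);
                if (dfd < 0)
                        err(1, "open");
                if (cap_enter() != 0)
                        err(1, "cap_enter");
                /* Plain inotify_add_watch() would be rejected here. */
                wd = inotify_add_watch_at(ifd, dfd, "messages", IN_MODIFY);
                if (wd < 0)
                        err(1, "inotify_add_watch_at");
                return (wd);
        }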
    
    When a file in a nullfs mount is watched, the watch is attached to the
    lower vnode, such that accesses via either layer generate inotify
    events.
    
    Many thanks to Gleb Popov for testing this patch and finding lots of
    bugs.
    
    PR:             258010, 215011
    Reviewed by:    kib
    Tested by:      arrowd
    MFC after:      3 months
    Sponsored by:   Klara, Inc.
    Differential Revision:  https://reviews.freebsd.org/D50315
---
 share/man/man4/rights.4    |   10 +-
 sys/bsm/audit_kevents.h    |    1 +
 sys/conf/files             |    1 +
 sys/fs/nullfs/null_subr.c  |    4 +
 sys/fs/nullfs/null_vnops.c |   29 +-
 sys/kern/kern_resource.c   |   21 +
 sys/kern/subr_capability.c |    4 +
 sys/kern/sys_generic.c     |   35 +-
 sys/kern/syscalls.master   |   15 +
 sys/kern/vfs_cache.c       |   59 +++
 sys/kern/vfs_default.c     |   17 +
 sys/kern/vfs_inotify.c     | 1008 ++++++++++++++++++++++++++++++++++++++++++++
 sys/kern/vfs_subr.c        |    7 +-
 sys/kern/vfs_vnops.c       |    3 +-
 sys/kern/vnode_if.src      |   21 +
 sys/sys/caprights.h        |    2 +
 sys/sys/capsicum.h         |    8 +-
 sys/sys/exterr_cat.h       |    1 +
 sys/sys/file.h             |    1 +
 sys/sys/inotify.h          |  146 +++++++
 sys/sys/resourcevar.h      |    4 +
 sys/sys/specialfd.h        |    5 +
 sys/sys/user.h             |    5 +
 sys/sys/vnode.h            |   12 +-
 sys/tools/vnode_if.awk     |    1 +
 25 files changed, 1405 insertions(+), 15 deletions(-)

diff --git a/share/man/man4/rights.4 b/share/man/man4/rights.4
index 0c24f6b45f88..8f5f6ad9c2d2 100644
--- a/share/man/man4/rights.4
+++ b/share/man/man4/rights.4
@@ -30,7 +30,7 @@
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
-.Dd May 1, 2024
+.Dd May 22, 2025
 .Dt RIGHTS 4
 .Os
 .Sh NAME
@@ -319,6 +319,14 @@ Permit
 .It Dv CAP_GETSOCKOPT
 Permit
 .Xr getsockopt 2 .
+.It Dv CAP_INOTIFY_ADD
+Permit
+.Xr inotify_add_watch 2
+and
+.Xr inotify_add_watch_at 2 .
+.It Dv CAP_INOTIFY_RM
+Permit
+.Xr inotify_rm_watch 2 .
 .It Dv CAP_IOCTL
 Permit
 .Xr ioctl 2 .
diff --git a/sys/bsm/audit_kevents.h b/sys/bsm/audit_kevents.h
index 0f110d5f9ddd..9381396f247c 100644
--- a/sys/bsm/audit_kevents.h
+++ b/sys/bsm/audit_kevents.h
@@ -663,6 +663,7 @@
 #define        AUE_FSPACECTL           43269   /* FreeBSD-specific. */
 #define        AUE_TIMERFD             43270   /* FreeBSD/Linux. */
 #define        AUE_SETCRED             43271   /* FreeBSD-specific. */
+#define        AUE_INOTIFY             43272   /* FreeBSD/Linux. */
 
 /*
  * Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the
diff --git a/sys/conf/files b/sys/conf/files
index f6d473b1431b..dd6f9a3021d4 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3992,6 +3992,7 @@ kern/vfs_export.c         standard
 kern/vfs_extattr.c             standard
 kern/vfs_hash.c                        standard
 kern/vfs_init.c                        standard
+kern/vfs_inotify.c             standard
 kern/vfs_lookup.c              standard
 kern/vfs_mount.c               standard
 kern/vfs_mountroot.c           standard
diff --git a/sys/fs/nullfs/null_subr.c b/sys/fs/nullfs/null_subr.c
index 0356877eaf05..7dcc83880bb9 100644
--- a/sys/fs/nullfs/null_subr.c
+++ b/sys/fs/nullfs/null_subr.c
@@ -245,6 +245,10 @@ null_nodeget(struct mount *mp, struct vnode *lowervp, struct vnode **vpp)
                vp->v_object = lowervp->v_object;
                vn_irflag_set(vp, VIRF_PGREAD);
        }
+       if ((vn_irflag_read(lowervp) & VIRF_INOTIFY) != 0)
+               vn_irflag_set(vp, VIRF_INOTIFY);
+       if ((vn_irflag_read(lowervp) & VIRF_INOTIFY_PARENT) != 0)
+               vn_irflag_set(vp, VIRF_INOTIFY_PARENT);
        if (lowervp == MOUNTTONULLMOUNT(mp)->nullm_lowerrootvp)
                vp->v_vflag |= VV_ROOT;
 
diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c
index 8608216e10e5..74c1a8f3acb6 100644
--- a/sys/fs/nullfs/null_vnops.c
+++ b/sys/fs/nullfs/null_vnops.c
@@ -189,6 +189,26 @@ static int null_bug_bypass = 0;   /* for debugging: enables bypass printf'ing */
 SYSCTL_INT(_debug, OID_AUTO, nullfs_bug_bypass, CTLFLAG_RW, 
        &null_bug_bypass, 0, "");
 
+/*
+ * Synchronize inotify flags with the lower vnode:
+ * - If the upper vnode has the flag set and the lower does not, then the lower
+ *   vnode is unwatched and the upper vnode does not need to go through
+ *   VOP_INOTIFY.
+ * - If the lower vnode is watched, then the upper vnode should go through
+ *   VOP_INOTIFY, so copy the flag up.
+ */
+static void
+null_copy_inotify(struct vnode *vp, struct vnode *lvp, short flag)
+{
+       if ((vn_irflag_read(vp) & flag) != 0) {
+               if (__predict_false((vn_irflag_read(lvp) & flag) == 0))
+                       vn_irflag_unset(vp, flag);
+       } else if ((vn_irflag_read(lvp) & flag) != 0) {
+               if (__predict_false((vn_irflag_read(vp) & flag) == 0))
+                       vn_irflag_set(vp, flag);
+       }
+}
+
 /*
  * This is the 10-Apr-92 bypass routine.
  *    This version has been optimized for speed, throwing away some
@@ -305,7 +325,10 @@ null_bypass(struct vop_generic_args *ap)
                        lvp = *(vps_p[i]);
 
                        /*
-                        * Get rid of the transient hold on lvp.
+                        * Get rid of the transient hold on lvp.  Copy inotify
+                        * flags up in case something is watching the lower
+                        * layer.
+                        *
                         * If lowervp was unlocked during VOP
                         * operation, nullfs upper vnode could have
                         * been reclaimed, which changes its v_vnlock
@@ -314,6 +337,10 @@ null_bypass(struct vop_generic_args *ap)
                         * upper (reclaimed) vnode.
                         */
                        if (lvp != NULLVP) {
+                               null_copy_inotify(old_vps[i], lvp,
+                                   VIRF_INOTIFY);
+                               null_copy_inotify(old_vps[i], lvp,
+                                   VIRF_INOTIFY_PARENT);
                                if (VOP_ISLOCKED(lvp) == LK_EXCLUSIVE &&
                                    old_vps[i]->v_vnlock != lvp->v_vnlock) {
                                        VOP_UNLOCK(lvp);
diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
index c8b01afeab4f..dcd38c6e6fbe 100644
--- a/sys/kern/kern_resource.c
+++ b/sys/kern/kern_resource.c
@@ -1637,6 +1637,12 @@ uifree(struct uidinfo *uip)
        if (uip->ui_pipecnt != 0)
                printf("freeing uidinfo: uid = %d, pipecnt = %ld\n",
                    uip->ui_uid, uip->ui_pipecnt);
+       if (uip->ui_inotifycnt != 0)
+               printf("freeing uidinfo: uid = %d, inotifycnt = %ld\n",
+                   uip->ui_uid, uip->ui_inotifycnt);
+       if (uip->ui_inotifywatchcnt != 0)
+               printf("freeing uidinfo: uid = %d, inotifywatchcnt = %ld\n",
+                   uip->ui_uid, uip->ui_inotifywatchcnt);
        free(uip, M_UIDINFO);
 }
 
@@ -1742,6 +1748,21 @@ chgpipecnt(struct uidinfo *uip, int diff, rlim_t max)
        return (chglimit(uip, &uip->ui_pipecnt, diff, max, "pipecnt"));
 }
 
+int
+chginotifycnt(struct uidinfo *uip, int diff, rlim_t max)
+{
+
+       return (chglimit(uip, &uip->ui_inotifycnt, diff, max, "inotifycnt"));
+}
+
+int
+chginotifywatchcnt(struct uidinfo *uip, int diff, rlim_t max)
+{
+
+       return (chglimit(uip, &uip->ui_inotifywatchcnt, diff, max,
+           "inotifywatchcnt"));
+}
+
 static int
 sysctl_kern_proc_rlimit_usage(SYSCTL_HANDLER_ARGS)
 {
diff --git a/sys/kern/subr_capability.c b/sys/kern/subr_capability.c
index 7cc6fb593697..5ad5b0af1681 100644
--- a/sys/kern/subr_capability.c
+++ b/sys/kern/subr_capability.c
@@ -74,6 +74,10 @@ const cap_rights_t cap_getsockopt_rights =
     CAP_RIGHTS_INITIALIZER(CAP_GETSOCKOPT);
 const cap_rights_t cap_getsockname_rights =
     CAP_RIGHTS_INITIALIZER(CAP_GETSOCKNAME);
+const cap_rights_t cap_inotify_add_rights =
+    CAP_RIGHTS_INITIALIZER(CAP_INOTIFY_ADD);
+const cap_rights_t cap_inotify_rm_rights =
+    CAP_RIGHTS_INITIALIZER(CAP_INOTIFY_RM);
 const cap_rights_t cap_ioctl_rights = CAP_RIGHTS_INITIALIZER(CAP_IOCTL);
 const cap_rights_t cap_listen_rights = CAP_RIGHTS_INITIALIZER(CAP_LISTEN);
 const cap_rights_t cap_linkat_source_rights =
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index d31ff3b939cc..5d09ba3f37f7 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -42,11 +42,12 @@
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/capsicum.h>
+#include <sys/exterrvar.h>
 #include <sys/filedesc.h>
 #include <sys/filio.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
-#include <sys/exterrvar.h>
+#include <sys/inotify.h>
 #include <sys/lock.h>
 #include <sys/proc.h>
 #include <sys/signalvar.h>
@@ -939,7 +940,6 @@ int
 kern_specialfd(struct thread *td, int type, void *arg)
 {
        struct file *fp;
-       struct specialfd_eventfd *ae;
        int error, fd, fflags;
 
        fflags = 0;
@@ -948,12 +948,22 @@ kern_specialfd(struct thread *td, int type, void *arg)
                return (error);
 
        switch (type) {
-       case SPECIALFD_EVENTFD:
+       case SPECIALFD_EVENTFD: {
+               struct specialfd_eventfd *ae;
+
                ae = arg;
                if ((ae->flags & EFD_CLOEXEC) != 0)
                        fflags |= O_CLOEXEC;
                error = eventfd_create_file(td, fp, ae->initval, ae->flags);
                break;
+       }
+       case SPECIALFD_INOTIFY: {
+               struct specialfd_inotify *si;
+
+               si = arg;
+               error = inotify_create_file(td, fp, si->flags, &fflags);
+               break;
+       }
        default:
                error = EINVAL;
                break;
@@ -970,11 +980,12 @@ kern_specialfd(struct thread *td, int type, void *arg)
 int
 sys___specialfd(struct thread *td, struct __specialfd_args *args)
 {
-       struct specialfd_eventfd ae;
        int error;
 
        switch (args->type) {
-       case SPECIALFD_EVENTFD:
+       case SPECIALFD_EVENTFD: {
+               struct specialfd_eventfd ae;
+
                if (args->len != sizeof(struct specialfd_eventfd)) {
                        error = EINVAL;
                        break;
@@ -989,6 +1000,20 @@ sys___specialfd(struct thread *td, struct __specialfd_args *args)
                }
                error = kern_specialfd(td, args->type, &ae);
                break;
+       }
+       case SPECIALFD_INOTIFY: {
+               struct specialfd_inotify si;
+
+               if (args->len != sizeof(si)) {
+                       error = EINVAL;
+                       break;
+               }
+               error = copyin(args->req, &si, sizeof(si));
+               if (error != 0)
+                       break;
+               error = kern_specialfd(td, args->type, &si);
+               break;
+       }
        default:
                error = EINVAL;
                break;
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index 08b557a7a540..2ab17e036d5c 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -3356,4 +3356,19 @@
                    _In_reads_bytes_(4) void *ptr
                );
        }
+593    AUE_INOTIFY     STD|CAPENABLED {
+               int inotify_add_watch_at(
+                   int fd,
+                   int dfd,
+                   _In_z_ const char *path,
+                   uint32_t mask
+               );
+       }
+594    AUE_INOTIFY     STD|CAPENABLED {
+               int inotify_rm_watch(
+                   int fd,
+                   int wd
+               );
+       }
+
 ; vim: syntax=off
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index 883beaf6d1da..3d455b3874cc 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -41,6 +41,7 @@
 #include <sys/counter.h>
 #include <sys/filedesc.h>
 #include <sys/fnv_hash.h>
+#include <sys/inotify.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
@@ -2628,6 +2629,14 @@ cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
                atomic_thread_fence_rel();
                atomic_store_ptr(&dvp->v_cache_dd, ncp);
        } else if (vp != NULL) {
+               /*
+                * Take the slow path in INOTIFY().  This flag will be lazily
+                * cleared by cache_vop_inotify() once all directories referring
+                * to vp are unwatched.
+                */
+               if (__predict_false((vn_irflag_read(dvp) & VIRF_INOTIFY) != 0))
+                       vn_irflag_set_cond(vp, VIRF_INOTIFY_PARENT);
+
                /*
                 * For this case, the cache entry maps both the
                 * directory name in it and the name ".." for the
@@ -4008,6 +4017,56 @@ out:
        return (error);
 }
 
+void
+cache_vop_inotify(struct vnode *vp, int event, uint32_t cookie)
+{
+       struct mtx *vlp;
+       struct namecache *ncp;
+       int isdir;
+       bool logged, self;
+
+       isdir = vp->v_type == VDIR ? IN_ISDIR : 0;
+       self = (vn_irflag_read(vp) & VIRF_INOTIFY) != 0 &&
+           (vp->v_type != VDIR || (event & ~_IN_DIR_EVENTS) != 0);
+
+       if (self) {
+               int selfevent;
+
+               if (event == _IN_ATTRIB_LINKCOUNT)
+                       selfevent = IN_ATTRIB;
+               else
+                       selfevent = event;
+               inotify_log(vp, NULL, 0, selfevent | isdir, cookie);
+       }
+       if ((event & IN_ALL_EVENTS) == 0)
+               return;
+
+       logged = false;
+       vlp = VP2VNODELOCK(vp);
+       mtx_lock(vlp);
+       TAILQ_FOREACH(ncp, &vp->v_cache_dst, nc_dst) {
+               if ((ncp->nc_flag & NCF_ISDOTDOT) != 0)
+                       continue;
+               if ((vn_irflag_read(ncp->nc_dvp) & VIRF_INOTIFY) != 0) {
+                       /*
+                        * XXX-MJ if the vnode has two links in the same
+                        * dir, we'll log the same event twice.
+                        */
+                       inotify_log(ncp->nc_dvp, ncp->nc_name, ncp->nc_nlen,
+                           event | isdir, cookie);
+                       logged = true;
+               }
+       }
+       if (!logged && (vn_irflag_read(vp) & VIRF_INOTIFY_PARENT) != 0) {
+               /*
+                * We didn't find a watched directory that contains this vnode,
+                * so stop calling VOP_INOTIFY for operations on the vnode.
+                */
+               vn_irflag_unset(vp, VIRF_INOTIFY_PARENT);
+       }
+       mtx_unlock(vlp);
+}
+
 #ifdef DDB
 static void
 db_print_vpath(struct vnode *vp)
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
index be49c0887609..2a01ec1e307e 100644
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@@ -39,6 +39,7 @@
 #include <sys/conf.h>
 #include <sys/event.h>
 #include <sys/filio.h>
+#include <sys/inotify.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
@@ -119,6 +120,8 @@ struct vop_vector default_vnodeops = {
        .vop_getwritemount =    vop_stdgetwritemount,
        .vop_inactive =         VOP_NULL,
        .vop_need_inactive =    vop_stdneed_inactive,
+       .vop_inotify =          vop_stdinotify,
+       .vop_inotify_add_watch = vop_stdinotify_add_watch,
        .vop_ioctl =            vop_stdioctl,
        .vop_kqfilter =         vop_stdkqfilter,
        .vop_islocked =         vop_stdislocked,
@@ -1305,6 +1308,20 @@ vop_stdneed_inactive(struct vop_need_inactive_args *ap)
        return (1);
 }
 
+int
+vop_stdinotify(struct vop_inotify_args *ap)
+{
+       vn_inotify(ap->a_vp, ap->a_dvp, ap->a_cnp, ap->a_event, ap->a_cookie);
+       return (0);
+}
+
+int
+vop_stdinotify_add_watch(struct vop_inotify_add_watch_args *ap)
+{
+       return (vn_inotify_add_watch(ap->a_vp, ap->a_sc, ap->a_mask,
+           ap->a_wdp, ap->a_td));
+}
+
 int
 vop_stdioctl(struct vop_ioctl_args *ap)
 {
diff --git a/sys/kern/vfs_inotify.c b/sys/kern/vfs_inotify.c
new file mode 100644
index 000000000000..929ce0426ee8
--- /dev/null
+++ b/sys/kern/vfs_inotify.c
@@ -0,0 +1,1008 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2025 Klara, Inc.
+ */
+
+#include "opt_ktrace.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/caprights.h>
+#include <sys/counter.h>
+#include <sys/dirent.h>
+#define        EXTERR_CATEGORY EXTERR_CAT_INOTIFY
+#include <sys/exterrvar.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filio.h>
+#include <sys/inotify.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/ktrace.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/poll.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/resourcevar.h>
+#include <sys/selinfo.h>
+#include <sys/stat.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/syslimits.h>
+#include <sys/sysproto.h>
+#include <sys/tree.h>
+#include <sys/user.h>
+#include <sys/vnode.h>
+
+uint32_t inotify_rename_cookie;
+
+static SYSCTL_NODE(_vfs, OID_AUTO, inotify, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+    "inotify configuration");
+
+static int inotify_max_queued_events = 16384;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_queued_events, CTLFLAG_RWTUN,
+    &inotify_max_queued_events, 0,
+    "Maximum number of events to queue on an inotify descriptor");
+
+static int inotify_max_user_instances = 256;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_instances, CTLFLAG_RWTUN,
+    &inotify_max_user_instances, 0,
+    "Maximum number of inotify descriptors per user");
+
+static int inotify_max_user_watches;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_user_watches, CTLFLAG_RWTUN,
+    &inotify_max_user_watches, 0,
+    "Maximum number of inotify watches per user");
+
+static int inotify_max_watches;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, max_watches, CTLFLAG_RWTUN,
+    &inotify_max_watches, 0,
+    "Maximum number of inotify watches system-wide");
+
+static int inotify_watches;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, watches, CTLFLAG_RD,
+    &inotify_watches, 0,
+    "Total number of inotify watches currently in use");
+
+static int inotify_coalesce = 1;
+SYSCTL_INT(_vfs_inotify, OID_AUTO, coalesce, CTLFLAG_RWTUN,
+    &inotify_coalesce, 0,
+    "Coalesce inotify events when possible");
+
+static COUNTER_U64_DEFINE_EARLY(inotify_event_drops);
+SYSCTL_COUNTER_U64(_vfs_inotify, OID_AUTO, event_drops, CTLFLAG_RD,
+    &inotify_event_drops,
+    "Number of inotify events dropped due to limits or allocation failures");
+
+static fo_rdwr_t       inotify_read;
+static fo_ioctl_t      inotify_ioctl;
+static fo_poll_t       inotify_poll;
+static fo_kqfilter_t   inotify_kqfilter;
+static fo_stat_t       inotify_stat;
+static fo_close_t      inotify_close;
+static fo_fill_kinfo_t inotify_fill_kinfo;
+
+static const struct fileops inotifyfdops = {
+       .fo_read = inotify_read,
+       .fo_write = invfo_rdwr,
+       .fo_truncate = invfo_truncate,
+       .fo_ioctl = inotify_ioctl,
+       .fo_poll = inotify_poll,
+       .fo_kqfilter = inotify_kqfilter,
+       .fo_stat = inotify_stat,
+       .fo_close = inotify_close,
+       .fo_chmod = invfo_chmod,
+       .fo_chown = invfo_chown,
+       .fo_sendfile = invfo_sendfile,
+       .fo_fill_kinfo = inotify_fill_kinfo,
+       .fo_cmp = file_kcmp_generic,
+       .fo_flags = DFLAG_PASSABLE,
+};
+
+static void    filt_inotifydetach(struct knote *kn);
+static int     filt_inotifyevent(struct knote *kn, long hint);
+
+static const struct filterops inotify_rfiltops = {
+       .f_isfd = 1,
+       .f_detach = filt_inotifydetach,
+       .f_event = filt_inotifyevent,
+};
+
+static MALLOC_DEFINE(M_INOTIFY, "inotify", "inotify data structures");
+
+struct inotify_record {
+       STAILQ_ENTRY(inotify_record) link;
+       struct inotify_event    ev;
+};
+
+static uint64_t inotify_ino = 1;
+
+/*
+ * On LP64 systems this occupies 64 bytes, so we don't get internal
+ * fragmentation by allocating watches with malloc(9).  If the size changes,
+ * consider using a UMA zone to improve memory efficiency.
+ */
+struct inotify_watch {
+       struct inotify_softc *sc; /* back-pointer */
+       int             wd;     /* unique ID */
+       uint32_t        mask;   /* event mask */
+       struct vnode    *vp;    /* vnode being watched, refed */
+       RB_ENTRY(inotify_watch) ilink;          /* inotify linkage */
+       TAILQ_ENTRY(inotify_watch) vlink;       /* vnode linkage */
+};
+
+static void
+inotify_init(void *arg __unused)
+{
+       /* Don't let a user hold too many vnodes. */
+       inotify_max_user_watches = desiredvnodes / 3;
+       /* Don't let the system hold too many vnodes. */
+       inotify_max_watches = desiredvnodes / 2;
+}
+SYSINIT(inotify, SI_SUB_VFS, SI_ORDER_ANY, inotify_init, NULL);
+
+static int
+inotify_watch_cmp(const struct inotify_watch *a,
+    const struct inotify_watch *b)
+{
+       if (a->wd < b->wd)
+               return (-1);
+       else if (a->wd > b->wd)
+               return (1);
+       else
+               return (0);
+}
+RB_HEAD(inotify_watch_tree, inotify_watch);
+RB_GENERATE_STATIC(inotify_watch_tree, inotify_watch, ilink, inotify_watch_cmp);
+
+struct inotify_softc {
+       struct mtx      lock;                   /* serialize all softc writes */
+       STAILQ_HEAD(, inotify_record) pending;  /* events waiting to be read */
+       struct inotify_record overflow;         /* preallocated record */
+       int             nextwatch;              /* next watch ID to try */
+       int             npending;               /* number of pending events */
+       size_t          nbpending;              /* bytes available to read */
+       uint64_t        ino;                    /* unique identifier */
+       struct inotify_watch_tree watches;      /* active watches */
+       struct selinfo  sel;                    /* select/poll/kevent info */
+       struct ucred    *cred;                  /* credential ref */
+};
+
+static struct inotify_record *
+inotify_dequeue(struct inotify_softc *sc)
+{
+       struct inotify_record *rec;
+
+       mtx_assert(&sc->lock, MA_OWNED);
+       KASSERT(!STAILQ_EMPTY(&sc->pending),
+           ("%s: queue for %p is empty", __func__, sc));
+
+       rec = STAILQ_FIRST(&sc->pending);
+       STAILQ_REMOVE_HEAD(&sc->pending, link);
+       sc->npending--;
+       sc->nbpending -= sizeof(rec->ev) + rec->ev.len;
+       return (rec);
+}
+
+static void
+inotify_enqueue(struct inotify_softc *sc, struct inotify_record *rec, bool head)
+{
+       mtx_assert(&sc->lock, MA_OWNED);
+
+       if (head)
+               STAILQ_INSERT_HEAD(&sc->pending, rec, link);
+       else
+               STAILQ_INSERT_TAIL(&sc->pending, rec, link);
+       sc->npending++;
+       sc->nbpending += sizeof(rec->ev) + rec->ev.len;
+}
+
+static int
+inotify_read(struct file *fp, struct uio *uio, struct ucred *cred, int flags,
+    struct thread *td)
+{
+       struct inotify_softc *sc;
+       struct inotify_record *rec;
+       int error;
+       bool first;
+
+       sc = fp->f_data;
+       error = 0;
+
+       mtx_lock(&sc->lock);
+       while (STAILQ_EMPTY(&sc->pending)) {
+               if ((flags & IO_NDELAY) != 0 || (fp->f_flag & FNONBLOCK) != 0) {
+                       mtx_unlock(&sc->lock);
+                       return (EWOULDBLOCK);
+               }
+               error = msleep(&sc->pending, &sc->lock, PCATCH, "inotify", 0);
+               if (error != 0) {
+                       mtx_unlock(&sc->lock);
+                       return (error);
+               }
+       }
+       for (first = true; !STAILQ_EMPTY(&sc->pending); first = false) {
+               size_t len;
+
+               rec = inotify_dequeue(sc);
+               len = sizeof(rec->ev) + rec->ev.len;
+               if (uio->uio_resid < (ssize_t)len) {
+                       inotify_enqueue(sc, rec, true);
+                       if (first) {
+                               error = EXTERROR(EINVAL,
+                                   "read buffer is too small");
+                       }
+                       break;
+               }
+               mtx_unlock(&sc->lock);
+               error = uiomove(&rec->ev, len, uio);
+#ifdef KTRACE
+               if (error == 0 && KTRPOINT(td, KTR_STRUCT))
+                       ktrstruct("inotify", &rec->ev, len);
+#endif
+               mtx_lock(&sc->lock);
+               if (error != 0) {
+                       inotify_enqueue(sc, rec, true);
+                       mtx_unlock(&sc->lock);
+                       return (error);
+               }
+               if (rec == &sc->overflow) {
+                       /*
+                        * Signal to inotify_queue_record() that the overflow
+                        * record can be reused.
+                        */
+                       memset(rec, 0, sizeof(*rec));
+               } else {
+                       free(rec, M_INOTIFY);
+               }
+       }
+       mtx_unlock(&sc->lock);
+       return (error);
+}
+
+static int
+inotify_ioctl(struct file *fp, u_long com, void *data, struct ucred *cred,
+    struct thread *td)
+{
+       struct inotify_softc *sc;
+
+       sc = fp->f_data;
+
+       switch (com) {
+       case FIONREAD:
+               *(int *)data = (int)sc->nbpending;
+               return (0);
+       case FIONBIO:
+       case FIOASYNC:
+               return (0);
+       default:
+               return (ENOTTY);
+       }
+
+       return (0);
+}
+
+static int
+inotify_poll(struct file *fp, int events, struct ucred *cred, struct thread *td)
+{
+       struct inotify_softc *sc;
+       int revents;
+
+       sc = fp->f_data;
+       revents = 0;
+
+       mtx_lock(&sc->lock);
+       if ((events & (POLLIN | POLLRDNORM)) != 0 && sc->npending > 0)
+               revents |= events & (POLLIN | POLLRDNORM);
+       else
+               selrecord(td, &sc->sel);
+       mtx_unlock(&sc->lock);
+       return (revents);
+}
+
+static void
+filt_inotifydetach(struct knote *kn)
+{
+       struct inotify_softc *sc;
+
+       sc = kn->kn_hook;
+       knlist_remove(&sc->sel.si_note, kn, 0);
+}
+
+static int
+filt_inotifyevent(struct knote *kn, long hint)
+{
+       struct inotify_softc *sc;
+
+       sc = kn->kn_hook;
+       mtx_assert(&sc->lock, MA_OWNED);
+       kn->kn_data = sc->nbpending;
+       return (kn->kn_data > 0);
+}
+
+static int
+inotify_kqfilter(struct file *fp, struct knote *kn)
+{
+       struct inotify_softc *sc;
+
+       if (kn->kn_filter != EVFILT_READ)
+               return (EINVAL);
+       sc = fp->f_data;
+       kn->kn_fop = &inotify_rfiltops;
+       kn->kn_hook = sc;
+       knlist_add(&sc->sel.si_note, kn, 0);
+       return (0);
+}
+
+static int
+inotify_stat(struct file *fp, struct stat *sb, struct ucred *cred)
+{
+       struct inotify_softc *sc;
+
+       sc = fp->f_data;
+
+       memset(sb, 0, sizeof(*sb));
+       sb->st_mode = S_IFREG | S_IRUSR;
+       sb->st_blksize = sizeof(struct inotify_event) + _IN_NAMESIZE(NAME_MAX);
+       mtx_lock(&sc->lock);
+       sb->st_size = sc->nbpending;
+       sb->st_blocks = sc->npending;
+       sb->st_uid = sc->cred->cr_ruid;
+       sb->st_gid = sc->cred->cr_rgid;
+       sb->st_ino = sc->ino;
+       mtx_unlock(&sc->lock);
+       return (0);
+}
+
+static void
+inotify_unlink_watch_locked(struct inotify_softc *sc, struct inotify_watch *watch)
+{
+       struct vnode *vp;
+
+       vp = watch->vp;
+       mtx_assert(&vp->v_pollinfo->vpi_lock, MA_OWNED);
+
+       atomic_subtract_int(&inotify_watches, 1);
+       (void)chginotifywatchcnt(sc->cred->cr_ruidinfo, -1, 0);
+
+       TAILQ_REMOVE(&vp->v_pollinfo->vpi_inotify, watch, vlink);
+       if (TAILQ_EMPTY(&vp->v_pollinfo->vpi_inotify))
+               vn_irflag_unset_locked(vp, VIRF_INOTIFY);
+}
+
+/*
+ * Assumes that the watch has already been removed from its softc.
+ */
+static void
+inotify_remove_watch(struct inotify_watch *watch)
+{
+       struct inotify_softc *sc;
+       struct vnode *vp;
+
+       sc = watch->sc;
+
+       vp = watch->vp;
+       mtx_lock(&vp->v_pollinfo->vpi_lock);
+       inotify_unlink_watch_locked(sc, watch);
+       mtx_unlock(&vp->v_pollinfo->vpi_lock);
+
+       vrele(vp);
+       free(watch, M_INOTIFY);
+}
+
+static int
+inotify_close(struct file *fp, struct thread *td)
+{
+       struct inotify_softc *sc;
+       struct inotify_record *rec;
+       struct inotify_watch *watch;
+
+       sc = fp->f_data;
+
+       mtx_lock(&sc->lock);
+       (void)chginotifycnt(sc->cred->cr_ruidinfo, -1, 0);
+       while ((watch = RB_MIN(inotify_watch_tree, &sc->watches)) != NULL) {
+               RB_REMOVE(inotify_watch_tree, &sc->watches, watch);
+               mtx_unlock(&sc->lock);
+               inotify_remove_watch(watch);
+               mtx_lock(&sc->lock);
+       }
+       while (!STAILQ_EMPTY(&sc->pending)) {
+               rec = inotify_dequeue(sc);
+               if (rec != &sc->overflow)
+                       free(rec, M_INOTIFY);
+       }
+       mtx_unlock(&sc->lock);
+       seldrain(&sc->sel);
+       knlist_destroy(&sc->sel.si_note);
+       mtx_destroy(&sc->lock);
+       crfree(sc->cred);
+       free(sc, M_INOTIFY);
+       return (0);
+}
+
+static int
+inotify_fill_kinfo(struct file *fp, struct kinfo_file *kif,
+    struct filedesc *fdp)
+{
+       struct inotify_softc *sc;
+
+       sc = fp->f_data;
+
+       kif->kf_type = KF_TYPE_INOTIFY;
+       kif->kf_un.kf_inotify.kf_inotify_npending = sc->npending;
+       kif->kf_un.kf_inotify.kf_inotify_nbpending = sc->nbpending;
+       return (0);
+}
+
+int
+inotify_create_file(struct thread *td, struct file *fp, int flags, int *fflagsp)
+{
+       struct inotify_softc *sc;
+       int fflags;
+
+       if ((flags & ~(IN_NONBLOCK | IN_CLOEXEC)) != 0)
+               return (EINVAL);
+
+       if (!chginotifycnt(td->td_ucred->cr_ruidinfo, 1,
+           inotify_max_user_instances))
+               return (EMFILE);
+
+       sc = malloc(sizeof(*sc), M_INOTIFY, M_WAITOK | M_ZERO);
+       sc->nextwatch = 1; /* Required for compatibility. */
+       STAILQ_INIT(&sc->pending);
+       RB_INIT(&sc->watches);
+       mtx_init(&sc->lock, "inotify", NULL, MTX_DEF);
+       knlist_init_mtx(&sc->sel.si_note, &sc->lock);
+       sc->cred = crhold(td->td_ucred);
+       sc->ino = atomic_fetchadd_64(&inotify_ino, 1);
+
+       fflags = FREAD;
+       if ((flags & IN_NONBLOCK) != 0)
+               fflags |= FNONBLOCK;
+       if ((flags & IN_CLOEXEC) != 0)
+               *fflagsp |= O_CLOEXEC;
+       finit(fp, fflags, DTYPE_INOTIFY, sc, &inotifyfdops);
+
+       return (0);
+}
+
+static struct inotify_record *
+inotify_alloc_record(uint32_t wd, const char *name, size_t namelen, int event,
+    uint32_t cookie, int waitok)
+{
+       struct inotify_event *evp;
+       struct inotify_record *rec;
+
+       rec = malloc(sizeof(*rec) + _IN_NAMESIZE(namelen), M_INOTIFY,
+           waitok | M_ZERO);
+       if (rec == NULL)
+               return (NULL);
+       evp = &rec->ev;
+       evp->wd = wd;
+       evp->mask = event;
+       evp->cookie = cookie;
+       evp->len = _IN_NAMESIZE(namelen);
*** 940 LINES SKIPPED ***
