On Wed, Feb 18, 2026 at 10:01 AM Jan Kara <[email protected]> wrote:
>
> On Tue 17-02-26 19:22:31, T.J. Mercier wrote:
> > Currently some kernfs files (e.g. cgroup.events, memory.events) support
> > inotify watches for IN_MODIFY, but unlike with regular filesystems, they
> > do not receive IN_DELETE_SELF or IN_IGNORED events when they are
> > removed.
>
> Please see my email:
> https://lore.kernel.org/all/lc2jgt3yrvuvtdj2kk7q3rloie2c5mzyhfdy4zvxylx732voet@ol3kl4ackrpb
>
> I think this is actually a bug in kernfs...
>
>                                                                 Honza

Thanks, I'm looking at this now. I've tried calling clear_nlink in
kernfs_iop_rmdir, but I've found that when we get back to vfs_rmdir
and shrink_dcache_parent is called, d_walk doesn't find any entries,
so shrink_kill->__dentry_kill is not called. I'm investigating why
that is...

> >
> > This creates a problem for processes monitoring cgroups. For example, a
> > service monitoring memory.events for memory.high breaches needs to know
> > when a cgroup is removed to clean up its state. Where it's known that a
> > cgroup is removed when all processes die, without IN_DELETE_SELF the
> > service must resort to inefficient workarounds such as:
> > 1.  Periodically scanning procfs to detect process death (wastes CPU and
> >     is susceptible to PID reuse).
> > 2.  Placing an additional IN_DELETE watch on the parent directory
> >     (wastes resources managing double the watches).
> > 3.  Holding a pidfd for every monitored cgroup (can exhaust file
> >     descriptors).
> >
> > This patch enables kernfs to send IN_DELETE_SELF and IN_IGNORED events.
> > This allows applications to rely on a single existing watch on the file
> > of interest (e.g. memory.events) to receive notifications for both
> > modifications and the eventual removal of the file, as well as automatic
> > watch descriptor cleanup, simplifying userspace logic and improving
> > resource efficiency.
> >
> > Implementation details:
> > The kernfs notification worker is updated to handle file deletion.
> > The optimized single call for MODIFY events to both the parent and the
> > file is retained, however because CREATE (parent) events remain
> > unsupported for kernfs files, support for DELETE (parent) events is not
> > added here to retain symmetry. Only support for DELETE_SELF events is
> > added.
> >
> > Signed-off-by: T.J. Mercier <[email protected]>
> > Acked-by: Tejun Heo <[email protected]>
> > ---
> >  fs/kernfs/dir.c             | 21 +++++++++++++++++
> >  fs/kernfs/file.c            | 45 ++++++++++++++++++++-----------------
> >  fs/kernfs/kernfs-internal.h |  3 +++
> >  3 files changed, 48 insertions(+), 21 deletions(-)
> >
> > diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
> > index 29baeeb97871..e5bda829fcb8 100644
> > --- a/fs/kernfs/dir.c
> > +++ b/fs/kernfs/dir.c
> > @@ -9,6 +9,7 @@
> >
> >  #include <linux/sched.h>
> >  #include <linux/fs.h>
> > +#include <linux/fsnotify_backend.h>
> >  #include <linux/namei.h>
> >  #include <linux/idr.h>
> >  #include <linux/slab.h>
> > @@ -1471,6 +1472,23 @@ void kernfs_show(struct kernfs_node *kn, bool show)
> >       up_write(&root->kernfs_rwsem);
> >  }
> >
> > +static void kernfs_notify_file_deleted(struct kernfs_node *kn)
> > +{
> > +     static DECLARE_WORK(kernfs_notify_deleted_work,
> > +                         kernfs_notify_workfn);
> > +
> > +     guard(spinlock_irqsave)(&kernfs_notify_lock);
> > +     /* may overwrite already pending FS_MODIFY events */
> > +     kn->attr.notify_event = FS_DELETE;
> > +
> > +     if (!kn->attr.notify_next) {
> > +             kernfs_get(kn);
> > +             kn->attr.notify_next = kernfs_notify_list;
> > +             kernfs_notify_list = kn;
> > +             schedule_work(&kernfs_notify_deleted_work);
> > +     }
> > +}
> > +
> >  static void __kernfs_remove(struct kernfs_node *kn)
> >  {
> >       struct kernfs_node *pos, *parent;
> > @@ -1520,6 +1538,9 @@ static void __kernfs_remove(struct kernfs_node *kn)
> >                       struct kernfs_iattrs *ps_iattr =
> >                               parent ? parent->iattr : NULL;
> >
> > +                     if (kernfs_type(pos) == KERNFS_FILE)
> > +                             kernfs_notify_file_deleted(pos);
> > +
> >                       /* update timestamps on the parent */
> >                       down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
> >
> > diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
> > index e978284ff983..4be9bbe29378 100644
> > --- a/fs/kernfs/file.c
> > +++ b/fs/kernfs/file.c
> > @@ -37,8 +37,8 @@ struct kernfs_open_node {
> >   */
> >  #define KERNFS_NOTIFY_EOL                    ((void *)&kernfs_notify_list)
> >
> > -static DEFINE_SPINLOCK(kernfs_notify_lock);
> > -static struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
> > +DEFINE_SPINLOCK(kernfs_notify_lock);
> > +struct kernfs_node *kernfs_notify_list = KERNFS_NOTIFY_EOL;
> >
> >  static inline struct mutex *kernfs_open_file_mutex_ptr(struct kernfs_node *kn)
> >  {
> > @@ -909,7 +909,7 @@ static loff_t kernfs_fop_llseek(struct file *file, loff_t offset, int whence)
> >       return ret;
> >  }
> >
> > -static void kernfs_notify_workfn(struct work_struct *work)
> > +void kernfs_notify_workfn(struct work_struct *work)
> >  {
> >       struct kernfs_node *kn;
> >       struct kernfs_super_info *info;
> > @@ -935,11 +935,7 @@ static void kernfs_notify_workfn(struct work_struct *work)
> >       down_read(&root->kernfs_supers_rwsem);
> >       down_read(&root->kernfs_rwsem);
> >       list_for_each_entry(info, &kernfs_root(kn)->supers, node) {
> > -             struct kernfs_node *parent;
> > -             struct inode *p_inode = NULL;
> > -             const char *kn_name;
> >               struct inode *inode;
> > -             struct qstr name;
> >
> >               /*
> >                * We want fsnotify_modify() on @kn but as the
> > @@ -951,24 +947,31 @@ static void kernfs_notify_workfn(struct work_struct *work)
> >               if (!inode)
> >                       continue;
> >
> > -             kn_name = kernfs_rcu_name(kn);
> > -             name = QSTR(kn_name);
> > -             parent = kernfs_get_parent(kn);
> > -             if (parent) {
> > -                     p_inode = ilookup(info->sb, kernfs_ino(parent));
> > -                     if (p_inode) {
> > -                             fsnotify(notify_event | FS_EVENT_ON_CHILD,
> > -                                      inode, FSNOTIFY_EVENT_INODE,
> > -                                      p_inode, &name, inode, 0);
> > -                             iput(p_inode);
> > +             if (notify_event == FS_DELETE) {
> > +                     fsnotify_inoderemove(inode);
> > +             } else {
> > +                     struct kernfs_node *parent = kernfs_get_parent(kn);
> > +                     struct inode *p_inode = NULL;
> > +
> > +                     if (parent) {
> > +                             p_inode = ilookup(info->sb, kernfs_ino(parent));
> > +                             if (p_inode) {
> > +                                     const char *kn_name = kernfs_rcu_name(kn);
> > +                                     struct qstr name = QSTR(kn_name);
> > +
> > +                                     fsnotify(notify_event | FS_EVENT_ON_CHILD,
> > +                                              inode, FSNOTIFY_EVENT_INODE,
> > +                                              p_inode, &name, inode, 0);
> > +                                     iput(p_inode);
> > +                             }
> > +
> > +                             kernfs_put(parent);
> >                       }
> >
> > -                     kernfs_put(parent);
> > +                     if (!p_inode)
> > +                             fsnotify_inode(inode, notify_event);
> >               }
> >
> > -             if (!p_inode)
> > -                     fsnotify_inode(inode, notify_event);
> > -
> >               iput(inode);
> >       }
> >
> > diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
> > index 6061b6f70d2a..cf4b21f4f3b6 100644
> > --- a/fs/kernfs/kernfs-internal.h
> > +++ b/fs/kernfs/kernfs-internal.h
> > @@ -199,6 +199,8 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
> >   * file.c
> >   */
> >  extern const struct file_operations kernfs_file_fops;
> > +extern struct kernfs_node *kernfs_notify_list;
> > +extern void kernfs_notify_workfn(struct work_struct *work);
> >
> >  bool kernfs_should_drain_open_files(struct kernfs_node *kn);
> >  void kernfs_drain_open_files(struct kernfs_node *kn);
> > @@ -212,4 +214,5 @@ extern const struct inode_operations kernfs_symlink_iops;
> >   * kernfs locks
> >   */
> >  extern struct kernfs_global_locks *kernfs_locks;
> > +extern spinlock_t kernfs_notify_lock;
> >  #endif       /* __KERNFS_INTERNAL_H */
> > --
> > 2.53.0.310.g728cabbaf7-goog
> >
> --
> Jan Kara <[email protected]>
> SUSE Labs, CR

Reply via email to