On Mon, Nov 02, 2020 at 09:37:04PM +0100, Jann Horn wrote:
> At the moment, the seccomp notifier API is hard to use without combining
> it with APIs like poll() or epoll(); if all target processes have gone
> away, the polling APIs will raise an error indication on the file
> descriptor, but SECCOMP_IOCTL_NOTIF_RECV will keep blocking indefinitely.
> 
> This usability problem was discovered when Michael Kerrisk wrote a
> manpage for this API.
> 
> To fix it, get rid of the semaphore logic and let SECCOMP_IOCTL_NOTIF_RECV
> behave as follows:
> 
> If O_NONBLOCK is set, SECCOMP_IOCTL_NOTIF_RECV always returns
> immediately, no matter whether a notification is available or not.
> 
> If O_NONBLOCK is unset, SECCOMP_IOCTL_NOTIF_RECV blocks until either a
> notification is delivered to userspace or all users of the filter have
> gone away.
> 
> To avoid subtle breakage from eventloop-style code that doesn't set
> O_NONBLOCK, set O_NONBLOCK by default - userspace can clear it if it
> wants blocking behavior, and if blocking-style code forgets to do so,
> that will be much more obvious than the breakage we'd get the other way
> around.
> This also means that UAPI breakage from this change should be limited to
> blocking users of the API, of which, to my knowledge, there are none so far
> (outside of in-tree sample and selftest code, which this patch adjusts - in
> particular the code in samples/ has to change a bunch).
> 
> This should be backported because future userspace code might otherwise not
> work properly on old kernels.
> 
> Cc: sta...@vger.kernel.org
> Reported-by: Michael Kerrisk <mtk.manpa...@gmail.com>
> Signed-off-by: Jann Horn <ja...@google.com>
> ---
>  kernel/seccomp.c                              | 62 +++++++++++++------
>  samples/seccomp/user-trap.c                   | 16 +++++
>  tools/testing/selftests/seccomp/seccomp_bpf.c | 21 +++++++
>  3 files changed, 79 insertions(+), 20 deletions(-)
> 
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 8ad7a293255a..b3730740515f 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -43,6 +43,7 @@
>  #include <linux/uaccess.h>
>  #include <linux/anon_inodes.h>
>  #include <linux/lockdep.h>
> +#include <linux/freezer.h>
>  
>  /*
>   * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the
> @@ -138,7 +139,6 @@ struct seccomp_kaddfd {
>   * @notifications: A list of struct seccomp_knotif elements.
>   */
>  struct notification {
> -     struct semaphore request;
>       u64 next_id;
>       struct list_head notifications;
>  };
> @@ -863,7 +863,6 @@ static int seccomp_do_user_notification(int this_syscall,
>       list_add(&n.list, &match->notif->notifications);
>       INIT_LIST_HEAD(&n.addfd);
>  
> -     up(&match->notif->request);
>       wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
>       mutex_unlock(&match->notify_lock);
>  
> @@ -1179,9 +1178,10 @@ find_notification(struct seccomp_filter *filter, u64 
> id)
>  
>  
>  static long seccomp_notify_recv(struct seccomp_filter *filter,
> -                             void __user *buf)
> +                             void __user *buf, bool blocking)
>  {
>       struct seccomp_knotif *knotif = NULL, *cur;
> +     DEFINE_WAIT(wait);
>       struct seccomp_notif unotif;
>       ssize_t ret;
>  
> @@ -1194,11 +1194,9 @@ static long seccomp_notify_recv(struct seccomp_filter 
> *filter,
>  
>       memset(&unotif, 0, sizeof(unotif));
>  
> -     ret = down_interruptible(&filter->notif->request);
> -     if (ret < 0)
> -             return ret;
> -
>       mutex_lock(&filter->notify_lock);
> +
> +retry:
>       list_for_each_entry(cur, &filter->notif->notifications, list) {
>               if (cur->state == SECCOMP_NOTIFY_INIT) {
>                       knotif = cur;
> @@ -1206,14 +1204,40 @@ static long seccomp_notify_recv(struct seccomp_filter 
> *filter,
>               }
>       }
>  
> -     /*
> -      * If we didn't find a notification, it could be that the task was
> -      * interrupted by a fatal signal between the time we were woken and
> -      * when we were able to acquire the rw lock.
> -      */
>       if (!knotif) {
> -             ret = -ENOENT;
> -             goto out;
> +             if (!blocking) {
> +                     if (refcount_read(&filter->users) == 0)
> +                             ret = -ENOTCONN;
> +                     else
> +                             ret = -ENOENT;
> +                     goto out;
> +             }
> +
> +             /* This has to happen before checking &filter->users. */
> +             prepare_to_wait(&filter->wqh, &wait, TASK_INTERRUPTIBLE);

Isn't there a subtle race condition here, that if we are in blocking mode, and 
we get a notification here, we wont be woken up? since we enter the wait queue
after checking to see if there are pending notifications, and in that period
the wake_up_poll could have been called?

I very well might be missing something interesting about the semantics of
freezable_schedule.

shouldn't it read something like:

retry:
        mutex_lock(...);
        ret = prepare_to_wait_event(&filter->wqh, &wait, TASK_INTERRUPTIBLE);
        if (ret)
                goto out;
        list_for_each_entry(cur, &filter->notif->notifications, list) {
                if (cur->state == SECCOMP_NOTIFY_INIT) {
                        knotif = cur;
                        break;
                }
        }

        /* Maybe this should be before we iterate over the list of outstanding 
notifcations? */
        if (refcount_read(&filter->users) == 0) {
                ret = -ENOTCONN;
                goto out;
        }

        if (!knotif) {
                if (!blocking) {
                        ret = -ENOENT;
                        goto out;
                }
                mutex_unlock(...);
                freezable_schedule();
                goto retry;
        }


out:
        finish_wait(&filter->wqh, &wait);
        mutex_unlock(...);
> +
> +             /*
> +              * If all users of the filter are gone, throw an error
> +              * instead of pointlessly continuing to block.
> +              */
> +             if (refcount_read(&filter->users) == 0) {
> +                     finish_wait(&filter->wqh, &wait);
> +                     ret = -ENOTCONN;
> +                     goto out;
> +             }
> +
> +             /* No notifications - wait for one... */
> +             mutex_unlock(&filter->notify_lock);
> +             freezable_schedule();
> +             finish_wait(&filter->wqh, &wait);
> +
> +             /* ... and retry */
> +             mutex_lock(&filter->notify_lock);
> +             if (signal_pending(current)) {
> +                     ret = -EINTR;
> +                     goto out;
> +             }
> +             goto retry;
>       }
>  
>       unotif.id = knotif->id;
> @@ -1237,10 +1261,8 @@ static long seccomp_notify_recv(struct seccomp_filter 
> *filter,
>                */
>               mutex_lock(&filter->notify_lock);
>               knotif = find_notification(filter, unotif.id);
> -             if (knotif) {
> +             if (knotif)
>                       knotif->state = SECCOMP_NOTIFY_INIT;
> -                     up(&filter->notif->request);
> -             }
>               mutex_unlock(&filter->notify_lock);
>       }
>  
> @@ -1416,11 +1438,12 @@ static long seccomp_notify_ioctl(struct file *file, 
> unsigned int cmd,
>  {
>       struct seccomp_filter *filter = file->private_data;
>       void __user *buf = (void __user *)arg;
> +     bool blocking = !(file->f_flags & O_NONBLOCK);
>  
>       /* Fixed-size ioctls */
>       switch (cmd) {
>       case SECCOMP_IOCTL_NOTIF_RECV:
> -             return seccomp_notify_recv(filter, buf);
> +             return seccomp_notify_recv(filter, buf, blocking);
>       case SECCOMP_IOCTL_NOTIF_SEND:
>               return seccomp_notify_send(filter, buf);
>       case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
> @@ -1483,12 +1506,11 @@ static struct file *init_listener(struct 
> seccomp_filter *filter)
>       if (!filter->notif)
>               goto out;
>  
> -     sema_init(&filter->notif->request, 0);
>       filter->notif->next_id = get_random_u64();
>       INIT_LIST_HEAD(&filter->notif->notifications);
>  
>       ret = anon_inode_getfile("seccomp notify", &seccomp_notify_ops,
> -                              filter, O_RDWR);
> +                              filter, O_RDWR|O_NONBLOCK);
>       if (IS_ERR(ret))
>               goto out_notif;
>  
> diff --git a/samples/seccomp/user-trap.c b/samples/seccomp/user-trap.c
> index 20291ec6489f..b9e666f15998 100644
> --- a/samples/seccomp/user-trap.c
> +++ b/samples/seccomp/user-trap.c
> @@ -198,6 +198,17 @@ static int handle_req(struct seccomp_notif *req,
>       return ret;
>  }
>  
> +static void set_blocking(int fd)
> +{
> +     int file_status_flags = fcntl(fd, F_GETFL);
> +
> +     file_status_flags &= ~O_NONBLOCK;
> +     if (fcntl(fd, F_SETFL, file_status_flags)) {
> +             perror("F_SETFL");
> +             exit(1);
> +     }
> +}
> +
>  int main(void)
>  {
>       int sk_pair[2], ret = 1, status, listener;
> @@ -274,6 +285,8 @@ int main(void)
>       if (listener < 0)
>               goto out_kill;
>  
> +     set_blocking(listener);
> +
>       /*
>        * Fork a task to handle the requests. This isn't strictly necessary,
>        * but it makes the particular writing of this sample easier, since we
> @@ -307,6 +320,9 @@ int main(void)
>               while (1) {
>                       memset(req, 0, sizes.seccomp_notif);
>                       if (ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, req)) {
> +                             if (errno == ENOTCONN)
> +                                     exit(0);
> +
>                               perror("ioctl recv");
>                               goto out_resp;
>                       }
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c 
> b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 4a180439ee9e..5318f9cb1aec 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -295,6 +295,13 @@ static int __filecmp(pid_t pid1, pid_t pid2, int fd1, 
> int fd2)
>  #endif
>  }
>  
> +#define set_blocking(fd) ({                                  \
> +     int file_status_flags;                                  \
> +     file_status_flags = fcntl(fd, F_GETFL);                 \
> +     file_status_flags &= ~O_NONBLOCK;                       \
> +     ASSERT_EQ(fcntl(fd, F_SETFL, file_status_flags), 0);    \
> +})
> +
>  /* Have TH_LOG report actual location filecmp() is used. */
>  #define filecmp(pid1, pid2, fd1, fd2)        ({              \
>       int _ret;                                       \
> @@ -3422,6 +3429,8 @@ TEST(user_notification_kill_in_middle)
>                                     SECCOMP_FILTER_FLAG_NEW_LISTENER);
>       ASSERT_GE(listener, 0);
>  
> +     set_blocking(listener);
> +
>       /*
>        * Check that nothing bad happens when we kill the task in the middle
>        * of a syscall.
> @@ -3476,6 +3485,8 @@ TEST(user_notification_signal)
>                                     SECCOMP_FILTER_FLAG_NEW_LISTENER);
>       ASSERT_GE(listener, 0);
>  
> +     set_blocking(listener);
> +
>       pid = fork();
>       ASSERT_GE(pid, 0);
>  
> @@ -3583,6 +3594,8 @@ TEST(user_notification_child_pid_ns)
>                                     SECCOMP_FILTER_FLAG_NEW_LISTENER);
>       ASSERT_GE(listener, 0);
>  
> +     set_blocking(listener);
> +
>       pid = fork();
>       ASSERT_GE(pid, 0);
>  
> @@ -3623,6 +3636,8 @@ TEST(user_notification_sibling_pid_ns)
>                                     SECCOMP_FILTER_FLAG_NEW_LISTENER);
>       ASSERT_GE(listener, 0);
>  
> +     set_blocking(listener);
> +
>       pid = fork();
>       ASSERT_GE(pid, 0);
>  
> @@ -3691,6 +3706,8 @@ TEST(user_notification_fault_recv)
>                                     SECCOMP_FILTER_FLAG_NEW_LISTENER);
>       ASSERT_GE(listener, 0);
>  
> +     set_blocking(listener);
> +
>       pid = fork();
>       ASSERT_GE(pid, 0);
>  
> @@ -3972,6 +3989,8 @@ TEST(user_notification_addfd)
>                                     SECCOMP_FILTER_FLAG_NEW_LISTENER);
>       ASSERT_GE(listener, 0);
>  
> +     set_blocking(listener);
> +
>       pid = fork();
>       ASSERT_GE(pid, 0);
>  
> @@ -4099,6 +4118,8 @@ TEST(user_notification_addfd_rlimit)
>                                     SECCOMP_FILTER_FLAG_NEW_LISTENER);
>       ASSERT_GE(listener, 0);
>  
> +     set_blocking(listener);
> +
>       pid = fork();
>       ASSERT_GE(pid, 0);
>  
> 
> base-commit: 4525c8781ec0701ce824e8bd379ae1b129e26568
> -- 
> 2.29.1.341.ge80a0c044ae-goog
> 

Reply via email to