It is common for services to be stateless around their main event loop.
If a process passes the EPOLL_KILLME flag to epoll_wait5() then it
signals to the kernel that epoll_wait5() may not complete, and the kernel
may send SIGKILL if resources get tight.

See my systemd patch: https://github.com/shawnl/systemd/tree/killme

Android uses this memory model for all programs, and having it in the
kernel will enable integration with the page cache (not in this
series).
---
 arch/x86/entry/syscalls/syscall_32.tbl |  1 +
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 fs/eventpoll.c                         | 74 +++++++++++++++++++++++++++++++++-
 include/linux/eventpoll.h              |  2 +
 include/linux/sched.h                  |  3 ++
 include/uapi/asm-generic/unistd.h      |  5 ++-
 include/uapi/linux/eventpoll.h         |  3 ++
 kernel/exit.c                          |  2 +
 mm/oom_kill.c                          | 17 ++++++++
 9 files changed, 105 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl 
b/arch/x86/entry/syscalls/syscall_32.tbl
index 448ac2161112..040e5d02bdcc 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -391,3 +391,4 @@
 382    i386    pkey_free               sys_pkey_free
 383    i386    statx                   sys_statx
 384    i386    arch_prctl              sys_arch_prctl                  
compat_sys_arch_prctl
+385    i386    epoll_wait5             sys_epoll_wait5
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl 
b/arch/x86/entry/syscalls/syscall_64.tbl
index 5aef183e2f85..c72802e8cf65 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -339,6 +339,7 @@
 330    common  pkey_alloc              sys_pkey_alloc
 331    common  pkey_free               sys_pkey_free
 332    common  statx                   sys_statx
+333    common  epoll_wait5             sys_epoll_wait5
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2fabd19cdeea..76d1c91d940b 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -297,6 +297,14 @@ static LIST_HEAD(visited_list);
  */
 static LIST_HEAD(tfile_check_list);
 
+static LIST_HEAD(deathrow_q);
+static long deathrow_len __read_mostly;
+
+/* TODO: Can this lock be removed by using atomic instructions to update
+ * queue?
+ */
+static DEFINE_MUTEX(deathrow_mutex);
+
 #ifdef CONFIG_SYSCTL
 
 #include <linux/sysctl.h>
@@ -314,6 +322,15 @@ struct ctl_table epoll_table[] = {
                .extra1         = &zero,
                .extra2         = &long_max,
        },
+       {
+               .procname       = "deathrow_size",
+               .data           = &deathrow_len,
+               .maxlen         = sizeof(deathrow_len),
+               .mode           = 0444,
+               .proc_handler   = proc_doulongvec_minmax,
+               .extra1         = &zero,
+               .extra2         = &long_max,
+       },
        { }
 };
 #endif /* CONFIG_SYSCTL */
@@ -2164,9 +2181,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 /*
  * Implement the event wait interface for the eventpoll file. It is the kernel
  * part of the user space epoll_wait(2).
+ *
+ * A flags argument cannot be added to epoll_pwait cause it already has
+ * the maximum number of arguments (6). Can this be fixed?
  */
-SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
-               int, maxevents, int, timeout)
+SYSCALL_DEFINE5(epoll_wait5, int, epfd, struct epoll_event __user *, events,
+               int, maxevents, int, timeout, int, flags)
 {
        int error;
        struct fd f;
@@ -2199,14 +2219,44 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct 
epoll_event __user *, events,
         */
        ep = f.file->private_data;
 
+       /* Check the EPOLL_* constants for conflicts.  */
+       BUILD_BUG_ON(EPOLL_KILLME == EPOLL_CLOEXEC);
+
+       if (flags & ~EPOLL_KILLME)
+               return -EINVAL;
+
+       if (flags & EPOLL_KILLME) {
+               /* Put process on death row. */
+               mutex_lock(&deathrow_mutex);
+               deathrow_len++;
+               list_add(&current->se.deathrow, &deathrow_q);
+               current->se.on_deathrow = 1;
+               mutex_unlock(&deathrow_mutex);
+       }
+
        /* Time to fish for events ... */
        error = ep_poll(ep, events, maxevents, timeout);
 
+       if (flags & EPOLL_KILLME) {
+               /* Remove process from death row. */
+               mutex_lock(&deathrow_mutex);
+               current->se.on_deathrow = 0;
+               list_del(&current->se.deathrow);
+               deathrow_len--;
+               mutex_unlock(&deathrow_mutex);
+       }
+
 error_fput:
        fdput(f);
        return error;
 }
 
+SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
+               int, maxevents, int, timeout)
+{
+       return sys_epoll_wait5(epfd, events, maxevents, timeout, 0);
+}
+
 /*
  * Implement the event wait interface for the eventpoll file. It is the kernel
  * part of the user space epoll_pwait(2).
@@ -2297,6 +2347,26 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
 }
 #endif
 
+/* Clean up after a EPOLL_KILLME process quits.
+ * Called by kernel/exit.c.
+ */
+int exit_killme(void)
+{
+       if (current->se.on_deathrow) {
+               mutex_lock(&deathrow_mutex);
+               current->se.on_deathrow = 0;
+               list_del(&current->se.deathrow);
+               mutex_unlock(&deathrow_mutex);
+       }
+
+       return 0;
+}
+
+struct list_head *eventpoll_deathrow_list(void)
+{
+       return &deathrow_q;
+}
+
 static int __init eventpoll_init(void)
 {
        struct sysinfo si;
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h
index 2f14ac73d01d..f1e28d468de5 100644
--- a/include/linux/eventpoll.h
+++ b/include/linux/eventpoll.h
@@ -20,6 +20,8 @@
 /* Forward declarations to avoid compiler errors */
 struct file;
 
+int exit_killme(void);
+struct list_head *eventpoll_deathrow_list(void);
 
 #ifdef CONFIG_EPOLL
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26a7df4e558c..66462bf27a29 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -380,6 +380,9 @@ struct sched_entity {
        struct list_head                group_node;
        unsigned int                    on_rq;
 
+       unsigned                        on_deathrow:1;
+       struct list_head                deathrow;
+
        u64                             exec_start;
        u64                             sum_exec_runtime;
        u64                             vruntime;
diff --git a/include/uapi/asm-generic/unistd.h 
b/include/uapi/asm-generic/unistd.h
index 061185a5eb51..843553a39388 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -893,8 +893,11 @@ __SYSCALL(__NR_fork, sys_fork)
 __SYSCALL(__NR_fork, sys_ni_syscall)
 #endif /* CONFIG_MMU */
 
+#define __NR_epoll_wait5 1080
+__SYSCALL(__NR_epoll_wait5, sys_epoll_wait5)
+
 #undef __NR_syscalls
-#define __NR_syscalls (__NR_fork+1)
+#define __NR_syscalls (__NR_fork+2)
 
 #endif /* __ARCH_WANT_SYSCALL_DEPRECATED */
 
diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h
index f4d5c998cc2b..ce150a3e7248 100644
--- a/include/uapi/linux/eventpoll.h
+++ b/include/uapi/linux/eventpoll.h
@@ -21,6 +21,9 @@
 /* Flags for epoll_create1.  */
 #define EPOLL_CLOEXEC O_CLOEXEC
 
+/* Flags for epoll_wait5.  */
+#define EPOLL_KILLME 0x00000001
+
 /* Valid opcodes to issue to sys_epoll_ctl() */
 #define EPOLL_CTL_ADD 1
 #define EPOLL_CTL_DEL 2
diff --git a/kernel/exit.c b/kernel/exit.c
index f6cad39f35df..cd089bdc5b17 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -62,6 +62,7 @@
 #include <linux/random.h>
 #include <linux/rcuwait.h>
 #include <linux/compat.h>
+#include <linux/eventpoll.h>
 
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
@@ -917,6 +918,7 @@ void __noreturn do_exit(long code)
                __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
        exit_rcu();
        exit_tasks_rcu_finish();
+       exit_killme();
 
        lockdep_free_task(tsk);
        do_task_dead();
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dee0f75c3013..d6252772d593 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -41,6 +41,7 @@
 #include <linux/kthread.h>
 #include <linux/init.h>
 #include <linux/mmu_notifier.h>
+#include <linux/eventpoll.h>
 
 #include <asm/tlb.h>
 #include "internal.h"
@@ -1029,6 +1030,22 @@ bool out_of_memory(struct oom_control *oc)
                return true;
        }
 
+       /*
+        * Check death row.
+        */
+       if (!list_empty(eventpoll_deathrow_list())) {
+               struct list_head *l = eventpoll_deathrow_list();
+               struct task_struct *ts = list_first_entry(l,
+                                        struct task_struct, se.deathrow);
+
+               pr_debug("Killing pid %u from EPOLL_KILLME death row.",
+                       ts->pid);
+
+               /* We use SIGKILL so as to cleanly interrupt ep_poll() */
+               kill_pid(task_pid(ts), SIGKILL, 1);
+               return true;
+       }
+
        /*
         * The OOM killer does not compensate for IO-less reclaim.
         * pagefault_out_of_memory lost its gfp context so we have to
-- 
2.15.0.rc2

Reply via email to