On Sun, Feb 02, 2025 at 08:29:20AM -0800, Eyal Birger wrote:
> When attaching uretprobes to processes running inside docker, the attached
> process segfaults when encountering the retprobe.
> 
> The reason is that now that uretprobe is a system call the default seccomp
> filters in docker block it as they only allow a specific set of known
> syscalls. This is true for other userspace applications which use seccomp
> to control their syscall surface.
> 
> Since uretprobe is a "kernel implementation detail" system call which is
> not used by userspace application code directly, it is impractical and
> there's very little point in forcing all userspace applications to
> explicitly allow it in order to avoid crashing tracked processes.
> 
> Pass this system call through seccomp without depending on configuration.
> 
> Note: uretprobe isn't supported in i386 and __NR_ia32_rt_tgsigqueueinfo
> uses the same number as __NR_uretprobe so the syscall isn't forced in the
> compat bitmap.
> 
> Fixes: ff474a78cef5 ("uprobe: Add uretprobe syscall to speed up return probe")
> Reported-by: Rafael Buchbinder <r...@rbk.io>
> Link: https://lore.kernel.org/lkml/cahsh6gs3eh8dfu0wq58c_lf8a4_+o6z456j7bidmcvy2aqo...@mail.gmail.com/
> Link: https://lore.kernel.org/lkml/20250121182939.33d05...@gandalf.local.home/T/#me2676c378eff2d6a33f3054fed4a5f3afa64e65b
> Link: https://lore.kernel.org/lkml/20250128145806.1849977-1-eyal.bir...@gmail.com/
> Cc: sta...@vger.kernel.org
> Signed-off-by: Eyal Birger <eyal.bir...@gmail.com>
> ---
> v3: no change - deferring 32bit compat handling as there aren't plans to
>     support this syscall in compat mode.
> v2: use action_cache bitmap and mode1 array to check the syscall
> ---
>  kernel/seccomp.c | 24 +++++++++++++++++++++---
>  1 file changed, 21 insertions(+), 3 deletions(-)
> 
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index f59381c4a2ff..09b6f8e6db51 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -734,13 +734,13 @@ seccomp_prepare_user_filter(const char __user *user_filter)
>  
>  #ifdef SECCOMP_ARCH_NATIVE
>  /**
> - * seccomp_is_const_allow - check if filter is constant allow with given data
> + * seccomp_is_filter_const_allow - check if filter is constant allow with given data
>   * @fprog: The BPF programs
>   * @sd: The seccomp data to check against, only syscall number and arch
>   *      number are considered constant.
>   */
> -static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
> -                                struct seccomp_data *sd)
> +static bool seccomp_is_filter_const_allow(struct sock_fprog_kern *fprog,
> +                                       struct seccomp_data *sd)
>  {
>       unsigned int reg_value = 0;
>       unsigned int pc;
> @@ -812,6 +812,21 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
>       return false;
>  }
>  
> +static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
> +                                struct seccomp_data *sd)
> +{
> +#ifdef __NR_uretprobe
> +     if (sd->nr == __NR_uretprobe
> +#ifdef SECCOMP_ARCH_COMPAT
> +         && sd->arch != SECCOMP_ARCH_COMPAT
> +#endif
> +        )
> +             return true;
> +#endif
> +
> +     return seccomp_is_filter_const_allow(fprog, sd);
> +}
> +
>  static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter,
>                                        void *bitmap, const void *bitmap_prev,
>                                        size_t bitmap_size, int arch)

I minimized the above to:

@@ -749,6 +749,15 @@ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog,
        if (WARN_ON_ONCE(!fprog))
                return false;
 
+       /* Our single exception to filtering. */
+#ifdef __NR_uretprobe
+#ifdef SECCOMP_ARCH_COMPAT
+       if (sd->arch == SECCOMP_ARCH_NATIVE)
+#endif
+               if (sd->nr == __NR_uretprobe)
+                       return true;
+#endif
+
        for (pc = 0; pc < fprog->len; pc++) {
                struct sock_filter *insn = &fprog->filter[pc];
                u16 code = insn->code;

> @@ -1023,6 +1038,9 @@ static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
>   */
>  static const int mode1_syscalls[] = {
>       __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
> +#ifdef __NR_uretprobe
> +     __NR_uretprobe,
> +#endif
>       -1, /* negative terminated */
>  };
>  
> -- 
> 2.43.0
> 

-Kees

-- 
Kees Cook

Reply via email to