> diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
> index 0323a2b6c..2fccae0df 100644
> --- a/include/linux/skmsg.h
> +++ b/include/linux/skmsg.h
> @@ -97,6 +97,7 @@ struct sk_psock {
>       struct sk_buff_head             ingress_skb;
>       struct list_head                ingress_msg;
>       spinlock_t                      ingress_lock;
> +     ssize_t                         ingress_size;
>       unsigned long                   state;
>       struct list_head                link;
>       spinlock_t                      link_lock;

[ ... ]

> @@ -321,6 +322,16 @@ static inline void sock_drop(struct sock *sk, struct sk_buff *skb)
>       kfree_skb(skb);
>  }
>
> +static inline ssize_t sk_psock_get_msg_size(struct sk_psock *psock)
> +{
> +     return READ_ONCE(psock->ingress_size);
> +}
> +
> +static inline void sk_psock_inc_msg_size(struct sk_psock *psock, ssize_t diff)
> +{
> +     WRITE_ONCE(psock->ingress_size, READ_ONCE(psock->ingress_size) + diff);
> +}
> +

Can sk_psock_inc_msg_size() race with itself?  The function implements a
read-modify-write operation that isn't atomic.  Looking at the call sites:

- sk_psock_queue_msg() holds ingress_lock when calling this
- sk_psock_dequeue_msg() holds ingress_lock when calling this
- __sk_psock_purge_ingress_msg() holds ingress_lock when calling this
- __sk_msg_recvmsg() does NOT hold ingress_lock when calling this

The __sk_msg_recvmsg() path runs under the socket lock via
tcp_bpf_recvmsg()->lock_sock(), but sk_psock_queue_msg() can be called
from softirq context (network RX -> BPF verdict ->
sk_psock_verdict_apply() -> sk_psock_skb_ingress_self() ->
sk_psock_queue_msg()) without the socket lock.

This means concurrent updates from different lock contexts can race:

Thread A (softirq, ingress_lock): reads size=1000
Thread B (process, socket lock): reads size=1000, subtracts 50, writes 950
Thread A: adds 100 to stale 1000, writes 1100

Result: size=1100 but should be 1050 (lost the -50 update)

The ingress_size counter would drift from the actual queue size, breaking
the FIONREAD accuracy that this patch aims to provide.
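
Note that READ_ONCE()/WRITE_ONCE() only prevent torn loads and stores;
they do not make the read-modify-write atomic.  One way to close the
race, regardless of which lock the caller holds, is to make the counter
atomic so the update happens in a single step.  Untested sketch, reusing
the field and helper names from this patch:

	atomic_long_t			ingress_size;

	static inline ssize_t sk_psock_get_msg_size(struct sk_psock *psock)
	{
		return atomic_long_read(&psock->ingress_size);
	}

	static inline void sk_psock_inc_msg_size(struct sk_psock *psock,
						 ssize_t diff)
	{
		/* Atomic RMW: safe against concurrent updates from the
		 * ingress_lock and socket-lock paths, no lost updates.
		 */
		atomic_long_add(diff, &psock->ingress_size);
	}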

>  static inline bool sk_psock_queue_msg(struct sk_psock *psock,
>                                     struct sk_msg *msg)
>  {
> @@ -329,6 +340,7 @@ static inline bool sk_psock_queue_msg(struct sk_psock *psock,
>       spin_lock_bh(&psock->ingress_lock);
>       if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
>               list_add_tail(&msg->list, &psock->ingress_msg);
> +             sk_psock_inc_msg_size(psock, msg->sg.size);
>               ret = true;
>       } else {
>               sk_msg_free(psock->sk, msg);

[ ... ]

> @@ -345,18 +357,25 @@ static inline struct sk_msg *sk_psock_dequeue_msg(struct sk_psock *psock)
>
>       spin_lock_bh(&psock->ingress_lock);
>       msg = list_first_entry_or_null(&psock->ingress_msg, struct sk_msg, list);
> -     if (msg)
> +     if (msg) {
>               list_del(&msg->list);
> +             sk_psock_inc_msg_size(psock, -msg->sg.size);
> +     }
>       spin_unlock_bh(&psock->ingress_lock);
>       return msg;
>  }

[ ... ]

> diff --git a/net/core/skmsg.c b/net/core/skmsg.c
> index d73e03f77..c959d52a6 100644
> --- a/net/core/skmsg.c
> +++ b/net/core/skmsg.c
> @@ -455,6 +455,7 @@ int __sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg
>                                       atomic_sub(copy, &sk->sk_rmem_alloc);
>                               }
>                               msg_rx->sg.size -= copy;
> +                             sk_psock_inc_msg_size(psock, -copy);

This call to sk_psock_inc_msg_size() runs without ingress_lock, only
under the socket lock.  Can it race with sk_psock_queue_msg(), which
holds ingress_lock but not the socket lock?
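
It can, per the interleaving shown above.  If ingress_size stays a
plain ssize_t, this writer would also need to take ingress_lock, e.g.
(sketch, untested; note this takes the spinlock once per segment copied
in the recvmsg loop):

	spin_lock_bh(&psock->ingress_lock);
	sk_psock_inc_msg_size(psock, -copy);
	spin_unlock_bh(&psock->ingress_lock);

Making ingress_size an atomic_long_t as sketched above avoids that
extra lock traffic.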

>
>                               if (!sge->length) {
>                                       sk_msg_iter_var_next(i);
> @@ -819,9 +820,11 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock)
>               list_del(&msg->list);
>               if (!msg->skb)
>                       atomic_sub(msg->sg.size, &psock->sk->sk_rmem_alloc);
> +             sk_psock_inc_msg_size(psock, -((ssize_t)msg->sg.size));
>               sk_msg_free(psock->sk, msg);
>               kfree(msg);
>       }
> +     WARN_ON_ONCE(psock->ingress_size);
>  }

[ ... ]


---
AI reviewed your patch. Please fix the bug or reply by email explaining why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/19669112811
