On Sat, May 16, 2026 at 11:47:45AM +0800, Ziyu Zhang wrote:
vsock_poll() reads vsk->peer_shutdown before taking the socket
lock to set EPOLLHUP and EPOLLRDHUP, then reads it again under the
lock to report EOF readability. A shutdown packet can update
peer_shutdown while poll is waiting for the lock, so one poll invocation
can report EPOLLIN without the corresponding HUP/RDHUP bits.
Keep non-connectible sockets on a single lockless READ_ONCE()
Should this be paired with WRITE_ONCE() on writers?
snapshot. For connectible sockets, defer shutdown-derived poll bits
until after lock_sock() and use one READ_ONCE() snapshot for both EOF
readability and HUP/RDHUP. This preserves shutdowns that arrive before
the lock is acquired and keeps all peer-shutdown-derived bits consistent
for a poll pass.
Fixes: d021c344051a ("VSOCK: Introduce VM Sockets")
Signed-off-by: Ziyu Zhang <[email protected]>
---
net/vmw_vsock/af_vsock.c | 40 ++++++++++++++++++++++++++--------------
1 file changed, 26 insertions(+), 14 deletions(-)
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index adcba1b7b..bed42347b 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1122,6 +1122,25 @@ static int vsock_shutdown(struct socket *sock, int mode)
return err;
}
+static __poll_t vsock_poll_shutdown(struct sock *sk, u32 peer_shutdown)
+{
+ __poll_t mask = 0;
+
+ /* INET sockets treat local write shutdown and peer write shutdown as a
+ * case of EPOLLHUP set.
+ */
+ if (sk->sk_shutdown == SHUTDOWN_MASK ||
+ ((sk->sk_shutdown & SEND_SHUTDOWN) &&
+ (peer_shutdown & SEND_SHUTDOWN)))
+ mask |= EPOLLHUP;
+
+ if (sk->sk_shutdown & RCV_SHUTDOWN ||
+ peer_shutdown & SEND_SHUTDOWN)
+ mask |= EPOLLRDHUP;
+
+ return mask;
+}
+
static __poll_t vsock_poll(struct file *file, struct socket *sock,
poll_table *wait)
{
@@ -1139,19 +1158,9 @@ static __poll_t vsock_poll(struct file *file, struct
socket *sock,
/* Signify that there has been an error on this socket. */
mask |= EPOLLERR;
- /* INET sockets treat local write shutdown and peer write shutdown as a
- * case of EPOLLHUP set.
- */
- if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
- ((sk->sk_shutdown & SEND_SHUTDOWN) &&
- (vsk->peer_shutdown & SEND_SHUTDOWN))) {
- mask |= EPOLLHUP;
- }
-
- if (sk->sk_shutdown & RCV_SHUTDOWN ||
- vsk->peer_shutdown & SEND_SHUTDOWN) {
- mask |= EPOLLRDHUP;
- }
+ if (!sock_type_connectible(sk->sk_type))
+ mask |= vsock_poll_shutdown(sk,
+ READ_ONCE(vsk->peer_shutdown));
Can we move this in the `if (sock->type == SOCK_DGRAM)` branch ?
Not a strong opinion about that, but in any case IMO we should add a
comment here to explain why we are doing only for not connectible
sockets.
That said, if we use WRITE_ONCE in the writers, do we really need to
move this after the lock_sock for the connectable ones?
if (sk_is_readable(sk))
mask |= EPOLLIN | EPOLLRDNORM;
@@ -1171,6 +1180,7 @@ static __poll_t vsock_poll(struct file *file, struct
socket *sock,
} else if (sock_type_connectible(sk->sk_type)) {
const struct vsock_transport *transport;
+ u32 peer_shutdown;
lock_sock(sk);
@@ -1203,10 +1213,12 @@ static __poll_t vsock_poll(struct file *file, struct
socket *sock,
* terminated should also be considered read, and we check the
* shutdown flag for that.
*/
+ peer_shutdown = READ_ONCE(vsk->peer_shutdown);
if (sk->sk_shutdown & RCV_SHUTDOWN ||
- vsk->peer_shutdown & SEND_SHUTDOWN) {
+ peer_shutdown & SEND_SHUTDOWN) {
mask |= EPOLLIN | EPOLLRDNORM;
}
+ mask |= vsock_poll_shutdown(sk, peer_shutdown);
nit: to keep the order the same as before, I would move this call just
before this `if` block, but I don't think it makes any difference in the
end.
/* Connected sockets that can produce data can be written. */
if (transport && sk->sk_state == TCP_ESTABLISHED) {
--
2.43.0