On Wed, 2026-06-03 at 09:11 -0700, Breno Leitao wrote:
> Convert RDS socket's getsockopt implementation to use the new
> getsockopt_iter callback with sockopt_t.
>
> Key changes:
> - Replace (char __user *optval, int __user *optlen) with sockopt_t *opt
> - Use opt->optlen for buffer length (input) and returned size (output)
> - Use copy_to_iter() instead of put_user()/copy_to_user()
>
> The RDS_INFO_* snapshot path in rds_info_getsockopt() used to pin the
> userspace buffer with pin_user_pages_fast() on the raw optval address;
> the info producers then memcpy into those pages under a spinlock via
> kmap_atomic() and so must not fault. Obtain the same page array and
> starting offset from opt->iter_out with iov_iter_extract_pages(), which
> pins for write because iter_out is ITER_DEST.
>
> The page array is preallocated here (sized with iov_iter_npages()) and
> passed in, so iov_iter_extract_pages() fills it in place rather than
> allocating one for us; RDS therefore keeps ownership of the array on
> every return path and frees it itself. The rds_info_iterator /
> rds_info_copy machinery and all producer callbacks are unchanged.
>
> Kernel buffers (ITER_KVEC) are not page-backed in a way the info
> producers can use, so the RDS_INFO path returns -EOPNOTSUPP for them;
> this matches the previous behaviour, where a kernel-buffer getsockopt
> hit the WARN_ONCE() path in do_sock_getsockopt() and returned
> -EOPNOTSUPP. The simple RDS_RECVERR and SO_RDS_TRANSPORT options keep
> working for kernel buffers via copy_to_iter().
>
> Signed-off-by: Breno Leitao <[email protected]>
Hi Breno,
Thanks for working on this. The conversions look mostly correct to me, just a
few nits I noticed below:
> ---
> net/rds/af_rds.c | 36 ++++++++++++++++-------------
> net/rds/info.c | 70
> +++++++++++++++++++++++++++++++-------------------------
> net/rds/info.h | 3 +--
> 3 files changed, 60 insertions(+), 49 deletions(-)
>
> diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
> index 6f4f9cf352bd..d5defe9172e3 100644
> --- a/net/rds/af_rds.c
> +++ b/net/rds/af_rds.c
> @@ -37,6 +37,7 @@
> #include <linux/in.h>
> #include <linux/ipv6.h>
> #include <linux/poll.h>
> +#include <linux/uio.h>
> #include <net/sock.h>
>
> #include "rds.h"
> @@ -485,35 +486,36 @@ static int rds_setsockopt(struct socket *sock, int
> level, int optname,
> }
>
> static int rds_getsockopt(struct socket *sock, int level, int optname,
> - char __user *optval, int __user *optlen)
> + sockopt_t *opt)
> {
> struct rds_sock *rs = rds_sk_to_rs(sock->sk);
> int ret = -ENOPROTOOPT, len;
> int trans;
> + int val;
>
> if (level != SOL_RDS)
> goto out;
>
> - if (get_user(len, optlen)) {
> - ret = -EFAULT;
> - goto out;
> - }
> + len = opt->optlen;
>
> switch (optname) {
> case RDS_INFO_FIRST ... RDS_INFO_LAST:
> - ret = rds_info_getsockopt(sock, optname, optval,
> - optlen);
> + ret = rds_info_getsockopt(sock, optname, opt);
> break;
>
> case RDS_RECVERR:
> - if (len < sizeof(int))
> + if (len < sizeof(int)) {
> ret = -EINVAL;
> - else
> - if (put_user(rs->rs_recverr, (int __user *) optval) ||
> - put_user(sizeof(int), optlen))
> + break;
> + }
> + val = rs->rs_recverr;
> + if (copy_to_iter(&val, sizeof(int), &opt->iter_out) !=
> + sizeof(int)) {
> ret = -EFAULT;
> - else
> + } else {
> + opt->optlen = sizeof(int);
> ret = 0;
> + }
> break;
> case SO_RDS_TRANSPORT:
> if (len < sizeof(int)) {
> @@ -522,11 +524,13 @@ static int rds_getsockopt(struct socket *sock, int
> level, int optname,
> }
> trans = (rs->rs_transport ? rs->rs_transport->t_type :
> RDS_TRANS_NONE); /* unbound */
> - if (put_user(trans, (int __user *)optval) ||
> - put_user(sizeof(int), optlen))
> + if (copy_to_iter(&trans, sizeof(int), &opt->iter_out) !=
> + sizeof(int)) {
> ret = -EFAULT;
> - else
> + } else {
> + opt->optlen = sizeof(int);
> ret = 0;
> + }
> break;
> default:
> break;
> @@ -653,7 +657,7 @@ static const struct proto_ops rds_proto_ops = {
> .listen = sock_no_listen,
> .shutdown = sock_no_shutdown,
> .setsockopt = rds_setsockopt,
> - .getsockopt = rds_getsockopt,
> + .getsockopt_iter = rds_getsockopt,
> .sendmsg = rds_sendmsg,
> .recvmsg = rds_recvmsg,
> .mmap = sock_no_mmap,
> diff --git a/net/rds/info.c b/net/rds/info.c
> index f1b29994934a..171838782595 100644
> --- a/net/rds/info.c
> +++ b/net/rds/info.c
> @@ -35,6 +35,7 @@
> #include <linux/slab.h>
> #include <linux/proc_fs.h>
> #include <linux/export.h>
> +#include <linux/uio.h>
>
> #include "rds.h"
>
> @@ -144,60 +145,68 @@ void rds_info_copy(struct rds_info_iterator *iter, void
> *data,
> EXPORT_SYMBOL_GPL(rds_info_copy);
>
> /*
> - * @optval points to the userspace buffer that the information snapshot
> - * will be copied into.
> - *
> - * @optlen on input is the size of the buffer in userspace. @optlen
> - * on output is the size of the requested snapshot in bytes.
> + * @opt->iter_out describes the buffer that the information snapshot will be
> + * copied into, and @opt->optlen is the size of that buffer on input. On
> + * output @opt->optlen is set to the size of the requested snapshot in bytes.
> *
> * This function returns -errno if there is a failure, particularly -ENOSPC
> - * if the given userspace buffer was not large enough to fit the snapshot.
> - * On success it returns the positive number of bytes of each array element
> - * in the snapshot.
> + * if the given buffer was not large enough to fit the snapshot. On success
> + * it returns the positive number of bytes of each array element in the
> + * snapshot.
> */
> -int rds_info_getsockopt(struct socket *sock, int optname, char __user
> *optval,
> - int __user *optlen)
> +int rds_info_getsockopt(struct socket *sock, int optname, sockopt_t *opt)
> {
> struct rds_info_iterator iter;
> struct rds_info_lengths lens;
> unsigned long nr_pages = 0;
> - unsigned long start;
> rds_info_func func;
> struct page **pages = NULL;
> + size_t offset0 = 0;
> + int npages = 0;
> int ret;
> int len;
> int total;
>
> - if (get_user(len, optlen)) {
> - ret = -EFAULT;
> - goto out;
> - }
> + len = opt->optlen;
>
> /* check for all kinds of wrapping and the like */
> - start = (unsigned long)optval;
> - if (len < 0 || len > INT_MAX - PAGE_SIZE + 1 || start + len < start) {
> + if (len < 0 || len > INT_MAX - PAGE_SIZE + 1) {
> ret = -EINVAL;
> goto out;
> }
>
> + /* The info producers write into the pages with kmap_atomic() while
> + * holding a spinlock, so they need a genuine page-backed user buffer.
> + */
I think !user_backed_iter() is more accurate for what you're meaning to do
here? Technically we only ever get kvecs or
ubufs since rds_info_getsockopt is only called by do_sock_getsockopt. Those
appear to be the only two types it passes,
so it works out. But it means we're counting on that assumption from the
caller and ideally we should filter anything
that's not user backed. So, I'd replace the iov_iter_is_kvec check with:
if (!user_backed_iter(&opt->iter_out)) {
> + if (iov_iter_is_kvec(&opt->iter_out)) {
> + ret = -EOPNOTSUPP;
> + goto out;
> + }
> +
> /* a 0 len call is just trying to probe its length */
> if (len == 0)
> goto call_func;
>
> - nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK))
> - >> PAGE_SHIFT;
> -
> - pages = kmalloc_objs(struct page *, nr_pages);
> + /*
> + * Preallocate the page array and pass it in so that
> + * iov_iter_extract_pages() fills it in place rather than allocating
> + * one for us. Handing it a non-NULL array keeps ownership of the
> + * array with us on every return path, instead of depending on the
> + * iterator code to allocate and hand it back.
> + */
> + npages = iov_iter_npages(&opt->iter_out, INT_MAX);
> + pages = kvmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
> if (!pages) {
> ret = -ENOMEM;
> goto out;
> }
> - ret = pin_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);
> - if (ret != nr_pages) {
> - if (ret > 0)
> - nr_pages = ret;
> - else
> - nr_pages = 0;
> +
> + ret = iov_iter_extract_pages(&opt->iter_out, &pages, len, npages,
> + 0, &offset0);
> + if (ret < 0)
> + goto out;
> + nr_pages = DIV_ROUND_UP(offset0 + ret, PAGE_SIZE);
> + if (ret != len) {
> ret = -EAGAIN; /* XXX ? */
> goto out;
> }
> @@ -213,7 +222,7 @@ int rds_info_getsockopt(struct socket *sock, int optname,
> char __user *optval,
>
> iter.pages = pages;
> iter.addr = NULL;
> - iter.offset = start & (PAGE_SIZE - 1);
> + iter.offset = offset0;
>
> func(sock, len, &iter, &lens);
> BUG_ON(lens.each == 0);
> @@ -230,13 +239,12 @@ int rds_info_getsockopt(struct socket *sock, int
> optname, char __user *optval,
> ret = lens.each;
> }
>
> - if (put_user(len, optlen))
> - ret = -EFAULT;
> + opt->optlen = len;
>
> out:
The unpin here works, but it took me a little bit to trace out where the
corresponding pin went since it is removed
earlier in the patch. Pages are pinned on extract, but only for user pages.
This works out since the only caller here
passes kvec or ubuf, and we error out on kvec iters. Pages are assumed pinned
if they're not null, but without the
!user_backed_iter check, it leans on this behavior from the caller. Ideally I
think iov_iter_extract_pages is supposed
to be paired with iov_iter_extract_will_pin. A quick comment would help
clarify too. So the closing gate would look
something like this:
/* unpin pages from ubuf iters */
if (pages && iov_iter_extract_will_pin(&opt->iter_out))
> if (pages)
> unpin_user_pages(pages, nr_pages);
Having both checks is somewhat redundant, but it doesnt hurt anything either.
And I think it helps make it a little
more uniform and readable. Other than that I think this patch is looking
pretty good to me.
Thank you!
Allison
> - kfree(pages);
> + kvfree(pages);
>
> return ret;
> }
> diff --git a/net/rds/info.h b/net/rds/info.h
> index a069b51c4679..1aab62ab6d00 100644
> --- a/net/rds/info.h
> +++ b/net/rds/info.h
> @@ -21,8 +21,7 @@ typedef void (*rds_info_func)(struct socket *sock, unsigned
> int len,
>
> void rds_info_register_func(int optname, rds_info_func func);
> void rds_info_deregister_func(int optname, rds_info_func func);
> -int rds_info_getsockopt(struct socket *sock, int optname, char __user
> *optval,
> - int __user *optlen);
> +int rds_info_getsockopt(struct socket *sock, int optname, sockopt_t *opt);
> void rds_info_copy(struct rds_info_iterator *iter, void *data,
> unsigned long bytes);
> void rds_info_iter_unmap(struct rds_info_iterator *iter);
>