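Linux 4.12 introduced a new netlink-based NBD interface [1], developed by Meta (formerly Facebook), to replace the old ioctl-based interface in support of crash recovery and daemon hot upgrade. This patch adds optional libnl3-based support for it and makes the mount tool try the netlink interface first, falling back to the ioctl-based path if that fails.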
[1] https://lore.kernel.org/r/1491512527-4286-1-git-send-email-jba...@fb.com Signed-off-by: Gao Xiang <hsiang...@linux.alibaba.com> --- configure.ac | 29 +++++++ lib/Makefile.am | 1 + lib/backends/nbd.c | 193 +++++++++++++++++++++++++++++++++++++++++++++ lib/liberofs_nbd.h | 2 + mount/Makefile.am | 2 +- mount/main.c | 82 +++++++++++++++---- 6 files changed, 294 insertions(+), 15 deletions(-) diff --git a/configure.ac b/configure.ac index 7db4489..0c03a1d 100644 --- a/configure.ac +++ b/configure.ac @@ -181,6 +181,10 @@ AC_ARG_WITH(json_c, [AS_HELP_STRING([--with-json-c], [Enable and build with json-c support @<:@default=auto@:>@])]) +AC_ARG_WITH(libnl3, + [AS_HELP_STRING([--with-libnl3], + [Enable and build with libnl3 support @<:@default=auto@:>@])]) + AC_ARG_ENABLE(s3, [AS_HELP_STRING([--enable-s3], [enable s3 image generation support @<:@default=no@:>@])], [enable_s3="$enableval"], [enable_s3="no"]) @@ -718,6 +722,31 @@ AS_IF([test "x$with_libxml2" != "xno"], [ ]) ]) +# Configure libnl3 +have_libnl3="no" +AS_IF([test "x$with_libnl3" != "xno"], [ + PKG_CHECK_MODULES([libnl3], [libnl-genl-3.0 >= 3.1], [ + # Paranoia: don't trust the result reported by pkgconfig before trying out + saved_LIBS="$LIBS" + saved_CPPFLAGS=${CPPFLAGS} + CPPFLAGS="${libnl3_CFLAGS} ${CPPFLAGS}" + LIBS="${libnl3_LIBS} $LIBS" + AC_CHECK_HEADERS([netlink/genl/genl.h],[ + AC_CHECK_LIB(nl-genl-3, genl_connect, [], [ + AC_MSG_ERROR([libnl3 doesn't work properly])]) + AC_CHECK_DECL(genl_connect, [have_libnl3="yes"], + [AC_MSG_ERROR([libnl3 doesn't work properly])], [[ +#include <netlink/genl/genl.h> + ]]) + ]) + LIBS="${saved_LIBS}" + CPPFLAGS="${saved_CPPFLAGS}"], [ + AS_IF([test "x$with_libnl3" = "xyes"], [ + AC_MSG_ERROR([Cannot find proper libnl3]) + ]) + ]) +]) + AS_IF([test "x$enable_s3" != "xno"], [ AS_IF( [test "x$have_libcurl" = "xyes" && \ diff --git a/lib/Makefile.am b/lib/Makefile.am index 1c8be2c..1d7958b 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -79,6 +79,7 @@ 
liberofs_la_LDFLAGS += -lpthread liberofs_la_SOURCES += workqueue.c endif if OS_LINUX +liberofs_la_CFLAGS += ${libnl3_CFLAGS} liberofs_la_SOURCES += backends/nbd.c endif if ENABLE_OCI diff --git a/lib/backends/nbd.c b/lib/backends/nbd.c index 43630f0..8b1842c 100644 --- a/lib/backends/nbd.c +++ b/lib/backends/nbd.c @@ -19,6 +19,12 @@ #include "erofs/print.h" #include "liberofs_nbd.h" +#ifdef HAVE_NETLINK_GENL_GENL_H +#include <netlink/netlink.h> +#include <netlink/genl/genl.h> +#include <netlink/genl/ctrl.h> +#endif + #define NBD_SET_SOCK _IO( 0xab, 0 ) #define NBD_SET_BLKSIZE _IO( 0xab, 1 ) #define NBD_DO_IT _IO( 0xab, 3 ) @@ -168,6 +174,193 @@ err_out: return err; } +#if defined(HAVE_NETLINK_GENL_GENL_H) && defined(HAVE_LIBNL_GENL_3) +enum { + NBD_ATTR_UNSPEC, + NBD_ATTR_INDEX, + NBD_ATTR_SIZE_BYTES, + NBD_ATTR_BLOCK_SIZE_BYTES, + NBD_ATTR_TIMEOUT, + NBD_ATTR_SERVER_FLAGS, + NBD_ATTR_CLIENT_FLAGS, + NBD_ATTR_SOCKETS, + NBD_ATTR_DEAD_CONN_TIMEOUT, + NBD_ATTR_DEVICE_LIST, + NBD_ATTR_BACKEND_IDENTIFIER, + __NBD_ATTR_MAX, +}; +#define NBD_ATTR_MAX (__NBD_ATTR_MAX - 1) + +enum { + NBD_SOCK_ITEM_UNSPEC, + NBD_SOCK_ITEM, + __NBD_SOCK_ITEM_MAX, +}; +#define NBD_SOCK_ITEM_MAX (__NBD_SOCK_ITEM_MAX - 1) + +enum { + NBD_SOCK_UNSPEC, + NBD_SOCK_FD, + __NBD_SOCK_MAX, +}; +#define NBD_SOCK_MAX (__NBD_SOCK_MAX - 1) + +enum { + NBD_CMD_UNSPEC, + NBD_CMD_CONNECT, + NBD_CMD_DISCONNECT, + __NBD_CMD_MAX, +}; + +/* client behavior specific flags */ +/* delete the nbd device on disconnect */ +#define NBD_CFLAG_DESTROY_ON_DISCONNECT (1 << 0) +/* disconnect the nbd device on close by last opener */ +#define NBD_CFLAG_DISCONNECT_ON_CLOSE (1 << 1) + +static struct nl_sock *erofs_nbd_get_nl_sock(int *driver_id) +{ + struct nl_sock *socket; + int err; + + socket = nl_socket_alloc(); + if (!socket) { + erofs_err("Couldn't allocate netlink socket"); + return ERR_PTR(-ENOMEM); + } + + err = genl_connect(socket); + if (err) { + erofs_err("Couldn't connect to the generic netlink socket"); + 
return ERR_PTR(err); + } + + err = genl_ctrl_resolve(socket, "nbd"); + if (err < 0) { + erofs_err("Failed to resolve NBD netlink family. Ensure the NBD module is loaded and it supports netlink."); + return ERR_PTR(err); + } + *driver_id = err; + return socket; +} + +struct erofs_nbd_nl_cfg_cbctx { + int *index; + int errcode; +}; + +static int erofs_nbd_nl_cfg_cb(struct nl_msg *msg, void *arg) +{ + struct genlmsghdr *gnlh = nlmsg_data(nlmsg_hdr(msg)); + struct nlattr *msg_attr[NBD_ATTR_MAX + 1]; + struct erofs_nbd_nl_cfg_cbctx *ctx = arg; + int err; + + err = nla_parse(msg_attr, NBD_ATTR_MAX, genlmsg_attrdata(gnlh, 0), + genlmsg_attrlen(gnlh, 0), NULL); + if (err) { + erofs_err("Invalid response from the kernel"); + ctx->errcode = err; + } + + if (!msg_attr[NBD_ATTR_INDEX]) { + erofs_err("Did not receive index from the kernel"); + ctx->errcode = -EBADMSG; + } + *ctx->index = nla_get_u32(msg_attr[NBD_ATTR_INDEX]); + erofs_dbg("Connected /dev/nbd%d\n", *ctx->index); + ctx->errcode = 0; + return NL_OK; +} + +int erofs_nbd_nl_connect(int *index, int blkbits, u64 blocks, + const char *identifier) +{ + struct erofs_nbd_nl_cfg_cbctx cbctx = { + .index = index, + }; + struct nlattr *sock_attr = NULL, *sock_opt = NULL; + struct nl_sock *socket; + struct nl_msg *msg; + int sv[2], err; + int driver_id; + + err = socketpair(AF_UNIX, SOCK_STREAM, 0, sv); + if (err < 0) + return -errno; + + socket = erofs_nbd_get_nl_sock(&driver_id); + if (IS_ERR(socket)) { + err = PTR_ERR(socket); + goto err_out; + } + nl_socket_modify_cb(socket, NL_CB_VALID, NL_CB_CUSTOM, + erofs_nbd_nl_cfg_cb, &cbctx); + + msg = nlmsg_alloc(); + if (!msg) { + erofs_err("Couldn't allocate netlink message"); + err = -ENOMEM; + goto err_nls_free; + } + + genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, driver_id, 0, 0, + NBD_CMD_CONNECT, 0); + if (*index >= 0) + NLA_PUT_U32(msg, NBD_ATTR_INDEX, *index); + NLA_PUT_U64(msg, NBD_ATTR_BLOCK_SIZE_BYTES, 1u << blkbits); + NLA_PUT_U64(msg, NBD_ATTR_SIZE_BYTES, blocks << 
blkbits); + NLA_PUT_U64(msg, NBD_ATTR_SERVER_FLAGS, NBD_FLAG_READ_ONLY); + NLA_PUT_U64(msg, NBD_ATTR_TIMEOUT, 0); + if (identifier) + NLA_PUT_STRING(msg, NBD_ATTR_BACKEND_IDENTIFIER, identifier); + + err = -EINVAL; + sock_attr = nla_nest_start(msg, NBD_ATTR_SOCKETS); + if (!sock_attr) { + erofs_err("Couldn't nest the sockets for our connection"); + goto err_nlm_free; + } + + sock_opt = nla_nest_start(msg, NBD_SOCK_ITEM); + if (!sock_opt) { + nla_nest_cancel(msg, sock_attr); + goto err_nlm_free; + } + NLA_PUT_U32(msg, NBD_SOCK_FD, sv[1]); + nla_nest_end(msg, sock_opt); + nla_nest_end(msg, sock_attr); + + err = nl_send_sync(socket, msg); + if (err) + goto err_out; + nl_socket_free(socket); + if (cbctx.errcode) + return cbctx.errcode; + return sv[0]; + +nla_put_failure: + if (sock_opt) + nla_nest_cancel(msg, sock_opt); + if (sock_attr) + nla_nest_cancel(msg, sock_attr); +err_nlm_free: + nlmsg_free(msg); +err_nls_free: + nl_socket_free(socket); +err_out: + close(sv[0]); + close(sv[1]); + return err; +} +#else +int erofs_nbd_nl_connect(int *index, int blkbits, u64 blocks, + const char *identifier) +{ + return -EOPNOTSUPP; +} +#endif + int erofs_nbd_do_it(int nbdfd) { int err; diff --git a/lib/liberofs_nbd.h b/lib/liberofs_nbd.h index c493aca..89c4cf2 100644 --- a/lib/liberofs_nbd.h +++ b/lib/liberofs_nbd.h @@ -39,4 +39,6 @@ int erofs_nbd_get_request(int skfd, struct erofs_nbd_request *rq); int erofs_nbd_send_reply_header(int skfd, __le64 cookie, int err); int erofs_nbd_disconnect(int nbdfd); +int erofs_nbd_nl_connect(int *index, int blkbits, u64 blocks, + const char *identifier); #endif diff --git a/mount/Makefile.am b/mount/Makefile.am index b76e336..d93f3f4 100644 --- a/mount/Makefile.am +++ b/mount/Makefile.am @@ -9,5 +9,5 @@ mount_erofs_SOURCES = main.c mount_erofs_CFLAGS = -Wall -I$(top_srcdir)/include mount_erofs_LDADD = $(top_builddir)/lib/liberofs.la ${libselinux_LIBS} \ ${liblz4_LIBS} ${liblzma_LIBS} ${zlib_LIBS} ${libdeflate_LIBS} \ - ${libzstd_LIBS} 
${libqpl_LIBS} ${libxxhash_LIBS} + ${libzstd_LIBS} ${libqpl_LIBS} ${libxxhash_LIBS} ${libnl3_LIBS} endif diff --git a/mount/main.c b/mount/main.c index c9deae2..d82e526 100644 --- a/mount/main.c +++ b/mount/main.c @@ -5,6 +5,7 @@ #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <signal.h> #include <sys/mount.h> #include <sys/types.h> #include <pthread.h> @@ -299,13 +300,60 @@ out_closefd: return err; } +static int erofsmount_startnbd_nl(pid_t *pid, const char *source) +{ + struct erofsmount_nbd_ctx ctx = {}; + int err, num; + int pipefd[2]; + + err = open(source, O_RDONLY); + if (err < 0) + return -errno; + ctx.vd.fd = err; + + err = pipe(pipefd); + if (err < 0) { + err = -errno; + close(ctx.vd.fd); + return err; + } + if ((*pid = fork()) == 0) { + /* Otherwise, NBD disconnect sends SIGPIPE, skipping cleanup */ + if (signal(SIGPIPE, SIG_IGN) == SIG_ERR) { + close(ctx.vd.fd); + exit(EXIT_FAILURE); + } + + num = -1; + err = erofs_nbd_nl_connect(&num, 9, INT64_MAX >> 9, NULL); + if (err >= 0) { + ctx.sk.fd = err; + err = write(pipefd[1], &num, sizeof(int)); + if (err >= sizeof(int)) { + close(pipefd[1]); + close(pipefd[0]); + err = (int)(uintptr_t)erofsmount_nbd_loopfn(&ctx); + exit(err ? 
EXIT_FAILURE : EXIT_SUCCESS); + } + } + close(ctx.vd.fd); + exit(EXIT_FAILURE); + } + close(pipefd[1]); + err = read(pipefd[0], &num, sizeof(int)); + close(pipefd[0]); + if (err < sizeof(int)) + return -EPIPE; + return num; +} + static int erofsmount_nbd(const char *source, const char *mountpoint, const char *fstype, int flags, const char *options) { char nbdpath[32]; int num, nbdfd; - pid_t pid; + pid_t pid = 0; long err; if (strcmp(fstype, "erofs")) { @@ -315,19 +363,25 @@ static int erofsmount_nbd(const char *source, const char *mountpoint, } flags |= MS_RDONLY; - num = erofs_nbd_devscan(); - if (num < 0) - return num; - - (void)snprintf(nbdpath, sizeof(nbdpath), "/dev/nbd%d", num); - nbdfd = open(nbdpath, O_RDWR); - if (nbdfd < 0) - return -errno; - - if ((pid = fork()) == 0) - return erofsmount_startnbd(nbdfd, source) ? - EXIT_FAILURE : EXIT_SUCCESS; - close(nbdfd); + err = erofsmount_startnbd_nl(&pid, source); + if (err < 0) { + num = erofs_nbd_devscan(); + if (num < 0) + return num; + + (void)snprintf(nbdpath, sizeof(nbdpath), "/dev/nbd%d", num); + nbdfd = open(nbdpath, O_RDWR); + if (nbdfd < 0) + return -errno; + + if ((pid = fork()) == 0) + return erofsmount_startnbd(nbdfd, source) ? + EXIT_FAILURE : EXIT_SUCCESS; + close(nbdfd); + } else { + num = err; + (void)snprintf(nbdpath, sizeof(nbdpath), "/dev/nbd%d", num); + } while (1) { err = erofs_nbd_in_service(num); -- 2.43.0