Signed-off-by: Thomas Graf <tg...@suug.ch> --- lib/dpif-linux.c | 3 +- lib/netlink-protocol.h | 39 ++++++ lib/netlink-socket.c | 345 ++++++++++++++++++++++++++++++++++++++++++++----- lib/netlink-socket.h | 10 +- 4 files changed, 361 insertions(+), 36 deletions(-)
diff --git a/lib/dpif-linux.c b/lib/dpif-linux.c index 63e66f3..42a3f72 100644 --- a/lib/dpif-linux.c +++ b/lib/dpif-linux.c @@ -1777,6 +1777,7 @@ dpif_linux_recv__(struct dpif_linux *dpif, uint32_t handler_id, while (handler->event_offset < handler->n_events) { int idx = handler->epoll_events[handler->event_offset].data.u32; struct dpif_channel *ch = &dpif->handlers[handler_id].channels[idx]; + int events = handler->epoll_events[handler->event_offset].events; handler->event_offset++; @@ -1788,7 +1789,7 @@ dpif_linux_recv__(struct dpif_linux *dpif, uint32_t handler_id, return EAGAIN; } - error = nl_sock_recv(ch->sock, buf, false); + error = nl_sock_recv_events(ch->sock, buf, false, events); if (error == ENOBUFS) { /* ENOBUFS typically means that we've received so many * packets that the buffer overflowed. Try again diff --git a/lib/netlink-protocol.h b/lib/netlink-protocol.h index 3009fc5..13e1216 100644 --- a/lib/netlink-protocol.h +++ b/lib/netlink-protocol.h @@ -161,6 +161,45 @@ enum { #define NETLINK_DROP_MEMBERSHIP 2 #endif +#ifndef __ALIGN_KERNEL +#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) +#endif + +#ifndef NETLINK_RX_RING +#define NETLINK_RX_RING 6 +#define NETLINK_TX_RING 7 + +struct nl_mmap_req { + unsigned int nm_block_size; + unsigned int nm_block_nr; + unsigned int nm_frame_size; + unsigned int nm_frame_nr; +}; + +struct nl_mmap_hdr { + unsigned int nm_status; + unsigned int nm_len; + __u32 nm_group; + /* credentials */ + __u32 nm_pid; + __u32 nm_uid; + __u32 nm_gid; +}; + +enum nl_mmap_status { + NL_MMAP_STATUS_UNUSED, + NL_MMAP_STATUS_RESERVED, + NL_MMAP_STATUS_VALID, + NL_MMAP_STATUS_COPY, + NL_MMAP_STATUS_SKIP, +}; + +#define NL_MMAP_MSG_ALIGNMENT NLMSG_ALIGNTO +#define NL_MMAP_MSG_ALIGN(sz) __ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT) +#define NL_MMAP_HDRLEN NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr)) +#endif /* NETLINK_RX_RING */ + /* These were 
introduced all together in 2.6.23.  (We want our programs to
  * support the newer kernel features even if compiled with older headers.) */
 #ifndef CTRL_ATTR_MCAST_GRP_MAX
diff --git a/lib/netlink-socket.c b/lib/netlink-socket.c
index e4cc4ad..a264a0a 100644
--- a/lib/netlink-socket.c
+++ b/lib/netlink-socket.c
@@ -21,6 +21,8 @@
 #include <stdlib.h>
 #include <sys/types.h>
 #include <sys/uio.h>
+#include <sys/mman.h>
+#include <sys/epoll.h>
 #include <unistd.h>
 #include "coverage.h"
 #include "dynamic-string.h"
@@ -41,7 +43,9 @@ VLOG_DEFINE_THIS_MODULE(netlink_socket);
 COVERAGE_DEFINE(netlink_overflow);
 COVERAGE_DEFINE(netlink_received);
 COVERAGE_DEFINE(netlink_recv_jumbo);
+COVERAGE_DEFINE(netlink_recv_mmap);
 COVERAGE_DEFINE(netlink_sent);
+COVERAGE_DEFINE(netlink_sent_mmap);
 
 /* Linux header file confusion causes this to be undefined. */
 #ifndef SOL_NETLINK
@@ -59,12 +63,30 @@ static void log_nlmsg(const char *function, int error,
 
 /* Netlink sockets. */
 
+/* Memory mapped ring buffer for Netlink messages
+ *
+ * Total memory consumption per ring:
+ *   PAGE_SIZE * NL_BLOCK_NPAGES * NL_NBLOCKS */
+#define NL_BLOCK_NPAGES 4       /* Number of pages per block */
+#define NL_NBLOCKS 64           /* Number of blocks per ring */
+#define NL_FRAME_SIZE 16384     /* Maximum message size to be carried */
+
+struct nl_ring {
+    unsigned int head;
+    void *ring;
+};
+
 struct nl_sock {
     int fd;
     uint32_t next_seq;
     uint32_t pid;
     int protocol;
    unsigned int rcvbuf;        /* Receive buffer size (SO_RCVBUF).
*/ + unsigned int frame_size; + unsigned int frame_nr; + size_t ring_size; + struct nl_ring rx_ring; + struct nl_ring tx_ring; }; /* Compile-time limit on iovecs, so that we can allocate a maximum-size array @@ -112,7 +134,7 @@ nl_sock_create(int protocol, struct nl_sock **sockp) } *sockp = NULL; - sock = xmalloc(sizeof *sock); + sock = xzalloc(sizeof *sock); sock->fd = socket(AF_NETLINK, SOCK_RAW, protocol); if (sock->fd < 0) { @@ -179,13 +201,85 @@ error: return retval; } +static int +nl_sock_set_ring(struct nl_sock *sock) +{ + size_t block_size = NL_BLOCK_NPAGES * getpagesize(); + size_t ring_size; + void *ring; + struct nl_mmap_req req = { + .nm_block_size = block_size, + .nm_block_nr = NL_NBLOCKS, + .nm_frame_size = NL_FRAME_SIZE, + }; + + req.nm_frame_nr = req.nm_block_nr * block_size / req.nm_frame_size; + + if (setsockopt(sock->fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0 + || setsockopt(sock->fd, SOL_NETLINK, NETLINK_TX_RING, &req, sizeof(req)) < 0) { + VLOG_INFO("mmap netlink is not supported"); + return 0; + } + + ring_size = req.nm_block_nr * req.nm_block_size; + ring = mmap(NULL, 2 * ring_size, PROT_READ | PROT_WRITE, + MAP_SHARED, sock->fd, 0); + if (ring == MAP_FAILED) { + VLOG_ERR("netlink mmap: %s", ovs_strerror(errno)); + return errno; + } + + sock->frame_size = req.nm_frame_size; + sock->frame_nr = req.nm_frame_nr - 1; + sock->ring_size = ring_size; + sock->rx_ring.ring = ring; + sock->rx_ring.head = 0; + sock->tx_ring.ring = (char *) ring + ring_size; + sock->tx_ring.head = 0; + + return 0; +} + +/* Creates a new memory mapped netlink socket for the given netlink 'protocol' + * (NETLINK_ROUTE, NETLINK_GENERIC, ...). Falls back to unmapped socket if + * kernel side does not support it. Returns 0 and sets '*sockp' to the new + * socket if successful, otherwise returns a positive errno value. 
*/ +int +nl_sock_create_mmap(int protocol, struct nl_sock **sockp) +{ + int retval; + + if ((retval = nl_sock_create(protocol, sockp)) < 0) { + return retval; + } + + if ((retval = nl_sock_set_ring(*sockp)) < 0) { + VLOG_ERR("failed to initialize memory mapped netlink socket"); + nl_sock_destroy(*sockp); + return retval; + } + + return retval; +} + +/* Returns true if netlink socked is memory mapped */ +bool +nl_sock_is_mapped(const struct nl_sock *sock) +{ + return sock->rx_ring.ring != NULL; +} + /* Creates a new netlink socket for the same protocol as 'src'. Returns 0 and * sets '*sockp' to the new socket if successful, otherwise returns a positive * errno value. */ int nl_sock_clone(const struct nl_sock *src, struct nl_sock **sockp) { - return nl_sock_create(src->protocol, sockp); + if (nl_sock_is_mapped(src)) { + return nl_sock_create_mmap(src->protocol, sockp); + } else { + return nl_sock_create(src->protocol, sockp); + } } /* Destroys netlink socket 'sock'. */ @@ -193,6 +287,9 @@ void nl_sock_destroy(struct nl_sock *sock) { if (sock) { + if (nl_sock_is_mapped(sock)) { + munmap(sock->rx_ring.ring, 2 * sock->ring_size); + } close(sock->fd); free(sock); } @@ -243,6 +340,105 @@ nl_sock_leave_mcgroup(struct nl_sock *sock, unsigned int multicast_group) return 0; } +static struct nl_mmap_hdr * +mmap_frame(struct nl_sock *sock, struct nl_ring *r) +{ + char *start = r->ring; + + return (struct nl_mmap_hdr *)(void *)(start + r->head * sock->frame_size); +} + +static struct nl_mmap_hdr * +mmap_next_rx_frame(struct nl_sock *sock) +{ + return mmap_frame(sock, &sock->rx_ring); +} + +static struct nl_mmap_hdr * +mmap_next_tx_frame(struct nl_sock *sock) +{ + return mmap_frame(sock, &sock->tx_ring); +} + +static void +mmap_advance_ring(struct nl_sock *sock, struct nl_ring *r) +{ + if (r->head != sock->frame_nr) { + r->head++; + } else { + r->head = 0; + } +} + +static void +mmap_advance_rx_ring(struct nl_sock *sock) +{ + mmap_advance_ring(sock, &sock->rx_ring); +} + +static 
void
+mmap_advance_tx_ring(struct nl_sock *sock)
+{
+    mmap_advance_ring(sock, &sock->tx_ring);
+}
+
+static int
+nl_sock_send_linear(struct nl_sock *sock, const struct ofpbuf *msg,
+                    bool wait)
+{
+    int retval, error;
+
+    do {
+        retval = send(sock->fd, ofpbuf_data(msg), ofpbuf_size(msg), wait ? 0 : MSG_DONTWAIT);
+        error = retval < 0 ? errno : 0;
+    } while (error == EINTR);
+
+    return error;
+}
+
+static int
+nl_sock_send_mmap(struct nl_sock *sock, const struct ofpbuf *msg,
+                  bool wait)
+{
+    struct nl_mmap_hdr *hdr;
+    struct sockaddr_nl addr = {
+        .nl_family = AF_NETLINK,
+    };
+    int retval, error;
+
+    if ((ofpbuf_size(msg) + NL_MMAP_HDRLEN) > sock->frame_size)
+        return nl_sock_send_linear(sock, msg, wait);
+
+    hdr = mmap_next_tx_frame(sock);
+
+    while (hdr->nm_status != NL_MMAP_STATUS_UNUSED) {
+        /* No frame available.  Block and re-check, or give up. */
+        if (wait) {
+            nl_sock_wait(sock, POLLOUT | POLLERR);
+            poll_block();
+        } else {
+            return EAGAIN;
+        }
+    }
+
+    memcpy((char *) hdr + NL_MMAP_HDRLEN, ofpbuf_data(msg), ofpbuf_size(msg));
+    hdr->nm_len = ofpbuf_size(msg);
+    hdr->nm_status = NL_MMAP_STATUS_VALID;
+
+    mmap_advance_tx_ring(sock);
+
+    do {
+        retval = sendto(sock->fd, NULL, 0, 0, (struct sockaddr *)&addr, sizeof(addr));
+        error = retval < 0 ? errno : 0;
+    } while (error == EINTR);
+
+    if (!error) {
+        COVERAGE_INC(netlink_sent_mmap);
+    }
+
+    return error;
+}
+
 static int
 nl_sock_send__(struct nl_sock *sock, const struct ofpbuf *msg,
                uint32_t nlmsg_seq, bool wait)
@@ -253,11 +449,11 @@ nl_sock_send__(struct nl_sock *sock, const struct ofpbuf *msg,
     nlmsg->nlmsg_len = ofpbuf_size(msg);
     nlmsg->nlmsg_seq = nlmsg_seq;
     nlmsg->nlmsg_pid = sock->pid;
-    do {
-        int retval;
-        retval = send(sock->fd, ofpbuf_data(msg), ofpbuf_size(msg), wait ? 0 : MSG_DONTWAIT);
-        error = retval < 0 ?
errno : 0; - } while (error == EINTR); + if (nl_sock_is_mapped(sock)) { + error = nl_sock_send_mmap(sock, msg, wait); + } else { + error = nl_sock_send_linear(sock, msg, wait); + } log_nlmsg(__func__, error, ofpbuf_data(msg), ofpbuf_size(msg), sock->protocol); if (!error) { COVERAGE_INC(netlink_sent); @@ -298,26 +494,17 @@ nl_sock_send_seq(struct nl_sock *sock, const struct ofpbuf *msg, } static int -nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait) +nl_sock_recv_linear(struct nl_sock *sock, struct ofpbuf *buf, bool wait, + uint8_t *tail, size_t taillen) { - /* We can't accurately predict the size of the data to be received. The - * caller is supposed to have allocated enough space in 'buf' to handle the - * "typical" case. To handle exceptions, we make available enough space in - * 'tail' to allow Netlink messages to be up to 64 kB long (a reasonable - * figure since that's the maximum length of a Netlink attribute). */ - struct nlmsghdr *nlmsghdr; - uint8_t tail[65536]; struct iovec iov[2]; struct msghdr msg; - ssize_t retval; - - ovs_assert(buf->allocated >= sizeof *nlmsghdr); - ofpbuf_clear(buf); + int retval; iov[0].iov_base = ofpbuf_base(buf); iov[0].iov_len = buf->allocated; iov[1].iov_base = tail; - iov[1].iov_len = sizeof tail; + iov[1].iov_len = taillen; memset(&msg, 0, sizeof msg); msg.msg_iov = iov; @@ -343,21 +530,100 @@ nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait) return E2BIG; } - nlmsghdr = ofpbuf_data(buf); - if (retval < sizeof *nlmsghdr - || nlmsghdr->nlmsg_len < sizeof *nlmsghdr - || nlmsghdr->nlmsg_len > retval) { - VLOG_ERR_RL(&rl, "received invalid nlmsg (%"PRIuSIZE"d bytes < %"PRIuSIZE")", - retval, sizeof *nlmsghdr); - return EPROTO; - } - ofpbuf_set_size(buf, MIN(retval, buf->allocated)); if (retval > buf->allocated) { COVERAGE_INC(netlink_recv_jumbo); ofpbuf_put(buf, tail, retval - buf->allocated); } + return 0; +} + +static int +nl_sock_recv_mmap(struct nl_sock *sock, struct ofpbuf *buf, bool 
wait,
+                  uint8_t *tail, size_t taillen)
+{
+    struct nl_mmap_hdr *hdr;
+    int retval = 0;
+
+restart:
+    hdr = mmap_next_rx_frame(sock);
+
+    switch (hdr->nm_status) {
+    case NL_MMAP_STATUS_VALID:
+        if (hdr->nm_len == 0) {
+            /* error occurred while constructing message on other side */
+            hdr->nm_status = NL_MMAP_STATUS_UNUSED;
+            mmap_advance_rx_ring(sock);
+            goto restart;
+        }
+
+        ofpbuf_put(buf, (char *) hdr + NL_MMAP_HDRLEN, hdr->nm_len);
+        COVERAGE_INC(netlink_recv_mmap);
+        break;
+
+    case NL_MMAP_STATUS_COPY:
+        retval = nl_sock_recv_linear(sock, buf, false, tail, taillen);
+        if (retval != 0) {
+            return retval;
+        }
+        break;
+
+    case NL_MMAP_STATUS_UNUSED:
+    case NL_MMAP_STATUS_RESERVED:
+    default:
+        if (wait) {
+            nl_sock_wait(sock, POLLIN | POLLERR);
+            poll_block();
+            goto restart;
+        }
+
+        return EAGAIN;
+    }
+
+    hdr->nm_status = NL_MMAP_STATUS_UNUSED;
+    mmap_advance_rx_ring(sock);
+
+    return retval;
+}
+
+static int
+nl_sock_recv__(struct nl_sock *sock, struct ofpbuf *buf, bool wait,
+               uint32_t events)
+{
+    /* We can't accurately predict the size of the data to be received.  The
+     * caller is supposed to have allocated enough space in 'buf' to handle the
+     * "typical" case.  To handle exceptions, we make available enough space in
+     * 'tail' to allow Netlink messages to be up to 64 kB long (a reasonable
+     * figure since that's the maximum length of a Netlink attribute). */
+    struct nlmsghdr *nlmsghdr;
+    uint8_t tail[65536];
+    int retval;
+
+    ovs_assert(buf->allocated >= sizeof *nlmsghdr);
+    ofpbuf_clear(buf);
+
+    /* nl_sock_recv_mmap() cannot handle EPOLLERR events, force linear receive
+     * to retrieve error notification.
*/
+    if (!nl_sock_is_mapped(sock) || (events & EPOLLERR)) {
+        retval = nl_sock_recv_linear(sock, buf, wait, tail, sizeof(tail));
+    } else {
+        retval = nl_sock_recv_mmap(sock, buf, wait, tail, sizeof(tail));
+    }
+
+    if (retval != 0) {
+        return retval;
+    }
+
+    nlmsghdr = ofpbuf_data(buf);
+    if (ofpbuf_size(buf) < sizeof *nlmsghdr
+        || nlmsghdr->nlmsg_len < sizeof *nlmsghdr
+        || nlmsghdr->nlmsg_len > ofpbuf_size(buf)) {
+        VLOG_ERR_RL(&rl, "received invalid nlmsg (%u bytes < %"PRIuSIZE")",
+                    ofpbuf_size(buf), sizeof *nlmsghdr);
+        return EPROTO;
+    }
+
     log_nlmsg(__func__, 0, ofpbuf_data(buf), ofpbuf_size(buf), sock->protocol);
     COVERAGE_INC(netlink_received);
@@ -384,7 +650,22 @@
 int
 nl_sock_recv(struct nl_sock *sock, struct ofpbuf *buf, bool wait)
 {
-    return nl_sock_recv__(sock, buf, wait);
+    return nl_sock_recv__(sock, buf, wait, 0);
+}
+
+/* Variation of nl_sock_recv() to be used in combination with event polling.
+ * Behaves the same but takes the additional 'events' provided by poll() or
+ * epoll_wait().
+ *
+ * Required to receive error notifications on the socket for shared memory
+ * based ring buffer implementations which don't necessarily invoke recvmsg()
+ * on the socket.
+ */
+int
+nl_sock_recv_events(struct nl_sock *sock, struct ofpbuf *buf, bool wait,
+                    uint32_t events)
+{
+    return nl_sock_recv__(sock, buf, wait, events);
 }
 
 static void
@@ -472,7 +753,7 @@ nl_sock_transact_multiple__(struct nl_sock *sock,
     }
 
     /* Receive a reply.
*/ - error = nl_sock_recv__(sock, buf_txn->reply, false); + error = nl_sock_recv__(sock, buf_txn->reply, false, 0); if (error) { if (error == EAGAIN) { nl_sock_record_errors__(transactions, n, 0); @@ -745,7 +1026,7 @@ nl_dump_next(struct nl_dump *dump, struct ofpbuf *reply, struct ofpbuf *buffer) return false; } - retval = nl_sock_recv__(dump->sock, buffer, false); + retval = nl_sock_recv__(dump->sock, buffer, false, 0); if (retval) { ofpbuf_clear(buffer); if (retval == EAGAIN) { diff --git a/lib/netlink-socket.h b/lib/netlink-socket.h index dd32409..fbb2b3f 100644 --- a/lib/netlink-socket.h +++ b/lib/netlink-socket.h @@ -38,9 +38,9 @@ * Most of the netlink functions are not fully thread-safe: Only a single * thread may use a given nl_sock or nl_dump at one time. The exceptions are: * - * - nl_sock_recv() is conditionally thread-safe: it may be called from - * different threads with the same nl_sock, but each caller must provide - * an independent receive buffer. + * - nl_sock_recv() and nl_sock_recv_events() are conditionally thread-safe: + * it may be called from different threads with the same nl_sock, but each + * caller must provide an independent receive buffer. * * - nl_dump_next() is conditionally thread-safe: it may be called from * different threads with the same nl_dump, but each caller must provide @@ -61,6 +61,8 @@ struct nl_sock; /* Netlink sockets. 
*/ int nl_sock_create(int protocol, struct nl_sock **); +int nl_sock_create_mmap(int protocol, struct nl_sock **); +bool nl_sock_is_mapped(const struct nl_sock *); int nl_sock_clone(const struct nl_sock *, struct nl_sock **); void nl_sock_destroy(struct nl_sock *); @@ -71,6 +73,8 @@ int nl_sock_send(struct nl_sock *, const struct ofpbuf *, bool wait); int nl_sock_send_seq(struct nl_sock *, const struct ofpbuf *, uint32_t nlmsg_seq, bool wait); int nl_sock_recv(struct nl_sock *, struct ofpbuf *, bool wait); +int nl_sock_recv_events(struct nl_sock *, struct ofpbuf *, bool wait, + uint32_t events); int nl_sock_transact(struct nl_sock *, const struct ofpbuf *request, struct ofpbuf **replyp); -- 1.8.3.1 _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev