Hello again, In my spare time I did the following simple libc-only implementation of the syscalls. I did some tests in a VM adapting these experiments: https://blog.cloudflare.com/how-to-receive-a-million-packets/
Any comments about the diff are greatly appreciated. Best regards, Boris Astardzhiev On Fri, Jan 8, 2016 at 7:02 PM, Adrian Chadd <adrian.ch...@gmail.com> wrote: > On 8 January 2016 at 03:02, Bruce Evans <b...@optusnet.com.au> wrote: > > On Fri, 8 Jan 2016, Adrian Chadd wrote: > > > >> On 7 January 2016 at 23:58, Mark Delany <c...@romeo.emu.st> wrote: > >>> > >>> On 08Jan16, Bruce Evans allegedly wrote: > >>>> > >>>> If the NIC can't reach line rate > >>> > >>> > >>>> Network stack overheads are also enormous. > >>> > >>> > >>> Bruce makes some excellent points. > >>> > >>> I challenge anyone to get line rate UDP out of FBSD (or Linux) for a > >>> 1G NIC let alone a 10G NIC listening to a single port. It was exactly > >>> my frustration with UDP performance that led me down the path of > >>> *mmsg() and netmap. > >>> > >>> Frankly this is an opportunity for FBSD as UDP performance appears to > >>> be a neglected area. > >> > >> > >> I'm there, on 16 threads. > >> > >> I'd rather we do it on two or three, as a lot of time is wasted in > >> producer/consumer locking. but yeah, 500k tx/rx should be doable per > >> CPU with only locking changes. > > .. and I did mean "kernel producer/consumer locking changes." > > > > > Line rate for 1 Gbps is about 1500 kpps (small packets). > > > > With I218V2 (em), I see enormous lock contention above 3 or 4 (user) > > threads, and 8 are slightly slower than 1. 1 doesn't saturate the NIC, > > and 2 is optimal. > > > > The RSS support in -HEAD lets you get away with parallelising UDP > streams very nicely. 
> > The framework is pretty simple (!): > > * drivers ask the RSS code for the RSS config and RSS hash to use, and > configure the hardware appropriately; > * the netisr input paths check the existence of the RSS hash and will > calculate it in software if required; > * v4/v6 reassembly is done (at the IP level, /not/ at the protocol > level) and if it needs a new RSS hash / netisr reinjection, that'll > happen; > * the PCB lookup code for listen sockets now allows one listen socket > per RSS bucket - as the RSS / PCBGROUPS code already extended the PCB > to have one PCB table per RSS bucket (as well as a global one); > > So: > > * userland code queries RSS for the CPU and RSS bucket setup; > * you then create one listen socket per RSS bucket, bind it to the > local thread (if you want) and tell it "you're in RSS bucket X"; > * .. and then in the UDP case for local-bound sockets, the > transmit/receive path does not require modifying the global PCB state, > so the locking is kept per-RSS bucket, and scales linearly with the > number of CPUs you have (until you hit the NIC queue limits.) > > https://github.com/erikarn/freebsd-rss/ > > and: > > > http://adrianchadd.blogspot.com/2014/06/hacking-on-receive-side-scaling-rss-on.html > > http://adrianchadd.blogspot.com/2014/07/application-awareness-of-receive-side.html > > http://adrianchadd.blogspot.com/2014/08/receive-side-scaling-figuring-out-how.html > > http://adrianchadd.blogspot.com/2014/09/receive-side-scaling-testing-udp.html > > http://adrianchadd.blogspot.com/2014/10/more-rss-udp-tests-this-time-on-dell.html > > > > -adrian > _______________________________________________ > freebsd-net@freebsd.org mailing list > https://lists.freebsd.org/mailman/listinfo/freebsd-net > To unsubscribe, send any mail to "freebsd-net-unsubscr...@freebsd.org" >
diff --git a/lib/libc/include/libc_private.h b/lib/libc/include/libc_private.h index 5caf9a3..9a0d6cf 100644 --- a/lib/libc/include/libc_private.h +++ b/lib/libc/include/libc_private.h @@ -224,6 +224,8 @@ enum { INTERPOS_kevent, INTERPOS_wait6, INTERPOS_ppoll, + INTERPOS_sendmmsg, + INTERPOS_recvmmsg, INTERPOS_MAX }; diff --git a/lib/libc/include/namespace.h b/lib/libc/include/namespace.h index 739d7b1..c95829e 100644 --- a/lib/libc/include/namespace.h +++ b/lib/libc/include/namespace.h @@ -208,6 +208,7 @@ #define readv _readv #define recvfrom _recvfrom #define recvmsg _recvmsg +#define recvmmsg _recvmmsg #define select _select #define sem_close _sem_close #define sem_destroy _sem_destroy @@ -220,6 +221,7 @@ #define sem_unlink _sem_unlink #define sem_wait _sem_wait #define sendmsg _sendmsg +#define sendmmsg _sendmmsg #define sendto _sendto #define setsockopt _setsockopt /*#define sigaction _sigaction*/ diff --git a/lib/libc/include/un-namespace.h b/lib/libc/include/un-namespace.h index f31fa7a..0233348 100644 --- a/lib/libc/include/un-namespace.h +++ b/lib/libc/include/un-namespace.h @@ -189,6 +189,7 @@ #undef readv #undef recvfrom #undef recvmsg +#undef recvmmsg #undef select #undef sem_close #undef sem_destroy @@ -201,6 +202,7 @@ #undef sem_unlink #undef sem_wait #undef sendmsg +#undef sendmmsg #undef sendto #undef setsockopt #undef sigaction diff --git a/lib/libc/sys/Makefile.inc b/lib/libc/sys/Makefile.inc index e4fe1b2..ecb366a 100644 --- a/lib/libc/sys/Makefile.inc +++ b/lib/libc/sys/Makefile.inc @@ -28,6 +28,10 @@ SRCS+= futimens.c utimensat.c NOASM+= futimens.o utimensat.o PSEUDO+= _futimens.o _utimensat.o +SRCS+= recvmmsg.c sendmmsg.c +NOASM+= recvmmsg.o sendmmsg.o +PSEUDO+= _recvmmsg.o _sendmmsg.o + INTERPOSED = \ accept \ accept4 \ diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map index 7b3257c..724e1b4 100644 --- a/lib/libc/sys/Symbol.map +++ b/lib/libc/sys/Symbol.map @@ -399,6 +399,8 @@ FBSD_1.4 { utimensat; numa_setaffinity; 
numa_getaffinity; + sendmmsg; + recvmmsg; }; FBSDprivate_1.0 { @@ -1051,4 +1053,6 @@ FBSDprivate_1.0 { gssd_syscall; __libc_interposing_slot; __libc_sigwait; + _sendmmsg; + _recvmmsg; }; diff --git a/lib/libc/sys/recvmmsg.c b/lib/libc/sys/recvmmsg.c new file mode 100644 index 0000000..03ab379 --- /dev/null +++ b/lib/libc/sys/recvmmsg.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016 Boris Astardzhiev, Smartcom-Bulgaria AD + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice(s), this list of conditions and the following disclaimer as + * the first lines of this file unmodified other than the possible + * addition of one or more copyright notices. + * 2. Redistributions in binary form must reproduce the above copyright + * notice(s), this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <errno.h> +#include <sys/types.h> +#include <sys/syscall.h> +#include <sys/socket.h> +#include "libc_private.h" + +#define VLEN_MAX 1024 + +int +recvmmsg(int s, struct mmsghdr *msgvec, unsigned int vlen, int flags) +{ + int i, ret, rcvd; + + if (vlen > VLEN_MAX) + vlen = VLEN_MAX; + + rcvd = 0; + for (i = 0; i < vlen; i++) { + errno = 0; + ret = (((int (*)(int, const struct msghdr *, int)) + __libc_interposing[INTERPOS_recvmsg])(s, + &msgvec[i].msg_hdr, flags)); + if (ret < 0 || errno != 0) { + if (rcvd) { + /* We've received messages. Let caller know. */ + errno = 0; + return (rcvd); + } + return (-1); + } + + /* Save received bytes */ + msgvec[i].msg_len = ret; + + rcvd++; + } + + return (rcvd); +} + +#undef VLEN_MAX diff --git a/lib/libc/sys/sendmmsg.c b/lib/libc/sys/sendmmsg.c new file mode 100644 index 0000000..3387fdc --- /dev/null +++ b/lib/libc/sys/sendmmsg.c @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016 Boris Astardzhiev, Smartcom-Bulgaria AD + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice(s), this list of conditions and the following disclaimer as + * the first lines of this file unmodified other than the possible + * addition of one or more copyright notices. + * 2. Redistributions in binary form must reproduce the above copyright + * notice(s), this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, + * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <errno.h> +#include <sys/types.h> +#include <sys/syscall.h> +#include <sys/socket.h> +#include "libc_private.h" + +#define VLEN_MAX 1024 + +int +sendmmsg(int s, struct mmsghdr *msgvec, unsigned int vlen, int flags) +{ + int i, ret, sent; + + if (vlen > VLEN_MAX) + vlen = VLEN_MAX; + + sent = 0; + for (i = 0; i < vlen; i++) { + errno = 0; + ret = (((int (*)(int, const struct msghdr *, int)) + __libc_interposing[INTERPOS_sendmsg])(s, + &msgvec[i].msg_hdr, flags)); + if (ret < 0 || errno != 0) { + if (sent) { + /* We have sent messages. Let caller know. 
*/ + errno = 0; + return (sent); + } + return (-1); + } + + /* Save sent bytes */ + msgvec[i].msg_len = ret; + + sent++; + } + + return (sent); +} + +#undef VLEN_MAX diff --git a/lib/libthr/thread/thr_syscalls.c b/lib/libthr/thread/thr_syscalls.c index 7c05697..7b5458d 100644 --- a/lib/libthr/thread/thr_syscalls.c +++ b/lib/libthr/thread/thr_syscalls.c @@ -606,6 +606,84 @@ __thr_writev(int fd, const struct iovec *iov, int iovcnt) return (ret); } +#define VLEN_MAX 1024 + +static int +__thr_sendmmsg(int s, struct mmsghdr *msgvec, unsigned int vlen, int flags) +{ + struct pthread *curthread; + int i, ret, sent; + + curthread = _get_curthread(); + _thr_cancel_enter(curthread); + + if (vlen > VLEN_MAX) + vlen = VLEN_MAX; + + sent = 0; + for (i = 0; i < (int)vlen; i++) { + errno = 0; + ret = __sys_sendmsg(s, &msgvec[i].msg_hdr, flags); + if (ret < 0 || errno != 0) { + if (sent) { + /* We have sent messages. Let caller know. */ + errno = 0; + _thr_cancel_leave(curthread, ret <= 0); + return (sent); + } + return (-1); + } + + /* Save sent bytes */ + msgvec[i].msg_len = ret; + + sent++; + } + + _thr_cancel_leave(curthread, ret <= 0); + + return (sent); +} + +static int +__thr_recvmmsg(int s, struct mmsghdr *msgvec, unsigned int vlen, int flags) +{ + struct pthread *curthread; + int i, ret, rcvd; + + curthread = _get_curthread(); + _thr_cancel_enter(curthread); + + if (vlen > VLEN_MAX) + vlen = VLEN_MAX; + + rcvd = 0; + for (i = 0; i < (int)vlen; i++) { + errno = 0; + ret = __sys_recvmsg(s, &msgvec[i].msg_hdr, flags); + if (ret < 0 || errno != 0) { + if (rcvd) { + /* We've received messages. Let caller know. 
*/ + errno = 0; + _thr_cancel_leave(curthread, ret == -1); + return (rcvd); + } + return (-1); + } + + /* Save received bytes */ + msgvec[i].msg_len = ret; + + rcvd++; + } + + _thr_cancel_leave(curthread, ret == -1); + + return (rcvd); +} + +#undef VLEN_MAX + void __thr_interpose_libc(void) { @@ -652,6 +730,8 @@ __thr_interpose_libc(void) SLOT(kevent); SLOT(wait6); SLOT(ppoll); + SLOT(sendmmsg); + SLOT(recvmmsg); #undef SLOT *(__libc_interposing_slot( INTERPOS__pthread_mutex_init_calloc_cb)) = diff --git a/sys/sys/socket.h b/sys/sys/socket.h index 18e2de1..504313e 100644 --- a/sys/sys/socket.h +++ b/sys/sys/socket.h @@ -595,6 +595,18 @@ struct sf_hdtr { #endif /* _KERNEL */ #endif /* __BSD_VISIBLE */ +#ifndef _KERNEL +#ifdef __BSD_VISIBLE +/* + * Send/recvmmsg specific structure(s) + */ +struct mmsghdr { + struct msghdr msg_hdr; /* message header */ + unsigned int msg_len; /* message length */ +}; +#endif /* __BSD_VISIBLE */ +#endif /* !_KERNEL */ + #ifndef _KERNEL #include <sys/cdefs.h> @@ -615,11 +627,17 @@ int listen(int, int); ssize_t recv(int, void *, size_t, int); ssize_t recvfrom(int, void *, size_t, int, struct sockaddr * __restrict, socklen_t * __restrict); ssize_t recvmsg(int, struct msghdr *, int); +#if __BSD_VISIBLE +int recvmmsg(int, struct mmsghdr *, unsigned int, int); +#endif ssize_t send(int, const void *, size_t, int); ssize_t sendto(int, const void *, size_t, int, const struct sockaddr *, socklen_t); ssize_t sendmsg(int, const struct msghdr *, int); #if __BSD_VISIBLE +int sendmmsg(int, struct mmsghdr *, unsigned int, int); +#endif +#if __BSD_VISIBLE int sendfile(int, int, off_t, size_t, struct sf_hdtr *, off_t *, int); int setfib(int); #endif
_______________________________________________ freebsd-net@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/freebsd-net To unsubscribe, send any mail to "freebsd-net-unsubscr...@freebsd.org"