Hello again,

In my spare time I did the following simple libc-only implementation of the
syscalls.
I did some tests in a VM adapting these experiments:
https://blog.cloudflare.com/how-to-receive-a-million-packets/

Any comments about the diff are greatly appreciated.

Best regards,
Boris Astardzhiev

On Fri, Jan 8, 2016 at 7:02 PM, Adrian Chadd <adrian.ch...@gmail.com> wrote:

> On 8 January 2016 at 03:02, Bruce Evans <b...@optusnet.com.au> wrote:
> > On Fri, 8 Jan 2016, Adrian Chadd wrote:
> >
> >> On 7 January 2016 at 23:58, Mark Delany <c...@romeo.emu.st> wrote:
> >>>
> >>> On 08Jan16, Bruce Evans allegedly wrote:
> >>>>
> >>>> If the NIC can't reach line rate
> >>>
> >>>
> >>>> Network stack overheads are also enormous.
> >>>
> >>>
> >>> Bruce makes some excellent points.
> >>>
> >>> I challenge anyone to get line rate UDP out of FBSD (or Linux) for a
> >>> 1G NIC let alone a 10G NIC listening to a single port. It was exactly
> >>> my frustration with UDP performance that led me down the path of
> >>> *mmsg() and netmap.
> >>>
> >>> Frankly this is an opportunity for FBSD as UDP performance appears to
> >>> be a neglected area.
> >>
> >>
> >> I'm there, on 16 threads.
> >>
> >> I'd rather we do it on two or three, as a lot of time is wasted in
> >> producer/consumer locking. but yeah, 500k tx/rx should be doable per
> >> CPU with only locking changes.
>
> .. and I did mean "kernel producer/consumer locking changes."
>
> >
> > Line rate for 1 Gbps is about 1500 kpps (small packets).
> >
> > With I218V2 (em), I see enormous lock contention above 3 or 4 (user)
> > threads, and 8 are slightly slower than 1.  1 doesn't saturate the NIC,
> > and 2 is optimal.
> >
>
> The RSS support in -HEAD lets you get away with parallelising UDP
> streams very nicely.
>
> The framework is pretty simple (!):
>
> * drivers ask the RSS code for the RSS config and RSS hash to use, and
> configure the hardware appropriately;
> * the netisr input paths check the existence of the RSS hash and will
> calculate it in software if required;
> * v4/v6 reassembly is done (at the IP level, /not/ at the protocol
> level) and if it needs a new RSS hash / netisr reinjection, that'll
> happen;
> * the PCB lookup code for listen sockets now allows one listen socket
> per RSS bucket - as the RSS / PCBGROUPS code already extended the PCB
> to have one PCB table per RSS bucket (as well as a global one);
>
> So:
>
> * userland code queries RSS for the CPU and RSS bucket setup;
> * you then create one listen socket per RSS bucket, bind it to the
> local thread (if you want) and tell it "you're in RSS bucket X";
> * .. and then in the UDP case for local-bound sockets, the
> transmit/receive path does not require modifying the global PCB state,
> so the locking is kept per-RSS bucket, and scales linearly with the
> number of CPUs you have (until you hit the NIC queue limits.)
>
> https://github.com/erikarn/freebsd-rss/
>
> and:
>
>
> http://adrianchadd.blogspot.com/2014/06/hacking-on-receive-side-scaling-rss-on.html
>
> http://adrianchadd.blogspot.com/2014/07/application-awareness-of-receive-side.html
>
> http://adrianchadd.blogspot.com/2014/08/receive-side-scaling-figuring-out-how.html
>
> http://adrianchadd.blogspot.com/2014/09/receive-side-scaling-testing-udp.html
>
> http://adrianchadd.blogspot.com/2014/10/more-rss-udp-tests-this-time-on-dell.html
>
>
>
> -adrian
> _______________________________________________
> freebsd-net@freebsd.org mailing list
> https://lists.freebsd.org/mailman/listinfo/freebsd-net
> To unsubscribe, send any mail to "freebsd-net-unsubscr...@freebsd.org"
>
diff --git a/lib/libc/include/libc_private.h b/lib/libc/include/libc_private.h
index 5caf9a3..9a0d6cf 100644
--- a/lib/libc/include/libc_private.h
+++ b/lib/libc/include/libc_private.h
@@ -224,6 +224,8 @@ enum {
        INTERPOS_kevent,
        INTERPOS_wait6,
        INTERPOS_ppoll,
+       INTERPOS_sendmmsg,
+       INTERPOS_recvmmsg,
        INTERPOS_MAX
 };
 
diff --git a/lib/libc/include/namespace.h b/lib/libc/include/namespace.h
index 739d7b1..c95829e 100644
--- a/lib/libc/include/namespace.h
+++ b/lib/libc/include/namespace.h
@@ -208,6 +208,7 @@
 #define                readv                           _readv
 #define                recvfrom                        _recvfrom
 #define                recvmsg                         _recvmsg
+#define                recvmmsg                        _recvmmsg
 #define                select                          _select
 #define                sem_close                       _sem_close
 #define                sem_destroy                     _sem_destroy
@@ -220,6 +221,7 @@
 #define                sem_unlink                      _sem_unlink
 #define                sem_wait                        _sem_wait
 #define                sendmsg                         _sendmsg
+#define                sendmmsg                        _sendmmsg
 #define                sendto                          _sendto
 #define                setsockopt                      _setsockopt
 /*#define              sigaction                       _sigaction*/
diff --git a/lib/libc/include/un-namespace.h b/lib/libc/include/un-namespace.h
index f31fa7a..0233348 100644
--- a/lib/libc/include/un-namespace.h
+++ b/lib/libc/include/un-namespace.h
@@ -189,6 +189,7 @@
 #undef         readv
 #undef         recvfrom
 #undef         recvmsg
+#undef         recvmmsg
 #undef         select
 #undef         sem_close
 #undef         sem_destroy
@@ -201,6 +202,7 @@
 #undef         sem_unlink
 #undef         sem_wait
 #undef         sendmsg
+#undef         sendmmsg
 #undef         sendto
 #undef         setsockopt
 #undef         sigaction
diff --git a/lib/libc/sys/Makefile.inc b/lib/libc/sys/Makefile.inc
index e4fe1b2..ecb366a 100644
--- a/lib/libc/sys/Makefile.inc
+++ b/lib/libc/sys/Makefile.inc
@@ -28,6 +28,10 @@ SRCS+= futimens.c utimensat.c
 NOASM+= futimens.o utimensat.o
 PSEUDO+= _futimens.o _utimensat.o
 
+SRCS+= recvmmsg.c sendmmsg.c
+NOASM+= recvmmsg.o sendmmsg.o
+PSEUDO+= _recvmmsg.o _sendmmsg.o
+
 INTERPOSED = \
        accept \
        accept4 \
diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map
index 7b3257c..724e1b4 100644
--- a/lib/libc/sys/Symbol.map
+++ b/lib/libc/sys/Symbol.map
@@ -399,6 +399,8 @@ FBSD_1.4 {
        utimensat;
        numa_setaffinity;
        numa_getaffinity;
+       sendmmsg;
+       recvmmsg;
 };
 
 FBSDprivate_1.0 {
@@ -1051,4 +1053,6 @@ FBSDprivate_1.0 {
        gssd_syscall;
        __libc_interposing_slot;
        __libc_sigwait;
+       _sendmmsg;
+       _recvmmsg;
 };
diff --git a/lib/libc/sys/recvmmsg.c b/lib/libc/sys/recvmmsg.c
new file mode 100644
index 0000000..03ab379
--- /dev/null
+++ b/lib/libc/sys/recvmmsg.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016 Boris Astardzhiev, Smartcom-Bulgaria AD
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice(s), this list of conditions and the following disclaimer as
+ *    the first lines of this file unmodified other than the possible
+ *    addition of one or more copyright notices.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice(s), this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/socket.h>
+#include "libc_private.h"
+
+#define VLEN_MAX 1024
+
+/*
+ * Receive up to vlen messages (capped at VLEN_MAX) on socket s, issuing one
+ * recvmsg call per msgvec entry and storing each byte count in msg_len.
+ * Returns the number of messages received, or -1 if the first call fails.
+ */
+int
+recvmmsg(int s, struct mmsghdr *msgvec, unsigned int vlen, int flags)
+{
+       ssize_t (*recvmsg_fn)(int, struct msghdr *, int);
+       ssize_t ret;
+       unsigned int i;
+
+       if (vlen > VLEN_MAX)
+               vlen = VLEN_MAX;
+
+       /* Call the slot through recvmsg's declared ssize_t prototype. */
+       recvmsg_fn = (ssize_t (*)(int, struct msghdr *, int))
+           __libc_interposing[INTERPOS_recvmsg];
+
+       for (i = 0; i < vlen; i++) {
+               ret = recvmsg_fn(s, &msgvec[i].msg_hdr, flags);
+               /* ret < 0 is the error signal; errno is never zeroed. */
+               if (ret < 0) {
+                       /* Report a partial batch in preference to the error. */
+                       return (i > 0 ? (int)i : -1);
+               }
+               msgvec[i].msg_len = (unsigned int)ret;
+       }
+
+       return ((int)i);
+}
+
+#undef VLEN_MAX
diff --git a/lib/libc/sys/sendmmsg.c b/lib/libc/sys/sendmmsg.c
new file mode 100644
index 0000000..3387fdc
--- /dev/null
+++ b/lib/libc/sys/sendmmsg.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016 Boris Astardzhiev, Smartcom-Bulgaria AD
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice(s), this list of conditions and the following disclaimer as
+ *    the first lines of this file unmodified other than the possible
+ *    addition of one or more copyright notices.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice(s), this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <sys/socket.h>
+#include "libc_private.h"
+
+#define VLEN_MAX 1024
+
+/*
+ * Send up to vlen messages (capped at VLEN_MAX) on socket s, issuing one
+ * sendmsg call per msgvec entry and storing each byte count in msg_len.
+ * Returns the number of messages sent, or -1 if the first call fails.
+ */
+int
+sendmmsg(int s, struct mmsghdr *msgvec, unsigned int vlen, int flags)
+{
+       ssize_t (*sendmsg_fn)(int, const struct msghdr *, int);
+       ssize_t ret;
+       unsigned int i;
+
+       if (vlen > VLEN_MAX)
+               vlen = VLEN_MAX;
+
+       /* Call the slot through sendmsg's declared ssize_t prototype. */
+       sendmsg_fn = (ssize_t (*)(int, const struct msghdr *, int))
+           __libc_interposing[INTERPOS_sendmsg];
+
+       for (i = 0; i < vlen; i++) {
+               ret = sendmsg_fn(s, &msgvec[i].msg_hdr, flags);
+               /* ret < 0 is the error signal; errno is never zeroed. */
+               if (ret < 0) {
+                       /* Report a partial batch in preference to the error. */
+                       return (i > 0 ? (int)i : -1);
+               }
+               msgvec[i].msg_len = (unsigned int)ret;
+       }
+
+       return ((int)i);
+}
+
+#undef VLEN_MAX
diff --git a/lib/libthr/thread/thr_syscalls.c b/lib/libthr/thread/thr_syscalls.c
index 7c05697..7b5458d 100644
--- a/lib/libthr/thread/thr_syscalls.c
+++ b/lib/libthr/thread/thr_syscalls.c
@@ -606,6 +606,84 @@ __thr_writev(int fd, const struct iovec *iov, int iovcnt)
        return (ret);
 }
 
+#define VLEN_MAX 1024
+
+/*
+ * sendmmsg emulation bracketed by _thr_cancel_enter/_thr_cancel_leave: one
+ * __sys_sendmsg call per msgvec entry (at most VLEN_MAX), with each byte
+ * count stored in msg_len.  Returns messages sent, or -1 if the first fails.
+ */
+static int
+__thr_sendmmsg(int s, struct mmsghdr *msgvec, unsigned int vlen, int flags)
+{
+       struct pthread *curthread;
+       ssize_t ret;
+       int i, sent;
+
+       curthread = _get_curthread();
+       _thr_cancel_enter(curthread);
+
+       if (vlen > VLEN_MAX)
+               vlen = VLEN_MAX;
+
+       /* Give ret a defined value for the leave call when vlen == 0. */
+       ret = 0;
+       sent = 0;
+       for (i = 0; i < (int)vlen; i++) {
+               ret = __sys_sendmsg(s, &msgvec[i].msg_hdr, flags);
+               /* ret < 0 is the error signal; errno is never zeroed. */
+               if (ret < 0)
+                       break;
+               msgvec[i].msg_len = (unsigned int)ret;
+               sent++;
+       }
+
+       /* Single exit so every path pairs the enter above with a leave. */
+       _thr_cancel_leave(curthread, ret <= 0);
+
+       /* Report a partial batch in preference to the error. */
+       return (sent > 0 ? sent : (ret < 0 ? -1 : 0));
+}
+
+/*
+ * recvmmsg emulation bracketed by _thr_cancel_enter/_thr_cancel_leave: one
+ * __sys_recvmsg call per msgvec entry (at most VLEN_MAX), with each byte
+ * count stored in msg_len.  Returns messages received, or -1 if the first fails.
+ */
+static int
+__thr_recvmmsg(int s, struct mmsghdr *msgvec, unsigned int vlen, int flags)
+{
+       struct pthread *curthread;
+       ssize_t ret;
+       int i, rcvd;
+
+       curthread = _get_curthread();
+       _thr_cancel_enter(curthread);
+
+       if (vlen > VLEN_MAX)
+               vlen = VLEN_MAX;
+
+       /* Give ret a defined value for the leave call when vlen == 0. */
+       ret = 0;
+       rcvd = 0;
+       for (i = 0; i < (int)vlen; i++) {
+               ret = __sys_recvmsg(s, &msgvec[i].msg_hdr, flags);
+               /* ret < 0 is the error signal; errno is never zeroed. */
+               if (ret < 0)
+                       break;
+               msgvec[i].msg_len = (unsigned int)ret;
+               rcvd++;
+       }
+
+       /* Single exit so every path pairs the enter above with a leave. */
+       _thr_cancel_leave(curthread, ret == -1);
+
+       /* Report a partial batch in preference to the error. */
+       return (rcvd > 0 ? rcvd : (ret < 0 ? -1 : 0));
+}
+
+#undef VLEN_MAX
+
 void
 __thr_interpose_libc(void)
 {
@@ -652,6 +730,8 @@ __thr_interpose_libc(void)
        SLOT(kevent);
        SLOT(wait6);
        SLOT(ppoll);
+       SLOT(sendmmsg);
+       SLOT(recvmmsg);
 #undef SLOT
        *(__libc_interposing_slot(
            INTERPOS__pthread_mutex_init_calloc_cb)) =
diff --git a/sys/sys/socket.h b/sys/sys/socket.h
index 18e2de1..504313e 100644
--- a/sys/sys/socket.h
+++ b/sys/sys/socket.h
@@ -595,6 +595,18 @@ struct sf_hdtr {
 #endif /* _KERNEL */
 #endif /* __BSD_VISIBLE */
 
+#ifndef _KERNEL
+#if __BSD_VISIBLE
+/*
+ * Per-message element of the vector passed to sendmmsg() and recvmmsg().
+ */
+struct mmsghdr {
+       struct msghdr   msg_hdr;                /* message header */
+       unsigned int    msg_len;                /* bytes sent/received */
+};
+#endif /* __BSD_VISIBLE */
+#endif /* !_KERNEL */
+
 #ifndef        _KERNEL
 
 #include <sys/cdefs.h>
@@ -615,11 +627,17 @@ int       listen(int, int);
 ssize_t        recv(int, void *, size_t, int);
 ssize_t        recvfrom(int, void *, size_t, int, struct sockaddr * 
__restrict, socklen_t * __restrict);
 ssize_t        recvmsg(int, struct msghdr *, int);
+#if __BSD_VISIBLE
+int    recvmmsg(int, struct mmsghdr *, unsigned int, int);
+#endif
 ssize_t        send(int, const void *, size_t, int);
 ssize_t        sendto(int, const void *,
            size_t, int, const struct sockaddr *, socklen_t);
 ssize_t        sendmsg(int, const struct msghdr *, int);
 #if __BSD_VISIBLE
+int    sendmmsg(int, struct mmsghdr *, unsigned int, int);
+#endif
+#if __BSD_VISIBLE
 int    sendfile(int, int, off_t, size_t, struct sf_hdtr *, off_t *, int);
 int    setfib(int);
 #endif
_______________________________________________
freebsd-net@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/freebsd-net
To unsubscribe, send any mail to "freebsd-net-unsubscr...@freebsd.org"

Reply via email to