On Fri, Oct 31, 2014 at 02:50:00PM +1000, David Gwynne wrote:
> so without splicing, the payloads from multiple tcp packets (at least all of
> the ones in a single softnet run?) get bundled up into a buffer that userland
> reads and then writes out again in a single go. right?
>
> you're suggesting the taskq as a way to defer output till after the current
> softnet call has processed all its packets and queued all the tcp packet
> payloads onto the socket?
Exactly.
> it's worth remembering there are other memory costs too. i think a kthread
> (the thing taskqs run on) is 5 pages on amd64, so 20KB.
We can delay the creation of the sosplice thread until user-land
tries to splice for the first time.
I would like to get the sosplice pool in first. The user-land
part was missing from my previous diff. Here is an updated diff that
actually passes make build.
ok?
bluhm
Index: sys/kern/kern_sysctl.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_sysctl.c,v
retrieving revision 1.267
diff -u -p -u -p -r1.267 kern_sysctl.c
--- sys/kern/kern_sysctl.c 17 Oct 2014 01:51:39 -0000 1.267
+++ sys/kern/kern_sysctl.c 31 Oct 2014 10:23:44 -0000
@@ -1062,11 +1062,12 @@ fill_file(struct kinfo_file *kf, struct
kf->so_family = so->so_proto->pr_domain->dom_family;
kf->so_rcv_cc = so->so_rcv.sb_cc;
kf->so_snd_cc = so->so_snd.sb_cc;
- if (so->so_splice) {
+ if (isspliced(so)) {
if (show_pointers)
- kf->so_splice = PTRTOINT64(so->so_splice);
- kf->so_splicelen = so->so_splicelen;
- } else if (so->so_spliceback)
+ kf->so_splice =
+ PTRTOINT64(so->so_sp->ssp_socket);
+ kf->so_splicelen = so->so_sp->ssp_len;
+ } else if (issplicedback(so))
kf->so_splicelen = -1;
if (!so->so_pcb)
break;
Index: sys/kern/uipc_socket.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.133
diff -u -p -u -p -r1.133 uipc_socket.c
--- sys/kern/uipc_socket.c 9 Sep 2014 02:07:17 -0000 1.133
+++ sys/kern/uipc_socket.c 31 Oct 2014 10:23:44 -0000
@@ -80,12 +80,19 @@ int somaxconn = SOMAXCONN;
int sominconn = SOMINCONN;
struct pool socket_pool;
+#ifdef SOCKET_SPLICE
+struct pool sosplice_pool;
+#endif
void
soinit(void)
{
pool_init(&socket_pool, sizeof(struct socket), 0, 0, 0, "sockpl", NULL);
+#ifdef SOCKET_SPLICE
+ pool_init(&sosplice_pool, sizeof(struct sosplice), 0, 0, 0, "sosppl",
+ NULL);
+#endif
}
/*
@@ -157,7 +164,7 @@ solisten(struct socket *so, int backlog)
if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING))
return (EOPNOTSUPP);
#ifdef SOCKET_SPLICE
- if (so->so_splice || so->so_spliceback)
+ if (isspliced(so) || issplicedback(so))
return (EOPNOTSUPP);
#endif /* SOCKET_SPLICE */
s = splsoftnet();
@@ -199,10 +206,15 @@ sofree(struct socket *so)
return;
}
#ifdef SOCKET_SPLICE
- if (so->so_spliceback)
- sounsplice(so->so_spliceback, so, so->so_spliceback != so);
- if (so->so_splice)
- sounsplice(so, so->so_splice, 0);
+ if (so->so_sp) {
+ if (issplicedback(so))
+ sounsplice(so->so_sp->ssp_soback, so,
+ so->so_sp->ssp_soback != so);
+ if (isspliced(so))
+ sounsplice(so, so->so_sp->ssp_socket, 0);
+ pool_put(&sosplice_pool, so->so_sp);
+ so->so_sp = NULL;
+ }
#endif /* SOCKET_SPLICE */
sbrelease(&so->so_snd);
sorflush(so);
@@ -647,7 +659,7 @@ restart:
m = so->so_rcv.sb_mb;
#ifdef SOCKET_SPLICE
- if (so->so_splice)
+ if (isspliced(so))
m = NULL;
#endif /* SOCKET_SPLICE */
/*
@@ -669,7 +681,7 @@ restart:
#ifdef DIAGNOSTIC
if (m == NULL && so->so_rcv.sb_cc)
#ifdef SOCKET_SPLICE
- if (so->so_splice == NULL)
+ if (!isspliced(so))
#endif /* SOCKET_SPLICE */
panic("receive 1");
#endif
@@ -1021,6 +1033,12 @@ sorflush(struct socket *so)
}
#ifdef SOCKET_SPLICE
+
+#define so_splicelen so_sp->ssp_len
+#define so_splicemax so_sp->ssp_max
+#define so_idletv so_sp->ssp_idletv
+#define so_idleto so_sp->ssp_idleto
+
int
sosplice(struct socket *so, int fd, off_t max, struct timeval *tv)
{
@@ -1035,6 +1053,8 @@ sosplice(struct socket *so, int fd, off_
if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
(so->so_proto->pr_flags & PR_CONNREQUIRED))
return (ENOTCONN);
+ if (so->so_sp == NULL)
+ so->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
/* If no fd is given, unsplice by removing existing link. */
if (fd < 0) {
@@ -1043,8 +1063,8 @@ sosplice(struct socket *so, int fd, off_
(so->so_state & SS_NBIO) ? M_NOWAIT : M_WAITOK)) != 0)
return (error);
s = splsoftnet();
- if (so->so_splice)
- sounsplice(so, so->so_splice, 1);
+ if (so->so_sp->ssp_socket)
+ sounsplice(so, so->so_sp->ssp_socket, 1);
splx(s);
sbunlock(&so->so_rcv);
return (0);
@@ -1060,6 +1080,8 @@ sosplice(struct socket *so, int fd, off_
if ((error = getsock(curproc->p_fd, fd, &fp)) != 0)
return (error);
sosp = fp->f_data;
+ if (sosp->so_sp == NULL)
+ sosp->so_sp = pool_get(&sosplice_pool, PR_WAITOK | PR_ZERO);
/* Lock both receive and send buffer. */
if ((error = sblock(&so->so_rcv,
@@ -1074,7 +1096,7 @@ sosplice(struct socket *so, int fd, off_
}
s = splsoftnet();
- if (so->so_splice || sosp->so_spliceback) {
+ if (so->so_sp->ssp_socket || sosp->so_sp->ssp_soback) {
error = EBUSY;
goto release;
}
@@ -1092,8 +1114,8 @@ sosplice(struct socket *so, int fd, off_
}
/* Splice so and sosp together. */
- so->so_splice = sosp;
- sosp->so_spliceback = so;
+ so->so_sp->ssp_socket = sosp;
+ sosp->so_sp->ssp_soback = so;
so->so_splicelen = 0;
so->so_splicemax = max;
if (tv)
@@ -1127,7 +1149,7 @@ sounsplice(struct socket *so, struct soc
timeout_del(&so->so_idleto);
sosp->so_snd.sb_flagsintr &= ~SB_SPLICE;
so->so_rcv.sb_flagsintr &= ~SB_SPLICE;
- so->so_splice = sosp->so_spliceback = NULL;
+ so->so_sp->ssp_socket = sosp->so_sp->ssp_soback = NULL;
if (wakeup && soreadable(so))
sorwakeup(so);
}
@@ -1139,9 +1161,9 @@ soidle(void *arg)
int s;
s = splsoftnet();
- if (so->so_splice) {
+ if (so->so_rcv.sb_flagsintr & SB_SPLICE) {
so->so_error = ETIMEDOUT;
- sounsplice(so, so->so_splice, 1);
+ sounsplice(so, so->so_sp->ssp_socket, 1);
}
splx(s);
}
@@ -1155,7 +1177,7 @@ soidle(void *arg)
int
somove(struct socket *so, int wait)
{
- struct socket *sosp = so->so_splice;
+ struct socket *sosp = so->so_sp->ssp_socket;
struct mbuf *m, **mp, *nextrecord;
u_long len, off, oobmark;
long space;
@@ -1408,6 +1430,12 @@ somove(struct socket *so, int wait)
timeout_add_tv(&so->so_idleto, &so->so_idletv);
return (1);
}
+
+#undef so_splicelen
+#undef so_splicemax
+#undef so_idletv
+#undef so_idleto
+
#endif /* SOCKET_SPLICE */
void
@@ -1416,7 +1444,7 @@ sorwakeup(struct socket *so)
#ifdef SOCKET_SPLICE
if (so->so_rcv.sb_flagsintr & SB_SPLICE)
(void) somove(so, M_DONTWAIT);
- if (so->so_splice)
+ if (isspliced(so))
return;
#endif
sowakeup(so, &so->so_rcv);
@@ -1429,7 +1457,7 @@ sowwakeup(struct socket *so)
{
#ifdef SOCKET_SPLICE
if (so->so_snd.sb_flagsintr & SB_SPLICE)
- (void) somove(so->so_spliceback, M_DONTWAIT);
+ (void) somove(so->so_sp->ssp_soback, M_DONTWAIT);
#endif
sowakeup(so, &so->so_snd);
}
@@ -1722,11 +1750,12 @@ sogetopt(struct socket *so, int level, i
#ifdef SOCKET_SPLICE
case SO_SPLICE:
{
+ off_t len;
int s = splsoftnet();
m->m_len = sizeof(off_t);
- memcpy(mtod(m, off_t *), &so->so_splicelen,
- sizeof(off_t));
+ len = so->so_sp ? so->so_sp->ssp_len : 0;
+ memcpy(mtod(m, off_t *), &len, sizeof(off_t));
splx(s);
break;
}
@@ -1815,7 +1844,7 @@ filt_soread(struct knote *kn, long hint)
kn->kn_data = so->so_rcv.sb_cc;
#ifdef SOCKET_SPLICE
- if (so->so_splice)
+ if (isspliced(so))
return (0);
#endif /* SOCKET_SPLICE */
if (so->so_state & SS_CANTRCVMORE) {
Index: sys/sys/socketvar.h
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/sys/socketvar.h,v
retrieving revision 1.56
diff -u -p -u -p -r1.56 socketvar.h
--- sys/sys/socketvar.h 9 Sep 2014 02:07:17 -0000 1.56
+++ sys/sys/socketvar.h 31 Oct 2014 10:23:44 -0000
@@ -81,13 +81,17 @@ struct socket {
uid_t so_siguid; /* uid of process who set so_pgid */
uid_t so_sigeuid; /* euid of process who set so_pgid */
u_long so_oobmark; /* chars to oob mark */
-
- struct socket *so_splice; /* send data to drain socket */
- struct socket *so_spliceback; /* back ref for notify and cleanup */
- off_t so_splicelen; /* number of bytes spliced so far */
- off_t so_splicemax; /* maximum number of bytes to splice */
- struct timeval so_idletv; /* idle timeout */
- struct timeout so_idleto;
+/*
+ * Variables for socket splicing, allocated only when needed.
+ */
+ struct sosplice {
+ struct socket *ssp_socket; /* send data to drain socket */
+ struct socket *ssp_soback; /* back ref to source socket */
+ off_t ssp_len; /* number of bytes spliced */
+ off_t ssp_max; /* maximum number of bytes */
+ struct timeval ssp_idletv; /* idle timeout */
+ struct timeout ssp_idleto;
+ } *so_sp;
/*
* Variables for socket buffering.
*/
@@ -148,6 +152,9 @@ struct socket {
* Macros for sockets and socket buffering.
*/
+#define isspliced(so) ((so)->so_sp && (so)->so_sp->ssp_socket)
+#define issplicedback(so) ((so)->so_sp && (so)->so_sp->ssp_soback)
+
/*
* Do we need to notify the other side when I/O is possible?
*/
@@ -173,7 +180,7 @@ struct socket {
/* can we read something from so? */
#define soreadable(so) \
- ((so)->so_splice == NULL && \
+ (!isspliced(so) && \
((so)->so_rcv.sb_cc >= (so)->so_rcv.sb_lowat || \
((so)->so_state & SS_CANTRCVMORE) || \
(so)->so_qlen || (so)->so_error))
Index: lib/libkvm/kvm_file2.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/lib/libkvm/kvm_file2.c,v
retrieving revision 1.38
diff -u -p -u -p -r1.38 kvm_file2.c
--- lib/libkvm/kvm_file2.c 25 Oct 2014 03:18:58 -0000 1.38
+++ lib/libkvm/kvm_file2.c 31 Oct 2014 11:40:05 -0000
@@ -542,6 +542,7 @@ fill_file(kvm_t *kd, struct kinfo_file *
case DTYPE_SOCKET: {
struct socket sock;
+ struct sosplice ssp;
struct protosw protosw;
struct domain domain;
@@ -565,11 +566,18 @@ fill_file(kvm_t *kd, struct kinfo_file *
kf->so_family = domain.dom_family;
kf->so_rcv_cc = sock.so_rcv.sb_cc;
kf->so_snd_cc = sock.so_snd.sb_cc;
- if (sock.so_splice) {
- kf->so_splice = PTRTOINT64(sock.so_splice);
- kf->so_splicelen = sock.so_splicelen;
- } else if (sock.so_spliceback)
- kf->so_splicelen = -1;
+ if (sock.so_sp) {
+ if (KREAD(kd, (u_long)sock.so_sp, &ssp)) {
+ _kvm_err(kd, kd->program, "can't read splice");
+ return (-1);
+ }
+ if (ssp.ssp_socket) {
+ kf->so_splice = PTRTOINT64(ssp.ssp_socket);
+ kf->so_splicelen = ssp.ssp_len;
+ } else if (ssp.ssp_soback) {
+ kf->so_splicelen = -1;
+ }
+ }
if (!sock.so_pcb)
break;
switch (kf->so_family) {
Index: usr.bin/netstat/inet.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/usr.bin/netstat/inet.c,v
retrieving revision 1.136
diff -u -p -u -p -r1.136 inet.c
--- usr.bin/netstat/inet.c 26 Oct 2014 14:43:03 -0000 1.136
+++ usr.bin/netstat/inet.c 31 Oct 2014 11:00:17 -0000
@@ -91,6 +91,7 @@ char *inetname(struct in_addr *);
void inetprint(struct in_addr *, in_port_t, char *, int);
char *inet6name(struct in6_addr *);
void inet6print(struct in6_addr *, int, char *);
+void sosplice_dump(u_long);
void sockbuf_dump(struct sockbuf *, const char *);
void protosw_dump(u_long, u_long);
void domain_dump(u_long, u_long, short);
@@ -1166,7 +1167,6 @@ socket_dump(u_long off)
kread(off, &so, sizeof(so));
#define p(fmt, v, sep) printf(#v " " fmt sep, so.v);
-#define pll(fmt, v, sep) printf(#v " " fmt sep, (long long) so.v);
#define pp(fmt, v, sep) printf(#v " " fmt sep, hideroot ? 0 : so.v);
printf("socket %#lx\n ", hideroot ? 0 : off);
p("%#.4x", so_type, "\n ");
@@ -1185,12 +1185,8 @@ socket_dump(u_long off)
p("%u", so_siguid, ", ");
p("%u", so_sigeuid, "\n ");
p("%lu", so_oobmark, "\n ");
- pp("%p", so_splice, ", ");
- pp("%p", so_spliceback, "\n ");
- p("%lld", so_splicelen, ", ");
- p("%lld", so_splicemax, ", ");
- pll("%lld", so_idletv.tv_sec, ", ");
- p("%ld", so_idletv.tv_usec, "\n ");
+ if (so.so_sp)
+ sosplice_dump((u_long)so.so_sp);
sockbuf_dump(&so.so_rcv, "so_rcv");
sockbuf_dump(&so.so_snd, "so_snd");
p("%u", so_euid, ", ");
@@ -1204,6 +1200,32 @@ socket_dump(u_long off)
if (!vflag)
return;
protosw_dump((u_long)so.so_proto, (u_long)so.so_pcb);
+}
+
+/*
+ * Dump the contents of a struct sosplice
+ */
+void
+sosplice_dump(u_long off)
+{
+ struct sosplice ssp;
+
+ if (off == 0)
+ return;
+ kread(off, &ssp, sizeof(ssp));
+
+#define p(fmt, v, sep) printf(#v " " fmt sep, ssp.v);
+#define pll(fmt, v, sep) printf(#v " " fmt sep, (long long) ssp.v);
+#define pp(fmt, v, sep) printf(#v " " fmt sep, hideroot ? 0 : ssp.v);
+ pp("%p", ssp_socket, ", ");
+ pp("%p", ssp_soback, "\n ");
+ p("%lld", ssp_len, ", ");
+ p("%lld", ssp_max, ", ");
+ pll("%lld", ssp_idletv.tv_sec, ", ");
+ p("%ld", ssp_idletv.tv_usec, "\n ");
+#undef p
+#undef pll
+#undef pp
}
/*