Hi John,
On 07/03/14 13:43, Julien Charbon wrote:
On 06/03/14 22:57, John-Mark Gurney wrote:
Julien Charbon wrote this message on Thu, Feb 27, 2014 at 11:32 +0100:
[...]
Any thoughts on this particular behavior?
One thing that I noticed is that you now lock/unlock the tw and inp
lock a
lot... Have you thought about grabing the TW lock once, grabbing
some/all
of the ones necessary to process and then process them in a second step?
If the bulk processing is still an issue, then you could process them in
blocks of 50 or so, that way the number of lock/unlock cycles is
reduced...
First thanks, feedback are highly valuable to us. In first place, I
indeed tried a kind of bulk processing enforcement with something like:
tcp_tw_2msl_scan() {
struct tcptw *tw;
int i, end = 0, count = 100;
for (;;) {
INP_INFO_WLOCK(&V_tcbinfo);
for (i = 0; i < count; ++i) {
tw = TAILQ_FIRST(&V_twq_2msl);
if (tw == NULL || (tw->tw_time - ticks) > 0)) {
end = 1;
break;
}
INP_WLOCK(tw->tw_inpcb);
tcp_twclose(tw, 0);
}
if (end)
break;
INP_INFO_WUNLOCK(&V_tcbinfo);
}
return (NULL);
}
And I got best result with 'count' set to 1, this led us to current
proposed patch. [...]
I would like to continue the discussion about tcp_tw_2msl_scan().
The proposed patch makes tcp_tw_2msl_scan() less disturbing for the
TCP stack (another way to say it makes tcp_tw_2msl_scan() less efficient
to prioritize other TCP related tasks), and continuing on this way I
have tested below change on top of previous patch:
diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c
index 0230b88..da79120 100644
--- a/sys/netinet/tcp_timewait.c
+++ b/sys/netinet/tcp_timewait.c
@@ -728,7 +728,7 @@ tcp_tw_2msl_scan(void)
TW_RUNLOCK(V_tw_lock);
/* Close timewait state */
- INP_INFO_WLOCK(&V_tcbinfo);
+ if(INP_INFO_TRY_WLOCK(&V_tcbinfo)) {
TW_WLOCK(V_tw_lock);
if(tw_pcbrele(tw))
@@ -740,5 +740,9 @@ tcp_tw_2msl_scan(void)
INP_WLOCK(tw->tw_inpcb);
tcp_twclose(tw, 0);
INP_INFO_WUNLOCK(&V_tcbinfo);
+ } else {
+ /* INP_INFO lock is busy, continue later */
+ break;
+ }
}
}
Basically, it changes tcp_tw_2msl_scan() purpose from:
- Let's clean up all expired tw objects in a row.
to:
- Let's clean up expired tw objects as long as the INP_INFO lock is
not busy.
(See joined tw-lock-v2.patch). And corresponding performance results
being: Maximum short-lived TCP connection rate without dropping a
single packet:
- FreeBSD 10.0-RELEASE: 40.0k
- FreeBSD 10.0 + tw patch-v1: 52.8k (+32%)
- FreeBSD 10.0 + tw patch-v2: 56.8k (+42%)
Thus, a significant improvement and it makes the tcp_tw_2msl_scan()
purpose clear.
Moreover, it seems that two members of the struct tcptw are never
used: 'tw_pspare' and 'tw_spare':
struct tcptw {
[...]
u_int t_starttime;
int tw_time;
TAILQ_ENTRY(tcptw) tw_2msl;
void *tw_pspare; /* TCP_SIGNATURE */
u_int *tw_spare; /* TCP_SIGNATURE */
u_int tw_refcount; /* refcount */
};
I found no references of these members anywhere, even when setting the
TCP_SIGNATURE kernel option the kernel builds and runs just fine. As
the tcptw struct is private to kernel and its size should be as small as
possible, I would propose to remove them.
Joined tw-clock-v2.patch which includes:
- Use TRY_WLOCK (Performance improvements)
- Fix comment (After John-Mark Gurney's review)
- Two unused fields removed (After Mike Bentkofsky's review)
--
Julien
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index bde7503..b45a9ea 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -144,9 +144,7 @@ tcp_slowtimo(void)
VNET_LIST_RLOCK_NOSLEEP();
VNET_FOREACH(vnet_iter) {
CURVNET_SET(vnet_iter);
- INP_INFO_WLOCK(&V_tcbinfo);
- (void) tcp_tw_2msl_scan(0);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ tcp_tw_2msl_scan();
CURVNET_RESTORE();
}
VNET_LIST_RUNLOCK_NOSLEEP();
diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h
index 3115fb3..c04723a 100644
--- a/sys/netinet/tcp_timer.h
+++ b/sys/netinet/tcp_timer.h
@@ -178,7 +178,8 @@ extern int tcp_fast_finwait2_recycle;
void tcp_timer_init(void);
void tcp_timer_2msl(void *xtp);
struct tcptw *
- tcp_tw_2msl_scan(int _reuse); /* XXX temporary */
+ tcp_tw_2msl_reuse(void); /* XXX temporary? */
+void tcp_tw_2msl_scan(void);
void tcp_timer_keep(void *xtp);
void tcp_timer_persist(void *xtp);
void tcp_timer_rexmt(void *xtp);
diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c
index 7e6128b..afaf5f9 100644
--- a/sys/netinet/tcp_timewait.c
+++ b/sys/netinet/tcp_timewait.c
@@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$");
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/random.h>
+#include <sys/refcount.h>
#include <vm/uma.h>
@@ -98,13 +99,59 @@ static int maxtcptw;
* The timed wait queue contains references to each of the TCP sessions
* currently in the TIME_WAIT state. The queue pointers, including the
* queue pointers in each tcptw structure, are protected using the global
- * tcbinfo lock, which must be held over queue iteration and modification.
+ * timewait lock, which must be held over queue iteration and modification.
*/
static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl);
#define V_twq_2msl VNET(twq_2msl)
-static void tcp_tw_2msl_reset(struct tcptw *, int);
-static void tcp_tw_2msl_stop(struct tcptw *);
+/* Global timewait lock */
+static VNET_DEFINE(struct rwlock, tw_lock);
+#define V_tw_lock VNET(tw_lock)
+
+#define TW_LOCK_INIT(tw, d) rw_init_flags(&(tw), (d), 0)
+#define TW_LOCK_DESTROY(tw) rw_destroy(&(tw))
+#define TW_RLOCK(tw) rw_rlock(&(tw))
+#define TW_WLOCK(tw) rw_wlock(&(tw))
+#define TW_RUNLOCK(tw) rw_runlock(&(tw))
+#define TW_WUNLOCK(tw) rw_wunlock(&(tw))
+#define TW_LOCK_ASSERT(tw) rw_assert(&(tw), RA_LOCKED)
+#define TW_RLOCK_ASSERT(tw) rw_assert(&(tw), RA_RLOCKED)
+#define TW_WLOCK_ASSERT(tw) rw_assert(&(tw), RA_WLOCKED)
+#define TW_UNLOCK_ASSERT(tw) rw_assert(&(tw), RA_UNLOCKED)
+
+/*
+ * tw_pcbref() bumps the reference count on an tw in order to maintain
+ * stability of an tw pointer despite the tw lock being released.
+ */
+static void
+tw_pcbref(struct tcptw *tw)
+{
+ KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
+ refcount_acquire(&tw->tw_refcount);
+}
+
+/*
+ * Drop a refcount on an tw elevated using tw_pcbref(). Return
+ * the tw lock released.
+ */
+static int
+tw_pcbrele(struct tcptw *tw)
+{
+ TW_WLOCK_ASSERT(V_tw_lock);
+ KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
+
+ if (!refcount_release(&tw->tw_refcount)) {
+ TW_WUNLOCK(V_tw_lock);
+ return (0);
+ }
+
+ uma_zfree(V_tcptw_zone, tw);
+ TW_WUNLOCK(V_tw_lock);
+ return (1);
+}
+
+static void tcp_tw_2msl_reset(struct tcptw *, int ream);
+static void tcp_tw_2msl_stop(struct tcptw *, int reuse);
static int
tcptw_auto_size(void)
@@ -171,6 +218,7 @@ tcp_tw_init(void)
else
uma_zone_set_max(V_tcptw_zone, maxtcptw);
TAILQ_INIT(&V_twq_2msl);
+ TW_LOCK_INIT(V_tw_lock, "tcptw");
}
#ifdef VIMAGE
@@ -179,11 +227,14 @@ tcp_tw_destroy(void)
{
struct tcptw *tw;
- INP_INFO_WLOCK(&V_tcbinfo);
- while((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
- tcp_twclose(tw, 0);
- INP_INFO_WUNLOCK(&V_tcbinfo);
+ TW_WLOCK(V_tw_lock);
+ while((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL) {
+ tcp_twclose(tw, 0, 1);
+ TW_WLOCK(V_tw_lock);
+ }
+ TW_WUNLOCK(V_tw_lock);
+ TW_LOCK_DESTROY(V_tw_lock);
uma_zdestroy(V_tcptw_zone);
}
#endif
@@ -204,7 +255,7 @@ tcp_twstart(struct tcpcb *tp)
int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
#endif
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_reset(). */
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
if (V_nolocaltimewait) {
@@ -229,7 +280,7 @@ tcp_twstart(struct tcpcb *tp)
tw = uma_zalloc(V_tcptw_zone, M_NOWAIT);
if (tw == NULL) {
- tw = tcp_tw_2msl_scan(1);
+ tw = tcp_tw_2msl_reuse();
if (tw == NULL) {
tp = tcp_close(tp);
if (tp != NULL)
@@ -238,6 +289,7 @@ tcp_twstart(struct tcpcb *tp)
}
}
tw->tw_inpcb = inp;
+ refcount_init(&tw->tw_refcount, 1);
/*
* Recover last window size sent.
@@ -356,7 +408,6 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct
tcphdr *th,
int thflags;
tcp_seq seq;
- /* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset(). */
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
@@ -458,11 +509,11 @@ tcp_twclose(struct tcptw *tw, int reuse)
inp = tw->tw_inpcb;
KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait"));
KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw"));
- INP_INFO_WLOCK_ASSERT(&V_tcbinfo); /* tcp_tw_2msl_stop(). */
+ INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(inp);
tw->tw_inpcb = NULL;
- tcp_tw_2msl_stop(tw);
+ tcp_tw_2msl_stop(tw, reuse);
inp->inp_ppcb = NULL;
in_pcbdrop(inp);
@@ -494,11 +545,6 @@ tcp_twclose(struct tcptw *tw, int reuse)
} else
in_pcbfree(inp);
TCPSTAT_INC(tcps_closed);
- crfree(tw->tw_cred);
- tw->tw_cred = NULL;
- if (reuse)
- return;
- uma_zfree(V_tcptw_zone, tw);
}
int
@@ -616,34 +662,87 @@ tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
INP_WLOCK_ASSERT(tw->tw_inpcb);
+
+ TW_WLOCK(V_tw_lock);
if (rearm)
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
tw->tw_time = ticks + 2 * tcp_msl;
TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl);
+ TW_WUNLOCK(V_tw_lock);
}
static void
-tcp_tw_2msl_stop(struct tcptw *tw)
+tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
{
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
+ TW_WLOCK(V_tw_lock);
TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
+ crfree(tw->tw_cred);
+ tw->tw_cred = NULL;
+
+ if (!reuse) {
+ tw_pcbrele(tw);
+ return;
+ }
+
+ TW_WUNLOCK(V_tw_lock);
}
struct tcptw *
-tcp_tw_2msl_scan(int reuse)
+tcp_tw_2msl_reuse(void)
{
- struct tcptw *tw;
INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
+ struct tcptw *tw;
+
+ TW_WLOCK(V_tw_lock);
+ tw = TAILQ_FIRST(&V_twq_2msl);
+ if (tw == NULL) {
+ TW_WUNLOCK(V_tw_lock);
+ return NULL;
+ }
+ TW_WUNLOCK(V_tw_lock);
+
+ INP_WLOCK(tw->tw_inpcb);
+ tcp_twclose(tw, 1);
+
+ return (tw);
+}
+
+void
+tcp_tw_2msl_scan(void)
+{
+
+ struct tcptw *tw;
for (;;) {
+ TW_RLOCK(V_tw_lock);
tw = TAILQ_FIRST(&V_twq_2msl);
- if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0))
+ if (tw == NULL || ((tw->tw_time - ticks) > 0)) {
+ TW_RUNLOCK(V_tw_lock);
+ break;
+ }
+ tw_pcbref(tw);
+ TW_RUNLOCK(V_tw_lock);
+
+ /* Close timewait state */
+ if(INP_INFO_TRY_WLOCK(&V_tcbinfo)) {
+
+ TW_WLOCK(V_tw_lock);
+ if(tw_pcbrele(tw))
+ continue;
+
+ KASSERT(tw->tw_inpcb != NULL,
+ ("%s: tw->tw_inpcb == NULL", __func__));
+
+ INP_WLOCK(tw->tw_inpcb);
+ tcp_twclose(tw, 0);
+ INP_INFO_WUNLOCK(&V_tcbinfo);
+ } else {
+ /* INP_INFO lock is busy, continue later */
break;
- INP_WLOCK(tw->tw_inpcb);
- tcp_twclose(tw, reuse);
- if (reuse)
- return (tw);
+ }
}
- return (NULL);
}
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index e3197e5..f13c1f4 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -353,8 +353,7 @@ struct tcptw {
u_int t_starttime;
int tw_time;
TAILQ_ENTRY(tcptw) tw_2msl;
- void *tw_pspare; /* TCP_SIGNATURE */
- u_int *tw_spare; /* TCP_SIGNATURE */
+ u_int tw_refcount; /* refcount */
};
#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb)
_______________________________________________
freebsd-net@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-net
To unsubscribe, send any mail to "freebsd-net-unsubscr...@freebsd.org"