Hi John,

On 07/03/14 13:43, Julien Charbon wrote:
On 06/03/14 22:57, John-Mark Gurney wrote:
Julien Charbon wrote this message on Thu, Feb 27, 2014 at 11:32 +0100:
[...]
  Any thoughts on this particular behavior?

One thing that I noticed is that you now lock/unlock the tw and inp
lock a
lot...  Have you thought about grabing the TW lock once, grabbing
some/all
of the ones necessary to process and then process them in a second step?
If the bulk processing is still an issue, then you could process them in
blocks of 50 or so, that way the number of lock/unlock cycles is
reduced...

  First thanks, feedback are highly valuable to us.  In first place, I
indeed tried a kind of bulk processing enforcement with something like:

  tcp_tw_2msl_scan() {

         struct tcptw *tw;
         int i, end = 0, count = 100;

         for (;;) {
                 INP_INFO_WLOCK(&V_tcbinfo);
                 for (i = 0; i < count; ++i) {
                         tw = TAILQ_FIRST(&V_twq_2msl);
                         if (tw == NULL || (tw->tw_time - ticks) > 0)) {
                                 end = 1;
                                 break;
                         }
                         INP_WLOCK(tw->tw_inpcb);
                         tcp_twclose(tw, 0);
                 }
                 if (end)
                         break;
                INP_INFO_WUNLOCK(&V_tcbinfo);
         }
         return (NULL);
}

  And I got best result with 'count' set to 1, this led us to current
proposed patch. [...]

 I would like to continue the discussion about tcp_tw_2msl_scan().

The proposed patch makes tcp_tw_2msl_scan() less disturbing for the TCP stack (another way to say it makes tcp_tw_2msl_scan() less efficient to prioritize other TCP related tasks), and continuing on this way I have tested below change on top of previous patch:

diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c
index 0230b88..da79120 100644
--- a/sys/netinet/tcp_timewait.c
+++ b/sys/netinet/tcp_timewait.c
@@ -728,7 +728,7 @@ tcp_tw_2msl_scan(void)
                TW_RUNLOCK(V_tw_lock);

                /* Close timewait state */
-               INP_INFO_WLOCK(&V_tcbinfo);
+               if(INP_INFO_TRY_WLOCK(&V_tcbinfo)) {

                        TW_WLOCK(V_tw_lock);
                        if(tw_pcbrele(tw))
@@ -740,5 +740,9 @@ tcp_tw_2msl_scan(void)
                        INP_WLOCK(tw->tw_inpcb);
                        tcp_twclose(tw, 0);
                        INP_INFO_WUNLOCK(&V_tcbinfo);
+               } else {
+                       /* INP_INFO lock is busy, continue later */
+                       break;
+               }
        }
 }

 Basically, it changes tcp_tw_2msl_scan() purpose from:

 - Let's clean up all expired tw objects in a row.

 to:

- Let's clean up expired tw objects as long as the INP_INFO lock is not busy.

(See joined tw-lock-v2.patch). And corresponding performance results being: Maximum short-lived TCP connection rate without dropping a single packet:

 - FreeBSD 10.0-RELEASE:        40.0k
 - FreeBSD 10.0 + tw patch-v1:  52.8k (+32%)
 - FreeBSD 10.0 + tw patch-v2:  56.8k (+42%)

Thus, a significant improvement and it makes the tcp_tw_2msl_scan() purpose clear.

Moreover, it seems that two members of the struct tcptw are never used: 'tw_pspare' and 'tw_spare':

struct tcptw {
        [...]
        u_int           t_starttime;
        int             tw_time;
        TAILQ_ENTRY(tcptw) tw_2msl;
        void            *tw_pspare;     /* TCP_SIGNATURE */
        u_int           *tw_spare;      /* TCP_SIGNATURE */
        u_int           tw_refcount;    /* refcount */
 };

I found no references of these members anywhere, even when setting the TCP_SIGNATURE kernel option the kernel builds and runs just fine. As the tcptw struct is private to kernel and its size should be as small as possible, I would propose to remove them.

 Joined tw-clock-v2.patch which includes:

 - Use TRY_WLOCK                  (Performance improvements)
 - Fix comment                    (After John-Mark Gurney's review)
 - Two unused fields removed      (After Mike Bentkofsky's review)

--
Julien
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index bde7503..b45a9ea 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -144,9 +144,7 @@ tcp_slowtimo(void)
        VNET_LIST_RLOCK_NOSLEEP();
        VNET_FOREACH(vnet_iter) {
                CURVNET_SET(vnet_iter);
-               INP_INFO_WLOCK(&V_tcbinfo);
-               (void) tcp_tw_2msl_scan(0);
-               INP_INFO_WUNLOCK(&V_tcbinfo);
+               tcp_tw_2msl_scan();
                CURVNET_RESTORE();
        }
        VNET_LIST_RUNLOCK_NOSLEEP();
diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h
index 3115fb3..c04723a 100644
--- a/sys/netinet/tcp_timer.h
+++ b/sys/netinet/tcp_timer.h
@@ -178,7 +178,8 @@ extern int tcp_fast_finwait2_recycle;
 void   tcp_timer_init(void);
 void   tcp_timer_2msl(void *xtp);
 struct tcptw *
-       tcp_tw_2msl_scan(int _reuse);           /* XXX temporary */
+       tcp_tw_2msl_reuse(void);        /* XXX temporary? */
+void   tcp_tw_2msl_scan(void);
 void   tcp_timer_keep(void *xtp);
 void   tcp_timer_persist(void *xtp);
 void   tcp_timer_rexmt(void *xtp);
diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c
index 7e6128b..afaf5f9 100644
--- a/sys/netinet/tcp_timewait.c
+++ b/sys/netinet/tcp_timewait.c
@@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/socketvar.h>
 #include <sys/protosw.h>
 #include <sys/random.h>
+#include <sys/refcount.h>
 
 #include <vm/uma.h>
 
@@ -98,13 +99,59 @@ static int  maxtcptw;
  * The timed wait queue contains references to each of the TCP sessions
  * currently in the TIME_WAIT state.  The queue pointers, including the
  * queue pointers in each tcptw structure, are protected using the global
- * tcbinfo lock, which must be held over queue iteration and modification.
+ * timewait lock, which must be held over queue iteration and modification.
  */
 static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl);
 #define        V_twq_2msl                      VNET(twq_2msl)
 
-static void    tcp_tw_2msl_reset(struct tcptw *, int);
-static void    tcp_tw_2msl_stop(struct tcptw *);
+/* Global timewait lock */
+static VNET_DEFINE(struct rwlock, tw_lock);
+#define        V_tw_lock                       VNET(tw_lock)
+
+#define TW_LOCK_INIT(tw, d)    rw_init_flags(&(tw), (d), 0)
+#define TW_LOCK_DESTROY(tw)    rw_destroy(&(tw))
+#define TW_RLOCK(tw)           rw_rlock(&(tw))
+#define TW_WLOCK(tw)           rw_wlock(&(tw))
+#define TW_RUNLOCK(tw)         rw_runlock(&(tw))
+#define TW_WUNLOCK(tw)         rw_wunlock(&(tw))
+#define TW_LOCK_ASSERT(tw)     rw_assert(&(tw), RA_LOCKED)
+#define TW_RLOCK_ASSERT(tw)    rw_assert(&(tw), RA_RLOCKED)
+#define TW_WLOCK_ASSERT(tw)    rw_assert(&(tw), RA_WLOCKED)
+#define TW_UNLOCK_ASSERT(tw)   rw_assert(&(tw), RA_UNLOCKED)
+
+/*
+ * tw_pcbref() bumps the reference count on an tw in order to maintain
+ * stability of an tw pointer despite the tw lock being released.
+ */
+static void
+tw_pcbref(struct tcptw *tw)
+{
+       KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
+       refcount_acquire(&tw->tw_refcount);
+}
+
+/*
+ * Drop a refcount on an tw elevated using tw_pcbref().  Return
+ * the tw lock released.
+ */
+static int
+tw_pcbrele(struct tcptw *tw)
+{
+       TW_WLOCK_ASSERT(V_tw_lock);
+       KASSERT(tw->tw_refcount > 0, ("%s: refcount 0", __func__));
+
+       if (!refcount_release(&tw->tw_refcount)) {
+               TW_WUNLOCK(V_tw_lock);
+               return (0);
+       }
+
+       uma_zfree(V_tcptw_zone, tw);
+       TW_WUNLOCK(V_tw_lock);
+       return (1);
+}
+
+static void    tcp_tw_2msl_reset(struct tcptw *, int ream);
+static void    tcp_tw_2msl_stop(struct tcptw *, int reuse);
 
 static int
 tcptw_auto_size(void)
@@ -171,6 +218,7 @@ tcp_tw_init(void)
        else
                uma_zone_set_max(V_tcptw_zone, maxtcptw);
        TAILQ_INIT(&V_twq_2msl);
+       TW_LOCK_INIT(V_tw_lock, "tcptw");
 }
 
 #ifdef VIMAGE
@@ -179,11 +227,14 @@ tcp_tw_destroy(void)
 {
        struct tcptw *tw;
 
-       INP_INFO_WLOCK(&V_tcbinfo);
-       while((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL)
-               tcp_twclose(tw, 0);
-       INP_INFO_WUNLOCK(&V_tcbinfo);
+       TW_WLOCK(V_tw_lock);
+       while((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL) {
+               tcp_twclose(tw, 0, 1);
+               TW_WLOCK(V_tw_lock);
+       }
+       TW_WUNLOCK(V_tw_lock);
 
+       TW_LOCK_DESTROY(V_tw_lock);
        uma_zdestroy(V_tcptw_zone);
 }
 #endif
@@ -204,7 +255,7 @@ tcp_twstart(struct tcpcb *tp)
        int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6;
 #endif
 
-       INP_INFO_WLOCK_ASSERT(&V_tcbinfo);      /* tcp_tw_2msl_reset(). */
+       INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
        INP_WLOCK_ASSERT(inp);
 
        if (V_nolocaltimewait) {
@@ -229,7 +280,7 @@ tcp_twstart(struct tcpcb *tp)
 
        tw = uma_zalloc(V_tcptw_zone, M_NOWAIT);
        if (tw == NULL) {
-               tw = tcp_tw_2msl_scan(1);
+               tw = tcp_tw_2msl_reuse();
                if (tw == NULL) {
                        tp = tcp_close(tp);
                        if (tp != NULL)
@@ -238,6 +289,7 @@ tcp_twstart(struct tcpcb *tp)
                }
        }
        tw->tw_inpcb = inp;
+       refcount_init(&tw->tw_refcount, 1);
 
        /*
         * Recover last window size sent.
@@ -356,7 +408,6 @@ tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct 
tcphdr *th,
        int thflags;
        tcp_seq seq;
 
-       /* tcbinfo lock required for tcp_twclose(), tcp_tw_2msl_reset(). */
        INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
        INP_WLOCK_ASSERT(inp);
 
@@ -458,11 +509,11 @@ tcp_twclose(struct tcptw *tw, int reuse)
        inp = tw->tw_inpcb;
        KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait"));
        KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw"));
-       INP_INFO_WLOCK_ASSERT(&V_tcbinfo);      /* tcp_tw_2msl_stop(). */
+       INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
        INP_WLOCK_ASSERT(inp);
 
        tw->tw_inpcb = NULL;
-       tcp_tw_2msl_stop(tw);
+       tcp_tw_2msl_stop(tw, reuse);
        inp->inp_ppcb = NULL;
        in_pcbdrop(inp);
 
@@ -494,11 +545,6 @@ tcp_twclose(struct tcptw *tw, int reuse)
        } else
                in_pcbfree(inp);
        TCPSTAT_INC(tcps_closed);
-       crfree(tw->tw_cred);
-       tw->tw_cred = NULL;
-       if (reuse)
-               return;
-       uma_zfree(V_tcptw_zone, tw);
 }
 
 int
@@ -616,34 +662,87 @@ tcp_tw_2msl_reset(struct tcptw *tw, int rearm)
 
        INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
        INP_WLOCK_ASSERT(tw->tw_inpcb);
+
+       TW_WLOCK(V_tw_lock);
        if (rearm)
                TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
        tw->tw_time = ticks + 2 * tcp_msl;
        TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl);
+       TW_WUNLOCK(V_tw_lock);
 }
 
 static void
-tcp_tw_2msl_stop(struct tcptw *tw)
+tcp_tw_2msl_stop(struct tcptw *tw, int reuse)
 {
 
        INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
+       TW_WLOCK(V_tw_lock);
        TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
+       crfree(tw->tw_cred);
+       tw->tw_cred = NULL;
+
+       if (!reuse) {
+               tw_pcbrele(tw);
+               return;
+       }
+
+       TW_WUNLOCK(V_tw_lock);
 }
 
 struct tcptw *
-tcp_tw_2msl_scan(int reuse)
+tcp_tw_2msl_reuse(void)
 {
-       struct tcptw *tw;
 
        INP_INFO_WLOCK_ASSERT(&V_tcbinfo);
+
+       struct tcptw *tw;
+
+       TW_WLOCK(V_tw_lock);
+       tw = TAILQ_FIRST(&V_twq_2msl);
+       if (tw == NULL) {
+               TW_WUNLOCK(V_tw_lock);
+               return NULL;
+       }
+       TW_WUNLOCK(V_tw_lock);
+
+       INP_WLOCK(tw->tw_inpcb);
+       tcp_twclose(tw, 1);
+
+       return (tw);
+}
+
+void
+tcp_tw_2msl_scan(void)
+{
+
+       struct tcptw *tw;
        for (;;) {
+               TW_RLOCK(V_tw_lock);
                tw = TAILQ_FIRST(&V_twq_2msl);
-               if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0))
+               if (tw == NULL || ((tw->tw_time - ticks) > 0)) {
+                       TW_RUNLOCK(V_tw_lock);
+                       break;
+               }
+               tw_pcbref(tw);
+               TW_RUNLOCK(V_tw_lock);
+
+               /* Close timewait state */
+               if(INP_INFO_TRY_WLOCK(&V_tcbinfo)) {
+
+                       TW_WLOCK(V_tw_lock);
+                       if(tw_pcbrele(tw))
+                               continue;
+
+                       KASSERT(tw->tw_inpcb != NULL,
+                           ("%s: tw->tw_inpcb == NULL", __func__));
+
+                       INP_WLOCK(tw->tw_inpcb);
+                       tcp_twclose(tw, 0);
+                       INP_INFO_WUNLOCK(&V_tcbinfo);
+               } else {
+                       /* INP_INFO lock is busy, continue later */
                        break;
-               INP_WLOCK(tw->tw_inpcb);
-               tcp_twclose(tw, reuse);
-               if (reuse)
-                       return (tw);
+               }
        }
-       return (NULL);
 }
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index e3197e5..f13c1f4 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -353,8 +353,7 @@ struct tcptw {
        u_int           t_starttime;
        int             tw_time;
        TAILQ_ENTRY(tcptw) tw_2msl;
-       void            *tw_pspare;     /* TCP_SIGNATURE */
-       u_int           *tw_spare;      /* TCP_SIGNATURE */
+       u_int           tw_refcount;    /* refcount */
 };
 
 #define        intotcpcb(ip)   ((struct tcpcb *)(ip)->inp_ppcb)
_______________________________________________
freebsd-net@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-net
To unsubscribe, send any mail to "freebsd-net-unsubscr...@freebsd.org"

Reply via email to