Author: jch
Date: Fri May 15 12:07:43 2015
New Revision: 282964
URL: https://svnweb.freebsd.org/changeset/base/282964

Log:
  MFC: r280904, r280990, r281599
  
  r280904:
      Use appropriate timeout_t* instead of void* in tcp_timer_activate()
  
      Suggested by:               imp
      Differential Revision:      https://reviews.freebsd.org/D2154
      Reviewed by:                imp, jhb
      Approved by:                jhb
  
  r280990:
      Provide better debugging information in tcp_timer_activate() and
      tcp_timer_active()
  
      Differential Revision:      https://reviews.freebsd.org/D2179
      Suggested by:               bz
      Reviewed by:                jhb
      Approved by:                jhb
  
  r281599:
      Fix an old and well-documented use-after-free race condition in
      TCP timers:
       - Add a reference from tcpcb to its inpcb
       - Defer tcpcb deletion until TCP timers have finished
  
      Differential Revision:      https://reviews.freebsd.org/D2079
      Submitted by:               jch, Marc De La Gueronniere 
<mdelagueronni...@verisign.com>
      Reviewed by:                imp, rrs, adrian, jhb, bz
      Approved by:                jhb
      Sponsored by:               Verisign, Inc.

Modified:
  stable/10/sys/netinet/tcp_subr.c
  stable/10/sys/netinet/tcp_timer.c
  stable/10/sys/netinet/tcp_timer.h
  stable/10/sys/netinet/tcp_var.h
Directory Properties:
  stable/10/   (props changed)

Modified: stable/10/sys/netinet/tcp_subr.c
==============================================================================
--- stable/10/sys/netinet/tcp_subr.c    Fri May 15 11:10:01 2015        
(r282963)
+++ stable/10/sys/netinet/tcp_subr.c    Fri May 15 12:07:43 2015        
(r282964)
@@ -230,6 +230,7 @@ static struct inpcb *tcp_notify(struct i
 static struct inpcb *tcp_mtudisc_notify(struct inpcb *, int);
 static char *  tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
                    void *ip4hdr, const void *ip6hdr);
+static void    tcp_timer_discard(struct tcpcb *, uint32_t);
 
 /*
  * Target size of TCP PCB hash tables. Must be a power of two.
@@ -790,7 +791,13 @@ tcp_newtcpcb(struct inpcb *inp)
        if (V_tcp_do_sack)
                tp->t_flags |= TF_SACK_PERMIT;
        TAILQ_INIT(&tp->snd_holes);
-       tp->t_inpcb = inp;      /* XXX */
+       /*
+        * The tcpcb will hold a reference on its inpcb until tcp_discardcb()
+        * is called.
+        */
+       in_pcbref(inp); /* Reference for tcpcb */
+       tp->t_inpcb = inp;
+
        /*
         * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
         * rtt estimate.  Set rttvar so that srtt + 4 * rttvar gives
@@ -909,6 +916,7 @@ tcp_discardcb(struct tcpcb *tp)
 #ifdef INET6
        int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
 #endif /* INET6 */
+       int released;
 
        INP_WLOCK_ASSERT(inp);
 
@@ -916,22 +924,15 @@ tcp_discardcb(struct tcpcb *tp)
         * Make sure that all of our timers are stopped before we delete the
         * PCB.
         *
-        * XXXRW: Really, we would like to use callout_drain() here in order
-        * to avoid races experienced in tcp_timer.c where a timer is already
-        * executing at this point.  However, we can't, both because we're
-        * running in a context where we can't sleep, and also because we
-        * hold locks required by the timers.  What we instead need to do is
-        * test to see if callout_drain() is required, and if so, defer some
-        * portion of the remainder of tcp_discardcb() to an asynchronous
-        * context that can callout_drain() and then continue.  Some care
-        * will be required to ensure that no further processing takes place
-        * on the tcpcb, even though it hasn't been freed (a flag?).
-        */
-       callout_stop(&tp->t_timers->tt_rexmt);
-       callout_stop(&tp->t_timers->tt_persist);
-       callout_stop(&tp->t_timers->tt_keep);
-       callout_stop(&tp->t_timers->tt_2msl);
-       callout_stop(&tp->t_timers->tt_delack);
+        * If stopping a timer fails, we schedule a discard function in same
+        * callout, and the last discard function called will take care of
+        * deleting the tcpcb.
+        */
+       tcp_timer_stop(tp, TT_REXMT);
+       tcp_timer_stop(tp, TT_PERSIST);
+       tcp_timer_stop(tp, TT_KEEP);
+       tcp_timer_stop(tp, TT_2MSL);
+       tcp_timer_stop(tp, TT_DELACK);
 
        /*
         * If we got enough samples through the srtt filter,
@@ -1008,8 +1009,80 @@ tcp_discardcb(struct tcpcb *tp)
 
        CC_ALGO(tp) = NULL;
        inp->inp_ppcb = NULL;
-       tp->t_inpcb = NULL;
-       uma_zfree(V_tcpcb_zone, tp);
+       if ((tp->t_timers->tt_flags & TT_MASK) == 0) {
+               /* We own the last reference on tcpcb, let's free it. */
+               tp->t_inpcb = NULL;
+               uma_zfree(V_tcpcb_zone, tp);
+               released = in_pcbrele_wlocked(inp);
+               KASSERT(!released, ("%s: inp %p should not have been released "
+                       "here", __func__, inp));
+       }
+}
+
+void
+tcp_timer_2msl_discard(void *xtp)
+{
+
+       tcp_timer_discard((struct tcpcb *)xtp, TT_2MSL);
+}
+
+void
+tcp_timer_keep_discard(void *xtp)
+{
+
+       tcp_timer_discard((struct tcpcb *)xtp, TT_KEEP);
+}
+
+void
+tcp_timer_persist_discard(void *xtp)
+{
+
+       tcp_timer_discard((struct tcpcb *)xtp, TT_PERSIST);
+}
+
+void
+tcp_timer_rexmt_discard(void *xtp)
+{
+
+       tcp_timer_discard((struct tcpcb *)xtp, TT_REXMT);
+}
+
+void
+tcp_timer_delack_discard(void *xtp)
+{
+
+       tcp_timer_discard((struct tcpcb *)xtp, TT_DELACK);
+}
+
+void
+tcp_timer_discard(struct tcpcb *tp, uint32_t timer_type)
+{
+       struct inpcb *inp;
+
+       CURVNET_SET(tp->t_vnet);
+       INP_INFO_WLOCK(&V_tcbinfo);
+       inp = tp->t_inpcb;
+       KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL",
+               __func__, tp));
+       INP_WLOCK(inp);
+       KASSERT((tp->t_timers->tt_flags & TT_STOPPED) != 0,
+               ("%s: tcpcb has to be stopped here", __func__));
+       KASSERT((tp->t_timers->tt_flags & timer_type) != 0,
+               ("%s: discard callout should be running", __func__));
+       tp->t_timers->tt_flags &= ~timer_type;
+       if ((tp->t_timers->tt_flags & TT_MASK) == 0) {
+               /* We own the last reference on this tcpcb, let's free it. */
+               tp->t_inpcb = NULL;
+               uma_zfree(V_tcpcb_zone, tp);
+               if (in_pcbrele_wlocked(inp)) {
+                       INP_INFO_WUNLOCK(&V_tcbinfo);
+                       CURVNET_RESTORE();
+                       return;
+               }
+       }
+       INP_WUNLOCK(inp);
+       INP_INFO_WUNLOCK(&V_tcbinfo);
+       CURVNET_RESTORE();
 }
 
 /*

Modified: stable/10/sys/netinet/tcp_timer.c
==============================================================================
--- stable/10/sys/netinet/tcp_timer.c   Fri May 15 11:10:01 2015        
(r282963)
+++ stable/10/sys/netinet/tcp_timer.c   Fri May 15 12:07:43 2015        
(r282964)
@@ -209,10 +209,6 @@ int        tcp_backoff[TCP_MAXRXTSHIFT + 1] =
 
 static int tcp_totbackoff = 2559;      /* sum of tcp_backoff[] */
 
-static int tcp_timer_race;
-SYSCTL_INT(_net_inet_tcp, OID_AUTO, timer_race, CTLFLAG_RD, &tcp_timer_race,
-    0, "Count of t_inpcb races on tcp_discardcb");
-
 /*
  * TCP timer processing.
  */
@@ -225,18 +221,7 @@ tcp_timer_delack(void *xtp)
        CURVNET_SET(tp->t_vnet);
 
        inp = tp->t_inpcb;
-       /*
-        * XXXRW: While this assert is in fact correct, bugs in the tcpcb
-        * tear-down mean we need it as a work-around for races between
-        * timers and tcp_discardcb().
-        *
-        * KASSERT(inp != NULL, ("tcp_timer_delack: inp == NULL"));
-        */
-       if (inp == NULL) {
-               tcp_timer_race++;
-               CURVNET_RESTORE();
-               return;
-       }
+       KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
        INP_WLOCK(inp);
        if (callout_pending(&tp->t_timers->tt_delack) ||
            !callout_active(&tp->t_timers->tt_delack)) {
@@ -250,6 +235,10 @@ tcp_timer_delack(void *xtp)
                CURVNET_RESTORE();
                return;
        }
+       KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+               ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
+       KASSERT((tp->t_timers->tt_flags & TT_DELACK) != 0,
+               ("%s: tp %p delack callout should be running", __func__, tp));
 
        tp->t_flags |= TF_ACKNOW;
        TCPSTAT_INC(tcps_delack);
@@ -269,24 +258,9 @@ tcp_timer_2msl(void *xtp)
 
        ostate = tp->t_state;
 #endif
-       /*
-        * XXXRW: Does this actually happen?
-        */
        INP_INFO_WLOCK(&V_tcbinfo);
        inp = tp->t_inpcb;
-       /*
-        * XXXRW: While this assert is in fact correct, bugs in the tcpcb
-        * tear-down mean we need it as a work-around for races between
-        * timers and tcp_discardcb().
-        *
-        * KASSERT(inp != NULL, ("tcp_timer_2msl: inp == NULL"));
-        */
-       if (inp == NULL) {
-               tcp_timer_race++;
-               INP_INFO_WUNLOCK(&V_tcbinfo);
-               CURVNET_RESTORE();
-               return;
-       }
+       KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
        INP_WLOCK(inp);
        tcp_free_sackholes(tp);
        if (callout_pending(&tp->t_timers->tt_2msl) ||
@@ -303,6 +277,10 @@ tcp_timer_2msl(void *xtp)
                CURVNET_RESTORE();
                return;
        }
+       KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+               ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
+       KASSERT((tp->t_timers->tt_flags & TT_2MSL) != 0,
+               ("%s: tp %p 2msl callout should be running", __func__, tp));
        /*
         * 2 MSL timeout in shutdown went off.  If we're closed but
         * still waiting for peer to close and connection has been idle
@@ -352,19 +330,7 @@ tcp_timer_keep(void *xtp)
 #endif
        INP_INFO_WLOCK(&V_tcbinfo);
        inp = tp->t_inpcb;
-       /*
-        * XXXRW: While this assert is in fact correct, bugs in the tcpcb
-        * tear-down mean we need it as a work-around for races between
-        * timers and tcp_discardcb().
-        *
-        * KASSERT(inp != NULL, ("tcp_timer_keep: inp == NULL"));
-        */
-       if (inp == NULL) {
-               tcp_timer_race++;
-               INP_INFO_WUNLOCK(&V_tcbinfo);
-               CURVNET_RESTORE();
-               return;
-       }
+       KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
        INP_WLOCK(inp);
        if (callout_pending(&tp->t_timers->tt_keep) ||
            !callout_active(&tp->t_timers->tt_keep)) {
@@ -380,6 +346,10 @@ tcp_timer_keep(void *xtp)
                CURVNET_RESTORE();
                return;
        }
+       KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+               ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
+       KASSERT((tp->t_timers->tt_flags & TT_KEEP) != 0,
+               ("%s: tp %p keep callout should be running", __func__, tp));
        /*
         * Keep-alive timer went off; send something
         * or drop connection if idle for too long.
@@ -455,19 +425,7 @@ tcp_timer_persist(void *xtp)
 #endif
        INP_INFO_WLOCK(&V_tcbinfo);
        inp = tp->t_inpcb;
-       /*
-        * XXXRW: While this assert is in fact correct, bugs in the tcpcb
-        * tear-down mean we need it as a work-around for races between
-        * timers and tcp_discardcb().
-        *
-        * KASSERT(inp != NULL, ("tcp_timer_persist: inp == NULL"));
-        */
-       if (inp == NULL) {
-               tcp_timer_race++;
-               INP_INFO_WUNLOCK(&V_tcbinfo);
-               CURVNET_RESTORE();
-               return;
-       }
+       KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
        INP_WLOCK(inp);
        if (callout_pending(&tp->t_timers->tt_persist) ||
            !callout_active(&tp->t_timers->tt_persist)) {
@@ -483,6 +441,10 @@ tcp_timer_persist(void *xtp)
                CURVNET_RESTORE();
                return;
        }
+       KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+               ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
+       KASSERT((tp->t_timers->tt_flags & TT_PERSIST) != 0,
+               ("%s: tp %p persist callout should be running", __func__, tp));
        /*
         * Persistance timer into zero window.
         * Force a byte to be output, if possible.
@@ -544,19 +506,7 @@ tcp_timer_rexmt(void * xtp)
 
        INP_INFO_RLOCK(&V_tcbinfo);
        inp = tp->t_inpcb;
-       /*
-        * XXXRW: While this assert is in fact correct, bugs in the tcpcb
-        * tear-down mean we need it as a work-around for races between
-        * timers and tcp_discardcb().
-        *
-        * KASSERT(inp != NULL, ("tcp_timer_rexmt: inp == NULL"));
-        */
-       if (inp == NULL) {
-               tcp_timer_race++;
-               INP_INFO_RUNLOCK(&V_tcbinfo);
-               CURVNET_RESTORE();
-               return;
-       }
+       KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp));
        INP_WLOCK(inp);
        if (callout_pending(&tp->t_timers->tt_rexmt) ||
            !callout_active(&tp->t_timers->tt_rexmt)) {
@@ -572,6 +522,10 @@ tcp_timer_rexmt(void * xtp)
                CURVNET_RESTORE();
                return;
        }
+       KASSERT((tp->t_timers->tt_flags & TT_STOPPED) == 0,
+               ("%s: tp %p tcpcb can't be stopped here", __func__, tp));
+       KASSERT((tp->t_timers->tt_flags & TT_REXMT) != 0,
+               ("%s: tp %p rexmt callout should be running", __func__, tp));
        tcp_free_sackholes(tp);
        /*
         * Retransmission timer went off.  Message has not
@@ -800,10 +754,10 @@ out:
 }
 
 void
-tcp_timer_activate(struct tcpcb *tp, int timer_type, u_int delta)
+tcp_timer_activate(struct tcpcb *tp, uint32_t timer_type, u_int delta)
 {
        struct callout *t_callout;
-       void *f_callout;
+       timeout_t *f_callout;
        struct inpcb *inp = tp->t_inpcb;
        int cpu = INP_CPU(inp);
 
@@ -812,6 +766,9 @@ tcp_timer_activate(struct tcpcb *tp, int
                return;
 #endif
 
+       if (tp->t_timers->tt_flags & TT_STOPPED)
+               return;
+
        switch (timer_type) {
                case TT_DELACK:
                        t_callout = &tp->t_timers->tt_delack;
@@ -834,17 +791,26 @@ tcp_timer_activate(struct tcpcb *tp, int
                        f_callout = tcp_timer_2msl;
                        break;
                default:
-                       panic("bad timer_type");
+                       panic("tp %p bad timer_type %#x", tp, timer_type);
                }
        if (delta == 0) {
-               callout_stop(t_callout);
+               if ((tp->t_timers->tt_flags & timer_type) &&
+                   callout_stop(t_callout)) {
+                       tp->t_timers->tt_flags &= ~timer_type;
+               }
        } else {
-               callout_reset_on(t_callout, delta, f_callout, tp, cpu);
+               if ((tp->t_timers->tt_flags & timer_type) == 0) {
+                       tp->t_timers->tt_flags |= timer_type;
+                       callout_reset_on(t_callout, delta, f_callout, tp, cpu);
+               } else {
+                       /* Reset already running callout on the same CPU. */
+                       callout_reset(t_callout, delta, f_callout, tp);
+               }
        }
 }
 
 int
-tcp_timer_active(struct tcpcb *tp, int timer_type)
+tcp_timer_active(struct tcpcb *tp, uint32_t timer_type)
 {
        struct callout *t_callout;
 
@@ -865,11 +831,63 @@ tcp_timer_active(struct tcpcb *tp, int t
                        t_callout = &tp->t_timers->tt_2msl;
                        break;
                default:
-                       panic("bad timer_type");
+                       panic("tp %p bad timer_type %#x", tp, timer_type);
                }
        return callout_active(t_callout);
 }
 
+void
+tcp_timer_stop(struct tcpcb *tp, uint32_t timer_type)
+{
+       struct callout *t_callout;
+       timeout_t *f_callout;
+
+       tp->t_timers->tt_flags |= TT_STOPPED;
+
+       switch (timer_type) {
+               case TT_DELACK:
+                       t_callout = &tp->t_timers->tt_delack;
+                       f_callout = tcp_timer_delack_discard;
+                       break;
+               case TT_REXMT:
+                       t_callout = &tp->t_timers->tt_rexmt;
+                       f_callout = tcp_timer_rexmt_discard;
+                       break;
+               case TT_PERSIST:
+                       t_callout = &tp->t_timers->tt_persist;
+                       f_callout = tcp_timer_persist_discard;
+                       break;
+               case TT_KEEP:
+                       t_callout = &tp->t_timers->tt_keep;
+                       f_callout = tcp_timer_keep_discard;
+                       break;
+               case TT_2MSL:
+                       t_callout = &tp->t_timers->tt_2msl;
+                       f_callout = tcp_timer_2msl_discard;
+                       break;
+               default:
+                       panic("tp %p bad timer_type %#x", tp, timer_type);
+               }
+
+       if (tp->t_timers->tt_flags & timer_type) {
+               if (callout_stop(t_callout)) {
+                       tp->t_timers->tt_flags &= ~timer_type;
+               } else {
+                       /*
+                        * Can't stop the callout, defer tcpcb actual deletion
+                        * to the last tcp timer discard callout.
+                        * The TT_STOPPED flag will ensure that no tcp timer
+                        * callouts can be restarted on our behalf, and
+                        * past this point currently running callouts waiting
+                        * on inp lock will return right away after the
+                        * classical check for callout reset/stop events:
+                        * callout_pending() || !callout_active()
+                        */
+                       callout_reset(t_callout, 1, f_callout, tp);
+               }
+       }
+}
+
 #define        ticks_to_msecs(t)       (1000*(t) / hz)
 
 void

Modified: stable/10/sys/netinet/tcp_timer.h
==============================================================================
--- stable/10/sys/netinet/tcp_timer.h   Fri May 15 11:10:01 2015        
(r282963)
+++ stable/10/sys/netinet/tcp_timer.h   Fri May 15 12:07:43 2015        
(r282964)
@@ -146,12 +146,21 @@ struct tcp_timer {
        struct  callout tt_keep;        /* keepalive */
        struct  callout tt_2msl;        /* 2*msl TIME_WAIT timer */
        struct  callout tt_delack;      /* delayed ACK timer */
+       uint32_t        tt_flags;       /* Timers flags */
+       uint32_t        tt_spare;       /* TDB */
 };
-#define TT_DELACK      0x01
-#define TT_REXMT       0x02
-#define TT_PERSIST     0x04
-#define TT_KEEP                0x08
-#define TT_2MSL                0x10
+
+/*
+ * Flags for the tt_flags field.
+ */
+#define TT_DELACK      0x0001
+#define TT_REXMT       0x0002
+#define TT_PERSIST     0x0004
+#define TT_KEEP                0x0008
+#define TT_2MSL                0x0010
+#define TT_MASK                (TT_DELACK|TT_REXMT|TT_PERSIST|TT_KEEP|TT_2MSL)
+
+#define TT_STOPPED     0x00010000
 
 #define        TP_KEEPINIT(tp) ((tp)->t_keepinit ? (tp)->t_keepinit : 
tcp_keepinit)
 #define        TP_KEEPIDLE(tp) ((tp)->t_keepidle ? (tp)->t_keepidle : 
tcp_keepidle)
@@ -183,6 +192,11 @@ void       tcp_timer_keep(void *xtp);
 void   tcp_timer_persist(void *xtp);
 void   tcp_timer_rexmt(void *xtp);
 void   tcp_timer_delack(void *xtp);
+void   tcp_timer_2msl_discard(void *xtp);
+void   tcp_timer_keep_discard(void *xtp);
+void   tcp_timer_persist_discard(void *xtp);
+void   tcp_timer_rexmt_discard(void *xtp);
+void   tcp_timer_delack_discard(void *xtp);
 void   tcp_timer_to_xtimer(struct tcpcb *tp, struct tcp_timer *timer,
        struct xtcp_timer *xtimer);
 

Modified: stable/10/sys/netinet/tcp_var.h
==============================================================================
--- stable/10/sys/netinet/tcp_var.h     Fri May 15 11:10:01 2015        
(r282963)
+++ stable/10/sys/netinet/tcp_var.h     Fri May 15 12:07:43 2015        
(r282964)
@@ -718,8 +718,9 @@ void         tcp_slowtimo(void);
 struct tcptemp *
         tcpip_maketemplate(struct inpcb *);
 void    tcpip_fillheaders(struct inpcb *, void *, void *);
-void    tcp_timer_activate(struct tcpcb *, int, u_int);
-int     tcp_timer_active(struct tcpcb *, int);
+void    tcp_timer_activate(struct tcpcb *, uint32_t, u_int);
+int     tcp_timer_active(struct tcpcb *, uint32_t);
+void    tcp_timer_stop(struct tcpcb *, uint32_t);
 void    tcp_trace(short, short, struct tcpcb *, void *, struct tcphdr *, int);
 /*
  * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo)
_______________________________________________
svn-src-stable-10@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-stable-10
To unsubscribe, send any mail to "svn-src-stable-10-unsubscr...@freebsd.org"

Reply via email to