On Wed, Apr 06, 2022 at 05:01:55PM +0200, Alexandr Nedvedicky wrote:
> Hello,
>
> Hrvoje was testing pf(4) and pfsync(4) with parallel forwarding diff
> which bluhm@ has shared sometime ago.
>
> Hrvoje found a bug in my very naive implementation of 'snapshots':
>
> 1573 void
> 1574 pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc)
> 1575 {
> 1576 int q;
> 1577
> 1578 sn->sn_sc = sc;
> 1579
> 1580 mtx_enter(&sc->sc_st_mtx);
> 1581 mtx_enter(&sc->sc_upd_req_mtx);
> 1582 mtx_enter(&sc->sc_tdb_mtx);
> 1583
> 1584 for (q = 0; q < PFSYNC_S_COUNT; q++) {
> 1585 TAILQ_INIT(&sn->sn_qs[q]);
> 1586 TAILQ_CONCAT(&sn->sn_qs[q], &sc->sc_qs[q], sync_list);
> 1587 }
> 1588
> 1589 TAILQ_INIT(&sn->sn_upd_req_list);
> 1590 TAILQ_CONCAT(&sn->sn_upd_req_list, &sc->sc_upd_req_list,
> ur_entry);
> 1591
> 1592 TAILQ_INIT(&sn->sn_tdb_q);
> 1593 TAILQ_CONCAT(&sn->sn_tdb_q, &sc->sc_tdb_q, tdb_sync_entry);
> 1594
>
>
> the problem with code above is that we just take care about heads of various
> queues. However individual objects may get re-inserted to queue on behalf of
> state update. This creates a havoc. The proposed change introduces a dedicated
> link member for snapshot, so we can move elements from sync_list to
> snapshot_list.
>
> The diff below does not hurt pfsync(4) in current tree, because
> we still don't forward packets in parallel. It will just make
> things bit easier for Hrvoje et. al. so we can keep smaller diff
> against current tree.
>
>
> OK ?
I think there is a use after free in your diff. After you return
from pfsync_delete_tdb() you must not access the TDB again.
Comments inline.
> thanks and
> regards
> sashan
>
> --------8<---------------8<---------------8<------------------8<--------
> diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c
> index cb0f3fbdf52..161f8c89317 100644
> --- a/sys/net/if_pfsync.c
> +++ b/sys/net/if_pfsync.c
> @@ -181,6 +181,7 @@ void pfsync_q_del(struct pf_state *);
>
> struct pfsync_upd_req_item {
> TAILQ_ENTRY(pfsync_upd_req_item) ur_entry;
> + TAILQ_ENTRY(pfsync_upd_req_item) ur_snap;
> struct pfsync_upd_req ur_msg;
> };
> TAILQ_HEAD(pfsync_upd_reqs, pfsync_upd_req_item);
> @@ -295,7 +296,7 @@ void pfsync_bulk_update(void *);
> void pfsync_bulk_fail(void *);
>
> void pfsync_grab_snapshot(struct pfsync_snapshot *, struct pfsync_softc *);
> -void pfsync_drop_snapshot(struct pfsync_snapshot *, struct pfsync_softc *);
> +void pfsync_drop_snapshot(struct pfsync_snapshot *);
>
> void pfsync_send_dispatch(void *);
> void pfsync_send_pkt(struct mbuf *);
> @@ -1574,6 +1575,9 @@ void
> pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc)
> {
> int q;
> + struct pf_state *st;
> + struct pfsync_upd_req_item *ur;
> + struct tdb *tdb;
>
> sn->sn_sc = sc;
>
> @@ -1583,14 +1587,33 @@ pfsync_grab_snapshot(struct pfsync_snapshot *sn,
> struct pfsync_softc *sc)
>
> for (q = 0; q < PFSYNC_S_COUNT; q++) {
> TAILQ_INIT(&sn->sn_qs[q]);
> - TAILQ_CONCAT(&sn->sn_qs[q], &sc->sc_qs[q], sync_list);
> +
> + while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) {
> +#ifdef PFSYNC_DEBUG
> + KASSERT(st->snapped == 0);
> +#endif
> + TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
> + TAILQ_INSERT_TAIL(&sn->sn_qs[q], st, sync_snap);
> + st->snapped = 1;
> + }
> }
>
> TAILQ_INIT(&sn->sn_upd_req_list);
> - TAILQ_CONCAT(&sn->sn_upd_req_list, &sc->sc_upd_req_list, ur_entry);
> + while (!TAILQ_EMPTY(&sc->sc_upd_req_list)) {
> + ur = TAILQ_FIRST(&sc->sc_upd_req_list);
Other loops have this idiom
	while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) {
> + TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry);
> + TAILQ_INSERT_TAIL(&sn->sn_upd_req_list, ur, ur_snap);
> + }
>
> TAILQ_INIT(&sn->sn_tdb_q);
> - TAILQ_CONCAT(&sn->sn_tdb_q, &sc->sc_tdb_q, tdb_sync_entry);
> + while ((tdb = TAILQ_FIRST(&sc->sc_tdb_q)) != NULL) {
> +#ifdef PFSYNC_DEBUG
> +		KASSERT(tdb->tdb_snapped == 0);
> +#endif
> + TAILQ_REMOVE(&sc->sc_tdb_q, tdb, tdb_sync_entry);
> + TAILQ_INSERT_TAIL(&sn->sn_tdb_q, tdb, tdb_sync_snap);
> + tdb->tdb_snapped = 1;
> + }
>
> sn->sn_len = sc->sc_len;
> sc->sc_len = PFSYNC_MINPKT;
> @@ -1606,41 +1629,44 @@ pfsync_grab_snapshot(struct pfsync_snapshot *sn,
> struct pfsync_softc *sc)
> }
>
> void
> -pfsync_drop_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc * sc)
> +pfsync_drop_snapshot(struct pfsync_snapshot *sn)
> {
> struct pf_state *st;
> struct pfsync_upd_req_item *ur;
> struct tdb *t;
> int q;
>
> -
> for (q = 0; q < PFSYNC_S_COUNT; q++) {
> if (TAILQ_EMPTY(&sn->sn_qs[q]))
> continue;
>
> while ((st = TAILQ_FIRST(&sn->sn_qs[q])) != NULL) {
> - TAILQ_REMOVE(&sn->sn_qs[q], st, sync_list);
> #ifdef PFSYNC_DEBUG
> KASSERT(st->sync_state == q);
> + KASSERT(st->snapped == 1);
> #endif
> + TAILQ_REMOVE(&sn->sn_qs[q], st, sync_snap);
> st->sync_state = PFSYNC_S_NONE;
> + st->snapped = 0;
> pf_state_unref(st);
> }
> }
>
> while ((ur = TAILQ_FIRST(&sn->sn_upd_req_list)) != NULL) {
> - TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_entry);
> + TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_snap);
> pool_put(&sn->sn_sc->sc_pool, ur);
> }
>
> - mtx_enter(&sc->sc_tdb_mtx);
> while ((t = TAILQ_FIRST(&sn->sn_tdb_q)) != NULL) {
> - TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_entry);
> + TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_snap);
> mtx_enter(&t->tdb_mtx);
> +#ifdef PFSYNC_DEBUG
> + KASSERT(t->tdb_snapped == 1);
> +#endif
> + t->tdb_snapped = 0;
> CLR(t->tdb_flags, TDBF_PFSYNC);
> mtx_leave(&t->tdb_mtx);
> }
> - mtx_leave(&sc->sc_tdb_mtx);
> }
>
> int
> @@ -1667,7 +1693,7 @@ pfsync_drop(struct pfsync_softc *sc)
> struct pfsync_snapshot sn;
>
> pfsync_grab_snapshot(&sn, sc);
> - pfsync_drop_snapshot(&sn, sc);
> + pfsync_drop_snapshot(&sn);
> }
>
> void
> @@ -1759,7 +1785,7 @@ pfsync_sendout(void)
> if (m == NULL) {
> sc->sc_if.if_oerrors++;
> pfsyncstat_inc(pfsyncs_onomem);
> - pfsync_drop_snapshot(&sn, sc);
> + pfsync_drop_snapshot(&sn);
> return;
> }
>
> @@ -1769,7 +1795,7 @@ pfsync_sendout(void)
> m_free(m);
> sc->sc_if.if_oerrors++;
> pfsyncstat_inc(pfsyncs_onomem);
> - pfsync_drop_snapshot(&sn, sc);
> + pfsync_drop_snapshot(&sn);
> return;
> }
> }
> @@ -1799,7 +1825,7 @@ pfsync_sendout(void)
>
> count = 0;
> while ((ur = TAILQ_FIRST(&sn.sn_upd_req_list)) != NULL) {
> - TAILQ_REMOVE(&sn.sn_upd_req_list, ur, ur_entry);
> + TAILQ_REMOVE(&sn.sn_upd_req_list, ur, ur_snap);
>
> bcopy(&ur->ur_msg, m->m_data + offset,
> sizeof(ur->ur_msg));
> @@ -1827,18 +1853,20 @@ pfsync_sendout(void)
> subh = (struct pfsync_subheader *)(m->m_data + offset);
> offset += sizeof(*subh);
>
> - mtx_enter(&sc->sc_tdb_mtx);
> count = 0;
> while ((t = TAILQ_FIRST(&sn.sn_tdb_q)) != NULL) {
> - TAILQ_REMOVE(&sn.sn_tdb_q, t, tdb_sync_entry);
> + TAILQ_REMOVE(&sn.sn_tdb_q, t, tdb_sync_snap);
I think the TDB in the tdb_sync_snap list may be freed. Maybe
you should grab a refcount if you store them in a list.
> pfsync_out_tdb(t, m->m_data + offset);
> offset += sizeof(struct pfsync_tdb);
> +#ifdef PFSYNC_DEBUG
> + KASSERT(t->tdb_snapped == 1);
> +#endif
> + t->tdb_snapped = 0;
This may also be a use after free.
> mtx_enter(&t->tdb_mtx);
> CLR(t->tdb_flags, TDBF_PFSYNC);
> mtx_leave(&t->tdb_mtx);
> count++;
> }
> - mtx_leave(&sc->sc_tdb_mtx);
>
> bzero(subh, sizeof(*subh));
> subh->action = PFSYNC_ACT_TDB;
> @@ -1856,11 +1884,13 @@ pfsync_sendout(void)
>
> count = 0;
> while ((st = TAILQ_FIRST(&sn.sn_qs[q])) != NULL) {
> - TAILQ_REMOVE(&sn.sn_qs[q], st, sync_list);
> + TAILQ_REMOVE(&sn.sn_qs[q], st, sync_snap);
> #ifdef PFSYNC_DEBUG
> KASSERT(st->sync_state == q);
> + KASSERT(st->snapped == 1);
> #endif
> st->sync_state = PFSYNC_S_NONE;
> + st->snapped = 0;
> pfsync_qs[q].write(st, m->m_data + offset);
> offset += pfsync_qs[q].len;
>
> @@ -2411,8 +2441,9 @@ pfsync_q_ins(struct pf_state *st, int q)
> mtx_enter(&sc->sc_st_mtx);
>
> /*
> - * If two threads are competing to insert the same state, then
> - * there must be just single winner.
> + * There are either two threads trying to update the
> + * the same state, or the state is just being processed
> + * (is on snapshot queue).
> */
> if (st->sync_state != PFSYNC_S_NONE) {
> mtx_leave(&sc->sc_st_mtx);
> @@ -2450,6 +2481,15 @@ pfsync_q_del(struct pf_state *st)
>
> mtx_enter(&sc->sc_st_mtx);
> q = st->sync_state;
> + /*
> + * re-check under mutex
> + * if state is snapped already, then just bail out, because we came
> + * too late, the state is being just processed/dispatched to peer.
> + */
> + if ((q == PFSYNC_S_NONE) || (st->snapped)) {
> + mtx_leave(&sc->sc_st_mtx);
> + return;
> + }
> atomic_sub_long(&sc->sc_len, pfsync_qs[q].len);
> TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list);
> if (TAILQ_EMPTY(&sc->sc_qs[q]))
> @@ -2525,7 +2565,17 @@ pfsync_delete_tdb(struct tdb *t)
>
> mtx_enter(&sc->sc_tdb_mtx);
>
> + /*
> + * if tdb entry is just being processed (found in snapshot),
> + * then it can not be deleted. we just came too late
> + */
> + if (t->tdb_snapped) {
> + mtx_leave(&sc->sc_tdb_mtx);
You must not access the TDB after this point. I think you cannot
guarantee that. The memory will be freed after return.
> + return;
> + }
> +
> TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry);
> +
> mtx_enter(&t->tdb_mtx);
> CLR(t->tdb_flags, TDBF_PFSYNC);
> mtx_leave(&t->tdb_mtx);
> diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h
> index bd7ec1d88e7..558618a0f14 100644
> --- a/sys/net/pfvar.h
> +++ b/sys/net/pfvar.h
> @@ -749,6 +749,7 @@ struct pf_state {
> u_int8_t pad[3];
>
> TAILQ_ENTRY(pf_state) sync_list;
> + TAILQ_ENTRY(pf_state) sync_snap;
> TAILQ_ENTRY(pf_state) entry_list;
> SLIST_ENTRY(pf_state) gc_list;
> RB_ENTRY(pf_state) entry_id;
> @@ -797,6 +798,7 @@ struct pf_state {
> pf_refcnt_t refcnt;
> u_int16_t delay;
> u_int8_t rt;
> + u_int8_t snapped;
> };
>
> /*
> diff --git a/sys/netinet/ip_ipsp.h b/sys/netinet/ip_ipsp.h
> index c697994047b..ebdb6ada1ae 100644
> --- a/sys/netinet/ip_ipsp.h
> +++ b/sys/netinet/ip_ipsp.h
> @@ -405,6 +405,7 @@ struct tdb { /* tunnel
> descriptor block */
> u_int8_t tdb_wnd; /* Replay window */
> u_int8_t tdb_satype; /* SA type (RFC2367, PF_KEY) */
> u_int8_t tdb_updates; /* pfsync update counter */
> + u_int8_t tdb_snapped; /* dispatched by pfsync(4) */
u_int8_t is not atomic. If you want to change tdb_snapped, you need
a mutex that also protects the other fields in the same 32 bit
word everywhere. I think a new flag TDBF_PFSYNC_SNAPPED in tdb_flags
would be easier. The tdb_flags are protected by tdb_mtx.
> union sockaddr_union tdb_dst; /* [N] Destination address */
> union sockaddr_union tdb_src; /* [N] Source address */
> @@ -439,6 +440,7 @@ struct tdb { /* tunnel
> descriptor block */
>
> TAILQ_HEAD(tdb_policy_head, ipsec_policy) tdb_policy_head; /* [p] */
> TAILQ_ENTRY(tdb) tdb_sync_entry;
> + TAILQ_ENTRY(tdb) tdb_sync_snap;
> };
>
> enum tdb_counters {