Hello, below is a diff, which makes pfsync(4) more pf_lock friendly. The diff has been sitting in my tree for some time. Hrvoje did some preliminary testing already. He could trigger some sparks and smoke, which I could put out. However, don't get too excited about the change yet. At this phase I really need some testing focused on spotting any undesired side effects.
There should be no change in pf(4) + pfsync(4) behaviour in terms of functionality or stability if the diff is applied as-is. All SMP related changes in pf/pfsync stuff are protected by a WITH_PF_LOCK guard. I think it's still worth giving it a try and testing the diff below _without_ 'WITH_PF_LOCK' being set, watching for changes in pfsync behavior. Brave hearts can compile a kernel with -DWITH_PF_LOCK. For example one can do something like this: cd sys/arch/`uname -p`/conf echo 'option WITH_PF_LOCK' >> GENERIC.MP config GENERIC.MP cd ../compile/GENERIC.MP/ make clean make The kernel, which is compiled with the -DWITH_PF_LOCK option, will execute the new pf_lock friendly code in pfsync. But don't expect any performance improvement yet. Remember there is no concurrency yet, which would run more instances of the pf_test() function. If testing doesn't reveal any problems I'll re-post the diff asking for OKs. The diff just extends the current pfsync implementation so the pfsync_update_state() function can run concurrently. Every state queue maintained by pfsync gets its own mutex to synchronize insert/remove operations. The send operation (pfsync_sendout()) is split into two steps: the first step is invoked synchronously on behalf of the caller from pf or a periodic task. Here we grab the mutex to move all states from the global queues to local data (a snapshot). We release the mutex once we have the snapshot. The next step moves data from the snapshot to a pfsync packet. Once this is done, we dispatch the packet to a task, which pushes the packet to the network. The changes on the reception side are not significant; we just teach pfsync to grab PF_LOCK() and the state rw-lock to insert states received from a peer. Feedback and testing are much appreciated. 
thanks and regards sashan --------8<---------------8<---------------8<------------------8<-------- diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c index 34dd370078a..e0508623b97 100644 --- a/sys/net/if_pfsync.c +++ b/sys/net/if_pfsync.c @@ -210,9 +210,11 @@ struct pfsync_softc { struct ip sc_template; struct pf_state_queue sc_qs[PFSYNC_S_COUNT]; + struct mutex sc_mtx[PFSYNC_S_COUNT]; size_t sc_len; struct pfsync_upd_reqs sc_upd_req_list; + struct mutex sc_upd_req_mtx; int sc_initial_bulk; int sc_link_demoted; @@ -220,6 +222,7 @@ struct pfsync_softc { int sc_defer; struct pfsync_deferrals sc_deferrals; u_int sc_deferred; + struct mutex sc_deferrals_mtx; void *sc_plus; size_t sc_pluslen; @@ -234,6 +237,7 @@ struct pfsync_softc { struct timeout sc_bulk_tmo; TAILQ_HEAD(, tdb) sc_tdb_q; + struct mutex sc_tdb_mtx; struct task sc_ltask; struct task sc_dtask; @@ -241,6 +245,16 @@ struct pfsync_softc { struct timeout sc_tmo; }; +struct pfsync_snapshot { + struct pfsync_softc *sn_sc; + struct pf_state_queue sn_qs[PFSYNC_S_COUNT]; + struct pfsync_upd_reqs sn_upd_req_list; + TAILQ_HEAD(, tdb) sn_tdb_q; + size_t sn_len; + void *sn_plus; + size_t sn_pluslen; +}; + struct pfsync_softc *pfsyncif = NULL; struct cpumem *pfsynccounters; @@ -276,6 +290,10 @@ void pfsync_bulk_start(void); void pfsync_bulk_status(u_int8_t); void pfsync_bulk_update(void *); void pfsync_bulk_fail(void *); + +void pfsync_grab_snapshot(struct pfsync_snapshot *, struct pfsync_softc *); +void pfsync_drop_snapshot(struct pfsync_snapshot *); + #ifdef WITH_PF_LOCK void pfsync_send_dispatch(void *); void pfsync_send_pkt(struct mbuf *); @@ -314,18 +332,23 @@ pfsync_clone_create(struct if_clone *ifc, int unit) pfsync_sync_ok = 1; sc = malloc(sizeof(*pfsyncif), M_DEVBUF, M_WAITOK|M_ZERO); - for (q = 0; q < PFSYNC_S_COUNT; q++) + for (q = 0; q < PFSYNC_S_COUNT; q++) { TAILQ_INIT(&sc->sc_qs[q]); + mtx_init(&sc->sc_mtx[q], IPL_SOFTNET); + } pool_init(&sc->sc_pool, PFSYNC_PLSIZE, 0, IPL_SOFTNET, 0, "pfsync", 
NULL); TAILQ_INIT(&sc->sc_upd_req_list); + mtx_init(&sc->sc_upd_req_mtx, IPL_SOFTNET); TAILQ_INIT(&sc->sc_deferrals); + mtx_init(&sc->sc_deferrals_mtx, IPL_SOFTNET); task_set(&sc->sc_ltask, pfsync_syncdev_state, sc); task_set(&sc->sc_dtask, pfsync_ifdetach, sc); sc->sc_deferred = 0; TAILQ_INIT(&sc->sc_tdb_q); + mtx_init(&sc->sc_tdb_mtx, IPL_SOFTNET); sc->sc_len = PFSYNC_MINPKT; sc->sc_maxupdates = 128; @@ -370,6 +393,7 @@ pfsync_clone_destroy(struct ifnet *ifp) { struct pfsync_softc *sc = ifp->if_softc; struct pfsync_deferral *pd; + struct pfsync_deferrals deferrals; NET_LOCK(); @@ -391,10 +415,18 @@ pfsync_clone_destroy(struct ifnet *ifp) pfsync_drop(sc); - while (sc->sc_deferred > 0) { - pd = TAILQ_FIRST(&sc->sc_deferrals); - timeout_del(&pd->pd_tmo); - pfsync_undefer(pd, 0); + if (sc->sc_deferred > 0) { + TAILQ_INIT(&deferrals); + mtx_enter(&sc->sc_deferrals_mtx); + TAILQ_CONCAT(&deferrals, &sc->sc_deferrals, pd_entry); + sc->sc_deferred = 0; + mtx_leave(&sc->sc_deferrals_mtx); + + while (!TAILQ_EMPTY(&deferrals)) { + pd = TAILQ_FIRST(&deferrals); + TAILQ_REMOVE(&deferrals, pd, pd_entry); + pfsync_undefer(pd, 0); + } } pfsyncif = NULL; @@ -764,10 +796,8 @@ pfsync_input(struct mbuf **mp, int *offp, int proto, int af) return IPPROTO_DONE; } - PF_LOCK(); e = pfsync_acts[subh.action].in(n->m_data + noff, mlen, count, flags); - PF_UNLOCK(); if (e != 0) goto done; @@ -788,6 +818,7 @@ pfsync_in_clr(caddr_t buf, int len, int count, int flags) u_int32_t creatorid; int i; + PF_LOCK(); for (i = 0; i < count; i++) { clr = (struct pfsync_clr *)buf + len * i; kif = NULL; @@ -796,6 +827,7 @@ pfsync_in_clr(caddr_t buf, int len, int count, int flags) (kif = pfi_kif_find(clr->ifname)) == NULL) continue; + PF_STATE_ENTER_WRITE(); for (st = RB_MIN(pf_state_tree_id, &tree_id); st; st = nexts) { nexts = RB_NEXT(pf_state_tree_id, &tree_id, st); if (st->creatorid == creatorid && @@ -804,7 +836,9 @@ pfsync_in_clr(caddr_t buf, int len, int count, int flags) pf_remove_state(st); } } + 
PF_STATE_EXIT_WRITE(); } + PF_UNLOCK(); return (0); } @@ -816,6 +850,7 @@ pfsync_in_ins(caddr_t buf, int len, int count, int flags) sa_family_t af1, af2; int i; + PF_LOCK(); for (i = 0; i < count; i++) { sp = (struct pfsync_state *)(buf + len * i); af1 = sp->key[0].af; @@ -841,6 +876,7 @@ pfsync_in_ins(caddr_t buf, int len, int count, int flags) break; } } + PF_UNLOCK(); return (0); } @@ -859,12 +895,17 @@ pfsync_in_iack(caddr_t buf, int len, int count, int flags) id_key.id = ia->id; id_key.creatorid = ia->creatorid; + PF_STATE_ENTER_READ(); st = pf_find_state_byid(&id_key); + pf_state_ref(st); + PF_STATE_EXIT_READ(); if (st == NULL) continue; if (ISSET(st->state_flags, PFSTATE_ACK)) pfsync_deferred(st, 0); + + pf_state_unref(st); } return (0); @@ -908,8 +949,7 @@ pfsync_in_upd(caddr_t buf, int len, int count, int flags) struct pfsync_state *sp; struct pf_state_cmp id_key; struct pf_state *st; - int sync; - + int sync, error; int i; for (i = 0; i < count; i++) { @@ -928,11 +968,17 @@ pfsync_in_upd(caddr_t buf, int len, int count, int flags) id_key.id = sp->id; id_key.creatorid = sp->creatorid; + PF_STATE_ENTER_READ(); st = pf_find_state_byid(&id_key); + pf_state_ref(st); + PF_STATE_EXIT_READ(); if (st == NULL) { /* insert the update */ - if (pfsync_state_import(sp, flags)) + PF_LOCK(); + error = pfsync_state_import(sp, flags); + if (error) pfsyncstat_inc(pfsyncs_badstate); + PF_UNLOCK(); continue; } @@ -970,9 +1016,11 @@ pfsync_in_upd(caddr_t buf, int len, int count, int flags) if (sync) { pfsyncstat_inc(pfsyncs_stale); - pfsync_update_state_locked(st); + pfsync_update_state(st); schednetisr(NETISR_PFSYNC); } + + pf_state_unref(st); } return (0); @@ -1005,7 +1053,10 @@ pfsync_in_upd_c(caddr_t buf, int len, int count, int flags) id_key.id = up->id; id_key.creatorid = up->creatorid; + PF_STATE_ENTER_READ(); st = pf_find_state_byid(&id_key); + pf_state_ref(st); + PF_STATE_EXIT_READ(); if (st == NULL) { /* We don't have this state. Ask for it. 
*/ pfsync_request_update(id_key.creatorid, id_key.id); @@ -1044,9 +1095,11 @@ pfsync_in_upd_c(caddr_t buf, int len, int count, int flags) if (sync) { pfsyncstat_inc(pfsyncs_stale); - pfsync_update_state_locked(st); + pfsync_update_state(st); schednetisr(NETISR_PFSYNC); } + + pf_state_unref(st); } return (0); @@ -1070,15 +1123,21 @@ pfsync_in_ureq(caddr_t buf, int len, int count, int flags) if (id_key.id == 0 && id_key.creatorid == 0) pfsync_bulk_start(); else { + PF_STATE_ENTER_READ(); st = pf_find_state_byid(&id_key); + pf_state_ref(st); + PF_STATE_EXIT_READ(); if (st == NULL) { pfsyncstat_inc(pfsyncs_badstate); continue; } - if (ISSET(st->state_flags, PFSTATE_NOSYNC)) + if (ISSET(st->state_flags, PFSTATE_NOSYNC)) { + pf_state_unref(st); continue; + } pfsync_update_state_req(st); + pf_state_unref(st); } } @@ -1093,6 +1152,7 @@ pfsync_in_del(caddr_t buf, int len, int count, int flags) struct pf_state *st; int i; + PF_STATE_ENTER_WRITE(); for (i = 0; i < count; i++) { sp = (struct pfsync_state *)(buf + len * i); @@ -1101,12 +1161,14 @@ pfsync_in_del(caddr_t buf, int len, int count, int flags) st = pf_find_state_byid(&id_key); if (st == NULL) { + PF_STATE_EXIT_WRITE(); pfsyncstat_inc(pfsyncs_badstate); continue; } SET(st->state_flags, PFSTATE_NOSYNC); pf_remove_state(st); } + PF_STATE_EXIT_WRITE(); return (0); } @@ -1119,6 +1181,8 @@ pfsync_in_del_c(caddr_t buf, int len, int count, int flags) struct pf_state *st; int i; + PF_LOCK(); + PF_STATE_ENTER_WRITE(); for (i = 0; i < count; i++) { sp = (struct pfsync_del_c *)(buf + len * i); @@ -1134,6 +1198,8 @@ pfsync_in_del_c(caddr_t buf, int len, int count, int flags) SET(st->state_flags, PFSTATE_NOSYNC); pf_remove_state(st); } + PF_STATE_EXIT_WRITE(); + PF_UNLOCK(); return (0); } @@ -1473,19 +1539,59 @@ pfsync_out_del(struct pf_state *st, void *buf) } void -pfsync_drop(struct pfsync_softc *sc) +pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc) +{ + int q; + + sn->sn_sc = sc; + + for (q = 0; q < 
PFSYNC_S_COUNT; q++) + mtx_enter(&sc->sc_mtx[q]); + + mtx_enter(&sc->sc_upd_req_mtx); + mtx_enter(&sc->sc_tdb_mtx); + + for (q = 0; q < PFSYNC_S_COUNT; q++) { + TAILQ_INIT(&sn->sn_qs[q]); + TAILQ_CONCAT(&sn->sn_qs[q], &sc->sc_qs[q], sync_list); + } + + TAILQ_INIT(&sn->sn_upd_req_list); + TAILQ_CONCAT(&sn->sn_upd_req_list, &sc->sc_upd_req_list, ur_entry); + + TAILQ_INIT(&sn->sn_tdb_q); + TAILQ_CONCAT(&sn->sn_tdb_q, &sc->sc_tdb_q, tdb_sync_entry); + + sn->sn_len = sc->sc_len; + sc->sc_len = PFSYNC_MINPKT; + + sn->sn_plus = sc->sc_plus; + sc->sc_plus = NULL; + sn->sn_pluslen = sc->sc_pluslen; + sc->sc_pluslen = 0; + + mtx_leave(&sc->sc_tdb_mtx); + mtx_leave(&sc->sc_upd_req_mtx); + + for (q = (PFSYNC_S_COUNT - 1); q >= 0; q--) + mtx_leave(&sc->sc_mtx[q]); +} + +void +pfsync_drop_snapshot(struct pfsync_snapshot *sn) { struct pf_state *st; struct pfsync_upd_req_item *ur; struct tdb *t; int q; + for (q = 0; q < PFSYNC_S_COUNT; q++) { - if (TAILQ_EMPTY(&sc->sc_qs[q])) + if (TAILQ_EMPTY(&sn->sn_qs[q])) continue; - while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) { - TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list); + while ((st = TAILQ_FIRST(&sn->sn_qs[q])) != NULL) { + TAILQ_REMOVE(&sn->sn_qs[q], st, sync_list); #ifdef PFSYNC_DEBUG KASSERT(st->sync_state == q); #endif @@ -1494,19 +1600,42 @@ pfsync_drop(struct pfsync_softc *sc) } } - while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) { - TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry); - pool_put(&sc->sc_pool, ur); + while ((ur = TAILQ_FIRST(&sn->sn_upd_req_list)) != NULL) { + TAILQ_REMOVE(&sn->sn_upd_req_list, ur, ur_entry); + pool_put(&sn->sn_sc->sc_pool, ur); } - sc->sc_plus = NULL; - - while ((t = TAILQ_FIRST(&sc->sc_tdb_q)) != NULL) { - TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry); + while ((t = TAILQ_FIRST(&sn->sn_tdb_q)) != NULL) { + TAILQ_REMOVE(&sn->sn_tdb_q, t, tdb_sync_entry); CLR(t->tdb_flags, TDBF_PFSYNC); } +} - sc->sc_len = PFSYNC_MINPKT; +int +pfsync_is_snapshot_empty(struct pfsync_snapshot *sn) +{ + 
int q; + + for (q = 0; q < PFSYNC_S_COUNT; q++) + if (!TAILQ_EMPTY(&sn->sn_qs[q])) + return (0); + + if (!TAILQ_EMPTY(&sn->sn_upd_req_list)) + return (0); + + if (!TAILQ_EMPTY(&sn->sn_tdb_q)) + return (0); + + return (sn->sn_plus == NULL); +} + +void +pfsync_drop(struct pfsync_softc *sc) +{ + struct pfsync_snapshot sn; + + pfsync_grab_snapshot(&sn, sc); + pfsync_drop_snapshot(&sn); } #ifdef WITH_PF_LOCK @@ -1559,6 +1688,7 @@ pfsync_send_pkt(struct mbuf *m) void pfsync_sendout(void) { + struct pfsync_snapshot sn; struct pfsync_softc *sc = pfsyncif; #if NBPFILTER > 0 struct ifnet *ifp = &sc->sc_if; @@ -1570,12 +1700,9 @@ pfsync_sendout(void) struct pf_state *st; struct pfsync_upd_req_item *ur; struct tdb *t; - int offset; int q, count = 0; - PF_ASSERT_LOCKED(); - if (sc == NULL || sc->sc_len == PFSYNC_MINPKT) return; @@ -1589,26 +1716,35 @@ pfsync_sendout(void) return; } + pfsync_grab_snapshot(&sn, sc); + + /* + * Check below is sufficient to prevent us from sending empty packets, + * but it does not stop us from sending short packets. 
+ */ + if (pfsync_is_snapshot_empty(&sn)) + return; + MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == NULL) { sc->sc_if.if_oerrors++; pfsyncstat_inc(pfsyncs_onomem); - pfsync_drop(sc); + pfsync_drop_snapshot(&sn); return; } - if (max_linkhdr + sc->sc_len > MHLEN) { - MCLGETI(m, M_DONTWAIT, NULL, max_linkhdr + sc->sc_len); + if (max_linkhdr + sn.sn_len > MHLEN) { + MCLGETI(m, M_DONTWAIT, NULL, max_linkhdr + sn.sn_len); if (!ISSET(m->m_flags, M_EXT)) { m_free(m); sc->sc_if.if_oerrors++; pfsyncstat_inc(pfsyncs_onomem); - pfsync_drop(sc); + pfsync_drop_snapshot(&sn); return; } } m->m_data += max_linkhdr; - m->m_len = m->m_pkthdr.len = sc->sc_len; + m->m_len = m->m_pkthdr.len = sn.sn_len; /* build the ip header */ ip = mtod(m, struct ip *); @@ -1624,16 +1760,16 @@ pfsync_sendout(void) offset += sizeof(*ph); ph->version = PFSYNC_VERSION; - ph->len = htons(sc->sc_len - sizeof(*ip)); + ph->len = htons(sn.sn_len - sizeof(*ip)); bcopy(pf_status.pf_chksum, ph->pfcksum, PF_MD5_DIGEST_LENGTH); - if (!TAILQ_EMPTY(&sc->sc_upd_req_list)) { + if (!TAILQ_EMPTY(&sn.sn_upd_req_list)) { subh = (struct pfsync_subheader *)(m->m_data + offset); offset += sizeof(*subh); count = 0; - while ((ur = TAILQ_FIRST(&sc->sc_upd_req_list)) != NULL) { - TAILQ_REMOVE(&sc->sc_upd_req_list, ur, ur_entry); + while ((ur = TAILQ_FIRST(&sn.sn_upd_req_list)) != NULL) { + TAILQ_REMOVE(&sn.sn_upd_req_list, ur, ur_entry); bcopy(&ur->ur_msg, m->m_data + offset, sizeof(ur->ur_msg)); @@ -1651,20 +1787,19 @@ pfsync_sendout(void) } /* has someone built a custom region for us to add? */ - if (sc->sc_plus != NULL) { - bcopy(sc->sc_plus, m->m_data + offset, sc->sc_pluslen); - offset += sc->sc_pluslen; - - sc->sc_plus = NULL; + if (sn.sn_plus != NULL) { + bcopy(sn.sn_plus, m->m_data + offset, sn.sn_pluslen); + offset += sn.sn_pluslen; + sn.sn_plus = NULL; /* XXX memory leak ? 
*/ } - if (!TAILQ_EMPTY(&sc->sc_tdb_q)) { + if (!TAILQ_EMPTY(&sn.sn_tdb_q)) { subh = (struct pfsync_subheader *)(m->m_data + offset); offset += sizeof(*subh); count = 0; - while ((t = TAILQ_FIRST(&sc->sc_tdb_q)) != NULL) { - TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry); + while ((t = TAILQ_FIRST(&sn.sn_tdb_q)) != NULL) { + TAILQ_REMOVE(&sn.sn_tdb_q, t, tdb_sync_entry); pfsync_out_tdb(t, m->m_data + offset); offset += sizeof(struct pfsync_tdb); CLR(t->tdb_flags, TDBF_PFSYNC); @@ -1679,15 +1814,15 @@ pfsync_sendout(void) /* walk the queues */ for (q = 0; q < PFSYNC_S_COUNT; q++) { - if (TAILQ_EMPTY(&sc->sc_qs[q])) + if (TAILQ_EMPTY(&sn.sn_qs[q])) continue; subh = (struct pfsync_subheader *)(m->m_data + offset); offset += sizeof(*subh); count = 0; - while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) { - TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list); + while ((st = TAILQ_FIRST(&sn.sn_qs[q])) != NULL) { + TAILQ_REMOVE(&sn.sn_qs[q], st, sync_list); st->sync_state = PFSYNC_S_NONE; #ifdef PFSYNC_DEBUG KASSERT(st->sync_state == q); @@ -1709,22 +1844,18 @@ pfsync_sendout(void) #if NBPFILTER > 0 if (ifp->if_bpf) { m->m_data += sizeof(*ip); - m->m_len = m->m_pkthdr.len = sc->sc_len - sizeof(*ip); + m->m_len = m->m_pkthdr.len = sn.sn_len - sizeof(*ip); bpf_mtap(ifp->if_bpf, m, BPF_DIRECTION_OUT); m->m_data -= sizeof(*ip); - m->m_len = m->m_pkthdr.len = sc->sc_len; + m->m_len = m->m_pkthdr.len = sn.sn_len; } if (sc->sc_sync_if == NULL) { - sc->sc_len = PFSYNC_MINPKT; m_freem(m); return; } #endif - /* start again */ - sc->sc_len = PFSYNC_MINPKT; - sc->sc_if.if_opackets++; sc->sc_if.if_obytes += m->m_pkthdr.len; @@ -1783,7 +1914,11 @@ pfsync_defer(struct pf_state *st, struct mbuf *m) return (0); if (sc->sc_deferred >= 128) { + mtx_enter(&sc->sc_deferrals_mtx); pd = TAILQ_FIRST(&sc->sc_deferrals); + TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); + sc->sc_deferred--; + mtx_leave(&sc->sc_deferrals_mtx); if (timeout_del(&pd->pd_tmo)) pfsync_undefer(pd, 0); } @@ -1798,8 +1933,10 @@ 
pfsync_defer(struct pf_state *st, struct mbuf *m) pd->pd_st = pf_state_ref(st); pd->pd_m = m; + mtx_enter(&sc->sc_deferrals_mtx); sc->sc_deferred++; TAILQ_INSERT_TAIL(&sc->sc_deferrals, pd, pd_entry); + mtx_leave(&sc->sc_deferrals_mtx); timeout_set_proc(&pd->pd_tmo, pfsync_defer_tmo, pd); timeout_add_msec(&pd->pd_tmo, 20); @@ -1810,73 +1947,96 @@ pfsync_defer(struct pf_state *st, struct mbuf *m) } void -pfsync_undefer(struct pfsync_deferral *pd, int drop) +pfsync_undefer_notify(struct pfsync_deferral *pd) { - struct pfsync_softc *sc = pfsyncif; struct pf_pdesc pdesc; - NET_ASSERT_LOCKED(); - - if (sc == NULL) - return; - - TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); - sc->sc_deferred--; - - CLR(pd->pd_st->state_flags, PFSTATE_ACK); - if (drop) - m_freem(pd->pd_m); - else { - if (pd->pd_st->rule.ptr->rt == PF_ROUTETO) { - if (pf_setup_pdesc(&pdesc, - pd->pd_st->key[PF_SK_WIRE]->af, - pd->pd_st->direction, pd->pd_st->rt_kif, - pd->pd_m, NULL) != PF_PASS) { - m_freem(pd->pd_m); - goto out; - } - switch (pd->pd_st->key[PF_SK_WIRE]->af) { - case AF_INET: - pf_route(&pdesc, - pd->pd_st->rule.ptr, pd->pd_st); - break; + if (pd->pd_st->rule.ptr->rt == PF_ROUTETO) { + if (pf_setup_pdesc(&pdesc, + pd->pd_st->key[PF_SK_WIRE]->af, + pd->pd_st->direction, pd->pd_st->rt_kif, + pd->pd_m, NULL) != PF_PASS) { + m_freem(pd->pd_m); + return; + } + switch (pd->pd_st->key[PF_SK_WIRE]->af) { + case AF_INET: + pf_route(&pdesc, + pd->pd_st->rule.ptr, pd->pd_st); + break; #ifdef INET6 - case AF_INET6: - pf_route6(&pdesc, - pd->pd_st->rule.ptr, pd->pd_st); - break; + case AF_INET6: + pf_route6(&pdesc, + pd->pd_st->rule.ptr, pd->pd_st); + break; #endif /* INET6 */ - default: - unhandled_af(pd->pd_st->key[PF_SK_WIRE]->af); - } - pd->pd_m = pdesc.m; - } else { - switch (pd->pd_st->key[PF_SK_WIRE]->af) { - case AF_INET: - ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL, - 0); - break; + default: + unhandled_af(pd->pd_st->key[PF_SK_WIRE]->af); + } + pd->pd_m = pdesc.m; + } else { + switch 
(pd->pd_st->key[PF_SK_WIRE]->af) { + case AF_INET: + ip_output(pd->pd_m, NULL, NULL, 0, NULL, NULL, + 0); + break; #ifdef INET6 - case AF_INET6: - ip6_output(pd->pd_m, NULL, NULL, 0, - NULL, NULL); - break; + case AF_INET6: + ip6_output(pd->pd_m, NULL, NULL, 0, + NULL, NULL); + break; #endif /* INET6 */ - default: - unhandled_af(pd->pd_st->key[PF_SK_WIRE]->af); - } + default: + unhandled_af(pd->pd_st->key[PF_SK_WIRE]->af); } + + pd->pd_m = NULL; } - out: +} + +void +pfsync_free_deferral(struct pfsync_deferral *pd) +{ + struct pfsync_softc *sc = pfsyncif; + pf_state_unref(pd->pd_st); + if (pd->pd_m != NULL) + m_freem(pd->pd_m); pool_put(&sc->sc_pool, pd); } +void +pfsync_undefer(struct pfsync_deferral *pd, int drop) +{ + struct pfsync_softc *sc = pfsyncif; + + NET_ASSERT_LOCKED(); + + if (sc == NULL) + return; + + CLR(pd->pd_st->state_flags, PFSTATE_ACK); + if (drop) { + m_freem(pd->pd_m); + pd->pd_m = NULL; + } else + pfsync_undefer_notify(pd); + + pfsync_free_deferral(pd); +} + void pfsync_defer_tmo(void *arg) { + struct pfsync_softc *sc = pfsyncif; + struct pfsync_deferral *pd = arg; + + mtx_enter(&sc->sc_deferrals_mtx); + TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); + sc->sc_deferred--; + mtx_leave(&sc->sc_deferrals_mtx); NET_LOCK(); - pfsync_undefer(arg, 0); + pfsync_undefer(pd, 0); NET_UNLOCK(); } @@ -1888,25 +2048,29 @@ pfsync_deferred(struct pf_state *st, int drop) NET_ASSERT_LOCKED(); + mtx_enter(&sc->sc_deferrals_mtx); TAILQ_FOREACH(pd, &sc->sc_deferrals, pd_entry) { if (pd->pd_st == st) { - if (timeout_del(&pd->pd_tmo)) + if (timeout_del(&pd->pd_tmo)) { + TAILQ_REMOVE(&sc->sc_deferrals, pd, pd_entry); + sc->sc_deferred--; + mtx_leave(&sc->sc_deferrals_mtx); pfsync_undefer(pd, drop); + } else + mtx_leave(&sc->sc_deferrals_mtx); return; } } - - panic("pfsync_deferred: unable to find deferred state"); + mtx_leave(&sc->sc_deferrals_mtx); } void -pfsync_update_state_locked(struct pf_state *st) +pfsync_update_state(struct pf_state *st) { struct pfsync_softc 
*sc = pfsyncif; int sync = 0; NET_ASSERT_LOCKED(); - PF_ASSERT_LOCKED(); if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING)) return; @@ -1951,22 +2115,6 @@ pfsync_update_state_locked(struct pf_state *st) schednetisr(NETISR_PFSYNC); } -void -pfsync_update_state(struct pf_state *st, int *have_pf_lock) -{ - struct pfsync_softc *sc = pfsyncif; - - if (sc == NULL || !ISSET(sc->sc_if.if_flags, IFF_RUNNING)) - return; - - if (*have_pf_lock == 0) { - PF_LOCK(); - *have_pf_lock = 1; - } - - pfsync_update_state_locked(st); -} - void pfsync_cancel_full_update(struct pfsync_softc *sc) { @@ -2019,7 +2167,7 @@ pfsync_request_update(u_int32_t creatorid, u_int64_t id) { struct pfsync_softc *sc = pfsyncif; struct pfsync_upd_req_item *item; - size_t nlen = sizeof(struct pfsync_upd_req); + size_t nlen, sc_len; /* * this code does nothing to prevent multiple update requests for the @@ -2035,18 +2183,24 @@ pfsync_request_update(u_int32_t creatorid, u_int64_t id) item->ur_msg.id = id; item->ur_msg.creatorid = creatorid; - if (TAILQ_EMPTY(&sc->sc_upd_req_list)) - nlen += sizeof(struct pfsync_subheader); + do { + mtx_enter(&sc->sc_upd_req_mtx); - if (sc->sc_len + nlen > sc->sc_if.if_mtu) { - pfsync_sendout(); + nlen = sizeof(struct pfsync_upd_req); + if (TAILQ_EMPTY(&sc->sc_upd_req_list)) + nlen += sizeof(struct pfsync_subheader); - nlen = sizeof(struct pfsync_subheader) + - sizeof(struct pfsync_upd_req); - } + sc_len = atomic_add_long_nv(&sc->sc_len, nlen); + if (sc_len > sc->sc_if.if_mtu) { + atomic_sub_long(&sc->sc_len, nlen); + mtx_leave(&sc->sc_upd_req_mtx); + pfsync_sendout(); + continue; + } - TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry); - sc->sc_len += nlen; + TAILQ_INSERT_TAIL(&sc->sc_upd_req_list, item, ur_entry); + mtx_leave(&sc->sc_upd_req_mtx); + } while (0); schednetisr(NETISR_PFSYNC); } @@ -2172,27 +2326,45 @@ void pfsync_q_ins(struct pf_state *st, int q) { struct pfsync_softc *sc = pfsyncif; - size_t nlen = pfsync_qs[q].len; + size_t nlen, sc_len; 
KASSERT(st->sync_state == PFSYNC_S_NONE); #if defined(PFSYNC_DEBUG) if (sc->sc_len < PFSYNC_MINPKT) - panic("pfsync pkt len is too low %d", sc->sc_len); + panic("pfsync pkt len is too low %zu", sc->sc_len); #endif - if (TAILQ_EMPTY(&sc->sc_qs[q])) - nlen += sizeof(struct pfsync_subheader); + do { + mtx_enter(&sc->sc_mtx[q]); - if (sc->sc_len + nlen > sc->sc_if.if_mtu) { - pfsync_sendout(); + /* + * If two threads are competing to insert the same state, then + * there must be just single winner. + */ + if (st->sync_state != PFSYNC_S_NONE) { + mtx_leave(&sc->sc_mtx[q]); + break; + } - nlen = sizeof(struct pfsync_subheader) + pfsync_qs[q].len; - } + nlen = pfsync_qs[q].len; + + if (TAILQ_EMPTY(&sc->sc_qs[q])) + nlen += sizeof(struct pfsync_subheader); + + sc_len = atomic_add_long_nv(&sc->sc_len, nlen); + if (sc_len > sc->sc_if.if_mtu) { + atomic_sub_long(&sc->sc_len, nlen); + mtx_leave(&sc->sc_mtx[q]); + pfsync_sendout(); + continue; + } + + pf_state_ref(st); - sc->sc_len += nlen; - pf_state_ref(st); - TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list); - st->sync_state = q; + TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list); + st->sync_state = q; + mtx_leave(&sc->sc_mtx[q]); + } while (0); } void @@ -2203,39 +2375,49 @@ pfsync_q_del(struct pf_state *st) KASSERT(st->sync_state != PFSYNC_S_NONE); - sc->sc_len -= pfsync_qs[q].len; + mtx_enter(&sc->sc_mtx[q]); + atomic_sub_long(&sc->sc_len, pfsync_qs[q].len); TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list); + if (TAILQ_EMPTY(&sc->sc_qs[q])) + atomic_sub_long(&sc->sc_len, sizeof (struct pfsync_subheader)); + mtx_leave(&sc->sc_mtx[q]); + st->sync_state = PFSYNC_S_NONE; pf_state_unref(st); - if (TAILQ_EMPTY(&sc->sc_qs[q])) - sc->sc_len -= sizeof(struct pfsync_subheader); } void pfsync_update_tdb(struct tdb *t, int output) { struct pfsync_softc *sc = pfsyncif; - size_t nlen = sizeof(struct pfsync_tdb); + size_t nlen, sc_len; if (sc == NULL) return; if (!ISSET(t->tdb_flags, TDBF_PFSYNC)) { - if (TAILQ_EMPTY(&sc->sc_tdb_q)) - nlen += 
sizeof(struct pfsync_subheader); - - if (sc->sc_len + nlen > sc->sc_if.if_mtu) { - pfsync_sendout(); + do { + mtx_enter(&sc->sc_tdb_mtx); + nlen = sizeof(struct pfsync_tdb); + + if (TAILQ_EMPTY(&sc->sc_tdb_q)) + nlen += sizeof(struct pfsync_subheader); + + sc_len = atomic_add_long_nv(&sc->sc_len, nlen); + if (sc_len > sc->sc_if.if_mtu) { + atomic_sub_long(&sc->sc_len, nlen); + mtx_leave(&sc->sc_tdb_mtx); + pfsync_sendout(); + continue; + } - nlen = sizeof(struct pfsync_subheader) + - sizeof(struct pfsync_tdb); - } + TAILQ_INSERT_TAIL(&sc->sc_tdb_q, t, tdb_sync_entry); + mtx_leave(&sc->sc_tdb_mtx); - sc->sc_len += nlen; - TAILQ_INSERT_TAIL(&sc->sc_tdb_q, t, tdb_sync_entry); - SET(t->tdb_flags, TDBF_PFSYNC); - t->tdb_updates = 0; + SET(t->tdb_flags, TDBF_PFSYNC); + t->tdb_updates = 0; + } while (0); } else { if (++t->tdb_updates >= sc->sc_maxupdates) schednetisr(NETISR_PFSYNC); @@ -2251,16 +2433,22 @@ void pfsync_delete_tdb(struct tdb *t) { struct pfsync_softc *sc = pfsyncif; + size_t nlen; if (sc == NULL || !ISSET(t->tdb_flags, TDBF_PFSYNC)) return; - sc->sc_len -= sizeof(struct pfsync_tdb); + mtx_enter(&sc->sc_tdb_mtx); + TAILQ_REMOVE(&sc->sc_tdb_q, t, tdb_sync_entry); CLR(t->tdb_flags, TDBF_PFSYNC); + nlen = sizeof(struct pfsync_tdb); if (TAILQ_EMPTY(&sc->sc_tdb_q)) - sc->sc_len -= sizeof(struct pfsync_subheader); + nlen += sizeof(struct pfsync_subheader); + atomic_sub_long(&sc->sc_len, nlen); + + mtx_leave(&sc->sc_tdb_mtx); } void @@ -2308,9 +2496,14 @@ pfsync_bulk_start(void) else { sc->sc_ureq_received = time_uptime; - if (sc->sc_bulk_next == NULL) + if (sc->sc_bulk_next == NULL) { + PF_STATE_ENTER_READ(); sc->sc_bulk_next = TAILQ_FIRST(&state_list); + pf_state_ref(sc->sc_bulk_next); + PF_STATE_EXIT_READ(); + } sc->sc_bulk_last = sc->sc_bulk_next; + pf_state_ref(sc->sc_bulk_last); pfsync_bulk_status(PFSYNC_BUS_START); timeout_add(&sc->sc_bulk_tmo, 0); @@ -2329,22 +2522,32 @@ pfsync_bulk_update(void *arg) if (sc == NULL) goto out; st = sc->sc_bulk_next; + 
sc->sc_bulk_next = NULL; for (;;) { if (st->sync_state == PFSYNC_S_NONE && st->timeout < PFTM_MAX && st->pfsync_time <= sc->sc_ureq_received) { pfsync_update_state_req(st); + pf_state_unref(st); i++; } + /* + * I wonder how we prevent infinite bulk update. IMO it can + * happen when sc_bulk_last state expires before we iterate + * through the whole list. + */ + PF_STATE_ENTER_READ(); st = TAILQ_NEXT(st, entry_list); if (st == NULL) st = TAILQ_FIRST(&state_list); + pf_state_ref(st); + PF_STATE_EXIT_READ(); if (st == sc->sc_bulk_last) { /* we're done */ - sc->sc_bulk_next = NULL; + pf_state_unref(sc->sc_bulk_last); sc->sc_bulk_last = NULL; pfsync_bulk_status(PFSYNC_BUS_END); break; @@ -2467,9 +2670,7 @@ void pfsync_timeout(void *arg) { NET_LOCK(); - PF_LOCK(); pfsync_sendout(); - PF_UNLOCK(); NET_UNLOCK(); } @@ -2477,9 +2678,7 @@ pfsync_timeout(void *arg) void pfsyncintr(void) { - PF_LOCK(); pfsync_sendout(); - PF_UNLOCK(); } int diff --git a/sys/net/if_pfsync.h b/sys/net/if_pfsync.h index 3c3b7dd90f6..ed2dbae3dd8 100644 --- a/sys/net/if_pfsync.h +++ b/sys/net/if_pfsync.h @@ -328,7 +328,7 @@ void pfsync_state_export(struct pfsync_state *, struct pf_state *); void pfsync_insert_state(struct pf_state *); -void pfsync_update_state(struct pf_state *, int *); +void pfsync_update_state(struct pf_state *); void pfsync_delete_state(struct pf_state *); void pfsync_clear_states(u_int32_t, const char *); diff --git a/sys/net/pf.c b/sys/net/pf.c index ebe339921fa..25c80d6b75b 100644 --- a/sys/net/pf.c +++ b/sys/net/pf.c @@ -6980,7 +6980,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0) PF_STATE_EXIT_READ(); if (action == PF_PASS || action == PF_AFRT) { #if NPFSYNC > 0 - pfsync_update_state(s, &have_pf_lock); + pfsync_update_state(s); #endif /* NPFSYNC > 0 */ r = s->rule.ptr; a = s->anchor.ptr; @@ -7012,7 +7012,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0) PF_STATE_EXIT_READ(); if (action == PF_PASS || action == PF_AFRT) { 
#if NPFSYNC > 0 - pfsync_update_state(s, &have_pf_lock); + pfsync_update_state(s); #endif /* NPFSYNC > 0 */ r = s->rule.ptr; a = s->anchor.ptr; @@ -7088,7 +7088,7 @@ pf_test(sa_family_t af, int fwdir, struct ifnet *ifp, struct mbuf **m0) if (action == PF_PASS || action == PF_AFRT) { #if NPFSYNC > 0 - pfsync_update_state(s, &have_pf_lock); + pfsync_update_state(s); #endif /* NPFSYNC > 0 */ r = s->rule.ptr; a = s->anchor.ptr;