Hello,
On Tue, May 09, 2023 at 06:26:43PM +0000, mabi wrote: > Hi, > > On a brand new OpenBSD 7.3 firewall (amd64) I get a kernel panic every few > days and was wondering if this panic I get is related to this issue/bug? > Your panic got fixed by a recent commit [1]. Hrvoje was/is hitting very close to that KASSERT() (now removed) at line 2274. In Hrvoje's case the TAILQ_REMOVE() macro complains that we attempt to remove a state which is removed already: 2273 atomic_sub_long(&sc->sc_len, pfsync_qs[q].len); 2274 TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list); 2275 if (TAILQ_EMPTY(&sc->sc_qs[q])) 2276 atomic_sub_long(&sc->sc_len, sizeof (struct pfsync_subheader)); 2277 st->sync_state = PFSYNC_S_NONE; 2278 mtx_leave(&sc->sc_st_mtx); 2279 2280 pf_state_unref(st); The cause is very similar: pfsync relies on a volatile value in the ->sync_state member. The ->sync_state member must be modified under protection of ->mtx. The issue has been pointed out by bluhm@ during the m2k23 hackathon, where I shared my pfsync(4) headache with him. The diff below is my attempt to fix it. I had no chance to test it. I'd appreciate it if you could give it a try and let me know how things look. thanks and regards sashan [1] https://marc.info/?l=openbsd-cvs&m=168269695603160&w=2 --------8<---------------8<---------------8<------------------8<-------- diff --git a/sys/net/if_pfsync.c b/sys/net/if_pfsync.c index 822b4211d0f..811d9d59666 100644 --- a/sys/net/if_pfsync.c +++ b/sys/net/if_pfsync.c @@ -1362,14 +1362,17 @@ pfsync_grab_snapshot(struct pfsync_snapshot *sn, struct pfsync_softc *sc) while ((st = TAILQ_FIRST(&sc->sc_qs[q])) != NULL) { TAILQ_REMOVE(&sc->sc_qs[q], st, sync_list); + mtx_enter(&st->mtx); if (st->snapped == 0) { TAILQ_INSERT_TAIL(&sn->sn_qs[q], st, sync_snap); st->snapped = 1; + mtx_leave(&st->mtx); } else { /* * item is on snapshot list already, so we can * skip it now. 
*/ + mtx_leave(&st->mtx); pf_state_unref(st); } } @@ -1422,11 +1425,13 @@ pfsync_drop_snapshot(struct pfsync_snapshot *sn) continue; while ((st = TAILQ_FIRST(&sn->sn_qs[q])) != NULL) { + mtx_enter(&st->mtx); KASSERT(st->sync_state == q); KASSERT(st->snapped == 1); TAILQ_REMOVE(&sn->sn_qs[q], st, sync_snap); st->sync_state = PFSYNC_S_NONE; st->snapped = 0; + mtx_leave(&st->mtx); pf_state_unref(st); } } @@ -1665,6 +1670,7 @@ pfsync_sendout(void) count = 0; while ((st = TAILQ_FIRST(&sn.sn_qs[q])) != NULL) { + mtx_enter(&st->mtx); TAILQ_REMOVE(&sn.sn_qs[q], st, sync_snap); KASSERT(st->sync_state == q); KASSERT(st->snapped == 1); @@ -1672,6 +1678,7 @@ pfsync_sendout(void) st->snapped = 0; pfsync_qs[q].write(st, m->m_data + offset); offset += pfsync_qs[q].len; + mtx_leave(&st->mtx); pf_state_unref(st); count++; @@ -1725,8 +1732,6 @@ pfsync_insert_state(struct pf_state *st) ISSET(st->state_flags, PFSTATE_NOSYNC)) return; - KASSERT(st->sync_state == PFSYNC_S_NONE); - if (sc->sc_len == PFSYNC_MINPKT) timeout_add_sec(&sc->sc_tmo, 1); @@ -2221,6 +2226,7 @@ pfsync_q_ins(struct pf_state *st, int q) panic("pfsync pkt len is too low %zd", sc->sc_len); do { mtx_enter(&sc->sc_st_mtx); + mtx_enter(&st->mtx); /* * There are either two threads trying to update the @@ -2228,6 +2234,7 @@ pfsync_q_ins(struct pf_state *st, int q) * (is on snapshot queue). 
*/ if (st->sync_state != PFSYNC_S_NONE) { + mtx_leave(&st->mtx); mtx_leave(&sc->sc_st_mtx); break; } @@ -2240,6 +2247,7 @@ pfsync_q_ins(struct pf_state *st, int q) sclen = atomic_add_long_nv(&sc->sc_len, nlen); if (sclen > sc->sc_if.if_mtu) { atomic_sub_long(&sc->sc_len, nlen); + mtx_leave(&st->mtx); mtx_leave(&sc->sc_st_mtx); pfsync_sendout(); continue; @@ -2249,6 +2257,7 @@ pfsync_q_ins(struct pf_state *st, int q) TAILQ_INSERT_TAIL(&sc->sc_qs[q], st, sync_list); st->sync_state = q; + mtx_leave(&st->mtx); mtx_leave(&sc->sc_st_mtx); } while (0); } @@ -2260,6 +2269,7 @@ pfsync_q_del(struct pf_state *st) int q; mtx_enter(&sc->sc_st_mtx); + mtx_enter(&st->mtx); q = st->sync_state; /* * re-check under mutex @@ -2267,6 +2277,7 @@ pfsync_q_del(struct pf_state *st) * too late, the state is being just processed/dispatched to peer. */ if ((q == PFSYNC_S_NONE) || (st->snapped)) { + mtx_leave(&st->mtx); mtx_leave(&sc->sc_st_mtx); return; } @@ -2275,6 +2286,7 @@ pfsync_q_del(struct pf_state *st) if (TAILQ_EMPTY(&sc->sc_qs[q])) atomic_sub_long(&sc->sc_len, sizeof (struct pfsync_subheader)); st->sync_state = PFSYNC_S_NONE; + mtx_leave(&st->mtx); mtx_leave(&sc->sc_st_mtx); pf_state_unref(st);