The branch main has been updated by kp: URL: https://cgit.FreeBSD.org/src/commit/?id=defc181278cc8e04369cf439d18f1de184532a34
commit defc181278cc8e04369cf439d18f1de184532a34 Author: Kristof Provost <k...@freebsd.org> AuthorDate: 2025-05-29 11:27:17 +0000 Commit: Kristof Provost <k...@freebsd.org> CommitDate: 2025-06-09 19:37:56 +0000 pf: reorganise fragment reassembly To avoid packet loss due to reuse of the 16 bit IPv4 fragment id, we need suitable data structures. Organize the pf fragments with two red-black trees. One is holding the address and protocol information and the other has only the fragment id. This will allow to drop fragemts for specific connections more aggressively. from markus@; OK sashan@ Obtained from: OpenBSD, bluhm <bl...@openbsd.org>, 09228e5ff0 Sponsored by: Rubicon Communications, LLC ("Netgate") Differential Revision: https://reviews.freebsd.org/D50723 --- sys/netpfil/pf/pf_norm.c | 169 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 114 insertions(+), 55 deletions(-) diff --git a/sys/netpfil/pf/pf_norm.c b/sys/netpfil/pf/pf_norm.c index ed9aed157993..45f4415d084b 100644 --- a/sys/netpfil/pf/pf_norm.c +++ b/sys/netpfil/pf/pf_norm.c @@ -76,21 +76,20 @@ struct pf_frent { uint16_t fe_mff; /* more fragment flag */ }; -struct pf_fragment_cmp { - struct pf_addr frc_src; - struct pf_addr frc_dst; - uint32_t frc_id; - sa_family_t frc_af; - uint8_t frc_proto; +RB_HEAD(pf_frag_tree, pf_fragment); +struct pf_frnode { + struct pf_addr fn_src; /* ip source address */ + struct pf_addr fn_dst; /* ip destination address */ + sa_family_t fn_af; /* address family */ + u_int8_t fn_proto; /* protocol for fragments in fn_tree */ + u_int32_t fn_fragments; /* number of entries in fn_tree */ + + RB_ENTRY(pf_frnode) fn_entry; + struct pf_frag_tree fn_tree; /* matching fragments, lookup by id */ }; struct pf_fragment { - struct pf_fragment_cmp fr_key; -#define fr_src fr_key.frc_src -#define fr_dst fr_key.frc_dst -#define fr_id fr_key.frc_id -#define fr_af fr_key.frc_af -#define fr_proto fr_key.frc_proto + uint32_t fr_id; /* fragment id for reassemble */ /* pointers to queue element */ struct pf_frent *fr_firstoff[PF_FRAG_ENTRY_POINTS]; @@ -102,6 +101,7 @@ struct pf_fragment { TAILQ_HEAD(pf_fragq, pf_frent) fr_queue; uint16_t fr_maxlen; /* maximum length of single fragment */ u_int16_t fr_holes; /* number of holes in the queue */ + struct pf_frnode *fr_node; /* ip src/dst/proto/af for fragments */ }; VNET_DEFINE_STATIC(struct mtx, pf_frag_mtx); @@ -114,16 +114,25 @@ VNET_DEFINE(uma_zone_t, pf_state_scrub_z); /* XXX: shared with pfsync */ VNET_DEFINE_STATIC(uma_zone_t, pf_frent_z); #define V_pf_frent_z VNET(pf_frent_z) +VNET_DEFINE_STATIC(uma_zone_t, pf_frnode_z); +#define V_pf_frnode_z VNET(pf_frnode_z) VNET_DEFINE_STATIC(uma_zone_t, pf_frag_z); #define V_pf_frag_z VNET(pf_frag_z) TAILQ_HEAD(pf_fragqueue, pf_fragment); TAILQ_HEAD(pf_cachequeue, pf_fragment); +RB_HEAD(pf_frnode_tree, pf_frnode); VNET_DEFINE_STATIC(struct pf_fragqueue, pf_fragqueue); #define V_pf_fragqueue VNET(pf_fragqueue) -RB_HEAD(pf_frag_tree, pf_fragment); VNET_DEFINE_STATIC(struct pf_frag_tree, pf_frag_tree); #define V_pf_frag_tree VNET(pf_frag_tree) +static __inline int pf_frnode_compare(struct pf_frnode *, + struct pf_frnode *); +VNET_DEFINE_STATIC(struct pf_frnode_tree, pf_frnode_tree); +#define V_pf_frnode_tree VNET(pf_frnode_tree) +RB_PROTOTYPE(pf_frnode_tree, pf_frnode, fn_entry, pf_frnode_compare); +RB_GENERATE(pf_frnode_tree, pf_frnode, fn_entry, pf_frnode_compare); + static int pf_frag_compare(struct pf_fragment *, struct pf_fragment *); static RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare); @@ -134,8 +143,7 @@ static void pf_free_fragment(struct pf_fragment *); static struct pf_frent *pf_create_fragment(u_short *); static int pf_frent_holes(struct pf_frent *frent); -static struct pf_fragment *pf_find_fragment(struct pf_fragment_cmp *key, - struct pf_frag_tree *tree); +static struct pf_fragment *pf_find_fragment(struct pf_frnode *, u_int32_t); static inline int pf_frent_index(struct pf_frent *); static int pf_frent_insert(struct pf_fragment *, struct pf_frent *, struct pf_frent *); @@ -143,7 +151,7 @@ void pf_frent_remove(struct pf_fragment *, struct pf_frent *); struct pf_frent *pf_frent_previous(struct pf_fragment *, struct pf_frent *); -static struct pf_fragment *pf_fillup_fragment(struct pf_fragment_cmp *, +static struct pf_fragment *pf_fillup_fragment(struct pf_frnode *, u_int32_t, struct pf_frent *, u_short *); static struct mbuf *pf_join_fragment(struct pf_fragment *); #ifdef INET @@ -163,14 +171,13 @@ static int pf_reassemble6(struct mbuf **, #ifdef INET static void -pf_ip2key(struct ip *ip, struct pf_fragment_cmp *key) +pf_ip2key(struct ip *ip, struct pf_frnode *key) { - key->frc_src.v4 = ip->ip_src; - key->frc_dst.v4 = ip->ip_dst; - key->frc_af = AF_INET; - key->frc_proto = ip->ip_p; - key->frc_id = ip->ip_id; + key->fn_src.v4 = ip->ip_src; + key->fn_dst.v4 = ip->ip_dst; + key->fn_af = AF_INET; + key->fn_proto = ip->ip_p; } #endif /* INET */ @@ -180,6 +187,9 @@ pf_normalize_init(void) V_pf_frag_z = uma_zcreate("pf frags", sizeof(struct pf_fragment), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + V_pf_frnode_z = uma_zcreate("pf fragment node", + sizeof(struct pf_frnode), NULL, NULL, NULL, NULL, + UMA_ALIGN_PTR, 0); V_pf_frent_z = uma_zcreate("pf frag entries", sizeof(struct pf_frent), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); V_pf_state_scrub_z = uma_zcreate("pf state scrubs", @@ -202,26 +212,36 @@ pf_normalize_cleanup(void) uma_zdestroy(V_pf_state_scrub_z); uma_zdestroy(V_pf_frent_z); + uma_zdestroy(V_pf_frnode_z); uma_zdestroy(V_pf_frag_z); mtx_destroy(&V_pf_frag_mtx); } static int -pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b) +pf_frnode_compare(struct pf_frnode *a, struct pf_frnode *b) { int diff; - if ((diff = a->fr_id - b->fr_id) != 0) + if ((diff = a->fn_proto - b->fn_proto) != 0) return (diff); - if ((diff = a->fr_proto - b->fr_proto) != 0) + if ((diff = a->fn_af - b->fn_af) != 0) return (diff); - if ((diff = a->fr_af - b->fr_af) != 0) + if ((diff = pf_addr_cmp(&a->fn_src, &b->fn_src, a->fn_af)) != 0) return (diff); - if ((diff = pf_addr_cmp(&a->fr_src, &b->fr_src, a->fr_af)) != 0) + if ((diff = pf_addr_cmp(&a->fn_dst, &b->fn_dst, a->fn_af)) != 0) return (diff); - if ((diff = pf_addr_cmp(&a->fr_dst, &b->fr_dst, a->fr_af)) != 0) + return (0); +} + +static __inline int +pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b) +{ + int diff; + + if ((diff = a->fr_id - b->fr_id) != 0) return (diff); + return (0); } @@ -281,10 +301,20 @@ static void pf_free_fragment(struct pf_fragment *frag) { struct pf_frent *frent; + struct pf_frnode *frnode; PF_FRAG_ASSERT(); - RB_REMOVE(pf_frag_tree, &V_pf_frag_tree, frag); + frnode = frag->fr_node; + RB_REMOVE(pf_frag_tree, &frnode->fn_tree, frag); + MPASS(frnode->fn_fragments >= 1); + frnode->fn_fragments--; + if (frnode->fn_fragments == 0) { + MPASS(RB_EMPTY(&frnode->fn_tree)); + RB_REMOVE(pf_frnode_tree, &V_pf_frnode_tree, frnode); + uma_zfree(V_pf_frnode_z, frnode); + } + TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next); /* Free all fragment entries */ @@ -299,17 +329,23 @@ pf_free_fragment(struct pf_fragment *frag) } static struct pf_fragment * -pf_find_fragment(struct pf_fragment_cmp *key, struct pf_frag_tree *tree) +pf_find_fragment(struct pf_frnode *key, uint32_t id) { - struct pf_fragment *frag; + struct pf_fragment *frag, idkey; + struct pf_frnode *frnode; PF_FRAG_ASSERT(); - frag = RB_FIND(pf_frag_tree, tree, (struct pf_fragment *)key); - if (frag != NULL) { - TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next); - TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next); - } + frnode = RB_FIND(pf_frnode_tree, &V_pf_frnode_tree, key); + if (frnode == NULL) + return (NULL); + MPASS(frnode->fn_fragments >= 1); + idkey.fr_id = id; + frag = RB_FIND(pf_frag_tree, &frnode->fn_tree, &idkey); + if (frag == NULL) + return (NULL); + TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next); + TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next); return (frag); } @@ -527,11 +563,12 @@ pf_frent_previous(struct pf_fragment *frag, struct pf_frent *frent) } static struct pf_fragment * -pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent, - u_short *reason) +pf_fillup_fragment(struct pf_frnode *key, uint32_t id, + struct pf_frent *frent, u_short *reason) { struct pf_frent *after, *next, *prev; struct pf_fragment *frag; + struct pf_frnode *frnode; uint16_t total; PF_FRAG_ASSERT(); @@ -555,12 +592,12 @@ pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent, goto bad_fragment; } - DPFPRINTF((key->frc_af == AF_INET ? + DPFPRINTF((key->fn_af == AF_INET ? "reass frag %d @ %d-%d\n" : "reass frag %#08x @ %d-%d\n", - key->frc_id, frent->fe_off, frent->fe_off + frent->fe_len)); + id, frent->fe_off, frent->fe_off + frent->fe_len)); /* Fully buffer all of the fragments in this fragment queue. */ - frag = pf_find_fragment(key, &V_pf_frag_tree); + frag = pf_find_fragment(key, id); /* Create a new reassembly queue for this packet. */ if (frag == NULL) { @@ -574,7 +611,22 @@ pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent, } } - *(struct pf_fragment_cmp *)frag = *key; + frnode = RB_FIND(pf_frnode_tree, &V_pf_frnode_tree, key); + if (frnode == NULL) { + frnode = uma_zalloc(V_pf_frnode_z, M_NOWAIT); + if (frnode == NULL) { + pf_flush_fragments(); + frnode = uma_zalloc(V_pf_frnode_z, M_NOWAIT); + if (frnode == NULL) { + REASON_SET(reason, PFRES_MEMORY); + uma_zfree(V_pf_frag_z, frag); + goto drop_fragment; + } + } + *frnode = *key; + RB_INIT(&frnode->fn_tree); + frnode->fn_fragments = 0; + } memset(frag->fr_firstoff, 0, sizeof(frag->fr_firstoff)); memset(frag->fr_entries, 0, sizeof(frag->fr_entries)); frag->fr_timeout = time_uptime; @@ -582,7 +634,14 @@ pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent, frag->fr_maxlen = frent->fe_len; frag->fr_holes = 1; - RB_INSERT(pf_frag_tree, &V_pf_frag_tree, frag); + frag->fr_id = id; + frag->fr_node = frnode; + /* RB_INSERT cannot fail as pf_find_fragment() found nothing */ + RB_INSERT(pf_frag_tree, &frnode->fn_tree, frag); + frnode->fn_fragments++; + if (frnode->fn_fragments == 1) + RB_INSERT(pf_frnode_tree, &V_pf_frnode_tree, frnode); + TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next); /* We do not have a previous fragment, cannot fail. */ @@ -592,6 +651,7 @@ pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent, } KASSERT(!TAILQ_EMPTY(&frag->fr_queue), ("!TAILQ_EMPTY()->fr_queue")); + MPASS(frag->fr_node); /* Remember maximum fragment len for refragmentation. */ if (frent->fe_len > frag->fr_maxlen) @@ -627,7 +687,7 @@ pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent, if (prev != NULL && prev->fe_off + prev->fe_len > frent->fe_off) { uint16_t precut; - if (frag->fr_af == AF_INET6) + if (frag->fr_node->fn_af == AF_INET6) goto free_fragment; precut = prev->fe_off + prev->fe_len - frent->fe_off; @@ -681,7 +741,7 @@ pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent, return (frag); free_ipv6_fragment: - if (frag->fr_af == AF_INET) + if (frag->fr_node->fn_af == AF_INET) goto bad_fragment; free_fragment: /* @@ -743,8 +803,8 @@ pf_reassemble(struct mbuf **m0, u_short *reason) struct pf_fragment *frag; struct m_tag *mtag; struct pf_fragment_tag *ftag; - struct pf_fragment_cmp key; - uint16_t total, hdrlen; + struct pf_frnode key; + uint16_t total, hdrlen; uint32_t frag_id; uint16_t maxlen; @@ -761,7 +821,7 @@ pf_reassemble(struct mbuf **m0, u_short *reason) pf_ip2key(ip, &key); - if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) + if ((frag = pf_fillup_fragment(&key, ip->ip_id, frent, reason)) == NULL) return (PF_DROP); /* The mbuf is part of the fragment entry, no direct free or access */ @@ -835,7 +895,7 @@ pf_reassemble6(struct mbuf **m0, struct ip6_frag *fraghdr, struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct pf_frent *frent; struct pf_fragment *frag; - struct pf_fragment_cmp key; + struct pf_frnode key; struct m_tag *mtag; struct pf_fragment_tag *ftag; int off; @@ -858,14 +918,13 @@ pf_reassemble6(struct mbuf **m0, struct ip6_frag *fraghdr, frent->fe_off = ntohs(fraghdr->ip6f_offlg & IP6F_OFF_MASK); frent->fe_mff = fraghdr->ip6f_offlg & IP6F_MORE_FRAG; - key.frc_src.v6 = ip6->ip6_src; - key.frc_dst.v6 = ip6->ip6_dst; - key.frc_af = AF_INET6; + key.fn_src.v6 = ip6->ip6_src; + key.fn_dst.v6 = ip6->ip6_dst; + key.fn_af = AF_INET6; /* Only the first fragment's protocol is relevant. */ - key.frc_proto = 0; - key.frc_id = fraghdr->ip6f_ident; + key.fn_proto = 0; - if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) { + if ((frag = pf_fillup_fragment(&key, fraghdr->ip6f_ident, frent, reason)) == NULL) { PF_FRAG_UNLOCK(); return (PF_DROP); }