The branch main has been updated by kp:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=defc181278cc8e04369cf439d18f1de184532a34

commit defc181278cc8e04369cf439d18f1de184532a34
Author:     Kristof Provost <k...@freebsd.org>
AuthorDate: 2025-05-29 11:27:17 +0000
Commit:     Kristof Provost <k...@freebsd.org>
CommitDate: 2025-06-09 19:37:56 +0000

    pf: reorganise fragment reassembly
    
    To avoid packet loss due to reuse of the 16 bit IPv4 fragment id,
    we need suitable data structures.  Organize the pf fragments with
    two red-black trees.  One is holding the address and protocol
    information and the other has only the fragment id.  This will allow
    to drop fragemts for specific connections more aggressively.
    from markus@; OK sashan@
    
    Obtained from:  OpenBSD, bluhm <bl...@openbsd.org>, 09228e5ff0
    Sponsored by:   Rubicon Communications, LLC ("Netgate")
    Differential Revision:  https://reviews.freebsd.org/D50723
---
 sys/netpfil/pf/pf_norm.c | 169 ++++++++++++++++++++++++++++++++---------------
 1 file changed, 114 insertions(+), 55 deletions(-)

diff --git a/sys/netpfil/pf/pf_norm.c b/sys/netpfil/pf/pf_norm.c
index ed9aed157993..45f4415d084b 100644
--- a/sys/netpfil/pf/pf_norm.c
+++ b/sys/netpfil/pf/pf_norm.c
@@ -76,21 +76,20 @@ struct pf_frent {
        uint16_t        fe_mff;         /* more fragment flag */
 };
 
-struct pf_fragment_cmp {
-       struct pf_addr  frc_src;
-       struct pf_addr  frc_dst;
-       uint32_t        frc_id;
-       sa_family_t     frc_af;
-       uint8_t         frc_proto;
+RB_HEAD(pf_frag_tree, pf_fragment);
+struct pf_frnode {
+       struct pf_addr          fn_src;         /* ip source address */
+       struct pf_addr          fn_dst;         /* ip destination address */
+       sa_family_t             fn_af;          /* address family */
+       u_int8_t                fn_proto;       /* protocol for fragments in 
fn_tree */
+       u_int32_t               fn_fragments;   /* number of entries in fn_tree 
*/
+
+       RB_ENTRY(pf_frnode)     fn_entry;
+       struct pf_frag_tree     fn_tree;        /* matching fragments, lookup 
by id */
 };
 
 struct pf_fragment {
-       struct pf_fragment_cmp  fr_key;
-#define fr_src fr_key.frc_src
-#define fr_dst fr_key.frc_dst
-#define fr_id  fr_key.frc_id
-#define fr_af  fr_key.frc_af
-#define fr_proto       fr_key.frc_proto
+       uint32_t        fr_id;  /* fragment id for reassemble */
 
        /* pointers to queue element */
        struct pf_frent *fr_firstoff[PF_FRAG_ENTRY_POINTS];
@@ -102,6 +101,7 @@ struct pf_fragment {
        TAILQ_HEAD(pf_fragq, pf_frent) fr_queue;
        uint16_t        fr_maxlen;      /* maximum length of single fragment */
        u_int16_t       fr_holes;       /* number of holes in the queue */
+       struct pf_frnode *fr_node;      /* ip src/dst/proto/af for fragments */
 };
 
 VNET_DEFINE_STATIC(struct mtx, pf_frag_mtx);
@@ -114,16 +114,25 @@ VNET_DEFINE(uma_zone_t, pf_state_scrub_z);        /* XXX: 
shared with pfsync */
 
 VNET_DEFINE_STATIC(uma_zone_t, pf_frent_z);
 #define        V_pf_frent_z    VNET(pf_frent_z)
+VNET_DEFINE_STATIC(uma_zone_t, pf_frnode_z);
+#define        V_pf_frnode_z   VNET(pf_frnode_z)
 VNET_DEFINE_STATIC(uma_zone_t, pf_frag_z);
 #define        V_pf_frag_z     VNET(pf_frag_z)
 
 TAILQ_HEAD(pf_fragqueue, pf_fragment);
 TAILQ_HEAD(pf_cachequeue, pf_fragment);
+RB_HEAD(pf_frnode_tree, pf_frnode);
 VNET_DEFINE_STATIC(struct pf_fragqueue,        pf_fragqueue);
 #define        V_pf_fragqueue                  VNET(pf_fragqueue)
-RB_HEAD(pf_frag_tree, pf_fragment);
 VNET_DEFINE_STATIC(struct pf_frag_tree,        pf_frag_tree);
 #define        V_pf_frag_tree                  VNET(pf_frag_tree)
+static __inline int    pf_frnode_compare(struct pf_frnode *,
+                           struct pf_frnode *);
+VNET_DEFINE_STATIC(struct pf_frnode_tree, pf_frnode_tree);
+#define        V_pf_frnode_tree                VNET(pf_frnode_tree)
+RB_PROTOTYPE(pf_frnode_tree, pf_frnode, fn_entry, pf_frnode_compare);
+RB_GENERATE(pf_frnode_tree, pf_frnode, fn_entry, pf_frnode_compare);
+
 static int              pf_frag_compare(struct pf_fragment *,
                            struct pf_fragment *);
 static RB_PROTOTYPE(pf_frag_tree, pf_fragment, fr_entry, pf_frag_compare);
@@ -134,8 +143,7 @@ static void pf_free_fragment(struct pf_fragment *);
 
 static struct pf_frent *pf_create_fragment(u_short *);
 static int     pf_frent_holes(struct pf_frent *frent);
-static struct pf_fragment *pf_find_fragment(struct pf_fragment_cmp *key,
-                   struct pf_frag_tree *tree);
+static struct pf_fragment      *pf_find_fragment(struct pf_frnode *, 
u_int32_t);
 static inline int      pf_frent_index(struct pf_frent *);
 static int     pf_frent_insert(struct pf_fragment *,
                            struct pf_frent *, struct pf_frent *);
@@ -143,7 +151,7 @@ void                        pf_frent_remove(struct 
pf_fragment *,
                            struct pf_frent *);
 struct pf_frent                *pf_frent_previous(struct pf_fragment *,
                            struct pf_frent *);
-static struct pf_fragment *pf_fillup_fragment(struct pf_fragment_cmp *,
+static struct pf_fragment *pf_fillup_fragment(struct pf_frnode *, u_int32_t,
                    struct pf_frent *, u_short *);
 static struct mbuf *pf_join_fragment(struct pf_fragment *);
 #ifdef INET
@@ -163,14 +171,13 @@ static int        pf_reassemble6(struct mbuf **,
 
 #ifdef INET
 static void
-pf_ip2key(struct ip *ip, struct pf_fragment_cmp *key)
+pf_ip2key(struct ip *ip, struct pf_frnode *key)
 {
 
-       key->frc_src.v4 = ip->ip_src;
-       key->frc_dst.v4 = ip->ip_dst;
-       key->frc_af = AF_INET;
-       key->frc_proto = ip->ip_p;
-       key->frc_id = ip->ip_id;
+       key->fn_src.v4 = ip->ip_src;
+       key->fn_dst.v4 = ip->ip_dst;
+       key->fn_af = AF_INET;
+       key->fn_proto = ip->ip_p;
 }
 #endif /* INET */
 
@@ -180,6 +187,9 @@ pf_normalize_init(void)
 
        V_pf_frag_z = uma_zcreate("pf frags", sizeof(struct pf_fragment),
            NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+       V_pf_frnode_z = uma_zcreate("pf fragment node",
+           sizeof(struct pf_frnode), NULL, NULL, NULL, NULL,
+           UMA_ALIGN_PTR, 0);
        V_pf_frent_z = uma_zcreate("pf frag entries", sizeof(struct pf_frent),
            NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
        V_pf_state_scrub_z = uma_zcreate("pf state scrubs",
@@ -202,26 +212,36 @@ pf_normalize_cleanup(void)
 
        uma_zdestroy(V_pf_state_scrub_z);
        uma_zdestroy(V_pf_frent_z);
+       uma_zdestroy(V_pf_frnode_z);
        uma_zdestroy(V_pf_frag_z);
 
        mtx_destroy(&V_pf_frag_mtx);
 }
 
 static int
-pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b)
+pf_frnode_compare(struct pf_frnode *a, struct pf_frnode *b)
 {
        int     diff;
 
-       if ((diff = a->fr_id - b->fr_id) != 0)
+       if ((diff = a->fn_proto - b->fn_proto) != 0)
                return (diff);
-       if ((diff = a->fr_proto - b->fr_proto) != 0)
+       if ((diff = a->fn_af - b->fn_af) != 0)
                return (diff);
-       if ((diff = a->fr_af - b->fr_af) != 0)
+       if ((diff = pf_addr_cmp(&a->fn_src, &b->fn_src, a->fn_af)) != 0)
                return (diff);
-       if ((diff = pf_addr_cmp(&a->fr_src, &b->fr_src, a->fr_af)) != 0)
+       if ((diff = pf_addr_cmp(&a->fn_dst, &b->fn_dst, a->fn_af)) != 0)
                return (diff);
-       if ((diff = pf_addr_cmp(&a->fr_dst, &b->fr_dst, a->fr_af)) != 0)
+       return (0);
+}
+
+static __inline int
+pf_frag_compare(struct pf_fragment *a, struct pf_fragment *b)
+{
+       int     diff;
+
+       if ((diff = a->fr_id - b->fr_id) != 0)
                return (diff);
+
        return (0);
 }
 
@@ -281,10 +301,20 @@ static void
 pf_free_fragment(struct pf_fragment *frag)
 {
        struct pf_frent         *frent;
+       struct pf_frnode        *frnode;
 
        PF_FRAG_ASSERT();
 
-       RB_REMOVE(pf_frag_tree, &V_pf_frag_tree, frag);
+       frnode = frag->fr_node;
+       RB_REMOVE(pf_frag_tree, &frnode->fn_tree, frag);
+       MPASS(frnode->fn_fragments >= 1);
+       frnode->fn_fragments--;
+       if (frnode->fn_fragments == 0) {
+               MPASS(RB_EMPTY(&frnode->fn_tree));
+               RB_REMOVE(pf_frnode_tree, &V_pf_frnode_tree, frnode);
+               uma_zfree(V_pf_frnode_z, frnode);
+       }
+
        TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next);
 
        /* Free all fragment entries */
@@ -299,17 +329,23 @@ pf_free_fragment(struct pf_fragment *frag)
 }
 
 static struct pf_fragment *
-pf_find_fragment(struct pf_fragment_cmp *key, struct pf_frag_tree *tree)
+pf_find_fragment(struct pf_frnode *key, uint32_t id)
 {
-       struct pf_fragment      *frag;
+       struct pf_fragment      *frag, idkey;
+       struct pf_frnode        *frnode;
 
        PF_FRAG_ASSERT();
 
-       frag = RB_FIND(pf_frag_tree, tree, (struct pf_fragment *)key);
-       if (frag != NULL) {
-               TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next);
-               TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next);
-       }
+       frnode = RB_FIND(pf_frnode_tree, &V_pf_frnode_tree, key);
+       if (frnode == NULL)
+               return (NULL);
+       MPASS(frnode->fn_fragments >= 1);
+       idkey.fr_id = id;
+       frag = RB_FIND(pf_frag_tree, &frnode->fn_tree, &idkey);
+       if (frag == NULL)
+               return (NULL);
+       TAILQ_REMOVE(&V_pf_fragqueue, frag, frag_next);
+       TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next);
 
        return (frag);
 }
@@ -527,11 +563,12 @@ pf_frent_previous(struct pf_fragment *frag, struct 
pf_frent *frent)
 }
 
 static struct pf_fragment *
-pf_fillup_fragment(struct pf_fragment_cmp *key, struct pf_frent *frent,
-    u_short *reason)
+pf_fillup_fragment(struct pf_frnode *key, uint32_t id,
+    struct pf_frent *frent, u_short *reason)
 {
        struct pf_frent         *after, *next, *prev;
        struct pf_fragment      *frag;
+       struct pf_frnode        *frnode;
        uint16_t                total;
 
        PF_FRAG_ASSERT();
@@ -555,12 +592,12 @@ pf_fillup_fragment(struct pf_fragment_cmp *key, struct 
pf_frent *frent,
                goto bad_fragment;
        }
 
-       DPFPRINTF((key->frc_af == AF_INET ?
+       DPFPRINTF((key->fn_af == AF_INET ?
            "reass frag %d @ %d-%d\n" : "reass frag %#08x @ %d-%d\n",
-           key->frc_id, frent->fe_off, frent->fe_off + frent->fe_len));
+           id, frent->fe_off, frent->fe_off + frent->fe_len));
 
        /* Fully buffer all of the fragments in this fragment queue. */
-       frag = pf_find_fragment(key, &V_pf_frag_tree);
+       frag = pf_find_fragment(key, id);
 
        /* Create a new reassembly queue for this packet. */
        if (frag == NULL) {
@@ -574,7 +611,22 @@ pf_fillup_fragment(struct pf_fragment_cmp *key, struct 
pf_frent *frent,
                        }
                }
 
-               *(struct pf_fragment_cmp *)frag = *key;
+               frnode = RB_FIND(pf_frnode_tree, &V_pf_frnode_tree, key);
+               if (frnode == NULL) {
+                       frnode = uma_zalloc(V_pf_frnode_z, M_NOWAIT);
+                       if (frnode == NULL) {
+                               pf_flush_fragments();
+                               frnode = uma_zalloc(V_pf_frnode_z, M_NOWAIT);
+                               if (frnode == NULL) {
+                                       REASON_SET(reason, PFRES_MEMORY);
+                                       uma_zfree(V_pf_frag_z, frag);
+                                       goto drop_fragment;
+                               }
+                       }
+                       *frnode = *key;
+                       RB_INIT(&frnode->fn_tree);
+                       frnode->fn_fragments = 0;
+               }
                memset(frag->fr_firstoff, 0, sizeof(frag->fr_firstoff));
                memset(frag->fr_entries, 0, sizeof(frag->fr_entries));
                frag->fr_timeout = time_uptime;
@@ -582,7 +634,14 @@ pf_fillup_fragment(struct pf_fragment_cmp *key, struct 
pf_frent *frent,
                frag->fr_maxlen = frent->fe_len;
                frag->fr_holes = 1;
 
-               RB_INSERT(pf_frag_tree, &V_pf_frag_tree, frag);
+               frag->fr_id = id;
+               frag->fr_node = frnode;
+               /* RB_INSERT cannot fail as pf_find_fragment() found nothing */
+               RB_INSERT(pf_frag_tree, &frnode->fn_tree, frag);
+               frnode->fn_fragments++;
+               if (frnode->fn_fragments == 1)
+                       RB_INSERT(pf_frnode_tree, &V_pf_frnode_tree, frnode);
+
                TAILQ_INSERT_HEAD(&V_pf_fragqueue, frag, frag_next);
 
                /* We do not have a previous fragment, cannot fail. */
@@ -592,6 +651,7 @@ pf_fillup_fragment(struct pf_fragment_cmp *key, struct 
pf_frent *frent,
        }
 
        KASSERT(!TAILQ_EMPTY(&frag->fr_queue), ("!TAILQ_EMPTY()->fr_queue"));
+       MPASS(frag->fr_node);
 
        /* Remember maximum fragment len for refragmentation. */
        if (frent->fe_len > frag->fr_maxlen)
@@ -627,7 +687,7 @@ pf_fillup_fragment(struct pf_fragment_cmp *key, struct 
pf_frent *frent,
        if (prev != NULL && prev->fe_off + prev->fe_len > frent->fe_off) {
                uint16_t precut;
 
-               if (frag->fr_af == AF_INET6)
+               if (frag->fr_node->fn_af == AF_INET6)
                        goto free_fragment;
 
                precut = prev->fe_off + prev->fe_len - frent->fe_off;
@@ -681,7 +741,7 @@ pf_fillup_fragment(struct pf_fragment_cmp *key, struct 
pf_frent *frent,
        return (frag);
 
 free_ipv6_fragment:
-       if (frag->fr_af == AF_INET)
+       if (frag->fr_node->fn_af == AF_INET)
                goto bad_fragment;
 free_fragment:
        /*
@@ -743,8 +803,8 @@ pf_reassemble(struct mbuf **m0, u_short *reason)
        struct pf_fragment      *frag;
        struct m_tag            *mtag;
        struct pf_fragment_tag  *ftag;
-       struct pf_fragment_cmp  key;
-       uint16_t                total, hdrlen;
+       struct pf_frnode         key;
+       uint16_t                 total, hdrlen;
        uint32_t                 frag_id;
        uint16_t                 maxlen;
 
@@ -761,7 +821,7 @@ pf_reassemble(struct mbuf **m0, u_short *reason)
 
        pf_ip2key(ip, &key);
 
-       if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL)
+       if ((frag = pf_fillup_fragment(&key, ip->ip_id, frent, reason)) == NULL)
                return (PF_DROP);
 
        /* The mbuf is part of the fragment entry, no direct free or access */
@@ -835,7 +895,7 @@ pf_reassemble6(struct mbuf **m0, struct ip6_frag *fraghdr,
        struct ip6_hdr          *ip6 = mtod(m, struct ip6_hdr *);
        struct pf_frent         *frent;
        struct pf_fragment      *frag;
-       struct pf_fragment_cmp   key;
+       struct pf_frnode         key;
        struct m_tag            *mtag;
        struct pf_fragment_tag  *ftag;
        int                      off;
@@ -858,14 +918,13 @@ pf_reassemble6(struct mbuf **m0, struct ip6_frag *fraghdr,
        frent->fe_off = ntohs(fraghdr->ip6f_offlg & IP6F_OFF_MASK);
        frent->fe_mff = fraghdr->ip6f_offlg & IP6F_MORE_FRAG;
 
-       key.frc_src.v6 = ip6->ip6_src;
-       key.frc_dst.v6 = ip6->ip6_dst;
-       key.frc_af = AF_INET6;
+       key.fn_src.v6 = ip6->ip6_src;
+       key.fn_dst.v6 = ip6->ip6_dst;
+       key.fn_af = AF_INET6;
        /* Only the first fragment's protocol is relevant. */
-       key.frc_proto = 0;
-       key.frc_id = fraghdr->ip6f_ident;
+       key.fn_proto = 0;
 
-       if ((frag = pf_fillup_fragment(&key, frent, reason)) == NULL) {
+       if ((frag = pf_fillup_fragment(&key, fraghdr->ip6f_ident, frent, 
reason)) == NULL) {
                PF_FRAG_UNLOCK();
                return (PF_DROP);
        }

Reply via email to