On 06.07.2012 10:11, Luigi Rizzo wrote:
On Thu, Jul 05, 2012 at 05:40:37PM +0400, Alexander V. Chernikov wrote:
On 04.07.2012 19:48, Luigi Rizzo wrote:
the thing discussed a few years ago (at least the one i took out of the
discussion) was that the counter fields in rules should hold the
index of a per-cpu counter associated to the rule. So CTR_INC(rule->ctr)
becomes something like pcpu->ipfw_ctrs[rule->ctr]++
Once you create a new rule you also grab one free index from ipfw_ctrs[],
and the same should go for dummynet counters.

Old kernel from previous letters, same setup:

net.inet.ip.fw.enable=0
2.3 MPPS
net.inet.ip.fw.update_counters=0
net.inet.ip.fw.enable=1
1.93MPPS
net.inet.ip.fw.update_counters=1
1.74MPPS

Kernel with ipfw pcpu counters:

net.inet.ip.fw.enable=0
2.3 MPPS
net.inet.ip.fw.update_counters=0
net.inet.ip.fw.enable=1
1.93MPPS
net.inet.ip.fw.update_counters=1
1.93MPPS

Counters seem to be working without any (significant) overhead.
(Maybe I'm wrong somewhere?)

Additionally, I've got (from my previous pcpu attempt) a small patch permitting ipfw to re-use rule map allocation instead of reallocating on every rule. This saves a bit of system time:

loading 20k rules with ipfw binary gives us:
5.1s system time before and 4.1s system time after.


--
WBR, Alexander
>From 931ac84b88829fe15bb4410bd2b06614f41c99b5 Mon Sep 17 00:00:00 2001
From: "Alexander V. Chernikov" <melif...@ipfw.ru>
Date: Mon, 9 Jul 2012 23:10:02 +0400
Subject: [PATCH 1/3] Do not allocate data buffers per each rule
 addition/deletion

---
 sys/netinet/ipfw/ip_fw2.c        |    2 +
 sys/netinet/ipfw/ip_fw_private.h |    4 ++
 sys/netinet/ipfw/ip_fw_sockopt.c |  134 ++++++++++++++++++++++++++++++++++----
 3 files changed, 126 insertions(+), 14 deletions(-)

diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c
index 98766f3..bf06c1e 100644
--- a/sys/netinet/ipfw/ip_fw2.c
+++ b/sys/netinet/ipfw/ip_fw2.c
@@ -2701,6 +2701,8 @@ vnet_ipfw_uninit(const void *unused)
        }
        if (chain->map)
                free(chain->map, M_IPFW);
+       if (chain->b_map)
+               free(chain->b_map, M_IPFW);
        IPFW_WUNLOCK(chain);
        IPFW_UH_WUNLOCK(chain);
        if (reap != NULL)
diff --git a/sys/netinet/ipfw/ip_fw_private.h b/sys/netinet/ipfw/ip_fw_private.h
index 2806741..272cef3 100644
--- a/sys/netinet/ipfw/ip_fw_private.h
+++ b/sys/netinet/ipfw/ip_fw_private.h
@@ -232,6 +232,10 @@ struct ip_fw_chain {
 #endif
        uint32_t        id;             /* ruleset id */
        uint32_t        gencnt;         /* generation count */
+       int             n_allocated;    /* Size of primary map in items */
+       struct ip_fw    **b_map;        /* Backup map to use in swapping */
+       int             b_allocated;    /* Size of backup map in items */
+
 };
 
 struct sockopt;        /* used by tcp_var.h */
diff --git a/sys/netinet/ipfw/ip_fw_sockopt.c b/sys/netinet/ipfw/ip_fw_sockopt.c
index a245143..4e71a11 100644
--- a/sys/netinet/ipfw/ip_fw_sockopt.c
+++ b/sys/netinet/ipfw/ip_fw_sockopt.c
@@ -97,29 +97,92 @@ ipfw_find_rule(struct ip_fw_chain *chain, uint32_t key, 
uint32_t id)
        return hi;
 }
 
+
+
+/*
+ * In order to minimize block allocations on every rule addition we
+ * use the following technique:
+ * 1) round up allocation to MAP_GRANULARITY
+ * 2) Save the previously allocated map to a special backup pointer.
+ *
+ * This permits us to do backup <-> primary (and vice versa) map swapping
+ * without reallocations in most cases.
+ *
+ */
+
+struct map_ptrs {
+       int             backup;         /* 1 if backup map used */
+       int             items;          /* number of items allocated */
+       struct ip_fw    **map;
+       struct ip_fw    **free_map;     /* map pointer to free */
+};
+int
+get_map(struct ip_fw_chain *chain, int extra, int locked, struct map_ptrs 
*ptrs);
+
+#define        MAP_GRANULARITY 128     /* 1k overhead per buffer */
+#define        MAP_ROUNDUP(i)  (i) + MAP_GRANULARITY - ((i) % MAP_GRANULARITY)
 /*
  * allocate a new map, returns the chain locked. extra is the number
  * of entries to add or delete.
  */
-static struct ip_fw **
-get_map(struct ip_fw_chain *chain, int extra, int locked)
+int
+get_map(struct ip_fw_chain *chain, int extra, int locked, struct map_ptrs 
*ptrs)
 {
 
        for (;;) {
                struct ip_fw **map;
                int i;
 
+               /* Be optimistic by default */
+               if (!locked)
+                       IPFW_UH_WLOCK(chain);
+
                i = chain->n_rules + extra;
+
+               /*
+                * We have to meet the following conditions in order
+                * to use the backup map:
+                *
+                * 1) i <= chain->b_allocated (add case)
+                * 2) roundup(i) == chain->b_allocated (del case) (saves memory)
+                *
+                * Since roundup adds 0..(MAP_GRANULARITY-1)
+                * rule 2 requires rule 1 to be true.
+                */
+               CTR4(KTR_NET, "%s: req: %d roundup: %d, b_allocated: %d",
+                       __func__, i, MAP_ROUNDUP(i), chain->b_allocated);
+
+               if (MAP_ROUNDUP(i) == chain->b_allocated) {
+                       /* no new allocation. Just return backup map */
+                       ptrs->backup = 1;
+                       ptrs->items = chain->b_allocated;
+                       ptrs->map = chain->b_map;
+                       CTR3(KTR_NET, "%s: BACKUP SELECTED: %p items: %d", 
__func__, ptrs->map, ptrs->items);
+
+                       return 1;
+               }
+
+               /* Pessimistic scenario - we need reallocation */
+               i = MAP_ROUNDUP(i);
+
+               if (!locked)
+                       IPFW_UH_WUNLOCK(chain);
+
                map = malloc(i * sizeof(struct ip_fw *), M_IPFW,
                        locked ? M_NOWAIT : M_WAITOK);
                if (map == NULL) {
                        printf("%s: cannot allocate map\n", __FUNCTION__);
-                       return NULL;
+                       return 0;
                }
                if (!locked)
                        IPFW_UH_WLOCK(chain);
-               if (i >= chain->n_rules + extra) /* good */
-                       return map;
+               if (i >= chain->n_rules + extra) { /* good */
+                       ptrs->backup = 0;
+                       ptrs->items = i;
+                       ptrs->map = map;
+                       CTR3(KTR_NET, "%s: NEW MAP: %p items: %d", __func__, 
ptrs->map, ptrs->items);
+                       return 1;
+               }
                /* otherwise we lost the race, free and retry */
                if (!locked)
                        IPFW_UH_WUNLOCK(chain);
@@ -128,10 +191,11 @@ get_map(struct ip_fw_chain *chain, int extra, int locked)
 }
 
 /*
- * swap the maps. It is supposed to be called with IPFW_UH_WLOCK
+ * swap the maps. It is supposed to be called with IPFW_UH_WLOCK.
+ * Returns old map pointer to free.
  */
 static struct ip_fw **
-swap_map(struct ip_fw_chain *chain, struct ip_fw **new_map, int new_len)
+swap_map(struct ip_fw_chain *chain, struct map_ptrs *ptrs, int new_len)
 {
        struct ip_fw **old_map;
 
@@ -139,8 +203,43 @@ swap_map(struct ip_fw_chain *chain, struct ip_fw 
**new_map, int new_len)
        chain->id++;
        chain->n_rules = new_len;
        old_map = chain->map;
-       chain->map = new_map;
+       chain->map = ptrs->map;
        IPFW_WUNLOCK(chain);
+
+       CTR4(KTR_NET, "%s: old: %p new: %p, backup: %d",
+               __func__, old_map, chain->map, ptrs->backup);
+
+       /* Finalize map change */
+       if (ptrs->backup) {
+               /*
+                * Old map has (at least) the same size as current map.
+                * Save it as backup map.
+                */
+               chain->b_allocated = chain->n_allocated;
+               chain->n_allocated = ptrs->items;
+               chain->b_map = old_map;
+               ptrs->free_map = NULL;
+               return old_map;
+       }
+
+       /*
+        * Backup map is not used. Free old map and use
+        * current map as candidate.
+        */
+
+       /* Set free pointer to old backup map */
+       ptrs->free_map = chain->b_map;
+       /* Set backup map to be old current map */
+       chain->b_map = old_map;
+
+       /* Save backup map size */
+       chain->b_allocated = chain->n_allocated;
+       chain->n_allocated = ptrs->items;
+
+       CTR6(KTR_NET, "%s: new: %p alloc: %d backup: %p alloc: %d free: %p",
+               __func__, chain->map, chain->n_allocated,
+               chain->b_map, chain->b_allocated, ptrs->free_map);
+
        return old_map;
 }
 
@@ -156,6 +255,7 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw 
*input_rule)
 {
        struct ip_fw *rule;
        int i, l, insert_before;
+       struct map_ptrs ptrs;
        struct ip_fw **map;     /* the new array of pointers */
 
        if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1)
@@ -164,12 +264,13 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw 
*input_rule)
        l = RULESIZE(input_rule);
        rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO);
        /* get_map returns with IPFW_UH_WLOCK if successful */
-       map = get_map(chain, 1, 0 /* not locked */);
-       if (map == NULL) {
+       if (get_map(chain, 1, 0 /* not locked */, &ptrs) == 0) {
                free(rule, M_IPFW);
                return ENOSPC;
        }
 
+       map = ptrs.map;
+
        bcopy(input_rule, rule, l);
        /* clear fields not settable from userland */
        rule->x_next = NULL;
@@ -201,7 +302,8 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw 
*input_rule)
        }
 
        rule->id = chain->id + 1;
-       map = swap_map(chain, map, chain->n_rules + 1);
+       swap_map(chain, &ptrs, chain->n_rules + 1);
+       map = ptrs.free_map;
        chain->static_len += l;
        IPFW_UH_WUNLOCK(chain);
        if (map)
@@ -283,6 +385,7 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
        uint32_t num;   /* rule number or old_set */
        uint8_t cmd, new_set;
        int start, end, i, ofs, n;
+       struct map_ptrs ptrs;
        struct ip_fw **map = NULL;
        int error = 0;
 
@@ -353,12 +456,13 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
                }
 
                /* We have something to delete. Allocate the new map */
-               map = get_map(chain, -n, 1 /* locked */);
-               if (map == NULL) {
+               if (get_map(chain, -n, 1 /* locked */, &ptrs) == 0) {
                        error = EINVAL;
                        break;
                }
 
+               map = ptrs.map;
+
                /* 1. bcopy the initial part of the map */
                if (start > 0)
                        bcopy(chain->map, map, start * sizeof(struct ip_fw *));
@@ -372,7 +476,7 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
                bcopy(chain->map + end, map + ofs,
                        (chain->n_rules - end) * sizeof(struct ip_fw *));
                /* 4. swap the maps (under BH_LOCK) */
-               map = swap_map(chain, map, chain->n_rules - n);
+               map = swap_map(chain, &ptrs, chain->n_rules - n);
                /* 5. now remove the rules deleted from the old map */
                for (i = start; i < end; i++) {
                        int l;
@@ -385,6 +489,8 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
                        rule->x_next = chain->reap;
                        chain->reap = rule;
                }
+               /* Free map if we need to */
+               map = ptrs.free_map;
                break;
 
        /*
-- 
1.7.9.4

>From baad4a88bba898baa3aa5f0dffc8a40182bcfcd6 Mon Sep 17 00:00:00 2001
From: Charlie Root <r...@test15.yandex.net>
Date: Thu, 12 Jul 2012 17:19:10 +0000
Subject: [PATCH 1/2] Make accounting in ipfw optional

---
 sys/netinet/ipfw/ip_fw2.c |   32 +++++++++++++++++++++-----------
 1 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c
index e5a13e4..7bb73b0 100644
--- a/sys/netinet/ipfw/ip_fw2.c
+++ b/sys/netinet/ipfw/ip_fw2.c
@@ -112,6 +112,8 @@ static int default_to_accept = 1;
 static int default_to_accept;
 #endif
 
+static int update_counters = 0;
+
 VNET_DEFINE(int, autoinc_step);
 
 VNET_DEFINE(unsigned int, fw_tables_max);
@@ -190,6 +192,8 @@ SYSCTL_VNET_INT(_net_inet6_ip6_fw, OID_AUTO, 
permit_single_frag6,
     "Permit single packet IPv6 fragments");
 #endif /* INET6 */
 
+SYSCTL_INT(_net_inet_ip_fw, OID_AUTO, update_counters, CTLFLAG_RW,
+    &update_counters, 0, "Update counters on match");
 SYSEND
 
 #endif /* SYSCTL_NODE */
@@ -2011,16 +2015,20 @@ do {                                                    
        \
                                break;
 
                        case O_COUNT:
-                               f->pcnt++;      /* update stats */
-                               f->bcnt += pktlen;
-                               f->timestamp = time_uptime;
+                               if (update_counters) {
+                                       f->pcnt++;      /* update stats */
+                                       f->bcnt += pktlen;
+                                       f->timestamp = time_uptime;
+                               }
                                l = 0;          /* exit inner loop */
                                break;
 
                        case O_SKIPTO:
-                           f->pcnt++;  /* update stats */
-                           f->bcnt += pktlen;
-                           f->timestamp = time_uptime;
+                           if (update_counters) {
+                                   f->pcnt++;  /* update stats */
+                                   f->bcnt += pktlen;
+                                   f->timestamp = time_uptime;
+                           }
                            /* If possible use cached f_pos (in f->next_rule),
                             * whose version is written in f->next_rule
                             * (horrible hacks to avoid changing the ABI).
@@ -2375,11 +2383,13 @@ do {                                                    
        \
        }               /* end of outer for, scan rules */
 
        if (done) {
-               struct ip_fw *rule = chain->map[f_pos];
-               /* Update statistics */
-               rule->pcnt++;
-               rule->bcnt += pktlen;
-               rule->timestamp = time_uptime;
+               if (update_counters) {
+                       struct ip_fw *rule = chain->map[f_pos];
+                       /* Update statistics */
+                       rule->pcnt++;
+                       rule->bcnt += pktlen;
+                       rule->timestamp = time_uptime;
+               }
        } else {
                retval = IP_FW_DENY;
                printf("ipfw: ouch!, skip past end of rules, denying packet\n");
-- 
1.7.6.1

diff --git a/sys/netinet/ip_fw.h b/sys/netinet/ip_fw.h
index 589c19c..b1f83bb 100644
--- a/sys/netinet/ip_fw.h
+++ b/sys/netinet/ip_fw.h
@@ -493,9 +493,14 @@ struct ip_fw {
        uint32_t        id;             /* rule id */
 
        /* These fields are present in all rules.                       */
+#if 0
        uint64_t        pcnt;           /* Packet counter               */
        uint64_t        bcnt;           /* Byte counter                 */
        uint32_t        timestamp;      /* tv_sec of last match         */
+#endif
+       uint64_t        _pad0;
+       uint64_t        _pad1;
+       uint32_t        cntr_idx;       /* Index in pcpu counters */
 
        ipfw_insn       cmd[1];         /* storage for commands         */
 };
diff --git a/sys/netinet/ipfw/ip_fw2.c b/sys/netinet/ipfw/ip_fw2.c
index 7bb73b0..db93a5c 100644
--- a/sys/netinet/ipfw/ip_fw2.c
+++ b/sys/netinet/ipfw/ip_fw2.c
@@ -2016,18 +2016,20 @@ do {                                                    
        \
 
                        case O_COUNT:
                                if (update_counters) {
-                                       f->pcnt++;      /* update stats */
-                                       f->bcnt += pktlen;
-                                       f->timestamp = time_uptime;
+                                       struct ip_fw_cntr *c = 
IPFW_PCPU_CNTR(chain, f->cntr_idx);
+                                       c->pcnt++;      /* update stats */
+                                       c->bcnt += pktlen;
+                                       c->timestamp = time_uptime;
                                }
                                l = 0;          /* exit inner loop */
                                break;
 
                        case O_SKIPTO:
                            if (update_counters) {
-                                   f->pcnt++;  /* update stats */
-                                   f->bcnt += pktlen;
-                                   f->timestamp = time_uptime;
+                                   struct ip_fw_cntr *c = 
IPFW_PCPU_CNTR(chain, f->cntr_idx);
+                                   c->pcnt++;  /* update stats */
+                                   c->bcnt += pktlen;
+                                   c->timestamp = time_uptime;
                            }
                            /* If possible use cached f_pos (in f->next_rule),
                             * whose version is written in f->next_rule
@@ -2125,9 +2127,10 @@ do {                                                     
        \
                                        break;
                                }
 
-                               f->pcnt++;      /* update stats */
-                               f->bcnt += pktlen;
-                               f->timestamp = time_uptime;
+                               struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, 
f->cntr_idx);
+                               c->pcnt++;      /* update stats */
+                               c->bcnt += pktlen;
+                               c->timestamp = time_uptime;
                                stack = (uint16_t *)(mtag + 1);
 
                                /*
@@ -2263,9 +2266,10 @@ do {                                                     
        \
                        case O_SETFIB: {
                                uint32_t fib;
 
-                               f->pcnt++;      /* update stats */
-                               f->bcnt += pktlen;
-                               f->timestamp = time_uptime;
+                               struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, 
f->cntr_idx);
+                               c->pcnt++;      /* update stats */
+                               c->bcnt += pktlen;
+                               c->timestamp = time_uptime;
                                fib = (cmd->arg1 == IP_FW_TABLEARG) ? tablearg:
                                    cmd->arg1;
                                if (fib >= rt_numfibs)
@@ -2315,8 +2319,9 @@ do {                                                      
        \
                        case O_REASS: {
                                int ip_off;
 
-                               f->pcnt++;
-                               f->bcnt += pktlen;
+                               struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, 
f->cntr_idx);
+                               c->pcnt++;
+                               c->bcnt += pktlen;
                                l = 0;  /* in any case exit inner loop */
                                ip_off = ntohs(ip->ip_off);
 
@@ -2386,9 +2391,10 @@ do {                                                     
        \
                if (update_counters) {
                        struct ip_fw *rule = chain->map[f_pos];
                        /* Update statistics */
-                       rule->pcnt++;
-                       rule->bcnt += pktlen;
-                       rule->timestamp = time_uptime;
+                       struct ip_fw_cntr *c = IPFW_PCPU_CNTR(chain, 
rule->cntr_idx);
+                       c->pcnt++;
+                       c->bcnt += pktlen;
+                       c->timestamp = time_uptime;
                }
        } else {
                retval = IP_FW_DENY;
@@ -2439,6 +2445,7 @@ ipfw_init(void)
 {
        int error = 0;
 
+       _dpcpu_init();
        ipfw_dyn_attach();
        /*
         * Only print out this stuff the first time around,
@@ -2500,6 +2507,7 @@ ipfw_destroy(void)
 
        ipfw_log_bpf(0); /* uninit */
        ipfw_dyn_detach();
+       _dpcpu_uninit();
        printf("IP firewall unloaded\n");
 }
 
@@ -2546,6 +2554,9 @@ vnet_ipfw_init(const void *unused)
                return (ENOSPC);
        }
 
+       /* Init per-cpu counters */
+       ipfw_init_counters(chain);
+
        /* fill and insert the default rule */
        rule->act_ofs = 0;
        rule->rulenum = IPFW_DEFAULT_RULE;
@@ -2559,6 +2570,9 @@ vnet_ipfw_init(const void *unused)
        IPFW_LOCK_INIT(chain);
        ipfw_dyn_init();
 
+       /* Allocate first pcpu counter */
+       rule->cntr_idx = ipfw_alloc_counter(chain);
+
        /* First set up some values that are compile time options */
        V_ipfw_vnet_ready = 1;          /* Open for business */
 
@@ -2619,10 +2633,11 @@ vnet_ipfw_uninit(const void *unused)
        }
        if (chain->map)
                free(chain->map, M_IPFW);
+       _dpcpu_free(chain->cntrs);
        IPFW_WUNLOCK(chain);
        IPFW_UH_WUNLOCK(chain);
        if (reap != NULL)
-               ipfw_reap_rules(reap);
+               ipfw_reap_rules(chain, reap);
        IPFW_LOCK_DESTROY(chain);
        ipfw_dyn_uninit(1);     /* free the remaining parts */
        return 0;
diff --git a/sys/netinet/ipfw/ip_fw_private.h b/sys/netinet/ipfw/ip_fw_private.h
index 46d011c..bd4fad3 100644
--- a/sys/netinet/ipfw/ip_fw_private.h
+++ b/sys/netinet/ipfw/ip_fw_private.h
@@ -222,6 +222,7 @@ struct ip_fw_chain {
        struct radix_node_head **tables;        /* IPv4 tables */
        struct radix_node_head **xtables;       /* extended tables */
        uint8_t         *tabletype;     /* Array of table types */
+       uintptr_t       *cntrs;         /* Per-cpu counters */
 #if defined( __linux__ ) || defined( _WIN32 )
        spinlock_t rwmtx;
        spinlock_t uh_lock;
@@ -231,8 +232,25 @@ struct ip_fw_chain {
 #endif
        uint32_t        id;             /* ruleset id */
        uint32_t        gencnt;         /* generation count */
+       int             cntr_allocated; /* Size of allocated counters map */
+       uint32_t        *cntr_idx;      /* bit index of counters map */
 };
 
+struct ip_fw_cntr {
+       uint64_t        pcnt;           /* Packet counter               */
+       uint64_t        bcnt;           /* Byte counter                 */
+       uint32_t        timestamp;      /* tv_sec of last match         */
+};
+#define        IPFW_PCPU_CNTR(chain, idx)      ((struct ip_fw_cntr 
*)(chain)->cntrs[curcpu] + idx)
+#define        IPFW_PCPU_DEFAULT_COUNTERS      128
+void _dpcpu_init(void);
+void _dpcpu_uninit(void);
+uintptr_t *_dpcpu_alloc(size_t size, int flags);
+void _dpcpu_free(uintptr_t *ptr);
+void ipfw_init_counters(struct ip_fw_chain *chain);
+int ipfw_alloc_counter(struct ip_fw_chain *chain);
+void ipfw_free_counter(struct ip_fw_chain *chain, int idx);
+
 struct sockopt;        /* used by tcp_var.h */
 
 /*
@@ -267,7 +285,10 @@ int ipfw_find_rule(struct ip_fw_chain *chain, uint32_t 
key, uint32_t id);
 int ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule);
 int ipfw_ctl(struct sockopt *sopt);
 int ipfw_chk(struct ip_fw_args *args);
-void ipfw_reap_rules(struct ip_fw *head);
+void ipfw_reap_rules(struct ip_fw_chain *chain, struct ip_fw *head);
+
+void ipfw_zero_counter(struct ip_fw_chain *chain, int idx);
+void ipfw_export_counter(struct ip_fw_chain *chain, int idx, struct ip_fw_cntr 
*cntr);
 
 /* In ip_fw_pfil */
 int ipfw_check_hook(void *arg, struct mbuf **m0, struct ifnet *ifp, int dir,
diff --git a/sys/netinet/ipfw/ip_fw_sockopt.c b/sys/netinet/ipfw/ip_fw_sockopt.c
index 0aecc2c..db308c2 100644
--- a/sys/netinet/ipfw/ip_fw_sockopt.c
+++ b/sys/netinet/ipfw/ip_fw_sockopt.c
@@ -45,6 +45,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/malloc.h>
 #include <sys/mbuf.h>  /* struct m_tag used by nested headers */
 #include <sys/kernel.h>
+#include <sys/smp.h>
 #include <sys/lock.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
@@ -72,6 +73,212 @@ MALLOC_DEFINE(M_IPFW, "IpFw/IpAcct", "IpFw/IpAcct chain's");
  * static variables followed by global ones (none in this file)
  */
 
+MALLOC_DEFINE(M_XDPCPU, "XPCPU", "PCPU data");
+
+uma_zone_t _dpcpu_zone;
+
+static void ipfw_extend_counters(struct ip_fw_chain *chain);
+
+void
+_dpcpu_init()
+{
+       _dpcpu_zone = uma_zcreate("pcpu ptrs", sizeof(uintptr_t) * MAXCPU,
+               NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+
+void
+_dpcpu_uninit()
+{
+       uma_zdestroy(_dpcpu_zone);
+}
+
+#define        _DPCPU_ALIGN    512
+uintptr_t *
+_dpcpu_alloc(size_t size, int flags)
+{
+       uintptr_t *ptr;
+       int id, fail = 0;
+
+       if (size % _DPCPU_ALIGN)
+               size += _DPCPU_ALIGN - (size % _DPCPU_ALIGN);
+
+       if ((ptr = uma_zalloc(_dpcpu_zone, flags | M_ZERO)) == NULL)
+               return NULL;
+
+       CPU_FOREACH(id) {
+               /*
+                * We believe in aligned allocation here
+                */
+               if ((ptr[id] = (uintptr_t)malloc(size, M_XDPCPU, flags)) == 0) {
+                       fail = 1;
+                       break;
+               }
+
+               KASSERT((uintptr_t)ptr % _DPCPU_ALIGN == 0, ("malloc(9) returns 
unaligned data :("));
+       }
+
+       if (fail == 0)
+               return ptr;
+
+       CPU_FOREACH(id) {
+               if (ptr[id] == 0)
+                       break;
+
+               free((void *)ptr[id], M_XDPCPU);
+       }
+
+       uma_zfree(_dpcpu_zone, ptr);
+
+       return NULL;
+}
+
+void
+_dpcpu_free(uintptr_t *ptr)
+{
+       int id;
+
+       CPU_FOREACH(id) {
+               if (ptr[id] == 0)
+                       continue;
+               free((void *)ptr[id], M_XDPCPU);
+       }
+
+       uma_zfree(_dpcpu_zone, ptr);
+}
+
+void
+ipfw_init_counters(struct ip_fw_chain *chain)
+{
+       chain->cntrs = _dpcpu_alloc(IPFW_PCPU_DEFAULT_COUNTERS *
+               sizeof(struct ip_fw_cntr), M_WAITOK | M_ZERO);
+       chain->cntr_idx = malloc(IPFW_PCPU_DEFAULT_COUNTERS / 8, M_IPFW, 
M_WAITOK);
+       /* Free positions are marked by 1 */
+       memset(chain->cntr_idx, 0xFF, IPFW_PCPU_DEFAULT_COUNTERS / 8);
+       chain->cntr_allocated = IPFW_PCPU_DEFAULT_COUNTERS;
+}
+
+
+static void
+ipfw_extend_counters(struct ip_fw_chain *chain)
+{
+       uintptr_t *new_ptr, *old_ptr;
+       uint32_t *old_idx, *new_idx;
+       int i, id;
+
+       for (;;) {
+
+               i = 2 * chain->cntr_allocated;
+
+               new_ptr = _dpcpu_alloc(i * sizeof(struct ip_fw_cntr), M_WAITOK 
| M_ZERO);
+               new_idx = malloc(i / 8, M_IPFW, M_WAITOK);
+               /* Free positions are marked by 1 */
+               memset(new_idx, 0xFF, i / 8);
+
+               IPFW_UH_WLOCK(chain);
+               if (i == 2 * chain->cntr_allocated) {
+                       CTR4(KTR_NET, "%s: NEW INDEX: %p cntr_idx: %d count: 
%d", __func__, new_ptr, new_idx, i);
+                       break;
+               }
+               /* otherwise we lost the race, free and retry */
+               IPFW_UH_WUNLOCK(chain);
+
+               free(new_idx, M_IPFW);
+               _dpcpu_free(new_ptr);
+       }
+
+       /* Pointers allocated, let's migrate data */
+
+       IPFW_WLOCK(chain);
+       /* Copy pcpu counters */
+       CPU_FOREACH(id) {
+               memcpy((void *)new_ptr[id], (void *)chain->cntrs[id], 
chain->cntr_allocated * sizeof(struct ip_fw_cntr));
+       }
+       /* Copy index */
+       memcpy(new_idx, chain->cntr_idx, chain->cntr_allocated / 8);
+       /* Swap old and new pointers */
+       old_ptr = chain->cntrs;
+       chain->cntrs = new_ptr;
+       old_idx = chain->cntr_idx;
+       chain->cntr_idx = new_idx;
+       chain->cntr_allocated = i;
+
+       IPFW_WUNLOCK(chain);
+       IPFW_UH_WUNLOCK(chain);
+
+       /* Free old counters and index */
+       free(old_idx, M_IPFW);
+       _dpcpu_free(old_ptr);
+}
+
+int
+ipfw_alloc_counter(struct ip_fw_chain *chain)
+{
+       int i, x, idx;
+       uint32_t *v;
+
+       for (;;) {
+               IPFW_UH_WLOCK(chain);
+               for (i = 0, v = chain->cntr_idx; i < chain->cntr_allocated / 8 
/ 4; i++, v++) {
+                       if ((x = ffsl(*v)) != 0) {
+                               /* Found. Mark as used */
+                               *v &= ~ (1 << (x - 1));
+                               IPFW_UH_WUNLOCK(chain);
+                               idx = i * 32 + x - 1;
+                               ipfw_zero_counter(chain, idx);
+                               CTR3(KTR_NET, "%s: allocated counter %d, map 
after: %X", __func__, idx, *v);
+                               return idx;
+                       }
+               }
+               IPFW_UH_WUNLOCK(chain);
+
+               /* Not found. Let's allocate data and retry */
+               ipfw_extend_counters(chain);
+       }
+}
+
+void
+ipfw_free_counter(struct ip_fw_chain *chain, int idx)
+{
+       uint32_t *v;
+
+       IPFW_UH_WLOCK(chain);
+       v = chain->cntr_idx;
+       v += idx / 32;
+       CTR3(KTR_NET, "%s: returned counter %d, map before: %X", __func__, idx, 
*v);
+       idx %= 32;
+       *v |= 1 << idx;
+
+       IPFW_UH_WUNLOCK(chain);
+}
+
+void
+ipfw_zero_counter(struct ip_fw_chain *chain, int idx)
+{
+       int id;
+       struct ip_fw_cntr *pcpu;
+
+       CTR2(KTR_NET, "%s: zeroing counter %d", __func__, idx);
+       CPU_FOREACH(id) {
+               pcpu = ((struct ip_fw_cntr *)chain->cntrs[id]) + idx;
+               memset(pcpu, 0, sizeof(struct ip_fw_cntr));
+       }
+}
+
+void
+ipfw_export_counter(struct ip_fw_chain *chain, int idx, struct ip_fw_cntr 
*cntr)
+{
+       int id;
+       struct ip_fw_cntr *pcpu;
+
+       CPU_FOREACH(id) {
+               pcpu = ((struct ip_fw_cntr *)chain->cntrs[id]) + idx;
+               cntr->pcnt += pcpu->pcnt;
+               cntr->bcnt += pcpu->bcnt;
+               if (cntr->timestamp < pcpu->timestamp)
+                       cntr->timestamp = pcpu->timestamp;
+       }
+}
+
 /*
  * Find the smallest rule >= key, id.
  * We could use bsearch but it is so simple that we code it directly
@@ -155,7 +362,7 @@ int
 ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw *input_rule)
 {
        struct ip_fw *rule;
-       int i, l, insert_before;
+       int i, l, cntr_idx, insert_before;
        struct ip_fw **map;     /* the new array of pointers */
 
        if (chain->rules == NULL || input_rule->rulenum > IPFW_DEFAULT_RULE-1)
@@ -163,12 +370,14 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw 
*input_rule)
 
        l = RULESIZE(input_rule);
        rule = malloc(l, M_IPFW, M_WAITOK | M_ZERO);
+       cntr_idx = ipfw_alloc_counter(chain);
        if (rule == NULL)
                return (ENOSPC);
        /* get_map returns with IPFW_UH_WLOCK if successful */
        map = get_map(chain, 1, 0 /* not locked */);
        if (map == NULL) {
                free(rule, M_IPFW);
+               ipfw_free_counter(chain, cntr_idx);
                return ENOSPC;
        }
 
@@ -176,10 +385,12 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw 
*input_rule)
        /* clear fields not settable from userland */
        rule->x_next = NULL;
        rule->next_rule = NULL;
+       rule->cntr_idx = cntr_idx;
+#if 0
        rule->pcnt = 0;
        rule->bcnt = 0;
        rule->timestamp = 0;
-
+#endif
        if (V_autoinc_step < 1)
                V_autoinc_step = 1;
        else if (V_autoinc_step > 1000)
@@ -217,12 +428,13 @@ ipfw_add_rule(struct ip_fw_chain *chain, struct ip_fw 
*input_rule)
  * A NULL pointer on input is handled correctly.
  */
 void
-ipfw_reap_rules(struct ip_fw *head)
+ipfw_reap_rules(struct ip_fw_chain *chain, struct ip_fw *head)
 {
        struct ip_fw *rule;
 
        while ((rule = head) != NULL) {
                head = head->x_next;
+               ipfw_free_counter(chain, rule->cntr_idx);
                free(rule, M_IPFW);
        }
 }
@@ -424,7 +636,7 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
        rule = chain->reap;
        chain->reap = NULL;
        IPFW_UH_WUNLOCK(chain);
-       ipfw_reap_rules(rule);
+       ipfw_reap_rules(chain, rule);
        if (map)
                free(map, M_IPFW);
        return error;
@@ -436,13 +648,17 @@ del_entry(struct ip_fw_chain *chain, uint32_t arg)
  * so we only care that rules do not disappear.
  */
 static void
-clear_counters(struct ip_fw *rule, int log_only)
+clear_counters(struct ip_fw_chain *chain, struct ip_fw *rule, int log_only)
 {
        ipfw_insn_log *l = (ipfw_insn_log *)ACTION_PTR(rule);
 
        if (log_only == 0) {
+#if 0
                rule->bcnt = rule->pcnt = 0;
                rule->timestamp = 0;
+#endif
+       ipfw_zero_counter(chain, rule->cntr_idx);
+
        }
        if (l->o.opcode == O_LOG)
                l->log_left = l->max_log;
@@ -481,7 +697,7 @@ zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int 
log_only)
                        /* Skip rules not in our set. */
                        if (cmd == 1 && rule->set != set)
                                continue;
-                       clear_counters(rule, log_only);
+                       clear_counters(chain, rule, log_only);
                }
                msg = log_only ? "All logging counts reset" :
                    "Accounting cleared";
@@ -491,7 +707,7 @@ zero_entry(struct ip_fw_chain *chain, u_int32_t arg, int 
log_only)
                        rule = chain->map[i];
                        if (rule->rulenum == rulenum) {
                                if (cmd == 0 || rule->set == set)
-                                       clear_counters(rule, log_only);
+                                       clear_counters(chain, rule, log_only);
                                cleared = 1;
                        }
                        if (rule->rulenum > rulenum)
@@ -854,7 +1070,7 @@ struct ip_fw7 {
        ipfw_insn       cmd[1];         /* storage for commands     */
 };
 
-       int convert_rule_to_7(struct ip_fw *rule);
+int convert_rule_to_7(struct ip_fw_chain *chain, struct ip_fw *rule);
 int convert_rule_to_8(struct ip_fw *rule);
 
 #ifndef RULESIZE7
@@ -887,7 +1103,7 @@ ipfw_getrules(struct ip_fw_chain *chain, void *buf, size_t 
space)
                    if (bp + l + sizeof(uint32_t) <= ep) {
                        int error;
                        bcopy(rule, bp, l + sizeof(uint32_t));
-                       error = convert_rule_to_7((struct ip_fw *) bp);
+                       error = convert_rule_to_7(chain, (struct ip_fw *) bp);
                        if (error)
                                return 0; /*XXX correct? */
                        /*
@@ -913,14 +1129,19 @@ ipfw_getrules(struct ip_fw_chain *chain, void *buf, 
size_t space)
                }
                dst = (struct ip_fw *)bp;
                bcopy(rule, dst, l);
+               /* copy statistics */
+               struct ip_fw_cntr *c = (struct ip_fw_cntr *)&dst->_pad0;
+               ipfw_export_counter(chain, rule->cntr_idx, c);
+               if (c->timestamp)
+                       c->timestamp += boot_seconds;
+
                /*
                 * XXX HACK. Store the disable mask in the "next"
                 * pointer in a wild attempt to keep the ABI the same.
                 * Why do we do this on EVERY rule?
                 */
                bcopy(&V_set_disable, &dst->next_rule, sizeof(V_set_disable));
-               if (dst->timestamp)
-                       dst->timestamp += boot_seconds;
+
                bp += l;
        }
        ipfw_get_dynamic(&bp, ep); /* protected by the dynamic lock */
@@ -1051,7 +1272,7 @@ ipfw_ctl(struct sockopt *sopt)
                        size = RULESIZE(rule);
                        if (!error && sopt->sopt_dir == SOPT_GET) {
                                if (is7) {
-                                       error = convert_rule_to_7(rule);
+                                       error = convert_rule_to_7(chain, rule);
                                        size = RULESIZE7(rule);
                                        if (error)
                                                return error;
@@ -1329,7 +1550,7 @@ ipfw_ctl(struct sockopt *sopt)
 
 /* Functions to convert rules 7.2 <==> 8.0 */
 int
-convert_rule_to_7(struct ip_fw *rule)
+convert_rule_to_7(struct ip_fw_chain *chain, struct ip_fw *rule)
 {
        /* Used to modify original rule */
        struct ip_fw7 *rule7 = (struct ip_fw7 *)rule;
@@ -1355,9 +1576,12 @@ convert_rule_to_7(struct ip_fw *rule)
        rule7->next_rule = (struct ip_fw7 *)tmp->next_rule;
        rule7->next = (struct ip_fw7 *)tmp->x_next;
        rule7->cmd_len = tmp->cmd_len;
+       ipfw_export_counter(chain, rule->cntr_idx, (struct ip_fw_cntr 
*)&rule7->pcnt);
+#if 0
        rule7->pcnt = tmp->pcnt;
        rule7->bcnt = tmp->bcnt;
        rule7->timestamp = tmp->timestamp;
+#endif
 
        /* Copy commands */
        for (ll = tmp->cmd_len, ccmd = tmp->cmd, dst = rule7->cmd ;
@@ -1429,10 +1653,11 @@ convert_rule_to_8(struct ip_fw *rule)
        rule->x_next = (struct ip_fw *)tmp->next;
        rule->cmd_len = tmp->cmd_len;
        rule->id = 0; /* XXX see if is ok = 0 */
+#if 0
        rule->pcnt = tmp->pcnt;
        rule->bcnt = tmp->bcnt;
        rule->timestamp = tmp->timestamp;
-
+#endif
        free (tmp, M_TEMP);
        return 0;
 }
_______________________________________________
freebsd-net@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-net
To unsubscribe, send any mail to "freebsd-net-unsubscribe@freebsd.org"

Reply via email to