Previously, memory allocated for a map was not accounted. Therefore,
this memory could not be taken into consideration by the cgroups
memory controller.

This patch introduces the "BPF_F_ACCOUNT_MEM" flag which enables
the memory accounting for a map, and it can be set during
the map creation ("BPF_MAP_CREATE") in "map_flags".

When enabled, we account only that amount of memory which is charged
against the "RLIMIT_MEMLOCK" limit.

To validate the change, first we create the memory cgroup-v1 "test-map":

    # mkdir /sys/fs/cgroup/memory/test-map

And then we run the following program against the cgroup:

    $ cat test_map.c
    <..>
    int main() {
        usleep(3 * 1000000);
        assert(bpf_create_map(BPF_MAP_TYPE_HASH, 8, 16, 65536, 0) > 0);
        usleep(3 * 1000000);
    }
    # cgexec -g memory:test-map ./test_map &
    # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
    397312
    258048

    <after 3 sec the map has been created>

    # bpftool map list
    19: hash  flags 0x0
        key 8B  value 16B  max_entries 65536  memlock 5771264B
    # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
    401408
    262144

As we can see, the memory allocated for the map is not accounted, as
397312B + 5771264B > 401408B.

Next, we enable the accounting and re-run the test:

    $ cat test_map.c
    <..>
    int main() {
        usleep(3 * 1000000);
        assert(bpf_create_map(BPF_MAP_TYPE_HASH, 8, 16, 65536,
                              BPF_F_ACCOUNT_MEM) > 0);
        usleep(3 * 1000000);
    }
    # cgexec -g memory:test-map ./test_map &
    # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
    450560
    307200

    <after 3 sec the map has been created>

    # bpftool map list
    20: hash  flags 0x80
        key 8B  value 16B  max_entries 65536  memlock 5771264B
    # cat /sys/fs/cgroup/memory/test-map/memory{,.kmem}.usage_in_bytes
    6221824
    6078464

This time, the memory (including kmem) is accounted, as
450560B + 5771264B <= 6221824B.

Acked-by: Yonghong Song <y...@fb.com>
Signed-off-by: Martynas Pumputis <m...@lambda.lt>
---
 include/linux/bpf.h            |  5 +++--
 include/uapi/linux/bpf.h       |  3 +++
 kernel/bpf/arraymap.c          | 14 +++++++++-----
 kernel/bpf/bpf_lru_list.c      | 11 +++++++++--
 kernel/bpf/bpf_lru_list.h      |  1 +
 kernel/bpf/cpumap.c            | 12 +++++++++---
 kernel/bpf/devmap.c            | 10 ++++++++--
 kernel/bpf/hashtab.c           | 19 ++++++++++++++-----
 kernel/bpf/lpm_trie.c          | 19 +++++++++++++------
 kernel/bpf/queue_stack_maps.c  |  5 +++--
 kernel/bpf/reuseport_array.c   |  3 ++-
 kernel/bpf/stackmap.c          | 12 ++++++++----
 kernel/bpf/syscall.c           | 12 ++++++++----
 kernel/bpf/xskmap.c            |  9 +++++++--
 net/core/sock_map.c            | 13 +++++++++----
 tools/include/uapi/linux/bpf.h |  4 ++++
 16 files changed, 110 insertions(+), 42 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index e734f163bd0b..353a3f4304fe 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -79,7 +79,8 @@ struct bpf_map {
        u32 btf_value_type_id;
        struct btf *btf;
        bool unpriv_array;
-       /* 55 bytes hole */
+       bool account_mem;
+       /* 54 bytes hole */
 
        /* The 3rd and 4th cacheline with misc members to avoid false sharing
         * particularly with refcounting.
@@ -506,7 +507,7 @@ void bpf_map_put(struct bpf_map *map);
 int bpf_map_precharge_memlock(u32 pages);
 int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
 void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages);
-void *bpf_map_area_alloc(size_t size, int numa_node);
+void *bpf_map_area_alloc(size_t size, int numa_node, bool account_mem);
 void bpf_map_area_free(void *base);
 void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);
 
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 91c43884f295..cff34678caf6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -291,6 +291,9 @@ enum bpf_attach_type {
 /* Zero-initialize hash function seed. This should only be used for testing. */
 #define BPF_F_ZERO_SEED                (1U << 6)
 
+/* Enable memory accounting for map (only for BPF_MAP_CREATE) */
+#define BPF_F_ACCOUNT_MEM      (1U << 7)
+
 /* flags for BPF_PROG_QUERY */
 #define BPF_F_QUERY_EFFECTIVE  (1U << 0)
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 25632a75d630..86417f2e6f1b 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -34,14 +34,17 @@ static void bpf_array_free_percpu(struct bpf_array *array)
        }
 }
 
-static int bpf_array_alloc_percpu(struct bpf_array *array)
+static int bpf_array_alloc_percpu(struct bpf_array *array, bool account_mem)
 {
        void __percpu *ptr;
+       gfp_t gfp = GFP_USER | __GFP_NOWARN;
        int i;
 
+       if (account_mem)
+               gfp |= __GFP_ACCOUNT;
+
        for (i = 0; i < array->map.max_entries; i++) {
-               ptr = __alloc_percpu_gfp(array->elem_size, 8,
-                                        GFP_USER | __GFP_NOWARN);
+               ptr = __alloc_percpu_gfp(array->elem_size, 8, gfp);
                if (!ptr) {
                        bpf_array_free_percpu(array);
                        return -ENOMEM;
@@ -82,6 +85,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
        u32 elem_size, index_mask, max_entries;
        bool unpriv = !capable(CAP_SYS_ADMIN);
        u64 cost, array_size, mask64;
+       bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
        struct bpf_array *array;
 
        elem_size = round_up(attr->value_size, 8);
@@ -129,7 +133,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
                return ERR_PTR(ret);
 
        /* allocate all map elements and zero-initialize them */
-       array = bpf_map_area_alloc(array_size, numa_node);
+       array = bpf_map_area_alloc(array_size, numa_node, account_mem);
        if (!array)
                return ERR_PTR(-ENOMEM);
        array->index_mask = index_mask;
@@ -140,7 +144,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
        array->map.pages = cost;
        array->elem_size = elem_size;
 
-       if (percpu && bpf_array_alloc_percpu(array)) {
+       if (percpu && bpf_array_alloc_percpu(array, account_mem)) {
                bpf_map_area_free(array);
                return ERR_PTR(-ENOMEM);
        }
diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c
index e6ef4401a138..4d58537e0af2 100644
--- a/kernel/bpf/bpf_lru_list.c
+++ b/kernel/bpf/bpf_lru_list.c
@@ -7,6 +7,7 @@
 #include <linux/cpumask.h>
 #include <linux/spinlock.h>
 #include <linux/percpu.h>
+#include <linux/gfp.h>
 
 #include "bpf_lru_list.h"
 
@@ -646,12 +647,17 @@ static void bpf_lru_list_init(struct bpf_lru_list *l)
 }
 
 int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
+                bool account_mem,
                 del_from_htab_func del_from_htab, void *del_arg)
 {
+       gfp_t gfp = GFP_KERNEL;
        int cpu;
 
+       if (account_mem)
+               gfp |= __GFP_ACCOUNT;
+
        if (percpu) {
-               lru->percpu_lru = alloc_percpu(struct bpf_lru_list);
+               lru->percpu_lru = alloc_percpu_gfp(struct bpf_lru_list, gfp);
                if (!lru->percpu_lru)
                        return -ENOMEM;
 
@@ -665,7 +671,8 @@ int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
        } else {
                struct bpf_common_lru *clru = &lru->common_lru;
 
-               clru->local_list = alloc_percpu(struct bpf_lru_locallist);
+               clru->local_list = alloc_percpu_gfp(struct bpf_lru_locallist,
+                                                   gfp);
                if (!clru->local_list)
                        return -ENOMEM;
 
diff --git a/kernel/bpf/bpf_lru_list.h b/kernel/bpf/bpf_lru_list.h
index 7d4f89b7cb84..89566665592b 100644
--- a/kernel/bpf/bpf_lru_list.h
+++ b/kernel/bpf/bpf_lru_list.h
@@ -74,6 +74,7 @@ static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
 }
 
 int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
+                bool account_mem,
                 del_from_htab_func del_from_htab, void *delete_arg);
 void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
                      u32 elem_size, u32 nr_elems);
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8974b3755670..1e84bf78716e 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -81,6 +81,8 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
        struct bpf_cpu_map *cmap;
        int err = -ENOMEM;
        u64 cost;
+       bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
+       gfp_t gfp = GFP_KERNEL;
        int ret;
 
        if (!capable(CAP_SYS_ADMIN))
@@ -117,16 +119,20 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
                goto free_cmap;
        }
 
+       if (account_mem)
+               gfp |= __GFP_ACCOUNT;
        /* A per cpu bitfield with a bit per possible CPU in map  */
-       cmap->flush_needed = __alloc_percpu(cpu_map_bitmap_size(attr),
-                                           __alignof__(unsigned long));
+       cmap->flush_needed = __alloc_percpu_gfp(cpu_map_bitmap_size(attr),
+                                           __alignof__(unsigned long),
+                                           gfp);
        if (!cmap->flush_needed)
                goto free_cmap;
 
        /* Alloc array for possible remote "destination" CPUs */
        cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries *
                                           sizeof(struct bpf_cpu_map_entry *),
-                                          cmap->map.numa_node);
+                                          cmap->map.numa_node,
+                                          account_mem);
        if (!cmap->cpu_map)
                goto free_percpu;
 
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 191b79948424..acfc1b35aa51 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -90,6 +90,8 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
        struct bpf_dtab *dtab;
        int err = -EINVAL;
        u64 cost;
+       bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
+       gfp_t gfp;
 
        if (!capable(CAP_NET_ADMIN))
                return ERR_PTR(-EPERM);
@@ -120,16 +122,20 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 
        err = -ENOMEM;
 
+       gfp = GFP_KERNEL | __GFP_NOWARN;
+       if (account_mem)
+               gfp |= __GFP_ACCOUNT;
        /* A per cpu bitfield with a bit per possible net device */
        dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
                                                __alignof__(unsigned long),
-                                               GFP_KERNEL | __GFP_NOWARN);
+                                               gfp);
        if (!dtab->flush_needed)
                goto free_dtab;
 
        dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
                                              sizeof(struct bpf_dtab_netdev *),
-                                             dtab->map.numa_node);
+                                             dtab->map.numa_node,
+                                             account_mem);
        if (!dtab->netdev_map)
                goto free_dtab;
 
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 4b7c76765d9d..fc2f44451256 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -23,6 +23,7 @@
 
 #define HTAB_CREATE_FLAG_MASK                                          \
        (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |    \
+        BPF_F_ACCOUNT_MEM |                                            \
         BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ZERO_SEED)
 
 struct bucket {
@@ -139,27 +140,32 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
        return NULL;
 }
 
-static int prealloc_init(struct bpf_htab *htab)
+static int prealloc_init(struct bpf_htab *htab, bool account_mem)
 {
        u32 num_entries = htab->map.max_entries;
+       gfp_t gfp = GFP_USER | __GFP_NOWARN;
        int err = -ENOMEM, i;
 
        if (!htab_is_percpu(htab) && !htab_is_lru(htab))
                num_entries += num_possible_cpus();
 
        htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries,
-                                        htab->map.numa_node);
+                                        htab->map.numa_node,
+                                        account_mem);
        if (!htab->elems)
                return -ENOMEM;
 
        if (!htab_is_percpu(htab))
                goto skip_percpu_elems;
 
+       if (account_mem)
+               gfp |= __GFP_ACCOUNT;
+
        for (i = 0; i < num_entries; i++) {
                u32 size = round_up(htab->map.value_size, 8);
                void __percpu *pptr;
 
-               pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
+               pptr = __alloc_percpu_gfp(size, 8, gfp);
                if (!pptr)
                        goto free_elems;
                htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
@@ -173,6 +179,7 @@ static int prealloc_init(struct bpf_htab *htab)
                                   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
                                   offsetof(struct htab_elem, hash) -
                                   offsetof(struct htab_elem, lru_node),
+                                  account_mem,
                                   htab_lru_map_delete_node,
                                   htab);
        else
@@ -313,6 +320,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
         */
        bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
        bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
+       bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
        struct bpf_htab *htab;
        int err, i;
        u64 cost;
@@ -374,7 +382,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
        err = -ENOMEM;
        htab->buckets = bpf_map_area_alloc(htab->n_buckets *
                                           sizeof(struct bucket),
-                                          htab->map.numa_node);
+                                          htab->map.numa_node,
+                                          account_mem);
        if (!htab->buckets)
                goto free_htab;
 
@@ -389,7 +398,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
        }
 
        if (prealloc) {
-               err = prealloc_init(htab);
+               err = prealloc_init(htab, account_mem);
                if (err)
                        goto free_buckets;
 
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index abf1002080df..8421fdb816f3 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -277,16 +277,19 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
 }
 
 static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
-                                                const void *value)
+                                                const void *value,
+                                                bool account_mem)
 {
        struct lpm_trie_node *node;
        size_t size = sizeof(struct lpm_trie_node) + trie->data_size;
+       gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
 
        if (value)
                size += trie->map.value_size;
 
-       node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN,
-                           trie->map.numa_node);
+       if (account_mem)
+               gfp |= __GFP_ACCOUNT;
+       node = kmalloc_node(size, gfp, trie->map.numa_node);
        if (!node)
                return NULL;
 
@@ -327,7 +330,7 @@ static int trie_update_elem(struct bpf_map *map,
                goto out;
        }
 
-       new_node = lpm_trie_node_alloc(trie, value);
+       new_node = lpm_trie_node_alloc(trie, value, map->account_mem);
        if (!new_node) {
                ret = -ENOMEM;
                goto out;
@@ -394,7 +397,7 @@ static int trie_update_elem(struct bpf_map *map,
                goto out;
        }
 
-       im_node = lpm_trie_node_alloc(trie, NULL);
+       im_node = lpm_trie_node_alloc(trie, NULL, map->account_mem);
        if (!im_node) {
                ret = -ENOMEM;
                goto out;
@@ -542,6 +545,8 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
 static struct bpf_map *trie_alloc(union bpf_attr *attr)
 {
        struct lpm_trie *trie;
+       bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
+       gfp_t gfp = GFP_USER | __GFP_NOWARN;
        u64 cost = sizeof(*trie), cost_per_node;
        int ret;
 
@@ -558,7 +563,9 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
            attr->value_size > LPM_VAL_SIZE_MAX)
                return ERR_PTR(-EINVAL);
 
-       trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN);
+       if (account_mem)
+               gfp |= __GFP_ACCOUNT;
+       trie = kzalloc(sizeof(*trie), gfp);
        if (!trie)
                return ERR_PTR(-ENOMEM);
 
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index b384ea9f3254..040ec350af3d 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -11,7 +11,7 @@
 #include "percpu_freelist.h"
 
 #define QUEUE_STACK_CREATE_FLAG_MASK \
-       (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+       (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | BPF_F_ACCOUNT_MEM)
 
 
 struct bpf_queue_stack {
@@ -69,6 +69,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
        int ret, numa_node = bpf_map_attr_numa_node(attr);
        struct bpf_queue_stack *qs;
        u64 size, queue_size, cost;
+       bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
 
        size = (u64) attr->max_entries + 1;
        cost = queue_size = sizeof(*qs) + size * attr->value_size;
@@ -81,7 +82,7 @@ static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr)
        if (ret < 0)
                return ERR_PTR(ret);
 
-       qs = bpf_map_area_alloc(queue_size, numa_node);
+       qs = bpf_map_area_alloc(queue_size, numa_node, account_mem);
        if (!qs)
                return ERR_PTR(-ENOMEM);
 
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 18e225de80ff..a9a2709c7507 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -152,6 +152,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
        int err, numa_node = bpf_map_attr_numa_node(attr);
        struct reuseport_array *array;
        u64 cost, array_size;
+       bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
 
        if (!capable(CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);
@@ -170,7 +171,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
                return ERR_PTR(err);
 
        /* allocate all map elements and zero-initialize them */
-       array = bpf_map_area_alloc(array_size, numa_node);
+       array = bpf_map_area_alloc(array_size, numa_node, account_mem);
        if (!array)
                return ERR_PTR(-ENOMEM);
 
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index d43b14535827..46d37c7e09a2 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -61,13 +61,15 @@ static inline int stack_map_data_size(struct bpf_map *map)
                sizeof(struct bpf_stack_build_id) : sizeof(u64);
 }
 
-static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
+static int prealloc_elems_and_freelist(struct bpf_stack_map *smap,
+                                      bool account_mem)
 {
        u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
        int err;
 
        smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries,
-                                        smap->map.numa_node);
+                                        smap->map.numa_node,
+                                        account_mem);
        if (!smap->elems)
                return -ENOMEM;
 
@@ -90,6 +92,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
        u32 value_size = attr->value_size;
        struct bpf_stack_map *smap;
        u64 cost, n_buckets;
+       bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
        int err;
 
        if (!capable(CAP_SYS_ADMIN))
@@ -119,7 +122,8 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
        if (cost >= U32_MAX - PAGE_SIZE)
                return ERR_PTR(-E2BIG);
 
-       smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr));
+       smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr),
+                                 account_mem);
        if (!smap)
                return ERR_PTR(-ENOMEM);
 
@@ -141,7 +145,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
        if (err)
                goto free_smap;
 
-       err = prealloc_elems_and_freelist(smap);
+       err = prealloc_elems_and_freelist(smap, account_mem);
        if (err)
                goto put_buffers;
 
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b155cd17c1bd..13f2e1731a47 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -131,25 +131,29 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
                return map;
        map->ops = ops;
        map->map_type = type;
+       map->account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
        return map;
 }
 
-void *bpf_map_area_alloc(size_t size, int numa_node)
+void *bpf_map_area_alloc(size_t size, int numa_node, bool account_mem)
 {
        /* We definitely need __GFP_NORETRY, so OOM killer doesn't
         * trigger under memory pressure as we really just want to
         * fail instead.
         */
-       const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO;
+       gfp_t gfp = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO;
        void *area;
 
+       if (account_mem)
+               gfp |= __GFP_ACCOUNT;
+
        if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
-               area = kmalloc_node(size, GFP_USER | flags, numa_node);
+               area = kmalloc_node(size, GFP_USER | gfp, numa_node);
                if (area != NULL)
                        return area;
        }
 
-       return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags,
+       return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | gfp,
                                           __builtin_return_address(0));
 }
 
diff --git a/kernel/bpf/xskmap.c b/kernel/bpf/xskmap.c
index 686d244e798d..bbc1f142326f 100644
--- a/kernel/bpf/xskmap.c
+++ b/kernel/bpf/xskmap.c
@@ -20,6 +20,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
        int cpu, err = -EINVAL;
        struct xsk_map *m;
        u64 cost;
+       bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
+       gfp_t gfp = GFP_KERNEL;
 
        if (!capable(CAP_NET_ADMIN))
                return ERR_PTR(-EPERM);
@@ -49,7 +51,9 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 
        err = -ENOMEM;
 
-       m->flush_list = alloc_percpu(struct list_head);
+       if (account_mem)
+               gfp |= __GFP_ACCOUNT;
+       m->flush_list = alloc_percpu_gfp(struct list_head, gfp);
        if (!m->flush_list)
                goto free_m;
 
@@ -58,7 +62,8 @@ static struct bpf_map *xsk_map_alloc(union bpf_attr *attr)
 
        m->xsk_map = bpf_map_area_alloc(m->map.max_entries *
                                        sizeof(struct xdp_sock *),
-                                       m->map.numa_node);
+                                       m->map.numa_node,
+                                       account_mem);
        if (!m->xsk_map)
                goto free_percpu;
        return &m->map;
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index be6092ac69f8..eefcfd1294c0 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -18,13 +18,15 @@ struct bpf_stab {
        raw_spinlock_t lock;
 };
 
-#define SOCK_CREATE_FLAG_MASK                          \
-       (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+#define SOCK_CREATE_FLAG_MASK                                  \
+       (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |        \
+        BPF_F_ACCOUNT_MEM)
 
 static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 {
        struct bpf_stab *stab;
        u64 cost;
+       bool account_mem = (attr->map_flags & BPF_F_ACCOUNT_MEM);
        int err;
 
        if (!capable(CAP_NET_ADMIN))
@@ -56,7 +58,8 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 
        stab->sks = bpf_map_area_alloc(stab->map.max_entries *
                                       sizeof(struct sock *),
-                                      stab->map.numa_node);
+                                      stab->map.numa_node,
+                                      account_mem);
        if (stab->sks)
                return &stab->map;
        err = -ENOMEM;
@@ -788,6 +791,7 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
        struct bpf_htab *htab;
        int i, err;
        u64 cost;
+       bool account = (attr->map_flags & BPF_F_ACCOUNT_MEM);
 
        if (!capable(CAP_NET_ADMIN))
                return ERR_PTR(-EPERM);
@@ -823,7 +827,8 @@ static struct bpf_map *sock_hash_alloc(union bpf_attr *attr)
 
        htab->buckets = bpf_map_area_alloc(htab->buckets_num *
                                           sizeof(struct bpf_htab_bucket),
-                                          htab->map.numa_node);
+                                          htab->map.numa_node,
+                                          account);
        if (!htab->buckets) {
                err = -ENOMEM;
                goto free_htab;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 91c43884f295..a2b6cb200c9b 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -291,6 +291,9 @@ enum bpf_attach_type {
 /* Zero-initialize hash function seed. This should only be used for testing. */
 #define BPF_F_ZERO_SEED                (1U << 6)
 
+/* Enable memory accounting for map (only for BPF_MAP_CREATE) */
+#define BPF_F_ACCOUNT_MEM      (1U << 7)
+
 /* flags for BPF_PROG_QUERY */
 #define BPF_F_QUERY_EFFECTIVE  (1U << 0)
 
-- 
2.20.1
