[RFC PATCH 03/30] Lazy percpu counters

2022-08-30 Thread Suren Baghdasaryan
From: Kent Overstreet 

This patch adds lib/lazy-percpu-counter.c, which implements counters
that start out as atomics, but lazily switch to percpu mode if the
update rate crosses some threshold (arbitrarily set at 256 per second).
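
Example usage (an illustrative sketch based on the API declared in the
header below; a zero-initialized counter starts in atomic mode):

  struct lazy_percpu_counter nr_ops = {};

  lazy_percpu_counter_add(&nr_ops, 1);   /* atomic add while the rate is low */
  pr_info("nr_ops: %lld\n", lazy_percpu_counter_read(&nr_ops));
  lazy_percpu_counter_exit(&nr_ops);     /* frees percpu storage, if it was allocated */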

Signed-off-by: Kent Overstreet 
---
 include/linux/lazy-percpu-counter.h |  67 +
 lib/Kconfig |   3 +
 lib/Makefile|   2 +
 lib/lazy-percpu-counter.c   | 141 
 4 files changed, 213 insertions(+)
 create mode 100644 include/linux/lazy-percpu-counter.h
 create mode 100644 lib/lazy-percpu-counter.c

diff --git a/include/linux/lazy-percpu-counter.h b/include/linux/lazy-percpu-counter.h
new file mode 100644
index ..a22a2b9a9f32
--- /dev/null
+++ b/include/linux/lazy-percpu-counter.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Lazy percpu counters:
+ * (C) 2022 Kent Overstreet
+ *
+ * Lazy percpu counters start out in atomic mode, then switch to percpu mode if
+ * the update rate crosses some threshold.
+ *
+ * This means we don't have to decide between low memory overhead atomic
+ * counters and higher performance percpu counters - we can have our cake and
+ * eat it, too!
+ *
+ * Internally we use an atomic64_t, where the low bit indicates whether we're in
+ * percpu mode, and the high 8 bits are a secondary counter that's incremented
+ * when the counter is modified - meaning 55 bits of precision are available for
+ * the counter itself.
+ *
+ * lazy_percpu_counter is 16 bytes (on 64 bit machines), raw_lazy_percpu_counter
+ * is 8 bytes but requires a separate unsigned long to record when the counter
+ * wraps - because sometimes multiple counters are used together and can share
+ * the same timestamp.
+ */
+
+#ifndef _LINUX_LAZY_PERCPU_COUNTER_H
+#define _LINUX_LAZY_PERCPU_COUNTER_H
+
+struct raw_lazy_percpu_counter {
+   atomic64_t  v;
+};
+
+void __lazy_percpu_counter_exit(struct raw_lazy_percpu_counter *c);
+void __lazy_percpu_counter_add(struct raw_lazy_percpu_counter *c,
+  unsigned long *last_wrap, s64 i);
+s64 __lazy_percpu_counter_read(struct raw_lazy_percpu_counter *c);
+
+static inline void __lazy_percpu_counter_sub(struct raw_lazy_percpu_counter *c,
+unsigned long *last_wrap, s64 i)
+{
+   __lazy_percpu_counter_add(c, last_wrap, -i);
+}
+
+struct lazy_percpu_counter {
+   struct raw_lazy_percpu_counter  v;
+   unsigned long   last_wrap;
+};
+
+static inline void lazy_percpu_counter_exit(struct lazy_percpu_counter *c)
+{
+   __lazy_percpu_counter_exit(&c->v);
+}
+
+static inline void lazy_percpu_counter_add(struct lazy_percpu_counter *c, s64 i)
+{
+   __lazy_percpu_counter_add(&c->v, &c->last_wrap, i);
+}
+
+static inline void lazy_percpu_counter_sub(struct lazy_percpu_counter *c, s64 i)
+{
+   __lazy_percpu_counter_sub(&c->v, &c->last_wrap, i);
+}
+
+static inline s64 lazy_percpu_counter_read(struct lazy_percpu_counter *c)
+{
+   return __lazy_percpu_counter_read(&c->v);
+}
+
+#endif /* _LINUX_LAZY_PERCPU_COUNTER_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index dc1ab2ed1dc6..fc6dbc425728 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -498,6 +498,9 @@ config ASSOCIATIVE_ARRAY
 
  for more information.
 
+config LAZY_PERCPU_COUNTER
+   bool
+
 config HAS_IOMEM
bool
depends on !NO_IOMEM
diff --git a/lib/Makefile b/lib/Makefile
index ffabc30a27d4..cc7762748708 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -163,6 +163,8 @@ obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
 obj-$(CONFIG_DEBUG_LIST) += list_debug.o
 obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o
 
+obj-$(CONFIG_LAZY_PERCPU_COUNTER) += lazy-percpu-counter.o
+
 obj-$(CONFIG_BITREVERSE) += bitrev.o
 obj-$(CONFIG_LINEAR_RANGES) += linear_ranges.o
 obj-$(CONFIG_PACKING)  += packing.o
diff --git a/lib/lazy-percpu-counter.c b/lib/lazy-percpu-counter.c
new file mode 100644
index ..299ef36137ee
--- /dev/null
+++ b/lib/lazy-percpu-counter.c
@@ -0,0 +1,141 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * We use the high bits of the atomic counter for a secondary counter, which is
+ * incremented every time the counter is touched. When the secondary counter
+ * wraps, we check the time the counter last wrapped, and if it was recent
+ * enough that means the update frequency has crossed our threshold and we
+ * switch to percpu mode:
+ */
+#define COUNTER_MOD_BITS   8
+#define COUNTER_MOD_MASK   ~(~0ULL >> COUNTER_MOD_BITS)
+#define COUNTER_MOD_BITS_START (64 - COUNTER_MOD_BITS)
+
+/*
+ * We use the low bit of the counter to indicate whether we're in atomic mode
+ * (low bit clear), or percpu mode (low bit set, counter is a pointer to actual
+ * percpu counters:
+ */
+#define COUNTER_IS_PCPU_BIT1
+
+static 

[RFC PATCH 07/30] lib: add support for allocation tagging

2022-08-30 Thread Suren Baghdasaryan
Introduce CONFIG_ALLOC_TAGGING which provides definitions to easily
instrument allocators. It also registers an "alloc_tags" codetag type
with a debugfs interface to output allocation tag information.
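
Illustrative use in an allocator (a sketch, not part of this patch;
raw_alloc(), raw_free() and ref_for() are hypothetical helpers standing
in for allocator-specific code):

  static void *my_alloc(size_t size)
  {
          void *p = raw_alloc(size);

          if (p)
                  alloc_tag_add(ref_for(p), size); /* defines a tag for this callsite
                                                    * and accounts the allocation */
          return p;
  }

  static void my_free(void *p, size_t size)
  {
          alloc_tag_sub(ref_for(p), size);         /* unaccounts and clears the ref */
          raw_free(p);
  }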

Signed-off-by: Suren Baghdasaryan 
Co-developed-by: Kent Overstreet 
Signed-off-by: Kent Overstreet 
---
 include/asm-generic/codetag.lds.h |  14 +++
 include/asm-generic/vmlinux.lds.h |   3 +
 include/linux/alloc_tag.h |  66 +
 lib/Kconfig.debug |   5 +
 lib/Makefile  |   2 +
 lib/alloc_tag.c   | 158 ++
 scripts/module.lds.S  |   7 ++
 7 files changed, 255 insertions(+)
 create mode 100644 include/asm-generic/codetag.lds.h
 create mode 100644 include/linux/alloc_tag.h
 create mode 100644 lib/alloc_tag.c

diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h
new file mode 100644
index ..64f536b80380
--- /dev/null
+++ b/include/asm-generic/codetag.lds.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef __ASM_GENERIC_CODETAG_LDS_H
+#define __ASM_GENERIC_CODETAG_LDS_H
+
+#define SECTION_WITH_BOUNDARIES(_name) \
+   . = ALIGN(8);   \
+   __start_##_name = .;\
+   KEEP(*(_name))  \
+   __stop_##_name = .;
+
+#define CODETAG_SECTIONS() \
+   SECTION_WITH_BOUNDARIES(alloc_tags)
+
+#endif /* __ASM_GENERIC_CODETAG_LDS_H */
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 7515a465ec03..c2dc2a59ab2e 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -50,6 +50,8 @@
  *   [__nosave_begin, __nosave_end] for the nosave data
  */
 
+#include 
+
 #ifndef LOAD_OFFSET
 #define LOAD_OFFSET 0
 #endif
@@ -348,6 +350,7 @@
__start___dyndbg = .;   \
KEEP(*(__dyndbg))   \
__stop___dyndbg = .;\
+   CODETAG_SECTIONS()  \
LIKELY_PROFILE()\
BRANCH_PROFILE()\
TRACE_PRINTKS() \
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
new file mode 100644
index ..b3f589afb1c9
--- /dev/null
+++ b/include/linux/alloc_tag.h
@@ -0,0 +1,66 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * allocation tagging
+ */
+#ifndef _LINUX_ALLOC_TAG_H
+#define _LINUX_ALLOC_TAG_H
+
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * An instance of this structure is created in a special ELF section at every
+ * allocation callsite. At runtime, the special section is treated as
+ * an array of these. Embedded codetag utilizes codetag framework.
+ */
+struct alloc_tag {
+   struct codetag  ct;
+   unsigned long   last_wrap;
+   struct raw_lazy_percpu_counter  call_count;
+   struct raw_lazy_percpu_counter  bytes_allocated;
+} __aligned(8);
+
+static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct)
+{
+   return container_of(ct, struct alloc_tag, ct);
+}
+
+#define DEFINE_ALLOC_TAG(_alloc_tag)   \
+   static struct alloc_tag _alloc_tag __used __aligned(8)  \
+   __section("alloc_tags") = { .ct = CODE_TAG_INIT }
+
+#define alloc_tag_counter_read(counter)\
+   __lazy_percpu_counter_read(counter)
+
+static inline void __alloc_tag_sub(union codetag_ref *ref, size_t bytes)
+{
+   struct alloc_tag *tag = ct_to_alloc_tag(ref->ct);
+
+   __lazy_percpu_counter_add(&tag->call_count, &tag->last_wrap, -1);
+   __lazy_percpu_counter_add(&tag->bytes_allocated, &tag->last_wrap, -bytes);
+   ref->ct = NULL;
+}
+
+#define alloc_tag_sub(_ref, _bytes)\
+do {   \
+   if ((_ref) && (_ref)->ct)   \
+   __alloc_tag_sub(_ref, _bytes);  \
+} while (0)
+
+static inline void __alloc_tag_add(struct alloc_tag *tag, union codetag_ref *ref, size_t bytes)
+{
+   ref->ct = &tag->ct;
+   __lazy_percpu_counter_add(&tag->call_count, &tag->last_wrap, 1);
+   __lazy_percpu_counter_add(&tag->bytes_allocated, &tag->last_wrap, bytes);
+}
+
+#define alloc_tag_add(_ref, _bytes)\
+do {   \
+   DEFINE_ALLOC_TAG(_alloc_tag);   \
+   if (_ref && !WARN_ONCE(_ref->ct, "

[RFC PATCH 13/30] mm/slab: introduce SLAB_NO_OBJ_EXT to avoid obj_ext creation

2022-08-30 Thread Suren Baghdasaryan
Slab extension objects can't be allocated before slab infrastructure is
initialized. Some caches, like kmem_cache and kmem_cache_node, are created
before slab infrastructure is initialized. Objects from these caches can't
have extension objects. Introduce SLAB_NO_OBJ_EXT slab flag to mark these
caches and avoid creating extensions for objects allocated from these
slabs.

Signed-off-by: Suren Baghdasaryan 
---
 include/linux/slab.h | 7 +++
 mm/slab.c| 2 +-
 mm/slub.c| 5 +++--
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 0fefdf528e0d..55ae3ea864a4 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -124,6 +124,13 @@
 #define SLAB_RECLAIM_ACCOUNT   ((slab_flags_t __force)0x0002U)
 #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */
 
+#ifdef CONFIG_SLAB_OBJ_EXT
+/* Slab created using create_boot_cache */
+#define SLAB_NO_OBJ_EXT ((slab_flags_t __force)0x2000U)
+#else
+#define SLAB_NO_OBJ_EXT 0
+#endif
+
 /*
  * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
  *
diff --git a/mm/slab.c b/mm/slab.c
index 10e96137b44f..ba97aeef7ec1 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1233,7 +1233,7 @@ void __init kmem_cache_init(void)
create_boot_cache(kmem_cache, "kmem_cache",
offsetof(struct kmem_cache, node) +
  nr_node_ids * sizeof(struct kmem_cache_node *),
- SLAB_HWCACHE_ALIGN, 0, 0);
+ SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
list_add(&kmem_cache->list, &slab_caches);
slab_state = PARTIAL;
 
diff --git a/mm/slub.c b/mm/slub.c
index 862dbd9af4f5..80199d5ac7c9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4825,7 +4825,8 @@ void __init kmem_cache_init(void)
node_set(node, slab_nodes);
 
create_boot_cache(kmem_cache_node, "kmem_cache_node",
-   sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0);
+   sizeof(struct kmem_cache_node),
+   SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
 
register_hotmemory_notifier(&slab_memory_callback_nb);
 
@@ -4835,7 +4836,7 @@ void __init kmem_cache_init(void)
create_boot_cache(kmem_cache, "kmem_cache",
offsetof(struct kmem_cache, node) +
nr_node_ids * sizeof(struct kmem_cache_node *),
-  SLAB_HWCACHE_ALIGN, 0, 0);
+   SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0);
 
kmem_cache = bootstrap(&boot_kmem_cache);
kmem_cache_node = bootstrap(&boot_kmem_cache_node);
-- 
2.37.2.672.g94769d06f0-goog




[RFC PATCH 01/30] kernel/module: move find_kallsyms_symbol_value declaration

2022-08-30 Thread Suren Baghdasaryan
Allow find_kallsyms_symbol_value to be called by code outside of
kernel/module. It will be used for code tagging module support.
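
For example, the code tagging module support added later in this series
can resolve section boundary symbols in a loaded module (illustrative;
"alloc_tags" is one of the sections this series defines):

  unsigned long start = find_kallsyms_symbol_value(mod, "__start_alloc_tags");
  unsigned long stop  = find_kallsyms_symbol_value(mod, "__stop_alloc_tags");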

Signed-off-by: Suren Baghdasaryan 
---
 include/linux/module.h   | 1 +
 kernel/module/internal.h | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/module.h b/include/linux/module.h
index 518296ea7f73..563d38ad84ed 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -605,6 +605,7 @@ struct module *find_module(const char *name);
 int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
char *name, char *module_name, int *exported);
 
+unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);
 /* Look for this name: can be of form module:name. */
 unsigned long module_kallsyms_lookup_name(const char *name);
 
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index 680d980a4fb2..f1b6c477bd93 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -246,7 +246,6 @@ static inline void kmemleak_load_module(const struct module *mod,
 void init_build_id(struct module *mod, const struct load_info *info);
 void layout_symtab(struct module *mod, struct load_info *info);
 void add_kallsyms(struct module *mod, const struct load_info *info);
-unsigned long find_kallsyms_symbol_value(struct module *mod, const char *name);
 
 static inline bool sect_empty(const Elf_Shdr *sect)
 {
-- 
2.37.2.672.g94769d06f0-goog




[RFC PATCH 02/30] lib/string_helpers: Drop space in string_get_size's output

2022-08-30 Thread Suren Baghdasaryan
From: Kent Overstreet 

Previously, string_get_size() emitted a space between the number and
the units, e.g.
  9.88 MiB

This changes it to
  9.88MiB

which allows it to be parsed correctly by the 'sort -h' command.
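
For example (STRING_UNITS_2 selects base-2 units; the value shown is
approximate):

  char buf[10];

  string_get_size(10362000, 1, STRING_UNITS_2, buf, sizeof(buf));
  /* buf now holds "9.88MiB"; before this patch it held "9.88 MiB" */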

Signed-off-by: Kent Overstreet 
Cc: Andy Shevchenko 
Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: "Michael S. Tsirkin" 
Cc: Jason Wang 
Cc: "Noralf Trønnes" 
Cc: Jens Axboe 
---
 lib/string_helpers.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/string_helpers.c b/lib/string_helpers.c
index 5ed3beb066e6..3032d1b04ca3 100644
--- a/lib/string_helpers.c
+++ b/lib/string_helpers.c
@@ -126,8 +126,7 @@ void string_get_size(u64 size, u64 blk_size, const enum string_size_units units,
else
unit = units_str[units][i];
 
-   snprintf(buf, len, "%u%s %s", (u32)size,
-tmp, unit);
+   snprintf(buf, len, "%u%s%s", (u32)size, tmp, unit);
 }
 EXPORT_SYMBOL(string_get_size);
 
-- 
2.37.2.672.g94769d06f0-goog




[RFC PATCH 00/30] Code tagging framework and applications

2022-08-30 Thread Suren Baghdasaryan
s func:xfs_vn_get_link class:memory disabled "
  fs/xfs/xfs_mount.c:85 module:xfs func:xfs_uuid_mount class:memory disabled "

===
Latency tracking
===
This lets you instrument code for measuring latency with just two calls
to code_tag_time_stats_start() and code_tag_time_stats_finish(), and
makes statistics available in debugfs on a per-callsite basis.

Recorded statistics include total count, frequency/rate, average
duration, max duration, and event duration quantiles.

Additionally, this patch instruments prepare_to_wait() and finish_wait().
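
A sketch of the intended call pattern (the exact signatures come from
the latency tracking patch; the start_time argument and the waiting
code shown here are illustrative):

  code_tag_time_stats_start(start_time);    /* capture a timestamp at this callsite */
  wait_event(wq, condition);                /* the region being measured */
  code_tag_time_stats_finish(start_time);   /* record the duration for this callsite */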

Example output:

  fs/xfs/xfs_extent_busy.c:589 module:xfs func:xfs_extent_busy_flush
  count:  61
  rate:   0/sec
  frequency:  19 sec
  avg duration:   632 us
  max duration:   2 ms
  quantiles (us): 274 288 288 296 296 296 296 336 336 336 336 336 336 336 336

===
Improved error codes
===
Ever waste hours trying to figure out which line of code from some
obscure module is returning you -EINVAL and nothing else?

What if we had... more error codes?

This patch adds ERR(), which returns a unique error code that is related
to the error code passed to it: the original error code can be
recovered with error_class(), and errname() (as well as %pE) returns an
error string that includes the file and line number of the ERR() call.
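
A sketch of the intended usage (names per the description above; the
surrounding code is illustrative):

  return ERR(-EINVAL);                       /* a unique code tied to this file:line */

  /* a caller can still match on the original error: */
  if (error_class(err) == -EINVAL)
          pr_err("failed: %pE\n", ERR_PTR(err)); /* prints e.g. "-EINVAL at fs/foo.c:123" */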

Example output:

  VFS: Cannot open root device "sda" or unknown-block(8,0): error -EINVAL at fs/ext4/super.c:4387

===
Dynamic debug conversion to code tagging
===
There are several open coded implementations of the "define a special elf
section for objects and iterate" technique that should be converted to
code tagging. This series just converts dynamic debug; there are others
(multiple in ftrace, in particular) that should also be converted.

===

The patchset applies cleanly over Linux 6.0-rc3
The tree for testing is published at:
https://github.com/surenbaghdasaryan/linux/tree/alloc_tags_rfc

The structure of the patchset is:
- code tagging framework (patches 1-6)
- page allocation tracking (patches 7-10)
- slab allocation tracking (patches 11-16)
- allocation context capture (patches 17-21)
- dynamic fault injection (patch 22)
- latency tracking (patches 23-27)
- improved error codes (patch 28)
- dynamic debug conversion to code tagging (patch 29)
- MAINTAINERS update (patch 30)

Next steps:
- track and fix slab allocator leak mentioned earlier;
- instrument more allocators: vmalloc, per-cpu allocations, others?


Kent Overstreet (14):
  lib/string_helpers: Drop space in string_get_size's output
  Lazy percpu counters
  scripts/kallsyms: Always include __start and __stop symbols
  lib/string.c: strsep_no_empty()
  codetag: add codetag query helper functions
  Code tagging based fault injection
  timekeeping: Add a missing include
  wait: Clean up waitqueue_entry initialization
  lib/time_stats: New library for statistics on events
  bcache: Convert to lib/time_stats
  Code tagging based latency tracking
  Improved symbolic error names
  dyndbg: Convert to code tagging
  MAINTAINERS: Add entries for code tagging & related

Suren Baghdasaryan (16):
  kernel/module: move find_kallsyms_symbol_value declaration
  lib: code tagging framework
  lib: code tagging module support
  lib: add support for allocation tagging
  lib: introduce page allocation tagging
  change alloc_pages name in dma_map_ops to avoid name conflicts
  mm: enable page allocation tagging for __get_free_pages and
alloc_pages
  mm: introduce slabobj_ext to support slab object extensions
  mm: introduce __GFP_NO_OBJ_EXT flag to selectively prevent slabobj_ext
creation
  mm/slab: introduce SLAB_NO_OBJ_EXT to avoid obj_ext creation
  mm: prevent slabobj_ext allocations for slabobj_ext and kmem_cache
objects
  lib: introduce slab allocation tagging
  mm: enable slab allocation tagging for kmalloc and friends
  move stack capture functionality into a separate function for reuse
  lib: introduce support for storing code tag context
  lib: implement context capture support for page and slab allocators

 MAINTAINERS |  34 ++
 arch/x86/kernel/amd_gart_64.c   |   2 +-
 drivers/iommu/dma-iommu.c   |   2 +-
 drivers/md/bcache/Kconfig   |   1 +
 drivers/md/bcache/bcache.h  |   1 +
 drivers/md/bcache/bset.c|   8 +-
 drivers/md/bcache/bset.h|   1 +
 drivers/md/bcache/btree.c   |  12 +-
 drivers/md/bcache/super.c   |   3 +
 drivers/md/bcache/sysfs.c   |  43 ++-
 drivers/md/bcache/util.c|  30 --
 drivers/md/bcache/util.h|  57 ---
 drivers/xen/grant-dma-ops.c |   2 +-
 drivers/xen/swiotlb-xen.c   |   2 +-
 include/asm-generic/codetag.lds.h   |  18 +
 include/asm-generic/vmlinux.lds.h   |   8 +-
 include/linux/alloc_tag.h

[RFC PATCH 05/30] lib: code tagging framework

2022-08-30 Thread Suren Baghdasaryan
Add basic infrastructure to support code tagging which stores tag common
information consisting of the module name, function, file name and line
number. Provide functions to register a new code tag type and navigate
between code tags.
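
A minimal consumer sketch using this API ("my_tags" is a hypothetical
section name; the declarations are in the header below):

  static const struct codetag_type_desc desc = {
          .section  = "my_tags",
          .tag_size = sizeof(struct codetag),
  };
  struct codetag_type *cttype = codetag_register_type(&desc);
  struct codetag_iterator iter = codetag_get_ct_iter(cttype);
  struct codetag *ct;

  codetag_lock_module_list(cttype, true);
  while ((ct = codetag_next_ct(&iter)))
          pr_info("%s:%u\n", ct->filename, ct->lineno);
  codetag_lock_module_list(cttype, false);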

Co-developed-by: Kent Overstreet 
Signed-off-by: Kent Overstreet 
Signed-off-by: Suren Baghdasaryan 
---
 include/linux/codetag.h |  71 ++
 lib/Kconfig.debug   |   4 +
 lib/Makefile|   1 +
 lib/codetag.c   | 199 
 4 files changed, 275 insertions(+)
 create mode 100644 include/linux/codetag.h
 create mode 100644 lib/codetag.c

diff --git a/include/linux/codetag.h b/include/linux/codetag.h
new file mode 100644
index ..a9d7adecc2a5
--- /dev/null
+++ b/include/linux/codetag.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * code tagging framework
+ */
+#ifndef _LINUX_CODETAG_H
+#define _LINUX_CODETAG_H
+
+#include 
+
+struct codetag_iterator;
+struct codetag_type;
+struct seq_buf;
+struct module;
+
+/*
+ * An instance of this structure is created in a special ELF section at every
+ * code location being tagged.  At runtime, the special section is treated as
+ * an array of these.
+ */
+struct codetag {
+   unsigned int flags; /* used in later patches */
+   unsigned int lineno;
+   const char *modname;
+   const char *function;
+   const char *filename;
+} __aligned(8);
+
+union codetag_ref {
+   struct codetag *ct;
+};
+
+struct codetag_range {
+   struct codetag *start;
+   struct codetag *stop;
+};
+
+struct codetag_module {
+   struct module *mod;
+   struct codetag_range range;
+};
+
+struct codetag_type_desc {
+   const char *section;
+   size_t tag_size;
+};
+
+struct codetag_iterator {
+   struct codetag_type *cttype;
+   struct codetag_module *cmod;
+   unsigned long mod_id;
+   struct codetag *ct;
+};
+
+#define CODE_TAG_INIT {\
+   .modname= KBUILD_MODNAME,   \
+   .function   = __func__, \
+   .filename   = __FILE__, \
+   .lineno = __LINE__, \
+   .flags  = 0,\
+}
+
+void codetag_lock_module_list(struct codetag_type *cttype, bool lock);
+struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype);
+struct codetag *codetag_next_ct(struct codetag_iterator *iter);
+
+void codetag_to_text(struct seq_buf *out, struct codetag *ct);
+
+struct codetag_type *
+codetag_register_type(const struct codetag_type_desc *desc);
+
+#endif /* _LINUX_CODETAG_H */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index bcbe60d6c80c..22bc1eff7f8f 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -969,6 +969,10 @@ config DEBUG_STACKOVERFLOW
 
  If in doubt, say "N".
 
+config CODE_TAGGING
+   bool
+   select KALLSYMS
+
 source "lib/Kconfig.kasan"
 source "lib/Kconfig.kfence"
 
diff --git a/lib/Makefile b/lib/Makefile
index cc7762748708..574d7716e640 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -227,6 +227,7 @@ obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \
of-reconfig-notifier-error-inject.o
 obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 
+obj-$(CONFIG_CODE_TAGGING) += codetag.o
 lib-$(CONFIG_GENERIC_BUG) += bug.o
 
 obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
diff --git a/lib/codetag.c b/lib/codetag.c
new file mode 100644
index ..7708f8388e55
--- /dev/null
+++ b/lib/codetag.c
@@ -0,0 +1,199 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct codetag_type {
+   struct list_head link;
+   unsigned int count;
+   struct idr mod_idr;
+   struct rw_semaphore mod_lock; /* protects mod_idr */
+   struct codetag_type_desc desc;
+};
+
+static DEFINE_MUTEX(codetag_lock);
+static LIST_HEAD(codetag_types);
+
+void codetag_lock_module_list(struct codetag_type *cttype, bool lock)
+{
+   if (lock)
+   down_read(&cttype->mod_lock);
+   else
+   up_read(&cttype->mod_lock);
+}
+
+struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype)
+{
+   struct codetag_iterator iter = {
+   .cttype = cttype,
+   .cmod = NULL,
+   .mod_id = 0,
+   .ct = NULL,
+   };
+
+   return iter;
+}
+
+static inline struct codetag *get_first_module_ct(struct codetag_module *cmod)
+{
+   return cmod->range.start < cmod->range.stop ? cmod->range.start : NULL;
+}
+
+static inline
+struct codetag *get_next_module_ct(struct codetag_iterator *iter)
+{
+   struct codetag *res = (struct codetag *)
+   ((char *)iter->ct + iter->cttype->desc.tag_size);
+
+   return res < iter->cmod->range.stop ? res : NULL;

[RFC PATCH 06/30] lib: code tagging module support

2022-08-30 Thread Suren Baghdasaryan
Add support for code tagging from dynamically loaded modules.
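
A code tag type can now hook module load/unload through the two new
callbacks (sketch; my_module_load()/my_module_unload() are hypothetical):

  static const struct codetag_type_desc desc = {
          .section       = "my_tags",
          .tag_size      = sizeof(struct codetag),
          .module_load   = my_module_load,   /* called under mod_lock from codetag_load_module() */
          .module_unload = my_module_unload, /* called before the module's tags are dropped */
  };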

Signed-off-by: Suren Baghdasaryan 
Co-developed-by: Kent Overstreet 
Signed-off-by: Kent Overstreet 
---
 include/linux/codetag.h | 12 ++
 kernel/module/main.c|  4 
 lib/codetag.c   | 51 -
 3 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/include/linux/codetag.h b/include/linux/codetag.h
index a9d7adecc2a5..386733e89b31 100644
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -42,6 +42,10 @@ struct codetag_module {
 struct codetag_type_desc {
const char *section;
size_t tag_size;
+   void (*module_load)(struct codetag_type *cttype,
+   struct codetag_module *cmod);
+   void (*module_unload)(struct codetag_type *cttype,
+ struct codetag_module *cmod);
 };
 
 struct codetag_iterator {
@@ -68,4 +72,12 @@ void codetag_to_text(struct seq_buf *out, struct codetag *ct);
 struct codetag_type *
 codetag_register_type(const struct codetag_type_desc *desc);
 
+#ifdef CONFIG_CODE_TAGGING
+void codetag_load_module(struct module *mod);
+void codetag_unload_module(struct module *mod);
+#else
+static inline void codetag_load_module(struct module *mod) {}
+static inline void codetag_unload_module(struct module *mod) {}
+#endif
+
 #endif /* _LINUX_CODETAG_H */
diff --git a/kernel/module/main.c b/kernel/module/main.c
index a4e4d84b6f4e..d253277492fd 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -53,6 +53,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include "internal.h"
 
@@ -1151,6 +1152,7 @@ static void free_module(struct module *mod)
 {
trace_module_free(mod);
 
+   codetag_unload_module(mod);
mod_sysfs_teardown(mod);
 
/*
@@ -2849,6 +2851,8 @@ static int load_module(struct load_info *info, const char __user *uargs,
/* Get rid of temporary copy. */
free_copy(info, flags);
 
+   codetag_load_module(mod);
+
/* Done! */
trace_module_load(mod);
 
diff --git a/lib/codetag.c b/lib/codetag.c
index 7708f8388e55..f0a3174f9b71 100644
--- a/lib/codetag.c
+++ b/lib/codetag.c
@@ -157,8 +157,11 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod)
 
down_write(&cttype->mod_lock);
err = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL);
-   if (err >= 0)
+   if (err >= 0) {
cttype->count += range_size(cttype, &range);
+   if (cttype->desc.module_load)
+   cttype->desc.module_load(cttype, cmod);
+   }
up_write(&cttype->mod_lock);
 
if (err < 0) {
@@ -197,3 +200,49 @@ codetag_register_type(const struct codetag_type_desc *desc)
 
return cttype;
 }
+
+void codetag_load_module(struct module *mod)
+{
+   struct codetag_type *cttype;
+
+   if (!mod)
+   return;
+
+   mutex_lock(&codetag_lock);
+   list_for_each_entry(cttype, &codetag_types, link)
+   codetag_module_init(cttype, mod);
+   mutex_unlock(&codetag_lock);
+}
+
+void codetag_unload_module(struct module *mod)
+{
+   struct codetag_type *cttype;
+
+   if (!mod)
+   return;
+
+   mutex_lock(&codetag_lock);
+   list_for_each_entry(cttype, &codetag_types, link) {
+   struct codetag_module *found = NULL;
+   struct codetag_module *cmod;
+   unsigned long mod_id, tmp;
+
+   down_write(&cttype->mod_lock);
+   idr_for_each_entry_ul(&cttype->mod_idr, cmod, tmp, mod_id) {
+   if (cmod->mod && cmod->mod == mod) {
+   found = cmod;
+   break;
+   }
+   }
+   if (found) {
+   if (cttype->desc.module_unload)
+   cttype->desc.module_unload(cttype, cmod);
+
+   cttype->count -= range_size(cttype, &cmod->range);
+   idr_remove(&cttype->mod_idr, mod_id);
+   kfree(cmod);
+   }
+   up_write(&cttype->mod_lock);
+   }
+   mutex_unlock(&codetag_lock);
+}
-- 
2.37.2.672.g94769d06f0-goog




[RFC PATCH 04/30] scripts/kallsyms: Always include __start and __stop symbols

2022-08-30 Thread Suren Baghdasaryan
From: Kent Overstreet 

These symbols are used to denote section boundaries: by always including
them we can unify loading sections from modules with loading built-in
sections, which leads to some significant cleanup.

Signed-off-by: Kent Overstreet 
---
 scripts/kallsyms.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c
index f18e6dfc68c5..3d51639a595d 100644
--- a/scripts/kallsyms.c
+++ b/scripts/kallsyms.c
@@ -263,6 +263,11 @@ static int symbol_in_range(const struct sym_entry *s,
return 0;
 }
 
+static bool string_starts_with(const char *s, const char *prefix)
+{
+   return strncmp(s, prefix, strlen(prefix)) == 0;
+}
+
 static int symbol_valid(const struct sym_entry *s)
 {
const char *name = sym_name(s);
@@ -270,6 +275,14 @@ static int symbol_valid(const struct sym_entry *s)
/* if --all-symbols is not specified, then symbols outside the text
 * and inittext sections are discarded */
if (!all_symbols) {
+   /*
+* Symbols starting with __start and __stop are used to denote
+* section boundaries, and should always be included:
+*/
+   if (string_starts_with(name, "__start_") ||
+   string_starts_with(name, "__stop_"))
+   return 1;
+
if (symbol_in_range(s, text_ranges,
ARRAY_SIZE(text_ranges)) == 0)
return 0;
-- 
2.37.2.672.g94769d06f0-goog




[RFC PATCH 08/30] lib: introduce page allocation tagging

2022-08-30 Thread Suren Baghdasaryan
Introduce CONFIG_PAGE_ALLOC_TAGGING which provides helper functions to
easily instrument page allocators and adds a page_ext field to store a
pointer to the allocation tag associated with the code that allocated
the page.

Signed-off-by: Suren Baghdasaryan 
Co-developed-by: Kent Overstreet 
Signed-off-by: Kent Overstreet 
---
 include/linux/pgalloc_tag.h | 28 
 lib/Kconfig.debug   | 11 +++
 lib/Makefile|  1 +
 lib/pgalloc_tag.c   | 22 ++
 mm/page_ext.c   |  6 ++
 5 files changed, 68 insertions(+)
 create mode 100644 include/linux/pgalloc_tag.h
 create mode 100644 lib/pgalloc_tag.c

diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
new file mode 100644
index ..f525abfe51d4
--- /dev/null
+++ b/include/linux/pgalloc_tag.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * page allocation tagging
+ */
+#ifndef _LINUX_PGALLOC_TAG_H
+#define _LINUX_PGALLOC_TAG_H
+
+#include 
+#include 
+
+extern struct page_ext_operations page_alloc_tagging_ops;
+struct page_ext *lookup_page_ext(const struct page *page);
+
+static inline union codetag_ref *get_page_tag_ref(struct page *page)
+{
+   struct page_ext *page_ext = lookup_page_ext(page);
+
+   return page_ext ? (void *)page_ext + page_alloc_tagging_ops.offset
+   : NULL;
+}
+
+static inline void pgalloc_tag_dec(struct page *page, unsigned int order)
+{
+   if (page)
+   alloc_tag_sub(get_page_tag_ref(page), PAGE_SIZE << order);
+}
+
+#endif /* _LINUX_PGALLOC_TAG_H */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 795bf6993f8a..6686648843b3 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -978,6 +978,17 @@ config ALLOC_TAGGING
select CODE_TAGGING
select LAZY_PERCPU_COUNTER
 
+config PAGE_ALLOC_TAGGING
+   bool "Enable page allocation tagging"
+   default n
+   select ALLOC_TAGGING
+   select PAGE_EXTENSION
+   help
+ Instrument page allocators to track allocation source code and
+ collect statistics on the number of allocations and their total size
+ initiated at that code location. The mechanism can be used to track
+ memory leaks with a low performance impact.
+
 source "lib/Kconfig.kasan"
 source "lib/Kconfig.kfence"
 
diff --git a/lib/Makefile b/lib/Makefile
index dc00533fc5c8..99f732156673 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -229,6 +229,7 @@ obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
 
 obj-$(CONFIG_CODE_TAGGING) += codetag.o
 obj-$(CONFIG_ALLOC_TAGGING) += alloc_tag.o
+obj-$(CONFIG_PAGE_ALLOC_TAGGING) += pgalloc_tag.o
 
 lib-$(CONFIG_GENERIC_BUG) += bug.o
 
diff --git a/lib/pgalloc_tag.c b/lib/pgalloc_tag.c
new file mode 100644
index ..7d97372ca0df
--- /dev/null
+++ b/lib/pgalloc_tag.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include 
+#include 
+#include 
+#include 
+
+static __init bool need_page_alloc_tagging(void)
+{
+   return true;
+}
+
+static __init void init_page_alloc_tagging(void)
+{
+}
+
+struct page_ext_operations page_alloc_tagging_ops = {
+   .size = sizeof(union codetag_ref),
+   .need = need_page_alloc_tagging,
+   .init = init_page_alloc_tagging,
+};
+EXPORT_SYMBOL(page_alloc_tagging_ops);
+
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 3dc715d7ac29..a22f514ff4da 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * struct page extension
@@ -76,6 +77,9 @@ static struct page_ext_operations *page_ext_ops[] __initdata = {
 #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
&page_idle_ops,
 #endif
+#ifdef CONFIG_PAGE_ALLOC_TAGGING
+   &page_alloc_tagging_ops,
+#endif
 #ifdef CONFIG_PAGE_TABLE_CHECK
&page_table_check_ops,
 #endif
@@ -152,6 +156,7 @@ struct page_ext *lookup_page_ext(const struct page *page)
MAX_ORDER_NR_PAGES);
return get_entry(base, index);
 }
+EXPORT_SYMBOL(lookup_page_ext);
 
 static int __init alloc_node_page_ext(int nid)
 {
@@ -221,6 +226,7 @@ struct page_ext *lookup_page_ext(const struct page *page)
return NULL;
return get_entry(section->page_ext, pfn);
 }
+EXPORT_SYMBOL(lookup_page_ext);
 
 static void *__meminit alloc_page_ext(size_t size, int nid)
 {
-- 
2.37.2.672.g94769d06f0-goog




[RFC PATCH 12/30] mm: introduce __GFP_NO_OBJ_EXT flag to selectively prevent slabobj_ext creation

2022-08-30 Thread Suren Baghdasaryan
Introduce __GFP_NO_OBJ_EXT flag in order to prevent recursive allocations
when allocating slabobj_ext on a slab.

Signed-off-by: Suren Baghdasaryan 
---
 include/linux/gfp_types.h | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index d88c46ca82e1..a2cba1d20b86 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -55,8 +55,13 @@ typedef unsigned int __bitwise gfp_t;
 #define ___GFP_SKIP_KASAN_UNPOISON 0
 #define ___GFP_SKIP_KASAN_POISON   0
 #endif
+#ifdef CONFIG_SLAB_OBJ_EXT
+#define ___GFP_NO_OBJ_EXT   0x800u
+#else
+#define ___GFP_NO_OBJ_EXT   0
+#endif
 #ifdef CONFIG_LOCKDEP
-#define ___GFP_NOLOCKDEP   0x800u
+#define ___GFP_NOLOCKDEP   0x1000u
 #else
 #define ___GFP_NOLOCKDEP   0
 #endif
@@ -101,12 +106,15 @@ typedef unsigned int __bitwise gfp_t;
  * node with no fallbacks or placement policy enforcements.
  *
  * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg.
+ *
+ * %__GFP_NO_OBJ_EXT causes slab allocation to have no object extension.
  */
 #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
 #define __GFP_WRITE((__force gfp_t)___GFP_WRITE)
 #define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL)
 #define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)
 #define __GFP_ACCOUNT  ((__force gfp_t)___GFP_ACCOUNT)
+#define __GFP_NO_OBJ_EXT   ((__force gfp_t)___GFP_NO_OBJ_EXT)
 
 /**
  * DOC: Watermark modifiers
@@ -256,7 +264,7 @@ typedef unsigned int __bitwise gfp_t;
 #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
 
 /* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT (27 + IS_ENABLED(CONFIG_LOCKDEP))
+#define __GFP_BITS_SHIFT (28 + IS_ENABLED(CONFIG_LOCKDEP))
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /**
-- 
2.37.2.672.g94769d06f0-goog




[RFC PATCH 20/30] lib: introduce support for storing code tag context

2022-08-30 Thread Suren Baghdasaryan
Add support for code tag context capture when registering a new code tag
type. When context capture for a specific code tag is enabled,
codetag_ref will point to a codetag_ctx object which can be attached
to an application-specific object storing code invocation context.
codetag_ctx has a pointer to its codetag_with_ctx object with embedded
codetag object in it. All context objects of the same code tag are placed
into the codetag_with_ctx.ctx_head linked list. codetag.flags is used to
indicate when a context capture for the associated code tag is
initialized and enabled.
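
An application-specific context object embeds codetag_ctx, e.g.
(illustrative; the pid field stands in for whatever the application
wants to capture):

  struct my_alloc_ctx {
          struct codetag_ctx ctx; /* links this hit into codetag_with_ctx.ctx_head */
          pid_t pid;              /* application-specific capture */
  };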

Signed-off-by: Suren Baghdasaryan 
---
 include/linux/codetag.h |  50 +-
 include/linux/codetag_ctx.h |  48 +
 lib/codetag.c   | 134 
 3 files changed, 231 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/codetag_ctx.h

diff --git a/include/linux/codetag.h b/include/linux/codetag.h
index 0c605417ebbe..57736ec77b45 100644
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -5,8 +5,12 @@
 #ifndef _LINUX_CODETAG_H
 #define _LINUX_CODETAG_H
 
+#include 
+#include 
 #include 
 
+struct kref;
+struct codetag_ctx;
 struct codetag_iterator;
 struct codetag_type;
 struct seq_buf;
@@ -18,15 +22,38 @@ struct module;
  * an array of these.
  */
 struct codetag {
-   unsigned int flags; /* used in later patches */
+   unsigned int flags; /* has to be the first member shared with codetag_ctx */
unsigned int lineno;
const char *modname;
const char *function;
const char *filename;
 } __aligned(8);
 
+/* codetag_with_ctx flags */
+#define CTC_FLAG_CTX_PTR   (1 << 0)
+#define CTC_FLAG_CTX_READY (1 << 1)
+#define CTC_FLAG_CTX_ENABLED   (1 << 2)
+
+/*
+ * Code tag with context capture support. Contains a list to store context for
+ * each tag hit, a lock protecting the list and a flag to indicate whether
+ * context capture is enabled for the tag.
+ */
+struct codetag_with_ctx {
+   struct codetag ct;
+   struct list_head ctx_head;
+   spinlock_t ctx_lock;
+} __aligned(8);
+
+/*
+ * Tag reference can point to codetag directly or indirectly via codetag_ctx.
+ * Direct codetag pointer is used when context capture is disabled or not
+ * supported. When context capture for the tag is used, the reference points
+ * to the codetag_ctx through which the codetag can be reached.
+ */
 union codetag_ref {
struct codetag *ct;
+   struct codetag_ctx *ctx;
 };
 
 struct codetag_range {
@@ -46,6 +73,7 @@ struct codetag_type_desc {
struct codetag_module *cmod);
void (*module_unload)(struct codetag_type *cttype,
  struct codetag_module *cmod);
+   void (*free_ctx)(struct kref *ref);
 };
 
 struct codetag_iterator {
@@ -53,6 +81,7 @@ struct codetag_iterator {
struct codetag_module *cmod;
unsigned long mod_id;
struct codetag *ct;
+   struct codetag_ctx *ctx;
 };
 
 #define CODE_TAG_INIT {\
@@ -63,9 +92,28 @@ struct codetag_iterator {
.flags  = 0,\
 }
 
+static inline bool is_codetag_ctx_ref(union codetag_ref *ref)
+{
+   return !!(ref->ct->flags & CTC_FLAG_CTX_PTR);
+}
+
+static inline
+struct codetag_with_ctx *ct_to_ctc(struct codetag *ct)
+{
+   return container_of(ct, struct codetag_with_ctx, ct);
+}
+
 void codetag_lock_module_list(struct codetag_type *cttype, bool lock);
 struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype);
 struct codetag *codetag_next_ct(struct codetag_iterator *iter);
+struct codetag_ctx *codetag_next_ctx(struct codetag_iterator *iter);
+
+bool codetag_enable_ctx(struct codetag_with_ctx *ctc, bool enable);
+static inline bool codetag_ctx_enabled(struct codetag_with_ctx *ctc)
+{
+   return !!(ctc->ct.flags & CTC_FLAG_CTX_ENABLED);
+}
+bool codetag_has_ctx(struct codetag_with_ctx *ctc);
 
 void codetag_to_text(struct seq_buf *out, struct codetag *ct);
 
diff --git a/include/linux/codetag_ctx.h b/include/linux/codetag_ctx.h
new file mode 100644
index ..e741484f0e08
--- /dev/null
+++ b/include/linux/codetag_ctx.h
@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * code tag context
+ */
+#ifndef _LINUX_CODETAG_CTX_H
+#define _LINUX_CODETAG_CTX_H
+
+#include 
+#include 
+
+/* Code tag hit context. */
+struct codetag_ctx {
+   unsigned int flags; /* has to be the first member shared with codetag */
+   struct codetag_with_ctx *ctc;
+   struct list_head node;
+   struct kref refcount;
+} __aligned(8);
+
+static inline struct codetag_ctx *kref_to_ctx(struct kref *refcount)
+{
+   return container_of(refcount, struct codetag_ctx, refcount);
+}
+
+static inline void add_ctx(struct codetag_ctx *ctx,
+  struct codetag_with_ctx *ctc)
+{
+   kref_init(&ctx->refcount);
+   

[RFC PATCH 09/30] change alloc_pages name in dma_map_ops to avoid name conflicts

2022-08-30 Thread Suren Baghdasaryan
After redefining alloc_pages, all uses of that name are being replaced.
Change the conflicting names to prevent the preprocessor from replacing them
when it's not intended.
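
The conflict: a later patch in the series turns alloc_pages into a
function-like macro, which the preprocessor would also try to expand at
struct-member call sites:

  #define alloc_pages(gfp, order) pgtag_alloc_pages(gfp, order)

  /* any member call named alloc_pages then breaks, e.g.: */
  ops->alloc_pages(dev, size, dma_handle, dir, gfp); /* expansion fails: wrong arity */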

Signed-off-by: Suren Baghdasaryan 
---
 arch/x86/kernel/amd_gart_64.c | 2 +-
 drivers/iommu/dma-iommu.c | 2 +-
 drivers/xen/grant-dma-ops.c   | 2 +-
 drivers/xen/swiotlb-xen.c | 2 +-
 include/linux/dma-map-ops.h   | 2 +-
 kernel/dma/mapping.c  | 4 ++--
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 194d54eed537..5e83a387bfef 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -676,7 +676,7 @@ static const struct dma_map_ops gart_dma_ops = {
.get_sgtable= dma_common_get_sgtable,
.dma_supported  = dma_direct_supported,
.get_required_mask  = dma_direct_get_required_mask,
-   .alloc_pages= dma_direct_alloc_pages,
+   .alloc_pages_op = dma_direct_alloc_pages,
.free_pages = dma_direct_free_pages,
 };
 
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 17dd683b2fce..58b4878ef930 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1547,7 +1547,7 @@ static const struct dma_map_ops iommu_dma_ops = {
.flags  = DMA_F_PCI_P2PDMA_SUPPORTED,
.alloc  = iommu_dma_alloc,
.free   = iommu_dma_free,
-   .alloc_pages= dma_common_alloc_pages,
+   .alloc_pages_op = dma_common_alloc_pages,
.free_pages = dma_common_free_pages,
.alloc_noncontiguous= iommu_dma_alloc_noncontiguous,
.free_noncontiguous = iommu_dma_free_noncontiguous,
diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c
index 8973fc1e9ccc..0e26d066036e 100644
--- a/drivers/xen/grant-dma-ops.c
+++ b/drivers/xen/grant-dma-ops.c
@@ -262,7 +262,7 @@ static int xen_grant_dma_supported(struct device *dev, u64 mask)
 static const struct dma_map_ops xen_grant_dma_ops = {
.alloc = xen_grant_dma_alloc,
.free = xen_grant_dma_free,
-   .alloc_pages = xen_grant_dma_alloc_pages,
+   .alloc_pages_op = xen_grant_dma_alloc_pages,
.free_pages = xen_grant_dma_free_pages,
.mmap = dma_common_mmap,
.get_sgtable = dma_common_get_sgtable,
diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 67aa74d20162..5ab2616153f0 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -403,6 +403,6 @@ const struct dma_map_ops xen_swiotlb_dma_ops = {
.dma_supported = xen_swiotlb_dma_supported,
.mmap = dma_common_mmap,
.get_sgtable = dma_common_get_sgtable,
-   .alloc_pages = dma_common_alloc_pages,
+   .alloc_pages_op = dma_common_alloc_pages,
.free_pages = dma_common_free_pages,
 };
diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index d678afeb8a13..e8e2d210ba68 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -27,7 +27,7 @@ struct dma_map_ops {
unsigned long attrs);
void (*free)(struct device *dev, size_t size, void *vaddr,
dma_addr_t dma_handle, unsigned long attrs);
-   struct page *(*alloc_pages)(struct device *dev, size_t size,
+   struct page *(*alloc_pages_op)(struct device *dev, size_t size,
dma_addr_t *dma_handle, enum dma_data_direction dir,
gfp_t gfp);
void (*free_pages)(struct device *dev, size_t size, struct page *vaddr,
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 49cbf3e33de7..80a2bfeed8d0 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -552,9 +552,9 @@ static struct page *__dma_alloc_pages(struct device *dev, size_t size,
size = PAGE_ALIGN(size);
if (dma_alloc_direct(dev, ops))
return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp);
-   if (!ops->alloc_pages)
+   if (!ops->alloc_pages_op)
return NULL;
-   return ops->alloc_pages(dev, size, dma_handle, dir, gfp);
+   return ops->alloc_pages_op(dev, size, dma_handle, dir, gfp);
 }
 
 struct page *dma_alloc_pages(struct device *dev, size_t size,
-- 
2.37.2.672.g94769d06f0-goog




[RFC PATCH 10/30] mm: enable page allocation tagging for __get_free_pages and alloc_pages

2022-08-30 Thread Suren Baghdasaryan
Redefine alloc_pages, __get_free_pages to record allocations done by
these functions. Instrument deallocation hooks to record object freeing.
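
Call sites need no changes; a sketch of the effect:

  struct page *page = alloc_pages(GFP_KERNEL, order); /* expands to pgtag_alloc_pages();
                                                       * accounted to this file:line */
  __free_pages(page, order);                          /* instrumented free path calls
                                                       * pgalloc_tag_dec() for this page */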

Signed-off-by: Suren Baghdasaryan 
---
 include/linux/gfp.h | 10 +++---
 include/linux/page_ext.h|  3 ++-
 include/linux/pgalloc_tag.h | 35 +++
 mm/mempolicy.c  |  4 ++--
 mm/page_alloc.c | 13 ++---
 5 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f314be58fa77..5cb950a49d40 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -6,6 +6,7 @@
 
 #include 
 #include 
+#include 
 
 struct vm_area_struct;
 
@@ -267,12 +268,12 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 }
 
 #ifdef CONFIG_NUMA
-struct page *alloc_pages(gfp_t gfp, unsigned int order);
+struct page *_alloc_pages(gfp_t gfp, unsigned int order);
 struct folio *folio_alloc(gfp_t gfp, unsigned order);
 struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, bool hugepage);
 #else
-static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
+static inline struct page *_alloc_pages(gfp_t gfp_mask, unsigned int order)
 {
return alloc_pages_node(numa_node_id(), gfp_mask, order);
 }
@@ -283,6 +284,7 @@ static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
 #define vma_alloc_folio(gfp, order, vma, addr, hugepage)   \
folio_alloc(gfp, order)
 #endif
+#define alloc_pages(gfp, order) pgtag_alloc_pages(gfp, order)
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
 static inline struct page *alloc_page_vma(gfp_t gfp,
struct vm_area_struct *vma, unsigned long addr)
@@ -292,7 +294,9 @@ static inline struct page *alloc_page_vma(gfp_t gfp,
return &folio->page;
 }
 
-extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
+extern unsigned long _get_free_pages(gfp_t gfp_mask, unsigned int order,
+struct page **ppage);
+#define __get_free_pages(gfp_mask, order) pgtag_get_free_pages(gfp_mask, order)
 extern unsigned long get_zeroed_page(gfp_t gfp_mask);
 
 void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __alloc_size(1);
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index fabb2e1e087f..b26077110fb3 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -4,7 +4,6 @@
 
 #include 
 #include 
-#include 
 
 struct pglist_data;
 struct page_ext_operations {
@@ -14,6 +13,8 @@ struct page_ext_operations {
void (*init)(void);
 };
 
+#include 
+
 #ifdef CONFIG_PAGE_EXTENSION
 
 enum page_ext_flags {
diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h
index f525abfe51d4..154ea7436fec 100644
--- a/include/linux/pgalloc_tag.h
+++ b/include/linux/pgalloc_tag.h
@@ -5,6 +5,8 @@
 #ifndef _LINUX_PGALLOC_TAG_H
 #define _LINUX_PGALLOC_TAG_H
 
+#ifdef CONFIG_PAGE_ALLOC_TAGGING
+
 #include 
 #include 
 
@@ -25,4 +27,37 @@ static inline void pgalloc_tag_dec(struct page *page, unsigned int order)
alloc_tag_sub(get_page_tag_ref(page), PAGE_SIZE << order);
 }
 
+/*
+ * Redefinitions of the common page allocators/destructors
+ */
+#define pgtag_alloc_pages(gfp, order)  \
+({ \
+   struct page *_page = _alloc_pages((gfp), (order));  \
+   \
+   if (_page)  \
+   alloc_tag_add(get_page_tag_ref(_page), PAGE_SIZE << (order));\
+   _page;  \
+})
+
+#define pgtag_get_free_pages(gfp_mask, order)  \
+({ \
+   struct page *_page; \
+   unsigned long _res = _get_free_pages((gfp_mask), (order), &_page);\
+   \
+   if (_res)   \
+   alloc_tag_add(get_page_tag_ref(_page), PAGE_SIZE << (order));\
+   _res;   \
+})
+
+#else /* CONFIG_PAGE_ALLOC_TAGGING */
+
+#define pgtag_alloc_pages(gfp, order) _alloc_pages(gfp, order)
+
+#define pgtag_get_free_pages(gfp_mask, order) \
+   _get_free_pages((gfp_mask), (order), NULL)
+
+#define pgalloc_tag_dec(__page, __size) do {} while (0)
+
+#endif /* CONFIG_PAGE_ALLOC_TAGGING */
+
 #endif /* _LINUX_PGALLOC_TAG_H */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b73d3248d976..f7e6d9564a49 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2249,7 +2249,7 @@ EXPORT_SYMBOL(vma_all

[RFC PATCH 14/30] mm: prevent slabobj_ext allocations for slabobj_ext and kmem_cache objects

2022-08-30 Thread Suren Baghdasaryan
Use __GFP_NO_OBJ_EXT to prevent recursions when allocating slabobj_ext
objects. Also prevent slabobj_ext allocations for kmem_cache objects.

Signed-off-by: Suren Baghdasaryan 
---
 mm/memcontrol.c | 2 ++
 mm/slab.h   | 6 ++
 2 files changed, 8 insertions(+)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3f407ef2f3f1..dabb451dc364 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2809,6 +2809,8 @@ int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s,
void *vec;
 
gfp &= ~OBJCGS_CLEAR_MASK;
+   /* Prevent recursive extension vector allocation */
+   gfp |= __GFP_NO_OBJ_EXT;
vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp,
   slab_nid(slab));
if (!vec)
diff --git a/mm/slab.h b/mm/slab.h
index c767ce3f0fe2..d93b22b8bbe2 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -475,6 +475,12 @@ static inline void prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags,
if (is_kmem_only_obj_ext())
return;
 
+   if (s->flags & SLAB_NO_OBJ_EXT)
+   return;
+
+   if (flags & __GFP_NO_OBJ_EXT)
+   return;
+
slab = virt_to_slab(p);
if (!slab_obj_exts(slab))
WARN(alloc_slab_obj_exts(slab, s, flags, false),
-- 
2.37.2.672.g94769d06f0-goog




[RFC PATCH 11/30] mm: introduce slabobj_ext to support slab object extensions

2022-08-30 Thread Suren Baghdasaryan
Currently slab pages can store only vectors of obj_cgroup pointers in
page->memcg_data. Introduce slabobj_ext structure to allow more data
to be stored for each slab object. It wraps obj_cgroup into slabobj_ext
to preserve current functionality while allowing slabobj_ext to be
extended in the future.

Note: ideally the config dependency should be turned the other way around:
MEMCG should depend on SLAB_OBJ_EXT and {page|slab|folio}.memcg_data would
be renamed to something like {page|slab|folio}.objext_data. However, doing
this in the RFC would introduce considerable churn unrelated to the overall
idea, so it is deferred until v1.

Signed-off-by: Suren Baghdasaryan 
---
 include/linux/memcontrol.h |  18 --
 init/Kconfig   |   5 ++
 mm/kfence/core.c   |   2 +-
 mm/memcontrol.c|  60 ++-
 mm/page_owner.c|   2 +-
 mm/slab.h  | 119 +
 6 files changed, 131 insertions(+), 75 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 6257867fbf95..315399f77173 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -227,6 +227,14 @@ struct obj_cgroup {
};
 };
 
+/*
+ * Extended information for slab objects stored as an array in page->memcg_data
+ * if MEMCG_DATA_OBJEXTS is set.
+ */
+struct slabobj_ext {
+   struct obj_cgroup *objcg;
+} __aligned(8);
+
 /*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
@@ -363,7 +371,7 @@ extern struct mem_cgroup *root_mem_cgroup;
 
 enum page_memcg_data_flags {
/* page->memcg_data is a pointer to an objcgs vector */
-   MEMCG_DATA_OBJCGS = (1UL << 0),
+   MEMCG_DATA_OBJEXTS = (1UL << 0),
/* page has been accounted as a non-slab kernel page */
MEMCG_DATA_KMEM = (1UL << 1),
/* the next bit after the last actual flag */
@@ -401,7 +409,7 @@ static inline struct mem_cgroup *__folio_memcg(struct folio *folio)
unsigned long memcg_data = folio->memcg_data;
 
VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
-   VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio);
+   VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio);
 
return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
@@ -422,7 +430,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio)
unsigned long memcg_data = folio->memcg_data;
 
VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
-   VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio);
+   VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio);
 
return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
@@ -517,7 +525,7 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
 */
unsigned long memcg_data = READ_ONCE(page->memcg_data);
 
-   if (memcg_data & MEMCG_DATA_OBJCGS)
+   if (memcg_data & MEMCG_DATA_OBJEXTS)
return NULL;
 
if (memcg_data & MEMCG_DATA_KMEM) {
@@ -556,7 +564,7 @@ static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
 static inline bool folio_memcg_kmem(struct folio *folio)
 {
VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page);
-   VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJCGS, folio);
+   VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJEXTS, folio);
return folio->memcg_data & MEMCG_DATA_KMEM;
 }
 
diff --git a/init/Kconfig b/init/Kconfig
index 532362fcfe31..82396d7a2717 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -958,6 +958,10 @@ config MEMCG
help
  Provides control over the memory footprint of tasks in a cgroup.
 
+config SLAB_OBJ_EXT
+   bool
+   depends on MEMCG
+
 config MEMCG_SWAP
bool
depends on MEMCG && SWAP
@@ -966,6 +970,7 @@ config MEMCG_SWAP
 config MEMCG_KMEM
bool
depends on MEMCG && !SLOB
+   select SLAB_OBJ_EXT
default y
 
 config BLK_CGROUP
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index c252081b11df..c0958e4a32e2 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -569,7 +569,7 @@ static unsigned long kfence_init_pool(void)
__folio_set_slab(slab_folio(slab));
 #ifdef CONFIG_MEMCG
 slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
-  MEMCG_DATA_OBJCGS;
+  MEMCG_DATA_OBJEXTS;
 #endif
}
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b69979c9ced5..3f407ef2f3f1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2793,7 +2793,7 @

[RFC PATCH 15/30] lib: introduce slab allocation tagging

2022-08-30 Thread Suren Baghdasaryan
Introduce CONFIG_SLAB_ALLOC_TAGGING which provides helper functions
to easily instrument slab allocators and adds a codetag_ref field into
slabobj_ext to store a pointer to the allocation tag associated with
the code that allocated the slab object.
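
The helpers are used roughly like this (sketch; the actual wiring into
kmalloc and friends lands in the next patch):

  slab_tag_add(old_ptr, new_ptr); /* account new_ptr's size to this callsite,
                                   * unless it is the same object as old_ptr */
  slab_tag_dec(ptr);              /* on free: unaccount via the object's slabobj_ext */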

Signed-off-by: Suren Baghdasaryan 
Co-developed-by: Kent Overstreet 
Signed-off-by: Kent Overstreet 
---
 include/linux/memcontrol.h |  5 +
 include/linux/slab.h   | 25 +
 include/linux/slab_def.h   |  2 +-
 include/linux/slub_def.h   |  4 ++--
 lib/Kconfig.debug  | 11 +++
 mm/slab_common.c   | 33 +
 6 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 315399f77173..97c0153f0247 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -232,7 +232,12 @@ struct obj_cgroup {
  * if MEMCG_DATA_OBJEXTS is set.
  */
 struct slabobj_ext {
+#ifdef CONFIG_MEMCG_KMEM
struct obj_cgroup *objcg;
+#endif
+#ifdef CONFIG_SLAB_ALLOC_TAGGING
+   union codetag_ref ref;
+#endif
 } __aligned(8);
 
 /*
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 55ae3ea864a4..5a198aa02a08 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -438,6 +438,31 @@ static __always_inline unsigned int __kmalloc_index(size_t size,
 #define kmalloc_index(s) __kmalloc_index(s, true)
 #endif /* !CONFIG_SLOB */
 
+#ifdef CONFIG_SLAB_ALLOC_TAGGING
+
+#include 
+
+union codetag_ref *get_slab_tag_ref(const void *objp);
+
+#define slab_tag_add(_old, _new)   \
+do {   \
+   if (!ZERO_OR_NULL_PTR(_new) && _old != _new)\
+   alloc_tag_add(get_slab_tag_ref(_new), __ksize(_new));   \
+} while (0)
+
+static inline void slab_tag_dec(const void *ptr)
+{
+   if (!ZERO_OR_NULL_PTR(ptr))
+   alloc_tag_sub(get_slab_tag_ref(ptr), __ksize(ptr));
+}
+
+#else
+
+#define slab_tag_add(_old, _new) do {} while (0)
+static inline void slab_tag_dec(const void *ptr) {}
+
+#endif
+
 void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1);
 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t flags) __assume_slab_alignment __malloc;
 void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index e24c9aff6fed..25feb5f7dc32 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -106,7 +106,7 @@ static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab
  *   reciprocal_divide(offset, cache->reciprocal_buffer_size)
  */
 static inline unsigned int obj_to_index(const struct kmem_cache *cache,
-   const struct slab *slab, void *obj)
+   const struct slab *slab, const void *obj)
 {
u32 offset = (obj - slab->s_mem);
return reciprocal_divide(offset, cache->reciprocal_buffer_size);
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index f9c68a9dac04..940c146768d4 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -170,14 +170,14 @@ static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *slab
 
 /* Determine object index from a given position */
 static inline unsigned int __obj_to_index(const struct kmem_cache *cache,
- void *addr, void *obj)
+ void *addr, const void *obj)
 {
return reciprocal_divide(kasan_reset_tag(obj) - addr,
 cache->reciprocal_size);
 }
 
 static inline unsigned int obj_to_index(const struct kmem_cache *cache,
-   const struct slab *slab, void *obj)
+   const struct slab *slab, const void *obj)
 {
if (is_kfence_address(obj))
return 0;
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 6686648843b3..08c97a978906 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -989,6 +989,17 @@ config PAGE_ALLOC_TAGGING
  initiated at that code location. The mechanism can be used to track
  memory leaks with a low performance impact.
 
+config SLAB_ALLOC_TAGGING
+   bool "Enable slab allocation tagging"
+   default n
+   select ALLOC_TAGGING
+   select SLAB_OBJ_EXT
+   help
+ Instrument slab allocators to track allocation source code and
+ collect statistics on the number of allocations and their total size
+ initiated at that code location. The mechanism can be used to track
+ memory leaks with a low performance impact.
+
 source "lib/Kconfig.kasan"
 source "lib/Kconfig.kfence"
 
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 179966
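
For orientation (the mm/slab_common.c hunk is cut off above): a rough sketch of
the shape get_slab_tag_ref() could take, built on the obj_to_index() helpers
this patch const-qualifies. virt_to_slab() is a real mm-internal helper, but
slab_objexts() and the objext-vector layout are assumptions here for
illustration, not the patch's actual code:

  union codetag_ref *get_slab_tag_ref(const void *objp)
  {
          struct slab *slab = virt_to_slab(objp);
          /* slab_objexts(): hypothetical accessor for the slabobj_ext
           * vector attached to this slab */
          struct slabobj_ext *vec = slab_objexts(slab);
          unsigned int idx = obj_to_index(slab->slab_cache, slab, objp);

          return &vec[idx].ref;
  }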

[RFC PATCH 17/30] lib/string.c: strsep_no_empty()

2022-08-30 Thread Suren Baghdasaryan
From: Kent Overstreet 

This adds a new helper which is like strsep, except that it skips empty
tokens.

Signed-off-by: Kent Overstreet 
---
 include/linux/string.h |  1 +
 lib/string.c   | 19 +++
 2 files changed, 20 insertions(+)

diff --git a/include/linux/string.h b/include/linux/string.h
index 61ec7e4f6311..b950ac9cfa56 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -96,6 +96,7 @@ extern char * strpbrk(const char *,const char *);
 #ifndef __HAVE_ARCH_STRSEP
 extern char * strsep(char **,const char *);
 #endif
+extern char *strsep_no_empty(char **, const char *);
 #ifndef __HAVE_ARCH_STRSPN
 extern __kernel_size_t strspn(const char *,const char *);
 #endif
diff --git a/lib/string.c b/lib/string.c
index 6f334420f687..6939f5b751f2 100644
--- a/lib/string.c
+++ b/lib/string.c
@@ -596,6 +596,25 @@ char *strsep(char **s, const char *ct)
 EXPORT_SYMBOL(strsep);
 #endif
 
+/**
+ * strsep_no_empty - Split a string into tokens, but don't return empty tokens
+ * @s: The string to be searched
+ * @ct: The characters to search for
+ *
+ * strsep_no_empty() updates @s to point after the token, ready for the next call.
+ */
+char *strsep_no_empty(char **s, const char *ct)
+{
+   char *ret;
+
+   do {
+   ret = strsep(s, ct);
+   } while (ret && !*ret);
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(strsep_no_empty);
+
 #ifndef __HAVE_ARCH_MEMSET
 /**
  * memset - Fill a region of memory with the given value
-- 
2.37.2.672.g94769d06f0-goog
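
A quick usage sketch (not part of the patch): unlike plain strsep(), runs of
consecutive delimiters do not produce empty-string tokens.

  char buf[] = "  one  two\tthree  ";
  char *p = buf, *tok;

  /* prints "one", "two", "three"; plain strsep() would also return ""
   * once per extra delimiter in each run */
  while ((tok = strsep_no_empty(&p, " \t")))
          pr_info("token: %s\n", tok);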




[RFC PATCH 16/30] mm: enable slab allocation tagging for kmalloc and friends

2022-08-30 Thread Suren Baghdasaryan
Redefine kmalloc, krealloc, kzalloc, kcalloc, etc. to record allocations
and deallocations done by these functions.

Signed-off-by: Suren Baghdasaryan 
Co-developed-by: Kent Overstreet 
Signed-off-by: Kent Overstreet 
---
 include/linux/slab.h | 103 +--
 mm/slab.c|   2 +
 mm/slab_common.c |  16 +++
 mm/slob.c|   2 +
 mm/slub.c|   2 +
 5 files changed, 75 insertions(+), 50 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 5a198aa02a08..89273be35743 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -191,7 +191,10 @@ int kmem_cache_shrink(struct kmem_cache *s);
 /*
  * Common kmalloc functions provided by all allocators
  */
-void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) 
__alloc_size(2);
+void * __must_check _krealloc(const void *objp, size_t new_size, gfp_t flags) 
__alloc_size(2);
+#define krealloc(_p, _size, _flags)\
+   krealloc_hooks(_p, _krealloc(_p, _size, _flags))
+
 void kfree(const void *objp);
 void kfree_sensitive(const void *objp);
 size_t __ksize(const void *objp);
@@ -463,6 +466,15 @@ static inline void slab_tag_dec(const void *ptr) {}
 
 #endif
 
+#define krealloc_hooks(_p, _do_alloc)  \
+({ \
+   void *_res = _do_alloc; \
+   slab_tag_add(_p, _res); \
+   _res;   \
+})
+
+#define kmalloc_hooks(_do_alloc)   krealloc_hooks(NULL, _do_alloc)
+
 void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment 
__alloc_size(1);
 void *kmem_cache_alloc(struct kmem_cache *s, gfp_t flags) 
__assume_slab_alignment __malloc;
 void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru,
@@ -541,25 +553,31 @@ static __always_inline void 
*kmem_cache_alloc_node_trace(struct kmem_cache *s, g
 }
 #endif /* CONFIG_TRACING */
 
-extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) 
__assume_page_alignment
+extern void *_kmalloc_order(size_t size, gfp_t flags, unsigned int order) 
__assume_page_alignment
 
__alloc_size(1);
+#define kmalloc_order(_size, _flags, _order)  \
+   kmalloc_hooks(_kmalloc_order(_size, _flags, _order))
 
 #ifdef CONFIG_TRACING
-extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+extern void *_kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
__assume_page_alignment __alloc_size(1);
 #else
-static __always_inline __alloc_size(1) void *kmalloc_order_trace(size_t size, 
gfp_t flags,
+static __always_inline __alloc_size(1) void *_kmalloc_order_trace(size_t size, 
gfp_t flags,
 unsigned int 
order)
 {
-   return kmalloc_order(size, flags, order);
+   return _kmalloc_order(size, flags, order);
 }
 #endif
+#define kmalloc_order_trace(_size, _flags, _order)  \
+   kmalloc_hooks(_kmalloc_order_trace(_size, _flags, _order))
 
-static __always_inline __alloc_size(1) void *kmalloc_large(size_t size, gfp_t 
flags)
+static __always_inline __alloc_size(1) void *_kmalloc_large(size_t size, gfp_t 
flags)
 {
unsigned int order = get_order(size);
-   return kmalloc_order_trace(size, flags, order);
+   return _kmalloc_order_trace(size, flags, order);
 }
+#define kmalloc_large(_size, _flags)\
+   kmalloc_hooks(_kmalloc_large(_size, _flags))
 
 /**
  * kmalloc - allocate memory
@@ -615,14 +633,14 @@ static __always_inline __alloc_size(1) void 
*kmalloc_large(size_t size, gfp_t fl
  * Try really hard to succeed the allocation but fail
  * eventually.
  */
-static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags)
+static __always_inline __alloc_size(1) void *_kmalloc(size_t size, gfp_t flags)
 {
if (__builtin_constant_p(size)) {
 #ifndef CONFIG_SLOB
unsigned int index;
 #endif
if (size > KMALLOC_MAX_CACHE_SIZE)
-   return kmalloc_large(size, flags);
+   return _kmalloc_large(size, flags);
 #ifndef CONFIG_SLOB
index = kmalloc_index(size);
 
@@ -636,8 +654,9 @@ static __always_inline __alloc_size(1) void *kmalloc(size_t 
size, gfp_t flags)
}
return __kmalloc(size, flags);
 }
+#define kmalloc(_size, _flags) kmalloc_hooks(_kmalloc(_size, 
_flags))
 
-static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t 
flags, int node)
+static __always_inline __alloc_size(1) void *_kmalloc_node(size_t size, gfp_t 
flags, int node)
 {
 #ifndef CONFIG_SLOB
if (__builtin_constant_p(s
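
The pattern throughout this patch: each public allocator is renamed with a
leading underscore and re-exposed as a macro that wraps the call in
kmalloc_hooks()/krealloc_hooks(), so the codetag reference is recorded at the
caller's location rather than inside the allocator. At this point in the
series (before patch 22 adds fault injection to the hook), a call site
expands roughly like this (illustrative expansion, not generated code):

  ptr = kmalloc(64, GFP_KERNEL);

  /* ... expands roughly to: */
  ptr = ({
          void *_res = _kmalloc(64, GFP_KERNEL);
          /* NULL old pointer: a fresh allocation, nothing to untag */
          slab_tag_add(NULL, _res);
          _res;
  });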

[RFC PATCH 21/30] lib: implement context capture support for page and slab allocators

2022-08-30 Thread Suren Baghdasaryan
Implement mechanisms for capturing the allocation call context, which
consists of:
- allocation size
- pid, tgid and name of the allocating task
- allocation timestamp
- allocation call stack
The patch creates an alloc_tags.ctx file which can be written to
enable/disable context capture for a specific code tag. Captured contexts
can be obtained by reading the alloc_tags.ctx file.
Usage example:

echo "file include/asm-generic/pgalloc.h line 63 enable" > \
/sys/kernel/debug/alloc_tags.ctx
cat alloc_tags.ctx
 91.0MiB  212 include/asm-generic/pgalloc.h:63 module:pgtable 
func:__pte_alloc_one
size: 4096
pid: 1551
tgid: 1551
comm: cat
ts: 670109646361
call stack:
 pte_alloc_one+0xfe/0x130
 __pte_alloc+0x22/0x90
 move_page_tables.part.0+0x994/0xa60
 shift_arg_pages+0xa4/0x180
 setup_arg_pages+0x286/0x2d0
 load_elf_binary+0x4e1/0x18d0
 bprm_execve+0x26b/0x660
 do_execveat_common.isra.0+0x19d/0x220
 __x64_sys_execve+0x2e/0x40
 do_syscall_64+0x38/0x90
 entry_SYSCALL_64_after_hwframe+0x63/0xcd

size: 4096
pid: 1551
tgid: 1551
comm: cat
ts: 670109711801
call stack:
 pte_alloc_one+0xfe/0x130
 __do_fault+0x52/0xc0
 __handle_mm_fault+0x7d9/0xdd0
 handle_mm_fault+0xc0/0x2b0
 do_user_addr_fault+0x1c3/0x660
 exc_page_fault+0x62/0x150
 asm_exc_page_fault+0x22/0x30
...

echo "file include/asm-generic/pgalloc.h line 63 disable" > \
/sys/kernel/debug/alloc_tags.ctx

Note that disabling context capture will not clear already-captured
contexts, but no new contexts will be captured.

Signed-off-by: Suren Baghdasaryan 
---
 include/linux/alloc_tag.h |  28 -
 include/linux/codetag.h   |   3 +-
 lib/Kconfig.debug |   1 +
 lib/alloc_tag.c   | 239 +-
 lib/codetag.c |  20 ++--
 5 files changed, 273 insertions(+), 18 deletions(-)

diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index b3f589afb1c9..66638cbf349a 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -16,27 +16,41 @@
  * an array of these. Embedded codetag utilizes codetag framework.
  */
 struct alloc_tag {
-   struct codetag  ct;
+   struct codetag_with_ctx ctc;
unsigned long   last_wrap;
struct raw_lazy_percpu_counter  call_count;
struct raw_lazy_percpu_counter  bytes_allocated;
 } __aligned(8);
 
+static inline struct alloc_tag *ctc_to_alloc_tag(struct codetag_with_ctx *ctc)
+{
+   return container_of(ctc, struct alloc_tag, ctc);
+}
+
 static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct)
 {
-   return container_of(ct, struct alloc_tag, ct);
+   return container_of(ct_to_ctc(ct), struct alloc_tag, ctc);
 }
 
+struct codetag_ctx *alloc_tag_create_ctx(struct alloc_tag *tag, size_t size);
+void alloc_tag_free_ctx(struct codetag_ctx *ctx, struct alloc_tag **ptag);
+bool alloc_tag_enable_ctx(struct alloc_tag *tag, bool enable);
+
 #define DEFINE_ALLOC_TAG(_alloc_tag)   \
static struct alloc_tag _alloc_tag __used __aligned(8)  \
-   __section("alloc_tags") = { .ct = CODE_TAG_INIT }
+   __section("alloc_tags") = { .ctc.ct = CODE_TAG_INIT }
 
 #define alloc_tag_counter_read(counter)
\
__lazy_percpu_counter_read(counter)
 
 static inline void __alloc_tag_sub(union codetag_ref *ref, size_t bytes)
 {
-   struct alloc_tag *tag = ct_to_alloc_tag(ref->ct);
+   struct alloc_tag *tag;
+
+   if (is_codetag_ctx_ref(ref))
+   alloc_tag_free_ctx(ref->ctx, &tag);
+   else
+   tag = ct_to_alloc_tag(ref->ct);
 
__lazy_percpu_counter_add(&tag->call_count, &tag->last_wrap, -1);
__lazy_percpu_counter_add(&tag->bytes_allocated, &tag->last_wrap, 
-bytes);
@@ -51,7 +65,11 @@ do { 
\
 
 static inline void __alloc_tag_add(struct alloc_tag *tag, union codetag_ref 
*ref, size_t bytes)
 {
-   ref->ct = &tag->ct;
+   if (codetag_ctx_enabled(&tag->ctc))
+   ref->ctx = alloc_tag_create_ctx(tag, bytes);
+   else
+   ref->ct = &tag->ctc.ct;
+
__lazy_percpu_counter_add(&tag->call_count, &tag->last_wrap, 1);
__lazy_percpu_counter_add(&tag->bytes_allocated, &tag->last_wrap, 
bytes);
 }
diff --git a/include/linux/codetag.h b/include/linux/codetag.h
index 57736ec77b45..a10c5fcbdd20 100644
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -104,7 +104,8 @@ struct codetag_with_ctx *ct_to_ctc(struct codetag *ct)
 }
 
 void codetag_lock_module_list(struct codetag_type *cttype, bool lock);
-struct codeta
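
The codetag.h hunk is cut off above. For orientation: union codetag_ref can
now hold either a plain struct codetag pointer or a per-allocation struct
codetag_ctx pointer, and __alloc_tag_sub() discriminates between them with
is_codetag_ctx_ref(). A minimal sketch of that discriminated union, assuming
a low-bit tag encoding - the series' real encoding may differ:

  union codetag_ref {
          struct codetag          *ct;
          struct codetag_ctx      *ctx;
  };

  static inline bool is_codetag_ctx_ref(const union codetag_ref *ref)
  {
          /* assumed encoding: a tag bit distinguishes ctx pointers */
          return (unsigned long)ref->ct & 1UL;
  }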

[RFC PATCH 18/30] codetag: add codetag query helper functions

2022-08-30 Thread Suren Baghdasaryan
From: Kent Overstreet 

Provide codetag_query_parse() to parse codetag queries and
codetag_matches_query() to check if the query affects a given codetag.

Signed-off-by: Kent Overstreet 
---
 include/linux/codetag.h |  27 
 lib/codetag.c   | 135 
 2 files changed, 162 insertions(+)

diff --git a/include/linux/codetag.h b/include/linux/codetag.h
index 386733e89b31..0c605417ebbe 100644
--- a/include/linux/codetag.h
+++ b/include/linux/codetag.h
@@ -80,4 +80,31 @@ static inline void codetag_load_module(struct module *mod) {}
 static inline void codetag_unload_module(struct module *mod) {}
 #endif
 
+/* Codetag query parsing */
+
+struct codetag_query {
+   const char  *filename;
+   const char  *module;
+   const char  *function;
+   const char  *class;
+   unsigned intfirst_line, last_line;
+   unsigned intfirst_index, last_index;
+   unsigned intcur_index;
+
+   boolmatch_line:1;
+   boolmatch_index:1;
+
+   unsigned intset_enabled:1;
+   unsigned intenabled:2;
+
+   unsigned intset_frequency:1;
+   unsigned intfrequency;
+};
+
+char *codetag_query_parse(struct codetag_query *q, char *buf);
+bool codetag_matches_query(struct codetag_query *q,
+  const struct codetag *ct,
+  const struct codetag_module *mod,
+  const char *class);
+
 #endif /* _LINUX_CODETAG_H */
diff --git a/lib/codetag.c b/lib/codetag.c
index f0a3174f9b71..288ccfd5cbd0 100644
--- a/lib/codetag.c
+++ b/lib/codetag.c
@@ -246,3 +246,138 @@ void codetag_unload_module(struct module *mod)
}
mutex_unlock(&codetag_lock);
 }
+
+/* Codetag query parsing */
+
+#define CODETAG_QUERY_TOKENS() \
+   x(func) \
+   x(file) \
+   x(line) \
+   x(module)   \
+   x(class)\
+   x(index)
+
+enum tokens {
+#define x(name)TOK_##name,
+   CODETAG_QUERY_TOKENS()
+#undef x
+};
+
+static const char * const token_strs[] = {
+#define x(name)#name,
+   CODETAG_QUERY_TOKENS()
+#undef x
+   NULL
+};
+
+static int parse_range(char *str, unsigned int *first, unsigned int *last)
+{
+   char *first_str = str;
+   char *last_str = strchr(first_str, '-');
+
+   if (last_str)
+   *last_str++ = '\0';
+
+   if (kstrtouint(first_str, 10, first))
+   return -EINVAL;
+
+   if (!last_str)
+   *last = *first;
+   else if (kstrtouint(last_str, 10, last))
+   return -EINVAL;
+
+   return 0;
+}
+
+char *codetag_query_parse(struct codetag_query *q, char *buf)
+{
+   while (1) {
+   char *p = buf;
+   char *str1 = strsep_no_empty(&p, " \t\r\n");
+   char *str2 = strsep_no_empty(&p, " \t\r\n");
+   int ret, token;
+
+   if (!str1 || !str2)
+   break;
+
+   token = match_string(token_strs, ARRAY_SIZE(token_strs), str1);
+   if (token < 0)
+   break;
+
+   switch (token) {
+   case TOK_func:
+   q->function = str2;
+   break;
+   case TOK_file:
+   q->filename = str2;
+   break;
+   case TOK_line:
+   ret = parse_range(str2, &q->first_line, &q->last_line);
+   if (ret)
+   return ERR_PTR(ret);
+   q->match_line = true;
+   break;
+   case TOK_module:
+   q->module = str2;
+   break;
+   case TOK_class:
+   q->class = str2;
+   break;
+   case TOK_index:
+   ret = parse_range(str2, &q->first_index, 
&q->last_index);
+   if (ret)
+   return ERR_PTR(ret);
+   q->match_index = true;
+   break;
+   }
+
+   buf = p;
+   }
+
+   return buf;
+}
+
+bool codetag_matches_query(struct codetag_query *q,
+  const struct codetag *ct,
+  const struct codetag_module *mod,
+  const char *class)
+{
+   size_t classlen = q->class ? strlen(q->class) : 0;
+
+   if (q->module &&
+   (!mod->mod ||
+strcmp(q->module, ct->modname)))
+   return false;
+
+   if (q->filename &&
+   strcmp(q->filename, ct->filename) &&
+   strcmp(q->filename, kbasename(ct->filename)))
+   return false;
+
+   if (q->function &&
+   strcmp(q->function, ct->function))
+   return false;
+
+ 
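
A usage sketch (not from the patch): queries are whitespace-separated
key/value pairs using the tokens from CODETAG_QUERY_TOKENS(); the buffer must
be writable, since parsing tokenizes it in place with strsep_no_empty().

  struct codetag_query q = {};
  char buf[] = "file slab.h line 100-200 module xfs index 0-10";
  char *rest;

  rest = codetag_query_parse(&q, buf);
  if (IS_ERR(rest))
          return PTR_ERR(rest);
  /* q.filename == "slab.h", q.first_line == 100, q.last_line == 200,
   * q.match_line and q.match_index are set; "rest" points at any
   * trailing text (e.g. an enable/disable verb for the caller) */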

[RFC PATCH 22/30] Code tagging based fault injection

2022-08-30 Thread Suren Baghdasaryan
From: Kent Overstreet 

This adds a new fault injection capability, based on code tagging.

To use, simply insert somewhere in your code

  dynamic_fault("fault_class_name")

and check whether it returns true - if so, inject the error.
For example

  if (dynamic_fault("init"))
  return -EINVAL;

There's no need to define faults elsewhere, as with
include/linux/fault-injection.h. Faults show up in debugfs, under
/sys/kernel/debug/dynamic_faults, and can be selected based on
file/module/function/line number/class, and enabled permanently, in
oneshot mode, or with a specified frequency.

Signed-off-by: Kent Overstreet 
---
 include/asm-generic/codetag.lds.h |   3 +-
 include/linux/dynamic_fault.h |  79 +++
 include/linux/slab.h  |   3 +-
 lib/Kconfig.debug |   6 +
 lib/Makefile  |   2 +
 lib/dynamic_fault.c   | 372 ++
 6 files changed, 463 insertions(+), 2 deletions(-)
 create mode 100644 include/linux/dynamic_fault.h
 create mode 100644 lib/dynamic_fault.c

diff --git a/include/asm-generic/codetag.lds.h 
b/include/asm-generic/codetag.lds.h
index 64f536b80380..16fbf74edc3d 100644
--- a/include/asm-generic/codetag.lds.h
+++ b/include/asm-generic/codetag.lds.h
@@ -9,6 +9,7 @@
__stop_##_name = .;
 
 #define CODETAG_SECTIONS() \
-   SECTION_WITH_BOUNDARIES(alloc_tags)
+   SECTION_WITH_BOUNDARIES(alloc_tags) \
+   SECTION_WITH_BOUNDARIES(dynamic_fault_tags)
 
 #endif /* __ASM_GENERIC_CODETAG_LDS_H */
diff --git a/include/linux/dynamic_fault.h b/include/linux/dynamic_fault.h
new file mode 100644
index ..526a33209e94
--- /dev/null
+++ b/include/linux/dynamic_fault.h
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_DYNAMIC_FAULT_H
+#define _LINUX_DYNAMIC_FAULT_H
+
+/*
+ * Dynamic/code tagging fault injection:
+ *
+ * Originally based on the dynamic debug trick of putting types in a special 
elf
+ * section, then rewritten using code tagging:
+ *
+ * To use, simply insert a call to dynamic_fault("fault_class"), which will
+ * return true if an error should be injected.
+ *
+ * Fault injection sites may be listed and enabled via debugfs, under
+ * /sys/kernel/debug/dynamic_faults.
+ */
+
+#ifdef CONFIG_CODETAG_FAULT_INJECTION
+
+#include 
+#include 
+
+#define DFAULT_STATES()\
+   x(disabled) \
+   x(enabled)  \
+   x(oneshot)
+
+enum dfault_enabled {
+#define x(n)   DFAULT_##n,
+   DFAULT_STATES()
+#undef x
+};
+
+union dfault_state {
+   struct {
+   unsigned intenabled:2;
+   unsigned intcount:30;
+   };
+
+   struct {
+   unsigned intv;
+   };
+};
+
+struct dfault {
+   struct codetag  tag;
+   const char  *class;
+   unsigned intfrequency;
+   union dfault_state  state;
+   struct static_key_false enabled;
+};
+
+bool __dynamic_fault_enabled(struct dfault *df);
+
+#define dynamic_fault(_class)  \
+({ \
+   static struct dfault\
+   __used  \
+   __section("dynamic_fault_tags") \
+   __aligned(8) df = { \
+   .tag= CODE_TAG_INIT,\
+   .class  = _class,   \
+   .enabled = STATIC_KEY_FALSE_INIT,   \
+   };  \
+   \
+   static_key_false(&df.enabled.key) &&\
+   __dynamic_fault_enabled(&df);   \
+})
+
+#else
+
+#define dynamic_fault(_class)  false
+
+#endif /* CODETAG_FAULT_INJECTION */
+
+#define memory_fault() dynamic_fault("memory")
+
+#endif /* _LINUX_DYNAMIC_FAULT_H */
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 89273be35743..4be5a93ed15a 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include 
 
 
 /*
@@ -468,7 +469,7 @@ static inline void slab_tag_dec(const void *ptr) {}
 
 #define krealloc_hooks(_p, _do_alloc)  \
 ({ \
-   void *_res = _do_alloc; \
+   void *_res = !memory_fault() ? _do_alloc : NULL;\
slab_tag_add(_p, _res); \
_res;   \
 })
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 2790848464f1..b7d03afbc808 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1982,6 +1982,12 @@ config FAULT_INJECTION_STACKTRACE_FILTER
help
   
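
The memory_fault() wrapper above shows the intended idiom: a subsystem
defines a thin macro around dynamic_fault() so the class name stays
consistent across all of its injection sites. A hypothetical driver example
(the mydrv_* names are made up for illustration):

  #define mydrv_io_fault()        dynamic_fault("mydrv:io")

  static int mydrv_read_block(struct mydrv *d, sector_t s)
  {
          if (mydrv_io_fault())
                  return -EIO;    /* injected failure */
          return mydrv_do_read(d, s);
  }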

[RFC PATCH 19/30] move stack capture functionality into a separate function for reuse

2022-08-30 Thread Suren Baghdasaryan
Make the save_stack() function part of the stackdepot API so it can be
used outside of page_owner. Also rename task_struct's in_page_owner flag
to in_capture_stack to better convey its wider use.

Signed-off-by: Suren Baghdasaryan 
---
 include/linux/sched.h  |  6 ++--
 include/linux/stackdepot.h |  3 ++
 lib/stackdepot.c   | 68 ++
 mm/page_owner.c| 52 ++---
 4 files changed, 77 insertions(+), 52 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e7b2f8a5c711..d06cad6c14bd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -930,9 +930,9 @@ struct task_struct {
/* Stalled due to lack of memory */
unsignedin_memstall:1;
 #endif
-#ifdef CONFIG_PAGE_OWNER
-   /* Used by page_owner=on to detect recursion in page tracking. */
-   unsignedin_page_owner:1;
+#ifdef CONFIG_STACKDEPOT
+   /* Used by stack_depot_capture_stack to detect recursion. */
+   unsignedin_capture_stack:1;
 #endif
 #ifdef CONFIG_EVENTFD
/* Recursion prevention for eventfd_signal() */
diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index bc2797955de9..8dc9fdb2c4dd 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -64,4 +64,7 @@ int stack_depot_snprint(depot_stack_handle_t handle, char 
*buf, size_t size,
 
 void stack_depot_print(depot_stack_handle_t stack);
 
+bool stack_depot_capture_init(void);
+depot_stack_handle_t stack_depot_capture_stack(gfp_t flags);
+
 #endif
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index e73fda23388d..c8615bd6dc25 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -514,3 +514,71 @@ depot_stack_handle_t stack_depot_save(unsigned long 
*entries,
return __stack_depot_save(entries, nr_entries, alloc_flags, true);
 }
 EXPORT_SYMBOL_GPL(stack_depot_save);
+
+static depot_stack_handle_t recursion_handle;
+static depot_stack_handle_t failure_handle;
+
+static __always_inline depot_stack_handle_t create_custom_stack(void)
+{
+   unsigned long entries[4];
+   unsigned int nr_entries;
+
+   nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
+   return stack_depot_save(entries, nr_entries, GFP_KERNEL);
+}
+
+static noinline void register_recursion_stack(void)
+{
+   recursion_handle = create_custom_stack();
+}
+
+static noinline void register_failure_stack(void)
+{
+   failure_handle = create_custom_stack();
+}
+
+bool stack_depot_capture_init(void)
+{
+   static DEFINE_MUTEX(stack_depot_capture_init_mutex);
+   static bool utility_stacks_ready;
+
+   mutex_lock(&stack_depot_capture_init_mutex);
+   if (!utility_stacks_ready) {
+   register_recursion_stack();
+   register_failure_stack();
+   utility_stacks_ready = true;
+   }
+   mutex_unlock(&stack_depot_capture_init_mutex);
+
+   return utility_stacks_ready;
+}
+
+/* TODO: teach stack_depot_capture_stack to use off stack temporal storage */
+#define CAPTURE_STACK_DEPTH (16)
+
+depot_stack_handle_t stack_depot_capture_stack(gfp_t flags)
+{
+   unsigned long entries[CAPTURE_STACK_DEPTH];
+   depot_stack_handle_t handle;
+   unsigned int nr_entries;
+
+   /*
+* Avoid recursion.
+*
+* Sometimes page metadata allocation tracking requires more
+* memory to be allocated:
+* - when new stack trace is saved to stack depot
+* - when backtrace itself is calculated (ia64)
+*/
+   if (current->in_capture_stack)
+   return recursion_handle;
+   current->in_capture_stack = 1;
+
+   nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
+   handle = stack_depot_save(entries, nr_entries, flags);
+   if (!handle)
+   handle = failure_handle;
+
+   current->in_capture_stack = 0;
+   return handle;
+}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index fd4af1ad34b8..c3173e34a779 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -15,12 +15,6 @@
 
 #include "internal.h"
 
-/*
- * TODO: teach PAGE_OWNER_STACK_DEPTH (__dump_page_owner and save_stack)
- * to use off stack temporal storage
- */
-#define PAGE_OWNER_STACK_DEPTH (16)
-
 struct page_owner {
unsigned short order;
short last_migrate_reason;
@@ -37,8 +31,6 @@ struct page_owner {
 static bool page_owner_enabled __initdata;
 DEFINE_STATIC_KEY_FALSE(page_owner_inited);
 
-static depot_stack_handle_t dummy_handle;
-static depot_stack_handle_t failure_handle;
 static depot_stack_handle_t early_handle;
 
 static void init_early_allocated_pages(void);
@@ -68,16 +60,6 @@ static __always_inline depot_stack_handle_t 
create_dummy_stack(void)
return stack_depot_save(entries, nr_entries, GFP_KERNEL);
 }
 
-static noinline void register_dummy_sta
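
Usage sketch (not from the patch) for a subsystem that wants to record
stacks the way page_owner now does: initialize the sentinel stacks once,
then capture. The returned handle is either the real trace or one of the
recursion/failure sentinels, so it is always safe to print:

  depot_stack_handle_t handle = 0;

  if (stack_depot_capture_init())
          handle = stack_depot_capture_stack(GFP_KERNEL);
  if (handle)
          stack_depot_print(handle);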

[RFC PATCH 26/30] bcache: Convert to lib/time_stats

2022-08-30 Thread Suren Baghdasaryan
From: Kent Overstreet 

This patch converts bcache to the new generic time_stats code in
lib/time_stats.c. The new code is from bcachefs, and has some changes
from the version in bcache:

 - we now use ktime_get_ns(), not local_clock(). When the code was
   originally written, multiprocessor systems that lacked synchronized
   TSCs were still common, and so local_clock() was much cheaper than
   sched_clock() (though not necessarily fully accurate, due to TSC
   drift). ktime_get_ns() should be cheap enough on all common hardware
   now, and is more standard/correct.

 - time_stats are now exported in a single file in sysfs, which means we
   can improve the statistics we keep track of without changing all
   users. This also means we don't have to manually specify which units
   (ms, us, ns) a given time_stats should be printed in; that's handled
   dynamically.

 - There's a lazily-allocated percpu buffer, which now needs to be freed
   with time_stats_exit().

Signed-off-by: Kent Overstreet 
Cc: Coly Li 
---
 drivers/md/bcache/Kconfig  |  1 +
 drivers/md/bcache/bcache.h |  1 +
 drivers/md/bcache/bset.c   |  8 +++---
 drivers/md/bcache/bset.h   |  1 +
 drivers/md/bcache/btree.c  | 12 
 drivers/md/bcache/super.c  |  3 ++
 drivers/md/bcache/sysfs.c  | 43 
 drivers/md/bcache/util.c   | 30 
 drivers/md/bcache/util.h   | 57 --
 9 files changed, 47 insertions(+), 109 deletions(-)

diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
index 529c9d04e9a4..8d165052e508 100644
--- a/drivers/md/bcache/Kconfig
+++ b/drivers/md/bcache/Kconfig
@@ -4,6 +4,7 @@ config BCACHE
tristate "Block device as cache"
select BLOCK_HOLDER_DEPRECATED if SYSFS
select CRC64
+   select TIME_STATS
help
Allows a block device to be used as cache for other devices; uses
a btree for indexing and the layout is optimized for SSDs.
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 2acda9cea0f9..5100010a3897 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -185,6 +185,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 94d38e8a59b3..727e9b7aead4 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -1251,7 +1251,7 @@ static void __btree_sort(struct btree_keys *b, struct 
btree_iter *iter,
order = state->page_order;
}
 
-   start_time = local_clock();
+   start_time = ktime_get_ns();
 
btree_mergesort(b, out, iter, fixup, false);
b->nsets = start;
@@ -1286,7 +1286,7 @@ static void __btree_sort(struct btree_keys *b, struct 
btree_iter *iter,
bch_bset_build_written_tree(b);
 
if (!start)
-   bch_time_stats_update(&state->time, start_time);
+   time_stats_update(&state->time, start_time);
 }
 
 void bch_btree_sort_partial(struct btree_keys *b, unsigned int start,
@@ -1322,14 +1322,14 @@ void bch_btree_sort_and_fix_extents(struct btree_keys 
*b,
 void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
 struct bset_sort_state *state)
 {
-   uint64_t start_time = local_clock();
+   uint64_t start_time = ktime_get_ns();
struct btree_iter iter;
 
bch_btree_iter_init(b, &iter, NULL);
 
btree_mergesort(b, new->set->data, &iter, false, true);
 
-   bch_time_stats_update(&state->time, start_time);
+   time_stats_update(&state->time, start_time);
 
new->set->size = 0; // XXX: why?
 }
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
index d795c84246b0..13e524ad7783 100644
--- a/drivers/md/bcache/bset.h
+++ b/drivers/md/bcache/bset.h
@@ -3,6 +3,7 @@
 #define _BCACHE_BSET_H
 
 #include 
+#include 
 #include 
 
 #include "bcache_ondisk.h"
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 147c493a989a..abf543bc7551 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -242,7 +242,7 @@ static void btree_node_read_endio(struct bio *bio)
 
 static void bch_btree_node_read(struct btree *b)
 {
-   uint64_t start_time = local_clock();
+   uint64_t start_time = ktime_get_ns();
struct closure cl;
struct bio *bio;
 
@@ -270,7 +270,7 @@ static void bch_btree_node_read(struct btree *b)
goto err;
 
bch_btree_node_read_done(b);
-   bch_time_stats_update(&b->c->btree_read_time, start_time);
+   time_stats_update(&b->c->btree_read_time, start_time);
 
return;
 err:
@@ -1789,7 +1789,7 @@ static void bch_btree_gc(struct cache_set *c)
struct gc_stat stats;
struct closure writes;
struct btree_op op;
-   uint64_t start_time = local_clock();
+   uint64_t start_time = ktime_get_ns();
 
trace_bcache_gc_start(c);
 
@@ -1815,7 +1815,7 @@ st

[RFC PATCH 25/30] lib/time_stats: New library for statistics on events

2022-08-30 Thread Suren Baghdasaryan
From: Kent Overstreet 

This adds a small new library for tracking statistics on events that
have a duration, i.e. a start and end time.

 - number of events
 - rate/frequency
 - average duration
 - max duration
 - duration quantiles

This code comes from bcachefs, and originally bcache: the next patch
will be converting bcache to use this version, and a subsequent patch
will be using code_tagging to instrument all wait_event() calls in the
kernel.

Signed-off-by: Kent Overstreet 
---
 include/linux/time_stats.h |  44 +++
 lib/Kconfig|   3 +
 lib/Makefile   |   1 +
 lib/time_stats.c   | 236 +
 4 files changed, 284 insertions(+)
 create mode 100644 include/linux/time_stats.h
 create mode 100644 lib/time_stats.c

diff --git a/include/linux/time_stats.h b/include/linux/time_stats.h
new file mode 100644
index ..7ae929e6f836
--- /dev/null
+++ b/include/linux/time_stats.h
@@ -0,0 +1,44 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _LINUX_TIMESTATS_H
+#define _LINUX_TIMESTATS_H
+
+#include 
+#include 
+
+#define NR_QUANTILES   15
+
+struct quantiles {
+   struct quantile_entry {
+   u64 m;
+   u64 step;
+   }   entries[NR_QUANTILES];
+};
+
+struct time_stat_buffer {
+   unsigned intnr;
+   struct time_stat_buffer_entry {
+   u64 start;
+   u64 end;
+   }   entries[32];
+};
+
+struct time_stats {
+   spinlock_t  lock;
+   u64 count;
+   /* all fields are in nanoseconds */
+   u64 average_duration;
+   u64 average_frequency;
+   u64 max_duration;
+   u64 last_event;
+   struct quantiles quantiles;
+
+   struct time_stat_buffer __percpu *buffer;
+};
+
+struct seq_buf;
+void time_stats_update(struct time_stats *stats, u64 start);
+void time_stats_to_text(struct seq_buf *out, struct time_stats *stats);
+void time_stats_exit(struct time_stats *stats);
+
+#endif /* _LINUX_TIMESTATS_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index fc6dbc425728..884fd9f2f06d 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -744,3 +744,6 @@ config ASN1_ENCODER
 
 config POLYNOMIAL
tristate
+
+config TIME_STATS
+   bool
diff --git a/lib/Makefile b/lib/Makefile
index 489ea000c528..e54392011f5e 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -232,6 +232,7 @@ obj-$(CONFIG_ALLOC_TAGGING) += alloc_tag.o
 obj-$(CONFIG_PAGE_ALLOC_TAGGING) += pgalloc_tag.o
 
 obj-$(CONFIG_CODETAG_FAULT_INJECTION) += dynamic_fault.o
+obj-$(CONFIG_TIME_STATS) += time_stats.o
 
 lib-$(CONFIG_GENERIC_BUG) += bug.o
 
diff --git a/lib/time_stats.c b/lib/time_stats.c
new file mode 100644
index ..30362364fdd2
--- /dev/null
+++ b/lib/time_stats.c
@@ -0,0 +1,236 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static inline unsigned int eytzinger1_child(unsigned int i, unsigned int child)
+{
+   return (i << 1) + child;
+}
+
+static inline unsigned int eytzinger1_right_child(unsigned int i)
+{
+   return eytzinger1_child(i, 1);
+}
+
+static inline unsigned int eytzinger1_next(unsigned int i, unsigned int size)
+{
+   if (eytzinger1_right_child(i) <= size) {
+   i = eytzinger1_right_child(i);
+
+   i <<= __fls(size + 1) - __fls(i);
+   i >>= i > size;
+   } else {
+   i >>= ffz(i) + 1;
+   }
+
+   return i;
+}
+
+static inline unsigned int eytzinger0_child(unsigned int i, unsigned int child)
+{
+   return (i << 1) + 1 + child;
+}
+
+static inline unsigned int eytzinger0_first(unsigned int size)
+{
+   return rounddown_pow_of_two(size) - 1;
+}
+
+static inline unsigned int eytzinger0_next(unsigned int i, unsigned int size)
+{
+   return eytzinger1_next(i + 1, size) - 1;
+}
+
+#define eytzinger0_for_each(_i, _size) \
+   for ((_i) = eytzinger0_first((_size));  \
+(_i) != -1;\
+(_i) = eytzinger0_next((_i), (_size)))
+
+#define ewma_add(ewma, val, weight)\
+({ \
+   typeof(ewma) _ewma = (ewma);\
+   typeof(weight) _weight = (weight);  \
+   \
+   (((_ewma << _weight) - _ewma) + (val)) >> _weight;  \
+})
+
+static void quantiles_update(struct quantiles *q, u64 v)
+{
+   unsigned int i = 0;
+
+   while (i < ARRAY_SIZE(q->entries)) {
+   struct quantile_entry *e = q->entries + i;
+
+   if (unlikely(!e->step)) {
+   e->m = v;
+   e->step = max_t(unsigned int, v / 2, 1024)
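
lib/time_stats.c is truncated above. A usage sketch of the API from the
header: the excerpt shows no init helper, so static spinlock initialization
is assumed here (patch 27 in this series initializes .lock the same way),
and do_flush() is a hypothetical event being timed. The percpu buffer is
allocated lazily and must be freed with time_stats_exit():

  static struct time_stats flush_times = {
          .lock = __SPIN_LOCK_UNLOCKED(flush_times.lock),
  };

  u64 start = ktime_get_ns();
  do_flush();
  time_stats_update(&flush_times, start);

  /* report, e.g. from a debugfs read handler ("out" is a struct seq_buf): */
  time_stats_to_text(&out, &flush_times);

  /* on teardown, free the lazily-allocated percpu buffer: */
  time_stats_exit(&flush_times);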

[RFC PATCH 24/30] wait: Clean up waitqueue_entry initialization

2022-08-30 Thread Suren Baghdasaryan
From: Kent Overstreet 

Cleanup for code tagging latency tracking:

Add an initializer, WAIT_FUNC_INITIALIZER(), to be used by initializers
for structs that include wait_queue_entries.

Also, change init_wait(), init_wait_entry() etc. to be wrappers around
the new __init_waitqueue_entry(); more de-duplication prep work.

Signed-off-by: Kent Overstreet 
Cc: Ingo Molnar 
Cc: Peter Zijlstra 
---
 include/linux/sbitmap.h  |  6 +
 include/linux/wait.h | 52 +++-
 include/linux/wait_bit.h |  7 +-
 kernel/sched/wait.c  |  9 ---
 4 files changed, 27 insertions(+), 47 deletions(-)

diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 8f5a86e210b9..f696c29d9ab3 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -596,11 +596,7 @@ struct sbq_wait {
 #define DEFINE_SBQ_WAIT(name)  
\
struct sbq_wait name = {
\
.sbq = NULL,
\
-   .wait = {   
\
-   .private= current,  
\
-   .func   = autoremove_wake_function, 
\
-   .entry  = LIST_HEAD_INIT((name).wait.entry),
\
-   }   
\
+   .wait = WAIT_FUNC_INITIALIZER((name).wait, 
autoremove_wake_function),\
}
 
 /*
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 58cfbf81447c..91ced6a118bc 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -79,21 +79,38 @@ extern void __init_waitqueue_head(struct wait_queue_head 
*wq_head, const char *n
 # define DECLARE_WAIT_QUEUE_HEAD_ONSTACK(name) DECLARE_WAIT_QUEUE_HEAD(name)
 #endif
 
-static inline void init_waitqueue_entry(struct wait_queue_entry *wq_entry, 
struct task_struct *p)
-{
-   wq_entry->flags = 0;
-   wq_entry->private   = p;
-   wq_entry->func  = default_wake_function;
+#define WAIT_FUNC_INITIALIZER(name, function) {
\
+   .private= current,  
\
+   .func   = function, 
\
+   .entry  = LIST_HEAD_INIT((name).entry), 
\
 }
 
+#define DEFINE_WAIT_FUNC(name, function)   
\
+   struct wait_queue_entry name = WAIT_FUNC_INITIALIZER(name, function)
+
+#define DEFINE_WAIT(name) DEFINE_WAIT_FUNC(name, autoremove_wake_function)
+
 static inline void
-init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t 
func)
+__init_waitqueue_entry(struct wait_queue_entry *wq_entry, unsigned int flags,
+  void *private, wait_queue_func_t func)
 {
-   wq_entry->flags = 0;
-   wq_entry->private   = NULL;
+   wq_entry->flags = flags;
+   wq_entry->private   = private;
wq_entry->func  = func;
+   INIT_LIST_HEAD(&wq_entry->entry);
 }
 
+#define init_waitqueue_func_entry(_wq_entry, _func)\
+   __init_waitqueue_entry(_wq_entry, 0, NULL, _func)
+
+#define init_waitqueue_entry(_wq_entry, _task) \
+   __init_waitqueue_entry(_wq_entry, 0, _task, default_wake_function)
+
+#define init_wait_entry(_wq_entry, _flags) \
+   __init_waitqueue_entry(_wq_entry, _flags, current, 
autoremove_wake_function)
+
+#define init_wait(wait)init_wait_entry(wait, 0)
+
 /**
  * waitqueue_active -- locklessly test for waiters on the queue
  * @wq_head: the waitqueue to test for waiters
@@ -283,8 +300,6 @@ static inline void wake_up_pollfree(struct wait_queue_head 
*wq_head)
(!__builtin_constant_p(state) ||
\
state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE)  
\
 
-extern void init_wait_entry(struct wait_queue_entry *wq_entry, int flags);
-
 /*
  * The below macro ___wait_event() has an explicit shadow of the __ret
  * variable when used from the wait_event_*() macros.
@@ -1170,23 +1185,6 @@ long wait_woken(struct wait_queue_entry *wq_entry, 
unsigned mode, long timeout);
 int woken_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int 
sync, void *key);
 int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, 
int sync, void *key);
 
-#define DEFINE_WAIT_FUNC(name, function)   
\
-   struct wait_queue_entry name = {
\
-   .private= current,  
\
-   .func   = function,

[RFC PATCH 30/30] MAINTAINERS: Add entries for code tagging & related

2022-08-30 Thread Suren Baghdasaryan
From: Kent Overstreet 

The new code & libraries added are being maintained - mark them as such.

Signed-off-by: Kent Overstreet 
---
 MAINTAINERS | 34 ++
 1 file changed, 34 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 589517372408..902c96744bcb 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5111,6 +5111,19 @@ S:   Supported
 F: Documentation/process/code-of-conduct-interpretation.rst
 F: Documentation/process/code-of-conduct.rst
 
+CODE TAGGING
+M: Suren Baghdasaryan 
+M: Kent Overstreet 
+S: Maintained
+F: lib/codetag.c
+F: include/linux/codetag.h
+
+CODE TAGGING TIME STATS
+M: Kent Overstreet 
+S: Maintained
+F: lib/codetag_time_stats.c
+F: include/linux/codetag_time_stats.h
+
 COMEDI DRIVERS
 M: Ian Abbott 
 M: H Hartley Sweeten 
@@ -11405,6 +11418,12 @@ M: John Hawley 
 S: Maintained
 F: tools/testing/ktest
 
+LAZY PERCPU COUNTERS
+M: Kent Overstreet 
+S: Maintained
+F: lib/lazy-percpu-counter.c
+F: include/linux/lazy-percpu-counter.h
+
 L3MDEV
 M: David Ahern 
 L: net...@vger.kernel.org
@@ -13124,6 +13143,15 @@ F: include/linux/memblock.h
 F: mm/memblock.c
 F: tools/testing/memblock/
 
+MEMORY ALLOCATION TRACKING
+M: Suren Baghdasaryan 
+M: Kent Overstreet 
+S: Maintained
+F: lib/alloc_tag.c
+F: lib/pgalloc_tag.c
+F: include/linux/alloc_tag.h
+F: include/linux/codetag_ctx.h
+
 MEMORY CONTROLLER DRIVERS
 M: Krzysztof Kozlowski 
 L: linux-ker...@vger.kernel.org
@@ -20421,6 +20449,12 @@ T: git 
git://git.kernel.org/pub/scm/linux/kernel/git/luca/wl12xx.git
 F: drivers/net/wireless/ti/
 F: include/linux/wl12xx.h
 
+TIME STATS
+M: Kent Overstreet 
+S: Maintained
+F: lib/time_stats.c
+F: include/linux/time_stats.h
+
 TIMEKEEPING, CLOCKSOURCE CORE, NTP, ALARMTIMER
 M: John Stultz 
 M: Thomas Gleixner 
-- 
2.37.2.672.g94769d06f0-goog




[RFC PATCH 29/30] dyndbg: Convert to code tagging

2022-08-30 Thread Suren Baghdasaryan
From: Kent Overstreet 

This converts dynamic debug to the new code tagging framework, which
provides an interface for iterating over objects in a particular elf
section.

It also converts the debugfs interface from seq_file to the style used
by other code tagging users, which makes the code a bit smaller and
simpler.

It doesn't yet convert struct _ddebug to use struct codetag; another
cleanup could convert it to that, and to codetag_query_parse().

Signed-off-by: Kent Overstreet 
Cc: Jason Baron 
Cc: Luis Chamberlain 
---
 include/asm-generic/codetag.lds.h |   5 +-
 include/asm-generic/vmlinux.lds.h |   5 -
 include/linux/dynamic_debug.h |  11 +-
 kernel/module/internal.h  |   2 -
 kernel/module/main.c  |  23 --
 lib/dynamic_debug.c   | 452 ++
 6 files changed, 158 insertions(+), 340 deletions(-)

diff --git a/include/asm-generic/codetag.lds.h 
b/include/asm-generic/codetag.lds.h
index b087cf1874a9..b7e351f80e9e 100644
--- a/include/asm-generic/codetag.lds.h
+++ b/include/asm-generic/codetag.lds.h
@@ -8,10 +8,11 @@
KEEP(*(_name))  \
__stop_##_name = .;
 
-#define CODETAG_SECTIONS() \
+#define CODETAG_SECTIONS() \
SECTION_WITH_BOUNDARIES(alloc_tags) \
SECTION_WITH_BOUNDARIES(dynamic_fault_tags) \
SECTION_WITH_BOUNDARIES(time_stats_tags)\
-   SECTION_WITH_BOUNDARIES(error_code_tags)
+   SECTION_WITH_BOUNDARIES(error_code_tags)\
+   SECTION_WITH_BOUNDARIES(dyndbg)
 
 #endif /* __ASM_GENERIC_CODETAG_LDS_H */
diff --git a/include/asm-generic/vmlinux.lds.h 
b/include/asm-generic/vmlinux.lds.h
index c2dc2a59ab2e..d3fb914d157f 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -345,11 +345,6 @@
__end_once = .; \
STRUCT_ALIGN(); \
*(__tracepoints)\
-   /* implement dynamic printk debug */\
-   . = ALIGN(8);   \
-   __start___dyndbg = .;   \
-   KEEP(*(__dyndbg))   \
-   __stop___dyndbg = .;\
CODETAG_SECTIONS()  \
LIKELY_PROFILE()\
BRANCH_PROFILE()\
diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
index dce631e678dd..6a57009dd29e 100644
--- a/include/linux/dynamic_debug.h
+++ b/include/linux/dynamic_debug.h
@@ -58,9 +58,6 @@ struct _ddebug {
 /* exported for module authors to exercise >control */
 int dynamic_debug_exec_queries(const char *query, const char *modname);
 
-int ddebug_add_module(struct _ddebug *tab, unsigned int n,
-   const char *modname);
-extern int ddebug_remove_module(const char *mod_name);
 extern __printf(2, 3)
 void __dynamic_pr_debug(struct _ddebug *descriptor, const char *fmt, ...);
 
@@ -89,7 +86,7 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor,
 
 #define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt)   \
static struct _ddebug  __aligned(8) \
-   __section("__dyndbg") name = {  \
+   __section("dyndbg") name = {\
.modname = KBUILD_MODNAME,  \
.function = __func__,   \
.filename = __FILE__,   \
@@ -187,12 +184,6 @@ void __dynamic_ibdev_dbg(struct _ddebug *descriptor,
 #include 
 #include 
 
-static inline int ddebug_add_module(struct _ddebug *tab, unsigned int n,
-   const char *modname)
-{
-   return 0;
-}
-
 static inline int ddebug_remove_module(const char *mod)
 {
return 0;
diff --git a/kernel/module/internal.h b/kernel/module/internal.h
index f1b6c477bd93..f867c57ab74f 100644
--- a/kernel/module/internal.h
+++ b/kernel/module/internal.h
@@ -62,8 +62,6 @@ struct load_info {
Elf_Shdr *sechdrs;
char *secstrings, *strtab;
unsigned long symoffs, stroffs, init_typeoffs, core_typeoffs;
-   struct _ddebug *debug;
-   unsigned int num_debug;
bool sig_ok;
 #ifdef CONFIG_KALLSYMS
unsigned long mod_kallsyms_init_off;
diff --git a/kernel/module/main.c b/kernel/module/main.c
index d253277492fd..28e3b337841b 100644
--- a/kernel/module/main.c
+++ b/kernel/module/main.c
@@ -1163,9 +1163,6 @@ static void free_module(struct module *mod)
mod->state = MODULE_STATE_UNFORMED;
mutex_unlock(&module_mutex);
 
-   /* Remove dynamic debug info */

[RFC PATCH 28/30] Improved symbolic error names

2022-08-30 Thread Suren Baghdasaryan
From: Kent Overstreet 

This patch adds per-error-site error codes, with error strings that
include their file and line number.

To use, change code that returns an error, e.g.
return -ENOMEM;
to
return -ERR(ENOMEM);

Then, errname() will return a string that includes the file and line
number of the ERR() call, for example
printk("Got error %s!\n", errname(err));
will result in
Got error ENOMEM at foo.c:1234

To convert back to the original error code (before returning it to
outside code that does not understand dynamic error codes), use
return error_class(err);

To test if an error is of some type, replace
if (err == -ENOMEM)
with
if (error_matches(err, ENOMEM))

Implementation notes:

Error codes are allocated dynamically on module load and deallocated on
module unload. On memory allocation failure (i.e. failure to allocate
the data structures for indexing error strings and error parents), ERR()
will fall back to returning the error code that it was passed.

MAX_ERRNO has been raised from 4095 to roughly one million ((1 << 20) - 1),
which should be sufficient given the number of lines of code in the kernel
codebase and the fraction that return errors.

This has implications for ERR_PTR(), since the range of the address
space reserved for errors is unavailable for other purposes. Since
ERR_PTR() pointers are at the top of the address space there should not
be any major difficulties.

Signed-off-by: Kent Overstreet 
---
 include/asm-generic/codetag.lds.h |   3 +-
 include/linux/err.h   |   2 +-
 include/linux/errname.h   |  50 +++
 lib/errname.c | 103 ++
 4 files changed, 156 insertions(+), 2 deletions(-)

diff --git a/include/asm-generic/codetag.lds.h 
b/include/asm-generic/codetag.lds.h
index d799f4aced82..b087cf1874a9 100644
--- a/include/asm-generic/codetag.lds.h
+++ b/include/asm-generic/codetag.lds.h
@@ -11,6 +11,7 @@
 #define CODETAG_SECTIONS() \
SECTION_WITH_BOUNDARIES(alloc_tags) \
SECTION_WITH_BOUNDARIES(dynamic_fault_tags) \
-   SECTION_WITH_BOUNDARIES(time_stats_tags)
+   SECTION_WITH_BOUNDARIES(time_stats_tags)\
+   SECTION_WITH_BOUNDARIES(error_code_tags)
 
 #endif /* __ASM_GENERIC_CODETAG_LDS_H */
diff --git a/include/linux/err.h b/include/linux/err.h
index a139c64aef2a..1d8d6c46ab9c 100644
--- a/include/linux/err.h
+++ b/include/linux/err.h
@@ -15,7 +15,7 @@
  * This should be a per-architecture thing, to allow different
  * error and pointer decisions.
  */
-#define MAX_ERRNO  4095
+#define MAX_ERRNO  ((1 << 20) - 1)
 
 #ifndef __ASSEMBLY__
 
diff --git a/include/linux/errname.h b/include/linux/errname.h
index e8576ad90cb7..dd39fe7120bb 100644
--- a/include/linux/errname.h
+++ b/include/linux/errname.h
@@ -5,12 +5,62 @@
 #include 
 
 #ifdef CONFIG_SYMBOLIC_ERRNAME
+
 const char *errname(int err);
+
+#include 
+
+struct codetag_error_code {
+   const char  *str;
+   int err;
+};
+
+/**
+ * ERR - return an error code that records the error site
+ *
+ * E.g., instead of
+ *   return -ENOMEM;
+ * Use
+ *   return -ERR(ENOMEM);
+ *
+ * Then, when a caller prints out the error with errname(), the error string
+ * will include the file and line number.
+ */
+#define ERR(_err)  \
+({ \
+   static struct codetag_error_code\
+   __used  \
+   __section("error_code_tags")\
+   __aligned(8) e = {  \
+   .str= #_err " at " __FILE__ ":" __stringify(__LINE__),\
+   .err= _err, \
+   };  \
+   \
+   e.err;  \
+})
+
+int error_class(int err);
+bool error_matches(int err, int class);
+
 #else
+
+static inline int error_class(int err)
+{
+   return err;
+}
+
+static inline bool error_matches(int err, int class)
+{
+   return err == class;
+}
+
+#define ERR(_err)  _err
+
 static inline const char *errname(int err)
 {
return NULL;
 }
+
 #endif
 
 #endif /* _LINUX_ERRNAME_H */
diff --git a/lib/errname.c b/lib/errname.c
index 05cbf731545f..2db8f5301ba0 100644
--- a/lib/errname.c
+++ b/lib/errname.c
@@ -1,9 +1,20 @@
 // SPDX-License-Identifier: GPL-2.0
 #include 
+#include 
 #include 
 #include 
+#include 
 #include 
 #include 
+#include 
+#include 
+
+#define DYNAMIC_ERRCODE_START  4096
+
+static DEFINE_IDR(dynamic_error_strings);
+static DEFINE_XARRAY(error_classes);
+
+static struct codetag_type *cttype;
 
 /*
  * Ensure these tables do not accidentally become giganti
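
Putting the pieces from the commit message together in one place (the
mydrv_* names are hypothetical, and the calls follow the commit message's
usage): tag the error site, test and print symbolically, then strip the
site information before returning the error to code that expects classic
errnos.

  static int mydrv_reserve(struct mydrv *d)
  {
          if (!mydrv_try_reserve(d))
                  return -ERR(ENOMEM);    /* distinct per-site error code */
          return 0;
  }

  /* in the caller: */
  ret = mydrv_reserve(d);
  if (error_matches(ret, ENOMEM))
          pr_warn("reserve failed: %s\n", errname(ret));
  return error_class(ret);        /* plain -ENOMEM for outside callers */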

[RFC PATCH 23/30] timekeeping: Add a missing include

2022-08-30 Thread Suren Baghdasaryan
From: Kent Overstreet 

We need ktime.h for ktime_t.

Signed-off-by: Kent Overstreet 
---
 include/linux/timekeeping.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index fe1e467ba046..7c43e98cf211 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -4,6 +4,7 @@
 
 #include 
 #include 
+#include 
 
 /* Included from linux/ktime.h */
 
-- 
2.37.2.672.g94769d06f0-goog




[RFC PATCH 27/30] Code tagging based latency tracking

2022-08-30 Thread Suren Baghdasaryan
From: Kent Overstreet 

This adds the ability to easily instrument code for measuring latency.
To use, add the following calls to your code, at the start and end of
the event you wish to measure:

  codetag_time_stats_start(start_time);
  codetag_time_stats_finish(start_time);

Statistics will then show up in debugfs under
/sys/kernel/debug/time_stats, listed by file and line number.

Statistics measured include weighted averages of frequency and duration,
max duration, as well as quantiles.

This patch also instruments all calls to init_wait and finish_wait,
which includes all calls to wait_event. Example debugfs output:

fs/xfs/xfs_trans_ail.c:746 module:xfs func:xfs_ail_push_all_sync
count:  17
rate:   0/sec
frequency:  2 sec
avg duration:   10 us
max duration:   232 us
quantiles (ns): 128 128 128 128 128 128 128 128 128 128 128 128 128 128 128

lib/sbitmap.c:813 module:sbitmap func:sbitmap_finish_wait
count:  3
rate:   0/sec
frequency:  4 sec
avg duration:   4 sec
max duration:   4 sec
quantiles (ns): 0 4288669120 4288669120 5360836048 5360836048 5360836048 
5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 5360836048 
5360836048 5360836048

net/core/datagram.c:122 module:datagram func:__skb_wait_for_more_packets
count:  10
rate:   1/sec
frequency:  859 ms
avg duration:   472 ms
max duration:   30 sec
quantiles (ns): 0 12279 12279 15669 15669 15669 15669 17217 17217 17217 17217 
17217 17217 17217 17217

Signed-off-by: Kent Overstreet 
---
 include/asm-generic/codetag.lds.h  |   3 +-
 include/linux/codetag_time_stats.h |  54 +++
 include/linux/io_uring_types.h |   2 +-
 include/linux/wait.h   |  22 -
 kernel/sched/wait.c|   6 +-
 lib/Kconfig.debug  |   8 ++
 lib/Makefile   |   1 +
 lib/codetag_time_stats.c   | 143 +
 8 files changed, 233 insertions(+), 6 deletions(-)
 create mode 100644 include/linux/codetag_time_stats.h
 create mode 100644 lib/codetag_time_stats.c

diff --git a/include/asm-generic/codetag.lds.h 
b/include/asm-generic/codetag.lds.h
index 16fbf74edc3d..d799f4aced82 100644
--- a/include/asm-generic/codetag.lds.h
+++ b/include/asm-generic/codetag.lds.h
@@ -10,6 +10,7 @@
 
 #define CODETAG_SECTIONS() \
SECTION_WITH_BOUNDARIES(alloc_tags) \
-   SECTION_WITH_BOUNDARIES(dynamic_fault_tags)
+   SECTION_WITH_BOUNDARIES(dynamic_fault_tags) \
+   SECTION_WITH_BOUNDARIES(time_stats_tags)
 
 #endif /* __ASM_GENERIC_CODETAG_LDS_H */
diff --git a/include/linux/codetag_time_stats.h 
b/include/linux/codetag_time_stats.h
new file mode 100644
index ..7e44c7ee9e9b
--- /dev/null
+++ b/include/linux/codetag_time_stats.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_CODETAG_TIMESTATS_H
+#define _LINUX_CODETAG_TIMESTATS_H
+
+/*
+ * Code tagging based latency tracking:
+ * (C) 2022 Kent Overstreet
+ *
+ * This allows you to easily instrument code to track latency, and have the
+ * results show up in debugfs. To use, add the following two calls to your code
+ * at the beginning and end of the event you wish to instrument:
+ *
+ * codetag_time_stats_start(start_time);
+ * codetag_time_stats_finish(start_time);
+ *
+ * Statistics will then show up in debugfs under /sys/kernel/debug/time_stats,
+ * listed by file and line number.
+ */
+
+#ifdef CONFIG_CODETAG_TIME_STATS
+
+#include 
+#include 
+#include 
+
+struct codetag_time_stats {
+   struct codetag  tag;
+   struct time_stats   stats;
+};
+
+#define codetag_time_stats_start(_start_time)  u64 _start_time = ktime_get_ns()
+
+#define codetag_time_stats_finish(_start_time) \
+do {   \
+   static struct codetag_time_stats\
+   __used  \
+   __section("time_stats_tags")\
+   __aligned(8) s = {  \
+   .tag= CODE_TAG_INIT,\
+   .stats.lock = __SPIN_LOCK_UNLOCKED(_lock)   \
+   };  \
+   \
+   WARN_ONCE(!(_start_time), "codetag_time_stats_start() not called");\
+   time_stats_update(&s.stats, _start_time);   \
+} while (0)
+
+#else
+
+#define codetag_time_stats_finish(_start_time) do {} while (0)
+#define codetag_time_stats_start(_start_time)  do {} while (0)
+
+#endif /* CODETAG_CODETAG_TIME_STATS */
+
+#endif
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 677a25d44d7f..3bcef85eacd8 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -488,7 +488,7 @@ struct io_cqe {
 struct io_cmd_data {
s
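
A usage sketch beyond the init_wait/finish_wait instrumentation this patch
wires up (the mydrv_* names and the completion are hypothetical): each
codetag_time_stats_start/finish pair creates one static time_stats entry
keyed to that file and line.

  static int mydrv_flush(struct mydrv *d)
  {
          int ret;

          codetag_time_stats_start(start_time);
          ret = wait_for_completion_interruptible(&d->flush_done);
          codetag_time_stats_finish(start_time);

          return ret;
  }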

Re: [RFC PATCH 00/30] Code tagging framework and applications

2022-08-31 Thread Suren Baghdasaryan
On Wed, Aug 31, 2022 at 3:47 AM Michal Hocko  wrote:
>
> On Wed 31-08-22 11:19:48, Mel Gorman wrote:
> > On Wed, Aug 31, 2022 at 04:42:30AM -0400, Kent Overstreet wrote:
> > > On Wed, Aug 31, 2022 at 09:38:27AM +0200, Peter Zijlstra wrote:
> > > > On Tue, Aug 30, 2022 at 02:48:49PM -0700, Suren Baghdasaryan wrote:
> > > > > ===
> > > > > Code tagging framework
> > > > > ===
> > > > > Code tag is a structure identifying a specific location in the source 
> > > > > code
> > > > > which is generated at compile time and can be embedded in an 
> > > > > application-
> > > > > specific structure. Several applications of code tagging are included 
> > > > > in
> > > > > this RFC, such as memory allocation tracking, dynamic fault injection,
> > > > > latency tracking and improved error code reporting.
> > > > > Basically, it takes the old trick of "define a special elf section for
> > > > > objects of a given type so that we can iterate over them at runtime" 
> > > > > and
> > > > > creates a proper library for it.
> > > >
> > > > I might be super dense this morning, but what!? I've skimmed through the
> > > > set and I don't think I get it.
> > > >
> > > > What does this provide that ftrace/kprobes don't already allow?
> > >
> > > You're kidding, right?
> >
> > It's a valid question. From the description, it main addition that would
> > be hard to do with ftrace or probes is catching where an error code is
> > returned. A secondary addition would be catching all historical state and
> > not just state since the tracing started.
> >
> > It's also unclear *who* would enable this. It looks like it would mostly
> > have value during the development stage of an embedded platform to track
> > kernel memory usage on a per-application basis in an environment where it
> > may be difficult to setup tracing and tracking. Would it ever be enabled
> > in production? Would a distribution ever enable this? If it's enabled, any
> > overhead cannot be disabled/enabled at run or boot time so anyone enabling
> > this would carry the cost without never necessarily consuming the data.

Thank you for the question.
For memory tracking my intent is to have a mechanism that can be enabled
in field testing (pre-production testing on a large population of
internal users).
The issue we often face is memory leaks that happen in the field but are
very hard to reproduce locally. We get a bug report from the user which
indicates the leak but often does not contain enough information to track
it down. Note that quite often these leaks/issues happen in drivers, so
even simply finding out where they came from is a big help.
The way I envision this mechanism being used is to enable basic memory
tracking in the field tests and have a user space process collect the
allocation statistics periodically (say once an hour). Once it detects
some counter growing unboundedly or atypically (the definition of this is
left to user space), it can enable context capturing only for that
specific location, still keeping the overhead to a minimum while getting
more information about potential issues. Collected stats and contexts are
then attached to the bug report, and we get more visibility into the
issue when we receive it.
The goal is to provide a mechanism with low enough overhead that it can
be enabled all the time during these field tests without affecting the
device's performance profiles. Tracing is very cheap when it's disabled,
but having it enabled all the time would introduce higher overhead than
the counter manipulations.
My apologies, I should have clarified all this in the cover letter
from the beginning.

As for the other applications, maybe I'm not such an advanced user of
tracing, but I think only the latency tracking application could be done
with tracing, assuming we have all the right tracepoints. I don't see how
we would use tracing for fault injection or descriptive error codes.
Again, I might be mistaken.

Thanks,
Suren.

> >
> > It might be an ease-of-use thing. Gathering the information from traces
> > is tricky and would need combining multiple different elements and that
> > is development effort but not impossible.
> >
> > Whatever asking for an explanation as to why equivalent functionality
> > cannot not be created from ftrace/kprobe/eBPF/whatever is reasonable.
>
> Fully agreed and this is especially true for a change this size
> 77 files changed, 3406 insertions(+), 703 deletions(-)
>
> --
> Michal Hocko
> SUSE Labs



Re: [RFC PATCH 03/30] Lazy percpu counters

2022-08-31 Thread Suren Baghdasaryan
On Wed, Aug 31, 2022 at 3:02 AM Mel Gorman  wrote:
>
> On Tue, Aug 30, 2022 at 02:48:52PM -0700, Suren Baghdasaryan wrote:
> > From: Kent Overstreet 
> >
> > This patch adds lib/lazy-percpu-counter.c, which implements counters
> > that start out as atomics, but lazily switch to percpu mode if the
> > update rate crosses some threshold (arbitrarily set at 256 per second).
> >
> > Signed-off-by: Kent Overstreet 
>
> Why not use percpu_counter? It has a per-cpu counter that is synchronised
> when a batch threshold (default 32) is exceeded and can explicitly sync
> the counters when required assuming the synchronised count is only needed
> when reading debugfs.

The intent is to use atomic counters for places that are not updated very
often, which saves the memory required for the counters. Originally I had
a config option to choose which counter type to use, but with lazy
counters we sacrifice memory for performance only when needed, while
keeping the other counters small.

>
> --
> Mel Gorman
> SUSE Labs



Re: [RFC PATCH 10/30] mm: enable page allocation tagging for __get_free_pages and alloc_pages

2022-08-31 Thread Suren Baghdasaryan
On Wed, Aug 31, 2022 at 3:11 AM Mel Gorman  wrote:
>
> On Tue, Aug 30, 2022 at 02:48:59PM -0700, Suren Baghdasaryan wrote:
> > Redefine alloc_pages, __get_free_pages to record allocations done by
> > these functions. Instrument deallocation hooks to record object freeing.
> >
> > Signed-off-by: Suren Baghdasaryan 
> > +#ifdef CONFIG_PAGE_ALLOC_TAGGING
> > +
> >  #include 
> >  #include 
> >
> > @@ -25,4 +27,37 @@ static inline void pgalloc_tag_dec(struct page *page, 
> > unsigned int order)
> >   alloc_tag_sub(get_page_tag_ref(page), PAGE_SIZE << order);
> >  }
> >
> > +/*
> > + * Redefinitions of the common page allocators/destructors
> > + */
> > +#define pgtag_alloc_pages(gfp, order)  
> >   \
> > +({   \
> > + struct page *_page = _alloc_pages((gfp), (order));  \
> > + \
> > + if (_page)  \
> > + alloc_tag_add(get_page_tag_ref(_page), PAGE_SIZE << (order));\
> > + _page;  \
> > +})
> > +
>
> Instead of renaming alloc_pages, why is the tagging not done in
> __alloc_pages()? At least __alloc_pages_bulk() is also missed. The branch
> can be guarded with IS_ENABLED.

Hmm. Assuming all the other allocators using __alloc_pages are inlined, that
should work. I'll try that and, if it works, will incorporate it in the next
respin. Thanks!

I don't think IS_ENABLED is required because the tagging functions are already
defined as empty when the appropriate configs are not enabled. Unless I
misunderstood your note.

>
> > +#define pgtag_get_free_pages(gfp_mask, order)  
> >   \
> > +({   \
> > + struct page *_page; \
> > + unsigned long _res = _get_free_pages((gfp_mask), (order), &_page);\
> > + \
> > + if (_res)   \
> > + alloc_tag_add(get_page_tag_ref(_page), PAGE_SIZE << (order));\
> > + _res;   \
> > +})
> > +
>
> Similar, the tagging could happen in a core function instead of a wrapper.
>
> > +#else /* CONFIG_PAGE_ALLOC_TAGGING */
> > +
> > +#define pgtag_alloc_pages(gfp, order) _alloc_pages(gfp, order)
> > +
> > +#define pgtag_get_free_pages(gfp_mask, order) \
> > + _get_free_pages((gfp_mask), (order), NULL)
> > +
> > +#define pgalloc_tag_dec(__page, __size)  do {} while (0)
> > +
> > +#endif /* CONFIG_PAGE_ALLOC_TAGGING */
> > +
> >  #endif /* _LINUX_PGALLOC_TAG_H */
> > diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> > index b73d3248d976..f7e6d9564a49 100644
> > --- a/mm/mempolicy.c
> > +++ b/mm/mempolicy.c
> > @@ -2249,7 +2249,7 @@ EXPORT_SYMBOL(vma_alloc_folio);
> >   * flags are used.
> >   * Return: The page on success or NULL if allocation fails.
> >   */
> > -struct page *alloc_pages(gfp_t gfp, unsigned order)
> > +struct page *_alloc_pages(gfp_t gfp, unsigned int order)
> >  {
> >   struct mempolicy *pol = &default_policy;
> >   struct page *page;
> > @@ -2273,7 +2273,7 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
> >
> >   return page;
> >  }
> > -EXPORT_SYMBOL(alloc_pages);
> > +EXPORT_SYMBOL(_alloc_pages);
> >
> >  struct folio *folio_alloc(gfp_t gfp, unsigned order)
> >  {
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index e5486d47406e..165daba19e2a 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -763,6 +763,7 @@ static inline bool pcp_allowed_order(unsigned int order)
> >
> >  static inline void free_the_page(struct page *page, unsigned int order)
> >  {
> > +
> >   if (pcp_allowed_order(order))   /* Via pcp? */
> >   free_unref_page(page, order);
> >   else
>
> Spurious whitespace change.
>
> --
> Mel Gorman
> SUSE Labs



Re: [RFC PATCH 22/30] Code tagging based fault injection

2022-08-31 Thread Suren Baghdasaryan
On Wed, Aug 31, 2022 at 3:37 AM Dmitry Vyukov  wrote:
>
> On Tue, 30 Aug 2022 at 23:50, Suren Baghdasaryan  wrote:
> >
> > From: Kent Overstreet 
> >
> > This adds a new fault injection capability, based on code tagging.
> >
> > To use, simply insert somewhere in your code
> >
> >   dynamic_fault("fault_class_name")
> >
> > and check whether it returns true - if so, inject the error.
> > For example
> >
> >   if (dynamic_fault("init"))
> >   return -EINVAL;
>
> Hi Suren,
>
> If this is going to be used by mainline kernel, it would be good to
> integrate this with fail_nth systematic fault injection:
> https://elixir.bootlin.com/linux/latest/source/lib/fault-inject.c#L109
>
> Otherwise these dynamic sites won't be tested by testing systems doing
> systematic fault injection testing.

Hi Dmitry,
Thanks for the information! Will look into it and try to integrate.
Suren.
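One possible shape for that integration - purely an illustrative sketch, not
code from this series, and assuming CONFIG_FAULT_INJECTION - would be to route
every dynamic_fault site through one shared fault_attr, since should_fail() is
where current->fail_nth gets consumed:

        #include <linux/fault-inject.h>

        static DECLARE_FAULT_ATTR(dfault_fallback_attr);

        bool __dynamic_fault_enabled(struct dfault *df)
        {
                /* Let fail_nth-driven systematic testing hit this site. */
                if (should_fail(&dfault_fallback_attr, 1))
                        return true;

                /* ... existing per-site enabled/oneshot/frequency logic ... */
                return false;
        }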

>
>
> > There's no need to define faults elsewhere, as with
> > include/linux/fault-injection.h. Faults show up in debugfs, under
> > /sys/kernel/debug/dynamic_faults, and can be selected based on
> > file/module/function/line number/class, and enabled permanently, or in
> > oneshot mode, or with a specified frequency.
> >
> > Signed-off-by: Kent Overstreet 
> > ---
> >  include/asm-generic/codetag.lds.h |   3 +-
> >  include/linux/dynamic_fault.h |  79 +++
> >  include/linux/slab.h  |   3 +-
> >  lib/Kconfig.debug |   6 +
> >  lib/Makefile  |   2 +
> >  lib/dynamic_fault.c   | 372 ++
> >  6 files changed, 463 insertions(+), 2 deletions(-)
> >  create mode 100644 include/linux/dynamic_fault.h
> >  create mode 100644 lib/dynamic_fault.c
> >
> > diff --git a/include/asm-generic/codetag.lds.h 
> > b/include/asm-generic/codetag.lds.h
> > index 64f536b80380..16fbf74edc3d 100644
> > --- a/include/asm-generic/codetag.lds.h
> > +++ b/include/asm-generic/codetag.lds.h
> > @@ -9,6 +9,7 @@
> > __stop_##_name = .;
> >
> >  #define CODETAG_SECTIONS() \
> > -   SECTION_WITH_BOUNDARIES(alloc_tags)
> > +   SECTION_WITH_BOUNDARIES(alloc_tags) \
> > +   SECTION_WITH_BOUNDARIES(dynamic_fault_tags)
> >
> >  #endif /* __ASM_GENERIC_CODETAG_LDS_H */
> > diff --git a/include/linux/dynamic_fault.h b/include/linux/dynamic_fault.h
> > new file mode 100644
> > index ..526a33209e94
> > --- /dev/null
> > +++ b/include/linux/dynamic_fault.h
> > @@ -0,0 +1,79 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +
> > +#ifndef _LINUX_DYNAMIC_FAULT_H
> > +#define _LINUX_DYNAMIC_FAULT_H
> > +
> > +/*
> > + * Dynamic/code tagging fault injection:
> > + *
> > + * Originally based on the dynamic debug trick of putting types in a 
> > special elf
> > + * section, then rewritten using code tagging:
> > + *
> > + * To use, simply insert a call to dynamic_fault("fault_class"), which will
> > + * return true if an error should be injected.
> > + *
> > + * Fault injection sites may be listed and enabled via debugfs, under
> > + * /sys/kernel/debug/dynamic_faults.
> > + */
> > +
> > +#ifdef CONFIG_CODETAG_FAULT_INJECTION
> > +
> > +#include 
> > +#include 
> > +
> > +#define DFAULT_STATES()\
> > +   x(disabled) \
> > +   x(enabled)  \
> > +   x(oneshot)
> > +
> > +enum dfault_enabled {
> > +#define x(n)   DFAULT_##n,
> > +   DFAULT_STATES()
> > +#undef x
> > +};
> > +
> > +union dfault_state {
> > +   struct {
> > +   unsigned intenabled:2;
> > +   unsigned intcount:30;
> > +   };
> > +
> > +   struct {
> > +   unsigned intv;
> > +   };
> > +};
> > +
> > +struct dfault {
> > +   struct codetag  tag;
> > +   const char  *class;
> > +   unsigned intfrequency;
> > +   union dfault_state  state;
> > +   struct static_key_false enabled;
> > +};
> > +
> > +bool __dynamic_fault_enabled(struct dfault *df);
> > +
> > +#define dynamic_fault(_class)  \
> > +({ \
> > +   static struct dfault\
> > +   

Re: [RFC PATCH 10/30] mm: enable page allocation tagging for __get_free_pages and alloc_pages

2022-08-31 Thread Suren Baghdasaryan
On Wed, Aug 31, 2022 at 8:45 AM Suren Baghdasaryan  wrote:
>
> On Wed, Aug 31, 2022 at 3:11 AM Mel Gorman  wrote:
> >
> > On Tue, Aug 30, 2022 at 02:48:59PM -0700, Suren Baghdasaryan wrote:
> > > Redefine alloc_pages, __get_free_pages to record allocations done by
> > > these functions. Instrument deallocation hooks to record object freeing.
> > >
> > > Signed-off-by: Suren Baghdasaryan 
> > > +#ifdef CONFIG_PAGE_ALLOC_TAGGING
> > > +
> > >  #include 
> > >  #include 
> > >
> > > @@ -25,4 +27,37 @@ static inline void pgalloc_tag_dec(struct page *page, 
> > > unsigned int order)
> > >   alloc_tag_sub(get_page_tag_ref(page), PAGE_SIZE << order);
> > >  }
> > >
> > > +/*
> > > + * Redefinitions of the common page allocators/destructors
> > > + */
> > > +#define pgtag_alloc_pages(gfp, order)
> > > \
> > > +({   \
> > > + struct page *_page = _alloc_pages((gfp), (order));  \
> > > + \
> > > + if (_page)  \
> > > + alloc_tag_add(get_page_tag_ref(_page), PAGE_SIZE << 
> > > (order));\
> > > + _page;  \
> > > +})
> > > +
> >
> > Instead of renaming alloc_pages, why is the tagging not done in
> > __alloc_pages()? At least __alloc_pages_bulk() is also missed. The branch
> > can be guarded with IS_ENABLED.
>
> Hmm. Assuming all the other allocators using __alloc_pages are inlined, that
> should work. I'll try that and, if it works, will incorporate it in the next
> respin. Thanks!
>
> I don't think IS_ENABLED is required because the tagging functions are already
> defined as empty when the appropriate configs are not enabled. Unless I
> misunderstood your note.
>
> >
> > > +#define pgtag_get_free_pages(gfp_mask, order)
> > > \
> > > +({   \
> > > + struct page *_page; \
> > > + unsigned long _res = _get_free_pages((gfp_mask), (order), &_page);\
> > > + \
> > > + if (_res)   \
> > > + alloc_tag_add(get_page_tag_ref(_page), PAGE_SIZE << 
> > > (order));\
> > > + _res;   \
> > > +})
> > > +
> >
> > Similar, the tagging could happen in a core function instead of a wrapper.

Ack.

> >
> > > +#else /* CONFIG_PAGE_ALLOC_TAGGING */
> > > +
> > > +#define pgtag_alloc_pages(gfp, order) _alloc_pages(gfp, order)
> > > +
> > > +#define pgtag_get_free_pages(gfp_mask, order) \
> > > + _get_free_pages((gfp_mask), (order), NULL)
> > > +
> > > +#define pgalloc_tag_dec(__page, __size)  do {} while (0)
> > > +
> > > +#endif /* CONFIG_PAGE_ALLOC_TAGGING */
> > > +
> > >  #endif /* _LINUX_PGALLOC_TAG_H */
> > > diff --git a/mm/mempolicy.c b/mm/mempolicy.c
> > > index b73d3248d976..f7e6d9564a49 100644
> > > --- a/mm/mempolicy.c
> > > +++ b/mm/mempolicy.c
> > > @@ -2249,7 +2249,7 @@ EXPORT_SYMBOL(vma_alloc_folio);
> > >   * flags are used.
> > >   * Return: The page on success or NULL if allocation fails.
> > >   */
> > > -struct page *alloc_pages(gfp_t gfp, unsigned order)
> > > +struct page *_alloc_pages(gfp_t gfp, unsigned int order)
> > >  {
> > >   struct mempolicy *pol = &default_policy;
> > >   struct page *page;
> > > @@ -2273,7 +2273,7 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
> > >
> > >   return page;
> > >  }
> > > -EXPORT_SYMBOL(alloc_pages);
> > > +EXPORT_SYMBOL(_alloc_pages);
> > >
> > >  struct folio *folio_alloc(gfp_t gfp, unsigned order)
> > >  {
> > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > > index e5486d47406e..165daba19e2a 100644
> > > --- a/mm/page_alloc.c
> > > +++ b/mm/page_alloc.c
> > > @@ -763,6 +763,7 @@ static inline bool pcp_allowed_order(unsigned int 
> > > order)
> > >
> > >  static inline void free_the_page(struct page *page, unsigned int order)
> > >  {
> > > +
> > >   if (pcp_allowed_order(order))   /* Via pcp? */
> > >   free_unref_page(page, order);
> > >   else
> >
> > Spurious whitespace change.

Ack.

> >
> > --
> > Mel Gorman
> > SUSE Labs



Re: [RFC PATCH 27/30] Code tagging based latency tracking

2022-08-31 Thread Suren Baghdasaryan
On Tue, Aug 30, 2022 at 6:53 PM Randy Dunlap  wrote:
>
>
>
> On 8/30/22 14:49, Suren Baghdasaryan wrote:
> > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
> > index b7d03afbc808..b0f86643b8f0 100644
> > --- a/lib/Kconfig.debug
> > +++ b/lib/Kconfig.debug
> > @@ -1728,6 +1728,14 @@ config LATENCYTOP
> > Enable this option if you want to use the LatencyTOP tool
> > to find out which userspace is blocking on what kernel operations.
> >
> > +config CODETAG_TIME_STATS
> > + bool "Code tagging based latency measuring"
> > + depends on DEBUG_FS
> > + select TIME_STATS
> > + select CODE_TAGGING
> > + help
> > +   Enabling this option makes latency statistics available in debugfs
>
> Missing period at the end of the sentence.

Ack.

>
> --
> ~Randy



Re: [RFC PATCH 22/30] Code tagging based fault injection

2022-08-31 Thread Suren Baghdasaryan
On Tue, Aug 30, 2022 at 6:52 PM Randy Dunlap  wrote:
>
>
>
> On 8/30/22 14:49, Suren Baghdasaryan wrote:
> > From: Kent Overstreet 
> >
> > This adds a new fault injection capability, based on code tagging.
> >
> > To use, simply insert somewhere in your code
> >
> >   dynamic_fault("fault_class_name")
> >
> > and check whether it returns true - if so, inject the error.
> > For example
> >
> >   if (dynamic_fault("init"))
> >   return -EINVAL;
> >
> > There's no need to define faults elsewhere, as with
> > include/linux/fault-injection.h. Faults show up in debugfs, under
> > /sys/kernel/debug/dynamic_faults, and can be selected based on
> > file/module/function/line number/class, and enabled permanently, or in
> > oneshot mode, or with a specified frequency.
> >
> > Signed-off-by: Kent Overstreet 
>
> Missing Signed-off-by: from Suren.
> See Documentation/process/submitting-patches.rst:
>
> When to use Acked-by:, Cc:, and Co-developed-by:
> 
>
> The Signed-off-by: tag indicates that the signer was involved in the
> development of the patch, or that he/she was in the patch's delivery path.

Thanks for the note! Will fix in the next respin.

>
>
> --
> ~Randy



Re: [RFC PATCH 00/30] Code tagging framework and applications

2022-08-31 Thread Suren Baghdasaryan
On Wed, Aug 31, 2022 at 8:28 AM Suren Baghdasaryan  wrote:
>
> On Wed, Aug 31, 2022 at 3:47 AM Michal Hocko  wrote:
> >
> > On Wed 31-08-22 11:19:48, Mel Gorman wrote:
> > > On Wed, Aug 31, 2022 at 04:42:30AM -0400, Kent Overstreet wrote:
> > > > On Wed, Aug 31, 2022 at 09:38:27AM +0200, Peter Zijlstra wrote:
> > > > > On Tue, Aug 30, 2022 at 02:48:49PM -0700, Suren Baghdasaryan wrote:
> > > > > > ===
> > > > > > Code tagging framework
> > > > > > ===
> > > > > > Code tag is a structure identifying a specific location in the 
> > > > > > source code
> > > > > > which is generated at compile time and can be embedded in an 
> > > > > > application-
> > > > > > specific structure. Several applications of code tagging are 
> > > > > > included in
> > > > > > this RFC, such as memory allocation tracking, dynamic fault 
> > > > > > injection,
> > > > > > latency tracking and improved error code reporting.
> > > > > > Basically, it takes the old trick of "define a special elf section 
> > > > > > for
> > > > > > objects of a given type so that we can iterate over them at 
> > > > > > runtime" and
> > > > > > creates a proper library for it.
> > > > >
> > > > > I might be super dense this morning, but what!? I've skimmed through 
> > > > > the
> > > > > set and I don't think I get it.
> > > > >
> > > > > What does this provide that ftrace/kprobes don't already allow?
> > > >
> > > > You're kidding, right?
> > >
> > > It's a valid question. From the description, its main addition that would
> > > be hard to do with ftrace or probes is catching where an error code is
> > > returned. A secondary addition would be catching all historical state and
> > > not just state since the tracing started.
> > >
> > > It's also unclear *who* would enable this. It looks like it would mostly
> > > have value during the development stage of an embedded platform to track
> > > kernel memory usage on a per-application basis in an environment where it
> > > may be difficult to set up tracing and tracking. Would it ever be enabled
> > > in production? Would a distribution ever enable this? If it's enabled, any
> > > overhead cannot be disabled/enabled at run or boot time, so anyone enabling
> > > this would carry the cost without necessarily ever consuming the data.
>
> Thank you for the question.
> For memory tracking my intent is to have a mechanism that can be enabled in
> field testing (pre-production testing on a large population of internal
> users).
> The issue we often face is that a memory leak happens in the field but is
> very hard to reproduce locally. We get a bugreport from the user which
> indicates it, but the report often does not contain enough information to
> track the leak down. Note that quite often these leaks/issues happen in
> drivers, so even simply finding out where they came from is a big help.
> The way I envision this mechanism being used is to enable the basic memory
> tracking in the field tests and have a userspace process collect the
> allocation statistics periodically (say once an hour). Once it detects some
> counter growing unboundedly or atypically (the definition of this is left to
> userspace), it can enable context capturing only for that specific location,
> still keeping the overhead to a minimum while getting more information about
> potential issues. Collected stats and contexts are then attached to the
> bugreport, and we get more visibility into the issue when we receive it.
> The goal is to provide a mechanism with low enough overhead that it can stay
> enabled for the entire duration of these field tests without affecting the
> device's performance profile.
> Tracing is very cheap when it's disabled, but having it enabled all the time
> would introduce higher overhead than these counter manipulations.
> My apologies, I should have clarified all this in the cover letter from the
> beginning.
>
> As for other applications, maybe I'm not such an advanced user of tracing,
> but I think only the latency tracking application could be done with tracing,
> assuming we have all the right tracepoints; I don't see how we would use
> tracing for fault injection and descriptive error codes. Again, I might be
> mistaken.

Sorry about the formatting of my reply. Forgot to reconfigure the editor on
the new machine.

>
> Thanks,
> Suren.
>
> > >
> > > It might be an ease-of-use thing. Gathering the information from traces
> > > is tricky and would need combining multiple different elements and that
> > > is development effort but not impossible.
> > >
> > > Whatever the case, asking for an explanation as to why equivalent
> > > functionality cannot be created from ftrace/kprobe/eBPF/whatever is
> > > reasonable.
> >
> > Fully agreed and this is especially true for a change this size
> > 77 files changed, 3406 insertions(+), 703 deletions(-)
> >
> > --
> > Michal Hocko
> > SUSE Labs



Re: [RFC PATCH 00/30] Code tagging framework and applications

2022-08-31 Thread Suren Baghdasaryan
On Wed, Aug 31, 2022 at 1:56 PM Yosry Ahmed  wrote:
>
> On Wed, Aug 31, 2022 at 12:02 PM Kent Overstreet
>  wrote:
> >
> > On Wed, Aug 31, 2022 at 12:47:32PM +0200, Michal Hocko wrote:
> > > On Wed 31-08-22 11:19:48, Mel Gorman wrote:
> > > > Whatever the case, asking for an explanation as to why equivalent
> > > > functionality cannot be created from ftrace/kprobe/eBPF/whatever is
> > > > reasonable.
> > >
> > > Fully agreed and this is especially true for a change this size
> > > 77 files changed, 3406 insertions(+), 703 deletions(-)
> >
> > In the case of memory allocation accounting, you flat cannot do this with
> > ftrace - you could maybe do a janky version that isn't fully accurate, much
> > slower, more complicated for the developer to understand and debug, and
> > more complicated for the end user.
> >
> > But please, I invite anyone who's actually been doing this with ftrace to
> > demonstrate otherwise.
> >
> > Ftrace just isn't the right tool for the job here - we're talking about
> > adding per-callsite accounting to some of the fastest fast paths in the
> > kernel.
> >
> > And the size of the changes for memory allocation accounting are much more
> > reasonable:
> >  33 files changed, 623 insertions(+), 99 deletions(-)
> >
> > The code tagging library should exist anyway; it's been open coded half a
> > dozen times in the kernel already.
> >
> > And once we've got that, the time stats code is _also_ far simpler than
> > doing it with ftrace would be. If anyone here has successfully debugged
> > latency issues with ftrace, I'd really like to hear it. Again, for
> > debugging latency issues you want something that can always be on, and
> > that's not cheap with ftrace - and never mind the hassle of correlating
> > start and end wait trace events, building up histograms, etc. - that's all
> > handled here.
> >
> > Cheap, simple, easy to use. What more could you want?
> >
>
> This is very interesting work! Do you have any data about the overhead
> this introduces, especially in a production environment? I am
> especially interested in memory allocations tracking and detecting
> leaks.

I had the numbers for my previous implementation, before we started using the
lazy percpu counters but that would not apply to the new implementation. I'll
rerun the measurements and will post the exact numbers in a day or so.

> (Sorry if you already posted this kind of data somewhere that I missed)



Re: [RFC PATCH 10/30] mm: enable page allocation tagging for __get_free_pages and alloc_pages

2022-08-31 Thread Suren Baghdasaryan
On Wed, Aug 31, 2022 at 10:46 AM Kent Overstreet
 wrote:
>
> On Wed, Aug 31, 2022 at 11:11:03AM +0100, Mel Gorman wrote:
> > On Tue, Aug 30, 2022 at 02:48:59PM -0700, Suren Baghdasaryan wrote:
> > > Redefine alloc_pages, __get_free_pages to record allocations done by
> > > these functions. Instrument deallocation hooks to record object freeing.
> > >
> > > Signed-off-by: Suren Baghdasaryan 
> > > +#ifdef CONFIG_PAGE_ALLOC_TAGGING
> > > +
> > >  #include 
> > >  #include 
> > >
> > > @@ -25,4 +27,37 @@ static inline void pgalloc_tag_dec(struct page *page, 
> > > unsigned int order)
> > > alloc_tag_sub(get_page_tag_ref(page), PAGE_SIZE << order);
> > >  }
> > >
> > > +/*
> > > + * Redefinitions of the common page allocators/destructors
> > > + */
> > > +#define pgtag_alloc_pages(gfp, order)
> > >   \
> > > +({ \
> > > +   struct page *_page = _alloc_pages((gfp), (order));  \
> > > +   \
> > > +   if (_page)  \
> > > +   alloc_tag_add(get_page_tag_ref(_page), PAGE_SIZE << (order));\
> > > +   _page;  \
> > > +})
> > > +
> >
> > Instead of renaming alloc_pages, why is the tagging not done in
> > __alloc_pages()? At least __alloc_pages_bulk() is also missed. The branch
> > can be guarded with IS_ENABLED.
>
> It can't be in a function, it has to be in a wrapper macro.

Ah, right. __FILE__, __LINE__ and the others we use to record the call
location would point to include/linux/gfp.h instead of the location the
allocation is performed at.
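To illustrate the point (record_location() is a made-up helper, not anything
from this series): the macro form expands at each call site, so it captures
the caller's file and line, whereas a function always reports its own:

        #define count_this_call()       record_location(__FILE__, __LINE__)

        static inline void count_this_call_fn(void)
        {
                /* Expands here, so every caller would be reported as
                 * this one file and line. */
                record_location(__FILE__, __LINE__);
        }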

>
> alloc_tag_add() is a macro that defines a static struct in a special elf
> section. That struct holds the allocation counters, and putting it in a
> special elf section is how the code that lists it in debugfs finds it.
>
> Look at the dynamic debug code for prior precedent for this trick in the
> kernel - that's how it makes pr_debug() calls dynamically controllable at
> runtime, from debugfs. We're taking that method and turning it into a proper
> library.
>
> Because all the counters are statically allocated, without even a pointer
> deref to get to them in the allocation path (one pointer deref to get to
> them in the deallocation path), that makes this _much, much_ cheaper than
> anything that could be done with tracing - cheap enough that I expect many
> users will want to enable it in production.
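For readers unfamiliar with the section trick described above, a generic
sketch of the pattern follows; the names are illustrative, not the ones this
series uses. For section names that are valid C identifiers, the linker emits
the __start_/__stop_ bounds automatically:

        struct demo_tag {
                const char *file;
                unsigned int line;
        };

        /* Each use drops one statically allocated tag into the section. */
        #define DEMO_TAG()                                              \
        do {                                                            \
                static struct demo_tag _tag                             \
                __attribute__((section("demo_tags"), used)) =           \
                        { __FILE__, __LINE__ };                         \
        } while (0)

        extern struct demo_tag __start_demo_tags[];
        extern struct demo_tag __stop_demo_tags[];

        static void demo_list_tags(void)
        {
                struct demo_tag *t;

                /* Iterate every tag registered anywhere in the image. */
                for (t = __start_demo_tags; t != __stop_demo_tags; t++)
                        printk("%s:%u\n", t->file, t->line);
        }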
>



Re: [RFC PATCH 00/30] Code tagging framework and applications

2022-08-31 Thread Suren Baghdasaryan
On Wed, Aug 31, 2022 at 9:52 PM Oscar Salvador  wrote:
>
> On Tue, Aug 30, 2022 at 02:48:49PM -0700, Suren Baghdasaryan wrote:
> > ===
> > Code tagging framework
> > ===
> > Code tag is a structure identifying a specific location in the source code
> > which is generated at compile time and can be embedded in an application-
> > specific structure. Several applications of code tagging are included in
> > this RFC, such as memory allocation tracking, dynamic fault injection,
> > latency tracking and improved error code reporting.
> > Basically, it takes the old trick of "define a special elf section for
> > objects of a given type so that we can iterate over them at runtime" and
> > creates a proper library for it.
> >
> > ===
> > Memory allocation tracking
> > ===
> > The goal for using codetags for memory allocation tracking is to minimize
> > performance and memory overhead. By recording only the call count and
> > allocation size, the required operations are kept at the minimum while
> > collecting statistics for every allocation in the codebase. With that
> > information, if users are interested in more detailed context for a
> > specific allocation, they can enable more in-depth context tracking,
> > which includes capturing the pid, tgid, task name, allocation size,
> > timestamp and call stack for every allocation at the specified code
> > location.
> > Memory allocation tracking is implemented in two parts:
> >
> > part1: instruments page and slab allocators to record call count and total
> > memory allocated at every allocation in the source code. Every time an
> > allocation is performed by an instrumented allocator, the codetag at that
> > location increments its call and size counters. Every time the memory is
> > freed these counters are decremented. To decrement the counters upon free,
> > allocated object needs a reference to its codetag. Page allocators use
> > page_ext to record this reference while slab allocators use memcg_data of
> > the slab page.
> > The data is exposed to the user space via a read-only debugfs file called
> > alloc_tags.
>
> Hi Suren,
>
> I just posted a patch [1] and, reading through your changelog and seeing your
> PoC, I think we have some kind of overlap.
> My patchset aims to give you the stacktrace <-> relationship information, and
> it is achieved by a small amount of extra code, mostly in page_owner.c and
> lib/stackdepot.
>
> Of course, your work seems to be more complete wrt. the information you get.
>
> I CCed you in case you want to have a look.
>
> [1] https://lkml.org/lkml/2022/9/1/36

Hi Oscar,
Thanks for the note. I'll take a look most likely on Friday and will
follow up with you.
Thanks,
Suren.

>
> Thanks
>
>
> --
> Oscar Salvador
> SUSE Labs



Re: [RFC PATCH 00/30] Code tagging framework and applications

2022-09-01 Thread Suren Baghdasaryan
On Thu, Sep 1, 2022 at 12:18 AM Michal Hocko  wrote:
>
> On Wed 31-08-22 15:01:54, Kent Overstreet wrote:
> > On Wed, Aug 31, 2022 at 12:47:32PM +0200, Michal Hocko wrote:
> > > On Wed 31-08-22 11:19:48, Mel Gorman wrote:
> > > > Whatever the case, asking for an explanation as to why equivalent
> > > > functionality cannot be created from ftrace/kprobe/eBPF/whatever is
> > > > reasonable.
> > >
> > > Fully agreed and this is especially true for a change this size
> > > 77 files changed, 3406 insertions(+), 703 deletions(-)
> >
> > In the case of memory allocation accounting, you flat cannot do this with
> > ftrace - you could maybe do a janky version that isn't fully accurate, much
> > slower, more complicated for the developer to understand and debug, and
> > more complicated for the end user.
> >
> > But please, I invite anyone who's actually been doing this with ftrace to
> > demonstrate otherwise.
> >
> > Ftrace just isn't the right tool for the job here - we're talking about
> > adding per-callsite accounting to some of the fastest fast paths in the
> > kernel.
> >
> > And the size of the changes for memory allocation accounting are much more
> > reasonable:
> >  33 files changed, 623 insertions(+), 99 deletions(-)
> >
> > The code tagging library should exist anyway; it's been open coded half a
> > dozen times in the kernel already.
> >
> > And once we've got that, the time stats code is _also_ far simpler than
> > doing it with ftrace would be. If anyone here has successfully debugged
> > latency issues with ftrace, I'd really like to hear it. Again, for
> > debugging latency issues you want something that can always be on, and
> > that's not cheap with ftrace - and never mind the hassle of correlating
> > start and end wait trace events, building up histograms, etc. - that's all
> > handled here.
> >
> > Cheap, simple, easy to use. What more could you want?
>
> A big ad on a banner. But more seriously.
>
> This patchset is _huge_ and touching a lot of different areas. It will
> be not only hard to review but even harder to maintain longterm. So
> it is completely reasonable to ask for potential alternatives with a
> smaller code footprint. I am pretty sure you are aware of that workflow.

The patchset is huge because it introduces a reusable part (the first
6 patches introducing code tagging) and 6 different applications in
very different areas of the kernel. We wanted to present all of them
in the RFC to show the variety of cases this mechanism can be reused
for. If the code tagging is accepted, each application can be posted
separately to the appropriate group of people. Hopefully that makes it
easier to review. Those first 6 patches are not that big and are quite
isolated IMHO:

 include/linux/codetag.h |  83 ++
 include/linux/lazy-percpu-counter.h |  67 
 include/linux/module.h  |   1 +
 kernel/module/internal.h|   1 -
 kernel/module/main.c|   4 +
 lib/Kconfig |   3 +
 lib/Kconfig.debug   |   4 +
 lib/Makefile|   3 +
 lib/codetag.c   | 248 
 lib/lazy-percpu-counter.c   | 141 
 lib/string_helpers.c|   3 +-
 scripts/kallsyms.c  |  13 ++

>
> So I find Peter's question completely appropriate while your response to
> that not so much! Maybe ftrace is not the right tool for the intended
> job. Maybe there are other ways and it would be really great to show
> that those have been evaluated and they are not suitable for a), b) and
> c) reasons.

That's fair.
For memory tracking I looked into using kmemleak and page_owner, which
can't match the required functionality at an overhead acceptable for
production and pre-production testing environments. Traces + BPF I
haven't evaluated myself, but I heard from other members of my team who
tried using them in a production environment with poor results. I'll try
to get more specific information on that.

>
> E.g. Oscar has been working on extending page_ext to track number of
> allocations for specific calltrace[1]. Is this 1:1 replacement? No! But
> it can help in environments where page_ext can be enabled and it is
> completely non-intrusive to the MM code.

Thanks for pointing out this work. I'll need to review and maybe
profile it before making any claims.

>
> If the page_ext overhead is not desirable/acceptable then I am sure
> there are other options. E.g. kprobes/LivePatching framework can hook
> into functions and alter their behavior. So why not use that for data
> collection? Has this been evaluated at all?

I'm not sure how I could hook into, say, alloc_pages() to find out where
it was called from without capturing the call stack (which would
introduce overhead at every allocation). I would love to discuss this
or other alternatives if they can be done with low enough overhead.
Thanks,
Suren.

>
> And please

Re: [RFC PATCH 00/30] Code tagging framework and applications

2022-09-01 Thread Suren Baghdasaryan
On Thu, Sep 1, 2022 at 8:07 AM David Hildenbrand  wrote:
>
> On 01.09.22 16:23, Kent Overstreet wrote:
> > On Thu, Sep 01, 2022 at 10:05:03AM +0200, David Hildenbrand wrote:
> >> On 31.08.22 21:01, Kent Overstreet wrote:
> >>> On Wed, Aug 31, 2022 at 12:47:32PM +0200, Michal Hocko wrote:
>  On Wed 31-08-22 11:19:48, Mel Gorman wrote:
> > Whatever the case, asking for an explanation as to why equivalent
> > functionality cannot be created from ftrace/kprobe/eBPF/whatever is
> > reasonable.
> 
>  Fully agreed and this is especially true for a change this size
>  77 files changed, 3406 insertions(+), 703 deletions(-)
> >>>
> >>> In the case of memory allocation accounting, you flat cannot do this with
> >>> ftrace - you could maybe do a janky version that isn't fully accurate,
> >>> much slower, more complicated for the developer to understand and debug,
> >>> and more complicated for the end user.
> >>>
> >>> But please, I invite anyone who's actually been doing this with ftrace to
> >>> demonstrate otherwise.
> >>>
> >>> Ftrace just isn't the right tool for the job here - we're talking about
> >>> adding per-callsite accounting to some of the fastest fast paths in the
> >>> kernel.
> >>>
> >>> And the size of the changes for memory allocation accounting are much more
> >>> reasonable:
> >>>  33 files changed, 623 insertions(+), 99 deletions(-)
> >>>
> >>> The code tagging library should exist anyway; it's been open coded half a
> >>> dozen times in the kernel already.
> >>
> >> Hi Kent,
> >>
> >> independent of the other discussions, if it's open coded already, does
> >> it make sense to factor that already-open-coded part out independently
> >> of the remainder of the full series here?
> >
> > It's discussed in the cover letter, that is exactly how the patch series is
> > structured.
>
> Skimming over the patches (that I was CCed on) and skimming over the
> cover letter, I got the impression that everything after patch 7 is
> introducing something new instead of refactoring something out.

Hi David,
Yes, you are right, the RFC does incorporate lots of parts which can
be considered separately. They are sent together to present the
overall scope of the proposal but I do intend to send them separately
once we decide if it's worth working on.
Thanks,
Suren.

>
> >
> >> [I didn't immediately spot if this series also attempts already to
> >> replace that open-coded part]
> >
> > Uh huh.
> >
> > Honestly, some days it feels like lkml is just as bad as slashdot, with 
> > people
> > wanting to get in their two cents without actually reading...
>
> ... and of course you had to reply like that. I should just have learned
> from my last upstream experience with you and kept you on my spam list.
>
> Thanks, bye
>
> --
> Thanks,
>
> David / dhildenb
>



Re: [RFC PATCH 00/30] Code tagging framework and applications

2022-09-01 Thread Suren Baghdasaryan
On Thu, Sep 1, 2022 at 12:15 PM Michal Hocko  wrote:
>
> On Thu 01-09-22 08:33:19, Suren Baghdasaryan wrote:
> > On Thu, Sep 1, 2022 at 12:18 AM Michal Hocko  wrote:
> [...]
> > > So I find Peter's question completely appropriate while your response to
> > > that not so much! Maybe ftrace is not the right tool for the intended
> > > job. Maybe there are other ways and it would be really great to show
> > > that those have been evaluated and they are not suitable for a), b) and
> > > c) reasons.
> >
> > That's fair.
> > For memory tracking I looked into using kmemleak and page_owner which
> > can't match the required functionality at an overhead acceptable for
> > production and pre-production testing environments.
>
> Being more specific would be really helpful. Especially when your cover
> letter suggests that you rely on page_owner/memcg metadata as well to
> match allocation and their freeing parts.

kmemleak is known to be slow, and that is even documented [1], so I hope I
can skip that part. For page_owner to provide comparable information we
would have to capture the call stacks for all page allocations, unlike our
proposal, which allows doing that selectively for specific call sites. I'll
post the overhead numbers of call stack capturing once I've finished
profiling the latest code, hopefully sometime tomorrow, or in the worst
case after the long weekend.

>
> > traces + BPF I
> > haven't evaluated myself but heard from other members of my team who
> > tried using that in production environment with poor results. I'll try
> > to get more specific information on that.
>
> That would be helpful as well.

Ack.

>
> > > E.g. Oscar has been working on extending page_ext to track number of
> > > allocations for specific calltrace[1]. Is this 1:1 replacement? No! But
> > > it can help in environments where page_ext can be enabled and it is
> > > completely non-intrusive to the MM code.
> >
> > Thanks for pointing out this work. I'll need to review and maybe
> > profile it before making any claims.
> >
> > >
> > > If the page_ext overhead is not desirable/acceptable then I am sure
> > > there are other options. E.g. kprobes/LivePatching framework can hook
> > > into functions and alter their behavior. So why not use that for data
> > > collection? Has this been evaluated at all?
> >
> > I'm not sure how I can hook into say alloc_pages() to find out where
> > it was called from without capturing the call stack (which would
> > introduce an overhead at every allocation). Would love to discuss this
> > or other alternatives if they can be done with low enough overhead.
>
> Yes, tracking back the call trace would be really needed. The question
> is whether this is really prohibitively expensive. How much overhead are
> we talking about? There is no free lunch here, really.  You either have
> the overhead during runtime when the feature is used or on the source
> code level for all the future development (with a maze of macros and
> wrappers).

Will post the overhead numbers soon.
What I hear loud and clear is that we need a kernel command-line kill
switch that mitigates the overhead of having this feature. That seems
to be the main concern.
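As a purely hypothetical sketch (not code from this series), such a switch
could be as small as a boot parameter gating the accounting:

        #include <linux/init.h>
        #include <linux/kernel.h>

        static bool mem_profiling_on __ro_after_init = true;

        static int __init setup_mem_profiling(char *str)
        {
                /* e.g. mem_profiling=0 on the kernel command line */
                return kstrtobool(str, &mem_profiling_on);
        }
        early_param("mem_profiling", setup_mem_profiling);

Pairing such a flag with a static_branch in the accounting hooks would keep
the disabled-case cost to a single patched jump.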
Thanks,
Suren.

[1] https://docs.kernel.org/dev-tools/kmemleak.html#limitations-and-drawbacks

>
> Thanks!
> --
> Michal Hocko
> SUSE Labs



Re: [RFC PATCH 00/30] Code tagging framework and applications

2022-09-01 Thread Suren Baghdasaryan
On Thu, Sep 1, 2022 at 3:54 PM Roman Gushchin  wrote:
>
> On Thu, Sep 01, 2022 at 06:37:20PM -0400, Kent Overstreet wrote:
> > On Thu, Sep 01, 2022 at 03:27:27PM -0700, Roman Gushchin wrote:
> > > On Wed, Aug 31, 2022 at 01:56:08PM -0700, Yosry Ahmed wrote:
> > > > This is very interesting work! Do you have any data about the overhead
> > > > this introduces, especially in a production environment? I am
> > > > especially interested in memory allocations tracking and detecting
> > > > leaks.
> > >
> > > +1
> > >
> > > I think the question of whether it can indeed always be turned on in
> > > production or not is the main one. If not, the advantage over
> > > ftrace/bpf/... is not that obvious. Otherwise it will indeed be a VERY
> > > useful thing.
> >
> > Low enough overhead to run in production was my primary design goal.
> >
> > Stats are kept in a struct that's defined at the callsite. So this adds
> > _no_ pointer chasing to the allocation path, unless we've switched to
> > percpu counters at that callsite (see the lazy percpu counters patch),
> > where we need to deref one percpu pointer to save an atomic.
> >
> > Then we need to stash a pointer to the alloc_tag, so that kfree() can find
> > it. For slab allocations this uses the same storage area as memcg, so for
> > allocations that are using that we won't be touching any additional
> > cachelines. (I wanted the pointer to the alloc_tag to be stored inline
> > with the allocation, but that would've caused alignment difficulties.)
> >
> > Then there's a pointer deref introduced to the kfree() path, to get back
> > to the original alloc_tag and subtract the allocation from that callsite.
> > That one won't be free, and with percpu counters we've got another
> > dependent load too - hmm, it might be worth benchmarking with just
> > atomics, skipping the percpu counters.
> >
> > So the overhead won't be zero, I expect it'll show up in some synthetic
> > benchmarks, but yes, I do definitely expect this to be worth enabling in
> > production in many scenarios.
>
> I'm somewhat sceptical, but I usually am. And in this case I'll be really
> happy to be wrong.
>
> On the bright side, maybe most of the overhead will come from a few
> allocations, so an option to explicitly exclude them would do the trick.
>
> I'd suggest running something like iperf on fast hardware. And maybe some
> io_uring stuff too. These are the two places which were historically most
> sensitive to the (kernel) memory accounting speed.

Thanks for the suggestions, Roman. I'll see how I can get this done.
I'll have to find someone with access to fast hardware (Android is not
great for that) and backport the patchset to the supported kernel
version. Will do my best.
Thanks,
Suren.

>
> Thanks!
>



Re: [RFC PATCH 11/30] mm: introduce slabobj_ext to support slab object extensions

2022-09-01 Thread Suren Baghdasaryan
On Thu, Sep 1, 2022 at 4:36 PM Roman Gushchin  wrote:
>
> On Tue, Aug 30, 2022 at 02:49:00PM -0700, Suren Baghdasaryan wrote:
> > Currently slab pages can store only vectors of obj_cgroup pointers in
> > page->memcg_data. Introduce slabobj_ext structure to allow more data
> > to be stored for each slab object. Wraps obj_cgroup into slabobj_ext
> > to support current functionality while allowing to extend slabobj_ext
> > in the future.
> >
> > Note: ideally the config dependency should be turned the other way around:
> > MEMCG should depend on SLAB_OBJ_EXT and {page|slab|folio}.memcg_data would
> > be renamed to something like {page|slab|folio}.objext_data. However doing
> > this in RFC would introduce considerable churn unrelated to the overall
> > idea, so avoiding this until v1.
>
> Hi Suren!

Hi Roman,

>
> I'd say CONFIG_MEMCG_KMEM and CONFIG_YOUR_NEW_STUFF should both depend on
> SLAB_OBJ_EXT.
> CONFIG_MEMCG_KMEM depends on CONFIG_MEMCG anyway.

Yes, I agree. I wanted to mention here that the current dependency is
incorrect and should be reworked. Having both depend on SLAB_OBJ_EXT
seems like the right approach.

>
> >
> > Signed-off-by: Suren Baghdasaryan 
> > ---
> >  include/linux/memcontrol.h |  18 --
> >  init/Kconfig   |   5 ++
> >  mm/kfence/core.c   |   2 +-
> >  mm/memcontrol.c|  60 ++-
> >  mm/page_owner.c|   2 +-
> >  mm/slab.h  | 119 +
> >  6 files changed, 131 insertions(+), 75 deletions(-)
> >
> > diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> > index 6257867fbf95..315399f77173 100644
> > --- a/include/linux/memcontrol.h
> > +++ b/include/linux/memcontrol.h
> > @@ -227,6 +227,14 @@ struct obj_cgroup {
> >   };
> >  };
> >
> > +/*
> > + * Extended information for slab objects stored as an array in 
> > page->memcg_data
> > + * if MEMCG_DATA_OBJEXTS is set.
> > + */
> > +struct slabobj_ext {
> > + struct obj_cgroup *objcg;
> > +} __aligned(8);
>
> Why do we need this aligment requirement?

It was meant to save space by avoiding padding. However, all members today
will be pointers, so it's meaningless and we can safely drop it.
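Concretely (just a note on C layout, nothing specific to this series):

        /* On a 64-bit machine a lone pointer member already gives the
         * struct 8-byte size and alignment, so the attribute changes
         * nothing here: */
        struct slabobj_ext {
                struct obj_cgroup *objcg;
        };      /* sizeof == 8, alignof == 8, with or without __aligned(8) */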

>
> > +
> >  /*
> >   * The memory controller data structure. The memory controller controls 
> > both
> >   * page cache and RSS per cgroup. We would eventually like to provide
> > @@ -363,7 +371,7 @@ extern struct mem_cgroup *root_mem_cgroup;
> >
> >  enum page_memcg_data_flags {
> >   /* page->memcg_data is a pointer to an objcgs vector */
> > - MEMCG_DATA_OBJCGS = (1UL << 0),
> > + MEMCG_DATA_OBJEXTS = (1UL << 0),
> >   /* page has been accounted as a non-slab kernel page */
> >   MEMCG_DATA_KMEM = (1UL << 1),
> >   /* the next bit after the last actual flag */
> > @@ -401,7 +409,7 @@ static inline struct mem_cgroup *__folio_memcg(struct 
> > folio *folio)
> >   unsigned long memcg_data = folio->memcg_data;
> >
> >   VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
> > - VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio);
> > + VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
> >   VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio);
> >
> >   return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
> > @@ -422,7 +430,7 @@ static inline struct obj_cgroup *__folio_objcg(struct 
> > folio *folio)
> >   unsigned long memcg_data = folio->memcg_data;
> >
> >   VM_BUG_ON_FOLIO(folio_test_slab(folio), folio);
> > - VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio);
> > + VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio);
> >   VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio);
> >
> >   return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
> > @@ -517,7 +525,7 @@ static inline struct mem_cgroup 
> > *page_memcg_check(struct page *page)
> >*/
> >   unsigned long memcg_data = READ_ONCE(page->memcg_data);
> >
> > - if (memcg_data & MEMCG_DATA_OBJCGS)
> > + if (memcg_data & MEMCG_DATA_OBJEXTS)
> >   return NULL;
> >
> >   if (memcg_data & MEMCG_DATA_KMEM) {
> > @@ -556,7 +564,7 @@ static inline struct mem_cgroup 
> > *get_mem_cgroup_from_objcg(struct obj_cgroup *ob
> >  static inline bool folio_memcg_kmem(struct folio *folio)
> >  {
> >   

Re: [RFC PATCH 14/30] mm: prevent slabobj_ext allocations for slabobj_ext and kmem_cache objects

2022-09-01 Thread Suren Baghdasaryan
On Thu, Sep 1, 2022 at 4:41 PM Roman Gushchin  wrote:
>
> On Tue, Aug 30, 2022 at 02:49:03PM -0700, Suren Baghdasaryan wrote:
> > Use __GFP_NO_OBJ_EXT to prevent recursions when allocating slabobj_ext
> > objects. Also prevent slabobj_ext allocations for kmem_cache objects.
> >
> > Signed-off-by: Suren Baghdasaryan 
>
> Patches 12-14 look good to me.
> It's probably too early to ack anything, but otherwise I'd ack them.

Thank you for reviewing!

>
> Thanks!



Re: [RFC PATCH 00/30] Code tagging framework and applications

2022-09-04 Thread Suren Baghdasaryan
On Thu, Sep 1, 2022 at 12:15 PM Michal Hocko  wrote:
>
> On Thu 01-09-22 08:33:19, Suren Baghdasaryan wrote:
> > On Thu, Sep 1, 2022 at 12:18 AM Michal Hocko  wrote:
> [...]
> > > So I find Peter's question completely appropriate while your response to
> > > that not so much! Maybe ftrace is not the right tool for the intented
> > > job. Maybe there are other ways and it would be really great to show
> > > that those have been evaluated and they are not suitable for a), b) and
> > > c) reasons.
> >
> > That's fair.
> > For memory tracking I looked into using kmemleak and page_owner which
> > can't match the required functionality at an overhead acceptable for
> > production and pre-production testing environments.
>
> Being more specific would be really helpful. Especially when your cover
> letter suggests that you rely on page_owner/memcg metadata as well to
> match allocation and their freeing parts.
>
> > traces + BPF I
> > haven't evaluated myself but heard from other members of my team who
> > tried using that in production environment with poor results. I'll try
> > to get more specific information on that.
>
> That would be helpful as well.
>
> > > E.g. Oscar has been working on extending page_ext to track number of
> > > allocations for specific calltrace[1]. Is this 1:1 replacement? No! But
> > > it can help in environments where page_ext can be enabled and it is
> > > completely non-intrusive to the MM code.
> >
> > Thanks for pointing out this work. I'll need to review and maybe
> > profile it before making any claims.
> >
> > >
> > > If the page_ext overhead is not desirable/acceptable then I am sure
> > > there are other options. E.g. kprobes/LivePatching framework can hook
> > > into functions and alter their behavior. So why not use that for data
> > > collection? Has this been evaluated at all?
> >
> > I'm not sure how I can hook into say alloc_pages() to find out where
> > it was called from without capturing the call stack (which would
> > introduce an overhead at every allocation). Would love to discuss this
> > or other alternatives if they can be done with low enough overhead.
>
> Yes, tracking back the call trace would be really needed. The question
> is whether this is really prohibitively expensive. How much overhead are
> we talking about? There is no free lunch here, really.  You either have
> the overhead during runtime when the feature is used or on the source
> code level for all the future development (with a maze of macros and
> wrappers).

As promised, I profiled a simple piece of code that repeatedly makes 10
allocations/frees in a loop and measured the overheads of code tagging,
call stack capturing, and tracing+BPF for page and slab allocations.
Summary:

Page allocations (overheads are compared to get_free_pages() duration):
6.8% Codetag counter manipulations (__lazy_percpu_counter_add + __alloc_tag_add)
8.8% lookup_page_ext
1237% call stack capture
139% tracepoint with attached empty BPF program

Slab allocations (overheads are compared to __kmalloc() duration):
With CONFIG_MEMCG_KMEM=y
39% Codetag counter manipulations(__lazy_percpu_counter_add + __alloc_tag_add)
55% get_slab_tag_ref
3.9% __ksize
3027% call stack capture
397% tracepoint with attached empty BPF program

With CONFIG_MEMCG_KMEM=n
26% Codetag counter manipulation(__lazy_percpu_counter_add + __alloc_tag_add)
72% get_slab_tag_ref
7.4% __ksize
2789% call stack capture
345% tracepoint with attached empty BPF program

Details:
_get_free_pages is used as page allocation duration baseline
__kmalloc is used as slab allocation duration baseline
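For the page case, the shape of the test loop was roughly as below; this is an
illustrative reconstruction based on the description above, not the exact
benchmark code:

        #include <linux/gfp.h>

        static void bench_iteration(void)
        {
                unsigned long pages[10];
                int i;

                for (i = 0; i < 10; i++)
                        pages[i] = __get_free_page(GFP_KERNEL);
                for (i = 0; i < 10; i++)
                        free_page(pages[i]);
        }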

1. Profile with instrumented page allocator
|--50.13%--my__get_free_page
|  |
|  |--38.99%--_get_free_pages
|  |  |
|  |  |--34.75%--__alloc_pages
|  |  |  |
|  |  |  |--27.59%--get_page_from_freelist
|  |  |
|  |   --3.98%--_alloc_pages
|  | |
|  |  --0.53%--policy_node
|  |
|  |--3.45%--lookup_page_ext
|  |
|  |--1.59%--__lazy_percpu_counter_add
|  |  |
|  |   --0.80%--pcpu_alloc
|  | memset_orig
|  |
|   --1.06%--__alloc_tag_add
| |
|  --0.80%--__lazy_percpu_counter_add
|
|--35.28%--free_unref_page
|  |
|  |--23.08%--_raw_spin_unlock_irqrestore
|  |
|  |--2.39%--preempt_count_add
|  |  |
|  |   --0.80%--in_lock_functions
|  |
|  |--1.59%--free_pcp_pre

Re: [RFC PATCH 00/30] Code tagging framework and applications

2022-09-05 Thread Suren Baghdasaryan
On Mon, Sep 5, 2022 at 1:12 AM Michal Hocko  wrote:
>
> On Sun 04-09-22 18:32:58, Suren Baghdasaryan wrote:
> > On Thu, Sep 1, 2022 at 12:15 PM Michal Hocko  wrote:
> [...]
> > > Yes, tracking back the call trace would be really needed. The question
> > > is whether this is really prohibitively expensive. How much overhead are
> > > we talking about? There is no free lunch here, really.  You either have
> > > the overhead during runtime when the feature is used or on the source
> > > code level for all the future development (with a maze of macros and
> > > wrappers).
> >
> > As promised, I profiled a simple piece of code that repeatedly makes 10
> > allocations/frees in a loop and measured overheads of code tagging,
> > call stack capturing and tracing+BPF for page and slab allocations.
> > Summary:
> >
> > Page allocations (overheads are compared to get_free_pages() duration):
> > 6.8% Codetag counter manipulations (__lazy_percpu_counter_add + 
> > __alloc_tag_add)
> > 8.8% lookup_page_ext
> > 1237% call stack capture
> > 139% tracepoint with attached empty BPF program
>
> Yes, I am not surprised that the call stack capturing is really
> expensive compared to the allocator fast path (which is really highly
> optimized, and I suspect that with a 10 allocation/free loop you mostly get
> your memory from the pcp lists). Is this overhead still _that_ visible
> for somewhat less micro-optimized workloads which have to take slow paths
> as well?

Correct, it's a comparison with the allocation fast path, so in a sense it
represents the worst-case scenario. At the same time the measurements are
fair because they measure the overheads against the same meaningful
baseline, and can therefore be used for comparison.

>
> Also what kind of stack unwinder is configured (I guess ORC)? This is
> not my area but from what I remember the unwinder overhead varies
> between ORC and FP.

I used whatever the default is and didn't try other mechanisms. I don't
think the difference would be orders of magnitude, though.

>
> And just to make it clear. I do realize that an overhead from the stack
> unwinding is unavoidable. And code tagging would logically have lower
> overhead as it performs much less work. But the main point is whether
> our existing stack unwinding approach is really prohibitively expensive
> to be used for debugging purposes on production systems. I might
> misremember but I recall people having bigger concerns with page_owner
> memory footprint than the actual stack unwinder overhead.

That's one of those questions which are very difficult to answer (if it's
even possible) because it depends on the usage scenario. If the workload
allocates frequently then adding the overhead will likely affect it;
otherwise it might not even be noticeable. In general, in pre-production
testing we try to minimize the difference in performance and memory
profiles between the software we are testing and the production build.
From that point of view, the smaller the overhead, the better. I know it's
kinda obvious, but unfortunately I have no better answer to that question.

For the memory overhead: in my early internal proposal, with an assumption
of 1 instrumented allocation call sites, I made some calculations for an
8GB 8-core system (quite typical for Android) and ended up with the
following:

                          per-cpu counters   atomic counters
page_ext references             16MB               16MB
slab object references          10.5MB             10.5MB
alloc_tags                      900KB              312KB
Total memory overhead           27.4MB             26.8MB

so, about 0.34% of the total memory (27.4MB out of 8GB). Our implementation
has changed since then and the numbers might not be completely correct, but
they should be in the ballpark.
I just checked the number of instrumented calls that we currently have in
6.0-rc3 built with defconfig and it's 165 page allocation and 2684 slab
allocation sites. I readily accept that we are probably missing some
allocations, and additional modules can also contribute to these numbers,
but my guess is it's still less than the 1 that I used in my calculations.
I don't claim that a 0.34% overhead is low enough to always be acceptable;
I'm just posting the numbers to provide some reference points.

> --
> Michal Hocko
> SUSE Labs



Re: [RFC PATCH 00/30] Code tagging framework and applications

2022-09-05 Thread Suren Baghdasaryan
On Mon, Sep 5, 2022 at 1:58 AM Marco Elver  wrote:
>
> On Mon, 5 Sept 2022 at 10:12, Michal Hocko  wrote:
> > On Sun 04-09-22 18:32:58, Suren Baghdasaryan wrote:
> > > On Thu, Sep 1, 2022 at 12:15 PM Michal Hocko  wrote:
> > [...]
> > > > Yes, tracking back the call trace would be really needed. The question
> > > > is whether this is really prohibitively expensive. How much overhead are
> > > > we talking about? There is no free lunch here, really.  You either have
> > > > the overhead during runtime when the feature is used or on the source
> > > > code level for all the future development (with a maze of macros and
> > > > wrappers).
> > >
> > > As promised, I profiled a simple piece of code that repeatedly makes 10
> > > allocations/frees in a loop and measured overheads of code tagging,
> > > call stack capturing and tracing+BPF for page and slab allocations.
> > > Summary:
> > >
> > > Page allocations (overheads are compared to get_free_pages() duration):
> > > 6.8% Codetag counter manipulations (__lazy_percpu_counter_add + 
> > > __alloc_tag_add)
> > > 8.8% lookup_page_ext
> > > 1237% call stack capture
> > > 139% tracepoint with attached empty BPF program
> >
> > Yes, I am not surprised that the call stack capturing is really
> > expensive compared to the allocator fast path (which is really highly
> > optimized, and I suspect that with a 10 allocation/free loop you mostly get
> > your memory from the pcp lists). Is this overhead still _that_ visible
> > for somewhat less micro-optimized workloads which have to take slow paths
> > as well?
> >
> > Also what kind of stack unwinder is configured (I guess ORC)? This is
> > not my area but from what I remember the unwinder overhead varies
> > between ORC and FP.
> >
> > And just to make it clear. I do realize that an overhead from the stack
> > unwinding is unavoidable. And code tagging would logically have lower
> > overhead as it performs much less work. But the main point is whether
> > our existing stack unwinding approach is really prohibitively expensive
> > to be used for debugging purposes on production systems. I might
> > misremember but I recall people having bigger concerns with page_owner
> > memory footprint than the actual stack unwinder overhead.
>
> This is just to point out that we've also been looking at cheaper
> collection of the stack trace (for KASAN and other sanitizers). The
> cheapest way to unwind the stack would be a system with "shadow call
> stack" enabled. With compiler support it's available on arm64, see
> CONFIG_SHADOW_CALL_STACK. For x86 the hope is that at one point the
> kernel will support CET, which newer Intel and AMD CPUs support.
> Collecting the call stack would then be a simple memcpy.

Thanks for the note Marco! I'll check out the CONFIG_SHADOW_CALL_STACK
on Android.



Re: [RFC PATCH 00/30] Code tagging framework and applications

2022-09-05 Thread Suren Baghdasaryan
On Mon, Sep 5, 2022 at 8:06 AM Steven Rostedt  wrote:
>
> On Sun, 4 Sep 2022 18:32:58 -0700
> Suren Baghdasaryan  wrote:
>
> > Page allocations (overheads are compared to get_free_pages() duration):
> > 6.8% Codetag counter manipulations (__lazy_percpu_counter_add + 
> > __alloc_tag_add)
> > 8.8% lookup_page_ext
> > 1237% call stack capture
> > 139% tracepoint with attached empty BPF program
>
> Have you tried a tracepoint with a custom callback?
>
> static void my_callback(void *data, unsigned long call_site,
> const void *ptr, struct kmem_cache *s,
> size_t bytes_req, size_t bytes_alloc,
> gfp_t gfp_flags)
> {
> struct my_data_struct *my_data = data;
>
> /* do whatever with my_data and the allocation info */
> }
>
> [..]
> register_trace_kmem_alloc(my_callback, my_data);
>
> Now the my_callback() function will be called directly every time the
> kmem_alloc tracepoint is hit.
>
> This avoids the perf and BPF overhead.

Haven't tried that yet but will do. Thanks for the reference code!

>
> -- Steve
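
A self-contained version of Steve's suggestion might look roughly like the
module below. It registers against the kmem_cache_alloc tracepoint; the
exact event name and prototype are kernel-version dependent (this matches
roughly v6.0), so treat it as a sketch rather than tested code:

#include <linux/module.h>
#include <linux/atomic.h>
#include <trace/events/kmem.h>

static atomic64_t nr_allocs;

/* The prototype must match the tracepoint, plus a leading void *data. */
static void count_alloc(void *data, unsigned long call_site,
			const void *ptr, struct kmem_cache *s,
			size_t bytes_req, size_t bytes_alloc,
			gfp_t gfp_flags)
{
	atomic64_inc(&nr_allocs);	/* the "do whatever" part */
}

static int __init alloc_count_init(void)
{
	return register_trace_kmem_cache_alloc(count_alloc, NULL);
}

static void __exit alloc_count_exit(void)
{
	unregister_trace_kmem_cache_alloc(count_alloc, NULL);
	tracepoint_synchronize_unregister();	/* wait for in-flight probes */
}

module_init(alloc_count_init);
module_exit(alloc_count_exit);
MODULE_LICENSE("GPL");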



Re: [RFC PATCH 00/30] Code tagging framework and applications

2022-09-06 Thread Suren Baghdasaryan
On Tue, Sep 6, 2022 at 1:01 AM Michal Hocko  wrote:
>
> On Mon 05-09-22 11:03:35, Suren Baghdasaryan wrote:
> > On Mon, Sep 5, 2022 at 1:12 AM Michal Hocko  wrote:
> > >
> > > On Sun 04-09-22 18:32:58, Suren Baghdasaryan wrote:
> > > > On Thu, Sep 1, 2022 at 12:15 PM Michal Hocko  wrote:
> > > [...]
> > > > > Yes, tracking back the call trace would be really needed. The question
> > > > > is whether this is really prohibitively expensive. How much overhead are
> > > > > we talking about? There is no free lunch here, really. You either have
> > > > > the overhead during runtime when the feature is used or on the source
> > > > > code level for all the future development (with a maze of macros and
> > > > > wrappers).
> > > >
> > > > As promised, I profiled a simple code that repeatedly makes 10
> > > > allocations/frees in a loop and measured overheads of code tagging,
> > > > call stack capturing and tracing+BPF for page and slab allocations.
> > > > Summary:
> > > >
> > > > Page allocations (overheads are compared to get_free_pages() duration):
> > > > 6.8% Codetag counter manipulations (__lazy_percpu_counter_add + 
> > > > __alloc_tag_add)
> > > > 8.8% lookup_page_ext
> > > > 1237% call stack capture
> > > > 139% tracepoint with attached empty BPF program
> > >
> > > Yes, I am not surprised that the call stack capturing is really
> > > expensive compared to the allocator fast path (which is really highly
> > > optimized, and I suspect that with a 10 allocation/free loop you mostly
> > > get your memory from the pcp lists). Is this overhead still _that_
> > > visible for somewhat less micro-optimized workloads which have to take
> > > slow paths as well?
> >
> > Correct, it's a comparison with the allocation fast path, so in a
> > sense it represents the worst-case scenario. However, at the same time
> > the measurements are fair because they measure the overheads against
> > the same meaningful baseline, and therefore can be used for comparison.
>
> Yes, I am not saying it is an unfair comparison. It is just not a
> particularly practical one for real-life situations. So I am not sure
> you can draw many conclusions from that. Or let me put it differently.
> There is no real point in comparing the code tagging and stack unwinding
> approaches, because the latter is simply more complex as it collects
> more state. The main question is whether that additional state
> collection is too expensive to be practically used.

You asked me to provide the numbers in one of your replies; that's what I did.

>
> > > Also what kind of stack unwinder is configured (I guess ORC)? This is
> > > not my area but from what I remember the unwinder overhead varies
> > > between ORC and FP.
> >
> > I used whatever the default is and didn't try other mechanisms. I
> > don't think the difference would be orders of magnitude better though.
> >
> > >
> > > And just to make it clear. I do realize that an overhead from the stack
> > > unwinding is unavoidable. And code tagging would logically have lower
> > > overhead as it performs much less work. But the main point is whether
> > > our existing stack unwinding approach is really prohibitively expensive
> > > to be used for debugging purposes on production systems. I might
> > > misremember but I recall people having bigger concerns with page_owner
> > > memory footprint than the actual stack unwinder overhead.
> >
> > That's one of those questions which are very difficult (if not
> > impossible) to answer, because it depends on the use scenario. If the
> > workload allocates frequently then adding the overhead will likely
> > affect it; otherwise it might not even be noticeable. In general, in
> > pre-production testing we try to minimize the difference in
> > performance and memory profiles between the software we are testing
> > and the production one. From that point of view, the smaller the
> > overhead, the better. I know it's kind of obvious but unfortunately I
> > have no better answer to that question.
>
> This is clear but it doesn't really tell whether the existing tooling is
> unusable for _your_ or any other specific scenarios. Because when we are
> talking about adding quite a lot of code and making our allocator APIs
> more complicated to track the state, we should carefully weigh the
> benefit and the cost.

Re: [RFC PATCH 00/30] Code tagging framework and applications

2022-09-07 Thread Suren Baghdasaryan
On Wed, Sep 7, 2022 at 11:35 PM Kent Overstreet wrote:
>
> On Wed, Sep 07, 2022 at 09:45:18AM -0400, Steven Rostedt wrote:
> > On Wed, 7 Sep 2022 09:04:28 -0400
> > Kent Overstreet  wrote:
> >
> > > On Wed, Sep 07, 2022 at 01:00:09PM +0200, Michal Hocko wrote:
> > > > Hmm, it seems that further discussion doesn't really make much sense
> > > > here. I know how to use my time better.
> > >
> > > Just a thought, but I generally find it more productive to propose
> > > ideas than to just be disparaging.
> > >
> >
> > But it's not Michal's job to do so. He's just telling you that the given
> > feature is not worth the burden. He's telling you the issues that he has
> > with the patch set. It's the submitter's job to address those concerns and
> > not the maintainer's to tell you how to make it better.
> >
> > When Linus tells us that a submission is crap, we don't ask him how to make
> > it less crap, we listen to why he called it crap, and then rewrite to be
> > not so crappy. If we cannot figure it out, it doesn't get in.
>
> When Linus tells someone a submission is crap, he _always_ has a sound, and
> _specific_ technical justification for doing so.
>
> "This code is going to be a considerable maintenance burden" is vapid, and 
> lazy.
> It's the kind of feedback made by someone who has looked at the number of 
> lines
> of code a patch touches and not much more.

I would really appreciate it if everyone could please stick to the
technical side of the conversation. That way we can get some
constructive feedback. Everything else is not helpful and at best is a
distraction.
Maintenance burden is a price we pay and I think it's the prerogative
of the maintainers to take that into account. Our job is to prove that
the price is worth paying.




[PATCH v2 2/6] mm: replace VM_LOCKED_CLEAR_MASK with VM_LOCKED_MASK

2023-01-25 Thread Suren Baghdasaryan
To simplify the usage of VM_LOCKED_CLEAR_MASK in clear_vm_flags(),
replace it with the VM_LOCKED_MASK bitmask and convert all users.

Signed-off-by: Suren Baghdasaryan 
---
 include/linux/mm.h | 4 ++--
 kernel/fork.c  | 2 +-
 mm/hugetlb.c   | 4 ++--
 mm/mlock.c | 6 +++---
 mm/mmap.c  | 6 +++---
 mm/mremap.c| 2 +-
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index b71f2809caac..da62bdd627bf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -421,8 +421,8 @@ extern unsigned int kobjsize(const void *objp);
 /* This mask defines which mm->def_flags a process can inherit its parent */
 #define VM_INIT_DEF_MASK   VM_NOHUGEPAGE
 
-/* This mask is used to clear all the VMA flags used by mlock */
-#define VM_LOCKED_CLEAR_MASK   (~(VM_LOCKED | VM_LOCKONFAULT))
+/* This mask represents all the VMA flag bits used by mlock */
+#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT)
 
 /* Arch-specific flags to clear when updating VM flags on protection change */
 #ifndef VM_ARCH_CLEAR
diff --git a/kernel/fork.c b/kernel/fork.c
index 6683c1b0f460..03d472051236 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -669,7 +669,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
tmp->anon_vma = NULL;
} else if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
-   tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
+   clear_vm_flags(tmp, VM_LOCKED_MASK);
file = tmp->vm_file;
if (file) {
struct address_space *mapping = file->f_mapping;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d20c8b09890e..4ecdbad9a451 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6973,8 +6973,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
unsigned long s_end = sbase + PUD_SIZE;
 
/* Allow segments to share if only one is marked locked */
-   unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
-   unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
+   unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
+   unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;
 
/*
 * match the virtual addresses, permission and the alignment of the
diff --git a/mm/mlock.c b/mm/mlock.c
index 0336f52e03d7..5c4fff93cd6b 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -497,7 +497,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
if (vma->vm_start != tmp)
return -ENOMEM;
 
-   newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+   newflags = vma->vm_flags & ~VM_LOCKED_MASK;
newflags |= flags;
/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
tmp = vma->vm_end;
@@ -661,7 +661,7 @@ static int apply_mlockall_flags(int flags)
struct vm_area_struct *vma, *prev = NULL;
vm_flags_t to_add = 0;
 
-   current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
+   current->mm->def_flags &= ~VM_LOCKED_MASK;
if (flags & MCL_FUTURE) {
current->mm->def_flags |= VM_LOCKED;
 
@@ -681,7 +681,7 @@ static int apply_mlockall_flags(int flags)
for_each_vma(vmi, vma) {
vm_flags_t newflags;
 
-   newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+   newflags = vma->vm_flags & ~VM_LOCKED_MASK;
newflags |= to_add;
 
/* Ignore errors */
diff --git a/mm/mmap.c b/mm/mmap.c
index d4abc6feced1..323bd253b25a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2671,7 +2671,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm))
-   vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+   clear_vm_flags(vma, VM_LOCKED_MASK);
else
mm->locked_vm += (len >> PAGE_SHIFT);
}
@@ -3340,8 +3340,8 @@ static struct vm_area_struct *__install_special_mapping(
vma->vm_start = addr;
vma->vm_end = addr + len;
 
-   vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
-   vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+   init_vm_flags(vma, (vm_flags | mm->def_flags |
+ VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK);
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 
vma->vm_ops = ops;
diff --git a/mm/mremap.c b/mm/mremap.c
index 1b3ee02bead7..35db9752c

[PATCH v2 0/6] introduce vm_flags modifier functions

2023-01-25 Thread Suren Baghdasaryan
This patchset was originally published as part of the per-VMA locking
patchset [1] and was split out after a suggestion that it's viable on its
own and to facilitate the review process. It is now a prerequisite for the
next version of the per-VMA lock patchset, which reuses the vm_flags
modifier functions to lock the VMA when vm_flags are being updated.

VMA vm_flags modifications are usually done under exclusive mmap_lock
protection because this attribute affects other decisions like VMA merging
or splitting, and such races must be prevented. Introduce vm_flags modifier
functions to enforce correct locking; a usage sketch follows.
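
To make the intended conversion concrete, a minimal before/after sketch
(the wrapper functions are hypothetical; set_vm_flags() is one of the
helpers introduced in patch 1/6):

/* Before: direct write, nothing checks the locking context */
static void lock_vma_old(struct vm_area_struct *vma)
{
	vma->vm_flags |= VM_LOCKED;
}

/* After: the modifier asserts mmap_lock is held for writing */
static void lock_vma_new(struct vm_area_struct *vma)
{
	set_vm_flags(vma, VM_LOCKED);
}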

[1] https://lore.kernel.org/all/20230109205336.3665937-1-sur...@google.com/

The patchset applies cleanly over mm-unstable branch of mm tree.

My apologies for the extremely large distribution list. The patchset
touches lots of files, many of them in arch/ and drivers/.

Suren Baghdasaryan (6):
  mm: introduce vma->vm_flags modifier functions
  mm: replace VM_LOCKED_CLEAR_MASK with VM_LOCKED_MASK
  mm: replace vma->vm_flags direct modifications with modifier calls
  mm: replace vma->vm_flags indirect modification in ksm_madvise
  mm: introduce mod_vm_flags_nolock and use it in untrack_pfn
  mm: export dump_mm()

 arch/arm/kernel/process.c |  2 +-
 arch/ia64/mm/init.c   |  8 +--
 arch/loongarch/include/asm/tlb.h  |  2 +-
 arch/powerpc/kvm/book3s_hv_uvmem.c|  5 +-
 arch/powerpc/kvm/book3s_xive_native.c |  2 +-
 arch/powerpc/mm/book3s64/subpage_prot.c   |  2 +-
 arch/powerpc/platforms/book3s/vas-api.c   |  2 +-
 arch/powerpc/platforms/cell/spufs/file.c  | 14 ++---
 arch/s390/mm/gmap.c   |  8 +--
 arch/x86/entry/vsyscall/vsyscall_64.c |  2 +-
 arch/x86/kernel/cpu/sgx/driver.c  |  2 +-
 arch/x86/kernel/cpu/sgx/virt.c|  2 +-
 arch/x86/mm/pat/memtype.c | 14 +++--
 arch/x86/um/mem_32.c  |  2 +-
 drivers/acpi/pfr_telemetry.c  |  2 +-
 drivers/android/binder.c  |  3 +-
 drivers/char/mspec.c  |  2 +-
 drivers/crypto/hisilicon/qm.c |  2 +-
 drivers/dax/device.c  |  2 +-
 drivers/dma/idxd/cdev.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c   |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c |  4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_events.c   |  4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  |  4 +-
 drivers/gpu/drm/drm_gem.c |  2 +-
 drivers/gpu/drm/drm_gem_dma_helper.c  |  3 +-
 drivers/gpu/drm/drm_gem_shmem_helper.c|  2 +-
 drivers/gpu/drm/drm_vm.c  |  8 +--
 drivers/gpu/drm/etnaviv/etnaviv_gem.c |  2 +-
 drivers/gpu/drm/exynos/exynos_drm_gem.c   |  4 +-
 drivers/gpu/drm/gma500/framebuffer.c  |  2 +-
 drivers/gpu/drm/i810/i810_dma.c   |  2 +-
 drivers/gpu/drm/i915/gem/i915_gem_mman.c  |  4 +-
 drivers/gpu/drm/mediatek/mtk_drm_gem.c|  2 +-
 drivers/gpu/drm/msm/msm_gem.c |  2 +-
 drivers/gpu/drm/omapdrm/omap_gem.c|  3 +-
 drivers/gpu/drm/rockchip/rockchip_drm_gem.c   |  3 +-
 drivers/gpu/drm/tegra/gem.c   |  5 +-
 drivers/gpu/drm/ttm/ttm_bo_vm.c   |  3 +-
 drivers/gpu/drm/virtio/virtgpu_vram.c |  2 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c  |  2 +-
 drivers/gpu/drm/xen/xen_drm_front_gem.c   |  3 +-
 drivers/hsi/clients/cmt_speech.c  |  2 +-
 drivers/hwtracing/intel_th/msu.c  |  2 +-
 drivers/hwtracing/stm/core.c  |  2 +-
 drivers/infiniband/hw/hfi1/file_ops.c |  4 +-
 drivers/infiniband/hw/mlx5/main.c |  4 +-
 drivers/infiniband/hw/qib/qib_file_ops.c  | 13 +++--
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c  |  2 +-
 .../infiniband/hw/vmw_pvrdma/pvrdma_verbs.c   |  2 +-
 .../common/videobuf2/videobuf2-dma-contig.c   |  2 +-
 .../common/videobuf2/videobuf2-vmalloc.c  |  2 +-
 drivers/media/v4l2-core/videobuf-dma-contig.c |  2 +-
 drivers/media/v4l2-core/videobuf-dma-sg.c |  4 +-
 drivers/media/v4l2-core/videobuf-vmalloc.c|  2 +-
 drivers/misc/cxl/context.c|  2 +-
 drivers/misc/habanalabs/common/memory.c   |  2 +-
 drivers/misc/habanalabs/gaudi/gaudi.c |  4 +-
 drivers/misc/habanalabs/gaudi2/gaudi2.c   |  8 +--
 drivers/misc/habanalabs/goya/goya.c   |  4 +-
 drivers/misc/ocxl/context.c   |  4 +-
 drivers/misc/ocxl/sysfs.c |  2 +-
 drivers/misc/open-dice.c  |  4 +-
 drivers/misc/sgi-gru/grufile.c|  4 +-
 drivers/misc/uacce/uacce.c|  2 +-
 drivers/sbus/char/oradax.c|  2 +-
 drivers/scsi/cxlflash/ocxl_hw.c   |  2 +-
 dri

[PATCH v2 1/6] mm: introduce vma->vm_flags modifier functions

2023-01-25 Thread Suren Baghdasaryan
vm_flags are among VMA attributes which affect decisions like VMA merging
and splitting. Therefore all vm_flags modifications are performed after
taking exclusive mmap_lock to prevent vm_flags updates racing with such
operations. Introduce modifier functions for vm_flags to be used whenever
flags are updated. This way we can better check and control correct
locking behavior during these updates.

Signed-off-by: Suren Baghdasaryan 
---
 include/linux/mm.h   | 37 +
 include/linux/mm_types.h |  8 +++-
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c2f62bdce134..b71f2809caac 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -627,6 +627,43 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
INIT_LIST_HEAD(&vma->anon_vma_chain);
 }
 
+/* Use when VMA is not part of the VMA tree and needs no locking */
+static inline void init_vm_flags(struct vm_area_struct *vma,
+unsigned long flags)
+{
+   vma->vm_flags = flags;
+}
+
+/* Use when VMA is part of the VMA tree and modifications need coordination */
+static inline void reset_vm_flags(struct vm_area_struct *vma,
+ unsigned long flags)
+{
+   mmap_assert_write_locked(vma->vm_mm);
+   init_vm_flags(vma, flags);
+}
+
+static inline void set_vm_flags(struct vm_area_struct *vma,
+   unsigned long flags)
+{
+   mmap_assert_write_locked(vma->vm_mm);
+   vma->vm_flags |= flags;
+}
+
+static inline void clear_vm_flags(struct vm_area_struct *vma,
+ unsigned long flags)
+{
+   mmap_assert_write_locked(vma->vm_mm);
+   vma->vm_flags &= ~flags;
+}
+
+static inline void mod_vm_flags(struct vm_area_struct *vma,
+   unsigned long set, unsigned long clear)
+{
+   mmap_assert_write_locked(vma->vm_mm);
+   vma->vm_flags |= set;
+   vma->vm_flags &= ~clear;
+}
+
 static inline void vma_set_anonymous(struct vm_area_struct *vma)
 {
vma->vm_ops = NULL;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 2d6d790d9bed..6c7c70bf50dd 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -491,7 +491,13 @@ struct vm_area_struct {
 * See vmf_insert_mixed_prot() for discussion.
 */
pgprot_t vm_page_prot;
-   unsigned long vm_flags; /* Flags, see mm.h. */
+
+   /*
+* Flags, see mm.h.
+* WARNING! Do not modify directly.
+* Use {init|reset|set|clear|mod}_vm_flags() functions instead.
+*/
+   unsigned long vm_flags;
 
/*
 * For areas with an address space and backing store,
-- 
2.39.1




[PATCH v2 3/6] mm: replace vma->vm_flags direct modifications with modifier calls

2023-01-25 Thread Suren Baghdasaryan
Replace direct modifications to vma->vm_flags with calls to modifier
functions to be able to track flag changes and to preserve VMA locking
correctness.

Signed-off-by: Suren Baghdasaryan 
---
 arch/arm/kernel/process.c  |  2 +-
 arch/ia64/mm/init.c|  8 
 arch/loongarch/include/asm/tlb.h   |  2 +-
 arch/powerpc/kvm/book3s_xive_native.c  |  2 +-
 arch/powerpc/mm/book3s64/subpage_prot.c|  2 +-
 arch/powerpc/platforms/book3s/vas-api.c|  2 +-
 arch/powerpc/platforms/cell/spufs/file.c   | 14 +++---
 arch/s390/mm/gmap.c|  3 +--
 arch/x86/entry/vsyscall/vsyscall_64.c  |  2 +-
 arch/x86/kernel/cpu/sgx/driver.c   |  2 +-
 arch/x86/kernel/cpu/sgx/virt.c |  2 +-
 arch/x86/mm/pat/memtype.c  |  6 +++---
 arch/x86/um/mem_32.c   |  2 +-
 drivers/acpi/pfr_telemetry.c   |  2 +-
 drivers/android/binder.c   |  3 +--
 drivers/char/mspec.c   |  2 +-
 drivers/crypto/hisilicon/qm.c  |  2 +-
 drivers/dax/device.c   |  2 +-
 drivers/dma/idxd/cdev.c|  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c|  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c   |  4 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c  |  4 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_events.c|  4 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_process.c   |  4 ++--
 drivers/gpu/drm/drm_gem.c  |  2 +-
 drivers/gpu/drm/drm_gem_dma_helper.c   |  3 +--
 drivers/gpu/drm/drm_gem_shmem_helper.c |  2 +-
 drivers/gpu/drm/drm_vm.c   |  8 
 drivers/gpu/drm/etnaviv/etnaviv_gem.c  |  2 +-
 drivers/gpu/drm/exynos/exynos_drm_gem.c|  4 ++--
 drivers/gpu/drm/gma500/framebuffer.c   |  2 +-
 drivers/gpu/drm/i810/i810_dma.c|  2 +-
 drivers/gpu/drm/i915/gem/i915_gem_mman.c   |  4 ++--
 drivers/gpu/drm/mediatek/mtk_drm_gem.c |  2 +-
 drivers/gpu/drm/msm/msm_gem.c  |  2 +-
 drivers/gpu/drm/omapdrm/omap_gem.c |  3 +--
 drivers/gpu/drm/rockchip/rockchip_drm_gem.c|  3 +--
 drivers/gpu/drm/tegra/gem.c|  5 ++---
 drivers/gpu/drm/ttm/ttm_bo_vm.c|  3 +--
 drivers/gpu/drm/virtio/virtgpu_vram.c  |  2 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c   |  2 +-
 drivers/gpu/drm/xen/xen_drm_front_gem.c|  3 +--
 drivers/hsi/clients/cmt_speech.c   |  2 +-
 drivers/hwtracing/intel_th/msu.c   |  2 +-
 drivers/hwtracing/stm/core.c   |  2 +-
 drivers/infiniband/hw/hfi1/file_ops.c  |  4 ++--
 drivers/infiniband/hw/mlx5/main.c  |  4 ++--
 drivers/infiniband/hw/qib/qib_file_ops.c   | 13 ++---
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c   |  2 +-
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c|  2 +-
 .../media/common/videobuf2/videobuf2-dma-contig.c  |  2 +-
 drivers/media/common/videobuf2/videobuf2-vmalloc.c |  2 +-
 drivers/media/v4l2-core/videobuf-dma-contig.c  |  2 +-
 drivers/media/v4l2-core/videobuf-dma-sg.c  |  4 ++--
 drivers/media/v4l2-core/videobuf-vmalloc.c |  2 +-
 drivers/misc/cxl/context.c |  2 +-
 drivers/misc/habanalabs/common/memory.c|  2 +-
 drivers/misc/habanalabs/gaudi/gaudi.c  |  4 ++--
 drivers/misc/habanalabs/gaudi2/gaudi2.c|  8 
 drivers/misc/habanalabs/goya/goya.c|  4 ++--
 drivers/misc/ocxl/context.c|  4 ++--
 drivers/misc/ocxl/sysfs.c  |  2 +-
 drivers/misc/open-dice.c   |  4 ++--
 drivers/misc/sgi-gru/grufile.c |  4 ++--
 drivers/misc/uacce/uacce.c |  2 +-
 drivers/sbus/char/oradax.c |  2 +-
 drivers/scsi/cxlflash/ocxl_hw.c|  2 +-
 drivers/scsi/sg.c  |  2 +-
 drivers/staging/media/atomisp/pci/hmm/hmm_bo.c |  2 +-
 drivers/staging/media/deprecated/meye/meye.c   |  4 ++--
 .../media/deprecated/stkwebcam/stk-webcam.c|  2 +-
 drivers/target/target_core_user.c  |  2 +-
 drivers/uio/uio.c  |  2 +-
 drivers/usb/core/devio.c   |  3 +--
 drivers/usb/mon/mon_bin.c  |  3 +--
 drivers/vdpa/vdpa_user/iova_domain.c   |  2 +-
 drivers/vfio/pci/vfio_pci_core.c   |  2 +-
 drivers/vhost/vdpa.c   |  2 +-
 drivers/video/fbdev/68328f

[PATCH v2 6/6] mm: export dump_mm()

2023-01-25 Thread Suren Baghdasaryan
mmap_assert_write_locked() is used in the vm_flags modifiers. Because
mmap_assert_write_locked() uses dump_mm(), and vm_flags are sometimes
modified from inside a module, it's necessary to export the
dump_mm() function.

Signed-off-by: Suren Baghdasaryan 
---
 mm/debug.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/debug.c b/mm/debug.c
index 9d3d893dc7f4..96d594e16292 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -215,6 +215,7 @@ void dump_mm(const struct mm_struct *mm)
mm->def_flags, &mm->def_flags
);
 }
+EXPORT_SYMBOL(dump_mm);
 
 static bool page_init_poisoning __read_mostly = true;
 
-- 
2.39.1




[PATCH v2 5/6] mm: introduce mod_vm_flags_nolock and use it in untrack_pfn

2023-01-25 Thread Suren Baghdasaryan
In cases when VMA flags are modified after the VMA was isolated and the
mmap_lock was downgraded, flags modifications would trigger an assertion
because the mmap write lock is not held.
Introduce mod_vm_flags_nolock to be used in such situations.
Pass a hint to untrack_pfn to conditionally use mod_vm_flags_nolock for
flags modification and to avoid the assertion.

Signed-off-by: Suren Baghdasaryan 
---
 arch/x86/mm/pat/memtype.c | 10 +++---
 include/linux/mm.h| 12 +---
 include/linux/pgtable.h   |  5 +++--
 mm/memory.c   | 13 +++--
 mm/memremap.c |  4 ++--
 mm/mmap.c | 16 ++--
 6 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index ae9645c900fa..d8adc0b42cf2 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -1046,7 +1046,7 @@ void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
  * can be for the entire vma (in which case pfn, size are zero).
  */
 void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
-unsigned long size)
+unsigned long size, bool mm_wr_locked)
 {
resource_size_t paddr;
unsigned long prot;
@@ -1065,8 +1065,12 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
size = vma->vm_end - vma->vm_start;
}
free_pfn_range(paddr, size);
-   if (vma)
-   clear_vm_flags(vma, VM_PAT);
+   if (vma) {
+   if (mm_wr_locked)
+   clear_vm_flags(vma, VM_PAT);
+   else
+   mod_vm_flags_nolock(vma, 0, VM_PAT);
+   }
 }
 
 /*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 55335edd1373..48d49930c411 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -656,12 +656,18 @@ static inline void clear_vm_flags(struct vm_area_struct *vma,
vma->vm_flags &= ~flags;
 }
 
+static inline void mod_vm_flags_nolock(struct vm_area_struct *vma,
+  unsigned long set, unsigned long clear)
+{
+   vma->vm_flags |= set;
+   vma->vm_flags &= ~clear;
+}
+
 static inline void mod_vm_flags(struct vm_area_struct *vma,
unsigned long set, unsigned long clear)
 {
mmap_assert_write_locked(vma->vm_mm);
-   vma->vm_flags |= set;
-   vma->vm_flags &= ~clear;
+   mod_vm_flags_nolock(vma, set, clear);
 }
 
 static inline void vma_set_anonymous(struct vm_area_struct *vma)
@@ -2087,7 +2093,7 @@ static inline void zap_vma_pages(struct vm_area_struct *vma)
 }
 void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
struct vm_area_struct *start_vma, unsigned long start,
-   unsigned long end);
+   unsigned long end, bool mm_wr_locked);
 
 struct mmu_notifier_range;
 
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 5fd45454c073..c63cd44777ec 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1185,7 +1185,8 @@ static inline int track_pfn_copy(struct vm_area_struct *vma)
  * can be for the entire vma (in which case pfn, size are zero).
  */
 static inline void untrack_pfn(struct vm_area_struct *vma,
-  unsigned long pfn, unsigned long size)
+  unsigned long pfn, unsigned long size,
+  bool mm_wr_locked)
 {
 }
 
@@ -1203,7 +1204,7 @@ extern void track_pfn_insert(struct vm_area_struct *vma, 
pgprot_t *prot,
 pfn_t pfn);
 extern int track_pfn_copy(struct vm_area_struct *vma);
 extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
-   unsigned long size);
+   unsigned long size, bool mm_wr_locked);
 extern void untrack_pfn_moved(struct vm_area_struct *vma);
 #endif
 
diff --git a/mm/memory.c b/mm/memory.c
index d6902065e558..5b11b50e2c4a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1613,7 +1613,7 @@ void unmap_page_range(struct mmu_gather *tlb,
 static void unmap_single_vma(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr,
-   struct zap_details *details)
+   struct zap_details *details, bool mm_wr_locked)
 {
unsigned long start = max(vma->vm_start, start_addr);
unsigned long end;
@@ -1628,7 +1628,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
uprobe_munmap(vma, start, end);
 
if (unlikely(vma->vm_flags & VM_PFNMAP))
-   untrack_pfn(vma, 0, 0);
+   untrack_pfn(vma, 0, 0, mm_wr_locked);
 
if (start != end) {
if (unlikely(is_vm_hugetlb_page(vma))) {
@@ -1675,7 +1675,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
  */
 void unmap_vmas

[PATCH v2 4/6] mm: replace vma->vm_flags indirect modification in ksm_madvise

2023-01-25 Thread Suren Baghdasaryan
Replace indirect modifications to vma->vm_flags with calls to modifier
functions to be able to track flag changes and to preserve VMA locking
correctness. Add a BUG_ON check in ksm_madvise() to catch indirect
vm_flags modification attempts.

Signed-off-by: Suren Baghdasaryan 
---
 arch/powerpc/kvm/book3s_hv_uvmem.c | 5 -
 arch/s390/mm/gmap.c| 5 -
 mm/khugepaged.c| 2 ++
 mm/ksm.c   | 2 ++
 4 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 1d67baa5557a..325a7a47d348 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -393,6 +393,7 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm,
 {
unsigned long gfn = memslot->base_gfn;
unsigned long end, start = gfn_to_hva(kvm, gfn);
+   unsigned long vm_flags;
int ret = 0;
struct vm_area_struct *vma;
int merge_flag = (merge) ? MADV_MERGEABLE : MADV_UNMERGEABLE;
@@ -409,12 +410,14 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm,
ret = H_STATE;
break;
}
+   vm_flags = vma->vm_flags;
ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
- merge_flag, &vma->vm_flags);
+ merge_flag, &vm_flags);
if (ret) {
ret = H_STATE;
break;
}
+   reset_vm_flags(vma, vm_flags);
start = vma->vm_end;
} while (end > vma->vm_end);
 
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 3a695b8a1e3c..d5eb47dcdacb 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2587,14 +2587,17 @@ int gmap_mark_unmergeable(void)
 {
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
+   unsigned long vm_flags;
int ret;
VMA_ITERATOR(vmi, mm, 0);
 
for_each_vma(vmi, vma) {
+   vm_flags = vma->vm_flags;
ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
- MADV_UNMERGEABLE, &vma->vm_flags);
+ MADV_UNMERGEABLE, &vm_flags);
if (ret)
return ret;
+   reset_vm_flags(vma, vm_flags);
}
mm->def_flags &= ~VM_MERGEABLE;
return 0;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 8abc59345bf2..76b24cd0c179 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -354,6 +354,8 @@ struct attribute_group khugepaged_attr_group = {
 int hugepage_madvise(struct vm_area_struct *vma,
 unsigned long *vm_flags, int advice)
 {
+   /* vma->vm_flags can be changed only using modifier functions */
+   BUG_ON(vm_flags == &vma->vm_flags);
switch (advice) {
case MADV_HUGEPAGE:
 #ifdef CONFIG_S390
diff --git a/mm/ksm.c b/mm/ksm.c
index 04f1c8c2df11..992b2be9f5e6 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2573,6 +2573,8 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
struct mm_struct *mm = vma->vm_mm;
int err;
 
+   /* vma->vm_flags can be changed only using modifier functions */
+   BUG_ON(vm_flags == &vma->vm_flags);
switch (advice) {
case MADV_MERGEABLE:
/*
-- 
2.39.1




Re: [PATCH v2 4/6] mm: replace vma->vm_flags indirect modification in ksm_madvise

2023-01-25 Thread Suren Baghdasaryan
On Wed, Jan 25, 2023 at 1:38 AM 'Michal Hocko' via kernel-team wrote:
>
> On Wed 25-01-23 00:38:49, Suren Baghdasaryan wrote:
> > Replace indirect modifications to vma->vm_flags with calls to modifier
> > functions to be able to track flag changes and to keep vma locking
> > correctness. Add a BUG_ON check in ksm_madvise() to catch indirect
> > vm_flags modification attempts.
>
> Those BUG_ONs scream too much IMHO. KSM is MM-internal code, so I
> guess we should be willing to trust it.

Yes, but I really want to prevent indirect misuse, since these cases were
not easy to find. If you feel strongly about it I will remove them, or if
you have a better suggestion I'm all for it.

>
> > Signed-off-by: Suren Baghdasaryan 
>
> Acked-by: Michal Hocko 
> --
> Michal Hocko
> SUSE Labs



Re: [PATCH v2 1/6] mm: introduce vma->vm_flags modifier functions

2023-01-25 Thread Suren Baghdasaryan
On Wed, Jan 25, 2023 at 10:37 AM Matthew Wilcox  wrote:
>
> On Wed, Jan 25, 2023 at 08:49:50AM -0800, Suren Baghdasaryan wrote:
> > On Wed, Jan 25, 2023 at 1:10 AM Peter Zijlstra  wrote:
> > > > + /*
> > > > +  * Flags, see mm.h.
> > > > +  * WARNING! Do not modify directly.
> > > > +  * Use {init|reset|set|clear|mod}_vm_flags() functions instead.
> > > > +  */
> > > > + unsigned long vm_flags;
> > >
> > > We have __private and ACCESS_PRIVATE() to help with enforcing this.
> >
> > Thanks for pointing this out, Peter! I guess for that I'll need to
> > convert all read accesses and provide get_vm_flags() too? That will
> > cause some additional churn (a quick search shows 801 hits over 248
> > files) but maybe it's worth it? I think Michal suggested that too in
> > another patch. Should I do that while we are at it?
>
> Here's a trick I saw somewhere in the VFS:
>
> union {
> const vm_flags_t vm_flags;
> vm_flags_t __private __vm_flags;
> };
>
> Now it can be read by anybody but written only by those using
> ACCESS_PRIVATE.

Huh, this is quite nice! I think it does not save us from the cases
when vma->vm_flags is passed by reference and modified indirectly,
like in ksm_madvise()? Though maybe such use cases are so rare (I found
only 2) that we can ignore this?
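
For completeness, a sketch of how a legitimate writer would look under
this union scheme. ACCESS_PRIVATE() comes from the kernel's compiler
headers; the struct below is an illustrative stand-in for vm_area_struct,
not real code from the series:

#include <linux/compiler.h>
#include <linux/mm_types.h>	/* vm_flags_t */

struct vma_example {
	union {
		const vm_flags_t vm_flags;		/* anybody may read */
		vm_flags_t __private __vm_flags;	/* writers go through ACCESS_PRIVATE */
	};
};

static inline void vma_example_set(struct vma_example *vma, vm_flags_t set)
{
	/* sparse flags any direct write to __vm_flags outside this macro */
	ACCESS_PRIVATE(vma, __vm_flags) |= set;
}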



Re: [PATCH v2 3/6] mm: replace vma->vm_flags direct modifications with modifier calls

2023-01-25 Thread Suren Baghdasaryan
On Wed, Jan 25, 2023 at 1:30 AM 'Michal Hocko' via kernel-team wrote:
>
> On Wed 25-01-23 00:38:48, Suren Baghdasaryan wrote:
> > Replace direct modifications to vma->vm_flags with calls to modifier
> > functions to be able to track flag changes and to keep vma locking
> > correctness.
>
> Is this a manual (git grep) based work or have you used Coccinele for
> the patch generation?

It was a manual "search and replace" and in the process I temporarily
renamed vm_flags to ensure I did not miss any usage.

>
> My potentially incomplete check
> $ git grep ">[[:space:]]*vm_flags[[:space:]]*[&|^]="
>
> shows that nothing should be left after this. There are still quite a lot
> of direct checks of the flags (more than 600). Maybe it would be good to
> make the flags accessible only via accessors, which would also prevent any
> future direct setting of those flags in an uncontrolled way as well.

Yes, I think Peter's suggestion in the first patch would also require
that. Much more churn but probably worth it for the future
maintenance. I'll add a patch which converts all readers as well.

>
> Anyway
> Acked-by: Michal Hocko 

Thanks for all the reviews!

> --
> Michal Hocko
> SUSE Labs



Re: [PATCH v2 4/6] mm: replace vma->vm_flags indirect modification in ksm_madvise

2023-01-25 Thread Suren Baghdasaryan
On Wed, Jan 25, 2023 at 9:08 AM Michal Hocko  wrote:
>
> On Wed 25-01-23 08:57:48, Suren Baghdasaryan wrote:
> > On Wed, Jan 25, 2023 at 1:38 AM 'Michal Hocko' via kernel-team wrote:
> > >
> > > On Wed 25-01-23 00:38:49, Suren Baghdasaryan wrote:
> > > > Replace indirect modifications to vma->vm_flags with calls to modifier
> > > > functions to be able to track flag changes and to keep vma locking
> > > > correctness. Add a BUG_ON check in ksm_madvise() to catch indirect
> > > > vm_flags modification attempts.
> > >
> > > Those BUG_ONs scream too much IMHO. KSM is MM-internal code, so I
> > > guess we should be willing to trust it.
> >
> > Yes, but I really want to prevent indirect misuse, since these cases were
> > not easy to find. If you feel strongly about it I will remove them, or if
> > you have a better suggestion I'm all for it.
>
> You can avoid that by making the flags inaccessible directly, right?

Ah, you mean Peter's suggestion of using __private? I guess that would
cover it. I'll drop these BUG_ONs in the next version. Thanks!

>
> --
> Michal Hocko
> SUSE Labs



Re: [PATCH v2 1/6] mm: introduce vma->vm_flags modifier functions

2023-01-25 Thread Suren Baghdasaryan
On Wed, Jan 25, 2023 at 1:10 AM Peter Zijlstra  wrote:
>
> On Wed, Jan 25, 2023 at 12:38:46AM -0800, Suren Baghdasaryan wrote:
>
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index 2d6d790d9bed..6c7c70bf50dd 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -491,7 +491,13 @@ struct vm_area_struct {
> >* See vmf_insert_mixed_prot() for discussion.
> >*/
> >   pgprot_t vm_page_prot;
> > - unsigned long vm_flags; /* Flags, see mm.h. */
> > +
> > + /*
> > +  * Flags, see mm.h.
> > +  * WARNING! Do not modify directly.
> > +  * Use {init|reset|set|clear|mod}_vm_flags() functions instead.
> > +  */
> > + unsigned long vm_flags;
>
> We have __private and ACCESS_PRIVATE() to help with enforcing this.

Thanks for pointing this out, Peter! I guess for that I'll need to
convert all read accesses and provide get_vm_flags() too? That will
cause some additional churn (a quick search shows 801 hits over 248
files) but maybe it's worth it? I think Michal suggested that too in
another patch. Should I do that while we are at it?

>



Re: [PATCH v2 5/6] mm: introduce mod_vm_flags_nolock and use it in untrack_pfn

2023-01-25 Thread Suren Baghdasaryan
On Wed, Jan 25, 2023 at 1:42 AM Michal Hocko  wrote:
>
> On Wed 25-01-23 00:38:50, Suren Baghdasaryan wrote:
> > In cases when VMA flags are modified after VMA was isolated and mmap_lock
> > was downgraded, flags modifications would result in an assertion because
> > mmap write lock is not held.
> > Introduce mod_vm_flags_nolock to be used in such situation.
> > Pass a hint to untrack_pfn to conditionally use mod_vm_flags_nolock for
> > flags modification and to avoid assertion.
>
> Neither the changelog nor the documentation of mod_vm_flags_nolock
> really explains when it is safe to use it. This is really important for
> future potential users.

True. I'll add clarification in the comments and in the changelog. Thanks!

>
> > Signed-off-by: Suren Baghdasaryan 
> > ---
> >  arch/x86/mm/pat/memtype.c | 10 +++---
> >  include/linux/mm.h| 12 +---
> >  include/linux/pgtable.h   |  5 +++--
> >  mm/memory.c   | 13 +++--
> >  mm/memremap.c |  4 ++--
> >  mm/mmap.c | 16 ++--
> >  6 files changed, 38 insertions(+), 22 deletions(-)
> >
> > diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
> > index ae9645c900fa..d8adc0b42cf2 100644
> > --- a/arch/x86/mm/pat/memtype.c
> > +++ b/arch/x86/mm/pat/memtype.c
> > @@ -1046,7 +1046,7 @@ void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn)
> >   * can be for the entire vma (in which case pfn, size are zero).
> >   */
> >  void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
> > -  unsigned long size)
> > +  unsigned long size, bool mm_wr_locked)
> >  {
> >   resource_size_t paddr;
> >   unsigned long prot;
> > @@ -1065,8 +1065,12 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
> >   size = vma->vm_end - vma->vm_start;
> >   }
> >   free_pfn_range(paddr, size);
> > - if (vma)
> > - clear_vm_flags(vma, VM_PAT);
> > + if (vma) {
> > + if (mm_wr_locked)
> > + clear_vm_flags(vma, VM_PAT);
> > + else
> > + mod_vm_flags_nolock(vma, 0, VM_PAT);
> > + }
> >  }
> >
> >  /*
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 55335edd1373..48d49930c411 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -656,12 +656,18 @@ static inline void clear_vm_flags(struct vm_area_struct *vma,
> >   vma->vm_flags &= ~flags;
> >  }
> >
> > +static inline void mod_vm_flags_nolock(struct vm_area_struct *vma,
> > +unsigned long set, unsigned long clear)
> > +{
> > + vma->vm_flags |= set;
> > + vma->vm_flags &= ~clear;
> > +}
> > +
> >  static inline void mod_vm_flags(struct vm_area_struct *vma,
> >   unsigned long set, unsigned long clear)
> >  {
> >   mmap_assert_write_locked(vma->vm_mm);
> > - vma->vm_flags |= set;
> > - vma->vm_flags &= ~clear;
> > + mod_vm_flags_nolock(vma, set, clear);
> >  }
> >
> >  static inline void vma_set_anonymous(struct vm_area_struct *vma)
> > @@ -2087,7 +2093,7 @@ static inline void zap_vma_pages(struct vm_area_struct *vma)
> >  }
> >  void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
> >   struct vm_area_struct *start_vma, unsigned long start,
> > - unsigned long end);
> > + unsigned long end, bool mm_wr_locked);
> >
> >  struct mmu_notifier_range;
> >
> > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> > index 5fd45454c073..c63cd44777ec 100644
> > --- a/include/linux/pgtable.h
> > +++ b/include/linux/pgtable.h
> > @@ -1185,7 +1185,8 @@ static inline int track_pfn_copy(struct vm_area_struct *vma)
> >   * can be for the entire vma (in which case pfn, size are zero).
> >   */
> >  static inline void untrack_pfn(struct vm_area_struct *vma,
> > -unsigned long pfn, unsigned long size)
> > +unsigned long pfn, unsigned long size,
> > +bool mm_wr_locked)
> >  {
> >  }
> >
> > @@ -1203,7 +1204,7 @@ extern void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot,
> >pfn_t pfn);
> >  extern int track_pfn

Re: [PATCH v2 1/6] mm: introduce vma->vm_flags modifier functions

2023-01-25 Thread Suren Baghdasaryan
On Wed, Jan 25, 2023 at 10:33 AM Matthew Wilcox  wrote:
>
> On Wed, Jan 25, 2023 at 12:38:46AM -0800, Suren Baghdasaryan wrote:
> > +/* Use when VMA is not part of the VMA tree and needs no locking */
> > +static inline void init_vm_flags(struct vm_area_struct *vma,
> > +  unsigned long flags)
> > +{
> > + vma->vm_flags = flags;
>
> vm_flags are supposed to have type vm_flags_t.  That's not been
> fully realised yet, but perhaps we could avoid making it worse?
>
> >   pgprot_t vm_page_prot;
> > - unsigned long vm_flags; /* Flags, see mm.h. */
> > +
> > + /*
> > +  * Flags, see mm.h.
> > +  * WARNING! Do not modify directly.
> > +  * Use {init|reset|set|clear|mod}_vm_flags() functions instead.
> > +  */
> > + unsigned long vm_flags;
>
> Including changing this line to vm_flags_t

Good point. Will make the change. Thanks!
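
Concretely, the change Matthew asks for is to use the existing vm_flags_t
typedef (currently unsigned long in include/linux/mm_types.h) in the field
and the helpers; roughly:

/* include/linux/mm_types.h already provides: typedef unsigned long vm_flags_t; */

static inline void init_vm_flags(struct vm_area_struct *vma,
				 vm_flags_t flags)
{
	vma->vm_flags = flags;
}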



Re: [PATCH v2 1/6] mm: introduce vma->vm_flags modifier functions

2023-01-26 Thread Suren Baghdasaryan
On Thu, Jan 26, 2023 at 7:09 AM Matthew Wilcox  wrote:
>
> On Thu, Jan 26, 2023 at 04:50:59PM +0200, Mike Rapoport wrote:
> > On Thu, Jan 26, 2023 at 11:17:09AM +0200, Mike Rapoport wrote:
> > > On Wed, Jan 25, 2023 at 12:38:46AM -0800, Suren Baghdasaryan wrote:
> > > > +/* Use when VMA is not part of the VMA tree and needs no locking */
> > > > +static inline void init_vm_flags(struct vm_area_struct *vma,
> > > > +  unsigned long flags)
> > >
> > > I'd suggest making it vm_flags_init() etc.
> >
> > Thinking more about it, it will be even clearer to name these vma_flags_xyz()
>
> Perhaps vma_VERB_flags()?
>
> vma_init_flags()
> vma_reset_flags()
> vma_set_flags()
> vma_clear_flags()
> vma_mod_flags()

Due to excessive email bouncing I posted the v3 of this patchset using
the original per-VMA patchset's distribution list. That might have
dropped Mike from the list. Sorry about that Mike, I'll add you to my
usual list of suspects :)
The v3 is here:
https://lore.kernel.org/all/20230125233554.153109-1-sur...@google.com/
and Andrew did suggest the same renames, so I'll be posting v4 with
those changes later today.
Thanks for the feedback!

>