offline_pages() should not be called outside of the MM core. Especially,
having to manually fiddle with the memory block states is a sign that
this is not a good idea. To offline memory block devices cleanly,
device_offline() should be used. This is the only remaining caller of
offline_pages(), except the official device_offline() way.

E.g., when trying to allocate right now we trigger messages like
[   11.227817] page:c00c000000182000 refcount:1 mapcount:0 
mapping:0000000000000000 index:0x0
[   11.228056] raw: 007ffff000000000 c000000001538860 c000000001538860 
0000000000000000
[   11.228070] raw: 0000000000000000 0000000000000001 00000001ffffffff 
0000000000000000
[   11.228097] page dumped because: unmovable page

and theoretically we might end up looping quite a long time trying to
offline memory, which would have to be canceled by the user manually
(CTRL-C).

Memtrace needs to identify+allocate multiple consecutive memory blocks.
It also has to remove the memory blocks to remove all page tables
(HW requirement).

Let's use alloc_contig_pages() to allocate memory that spans multiple
memory block devices. We can then set all pages PageOffline() to allow
these pages to get isolated. A temporary memory notifier can then make
offlining of these pages succeed by dropping its reference to the pages
on MEM_GOING_OFFLINE events(as documented in include/linux/page-flags.h
for PageOffline() pages). Error handling is a bit tricky.

Note1: ZONE_MOVABLE memory blocks won't be considered. Not sure if that
was ever really relevant. (unmovable data would end up on these memory
blocks for a tiny little time frame)

Note2: We don't have to care about online_page_callback_t, as we forbid
re-onlining from our memory notifier.

Note3: I was told this feature is never used along with DIMM-based memory
hotunplug - otherwise bad things will happen when a DIMM would try to
remove "alread-removed" memory (that is still in use).

Tested under QEMU with powernv emulation (kernel + initramfs).

$ mount -t debugfs none /sys/kernel/debug/
$ cat /sys/devices/system/memory/block_size_bytes
10000000
$ echo 0x20000000 > /sys/kernel/debug/powerpc/memtrace/enable
[   19.809790] Offlined Pages 4096
[   19.835842] Offlined Pages 4096
[   19.853136] memtrace: Allocated trace memory on node 0 at 0x0000000040000000

Unfortunately, QEMU does not support NUMA for powernv yet, so I cannot
test that.

Cc: Benjamin Herrenschmidt <b...@kernel.crashing.org>
Cc: Paul Mackerras <pau...@samba.org>
Cc: Michael Ellerman <m...@ellerman.id.au>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: David Hildenbrand <da...@redhat.com>
Cc: Allison Randal <alli...@lohutok.net>
Cc: Jens Axboe <ax...@kernel.dk>
Cc: Anshuman Khandual <anshuman.khand...@arm.com>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: Michal Hocko <mho...@kernel.org>
Cc: Oscar Salvador <osalva...@suse.de>
Cc: Balbir Singh <bsinghar...@gmail.com>
Cc: Rashmica Gupta <rashmic...@gmail.com>
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: David Hildenbrand <da...@redhat.com>
---
 arch/powerpc/platforms/powernv/Kconfig    |   1 +
 arch/powerpc/platforms/powernv/memtrace.c | 175 ++++++++++++++--------
 2 files changed, 112 insertions(+), 64 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/Kconfig 
b/arch/powerpc/platforms/powernv/Kconfig
index 938803eab0ad..571a0fa9f055 100644
--- a/arch/powerpc/platforms/powernv/Kconfig
+++ b/arch/powerpc/platforms/powernv/Kconfig
@@ -29,6 +29,7 @@ config OPAL_PRD
 config PPC_MEMTRACE
        bool "Enable removal of RAM from kernel mappings for tracing"
        depends on PPC_POWERNV && MEMORY_HOTREMOVE
+       select CONTIG_ALLOC
        help
          Enabling this option allows for the removal of memory (RAM)
          from the kernel mappings to be used for hardware tracing.
diff --git a/arch/powerpc/platforms/powernv/memtrace.c 
b/arch/powerpc/platforms/powernv/memtrace.c
index 2d2a0a2acd60..fe1e8f3926a1 100644
--- a/arch/powerpc/platforms/powernv/memtrace.c
+++ b/arch/powerpc/platforms/powernv/memtrace.c
@@ -76,83 +76,130 @@ static int memtrace_free_node(int nid, unsigned long 
start, unsigned long size)
        return ret;
 }
 
-static int check_memblock_online(struct memory_block *mem, void *arg)
-{
-       if (mem->state != MEM_ONLINE)
-               return -1;
-
-       return 0;
-}
-
-static int change_memblock_state(struct memory_block *mem, void *arg)
-{
-       unsigned long state = (unsigned long)arg;
-
-       mem->state = state;
-
-       return 0;
-}
+struct memtrace_alloc_info {
+       struct notifier_block memory_notifier;
+       unsigned long base_pfn;
+       unsigned long nr_pages;
+};
 
-/* called with device_hotplug_lock held */
-static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
+static int memtrace_memory_notifier_cb(struct notifier_block *nb,
+                                      unsigned long action, void *arg)
 {
-       const unsigned long start = PFN_PHYS(start_pfn);
-       const unsigned long size = PFN_PHYS(nr_pages);
-
-       if (walk_memory_blocks(start, size, NULL, check_memblock_online))
-               return false;
-
-       walk_memory_blocks(start, size, (void *)MEM_GOING_OFFLINE,
-                          change_memblock_state);
-
-       if (offline_pages(start_pfn, nr_pages)) {
-               walk_memory_blocks(start, size, (void *)MEM_ONLINE,
-                                  change_memblock_state);
-               return false;
+       struct memtrace_alloc_info *info = container_of(nb,
+                                                    struct memtrace_alloc_info,
+                                                    memory_notifier);
+       unsigned long pfn, start_pfn, end_pfn;
+       const struct memory_notify *mhp = arg;
+       static bool going_offline;
+
+       /* Ignore ranges that don't overlap. */
+       if (mhp->start_pfn + mhp->nr_pages <= info->base_pfn ||
+           info->base_pfn + info->nr_pages <= mhp->start_pfn)
+               return NOTIFY_OK;
+
+       start_pfn = max_t(unsigned long, info->base_pfn, mhp->start_pfn);
+       end_pfn = min_t(unsigned long, info->base_pfn + info->nr_pages,
+                       mhp->start_pfn + mhp->nr_pages);
+
+       /*
+        * Drop our reference to the allocated (PageOffline()) pages, but
+        * reaquire them in case offlining fails. We might get called for
+        * MEM_CANCEL_OFFLINE but not for MEM_GOING_OFFLINE in case another
+        * notifier aborted offlining.
+        */
+       switch (action) {
+       case MEM_GOING_OFFLINE:
+               for (pfn = start_pfn; pfn < end_pfn; pfn++)
+                       page_ref_dec(pfn_to_page(pfn));
+               going_offline = true;
+               break;
+       case MEM_CANCEL_OFFLINE:
+               if (going_offline)
+                       for (pfn = start_pfn; pfn < end_pfn; pfn++)
+                               page_ref_inc(pfn_to_page(pfn));
+               going_offline = false;
+               break;
+       case MEM_GOING_ONLINE:
+               /*
+                * While our notifier is active, user space could
+                * offline+re-online this memory. Disallow any such activity.
+                */
+               return notifier_to_errno(-EBUSY);
        }
-
-       walk_memory_blocks(start, size, (void *)MEM_OFFLINE,
-                          change_memblock_state);
-
-
-       return true;
+       return NOTIFY_OK;
 }
 
 static u64 memtrace_alloc_node(u32 nid, u64 size)
 {
-       u64 start_pfn, end_pfn, nr_pages, pfn;
-       u64 base_pfn;
-       u64 bytes = memory_block_size_bytes();
+       const unsigned long memory_block_bytes = memory_block_size_bytes();
+       const unsigned long nr_pages = size >> PAGE_SHIFT;
+       struct memtrace_alloc_info info = {
+               .memory_notifier = {
+                       .notifier_call = memtrace_memory_notifier_cb,
+               },
+       };
+       unsigned long base_pfn, to_remove_pfn, pfn;
+       struct page *page;
+       int ret;
 
        if (!node_spanned_pages(nid))
                return 0;
 
-       start_pfn = node_start_pfn(nid);
-       end_pfn = node_end_pfn(nid);
-       nr_pages = size >> PAGE_SHIFT;
-
-       /* Trace memory needs to be aligned to the size */
-       end_pfn = round_down(end_pfn - nr_pages, nr_pages);
-
-       lock_device_hotplug();
-       for (base_pfn = end_pfn; base_pfn > start_pfn; base_pfn -= nr_pages) {
-               if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true) {
-                       /*
-                        * Remove memory in memory block size chunks so that
-                        * iomem resources are always split to the same size and
-                        * we never try to remove memory that spans two iomem
-                        * resources.
-                        */
-                       end_pfn = base_pfn + nr_pages;
-                       for (pfn = base_pfn; pfn < end_pfn; pfn += bytes>> 
PAGE_SHIFT) {
-                               __remove_memory(nid, pfn << PAGE_SHIFT, bytes);
-                       }
-                       unlock_device_hotplug();
-                       return base_pfn << PAGE_SHIFT;
-               }
+       /*
+        * Try to allocate memory (that might span multiple memory blocks)
+        * on the requested node. Trace memory needs to be aligned to the size,
+        * which is guaranteed by alloc_contig_pages().
+        */
+       page = alloc_contig_pages(nr_pages, __GFP_THISNODE, nid, NULL);
+       if (!page)
+               return 0;
+       to_remove_pfn = base_pfn = page_to_pfn(page);
+       info.base_pfn = base_pfn;
+       info.nr_pages = nr_pages;
+
+       /* PageOffline() allows to isolate the memory when offlining. */
+       for (pfn = base_pfn; pfn < base_pfn + nr_pages; pfn++)
+               __SetPageOffline(pfn_to_page(pfn));
+
+       /* A temporary memory notifier allows to offline the isolated memory. */
+       ret = register_memory_notifier(&info.memory_notifier);
+       if (ret)
+               goto out_free_pages;
+
+       /*
+        * Try to offline and remove all involved memory blocks. This will
+        * only fail in the unlikely event that another memory notifier NACKs
+        * the offlining request - no memory has to be migrated.
+        *
+        * Remove memory in memory block size chunks so that iomem resources
+        * are always split to the same size and we never try to remove memory
+        * that spans two iomem resources.
+        */
+       for (; to_remove_pfn < base_pfn + nr_pages;
+            to_remove_pfn += PHYS_PFN(memory_block_bytes)) {
+               ret = offline_and_remove_memory(nid, PFN_PHYS(to_remove_pfn),
+                                               memory_block_bytes);
+               if (ret)
+                       goto out_readd_memory;
        }
-       unlock_device_hotplug();
 
+       unregister_memory_notifier(&info.memory_notifier);
+       return PFN_PHYS(base_pfn);
+out_readd_memory:
+       /* Unregister before adding+onlining (notifer blocks onlining). */
+       unregister_memory_notifier(&info.memory_notifier);
+       if (to_remove_pfn != base_pfn) {
+               ret = memtrace_free_node(nid, PFN_PHYS(base_pfn),
+                                        PFN_PHYS(to_remove_pfn - base_pfn));
+               if (ret)
+                       /* Even more unlikely, log and ignore. */
+                       pr_err("Failed to add trace memory to node %d\n", nid);
+       }
+out_free_pages:
+       /* Only free memory that was not temporarily offlined+removed. */
+       for (pfn = to_remove_pfn; pfn < base_pfn + nr_pages; pfn++)
+               __ClearPageOffline(pfn_to_page(pfn));
+       free_contig_range(to_remove_pfn, nr_pages - (to_remove_pfn - base_pfn));
        return 0;
 }
 
-- 
2.23.0

Reply via email to