On Thu, Feb 23, 2017 at 8:39 AM, Rashmica Gupta <rashmic...@gmail.com> wrote:
> Some powerpc hardware features may want to gain access to a
> chunk of undisturbed real memory. This update provides a means to unplug
> said memory from the kernel with a set of sysfs calls. By writing an integer
> containing the size of memory to be unplugged into
> /sys/kernel/debug/powerpc/memtrace/enable, the code will remove that much
> memory from the end of each available chip's memory space.
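So IIUC the intended usage is something like

    echo 0x40000000 > /sys/kernel/debug/powerpc/memtrace/enable

to carve 1GB out of each chip, with the removed contents then readable from
/sys/kernel/debug/powerpc/memtrace/<chip-id>/dump? (The size above is just an
example, not something from the patch.)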
Could we define this better - what is a chip's memory space, the local node?

> In addition, the
> means to read out the contents of the unplugged memory is also provided by
> reading out the /sys/kernel/debug/powerpc/memtrace/<chip-id>/dump file.
>
> Signed-off-by: Rashmica Gupta <rashmic...@gmail.com>
> ---
> Written by Douglas Lehr <dll...@us.ibm.com>.
> Have tested and seems to work as I would expect. Only change I have made from
> the original is to check that the value being written to the debugfs file is
> not 0 (or obscenely large), as otherwise you get a nice kernel oops where the
> kernel attempts to access data at 0xfffffffe0.
>
> Thoughts about doing this with hot unplug or other changes?
>
>  arch/powerpc/mm/hash_native_64.c          | 39 +++-
>  arch/powerpc/platforms/powernv/Makefile   |  1 +
>  arch/powerpc/platforms/powernv/memtrace.c | 285
> ++++++++++++++++++++++++++++++
>  3 files changed, 321 insertions(+), 4 deletions(-)
>  create mode 100644 arch/powerpc/platforms/powernv/memtrace.c
>
> diff --git a/arch/powerpc/mm/hash_native_64.c
> b/arch/powerpc/mm/hash_native_64.c
> index cc33260..44cc6ce 100644
> --- a/arch/powerpc/mm/hash_native_64.c
> +++ b/arch/powerpc/mm/hash_native_64.c
> @@ -3,7 +3,7 @@
>   *
>   * SMP scalability work:
>   * Copyright (C) 2001 Anton Blanchard <an...@au.ibm.com>, IBM
> - *
> + *
>   * This program is free software; you can redistribute it and/or
>   * modify it under the terms of the GNU General Public License
>   * as published by the Free Software Foundation; either version
> @@ -181,7 +181,7 @@ static inline void native_lock_hpte(struct hash_pte
> *hptep)
>          while (1) {
>                  if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word))
>                          break;
> -                while(test_bit(HPTE_LOCK_BIT, word))
> +                while (test_bit(HPTE_LOCK_BIT, word))

Can we segregate the whitespace fixes from the actual changes?

>                          cpu_relax();
>          }
>  }
> @@ -208,10 +208,10 @@ static long native_hpte_insert(unsigned long
> hpte_group, unsigned long vpn,
>          }
>
>          for (i = 0; i < HPTES_PER_GROUP; i++) {
> -                if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
> +                if (!(be64_to_cpu(hptep->v) & HPTE_V_VALID)) {
>                          /* retry with lock held */
>                          native_lock_hpte(hptep);
> -                        if (! (be64_to_cpu(hptep->v) & HPTE_V_VALID))
> +                        if (!(be64_to_cpu(hptep->v) & HPTE_V_VALID))
>                                  break;
>                          native_unlock_hpte(hptep);
>                  }
> @@ -407,6 +407,36 @@ static void native_hpte_updateboltedpp(unsigned long
> newpp, unsigned long ea,
>          tlbie(vpn, psize, psize, ssize, 0);
>  }
>
> +/*
> + * Remove a bolted kernel entry. Memory hotplug uses this.
> + *
> + * No need to lock here because we should be the only user.
> + */
> +static int native_hpte_removebolted(unsigned long ea, int psize, int ssize)
> +{
> +        unsigned long vpn;
> +        unsigned long vsid;
> +        long slot;
> +        struct hash_pte *hptep;
> +
> +        vsid = get_kernel_vsid(ea, ssize);
> +        vpn = hpt_vpn(ea, vsid, ssize);
> +

Do we need checks to ensure that ea does not belong to _stext or _etext?

> +        slot = native_hpte_find(vpn, psize, ssize);
> +        if (slot == -1)
> +                return -ENOENT;
> +

What does this mean - that we have an invalid address?
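To make the two questions above concrete, what I had in mind was a guard near
the top of the function, something like this (purely an untested sketch on my
part, using the _stext/_etext symbols from <asm/sections.h>, not code from the
patch):

        /* never unbolt kernel text, only memory handed over for tracing */
        if (ea >= (unsigned long)_stext && ea < (unsigned long)_etext)
                return -EINVAL;

With that, -ENOENT below would really mean "no bolted HPTE maps this ea",
which is probably worth spelling out in a comment.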
> +        hptep = htab_address + slot;
> +
> +        /* Invalidate the hpte */
> +        hptep->v = 0;
> +
> +        /* Invalidate the TLB */
> +        tlbie(vpn, psize, psize, ssize, 0);
> +        return 0;
> +}
> +
> +
>  static void native_hpte_invalidate(unsigned long slot, unsigned long vpn,
>                          int bpsize, int apsize, int ssize, int
> local)
>  {
> @@ -722,6 +752,7 @@ void __init hpte_init_native(void)
>          mmu_hash_ops.hpte_invalidate = native_hpte_invalidate;
>          mmu_hash_ops.hpte_updatepp = native_hpte_updatepp;
>          mmu_hash_ops.hpte_updateboltedpp = native_hpte_updateboltedpp;
> +        mmu_hash_ops.hpte_removebolted = native_hpte_removebolted;
>          mmu_hash_ops.hpte_insert = native_hpte_insert;
>          mmu_hash_ops.hpte_remove = native_hpte_remove;
>          mmu_hash_ops.hpte_clear_all = native_hpte_clear;
> diff --git a/arch/powerpc/platforms/powernv/Makefile
> b/arch/powerpc/platforms/powernv/Makefile
> index b5d98cb..2026661 100644
> --- a/arch/powerpc/platforms/powernv/Makefile
> +++ b/arch/powerpc/platforms/powernv/Makefile
> @@ -11,4 +11,5 @@ obj-$(CONFIG_EEH) += eeh-powernv.o
>  obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
>  obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o
>  obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o
> +obj-$(CONFIG_MEMORY_HOTREMOVE) += memtrace.o
>  obj-$(CONFIG_OPAL_PRD) += opal-prd.o
> diff --git a/arch/powerpc/platforms/powernv/memtrace.c
> b/arch/powerpc/platforms/powernv/memtrace.c
> new file mode 100644
> index 0000000..fdba3ee
> --- /dev/null
> +++ b/arch/powerpc/platforms/powernv/memtrace.c
> @@ -0,0 +1,285 @@
> +/*
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * Copyright (C) IBM Corporation, 2014
> + *
> + * Author: Anton Blanchard <an...@au.ibm.com>
> + */
> +
> +#define pr_fmt(fmt) "powernv-memtrace: " fmt
> +
> +#include <linux/bitops.h>
> +#include <linux/string.h>
> +#include <linux/memblock.h>
> +#include <linux/init.h>
> +#include <linux/moduleparam.h>
> +#include <linux/fs.h>
> +#include <linux/debugfs.h>
> +#include <linux/slab.h>
> +#include <linux/memory.h>
> +#include <linux/memory_hotplug.h>
> +#include <asm/machdep.h>
> +
> +struct memtrace_entry {
> +        void *mem;
> +        u64 start;
> +        u64 size;
> +        u32 nid;
> +        struct dentry *dir;
> +        char name[16];
> +};
> +
> +static struct memtrace_entry *memtrace_array;
> +static unsigned int memtrace_array_nr;
> +
> +static ssize_t memtrace_read(struct file *filp, char __user *ubuf,
> +                             size_t count, loff_t *ppos)
> +{
> +        struct memtrace_entry *ent = filp->private_data;
> +
> +        return simple_read_from_buffer(ubuf, count, ppos, ent->mem,
> +                                       ent->size);
> +}
> +
> +static bool valid_memtrace_range(struct memtrace_entry *dev,
> +                                 unsigned long start, unsigned long size)
> +{
> +        if ((dev->start <= start) &&
> +            ((start + size) <= (dev->start + dev->size)))
> +                return true;
> +
> +        return false;
> +}
> +
> +static int memtrace_mmap(struct file *filp, struct vm_area_struct *vma)
> +{
> +        unsigned long size = vma->vm_end - vma->vm_start;
> +        struct memtrace_entry *dev = filp->private_data;
> +
> +        if (!valid_memtrace_range(dev, vma->vm_pgoff << PAGE_SHIFT, size))
> +                return -EINVAL;
> +
> +        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
> +
> +        if (io_remap_pfn_range(vma, vma->vm_start,
> +                               vma->vm_pgoff + (dev->start >> PAGE_SHIFT),
> +                               size, vma->vm_page_prot))
> +                return -EAGAIN;
> +
> +        return 0;
> +}
> +
> +static const struct file_operations memtrace_fops = {
> +        .llseek = default_llseek,
> +        .read = memtrace_read,
> +        .mmap = memtrace_mmap,
> +        .open = simple_open,
> +};
> +
> +static void flush_memory_region(u64 base, u64 size)
> +{
> +        unsigned long line_size = ppc64_caches.dline_size;
> +        u64 end = base + size;
> +        u64 addr;
> +
> +        base = round_down(base, line_size);
> +        end = round_up(end, line_size);
> +
> +        for (addr = base; addr < end; addr += line_size)
> +                asm volatile("dcbf 0,%0" : "=r" (addr) :: "memory");
> +}
> +
> +static int check_memblock_online(struct memory_block *mem, void *arg)
> +{
> +        if (mem->state != MEM_ONLINE)
> +                return -1;
> +
> +        return 0;
> +}
> +
> +static int change_memblock_state(struct memory_block *mem, void *arg)
> +{
> +        unsigned long state = (unsigned long)arg;
> +
> +        mem->state = state;
> +        return 0;
> +}
> +
> +static bool memtrace_offline_pages(u32 nid, u64 start_pfn, u64 nr_pages)
> +{
> +        u64 end_pfn = start_pfn + nr_pages - 1;
> +
> +        if (walk_memory_range(start_pfn, end_pfn, NULL,
> +                              check_memblock_online))
> +                return false;
> +
> +        walk_memory_range(start_pfn, end_pfn, (void *)MEM_GOING_OFFLINE,
> +                          change_memblock_state);
> +
> +        if (offline_pages(start_pfn, nr_pages)) {
> +                walk_memory_range(start_pfn, end_pfn, (void *)MEM_ONLINE,
> +                                  change_memblock_state);
> +                return false;
> +        }
> +
> +        walk_memory_range(start_pfn, end_pfn, (void *)MEM_OFFLINE,
> +                          change_memblock_state);
> +
> +        /* RCU grace period? */
> +        flush_memory_region((u64)__va(start_pfn << PAGE_SHIFT),
> +                            nr_pages << PAGE_SHIFT);
> +
> +        remove_memory(nid, start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT);
> +
> +        return true;
> +}
> +
> +static u64 memtrace_alloc_node(u32 nid, u64 size)
> +{
> +        u64 start_pfn, end_pfn, nr_pages;
> +        u64 base_pfn;
> +

I wonder if it makes more sense to find a free region in the node via the
allocator and then offline it, but that can be racy as well, I suppose.

> +        if (!NODE_DATA(nid) || !node_spanned_pages(nid))
> +                return 0;
> +
> +        start_pfn = node_start_pfn(nid);
> +        end_pfn = node_end_pfn(nid);
> +        nr_pages = size >> PAGE_SHIFT;
> +
> +        /* Trace memory needs to be aligned to the size */
> +        start_pfn = round_up(start_pfn, nr_pages);
> +
> +        for (base_pfn = start_pfn; base_pfn < end_pfn; base_pfn += nr_pages) {
> +                if (memtrace_offline_pages(nid, base_pfn, nr_pages) == true)
> +                        return base_pfn << PAGE_SHIFT;
> +        }
> +
> +        return 0;
> +}
> +
> +static int memtrace_init_regions_runtime(u64 size)
> +{
> +        u64 m;
> +        u32 nid;
> +
> +        memtrace_array = kzalloc(sizeof(struct memtrace_entry) *
> +                                 num_online_nodes(), GFP_KERNEL);
> +        if (!memtrace_array) {
> +                pr_err("Failed to allocate memtrace_array\n");
> +                return -EINVAL;
> +        }
> +
> +        for_each_online_node(nid) {

What makes this hotplug safe?

> +                m = memtrace_alloc_node(nid, size);
> +                /*
> +                 * A node might not have any local memory, so warn but
> +                 * continue on.
> +                 */

Honestly, I think we need some more design around this. The patches work, but
I am not quite sure if there is a better way to release memory. I've not
looked at the memblock APIs, but I am concerned about blowing holes in memory
that might impact our organization of zones - although we generally only have
ZONE_DMA, so we should be good. What happens if we end up off-lining all our
CMA memory? I don't know if we are guarded against it.

Balbir Singh
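P.S. To make the CMA worry above concrete, one option (purely an untested
sketch on my part, not something in this patch) would be for
memtrace_alloc_node() to skip candidate ranges that contain CMA pageblocks
before trying to offline them:

        /* sketch: refuse ranges that overlap CMA pageblocks */
        static bool range_has_cma(u64 start_pfn, u64 nr_pages)
        {
                u64 pfn;

                for (pfn = start_pfn; pfn < start_pfn + nr_pages;
                     pfn += pageblock_nr_pages) {
                        if (!pfn_valid(pfn))
                                continue;
                        if (is_migrate_cma(get_pageblock_migratetype(pfn_to_page(pfn))))
                                return true;
                }
                return false;
        }

Something along those lines, called before memtrace_offline_pages(), would at
least stop us from silently taking the CMA region away.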