On Mon, 2014-11-17 at 15:54 -0600, Nathan Fontenot wrote: > Move handling of memory hotplug add on pseries completely into the kernel. > > The current memory hotplug add path involves the drmgr command doing part > of this work in userspace and requesting the kernel to do additional pieces. > This patch allows us to handle the act completely in the kernel via rtas > hotplug events. This allows us to perform the operation faster and provide > a common memory hotplug add path for PowerVM and PowerKVM systems. > > The patch does introduce a static rtas_hp_event variable that is set to > true when updating the device tree during memory hotplug initiated from > a rtas hotplug event. This is needed because we do not need to do the > work in the of notifier, this work is already performed in handling the > hotplug request. At a later time we can remove this when we deprecate the > previous method of memory hotplug. > > Signed-off-by: Nathan Fontenot <nf...@linux.vnet.ibm.com> > --- > arch/powerpc/platforms/pseries/hotplug-memory.c | 244 > +++++++++++++++++++++++ > 1 file changed, 243 insertions(+), 1 deletion(-) > > diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c > b/arch/powerpc/platforms/pseries/hotplug-memory.c > index 69d178b..b57d42b 100644 > --- a/arch/powerpc/platforms/pseries/hotplug-memory.c > +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c > @@ -16,6 +16,7 @@ > #include <linux/memblock.h> > #include <linux/memory.h> > #include <linux/memory_hotplug.h> > +#include <linux/slab.h> > > #include <asm/firmware.h> > #include <asm/machdep.h> > @@ -23,6 +24,8 @@ > #include <asm/sparsemem.h> > #include "pseries.h" > > +static bool rtas_hp_event; > + > unsigned long pseries_memory_block_size(void) > { > struct device_node *np; > @@ -66,6 +69,52 @@ unsigned long pseries_memory_block_size(void) > return memblock_size; > } > > +static void dlpar_free_drconf_property(struct property *prop) > +{ > + kfree(prop->name); > + kfree(prop->value); > + kfree(prop); > +} > + > +static struct property *dlpar_clone_drconf_property(struct device_node *dn) > +{ > + struct property *prop, *new_prop; > + > + prop = of_find_property(dn, "ibm,dynamic-memory", NULL); > + if (!prop) > + return NULL; > + > + new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL); > + if (!new_prop) > + return NULL; > + > + new_prop->name = kstrdup(prop->name, GFP_KERNEL); > + new_prop->value = kmalloc(prop->length, GFP_KERNEL); > + if (!new_prop->name || !new_prop->value) { > + dlpar_free_drconf_property(new_prop); > + return NULL; > + } > + > + memcpy(new_prop->value, prop->value, prop->length); > + new_prop->length = prop->length; > + > + return new_prop; > +} > + > +static struct memory_block *lmb_to_memblock(struct of_drconf_cell *lmb) > +{ > + unsigned long section_nr; > + struct mem_section *mem_sect; > + struct memory_block *mem_block; > + u64 phys_addr = be64_to_cpu(lmb->base_addr); > + > + section_nr = pfn_to_section_nr(PFN_DOWN(phys_addr)); > + mem_sect = __nr_to_section(section_nr); > + > + mem_block = find_memory_block(mem_sect); > + return mem_block; > +} > + > #ifdef CONFIG_MEMORY_HOTREMOVE > static int pseries_remove_memblock(unsigned long base, unsigned int > memblock_size) > { > @@ -136,19 +185,209 @@ static inline int pseries_remove_mem_node(struct > device_node *np) > } > #endif /* CONFIG_MEMORY_HOTREMOVE */ > > +static int dlpar_add_lmb(struct of_drconf_cell *lmb) > +{ > + struct memory_block *mem_block; > + u64 phys_addr; > + uint32_t drc_index; I started commenting this and it turns out you've used uint32_t almost everywhere for values you've pulled from the device tree or the elog. These should be u32. > + unsigned long pages_per_block; > + unsigned long block_sz; > + int nid, sections_per_block; > + int rc; > + > + if (be32_to_cpu(lmb->flags) & DRCONF_MEM_ASSIGNED) > + return -EINVAL; > + > + phys_addr = be64_to_cpu(lmb->base_addr); > + drc_index = be32_to_cpu(lmb->drc_index); > + block_sz = memory_block_size_bytes(); > + sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; > + pages_per_block = PAGES_PER_SECTION * sections_per_block; > + Perhaps I'm being a bit slow here but it isn't exactly clear what you're getting with all those variables and could you explain what that statement below is checking for?
> + if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1)) > + return -EINVAL; > + > + rc = dlpar_acquire_drc(drc_index); > + if (rc) > + return rc; > + > + /* Find the node id for this address */ > + nid = memory_add_physaddr_to_nid(phys_addr); > + > + /* Add the memory */ > + rc = add_memory(nid, phys_addr, block_sz); > + if (rc) { > + dlpar_release_drc(drc_index); > + return rc; > + } > + > + /* Register this block of memory */ > + rc = memblock_add(phys_addr, block_sz); > + if (rc) { > + remove_memory(nid, phys_addr, block_sz); > + dlpar_release_drc(drc_index); > + return rc; > + } > + > + mem_block = lmb_to_memblock(lmb); > + if (!mem_block) { > + remove_memory(nid, phys_addr, block_sz); > + dlpar_release_drc(drc_index); > + return -EINVAL; > + } > + > + rc = device_online(&mem_block->dev); > + put_device(&mem_block->dev); > + if (rc) { > + remove_memory(nid, phys_addr, block_sz); > + dlpar_release_drc(drc_index); > + return rc; > + } > + > + lmb->flags |= cpu_to_be32(DRCONF_MEM_ASSIGNED); > + return 0; > +} > + > +static int dlpar_memory_add_by_count(struct pseries_hp_errorlog *hp_elog, > + struct property *prop) > +{ > + struct of_drconf_cell *lmbs; > + uint32_t num_lmbs; > + __be32 *p; > + int i, lmbs_to_add; > + int lmbs_available = 0; > + int lmbs_added = 0; > + int rc; > + > + lmbs_to_add = be32_to_cpu(hp_elog->_drc_u.drc_count); Didn't you already do the endian conversion back in handle_dlpar_errorlog? > + pr_info("Attempting to hot-add %d LMB(s)\n", lmbs_to_add); > + > + if (lmbs_to_add == 0) > + return -EINVAL; > + > + p = prop->value; > + num_lmbs = be32_to_cpu(*p++); > + lmbs = (struct of_drconf_cell *)p; > + > + /* Validate that there are enough LMBs to satisfy the request */ > + for (i = 0; i < num_lmbs; i++) { > + if (!(be32_to_cpu(lmbs[i].flags) & DRCONF_MEM_ASSIGNED)) > + lmbs_available++; > + } > + > + if (lmbs_available < lmbs_to_add) > + return -EINVAL; > + I'm wondering why the next 3 lines when the if could be incorporated into the for condition for (i = 0; i < num_lmbs && lmbs_to_add != lmbs_added; i++) { (or lmbs_to_add < lmbs_added)... > + for (i = 0; i < num_lmbs; i++) { > + if (lmbs_to_add == lmbs_added) > + break; > + > + rc = dlpar_add_lmb(&lmbs[i]); > + if (rc) > + continue; > + > + lmbs_added++; > + pr_info("Memory at %llx (drc index %x) has been hot-added\n", > + be64_to_cpu(lmbs[i].base_addr), > + be32_to_cpu(lmbs[i].drc_index)); This message feels a tad premature, you're going to roll back if the requested amount couldn't be added which is good but then there's going to be a confusing mix of 'hot-added' followed by 'hot-removed'. Could this message be moved to when you clear the reserved fields? > + > + /* Mark this lmb so we can remove it later if all of the > + * requested LMBs cannot be added. > + */ > + lmbs[i].reserved = 1; Writing 1 like that into a __be variable will upset sparse. As a more general comment that it might be worth not passing around __be structures and convert them all into CPU endian in one place so that all these functions can do away with the endian conversion calls. > + } > + > + if (lmbs_added != lmbs_to_add) { > + /* TODO: remove added lmbs */ > + rc = -EINVAL; > + } > + > + /* Clear the reserved fields */ > + for (i = 0; i < num_lmbs; i++) > + lmbs[i].reserved = 0; > + > + return rc; > +} > + > +static int dlpar_memory_add_by_index(struct pseries_hp_errorlog *hp_elog, > + struct property *prop) > +{ > + struct of_drconf_cell *lmbs; > + uint32_t num_lmbs, drc_index; > + __be32 *p; > + int i, lmb_found; > + int rc; > + > + drc_index = be32_to_cpu(hp_elog->_drc_u.drc_index); Didn't you already do the endian conversion back in handle_dlpar_errorlog? > + pr_info("Attempting to hot-add LMB, drc index %x\n", drc_index); > + > + p = prop->value; > + num_lmbs = be32_to_cpu(*p++); > + lmbs = (struct of_drconf_cell *)p; > + > + lmb_found = 0; > + for (i = 0; i < num_lmbs; i++) { > + if (lmbs[i].drc_index == hp_elog->_drc_u.drc_index) { Probably wanted to use your local variable drc_index here also lmbs[i].drc_index is __be and I don't think you've converted it but the other side of the condition has been... > + lmb_found = 1; > + rc = dlpar_add_lmb(&lmbs[i]); > + break; > + } > + } > + > + if (!lmb_found) > + rc = -EINVAL; > + > + if (rc) > + pr_info("Failed to hot-add memory, drc index %x\n", drc_index); > + else > + pr_info("Memory at %llx (drc index %x) has been hot-added\n", > + be64_to_cpu(lmbs[i].base_addr), drc_index); > + > + return rc; > +} > + > int dlpar_memory(struct pseries_hp_errorlog *hp_elog) > { > - int rc = 0; > + struct device_node *dn; > + struct property *prop; > + int rc; > > lock_device_hotplug(); > > + dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); > + if (!dn) > + return -EINVAL; > + > + prop = dlpar_clone_drconf_property(dn); > + if (!prop) { > + of_node_put(dn); > + return -EINVAL; > + } > + > switch (hp_elog->action) { > + case PSERIES_HP_ELOG_ACTION_ADD: > + if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) > + rc = dlpar_memory_add_by_count(hp_elog, prop); > + else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) > + rc = dlpar_memory_add_by_index(hp_elog, prop); > + else > + rc = -EINVAL; > + break; > default: > pr_err("Invalid action (%d) specified\n", hp_elog->action); > rc = -EINVAL; > break; > } > > + if (rc) > + dlpar_free_drconf_property(prop); > + else { > + rtas_hp_event = true; > + of_update_property(dn, prop); > + rtas_hp_event = false; > + } > + > + of_node_put(dn); > unlock_device_hotplug(); > return rc; > } > @@ -193,6 +432,9 @@ static int pseries_update_drconf_memory(struct > of_prop_reconfig *pr) > __be32 *p; > int i, rc = -EINVAL; > > + if (rtas_hp_event) > + return 0; > + > memblock_size = pseries_memory_block_size(); > if (!memblock_size) > return -EINVAL; > > _______________________________________________ > Linuxppc-dev mailing list > Linuxppc-dev@lists.ozlabs.org > https://lists.ozlabs.org/listinfo/linuxppc-dev _______________________________________________ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev