On 11/21/2014 01:49 AM, Cyril Bur wrote: > > On Mon, 2014-11-17 at 15:54 -0600, Nathan Fontenot wrote: >> Move handling of memory hotplug add on pseries completely into the kernel. >> >> The current memory hotplug add path involves the drmgr command doing part >> of this work in userspace and requesting the kernel to do additional pieces. >> This patch allows us to handle the act completely in the kernel via rtas >> hotplug events. This allows us to perform the operation faster and provide >> a common memory hotplug add path for PowerVM and PowerKVM systems. >> >> The patch does introduce a static rtas_hp_event variable that is set to >> true when updating the device tree during memory hotplug initiated from >> a rtas hotplug event. This is needed because we do not need to do the >> work in the of notifier, this work is already performed in handling the >> hotplug request. At a later time we can remove this when we deprecate the >> previous method of memory hotplug. >> >> Signed-off-by: Nathan Fontenot <nf...@linux.vnet.ibm.com> >> --- >> arch/powerpc/platforms/pseries/hotplug-memory.c | 244 >> +++++++++++++++++++++++ >> 1 file changed, 243 insertions(+), 1 deletion(-) >> >> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c >> b/arch/powerpc/platforms/pseries/hotplug-memory.c >> index 69d178b..b57d42b 100644 >> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c >> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c >> @@ -16,6 +16,7 @@ >> #include <linux/memblock.h> >> #include <linux/memory.h> >> #include <linux/memory_hotplug.h> >> +#include <linux/slab.h> >> >> #include <asm/firmware.h> >> #include <asm/machdep.h> >> @@ -23,6 +24,8 @@ >> #include <asm/sparsemem.h> >> #include "pseries.h" >> >> +static bool rtas_hp_event; >> + >> unsigned long pseries_memory_block_size(void) >> { >> struct device_node *np; >> @@ -66,6 +69,52 @@ unsigned long pseries_memory_block_size(void) >> return memblock_size; >> } >> >> +static void dlpar_free_drconf_property(struct property *prop) >> +{ >> + kfree(prop->name); >> + kfree(prop->value); >> + kfree(prop); >> +} >> + >> +static struct property *dlpar_clone_drconf_property(struct device_node *dn) >> +{ >> + struct property *prop, *new_prop; >> + >> + prop = of_find_property(dn, "ibm,dynamic-memory", NULL); >> + if (!prop) >> + return NULL; >> + >> + new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL); >> + if (!new_prop) >> + return NULL; >> + >> + new_prop->name = kstrdup(prop->name, GFP_KERNEL); >> + new_prop->value = kmalloc(prop->length, GFP_KERNEL); >> + if (!new_prop->name || !new_prop->value) { >> + dlpar_free_drconf_property(new_prop); >> + return NULL; >> + } >> + >> + memcpy(new_prop->value, prop->value, prop->length); >> + new_prop->length = prop->length; >> + >> + return new_prop; >> +} >> + >> +static struct memory_block *lmb_to_memblock(struct of_drconf_cell *lmb) >> +{ >> + unsigned long section_nr; >> + struct mem_section *mem_sect; >> + struct memory_block *mem_block; >> + u64 phys_addr = be64_to_cpu(lmb->base_addr); >> + >> + section_nr = pfn_to_section_nr(PFN_DOWN(phys_addr)); >> + mem_sect = __nr_to_section(section_nr); >> + >> + mem_block = find_memory_block(mem_sect); >> + return mem_block; >> +} >> + >> #ifdef CONFIG_MEMORY_HOTREMOVE >> static int pseries_remove_memblock(unsigned long base, unsigned int >> memblock_size) >> { >> @@ -136,19 +185,209 @@ static inline int pseries_remove_mem_node(struct >> device_node *np) >> } >> #endif /* CONFIG_MEMORY_HOTREMOVE */ >> >> +static int dlpar_add_lmb(struct of_drconf_cell *lmb) >> +{ >> + struct memory_block *mem_block; >> + u64 phys_addr; >> + uint32_t drc_index; > I started commenting this and it turns out you've used uint32_t almost > everywhere for values you've pulled from the device tree or the elog. > These should be u32.
ok, that should be an easy fix. >> + unsigned long pages_per_block; >> + unsigned long block_sz; >> + int nid, sections_per_block; >> + int rc; >> + >> + if (be32_to_cpu(lmb->flags) & DRCONF_MEM_ASSIGNED) >> + return -EINVAL; >> + >> + phys_addr = be64_to_cpu(lmb->base_addr); >> + drc_index = be32_to_cpu(lmb->drc_index); >> + block_sz = memory_block_size_bytes(); >> + sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; >> + pages_per_block = PAGES_PER_SECTION * sections_per_block; >> + > Perhaps I'm being a bit slow here but it isn't exactly clear what you're > getting with all those variables and could you explain what that > statement below is checking for? > >> + if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1)) >> + return -EINVAL; We could probably get rid of this check, The values were are validating come from the device tree and if it's wrong we have more serious problems than a failure in mem hotplug. This would also remove the need for the pages_per_block and sections_per_block variables. >> + >> + rc = dlpar_acquire_drc(drc_index); >> + if (rc) >> + return rc; >> + >> + /* Find the node id for this address */ >> + nid = memory_add_physaddr_to_nid(phys_addr); >> + >> + /* Add the memory */ >> + rc = add_memory(nid, phys_addr, block_sz); >> + if (rc) { >> + dlpar_release_drc(drc_index); >> + return rc; >> + } >> + >> + /* Register this block of memory */ >> + rc = memblock_add(phys_addr, block_sz); >> + if (rc) { >> + remove_memory(nid, phys_addr, block_sz); >> + dlpar_release_drc(drc_index); >> + return rc; >> + } >> + >> + mem_block = lmb_to_memblock(lmb); >> + if (!mem_block) { >> + remove_memory(nid, phys_addr, block_sz); >> + dlpar_release_drc(drc_index); >> + return -EINVAL; >> + } >> + >> + rc = device_online(&mem_block->dev); >> + put_device(&mem_block->dev); >> + if (rc) { >> + remove_memory(nid, phys_addr, block_sz); >> + dlpar_release_drc(drc_index); >> + return rc; >> + } >> + >> + lmb->flags |= cpu_to_be32(DRCONF_MEM_ASSIGNED); >> + return 0; >> +} >> + >> +static int dlpar_memory_add_by_count(struct pseries_hp_errorlog *hp_elog, >> + struct property *prop) >> +{ >> + struct of_drconf_cell *lmbs; >> + uint32_t num_lmbs; >> + __be32 *p; >> + int i, lmbs_to_add; >> + int lmbs_available = 0; >> + int lmbs_added = 0; >> + int rc; >> + >> + lmbs_to_add = be32_to_cpu(hp_elog->_drc_u.drc_count); > Didn't you already do the endian conversion back in > handle_dlpar_errorlog? Yep, already fixed in the next patchset. >> + pr_info("Attempting to hot-add %d LMB(s)\n", lmbs_to_add); >> + >> + if (lmbs_to_add == 0) >> + return -EINVAL; >> + >> + p = prop->value; >> + num_lmbs = be32_to_cpu(*p++); >> + lmbs = (struct of_drconf_cell *)p; >> + >> + /* Validate that there are enough LMBs to satisfy the request */ >> + for (i = 0; i < num_lmbs; i++) { >> + if (!(be32_to_cpu(lmbs[i].flags) & DRCONF_MEM_ASSIGNED)) >> + lmbs_available++; >> + } >> + >> + if (lmbs_available < lmbs_to_add) >> + return -EINVAL; >> + > I'm wondering why the next 3 lines when the if could be incorporated > into the for condition > for (i = 0; i < num_lmbs && lmbs_to_add != lmbs_added; i++) { > (or lmbs_to_add < lmbs_added)... It could. I tend to put less in the for() statement. Just my preference on readability. >> + for (i = 0; i < num_lmbs; i++) { >> + if (lmbs_to_add == lmbs_added) >> + break; >> + >> + rc = dlpar_add_lmb(&lmbs[i]); >> + if (rc) >> + continue; >> + >> + lmbs_added++; >> + pr_info("Memory at %llx (drc index %x) has been hot-added\n", >> + be64_to_cpu(lmbs[i].base_addr), >> + be32_to_cpu(lmbs[i].drc_index)); > This message feels a tad premature, you're going to roll back if the > requested amount couldn't be added which is good but then there's going > to be a confusing mix of 'hot-added' followed by 'hot-removed'. Could > this message be moved to when you clear the reserved fields? Ahh... good point. I will do that. >> + >> + /* Mark this lmb so we can remove it later if all of the >> + * requested LMBs cannot be added. >> + */ >> + lmbs[i].reserved = 1; > Writing 1 like that into a __be variable will upset sparse. > > As a more general comment that it might be worth not passing around __be > structures and convert them all into CPU endian in one place so that all > these functions can do away with the endian conversion calls. This may be possible. I will have to look at how the struct is passed around what is used in it. It would be nice to have fewer conversion calls. >> + } >> + >> + if (lmbs_added != lmbs_to_add) { >> + /* TODO: remove added lmbs */ >> + rc = -EINVAL; >> + } >> + >> + /* Clear the reserved fields */ >> + for (i = 0; i < num_lmbs; i++) >> + lmbs[i].reserved = 0; >> + >> + return rc; >> +} >> + >> +static int dlpar_memory_add_by_index(struct pseries_hp_errorlog *hp_elog, >> + struct property *prop) >> +{ >> + struct of_drconf_cell *lmbs; >> + uint32_t num_lmbs, drc_index; >> + __be32 *p; >> + int i, lmb_found; >> + int rc; >> + >> + drc_index = be32_to_cpu(hp_elog->_drc_u.drc_index); > Didn't you already do the endian conversion back in > handle_dlpar_errorlog? >> + pr_info("Attempting to hot-add LMB, drc index %x\n", drc_index); >> + >> + p = prop->value; >> + num_lmbs = be32_to_cpu(*p++); >> + lmbs = (struct of_drconf_cell *)p; >> + >> + lmb_found = 0; >> + for (i = 0; i < num_lmbs; i++) { >> + if (lmbs[i].drc_index == hp_elog->_drc_u.drc_index) { > Probably wanted to use your local variable drc_index here also > lmbs[i].drc_index is __be and I don't think you've converted it but the > other side of the condition has been... Yep. -Nathan >> + lmb_found = 1; >> + rc = dlpar_add_lmb(&lmbs[i]); >> + break; >> + } >> + } >> + >> + if (!lmb_found) >> + rc = -EINVAL; >> + >> + if (rc) >> + pr_info("Failed to hot-add memory, drc index %x\n", drc_index); >> + else >> + pr_info("Memory at %llx (drc index %x) has been hot-added\n", >> + be64_to_cpu(lmbs[i].base_addr), drc_index); >> + >> + return rc; >> +} >> + >> int dlpar_memory(struct pseries_hp_errorlog *hp_elog) >> { >> - int rc = 0; >> + struct device_node *dn; >> + struct property *prop; >> + int rc; >> >> lock_device_hotplug(); >> >> + dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); >> + if (!dn) >> + return -EINVAL; >> + >> + prop = dlpar_clone_drconf_property(dn); >> + if (!prop) { >> + of_node_put(dn); >> + return -EINVAL; >> + } >> + >> switch (hp_elog->action) { >> + case PSERIES_HP_ELOG_ACTION_ADD: >> + if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) >> + rc = dlpar_memory_add_by_count(hp_elog, prop); >> + else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) >> + rc = dlpar_memory_add_by_index(hp_elog, prop); >> + else >> + rc = -EINVAL; >> + break; >> default: >> pr_err("Invalid action (%d) specified\n", hp_elog->action); >> rc = -EINVAL; >> break; >> } >> >> + if (rc) >> + dlpar_free_drconf_property(prop); >> + else { >> + rtas_hp_event = true; >> + of_update_property(dn, prop); >> + rtas_hp_event = false; >> + } >> + >> + of_node_put(dn); >> unlock_device_hotplug(); >> return rc; >> } >> @@ -193,6 +432,9 @@ static int pseries_update_drconf_memory(struct >> of_prop_reconfig *pr) >> __be32 *p; >> int i, rc = -EINVAL; >> >> + if (rtas_hp_event) >> + return 0; >> + >> memblock_size = pseries_memory_block_size(); >> if (!memblock_size) >> return -EINVAL; >> >> _______________________________________________ >> Linuxppc-dev mailing list >> Linuxppc-dev@lists.ozlabs.org >> https://lists.ozlabs.org/listinfo/linuxppc-dev > > > _______________________________________________ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev