[4/5] pseries: Implement memory hotplug add in the kernel

Michael Ellerman mpe at ellerman.id.au
Wed Sep 17 17:07:19 EST 2014


On Mon, 2014-09-15 at 15:32 -0500, Nathan Fontenot wrote:
> This patch adds the ability to do memory hotplug adding in the kernel.
> 
> Currently the hotplug add/remove of memory is handled by the drmgr
> command. The drmgr command performs the add/remove by performing
> some work in user-space and making requests to the kernel to handle 
> other pieces. By moving all of the work to the kernel we can do the
> add and remove faster, and provide a common place to do memory hotplug
> for both the PowerVM and PowerKVM environments.
> 
> Signed-off-by: Nathan Fontenot <nfont at linux.vnet.ibm.com>
> ---
>  arch/powerpc/platforms/pseries/hotplug-memory.c |  170 +++++++++++++++++++++++
>  1 file changed, 170 insertions(+)
> 
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 0e60e15..b254773 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -17,6 +17,7 @@
>  #include <linux/vmalloc.h>
>  #include <linux/memory.h>
>  #include <linux/memory_hotplug.h>
> +#include <linux/slab.h>
>  
>  #include <asm/firmware.h>
>  #include <asm/machdep.h>
> @@ -24,6 +25,8 @@
>  #include <asm/sparsemem.h>
>  #include <asm/rtas.h>
>  
> +#include "pseries.h"
> +
>  DEFINE_MUTEX(dlpar_mem_mutex);
>  
>  unsigned long pseries_memory_block_size(void)
> @@ -69,6 +72,53 @@ unsigned long pseries_memory_block_size(void)
>  	return memblock_size;
>  }
>  
> +static void dlpar_free_drconf_property(struct property *prop)
> +{
> +	kfree(prop->name);
> +	kfree(prop->value);
> +	kfree(prop);
> +}
> +
> +static struct property *dlpar_clone_drconf_property(struct device_node *dn)
> +{
> +	struct property *prop, *new_prop;
> +
> +	prop = of_find_property(dn, "ibm,dynamic-memory", NULL);
> +	if (!prop)
> +		return NULL;
> +
> +	new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);
> +	if (!new_prop)
> +		return NULL;
> +
> +	new_prop->name = kstrdup(prop->name, GFP_KERNEL);
> +	new_prop->value = kmalloc(prop->length + 1, GFP_KERNEL);
> +	if (!new_prop->name || !new_prop->value) {
> +		dlpar_free_drconf_property(new_prop);
> +		return NULL;
> +	}
> +
> +	memcpy(new_prop->value, prop->value, prop->length);
> +	new_prop->length = prop->length;
> +	*(((char *)new_prop->value) + new_prop->length) = 0;

It's not a string, is it?

> +	return new_prop;
> +}
> +
> +static struct memory_block *lmb_to_memblock(struct of_drconf_cell *lmb)
> +{
> +	unsigned long section_nr;
> +	struct mem_section *mem_sect;
> +	struct memory_block *mem_block;
> +	u64 phys_addr = be64_to_cpu(lmb->base_addr);
> +
> +	section_nr = pfn_to_section_nr(PFN_DOWN(phys_addr));
> +	mem_sect = __nr_to_section(section_nr);
> +
> +	mem_block = find_memory_block(mem_sect);
> +	return mem_block;
> +}
> +
>  #ifdef CONFIG_MEMORY_HOTREMOVE
>  static int pseries_remove_memory(u64 start, u64 size)
>  {
> @@ -155,13 +205,133 @@ static inline int pseries_remove_mem_node(struct device_node *np)
>  }
>  #endif /* CONFIG_MEMORY_HOTREMOVE */
>  
> +static int dlpar_add_one_lmb(struct of_drconf_cell *lmb)
> +{
> +	struct memory_block *mem_block;
> +	u64 phys_addr;
> +	unsigned long pages_per_block;
> +	unsigned long block_sz;
> +	int nid, sections_per_block;
> +	int rc;
> +
> +	phys_addr = be64_to_cpu(lmb->base_addr);

of_drconf_cell needs endian annotations.

> +	block_sz = memory_block_size_bytes();
> +	sections_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
> +	pages_per_block = PAGES_PER_SECTION * sections_per_block;
> +
> +	if (phys_addr & ((pages_per_block << PAGE_SHIFT) - 1))
> +		return -EINVAL;
> +
> +	nid = memory_add_physaddr_to_nid(phys_addr);
> +	rc = add_memory(nid, phys_addr, block_sz);
> +	if (rc)
> +		return rc;
> +
> +	rc = memblock_add(phys_addr, block_sz);
> +	if (rc) {
> +		remove_memory(nid, phys_addr, block_sz);
> +		return rc;
> +	}
> +
> +	mem_block = lmb_to_memblock(lmb);
> +	if (!mem_block) {
> +		remove_memory(nid, phys_addr, block_sz);
> +		return -EINVAL;
> +	}

That could all use a lot of comments. ie. why do we have to add it twice?

> +	rc = device_online(&mem_block->dev);
> +	put_device(&mem_block->dev);
> +	if (rc)
> +		remove_memory(nid, phys_addr, block_sz);
> +
> +	return rc;
> +}
> +
> +static int dlpar_memory_add(struct pseries_hp_errorlog *hp_elog)
> +{
> +	struct of_drconf_cell *lmb;
> +	struct device_node *dn;
> +	struct property *prop;
> +	uint32_t entries, *p;

*p should be __be32.

> +	int i, lmbs_to_add;
> +	int lmbs_added = 0;
> +	int rc = -EINVAL;

Don't pre-initialise your rc variables.

> +	if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT) {
> +		lmbs_to_add = be32_to_cpu(hp_elog->_drc_u.drc_count);
> +		pr_info("Attempting to hot-add %d LMB(s)\n", lmbs_to_add);
> +	} else {
> +		lmbs_to_add = 1;
> +		pr_info("Attempting to hot-add LMB, drc index %x\n",
> +			be32_to_cpu(hp_elog->_drc_u.drc_index));
> +	}
> +
> +	dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
> +	if (!dn)
> +		return -EINVAL;
> +
> +	prop = dlpar_clone_drconf_property(dn);
> +	if (!prop) {
> +		of_node_put(dn);
> +		return -EINVAL;
> +	}
> +
> +	p = prop->value;
> +	entries = be32_to_cpu(*p++);
> +	lmb = (struct of_drconf_cell *)p;


So if I'm reading this right the hp_elog either contains an index or a count of
LMBs to add. But it doesn't contain anything about which address ranges to add
or any of those details. That is all in the ibm,dynamic-memory property - but
how did it get in there?

> +
> +	for (i = 0; i < entries; i++, lmb++) {
> +		u32 drc_index = be32_to_cpu(lmb->drc_index);
> +
> +		if (lmbs_to_add == lmbs_added)
> +			break;
> +
> +		if (be32_to_cpu(lmb->flags) & DRCONF_MEM_ASSIGNED)
> +			continue;
> +
> +		if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX
> +		    && lmb->drc_index != hp_elog->_drc_u.drc_index)
> +			continue;
> +
> +		rc = dlpar_acquire_drc(drc_index);
> +		if (rc)
> +			continue;
> +
> +		rc = dlpar_add_one_lmb(lmb);
> +		if (rc) {
> +			dlpar_release_drc(drc_index);
> +			continue;
> +		}

In both the above error cases you just move along. That means we potentially
hotplugged some memory but not everything that we were asked to. That seems
like a bad idea, we should either do everything or nothing.


> +
> +		lmb->flags |= cpu_to_be32(DRCONF_MEM_ASSIGNED);
> +		lmbs_added++;
> +		pr_info("Memory at %llx (drc index %x) has been hot-added\n",
> +			be64_to_cpu(lmb->base_addr), drc_index);
> +	}
> +
> +	if (lmbs_added)
> +		rc = of_update_property(dn, prop);
> +	else
> +		dlpar_free_drconf_property(prop);

The value of rc here is not clear. It could be EINVAL or it could be the result
of the last dlpar_add_one_lmb(lmb). gcc would have told you that if you hadn't
initialised it.

> +
> +	of_node_put(dn);
> +	return rc ? rc : lmbs_added;

This looks wrong.

Doesn't the rc eventually go back to dlpar_write(), which expects 0 for success?

That should show up as the write failing in userspace.


>  int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
>  {
>  	int rc = 0;

Don't initialise to zero, that way gcc can tell you if there's a path where you
forget to initialise it. It also means you can't accidentally return success.

> +	if (hp_elog->id_type != PSERIES_HP_ELOG_ID_DRC_COUNT
> +	    && hp_elog->id_type != PSERIES_HP_ELOG_ID_DRC_INDEX)
> +		return -EINVAL;

This would look nicer as a switch I think.

>  	mutex_lock(&dlpar_mem_mutex);
>  
>  	switch (hp_elog->action) {
> +	case PSERIES_HP_ELOG_ACTION_ADD:
> +		rc = dlpar_memory_add(hp_elog);
> +		break;
>  	default:
>  		pr_err("Invalid action (%d) specified\n", hp_elog->action);
>  		rc = -EINVAL;
> 

cheers







More information about the Linuxppc-dev mailing list