[PATCH v2 6/6] pseries: Implement memory hotplug remove in the kernel

Cyril Bur cyril.bur at au1.ibm.com
Tue Nov 25 09:05:31 AEDT 2014


On Mon, 2014-11-24 at 09:03 -0600, Nathan Fontenot wrote:
> 
> On 11/21/2014 01:49 AM, Cyril Bur wrote:
> > 
> > On Mon, 2014-11-17 at 15:56 -0600, Nathan Fontenot wrote:
> >> Move handling of memory hotplug remove on pseries completely into the kernel.
> >>
> >> The current memory hotplug remove path involves the drmgr command doing part
> >> of this work in userspace and requesting the kernel to do additional pieces.
> >> This patch allows us to handle the act completely in the kernel via rtas
> >> hotplug events. This allows us to perform the operation faster and provide
> >> a common memory hotplug remove path for PowerVM and PowerKVM systems.
> >>
> >> Signed-off-by: Nathan Fontenot <nfont at linux.vnet.ibm.com>
> >> ---
> >>  arch/powerpc/platforms/pseries/hotplug-memory.c |  206 ++++++++++++++++++++++-
> >>  1 file changed, 201 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
> >> index b57d42b..c8189e8 100644
> >> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> >> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> >> @@ -173,6 +173,179 @@ static int pseries_remove_mem_node(struct device_node *np)
> >>  	pseries_remove_memblock(base, lmb_size);
> >>  	return 0;
> >>  }
> >> +
> >> +static int lmb_is_removable(struct of_drconf_cell *lmb)
> >> +{
> >> +	int i, scns_per_block;
> >> +	int rc = 1;
> >> +	unsigned long pfn, block_sz;
> >> +	u64 phys_addr;
> >> +
> >> +	if (!(be32_to_cpu(lmb->flags) & DRCONF_MEM_ASSIGNED))
> >> +		return -1;
> > This makes me kind of nervous. You're using the return value of
> > lmb_is_removable as a boolean but it returns three possible values -1,0
> > and 1. Functionally it looks correct to me so its not a massive issue
> 
> Oh yuck. this should be a boolean return. I'll fix that.
> 
> >> +
> >> +	phys_addr = be64_to_cpu(lmb->base_addr);
> >> +	block_sz = memory_block_size_bytes();
> >> +	scns_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
> >> +
> >> +	for (i = 0; i < scns_per_block; i++) {
> >> +		pfn = PFN_DOWN(phys_addr);
> >> +		if (!pfn_present(pfn))
> >> +			continue;
> >> +
> >> +		rc &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
> >> +		phys_addr += MIN_MEMORY_BLOCK_SIZE;
> >> +	}
> >> +
> >> +	return rc;
> >> +}
> >> +
> >> +static int dlpar_add_lmb(struct of_drconf_cell *);
> >> +
> >> +static int dlpar_remove_lmb(struct of_drconf_cell *lmb)
> >> +{
> >> +	struct memory_block *mem_block;
> >> +	unsigned long block_sz;
> >> +	u64 phys_addr;
> >> +	uint32_t drc_index;
> >> +	int nid, rc;
> >> +
> >> +	if (!lmb_is_removable(lmb))
> >> +		return -EINVAL;
> >> +
> >> +	phys_addr = be64_to_cpu(lmb->base_addr);
> >> +	drc_index = be32_to_cpu(lmb->drc_index);
> >> +
> >> +	mem_block = lmb_to_memblock(lmb);
> >> +	if (!mem_block)
> >> +		return -EINVAL;
> >> +
> >> +	rc = device_offline(&mem_block->dev);
> >> +	put_device(&mem_block->dev);
> >> +	if (rc)
> >> +		return rc;
> >> +
> >> +	block_sz = pseries_memory_block_size();
> >> +	nid = memory_add_physaddr_to_nid(phys_addr);
> >> +
> >> +	remove_memory(nid, phys_addr, block_sz);
> >> +
> >> +	/* Update memory regions for memory remove */
> >> +	memblock_remove(phys_addr, block_sz);
> >> +
> >> +	dlpar_release_drc(drc_index);
> >> +
> >> +	lmb->flags &= cpu_to_be32(~DRCONF_MEM_ASSIGNED);
> >> +	pr_info("Memory at %llx (drc index %x) has been hot-removed\n",
> >> +		be64_to_cpu(lmb->base_addr), drc_index);
> > dlpar_add_lmb doesn't print anything but dlpar_remove_lmb does? Related
> > to my comment about printing a 'hot-add' messages prematurely, perhaps
> > move this to the callers
> 
> Yes, I'll update the remove mem messages also.
> 
> >> +
> >> +	return 0;
> >> +}
> >> +
> >> +static int dlpar_memory_remove_by_count(struct pseries_hp_errorlog *hp_elog,
> >> +					struct property *prop)
> >> +{
> >> +	struct of_drconf_cell *lmbs;
> >> +	int lmbs_to_remove, lmbs_removed = 0;
> >> +	int lmbs_available = 0;
> >> +	uint32_t num_lmbs;
> >> +	__be32 *p;
> >> +	int i, rc;
> >> +
> >> +	lmbs_to_remove = be32_to_cpu(hp_elog->_drc_u.drc_count);
> > Didn't you already do the endian conversion back in
> > handle_dlpar_errorlog?
> >> +	pr_info("Attempting to hot-remove %d LMB(s)\n", lmbs_to_remove);
> >> +
> >> +	if (lmbs_to_remove == 0)
> >> +		return -EINVAL;
> >> +
> >> +	p = prop->value;
> >> +	num_lmbs = be32_to_cpu(*p++);
> >> +	lmbs = (struct of_drconf_cell *)p;
> >> +
> >> +	/* Validate that there are enough LMBs to satisfy the request */
> >> +	for (i = 0; i < num_lmbs; i++) {
> >> +		if (be32_to_cpu(lmbs[i].flags) & DRCONF_MEM_ASSIGNED)
> >> +			lmbs_available++;
> >> +	}
> >> +
> >> +	if (lmbs_available < lmbs_to_remove)
> >> +		return -EINVAL;
> >> +
> >> +	for (i = 0; i < num_lmbs; i++) {
> >> +		if (lmbs_to_remove == lmbs_removed)
> >> +			break;
> >> +
> >> +		rc = dlpar_remove_lmb(&lmbs[i]);
> >> +		if (rc)
> >> +			continue;
> >> +
> >> +		lmbs_removed++;
> >> +
> >> +		/* Mark this lmb so we can add it later if all of the
> >> +		 * requested LMBs cannot be removed.
> >> +		 */
> >> +		lmbs[i].reserved = 1;
> >> +	}
> >> +
> >> +	if (lmbs_removed != lmbs_to_remove) {
> >> +		pr_err("Memory hot-remove failed, adding LMB's back\n");
> >> +
> >> +		for (i = 0; i < num_lmbs; i++) {
> >> +			if (!lmbs[i].reserved)
> >> +				continue;
> >> +
> >> +			rc = dlpar_add_lmb(&lmbs[i]);
> >> +			if (rc)
> > If this happens you this will leave an LMB removed but an error code
> > from this function will cause dlpar_memory to discard the dn prop that
> > it passed in. If you couldn't roll back completely the caller should
> > still update the dn property with the ones that it has left
> > removed/failed to re-add. Perhaps this function needs a mechanism to
> > report success/failure (as it currently does) and a mechanism to say
> > whether or not the dn property is dirty or not.
> 
> Good catch. The device tree should be updated to denote lmb's that were
> added but we failed to remove because of some error.
> 
> >> +				pr_err("Failed to add LMB back, drc index %x\n",
> >> +				       be32_to_cpu(lmbs[i].drc_index));
> >> +
> > On a clean (everything rolled back successfully) failure, I don't think
> > you _need_ to bother although this is nice cleanup, you didn't do it
> > dlpar_memory_add_by_count - I'm in favour of being clean especially
> > since on an 'unclean' failure you should clear that flag - if you're
> > going to still update the device tree.
> 
> I'll re-visit the cleanup code to make sure the device tree property is left
> in the correct state on a failure during roll-back.
> 
> >> +			lmbs[i].reserved = 0;
> >> +		}
> >> +		rc = -EINVAL;
> >> +	} else {
> >> +		/* remove any reserved markings */
> >> +		for (i = 0; i < num_lmbs; i++)
> >> +			lmbs[i].reserved = 0;
> >> +	}
> > Hmmmm since the if statements are checking for failures. It might be
> > more clear to return -EINVAL (rather than setting rc) and the code in
> > the else block can be moved out of it and come to think of it, return 0.
> > Otherwise it looks odd to me with the 'normal' success case code being
> > in an else statement.
> 
> I like this.
> 
> >> +
> >> +	return rc;
> >> +}
> >> +
> >> +static int dlpar_memory_remove_by_index(struct pseries_hp_errorlog *hp_elog,
> >> +					struct property *prop)
> >> +{
> >> +	struct of_drconf_cell *lmbs;
> >> +	uint32_t num_lmbs, drc_index;
> >> +	int lmb_found;
> >> +	__be32 *p;
> >> +	int i, rc;
> >> +
> >> +	drc_index = be32_to_cpu(hp_elog->_drc_u.drc_index);
> > Didn't you already do the endian conversion back in
> > handle_dlpar_errorlog?
> >> +	pr_info("Attempting to hot-remove LMB, drc index %x\n", drc_index);
> >> +
> >> +	p = prop->value;
> >> +	num_lmbs = be32_to_cpu(*p++);
> >> +	lmbs = (struct of_drconf_cell *)p;
> >> +
> >> +	lmb_found = 0;
> >> +	for (i = 0; i < num_lmbs; i++) {
> >> +		if (lmbs[i].drc_index == hp_elog->_drc_u.drc_index) {
> >> +			lmb_found = 1;
> >> +			rc = dlpar_remove_lmb(&lmbs[i]);
> >> +			break;
> >> +		}
> >> +	}
> >> +
> >> +	if (!lmb_found)
> >> +		rc = -EINVAL;
> >> +
> >> +	if (rc)
> >> +		pr_info("Failed to hot-remove memory, drc index %x\n",
> >> +			drc_index);
> >> +
> >> +	return rc;
> >> +}
> >> +
> >>  #else
> >>  static inline int pseries_remove_memblock(unsigned long base,
> >>  					  unsigned int memblock_size)
> >> @@ -183,6 +356,11 @@ static inline int pseries_remove_mem_node(struct device_node *np)
> >>  {
> >>  	return 0;
> >>  }
> >> +static inline int dlpar_memory_remove(struct pseries_hp_errorlog *hp_elog)
> >> +{
> >> +	return -EOPNOTSUPP;
> >> +}
> >> +
> >>  #endif /* CONFIG_MEMORY_HOTREMOVE */
> >>  
> >>  static int dlpar_add_lmb(struct of_drconf_cell *lmb)
> >> @@ -298,14 +476,24 @@ static int dlpar_memory_add_by_count(struct pseries_hp_errorlog *hp_elog,
> >>  	}
> >>  
> >>  	if (lmbs_added != lmbs_to_add) {
> >> -		/* TODO: remove added lmbs */
> >> +		pr_err("Memory hot-add failed, removing any added LMBs\n");
> >> +
> >> +		for (i = 0; i < num_lmbs; i++) {
> >> +			if (!lmbs[i].reserved)
> >> +				continue;
> >> +
> >> +			rc = dlpar_remove_lmb(&lmbs[i]);
> >> +			if (rc)
> > There is a very much related comment in dlpar_memory_remove_by_count
> > about this condition being true.
> >> +				pr_err("Failed to remove LMB, drc index %x\n",
> >> +				       be32_to_cpu(lmbs[i].drc_index));
> > 
> > You don't clear the reserved flag here?
> In the failure case here the device tree property is not updated, so no
> cleanup needed. This is not really obvious so it would probably be best
> to do the cleanup.
>  
Of course except when you fail to roll back and you'll be forced to
update the device tree - but I suppose clearing it here or not really
depends on how you do that...
> > 
> >> +		}
> >>  		rc = -EINVAL;
> > Same observation as in dlpar_memory_remove_by_count
> 
> ok.
> 
> Thanks for the review.
> 
> -Nathan
> 
> >> +	} else {
> >> +		/* Clear the reserved fields */
> >> +		for (i = 0; i < num_lmbs; i++)
> >> +			lmbs[i].reserved = 0;
> >>  	}
> >>  
> >> -	/* Clear the reserved fields */
> >> -	for (i = 0; i < num_lmbs; i++)
> >> -		lmbs[i].reserved = 0;
> >> -
> >>  	return rc;
> >>  }
> >>  
> >> @@ -373,6 +561,14 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
> >>  		else
> >>  			rc = -EINVAL;
> >>  		break;
> >> +	case PSERIES_HP_ELOG_ACTION_REMOVE:
> >> +		if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT)
> >> +			rc = dlpar_memory_remove_by_count(hp_elog, prop);
> >> +		else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX)
> >> +			rc = dlpar_memory_remove_by_index(hp_elog, prop);
> >> +		else
> >> +			rc = -EINVAL;
> >> +		break;
> >>  	default:
> >>  		pr_err("Invalid action (%d) specified\n", hp_elog->action);
> >>  		rc = -EINVAL;
> >>
> >> _______________________________________________
> >> Linuxppc-dev mailing list
> >> Linuxppc-dev at lists.ozlabs.org
> >> https://lists.ozlabs.org/listinfo/linuxppc-dev
> > 
> > 
> > 
> 
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev at lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev




More information about the Linuxppc-dev mailing list