[PATCH v2 6/6] pseries: Implement memory hotplug remove in the kernel

Nathan Fontenot nfont at linux.vnet.ibm.com
Tue Nov 25 02:03:48 AEDT 2014



On 11/21/2014 01:49 AM, Cyril Bur wrote:
> 
> On Mon, 2014-11-17 at 15:56 -0600, Nathan Fontenot wrote:
>> Move handling of memory hotplug remove on pseries completely into the kernel.
>>
>> The current memory hotplug remove path involves the drmgr command doing part
>> of this work in userspace and requesting the kernel to do additional pieces.
>> This patch allows us to handle the act completely in the kernel via rtas
>> hotplug events. This allows us to perform the operation faster and provide
>> a common memory hotplug remove path for PowerVM and PowerKVM systems.
>>
>> Signed-off-by: Nathan Fontenot <nfont at linux.vnet.ibm.com>
>> ---
>>  arch/powerpc/platforms/pseries/hotplug-memory.c |  206 ++++++++++++++++++++++-
>>  1 file changed, 201 insertions(+), 5 deletions(-)
>>
>> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
>> index b57d42b..c8189e8 100644
>> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
>> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
>> @@ -173,6 +173,179 @@ static int pseries_remove_mem_node(struct device_node *np)
>>  	pseries_remove_memblock(base, lmb_size);
>>  	return 0;
>>  }
>> +
>> +static int lmb_is_removable(struct of_drconf_cell *lmb)
>> +{
>> +	int i, scns_per_block;
>> +	int rc = 1;
>> +	unsigned long pfn, block_sz;
>> +	u64 phys_addr;
>> +
>> +	if (!(be32_to_cpu(lmb->flags) & DRCONF_MEM_ASSIGNED))
>> +		return -1;
> This makes me kind of nervous. You're using the return value of
> lmb_is_removable as a boolean but it returns three possible values -1,0
> and 1. Functionally it looks correct to me so its not a massive issue

Oh yuck. this should be a boolean return. I'll fix that.

>> +
>> +	phys_addr = be64_to_cpu(lmb->base_addr);
>> +	block_sz = memory_block_size_bytes();
>> +	scns_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
>> +
>> +	for (i = 0; i < scns_per_block; i++) {
>> +		pfn = PFN_DOWN(phys_addr);
>> +		if (!pfn_present(pfn))
>> +			continue;
>> +
>> +		rc &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
>> +		phys_addr += MIN_MEMORY_BLOCK_SIZE;
>> +	}
>> +
>> +	return rc;
>> +}
>> +
>> +static int dlpar_add_lmb(struct of_drconf_cell *);
>> +
>> +static int dlpar_remove_lmb(struct of_drconf_cell *lmb)
>> +{
>> +	struct memory_block *mem_block;
>> +	unsigned long block_sz;
>> +	u64 phys_addr;
>> +	uint32_t drc_index;
>> +	int nid, rc;
>> +
>> +	if (!lmb_is_removable(lmb))
>> +		return -EINVAL;
>> +
>> +	phys_addr = be64_to_cpu(lmb->base_addr);
>> +	drc_index = be32_to_cpu(lmb->drc_index);
>> +
>> +	mem_block = lmb_to_memblock(lmb);
>> +	if (!mem_block)
>> +		return -EINVAL;
>> +
>> +	rc = device_offline(&mem_block->dev);
>> +	put_device(&mem_block->dev);
>> +	if (rc)
>> +		return rc;
>> +
>> +	block_sz = pseries_memory_block_size();
>> +	nid = memory_add_physaddr_to_nid(phys_addr);
>> +
>> +	remove_memory(nid, phys_addr, block_sz);
>> +
>> +	/* Update memory regions for memory remove */
>> +	memblock_remove(phys_addr, block_sz);
>> +
>> +	dlpar_release_drc(drc_index);
>> +
>> +	lmb->flags &= cpu_to_be32(~DRCONF_MEM_ASSIGNED);
>> +	pr_info("Memory at %llx (drc index %x) has been hot-removed\n",
>> +		be64_to_cpu(lmb->base_addr), drc_index);
> dlpar_add_lmb doesn't print anything but dlpar_remove_lmb does? Related
> to my comment about printing a 'hot-add' messages prematurely, perhaps
> move this to the callers

Yes, I'll update the remove mem messages also.

>> +
>> +	return 0;
>> +}
>> +
>> +static int dlpar_memory_remove_by_count(struct pseries_hp_errorlog *hp_elog,
>> +					struct property *prop)
>> +{
>> +	struct of_drconf_cell *lmbs;
>> +	int lmbs_to_remove, lmbs_removed = 0;
>> +	int lmbs_available = 0;
>> +	uint32_t num_lmbs;
>> +	__be32 *p;
>> +	int i, rc;
>> +
>> +	lmbs_to_remove = be32_to_cpu(hp_elog->_drc_u.drc_count);
> Didn't you already do the endian conversion back in
> handle_dlpar_errorlog?
>> +	pr_info("Attempting to hot-remove %d LMB(s)\n", lmbs_to_remove);
>> +
>> +	if (lmbs_to_remove == 0)
>> +		return -EINVAL;
>> +
>> +	p = prop->value;
>> +	num_lmbs = be32_to_cpu(*p++);
>> +	lmbs = (struct of_drconf_cell *)p;
>> +
>> +	/* Validate that there are enough LMBs to satisfy the request */
>> +	for (i = 0; i < num_lmbs; i++) {
>> +		if (be32_to_cpu(lmbs[i].flags) & DRCONF_MEM_ASSIGNED)
>> +			lmbs_available++;
>> +	}
>> +
>> +	if (lmbs_available < lmbs_to_remove)
>> +		return -EINVAL;
>> +
>> +	for (i = 0; i < num_lmbs; i++) {
>> +		if (lmbs_to_remove == lmbs_removed)
>> +			break;
>> +
>> +		rc = dlpar_remove_lmb(&lmbs[i]);
>> +		if (rc)
>> +			continue;
>> +
>> +		lmbs_removed++;
>> +
>> +		/* Mark this lmb so we can add it later if all of the
>> +		 * requested LMBs cannot be removed.
>> +		 */
>> +		lmbs[i].reserved = 1;
>> +	}
>> +
>> +	if (lmbs_removed != lmbs_to_remove) {
>> +		pr_err("Memory hot-remove failed, adding LMB's back\n");
>> +
>> +		for (i = 0; i < num_lmbs; i++) {
>> +			if (!lmbs[i].reserved)
>> +				continue;
>> +
>> +			rc = dlpar_add_lmb(&lmbs[i]);
>> +			if (rc)
> If this happens you this will leave an LMB removed but an error code
> from this function will cause dlpar_memory to discard the dn prop that
> it passed in. If you couldn't roll back completely the caller should
> still update the dn property with the ones that it has left
> removed/failed to re-add. Perhaps this function needs a mechanism to
> report success/failure (as it currently does) and a mechanism to say
> whether or not the dn property is dirty or not.

Good catch. The device tree should be updated to denote lmb's that were
added but we failed to remove because of some error.

>> +				pr_err("Failed to add LMB back, drc index %x\n",
>> +				       be32_to_cpu(lmbs[i].drc_index));
>> +
> On a clean (everything rolled back successfully) failure, I don't think
> you _need_ to bother although this is nice cleanup, you didn't do it
> dlpar_memory_add_by_count - I'm in favour of being clean especially
> since on an 'unclean' failure you should clear that flag - if you're
> going to still update the device tree.

I'll re-visit the cleanup code to make sure the device tree property is left
in the correct state on a failure during roll-back.

>> +			lmbs[i].reserved = 0;
>> +		}
>> +		rc = -EINVAL;
>> +	} else {
>> +		/* remove any reserved markings */
>> +		for (i = 0; i < num_lmbs; i++)
>> +			lmbs[i].reserved = 0;
>> +	}
> Hmmmm since the if statements are checking for failures. It might be
> more clear to return -EINVAL (rather than setting rc) and the code in
> the else block can be moved out of it and come to think of it, return 0.
> Otherwise it looks odd to me with the 'normal' success case code being
> in an else statement.

I like this.

>> +
>> +	return rc;
>> +}
>> +
>> +static int dlpar_memory_remove_by_index(struct pseries_hp_errorlog *hp_elog,
>> +					struct property *prop)
>> +{
>> +	struct of_drconf_cell *lmbs;
>> +	uint32_t num_lmbs, drc_index;
>> +	int lmb_found;
>> +	__be32 *p;
>> +	int i, rc;
>> +
>> +	drc_index = be32_to_cpu(hp_elog->_drc_u.drc_index);
> Didn't you already do the endian conversion back in
> handle_dlpar_errorlog?
>> +	pr_info("Attempting to hot-remove LMB, drc index %x\n", drc_index);
>> +
>> +	p = prop->value;
>> +	num_lmbs = be32_to_cpu(*p++);
>> +	lmbs = (struct of_drconf_cell *)p;
>> +
>> +	lmb_found = 0;
>> +	for (i = 0; i < num_lmbs; i++) {
>> +		if (lmbs[i].drc_index == hp_elog->_drc_u.drc_index) {
>> +			lmb_found = 1;
>> +			rc = dlpar_remove_lmb(&lmbs[i]);
>> +			break;
>> +		}
>> +	}
>> +
>> +	if (!lmb_found)
>> +		rc = -EINVAL;
>> +
>> +	if (rc)
>> +		pr_info("Failed to hot-remove memory, drc index %x\n",
>> +			drc_index);
>> +
>> +	return rc;
>> +}
>> +
>>  #else
>>  static inline int pseries_remove_memblock(unsigned long base,
>>  					  unsigned int memblock_size)
>> @@ -183,6 +356,11 @@ static inline int pseries_remove_mem_node(struct device_node *np)
>>  {
>>  	return 0;
>>  }
>> +static inline int dlpar_memory_remove(struct pseries_hp_errorlog *hp_elog)
>> +{
>> +	return -EOPNOTSUPP;
>> +}
>> +
>>  #endif /* CONFIG_MEMORY_HOTREMOVE */
>>  
>>  static int dlpar_add_lmb(struct of_drconf_cell *lmb)
>> @@ -298,14 +476,24 @@ static int dlpar_memory_add_by_count(struct pseries_hp_errorlog *hp_elog,
>>  	}
>>  
>>  	if (lmbs_added != lmbs_to_add) {
>> -		/* TODO: remove added lmbs */
>> +		pr_err("Memory hot-add failed, removing any added LMBs\n");
>> +
>> +		for (i = 0; i < num_lmbs; i++) {
>> +			if (!lmbs[i].reserved)
>> +				continue;
>> +
>> +			rc = dlpar_remove_lmb(&lmbs[i]);
>> +			if (rc)
> There is a very much related comment in dlpar_memory_remove_by_count
> about this condition being true.
>> +				pr_err("Failed to remove LMB, drc index %x\n",
>> +				       be32_to_cpu(lmbs[i].drc_index));
> 
> You don't clear the reserved flag here?
In the failure case here the device tree property is not updated, so no
cleanup needed. This is not really obvious so it would probably be best
to do the cleanup.
 
> 
>> +		}
>>  		rc = -EINVAL;
> Same observation as in dlpar_memory_remove_by_count

ok.

Thanks for the review.

-Nathan

>> +	} else {
>> +		/* Clear the reserved fields */
>> +		for (i = 0; i < num_lmbs; i++)
>> +			lmbs[i].reserved = 0;
>>  	}
>>  
>> -	/* Clear the reserved fields */
>> -	for (i = 0; i < num_lmbs; i++)
>> -		lmbs[i].reserved = 0;
>> -
>>  	return rc;
>>  }
>>  
>> @@ -373,6 +561,14 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
>>  		else
>>  			rc = -EINVAL;
>>  		break;
>> +	case PSERIES_HP_ELOG_ACTION_REMOVE:
>> +		if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT)
>> +			rc = dlpar_memory_remove_by_count(hp_elog, prop);
>> +		else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX)
>> +			rc = dlpar_memory_remove_by_index(hp_elog, prop);
>> +		else
>> +			rc = -EINVAL;
>> +		break;
>>  	default:
>>  		pr_err("Invalid action (%d) specified\n", hp_elog->action);
>>  		rc = -EINVAL;
>>
>> _______________________________________________
>> Linuxppc-dev mailing list
>> Linuxppc-dev at lists.ozlabs.org
>> https://lists.ozlabs.org/listinfo/linuxppc-dev
> 
> 
> 



More information about the Linuxppc-dev mailing list