[PATCH v2 6/6] pseries: Implement memory hotplug remove in the kernel
Cyril Bur
cyril.bur at au1.ibm.com
Tue Nov 25 09:05:31 AEDT 2014
On Mon, 2014-11-24 at 09:03 -0600, Nathan Fontenot wrote:
>
> On 11/21/2014 01:49 AM, Cyril Bur wrote:
> >
> > On Mon, 2014-11-17 at 15:56 -0600, Nathan Fontenot wrote:
> >> Move handling of memory hotplug remove on pseries completely into the kernel.
> >>
> >> The current memory hotplug remove path involves the drmgr command doing part
> >> of this work in userspace and requesting the kernel to do additional pieces.
> >> This patch allows us to handle the act completely in the kernel via rtas
> >> hotplug events. This allows us to perform the operation faster and provide
> >> a common memory hotplug remove path for PowerVM and PowerKVM systems.
> >>
> >> Signed-off-by: Nathan Fontenot <nfont at linux.vnet.ibm.com>
> >> ---
> >> arch/powerpc/platforms/pseries/hotplug-memory.c | 206 ++++++++++++++++++++++-
> >> 1 file changed, 201 insertions(+), 5 deletions(-)
> >>
> >> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
> >> index b57d42b..c8189e8 100644
> >> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> >> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> >> @@ -173,6 +173,179 @@ static int pseries_remove_mem_node(struct device_node *np)
> >> pseries_remove_memblock(base, lmb_size);
> >> return 0;
> >> }
> >> +
> >> +static int lmb_is_removable(struct of_drconf_cell *lmb)
> >> +{
> >> + int i, scns_per_block;
> >> + int rc = 1;
> >> + unsigned long pfn, block_sz;
> >> + u64 phys_addr;
> >> +
> >> + if (!(be32_to_cpu(lmb->flags) & DRCONF_MEM_ASSIGNED))
> >> + return -1;
> > This makes me kind of nervous. You're using the return value of
> > lmb_is_removable as a boolean but it returns three possible values -1,0
> > and 1. Functionally it looks correct to me so its not a massive issue
>
> Oh yuck. this should be a boolean return. I'll fix that.
>
> >> +
> >> + phys_addr = be64_to_cpu(lmb->base_addr);
> >> + block_sz = memory_block_size_bytes();
> >> + scns_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
> >> +
> >> + for (i = 0; i < scns_per_block; i++) {
> >> + pfn = PFN_DOWN(phys_addr);
> >> + if (!pfn_present(pfn))
> >> + continue;
> >> +
> >> + rc &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
> >> + phys_addr += MIN_MEMORY_BLOCK_SIZE;
> >> + }
> >> +
> >> + return rc;
> >> +}
> >> +
> >> +static int dlpar_add_lmb(struct of_drconf_cell *);
> >> +
> >> +static int dlpar_remove_lmb(struct of_drconf_cell *lmb)
> >> +{
> >> + struct memory_block *mem_block;
> >> + unsigned long block_sz;
> >> + u64 phys_addr;
> >> + uint32_t drc_index;
> >> + int nid, rc;
> >> +
> >> + if (!lmb_is_removable(lmb))
> >> + return -EINVAL;
> >> +
> >> + phys_addr = be64_to_cpu(lmb->base_addr);
> >> + drc_index = be32_to_cpu(lmb->drc_index);
> >> +
> >> + mem_block = lmb_to_memblock(lmb);
> >> + if (!mem_block)
> >> + return -EINVAL;
> >> +
> >> + rc = device_offline(&mem_block->dev);
> >> + put_device(&mem_block->dev);
> >> + if (rc)
> >> + return rc;
> >> +
> >> + block_sz = pseries_memory_block_size();
> >> + nid = memory_add_physaddr_to_nid(phys_addr);
> >> +
> >> + remove_memory(nid, phys_addr, block_sz);
> >> +
> >> + /* Update memory regions for memory remove */
> >> + memblock_remove(phys_addr, block_sz);
> >> +
> >> + dlpar_release_drc(drc_index);
> >> +
> >> + lmb->flags &= cpu_to_be32(~DRCONF_MEM_ASSIGNED);
> >> + pr_info("Memory at %llx (drc index %x) has been hot-removed\n",
> >> + be64_to_cpu(lmb->base_addr), drc_index);
> > dlpar_add_lmb doesn't print anything but dlpar_remove_lmb does? Related
> > to my comment about printing a 'hot-add' messages prematurely, perhaps
> > move this to the callers
>
> Yes, I'll update the remove mem messages also.
>
> >> +
> >> + return 0;
> >> +}
> >> +
> >> +static int dlpar_memory_remove_by_count(struct pseries_hp_errorlog *hp_elog,
> >> + struct property *prop)
> >> +{
> >> + struct of_drconf_cell *lmbs;
> >> + int lmbs_to_remove, lmbs_removed = 0;
> >> + int lmbs_available = 0;
> >> + uint32_t num_lmbs;
> >> + __be32 *p;
> >> + int i, rc;
> >> +
> >> + lmbs_to_remove = be32_to_cpu(hp_elog->_drc_u.drc_count);
> > Didn't you already do the endian conversion back in
> > handle_dlpar_errorlog?
> >> + pr_info("Attempting to hot-remove %d LMB(s)\n", lmbs_to_remove);
> >> +
> >> + if (lmbs_to_remove == 0)
> >> + return -EINVAL;
> >> +
> >> + p = prop->value;
> >> + num_lmbs = be32_to_cpu(*p++);
> >> + lmbs = (struct of_drconf_cell *)p;
> >> +
> >> + /* Validate that there are enough LMBs to satisfy the request */
> >> + for (i = 0; i < num_lmbs; i++) {
> >> + if (be32_to_cpu(lmbs[i].flags) & DRCONF_MEM_ASSIGNED)
> >> + lmbs_available++;
> >> + }
> >> +
> >> + if (lmbs_available < lmbs_to_remove)
> >> + return -EINVAL;
> >> +
> >> + for (i = 0; i < num_lmbs; i++) {
> >> + if (lmbs_to_remove == lmbs_removed)
> >> + break;
> >> +
> >> + rc = dlpar_remove_lmb(&lmbs[i]);
> >> + if (rc)
> >> + continue;
> >> +
> >> + lmbs_removed++;
> >> +
> >> + /* Mark this lmb so we can add it later if all of the
> >> + * requested LMBs cannot be removed.
> >> + */
> >> + lmbs[i].reserved = 1;
> >> + }
> >> +
> >> + if (lmbs_removed != lmbs_to_remove) {
> >> + pr_err("Memory hot-remove failed, adding LMB's back\n");
> >> +
> >> + for (i = 0; i < num_lmbs; i++) {
> >> + if (!lmbs[i].reserved)
> >> + continue;
> >> +
> >> + rc = dlpar_add_lmb(&lmbs[i]);
> >> + if (rc)
> > If this happens you this will leave an LMB removed but an error code
> > from this function will cause dlpar_memory to discard the dn prop that
> > it passed in. If you couldn't roll back completely the caller should
> > still update the dn property with the ones that it has left
> > removed/failed to re-add. Perhaps this function needs a mechanism to
> > report success/failure (as it currently does) and a mechanism to say
> > whether or not the dn property is dirty or not.
>
> Good catch. The device tree should be updated to denote lmb's that were
> added but we failed to remove because of some error.
>
> >> + pr_err("Failed to add LMB back, drc index %x\n",
> >> + be32_to_cpu(lmbs[i].drc_index));
> >> +
> > On a clean (everything rolled back successfully) failure, I don't think
> > you _need_ to bother although this is nice cleanup, you didn't do it
> > dlpar_memory_add_by_count - I'm in favour of being clean especially
> > since on an 'unclean' failure you should clear that flag - if you're
> > going to still update the device tree.
>
> I'll re-visit the cleanup code to make sure the device tree property is left
> in the correct state on a failure during roll-back.
>
> >> + lmbs[i].reserved = 0;
> >> + }
> >> + rc = -EINVAL;
> >> + } else {
> >> + /* remove any reserved markings */
> >> + for (i = 0; i < num_lmbs; i++)
> >> + lmbs[i].reserved = 0;
> >> + }
> > Hmmmm since the if statements are checking for failures. It might be
> > more clear to return -EINVAL (rather than setting rc) and the code in
> > the else block can be moved out of it and come to think of it, return 0.
> > Otherwise it looks odd to me with the 'normal' success case code being
> > in an else statement.
>
> I like this.
>
> >> +
> >> + return rc;
> >> +}
> >> +
> >> +static int dlpar_memory_remove_by_index(struct pseries_hp_errorlog *hp_elog,
> >> + struct property *prop)
> >> +{
> >> + struct of_drconf_cell *lmbs;
> >> + uint32_t num_lmbs, drc_index;
> >> + int lmb_found;
> >> + __be32 *p;
> >> + int i, rc;
> >> +
> >> + drc_index = be32_to_cpu(hp_elog->_drc_u.drc_index);
> > Didn't you already do the endian conversion back in
> > handle_dlpar_errorlog?
> >> + pr_info("Attempting to hot-remove LMB, drc index %x\n", drc_index);
> >> +
> >> + p = prop->value;
> >> + num_lmbs = be32_to_cpu(*p++);
> >> + lmbs = (struct of_drconf_cell *)p;
> >> +
> >> + lmb_found = 0;
> >> + for (i = 0; i < num_lmbs; i++) {
> >> + if (lmbs[i].drc_index == hp_elog->_drc_u.drc_index) {
> >> + lmb_found = 1;
> >> + rc = dlpar_remove_lmb(&lmbs[i]);
> >> + break;
> >> + }
> >> + }
> >> +
> >> + if (!lmb_found)
> >> + rc = -EINVAL;
> >> +
> >> + if (rc)
> >> + pr_info("Failed to hot-remove memory, drc index %x\n",
> >> + drc_index);
> >> +
> >> + return rc;
> >> +}
> >> +
> >> #else
> >> static inline int pseries_remove_memblock(unsigned long base,
> >> unsigned int memblock_size)
> >> @@ -183,6 +356,11 @@ static inline int pseries_remove_mem_node(struct device_node *np)
> >> {
> >> return 0;
> >> }
> >> +static inline int dlpar_memory_remove(struct pseries_hp_errorlog *hp_elog)
> >> +{
> >> + return -EOPNOTSUPP;
> >> +}
> >> +
> >> #endif /* CONFIG_MEMORY_HOTREMOVE */
> >>
> >> static int dlpar_add_lmb(struct of_drconf_cell *lmb)
> >> @@ -298,14 +476,24 @@ static int dlpar_memory_add_by_count(struct pseries_hp_errorlog *hp_elog,
> >> }
> >>
> >> if (lmbs_added != lmbs_to_add) {
> >> - /* TODO: remove added lmbs */
> >> + pr_err("Memory hot-add failed, removing any added LMBs\n");
> >> +
> >> + for (i = 0; i < num_lmbs; i++) {
> >> + if (!lmbs[i].reserved)
> >> + continue;
> >> +
> >> + rc = dlpar_remove_lmb(&lmbs[i]);
> >> + if (rc)
> > There is a very much related comment in dlpar_memory_remove_by_count
> > about this condition being true.
> >> + pr_err("Failed to remove LMB, drc index %x\n",
> >> + be32_to_cpu(lmbs[i].drc_index));
> >
> > You don't clear the reserved flag here?
> In the failure case here the device tree property is not updated, so no
> cleanup needed. This is not really obvious so it would probably be best
> to do the cleanup.
>
Of course except when you fail to roll back and you'll be forced to
update the device tree - but I suppose clearing it here or not really
depends on how you do that...
> >
> >> + }
> >> rc = -EINVAL;
> > Same observation as in dlpar_memory_remove_by_count
>
> ok.
>
> Thanks for the review.
>
> -Nathan
>
> >> + } else {
> >> + /* Clear the reserved fields */
> >> + for (i = 0; i < num_lmbs; i++)
> >> + lmbs[i].reserved = 0;
> >> }
> >>
> >> - /* Clear the reserved fields */
> >> - for (i = 0; i < num_lmbs; i++)
> >> - lmbs[i].reserved = 0;
> >> -
> >> return rc;
> >> }
> >>
> >> @@ -373,6 +561,14 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
> >> else
> >> rc = -EINVAL;
> >> break;
> >> + case PSERIES_HP_ELOG_ACTION_REMOVE:
> >> + if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_COUNT)
> >> + rc = dlpar_memory_remove_by_count(hp_elog, prop);
> >> + else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX)
> >> + rc = dlpar_memory_remove_by_index(hp_elog, prop);
> >> + else
> >> + rc = -EINVAL;
> >> + break;
> >> default:
> >> pr_err("Invalid action (%d) specified\n", hp_elog->action);
> >> rc = -EINVAL;
> >>
> >> _______________________________________________
> >> Linuxppc-dev mailing list
> >> Linuxppc-dev at lists.ozlabs.org
> >> https://lists.ozlabs.org/listinfo/linuxppc-dev
> >
> >
> >
>
> _______________________________________________
> Linuxppc-dev mailing list
> Linuxppc-dev at lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/linuxppc-dev
More information about the Linuxppc-dev
mailing list