[PATCH 1/3] powerpc/pseries/cpuhp: cache node corrections

Nathan Lynch nathanl at linux.ibm.com
Mon Sep 20 23:55:02 AEST 2021


On pseries, cache nodes in the device tree can be added and removed by the
CPU DLPAR code as well as the partition migration (mobility) code. PowerVM
partitions in dedicated processor mode typically have L2 and L3 cache
nodes.

The CPU DLPAR code has the following shortcomings:

* Cache nodes returned as siblings of a new CPU node by
  ibm,configure-connector are silently discarded; only the CPU node is
  added to the device tree.

* Cache nodes which become unreferenced in the processor removal path are
  not removed from the device tree. This can lead to duplicate nodes when
  the post-migration device tree update code replaces cache nodes.

This is long-standing behavior. Presumably it has gone mostly unnoticed
because the two bugs have the property of obscuring each other in common
simple scenarios (e.g. remove a CPU and add it back). Likely you'd notice
only if you cared to inspect the device tree or the sysfs cacheinfo
information.

Booted with two processors:

  $ pwd
  /sys/firmware/devicetree/base/cpus
  $ ls -1d */
  l2-cache at 2010/
  l2-cache at 2011/
  l3-cache at 3110/
  l3-cache at 3111/
  PowerPC,POWER9 at 0/
  PowerPC,POWER9 at 8/
  $ lsprop */l2-cache
  l2-cache at 2010/l2-cache
                 00003110 (12560)
  l2-cache at 2011/l2-cache
                 00003111 (12561)
  PowerPC,POWER9 at 0/l2-cache
                 00002010 (8208)
  PowerPC,POWER9 at 8/l2-cache
                 00002011 (8209)
  $ ls /sys/devices/system/cpu/cpu0/cache/
  index0  index1  index2  index3

After DLPAR-adding PowerPC,POWER9 at 10, we see that its associated cache
nodes are absent, its threads' L2+L3 cacheinfo is unpopulated, and it is
missing a cache level in its sched domain hierarchy:

  $ ls -1d */
  l2-cache at 2010/
  l2-cache at 2011/
  l3-cache at 3110/
  l3-cache at 3111/
  PowerPC,POWER9 at 0/
  PowerPC,POWER9 at 10/
  PowerPC,POWER9 at 8/
  $ lsprop PowerPC\,POWER9 at 10/l2-cache
  PowerPC,POWER9 at 10/l2-cache
                 00002012 (8210)
  $ ls /sys/devices/system/cpu/cpu16/cache/
  index0  index1
  $ grep . /sys/kernel/debug/sched/domains/cpu{0,8,16}/domain*/name
  /sys/kernel/debug/sched/domains/cpu0/domain0/name:SMT
  /sys/kernel/debug/sched/domains/cpu0/domain1/name:CACHE
  /sys/kernel/debug/sched/domains/cpu0/domain2/name:DIE
  /sys/kernel/debug/sched/domains/cpu8/domain0/name:SMT
  /sys/kernel/debug/sched/domains/cpu8/domain1/name:CACHE
  /sys/kernel/debug/sched/domains/cpu8/domain2/name:DIE
  /sys/kernel/debug/sched/domains/cpu16/domain0/name:SMT
  /sys/kernel/debug/sched/domains/cpu16/domain1/name:DIE

When removing PowerPC,POWER9 at 8, we see that its cache nodes are left
behind:

  $ ls -1d */
  l2-cache at 2010/
  l2-cache at 2011/
  l3-cache at 3110/
  l3-cache at 3111/
  PowerPC,POWER9 at 0/

When DLPAR is combined with VM migration, we can get duplicate nodes. E.g.
removing one processor, then migrating, adding a processor, and then
migrating again can result in warnings from the OF core during
post-migration device tree updates:

  Duplicate name in cpus, renamed to "l2-cache at 2011#1"
  Duplicate name in cpus, renamed to "l3-cache at 3111#1"

and nodes with duplicated phandles in the tree, making lookup behavior
unpredictable:

  $ lsprop l[23]-cache@*/ibm,phandle
  l2-cache at 2010/ibm,phandle
                   00002010 (8208)
  l2-cache at 2011#1/ibm,phandle
                   00002011 (8209)
  l2-cache at 2011/ibm,phandle
                   00002011 (8209)
  l3-cache at 3110/ibm,phandle
                   00003110 (12560)
  l3-cache at 3111#1/ibm,phandle
                   00003111 (12561)
  l3-cache at 3111/ibm,phandle
                   00003111 (12561)

Address these issues by:

* Correctly processing siblings of the node returned from
  dlpar_configure_connector().
* Removing cache nodes in the CPU remove path when it can be determined
  that they are not associated with other CPUs or caches.

Use the of_changeset API in both cases, which allows us to keep the error
handling in this code from becoming more complex while ensuring that the
device tree cannot become inconsistent.

Signed-off-by: Nathan Lynch <nathanl at linux.ibm.com>
Fixes: ac71380 ("powerpc/pseries: Add CPU dlpar remove functionality")
Fixes: 90edf18 ("powerpc/pseries: Add CPU dlpar add functionality")
---
 arch/powerpc/platforms/pseries/hotplug-cpu.c | 72 +++++++++++++++++++-
 1 file changed, 70 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
index d646c22e94ab..87a0fbe9cf12 100644
--- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
+++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
@@ -521,6 +521,27 @@ static bool valid_cpu_drc_index(struct device_node *parent, u32 drc_index)
 	return found;
 }
 
+static int pseries_cpuhp_attach_nodes(struct device_node *dn)
+{
+	struct of_changeset cs;
+	int ret;
+
+	/*
+	 * This device node is unattached but may have siblings; open-code the
+	 * traversal.
+	 */
+	for (of_changeset_init(&cs); dn != NULL; dn = dn->sibling) {
+		ret = of_changeset_attach_node(&cs, dn);
+		if (ret)
+			goto out;
+	}
+
+	ret = of_changeset_apply(&cs);
+out:
+	of_changeset_destroy(&cs);
+	return ret;
+}
+
 static ssize_t dlpar_cpu_add(u32 drc_index)
 {
 	struct device_node *dn, *parent;
@@ -563,7 +584,7 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
 		return -EINVAL;
 	}
 
-	rc = dlpar_attach_node(dn, parent);
+	rc = pseries_cpuhp_attach_nodes(dn);
 
 	/* Regardless we are done with parent now */
 	of_node_put(parent);
@@ -600,6 +621,53 @@ static ssize_t dlpar_cpu_add(u32 drc_index)
 	return rc;
 }
 
+static unsigned int pseries_cpuhp_cache_use_count(const struct device_node *cachedn)
+{
+	unsigned int use_count = 0;
+	struct device_node *dn;
+
+	WARN_ON(!of_node_is_type(cachedn, "cache"));
+
+	for_each_of_cpu_node(dn) {
+		if (of_find_next_cache_node(dn) == cachedn)
+			use_count++;
+	}
+
+	for_each_node_by_type(dn, "cache") {
+		if (of_find_next_cache_node(dn) == cachedn)
+			use_count++;
+	}
+
+	return use_count;
+}
+
+static int pseries_cpuhp_detach_nodes(struct device_node *cpudn)
+{
+	struct device_node *dn;
+	struct of_changeset cs;
+	int ret = 0;
+
+	of_changeset_init(&cs);
+	ret = of_changeset_detach_node(&cs, cpudn);
+	if (ret)
+		goto out;
+
+	dn = cpudn;
+	while ((dn = of_find_next_cache_node(dn))) {
+		if (pseries_cpuhp_cache_use_count(dn) > 1)
+			break;
+
+		ret = of_changeset_detach_node(&cs, dn);
+		if (ret)
+			goto out;
+	}
+
+	ret = of_changeset_apply(&cs);
+out:
+	of_changeset_destroy(&cs);
+	return ret;
+}
+
 static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index)
 {
 	int rc;
@@ -621,7 +689,7 @@ static ssize_t dlpar_cpu_remove(struct device_node *dn, u32 drc_index)
 		return rc;
 	}
 
-	rc = dlpar_detach_node(dn);
+	rc = pseries_cpuhp_detach_nodes(dn);
 	if (rc) {
 		int saved_rc = rc;
 
-- 
2.31.1



More information about the Linuxppc-dev mailing list