[RFC/PATCH] numa: distinguish associativity domain from node id

Nathan Lynch ntl at pobox.com
Wed Apr 6 15:30:02 EST 2005


Yes, yet another numa.c patch...  this is strictly rfc for now.

The ppc64 numa code makes some possibly invalid assumptions about the
numbering of "associativity domains" (which may be considered NUMA
nodes).  As far as I've been able to determine from the architecture
docs, there is no guarantee about the numbering of associativity
domains, i.e. the values that are contained in ibm,associativity
device node properties.  Yet we seem to assume that the numbering of
the domains begins at zero and that the range is contiguous, and we
use the domain number for a given resource as its logical node id.
This strikes me as a problem waiting to happen, and in fact I've been
seeing some problems in the lab with larger machines violating or at
least straining these assumptions.

Consider one such case: the associativity domain for all memory in a
partition is 0x1, but the processors are in shared mode (so no
associativity info for them) -- all the memory is placed in node 1
while all cpus are mapped to node 0.  But in this case, we should
really have only one logical node, with all memory and cpus mapped to
it.

Another case I've seen is that of a partition with all processors and
memory having an associativity domain of 0x1.  We end up with
everything in node 1 and an empty (yet online) node 0.

I propose treating the associativity domain for a resource as a
"cookie" without making any assumptions about its value.  During numa
init, each distinct domain is mapped to a logical node, so that the
following holds: the logical node numbering begins at zero and is
contiguous, and resources added after boot which do not map to an
already initialized domain are associated with logical node 0.  The
patch implements these rules, and attempts to separate the notion of
associativity domain from that of logical node where appropriate.

Lightly tested on Power5 LPAR with two numa nodes - it boots and the
information under /sys/devices/system/node looks correct.

I'm going to omit a signed-off line for now; there's probably some
stupid bug introduced or something that someone will find
objectionable (memory hotplug folks?)...

Please review.

Thanks,
Nathan


 arch/ppc64/mm/numa.c |  207 +++++++++++++++++++++++++++----------------
 1 files changed, 131 insertions(+), 76 deletions(-)

Index: linux-2.6.12-rc2/arch/ppc64/mm/numa.c
===================================================================
--- linux-2.6.12-rc2.orig/arch/ppc64/mm/numa.c	2005-04-05 12:59:28.000000000 -0500
+++ linux-2.6.12-rc2/arch/ppc64/mm/numa.c	2005-04-06 00:17:08.000000000 -0500
@@ -58,6 +58,69 @@ EXPORT_SYMBOL(numa_memory_lookup_table);
 EXPORT_SYMBOL(numa_cpumask_lookup_table);
 EXPORT_SYMBOL(nr_cpus_in_node);
 
+/* Maps nid to platform "associativity domain". */
+#define INVALID_DOMAIN (-1)
+static int nid_domain[MAX_NUMNODES] = { [0 ... (MAX_NUMNODES - 1)] =
+							INVALID_DOMAIN };
+/* nid to platform domain id. O(1). */
+static int nid_to_domain(int nid)
+{
+	BUG_ON(0 > nid || nid >= MAX_NUMNODES);
+	return nid_domain[nid];
+}
+
+/* Platform domain to nid.  If the given domain does not map to a node,
+ * return -1. O(n).
+ */
+static int domain_to_nid(int domain)
+{
+	int nid;
+
+	for_each_node(nid)
+		if (domain == nid_to_domain(nid))
+			return nid;
+	return -1;
+}
+
+/* Associate domain with the given nid. */
+static void __init assign_domain_to_nid(int domain, int nid)
+{
+	BUG_ON(0 > nid || nid >= MAX_NUMNODES);
+	BUG_ON(domain == INVALID_DOMAIN);
+	BUG_ON(nid_domain[nid] != INVALID_DOMAIN);
+
+	nid_domain[nid] = domain;
+	dbg("OF associativity domain 0x%x mapped to node %i\n", domain, nid);
+}
+
+/* Given a previously unencountered associativity domain, find the
+ * first unused slot in the nid_domain array where it can be plugged
+ * in.
+ */
+static int __init setup_domain(int domain)
+{
+	int nid;
+
+	for_each_node(nid) {
+		int tmp = nid_to_domain(nid);
+
+		if (tmp != INVALID_DOMAIN)
+			continue;
+
+		/* Do not set up the same domain twice. */
+		BUG_ON(tmp == domain);
+
+		assign_domain_to_nid(domain, nid);
+		return nid;
+	}
+
+	printk(KERN_WARNING "Can't associate domain 0x%x with a node, "
+	       "MAX_NUMNODES=%i, num_online_nodes=%i\n", domain,
+	       MAX_NUMNODES, num_online_nodes());
+
+	return 0;
+}
+
 static inline void map_cpu_to_node(int cpu, int node)
 {
 	numa_cpu_lookup_table[cpu] = node;
@@ -117,26 +180,58 @@ static struct device_node * __devinit fi
 /* must hold reference to node during call */
 static int *of_get_associativity(struct device_node *dev)
 {
-	return (unsigned int *)get_property(dev, "ibm,associativity", NULL);
+	return (int *)get_property(dev, "ibm,associativity", NULL);
 }
 
-static int of_node_numa_domain(struct device_node *device)
+/*
+ * Given an OF device node, return the logical node id to which it
+ * belongs.  If the node has no associativity information, the result
+ * is 0.  During boot, this function will map domains to nodes as
+ * necessary.
+ */
+static int of_node_to_nid(struct device_node *dn)
 {
-	int numa_domain;
-	unsigned int *tmp;
+	int *tmp, domain, nid;
 
 	if (min_common_depth == -1)
 		return 0;
 
-	tmp = of_get_associativity(device);
-	if (tmp && (tmp[0] >= min_common_depth)) {
-		numa_domain = tmp[min_common_depth];
-	} else {
-		dbg("WARNING: no NUMA information for %s\n",
-		    device->full_name);
-		numa_domain = 0;
+	tmp = of_get_associativity(dn);
+
+	if (!tmp || (tmp[0] < min_common_depth)) {
+		dbg("no NUMA information for %s\n", dn->full_name);
+		return 0;
 	}
-	return numa_domain;
+
+	domain = tmp[min_common_depth];
+
+	/*
+	 * POWER4 LPAR uses 0xffff as invalid node,
+	 * just use node zero.
+	 */
+	if (domain == 0xffff)
+		nid = 0;
+	else
+		nid = domain_to_nid(domain);
+
+	/* If we haven't seen this domain before, associate it with a
+	 * node if we're still in boot.  If we're up and running and
+	 * the domain is previously unknown, we have no choice but to
+	 * map the resource to an initialized node, so we map it to
+	 * nid 0.
+	 */
+	if (nid < 0) {
+		if (system_state < SYSTEM_RUNNING) {
+			nid = setup_domain(domain);
+		} else {
+			nid = 0;
+			dbg("Resource %s has associativity domain"
+			    " %x which was not known at boot, assigning"
+			    " to node %i\n", dn->full_name, domain, nid);
+		}
+	}
+	node_set_online(nid);
+	return nid;
 }
 
 /*
@@ -228,7 +323,7 @@ static unsigned long read_n_cells(int n,
  */
 static int numa_setup_cpu(unsigned long lcpu)
 {
-	int numa_domain = 0;
+	int nid = 0;
 	struct device_node *cpu = find_cpu_node(lcpu);
 
 	if (!cpu) {
@@ -236,27 +331,16 @@ static int numa_setup_cpu(unsigned long 
 		goto out;
 	}
 
-	numa_domain = of_node_numa_domain(cpu);
+	nid = of_node_to_nid(cpu);
 
-	if (numa_domain >= num_online_nodes()) {
-		/*
-		 * POWER4 LPAR uses 0xffff as invalid node,
-		 * dont warn in this case.
-		 */
-		if (numa_domain != 0xffff)
-			printk(KERN_ERR "WARNING: cpu %ld "
-			       "maps to invalid NUMA node %d\n",
-			       lcpu, numa_domain);
-		numa_domain = 0;
-	}
 out:
-	node_set_online(numa_domain);
+	node_set_online(nid);
 
-	map_cpu_to_node(lcpu, numa_domain);
+	map_cpu_to_node(lcpu, nid);
 
 	of_node_put(cpu);
 
-	return numa_domain;
+	return nid;
 }
 
 static int cpu_numa_callback(struct notifier_block *nfb,
@@ -319,7 +403,6 @@ static int __init parse_numa_properties(
 	struct device_node *cpu = NULL;
 	struct device_node *memory = NULL;
 	int addr_cells, size_cells;
-	int max_domain = 0;
 	long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
 	unsigned long i;
 
@@ -341,7 +424,7 @@ static int __init parse_numa_properties(
 	if (min_common_depth < 0)
 		return min_common_depth;
 
-	max_domain = numa_setup_cpu(boot_cpuid);
+	numa_setup_cpu(boot_cpuid);
 
 	/*
 	 * Even though we connect cpus to numa domains later in SMP init,
@@ -350,20 +433,8 @@ static int __init parse_numa_properties(
 	 * As a result of hotplug we could still have cpus appear later on
 	 * with larger node ids. In that case we force the cpu into node 0.
 	 */
-	for_each_cpu(i) {
-		int numa_domain;
-
-		cpu = find_cpu_node(i);
-
-		if (cpu) {
-			numa_domain = of_node_numa_domain(cpu);
-			of_node_put(cpu);
-
-			if (numa_domain < MAX_NUMNODES &&
-			    max_domain < numa_domain)
-				max_domain = numa_domain;
-		}
-	}
+	while ((cpu = of_find_node_by_type(cpu, "cpu")) != NULL)
+		of_node_to_nid(cpu);
 
 	addr_cells = get_mem_addr_cells();
 	size_cells = get_mem_size_cells();
@@ -371,7 +442,7 @@ static int __init parse_numa_properties(
 	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
 		unsigned long start;
 		unsigned long size;
-		int numa_domain;
+		int nid;
 		int ranges;
 		unsigned int *memcell_buf;
 		unsigned int len;
@@ -389,18 +460,7 @@ new_range:
 		start = _ALIGN_DOWN(start, MEMORY_INCREMENT);
 		size = _ALIGN_UP(size, MEMORY_INCREMENT);
 
-		numa_domain = of_node_numa_domain(memory);
-
-		if (numa_domain >= MAX_NUMNODES) {
-			if (numa_domain != 0xffff)
-				printk(KERN_ERR "WARNING: memory at %lx maps "
-				       "to invalid NUMA node %d\n", start,
-				       numa_domain);
-			numa_domain = 0;
-		}
-
-		if (max_domain < numa_domain)
-			max_domain = numa_domain;
+		nid = of_node_to_nid(memory);
 
 		if (! (size = numa_enforce_memory_limit(start, size))) {
 			if (--ranges)
@@ -412,42 +472,37 @@ new_range:
 		/*
 		 * Initialize new node struct, or add to an existing one.
 		 */
-		if (init_node_data[numa_domain].node_end_pfn) {
+		if (init_node_data[nid].node_end_pfn) {
 			if ((start / PAGE_SIZE) <
-			    init_node_data[numa_domain].node_start_pfn)
-				init_node_data[numa_domain].node_start_pfn =
+			    init_node_data[nid].node_start_pfn)
+				init_node_data[nid].node_start_pfn =
 					start / PAGE_SIZE;
 			if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) >
-			    init_node_data[numa_domain].node_end_pfn)
-				init_node_data[numa_domain].node_end_pfn =
+			    init_node_data[nid].node_end_pfn)
+				init_node_data[nid].node_end_pfn =
 					(start / PAGE_SIZE) +
 					(size / PAGE_SIZE);
 
-			init_node_data[numa_domain].node_present_pages +=
+			init_node_data[nid].node_present_pages +=
 				size / PAGE_SIZE;
 		} else {
-			node_set_online(numa_domain);
-
-			init_node_data[numa_domain].node_start_pfn =
+			init_node_data[nid].node_start_pfn =
 				start / PAGE_SIZE;
-			init_node_data[numa_domain].node_end_pfn =
-				init_node_data[numa_domain].node_start_pfn +
+			init_node_data[nid].node_end_pfn =
+				init_node_data[nid].node_start_pfn +
 				size / PAGE_SIZE;
-			init_node_data[numa_domain].node_present_pages =
+			init_node_data[nid].node_present_pages =
 				size / PAGE_SIZE;
 		}
 
 		for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
 			numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
-				numa_domain;
+				nid;
 
 		if (--ranges)
 			goto new_range;
 	}
 
-	for (i = 0; i <= max_domain; i++)
-		node_set_online(i);
-
 	return 0;
 }
 
@@ -632,7 +687,7 @@ void __init do_init_bootmem(void)
 		memory = NULL;
 		while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
 			unsigned long mem_start, mem_size;
-			int numa_domain, ranges;
+			int devnid, ranges;
 			unsigned int *memcell_buf;
 			unsigned int len;
 
@@ -644,9 +699,9 @@ void __init do_init_bootmem(void)
 new_range:
 			mem_start = read_n_cells(addr_cells, &memcell_buf);
 			mem_size = read_n_cells(size_cells, &memcell_buf);
-			numa_domain = numa_enabled ? of_node_numa_domain(memory) : 0;
+			devnid = numa_enabled ? of_node_to_nid(memory) : 0;
 
-			if (numa_domain != nid)
+			if (devnid != nid)
 				continue;
 
 			mem_size = numa_enforce_memory_limit(mem_start, mem_size);



More information about the Linuxppc64-dev mailing list