[PATCH 1/2] logical numbering for numa nodes (2nd try)

Fri May 6 08:15:20 EST 2005

(version 2)

This patch fixes the ppc64 numa code to be more consistent with the
conversion from numnodes to node_online_mask etc. and removes the
dependence on the platform numa numbering by setting up a mapping
between the platform ids found in the ibm,associativity properties and
logical node numbers.  The main reason I want to make this change is
that the numbering scheme of the platform ids is unspecified and we
really can't rely on the values being below MAX_NUMNODES.  I know you
weren't really keen on having this mapping but I think in the long
term this is what we'll wind up having to do anyway.

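I've also ripped out DEBUG_NUMA -- the effect is roughly as if
DEBUG_NUMA were always on: ARRAY_INITIALISER is now always -1 (as it
previously was only with DEBUG_NUMA defined), so lookup table entries
start out unassigned and resources have to be explicitly associated
with their nodes.  Note that the DEBUG_NUMA-only BUG checks in
pa_to_nid() and cpu_to_node() are removed rather than made
unconditional.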

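As Dave Hansen suggested in response to the original version of the
patch, I've made it so that establishing a mapping between the domain
and logical node has to be done explicitly (via
establish_domain_mapping()) instead of implicitly on the first lookup;
domain_to_nid() is now a pure lookup which returns -1 for a domain
that has not been mapped.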

This patch exposes some latent issues in the interaction of cpu hotplug,
numa, and sched domains which are addressed in the next patch.


 arch/ppc64/mm/numa.c         |  208 ++++++++++++++++++++++++++-----------------
 include/asm-ppc64/mmzone.h   |   17 ---
 include/asm-ppc64/topology.h |   10 --
 3 files changed, 130 insertions(+), 105 deletions(-)

Signed-off-by: Nathan Lynch <ntl at pobox.com>

Index: linux-2.6.12-rc3-mm3/arch/ppc64/mm/numa.c
===================================================================

--- linux-2.6.12-rc3-mm3.orig/arch/ppc64/mm/numa.c
+++ linux-2.6.12-rc3-mm3/arch/ppc64/mm/numa.c
@@ -26,11 +26,7 @@ static int numa_enabled = 1;
 static int numa_debug;
 #define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
 
-#ifdef DEBUG_NUMA
 #define ARRAY_INITIALISER -1
-#else
-#define ARRAY_INITIALISER 0
-#endif
 
 int numa_cpu_lookup_table[NR_CPUS] = { [ 0 ... (NR_CPUS - 1)] =
 	ARRAY_INITIALISER};
@@ -58,6 +54,64 @@ EXPORT_SYMBOL(numa_memory_lookup_table);
 EXPORT_SYMBOL(numa_cpumask_lookup_table);
 EXPORT_SYMBOL(nr_cpus_in_node);
 
+#define INVALID_DOMAIN (-1)
+static int nid_to_domain_tbl[MAX_NUMNODES] = { [0 ... MAX_NUMNODES - 1] = INVALID_DOMAIN };
+
+static int nid_to_domain(int nid)
+{
+	BUG_ON(nid >= MAX_NUMNODES);
+	BUG_ON(nid < 0);
+
+	return nid_to_domain_tbl[nid];
+}
+
+/* Returns -1 if domain not mapped */
+static int domain_to_nid(int domain)
+{
+	int nid;
+
+	WARN_ON(domain == INVALID_DOMAIN);
+
+	for (nid = 0; nid < MAX_NUMNODES; nid++) {
+		int tmp = nid_to_domain(nid);
+		if (tmp == domain)
+			return nid;
+	}
+
+	return -1;
+}
+
+/* Map the given domain to the next available node id if it is not
+ * already mapped.  If this is a new mapping, set the node online.
+ */
+static int __init establish_domain_mapping(int domain)
+{
+	int nid;
+
+	WARN_ON(domain == INVALID_DOMAIN);
+
+	for (nid = 0; nid < MAX_NUMNODES; nid++) {
+		if (nid_to_domain_tbl[nid] == domain) {
+			WARN_ON(!node_online(nid));
+			return nid;
+		}
+		else if (nid_to_domain_tbl[nid] != INVALID_DOMAIN)
+			continue;
+
+		printk(KERN_INFO
+		       "Mapping platform domain %i to logical node %i\n",
+			domain, nid);
+
+		nid_to_domain_tbl[nid] = domain;
+		node_set_online(nid);
+		return nid;
+	}
+	printk(KERN_WARNING "nid_to_domain_tbl full; time to increase"
+		" NODES_SHIFT?\n");
+
+	return -1;
+}
+
 static inline void map_cpu_to_node(int cpu, int node)
 {
 	numa_cpu_lookup_table[cpu] = node;
@@ -126,16 +180,23 @@ static int of_node_numa_domain(struct de
 	unsigned int *tmp;
 
 	if (min_common_depth == -1)
-		return 0;
+		return INVALID_DOMAIN;
 
 	tmp = of_get_associativity(device);
 	if (tmp && (tmp[0] >= min_common_depth)) {
 		numa_domain = tmp[min_common_depth];
 	} else {
-		dbg("WARNING: no NUMA information for %s\n",
+		dbg("no NUMA information for %s\n",
 		    device->full_name);
-		numa_domain = 0;
+		numa_domain = INVALID_DOMAIN;
 	}
+
+	/* POWER4 LPAR uses 0xffff for invalid domain;
+	 * fix that up here so callers don't have to worry about it.
+	 */
+	if (numa_domain == 0xffff)
+		numa_domain = INVALID_DOMAIN;
+
 	return numa_domain;
 }
 
@@ -223,12 +284,12 @@ static unsigned long read_n_cells(int n,
 }
 
 /*
- * Figure out to which domain a cpu belongs and stick it there.
- * Return the id of the domain used.
+ * Figure out to which node a cpu belongs and stick it there.
+ * Return the id of the node used.
  */
 static int numa_setup_cpu(unsigned long lcpu)
 {
-	int numa_domain = 0;
+	int nid = 0, numa_domain = INVALID_DOMAIN;
 	struct device_node *cpu = find_cpu_node(lcpu);
 
 	if (!cpu) {
@@ -238,25 +299,17 @@ static int numa_setup_cpu(unsigned long 
 
 	numa_domain = of_node_numa_domain(cpu);
 
-	if (numa_domain >= num_online_nodes()) {
-		/*
-		 * POWER4 LPAR uses 0xffff as invalid node,
-		 * dont warn in this case.
-		 */
-		if (numa_domain != 0xffff)
-			printk(KERN_ERR "WARNING: cpu %ld "
-			       "maps to invalid NUMA node %d\n",
-			       lcpu, numa_domain);
-		numa_domain = 0;
-	}
-out:
-	node_set_online(numa_domain);
+	if (numa_domain != INVALID_DOMAIN)
+		nid = domain_to_nid(numa_domain);
 
-	map_cpu_to_node(lcpu, numa_domain);
+	if (nid < 0)
+		nid = 0;
+out:
+	map_cpu_to_node(lcpu, nid);
 
 	of_node_put(cpu);
 
-	return numa_domain;
+	return nid;
 }
 
 static int cpu_numa_callback(struct notifier_block *nfb,
@@ -278,8 +331,8 @@ static int cpu_numa_callback(struct noti
 	case CPU_DEAD:
 	case CPU_UP_CANCELED:
 		unmap_cpu_from_node(lcpu);
-		break;
 		ret = NOTIFY_OK;
+		break;
 #endif
 	}
 	return ret;
@@ -319,7 +372,6 @@ static int __init parse_numa_properties(
 	struct device_node *cpu = NULL;
 	struct device_node *memory = NULL;
 	int addr_cells, size_cells;
-	int max_domain = 0;
 	long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
 	unsigned long i;
 
@@ -341,37 +393,13 @@ static int __init parse_numa_properties(
 	if (min_common_depth < 0)
 		return min_common_depth;
 
-	max_domain = numa_setup_cpu(boot_cpuid);
-
-	/*
-	 * Even though we connect cpus to numa domains later in SMP init,
-	 * we need to know the maximum node id now. This is because each
-	 * node id must have NODE_DATA etc backing it.
-	 * As a result of hotplug we could still have cpus appear later on
-	 * with larger node ids. In that case we force the cpu into node 0.
-	 */
-	for_each_cpu(i) {
-		int numa_domain;
-
-		cpu = find_cpu_node(i);
-
-		if (cpu) {
-			numa_domain = of_node_numa_domain(cpu);
-			of_node_put(cpu);
-
-			if (numa_domain < MAX_NUMNODES &&
-			    max_domain < numa_domain)
-				max_domain = numa_domain;
-		}
-	}
-
 	addr_cells = get_mem_addr_cells();
 	size_cells = get_mem_size_cells();
 	memory = NULL;
 	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
 		unsigned long start;
 		unsigned long size;
-		int numa_domain;
+		int numa_domain, nid;
 		int ranges;
 		unsigned int *memcell_buf;
 		unsigned int len;
@@ -391,17 +419,16 @@ new_range:
 
 		numa_domain = of_node_numa_domain(memory);
 
-		if (numa_domain >= MAX_NUMNODES) {
-			if (numa_domain != 0xffff)
-				printk(KERN_ERR "WARNING: memory at %lx maps "
-				       "to invalid NUMA node %d\n", start,
-				       numa_domain);
-			numa_domain = 0;
+		if (numa_domain < 0)
+			nid = 0;
+		else {
+			nid = domain_to_nid(numa_domain);
+			if (nid < 0)
+				nid = establish_domain_mapping(numa_domain);
+			if (nid < 0)
+				nid = 0;
 		}
 
-		if (max_domain < numa_domain)
-			max_domain = numa_domain;
-
 		if (! (size = numa_enforce_memory_limit(start, size))) {
 			if (--ranges)
 				goto new_range;
@@ -412,41 +439,53 @@ new_range:
 		/*
 		 * Initialize new node struct, or add to an existing one.
 		 */
-		if (init_node_data[numa_domain].node_end_pfn) {
+		if (init_node_data[nid].node_end_pfn) {
 			if ((start / PAGE_SIZE) <
-			    init_node_data[numa_domain].node_start_pfn)
-				init_node_data[numa_domain].node_start_pfn =
+			    init_node_data[nid].node_start_pfn)
+				init_node_data[nid].node_start_pfn =
 					start / PAGE_SIZE;
 			if (((start / PAGE_SIZE) + (size / PAGE_SIZE)) >
-			    init_node_data[numa_domain].node_end_pfn)
-				init_node_data[numa_domain].node_end_pfn =
+			    init_node_data[nid].node_end_pfn)
+				init_node_data[nid].node_end_pfn =
 					(start / PAGE_SIZE) +
 					(size / PAGE_SIZE);
 
-			init_node_data[numa_domain].node_present_pages +=
+			init_node_data[nid].node_present_pages +=
 				size / PAGE_SIZE;
 		} else {
-			node_set_online(numa_domain);
-
-			init_node_data[numa_domain].node_start_pfn =
+			init_node_data[nid].node_start_pfn =
 				start / PAGE_SIZE;
-			init_node_data[numa_domain].node_end_pfn =
-				init_node_data[numa_domain].node_start_pfn +
+			init_node_data[nid].node_end_pfn =
+				init_node_data[nid].node_start_pfn +
 				size / PAGE_SIZE;
-			init_node_data[numa_domain].node_present_pages =
+			init_node_data[nid].node_present_pages =
 				size / PAGE_SIZE;
 		}
 
 		for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
 			numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
-				numa_domain;
+				nid;
 
 		if (--ranges)
 			goto new_range;
 	}
 
-	for (i = 0; i <= max_domain; i++)
-		node_set_online(i);
+	/* We need to establish domain<->nid mappings for any
+	 * cpu nodes in the device tree with domains which were not
+	 * encountered in the memory loop above.
+	 */
+	while ((cpu = of_find_node_by_type(cpu, "cpu"))) {
+		int domain = of_node_numa_domain(cpu);
+		if (domain < 0)
+			continue;
+		if (domain_to_nid(domain) < 0)
+			establish_domain_mapping(domain);
+	}
+
+	/* Secondary logical cpus are associated with nids later in
+	 * boot, but we need to explicitly set up the boot cpu.
+	 */
+	numa_setup_cpu(boot_cpuid);
 
 	return 0;
 }
@@ -541,7 +580,7 @@ static unsigned long careful_allocation(
 	 * If the memory came from a previously allocated node, we must
 	 * retry with the bootmem allocator.
 	 */
-	if (pa_to_nid(ret) < nid) {
+	if (pa_to_nid(ret) != nid) {
 		nid = pa_to_nid(ret);
 		ret = (unsigned long)__alloc_bootmem_node(NODE_DATA(nid),
 				size, align, 0);
@@ -632,7 +671,7 @@ void __init do_init_bootmem(void)
 		memory = NULL;
 		while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
 			unsigned long mem_start, mem_size;
-			int numa_domain, ranges;
+			int numa_domain, ranges, thisnid;
 			unsigned int *memcell_buf;
 			unsigned int len;
 
@@ -644,9 +683,18 @@ void __init do_init_bootmem(void)
 new_range:
 			mem_start = read_n_cells(addr_cells, &memcell_buf);
 			mem_size = read_n_cells(size_cells, &memcell_buf);
-			numa_domain = numa_enabled ? of_node_numa_domain(memory) : 0;
 
-			if (numa_domain != nid)
+			if (numa_enabled)
+				numa_domain = of_node_numa_domain(memory);
+			else
+				numa_domain = -1;
+
+			if (numa_domain < 0)
+				thisnid = 0;
+			else
+				thisnid = domain_to_nid(numa_domain);
+
+			if (thisnid != nid)
 				continue;
 
 			mem_size = numa_enforce_memory_limit(mem_start, mem_size);
Index: linux-2.6.12-rc3-mm3/include/asm-ppc64/mmzone.h
===================================================================
--- linux-2.6.12-rc3-mm3.orig/include/asm-ppc64/mmzone.h
+++ linux-2.6.12-rc3-mm3/include/asm-ppc64/mmzone.h
@@ -27,24 +27,9 @@ extern int nr_cpus_in_node[];
 #define MEMORY_INCREMENT_SHIFT 24
 #define MEMORY_INCREMENT (1UL << MEMORY_INCREMENT_SHIFT)
 
-/* NUMA debugging, will not work on a DLPAR machine */
-#undef DEBUG_NUMA
-
 static inline int pa_to_nid(unsigned long pa)
 {
-	int nid;
-
-	nid = numa_memory_lookup_table[pa >> MEMORY_INCREMENT_SHIFT];
-
-#ifdef DEBUG_NUMA
-	/* the physical address passed in is not in the map for the system */
-	if (nid == -1) {
-		printk("bad address: %lx\n", pa);
-		BUG();
-	}
-#endif
-
-	return nid;
+	return numa_memory_lookup_table[pa >> MEMORY_INCREMENT_SHIFT];
 }
 
 #define pfn_to_nid(pfn)		pa_to_nid((pfn) << PAGE_SHIFT)
Index: linux-2.6.12-rc3-mm3/include/asm-ppc64/topology.h
===================================================================
--- linux-2.6.12-rc3-mm3.orig/include/asm-ppc64/topology.h
+++ linux-2.6.12-rc3-mm3/include/asm-ppc64/topology.h
@@ -8,15 +8,7 @@
 
 static inline int cpu_to_node(int cpu)
 {
-	int node;
-
-	node = numa_cpu_lookup_table[cpu];
-
-#ifdef DEBUG_NUMA
-	BUG_ON(node == -1);
-#endif
-
-	return node;
+	return numa_cpu_lookup_table[cpu];
 }
 
 #define parent_node(node)	(node)