[PATCH] NUMA memory fixup

Mike Kravetz kravetz at us.ibm.com
Wed Mar 2 09:27:13 EST 2005


When I booted my new 720 on a kernel configured for NUMA, I received
the following during bootup:

WARNING: Unexpected node layout: region start 44000000 length 2000000
NUMA is disabled

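This is due to memory 'holes' within nodes.  The current code expects
each node's memory regions to be contiguous: if a region does not start
exactly where the node's previous region ended, NUMA is disabled.  The
following patch adds support for such configurations.  My 720 now boots
with the following message: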

[boot]0012 Setup Arch
Node 0 Memory: 0x0-0x8000000 0x44000000-0x12a000000
Node 1 Memory: 0x8000000-0x44000000 0x12a000000-0x1ea000000

I'd appreciate any comments on the approach taken.  I'm also working
on adding NUMA support on top of the SPARSEMEM implementation being
pushed as part of memory hot add.  However, it seems important to get
the current implementation based on DISCONTIGMEM working first.  This
patch is against 2.6.11-rc3, but I can provide a later version if needed.

-- 
Signed-off-by: Mike Kravetz <kravetz at us.ibm.com>


diff -Naupr linux-2.6.11-rc3/arch/ppc64/mm/numa.c linux-2.6.11-rc3.work/arch/ppc64/mm/numa.c
--- linux-2.6.11-rc3/arch/ppc64/mm/numa.c	2005-02-03 01:57:16.000000000 +0000
+++ linux-2.6.11-rc3.work/arch/ppc64/mm/numa.c	2005-03-01 19:39:21.000000000 +0000
@@ -40,7 +40,6 @@ int nr_cpus_in_node[MAX_NUMNODES] = { [0
 
 struct pglist_data *node_data[MAX_NUMNODES];
 bootmem_data_t __initdata plat_node_bdata[MAX_NUMNODES];
-static unsigned long node0_io_hole_size;
 static int min_common_depth;
 
 /*
@@ -49,7 +48,8 @@ static int min_common_depth;
  */
 static struct {
 	unsigned long node_start_pfn;
-	unsigned long node_spanned_pages;
+	unsigned long node_end_pfn;
+	unsigned long node_present_pages;
 } init_node_data[MAX_NUMNODES] __initdata;
 
 EXPORT_SYMBOL(node_data);
@@ -348,33 +348,28 @@ new_range:
 		if (max_domain < numa_domain)
 			max_domain = numa_domain;
 
-		/* 
-		 * For backwards compatibility, OF splits the first node
-		 * into two regions (the first being 0-4GB). Check for
-		 * this simple case and complain if there is a gap in
-		 * memory
+		/*
+		 * Initialize new node struct, or add to an existing one.
 		 */
-		if (init_node_data[numa_domain].node_spanned_pages) {
-			unsigned long shouldstart =
-				init_node_data[numa_domain].node_start_pfn +
-				init_node_data[numa_domain].node_spanned_pages;
-			if (shouldstart != (start / PAGE_SIZE)) {
-				/* Revert to non-numa for now */
-				printk(KERN_ERR
-				       "WARNING: Unexpected node layout: "
-				       "region start %lx length %lx\n",
-				       start, size);
-				printk(KERN_ERR "NUMA is disabled\n");
-				goto err;
-			}
-			init_node_data[numa_domain].node_spanned_pages +=
+		if (init_node_data[numa_domain].node_end_pfn) {
+			if ((start / PAGE_SIZE) <
+			    init_node_data[numa_domain].node_start_pfn)
+				init_node_data[numa_domain].node_start_pfn =
+					start / PAGE_SIZE;
+			else
+				init_node_data[numa_domain].node_end_pfn =
+					(start / PAGE_SIZE) +
+					(size / PAGE_SIZE);
+
+			init_node_data[numa_domain].node_present_pages +=
 				size / PAGE_SIZE;
 		} else {
 			node_set_online(numa_domain);
 
 			init_node_data[numa_domain].node_start_pfn =
 				start / PAGE_SIZE;
-			init_node_data[numa_domain].node_spanned_pages =
+			init_node_data[numa_domain].node_end_pfn =
+				init_node_data[numa_domain].node_start_pfn +
 				size / PAGE_SIZE;
 		}
 
@@ -391,14 +386,6 @@ new_range:
 		node_set_online(i);
 
 	return 0;
-err:
-	/* Something has gone wrong; revert any setup we've done */
-	for_each_node(i) {
-		node_set_offline(i);
-		init_node_data[i].node_start_pfn = 0;
-		init_node_data[i].node_spanned_pages = 0;
-	}
-	return -1;
 }
 
 static void __init setup_nonnuma(void)
@@ -426,12 +413,11 @@ static void __init setup_nonnuma(void)
 	node_set_online(0);
 
 	init_node_data[0].node_start_pfn = 0;
-	init_node_data[0].node_spanned_pages = lmb_end_of_DRAM() / PAGE_SIZE;
+	init_node_data[0].node_end_pfn = lmb_end_of_DRAM() / PAGE_SIZE;
+	init_node_data[0].node_present_pages = total_ram / PAGE_SIZE;
 
 	for (i = 0 ; i < top_of_ram; i += MEMORY_INCREMENT)
 		numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] = 0;
-
-	node0_io_hole_size = top_of_ram - total_ram;
 }
 
 static void __init dump_numa_topology(void)
@@ -512,6 +498,7 @@ static unsigned long careful_allocation(
 void __init do_init_bootmem(void)
 {
 	int nid;
+	struct device_node *memory = NULL;
 	static struct notifier_block ppc64_numa_nb = {
 		.notifier_call = cpu_numa_callback,
 		.priority = 1 /* Must run before sched domains notifier. */
@@ -535,7 +522,7 @@ void __init do_init_bootmem(void)
 		unsigned long bootmap_pages;
 
 		start_paddr = init_node_data[nid].node_start_pfn * PAGE_SIZE;
-		end_paddr = start_paddr + (init_node_data[nid].node_spanned_pages * PAGE_SIZE);
+		end_paddr = init_node_data[nid].node_end_pfn * PAGE_SIZE;
 
 		/* Allocate the node structure node local if possible */
 		NODE_DATA(nid) = (struct pglist_data *)careful_allocation(nid,
@@ -551,9 +538,9 @@ void __init do_init_bootmem(void)
 		NODE_DATA(nid)->node_start_pfn =
 			init_node_data[nid].node_start_pfn;
 		NODE_DATA(nid)->node_spanned_pages =
-			init_node_data[nid].node_spanned_pages;
+			end_paddr - start_paddr;
 
-		if (init_node_data[nid].node_spanned_pages == 0)
+		if (NODE_DATA(nid)->node_spanned_pages == 0)
   			continue;
 
   		dbg("start_paddr = %lx\n", start_paddr);
@@ -572,33 +559,48 @@ void __init do_init_bootmem(void)
 				  start_paddr >> PAGE_SHIFT,
 				  end_paddr >> PAGE_SHIFT);
 
-		for (i = 0; i < lmb.memory.cnt; i++) {
-			unsigned long physbase, size;
-
-			physbase = lmb.memory.region[i].physbase;
-			size = lmb.memory.region[i].size;
-
-			if (physbase < end_paddr &&
-			    (physbase+size) > start_paddr) {
-				/* overlaps */
-				if (physbase < start_paddr) {
-					size -= start_paddr - physbase;
-					physbase = start_paddr;
-				}
-
-				if (size > end_paddr - physbase)
-					size = end_paddr - physbase;
-
-				dbg("free_bootmem %lx %lx\n", physbase, size);
-				free_bootmem_node(NODE_DATA(nid), physbase,
-						  size);
+		/*
+		 * We need to do another scan of all memory sections to
+		 * associate memory with the correct node.
+		 */
+		memory = NULL;
+		while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
+			unsigned long mem_start, mem_size;
+			int numa_domain;
+			unsigned int *memcell_buf;
+			unsigned int len;
+
+			memcell_buf = (unsigned int *)get_property(memory, "reg", &len);
+			if (!memcell_buf || len <= 0)
+				continue;
+
+			mem_start = read_cell_ul(memory, &memcell_buf);
+			mem_size = read_cell_ul(memory, &memcell_buf);
+			numa_domain = of_node_numa_domain(memory);
+
+			if (numa_domain != nid)
+				continue;
+
+			if (mem_start < end_paddr &&
+			    (mem_start+mem_size) > start_paddr) {
+				/* should be no overlaps ! */
+				dbg("free_bootmem %lx %lx\n", mem_start, mem_size);
+				free_bootmem_node(NODE_DATA(nid), mem_start,
+						  mem_size);
 			}
 		}
 
+		/*
+		 * Mark reserved regions on this node
+		 */
 		for (i = 0; i < lmb.reserved.cnt; i++) {
 			unsigned long physbase = lmb.reserved.region[i].physbase;
 			unsigned long size = lmb.reserved.region[i].size;
 
+			if (pa_to_nid(physbase) != nid &&
+			    pa_to_nid(physbase+size-1) != nid)
+				continue;
+
 			if (physbase < end_paddr &&
 			    (physbase+size) > start_paddr) {
 				/* overlaps */
@@ -632,13 +634,12 @@ void __init paging_init(void)
 		unsigned long start_pfn;
 		unsigned long end_pfn;
 
-		start_pfn = plat_node_bdata[nid].node_boot_start >> PAGE_SHIFT;
-		end_pfn = plat_node_bdata[nid].node_low_pfn;
+		start_pfn = init_node_data[nid].node_start_pfn;
+		end_pfn = init_node_data[nid].node_end_pfn;
 
 		zones_size[ZONE_DMA] = end_pfn - start_pfn;
-		zholes_size[ZONE_DMA] = 0;
-		if (nid == 0)
-			zholes_size[ZONE_DMA] = node0_io_hole_size >> PAGE_SHIFT;
+		zholes_size[ZONE_DMA] = zones_size[ZONE_DMA] -
+			init_node_data[nid].node_present_pages;
 
 		dbg("free_area_init node %d %lx %lx (hole: %lx)\n", nid,
 		    zones_size[ZONE_DMA], start_pfn, zholes_size[ZONE_DMA]);



More information about the Linuxppc64-dev mailing list