[RFC/PATCH] ppc64: Add mem=X option

Michael Ellerman michael at ellerman.id.au
Tue Feb 22 19:24:23 EST 2005


Hi Anton, Ben, and the rest of ya,

Here is my first take at adding support for the mem=X boot option.
Please check it out.

Anton, can you have a look at the NUMA code? It's probably bogus. It works on vego, but those LPARs only have one NUMA node; if you can try it on some beefy NUMA machine, that'd be sweet.

I've successfully booted this on iSeries, Power3, Power4/5 LPAR and a G5.

cheers!

---
 arch/ppc64/kernel/iSeries_setup.c |   38 ++++++++----
 arch/ppc64/kernel/lmb.c           |   31 +++++++++
 arch/ppc64/kernel/prom.c          |   15 ++++
 arch/ppc64/kernel/prom_init.c     |  118 +++++++++++++++++++++++++++++++++++---
 arch/ppc64/kernel/setup.c         |   14 +++-
 arch/ppc64/mm/hash_utils.c        |   19 +++++-
 arch/ppc64/mm/numa.c              |   32 +++++++++-
 include/asm-ppc64/lmb.h           |    1
 8 files changed, 238 insertions(+), 30 deletions(-)


Index: latest/arch/ppc64/kernel/setup.c
===================================================================
--- latest.orig/arch/ppc64/kernel/setup.c
+++ latest/arch/ppc64/kernel/setup.c
@@ -641,12 +641,11 @@ void __init setup_system(void)
 	early_console_initialized = 1;
 	register_console(&udbg_console);
 
-#endif /* !CONFIG_PPC_ISERIES */
-
 	/* Save unparsed command line copy for /proc/cmdline */
 	strlcpy(saved_command_line, cmd_line, COMMAND_LINE_SIZE);
 
 	parse_early_param();
+#endif /* !CONFIG_PPC_ISERIES */
 
 #if defined(CONFIG_SMP) && !defined(CONFIG_PPC_ISERIES)
 	/*
@@ -805,9 +804,16 @@ struct seq_operations cpuinfo_op = {
 	.show =	show_cpuinfo,
 };
 
-#if 0 /* XXX not currently used */
+/* These three variables are used to save values passed to us by prom_init()
+ * via the device tree. The TCE variables are needed because with a memory_limit
+ * in force we may need to explicitly map the TCE area at the top of RAM. */
 unsigned long memory_limit;
+unsigned long tce_alloc_start;
+unsigned long tce_alloc_end;
 
+#ifdef CONFIG_PPC_ISERIES
+/* On iSeries we just parse the mem=X option from the command line.
+ * On pSeries it's a bit more complicated, see prom_init_mem() */
 static int __init early_parsemem(char *p)
 {
 	if (!p)
@@ -818,7 +824,7 @@ static int __init early_parsemem(char *p
 	return 0;
 }
 early_param("mem", early_parsemem);
-#endif
+#endif /* CONFIG_PPC_ISERIES */
 
 #ifdef CONFIG_PPC_MULTIPLATFORM
 static int __init set_preferred_console(void)
Index: latest/arch/ppc64/kernel/lmb.c
===================================================================
--- latest.orig/arch/ppc64/kernel/lmb.c
+++ latest/arch/ppc64/kernel/lmb.c
@@ -344,3 +344,34 @@ lmb_abs_to_phys(unsigned long aa)
 
 	return pa;
 }
+
+/* Truncate the lmb list to memory_limit if it's set.
+ * You must call lmb_analyze() after this. */
+void __init lmb_apply_memory_limit(void)
+{
+	extern unsigned long memory_limit;
+	unsigned long i, total = 0, crop;
+	struct lmb_region *mem = &(lmb.memory);
+
+	if (likely(!memory_limit))
+		return;
+
+	for (i = 0; i < mem->cnt; i++) {
+		total += mem->region[i].size;
+		
+		if (total <= memory_limit)
+			continue;
+			
+		crop = (memory_limit - (total - mem->region[i].size));
+#ifdef DEBUG
+		udbg_printf("lmb_truncate(): truncating at region %x\n", i);
+		udbg_printf("lmb_truncate(): total = %x\n", total);
+		udbg_printf("lmb_truncate(): size  = %x\n", mem->region[i].size);
+		udbg_printf("lmb_truncate(): crop = %x\n", crop);
+#endif
+
+		mem->region[i].size = crop;
+		mem->cnt = i + 1;
+		break;
+	}
+}
Index: latest/include/asm-ppc64/lmb.h
===================================================================
--- latest.orig/include/asm-ppc64/lmb.h
+++ latest/include/asm-ppc64/lmb.h
@@ -53,6 +53,7 @@ extern unsigned long __init lmb_alloc_ba
 extern unsigned long __init lmb_phys_mem_size(void);
 extern unsigned long __init lmb_end_of_DRAM(void);
 extern unsigned long __init lmb_abs_to_phys(unsigned long);
+extern void __init lmb_apply_memory_limit(void);
 
 extern void lmb_dump_all(void);
 
Index: latest/arch/ppc64/kernel/iSeries_setup.c
===================================================================
--- latest.orig/arch/ppc64/kernel/iSeries_setup.c
+++ latest/arch/ppc64/kernel/iSeries_setup.c
@@ -284,7 +284,7 @@ unsigned long iSeries_process_mainstore_
 	return mem_blocks;
 }
 
-static void __init iSeries_parse_cmdline(void)
+static void __init iSeries_get_cmdline(void)
 {
 	char *p, *q;
 
@@ -304,6 +304,8 @@ static void __init iSeries_parse_cmdline
 
 /*static*/ void __init iSeries_init_early(void)
 {
+	extern unsigned long memory_limit;
+
 	DBG(" -> iSeries_init_early()\n");
 
 	ppcdbg_initialize();
@@ -351,6 +353,29 @@ static void __init iSeries_parse_cmdline
 	 */
 	build_iSeries_Memory_Map();
 
+	iSeries_get_cmdline();
+	
+	/* Save unparsed command line copy for /proc/cmdline */
+	strlcpy(saved_command_line, cmd_line, COMMAND_LINE_SIZE);
+	
+	/* Parse early parameters, in particular mem=x */
+	parse_early_param();
+	
+	if (unlikely(memory_limit)) {
+		if (memory_limit > systemcfg->physicalMemorySize)
+			printk("Ignoring 'mem' option, value %lu is too large.\n", memory_limit);
+		else
+			systemcfg->physicalMemorySize = memory_limit;
+	}
+	
+	/* Bolt kernel mappings for all of memory */
+	iSeries_bolt_kernel(0, systemcfg->physicalMemorySize);
+
+	lmb_init();
+	lmb_add(0, systemcfg->physicalMemorySize);
+	lmb_analyze();	/* ?? */
+	lmb_reserve(0, __pa(klimit));
+	
 	/* Initialize machine-dependency vectors */
 #ifdef CONFIG_SMP
 	smp_init_iSeries();
@@ -376,9 +401,6 @@ static void __init iSeries_parse_cmdline
 		initrd_start = initrd_end = 0;
 #endif /* CONFIG_BLK_DEV_INITRD */
 
-
-	iSeries_parse_cmdline();
-
 	DBG(" <- iSeries_init_early()\n");
 }
 
@@ -539,14 +561,6 @@ static void __init build_iSeries_Memory_
 	 *   nextPhysChunk
 	 */
 	systemcfg->physicalMemorySize = chunk_to_addr(nextPhysChunk);
-
-	/* Bolt kernel mappings for all of memory */
-	iSeries_bolt_kernel(0, systemcfg->physicalMemorySize);
-
-	lmb_init();
-	lmb_add(0, systemcfg->physicalMemorySize);
-	lmb_analyze();	/* ?? */
-	lmb_reserve(0, __pa(klimit));
 }
 
 /*
Index: latest/arch/ppc64/kernel/prom.c
===================================================================
--- latest.orig/arch/ppc64/kernel/prom.c
+++ latest/arch/ppc64/kernel/prom.c
@@ -875,6 +875,8 @@ static int __init early_init_dt_scan_cho
 					    const char *full_path, void *data)
 {
 	u32 *prop;
+	u64 *prop64;
+	extern unsigned long memory_limit, tce_alloc_start, tce_alloc_end;
 
 	if (strcmp(full_path, "/chosen") != 0)
 		return 0;
@@ -891,6 +893,18 @@ static int __init early_init_dt_scan_cho
 	if (get_flat_dt_prop(node, "linux,iommu-force-on", NULL) != NULL)
 		iommu_force_on = 1;
 
+	prop64 = (u64*)get_flat_dt_prop(node, "linux,memory-limit", NULL);
+	if (prop64)
+		memory_limit = *prop64;
+		
+	prop64 = (u64*)get_flat_dt_prop(node, "linux,tce-alloc-start", NULL);
+	if (prop64)
+		tce_alloc_start = *prop64;
+		
+	prop64 = (u64*)get_flat_dt_prop(node, "linux,tce-alloc-end", NULL);
+	if (prop64)
+		tce_alloc_end = *prop64;
+
 #ifdef CONFIG_PPC_PSERIES
 	/* To help early debugging via the front panel, we retreive a minimal
 	 * set of RTAS infos now if available
@@ -1030,6 +1044,7 @@ void __init early_init_devtree(void *par
 	lmb_init();
 	scan_flat_dt(early_init_dt_scan_root, NULL);
 	scan_flat_dt(early_init_dt_scan_memory, NULL);
+	lmb_apply_memory_limit();
 	lmb_analyze();
 	systemcfg->physicalMemorySize = lmb_phys_mem_size();
 	lmb_reserve(0, __pa(klimit));
Index: latest/arch/ppc64/mm/hash_utils.c
===================================================================
--- latest.orig/arch/ppc64/mm/hash_utils.c
+++ latest/arch/ppc64/mm/hash_utils.c
@@ -140,6 +140,8 @@ void __init htab_initialize(void)
 	unsigned long pteg_count;
 	unsigned long mode_rw;
 	int i, use_largepages = 0;
+	unsigned long base = 0, size = 0;
+	extern unsigned long memory_limit, tce_alloc_start, tce_alloc_end;
 
 	DBG(" -> htab_initialize()\n");
 
@@ -195,8 +197,6 @@ void __init htab_initialize(void)
 
 	/* create bolted the linear mapping in the hash table */
 	for (i=0; i < lmb.memory.cnt; i++) {
-		unsigned long base, size;
-
 		base = lmb.memory.region[i].physbase + KERNELBASE;
 		size = lmb.memory.region[i].size;
 
@@ -225,6 +225,21 @@ void __init htab_initialize(void)
 #endif /* CONFIG_U3_DART */
 		create_pte_mapping(base, base + size, mode_rw, use_largepages);
 	}
+
+	/* If we have a memory_limit and we've allocated TCEs then we need to
+	 * explicitly map the TCE area at the top of RAM. We also cope with the
+	 * case that the TCEs start below memory_limit. */
+	if (unlikely(memory_limit && tce_alloc_start && tce_alloc_end)) {
+		tce_alloc_start += KERNELBASE;
+		tce_alloc_end += KERNELBASE;
+		
+		if (base + size >= tce_alloc_start)
+			tce_alloc_start = base + size + 1;
+		
+		create_pte_mapping(tce_alloc_start, tce_alloc_end,
+			mode_rw, use_largepages);
+	}
+	
 	DBG(" <- htab_initialize()\n");
 }
 #undef KB
Index: latest/arch/ppc64/mm/numa.c
===================================================================
--- latest.orig/arch/ppc64/mm/numa.c
+++ latest/arch/ppc64/mm/numa.c
@@ -270,6 +270,7 @@ static int __init parse_numa_properties(
 	int max_domain = 0;
 	long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
 	unsigned long i;
+	extern unsigned long memory_limit;
 
 	if (numa_enabled == 0) {
 		printk(KERN_WARNING "NUMA disabled by user\n");
@@ -378,7 +379,7 @@ new_range:
 				size / PAGE_SIZE;
 		}
 
-		for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
+		for (i = start; i < (start+size) && i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT)
 			numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
 				numa_domain;
 
@@ -387,8 +388,33 @@ new_range:
 			goto new_range;
 	}
 
-	for (i = 0; i <= max_domain; i++)
-		node_set_online(i);
+	if (unlikely(memory_limit)) {
+		unsigned long size, total = 0;
+
+		for (i = 0; i <= max_domain; i++) {
+			size = init_node_data[i].node_spanned_pages * PAGE_SIZE;
+			total += size;
+
+			if (total <= memory_limit)
+				continue;
+
+			size = (memory_limit - (total - size)) / PAGE_SIZE;
+			dbg("NUMA: truncating node %ld to %ld pages\n", i, size);
+			init_node_data[i].node_spanned_pages = size;
+			break;
+		}
+
+		for (i++; i <= max_domain; i++) {
+			dbg("NUMA: offlining node %ld for memory_limit\n", i);
+			node_set_offline(i);
+			init_node_data[i].node_start_pfn = 0;
+			init_node_data[i].node_spanned_pages = 0;
+		}
+	} else {
+		/* FIXME do we need this? haven't we already done it in the else above? */
+		for (i = 0; i <= max_domain; i++)
+			node_set_online(i);
+	}
 
 	return 0;
 err:
Index: latest/arch/ppc64/kernel/prom_init.c
===================================================================
--- latest.orig/arch/ppc64/kernel/prom_init.c
+++ latest/arch/ppc64/kernel/prom_init.c
@@ -178,6 +178,9 @@ static int __initdata of_platform;
 
 static char __initdata prom_cmd_line[COMMAND_LINE_SIZE];
 
+static unsigned long __initdata memory_limit;
+static unsigned long __initdata tce_alloc_start;
+static unsigned long __initdata tce_alloc_end;
 static unsigned long __initdata alloc_top;
 static unsigned long __initdata alloc_top_high;
 static unsigned long __initdata alloc_bottom;
@@ -385,10 +388,64 @@ static int __init prom_setprop(phandle n
 			 (u32)(unsigned long) value, (u32) valuelen);
 }
 
+/* We can't use the standard versions because of RELOC headaches. */
+#define isxdigit(c)	(('0' <= (c) && (c) <= '9') \
+			 || ('a' <= (c) && (c) <= 'f') \
+			 || ('A' <= (c) && (c) <= 'F'))
+			 
+#define isdigit(c)	('0' <= (c) && (c) <= '9')
+#define islower(c)	('a' <= (c) && (c) <= 'z')
+#define toupper(c)	(islower(c) ? ((c) - 'a' + 'A') : (c))
+
+unsigned long prom_strtoul(const char *cp, const char **endp)
+{
+	unsigned long result = 0, base = 10, value;
+	
+	if (*cp == '0') {
+		base = 8;
+		cp++;
+		if (toupper(*cp) == 'X') {
+			cp++;
+			base = 16;
+		}
+	}
+
+	while (isxdigit(*cp) &&
+	       (value = isdigit(*cp) ? *cp - '0' : toupper(*cp) - 'A' + 10) < base) {
+		result = result * base + value;
+		cp++;
+	}
+	
+	if (endp)
+		*endp = cp;
+	
+	return result;
+}
+
+unsigned long prom_memparse(const char *ptr, const char **retptr)
+{
+	unsigned long ret = prom_strtoul(ptr, retptr);
+
+	switch (**retptr) {
+	case 'G':
+	case 'g':
+		ret <<= 10;
+	case 'M':
+	case 'm':
+		ret <<= 10;
+	case 'K':
+	case 'k':
+		ret <<= 10;
+		(*retptr)++;
+	default:
+		break;
+	}
+	return ret;
+}
 
 /*
  * Early parsing of the command line passed to the kernel, used for
- * the options that affect the iommu
+ * "mem=x" and the options that affect the iommu
  */
 static void __init early_cmdline_parse(void)
 {
@@ -419,6 +476,14 @@ static void __init early_cmdline_parse(v
 		else if (!strncmp(opt, RELOC("force"), 5))
 			RELOC(iommu_force_on) = 1;
 	}
+	
+	opt = strstr(RELOC(prom_cmd_line), RELOC("mem="));
+	if (opt) {
+		opt += 4;
+		RELOC(memory_limit) = prom_memparse(opt, (const char **)&opt);
+		/* Align to 16 MB == size of large page */
+		RELOC(memory_limit) = ALIGN(RELOC(memory_limit), 0x1000000);
+	}
 }
 
 /*
@@ -665,15 +730,7 @@ static void __init prom_init_mem(void)
 		}
 	}
 
-	/* Setup our top/bottom alloc points, that is top of RMO or top of
-	 * segment 0 when running non-LPAR
-	 */
-	if ( RELOC(of_platform) == PLATFORM_PSERIES_LPAR )
-		RELOC(alloc_top) = RELOC(rmo_top);
-	else
-		RELOC(alloc_top) = RELOC(rmo_top) = min(0x40000000ul, RELOC(ram_top));
 	RELOC(alloc_bottom) = PAGE_ALIGN(RELOC(klimit) - offset + 0x4000);
-	RELOC(alloc_top_high) = RELOC(ram_top);
 
 	/* Check if we have an initrd after the kernel, if we do move our bottom
 	 * point to after it
@@ -683,8 +740,37 @@ static void __init prom_init_mem(void)
 		    > RELOC(alloc_bottom))
 			RELOC(alloc_bottom) = PAGE_ALIGN(RELOC(prom_initrd_end));
 	}
+	
+	/* If memory_limit is set we reduce the upper limits *except* for
+	 * alloc_top_high. This must be the real top of RAM so we can put
+ * TCEs up there.  */
+	
+	RELOC(alloc_top_high) = RELOC(ram_top);
+	
+	if (unlikely(RELOC(memory_limit))) {
+		if (RELOC(memory_limit) <= RELOC(alloc_bottom)) {
+			prom_printf("Ignoring mem=%x <= alloc_bottom.\n",
+				RELOC(memory_limit));
+			RELOC(memory_limit) = 0;
+		} else if (RELOC(memory_limit) >= RELOC(ram_top)) {
+			prom_printf("Ignoring mem=%x >= ram_top.\n",
+				RELOC(memory_limit));
+			RELOC(memory_limit) = 0;
+		} else {
+			RELOC(ram_top) = RELOC(memory_limit);
+			RELOC(rmo_top) = min(RELOC(rmo_top), RELOC(memory_limit));
+		}
+	}
+
+	/* Setup our top alloc point, that is top of RMO or top of
+	 * segment 0 when running non-LPAR. */
+	if ( RELOC(of_platform) == PLATFORM_PSERIES_LPAR )
+		RELOC(alloc_top) = RELOC(rmo_top);
+	else
+		RELOC(alloc_top) = RELOC(rmo_top) = min(0x40000000ul, RELOC(ram_top));
 
 	prom_printf("memory layout at init:\n");
+	prom_printf("  memory_limit : %x\n", RELOC(memory_limit));
 	prom_printf("  alloc_bottom : %x\n", RELOC(alloc_bottom));
 	prom_printf("  alloc_top    : %x\n", RELOC(alloc_top));
 	prom_printf("  alloc_top_hi : %x\n", RELOC(alloc_top_high));
@@ -873,6 +959,11 @@ static void __init prom_initialize_tce_t
 
 	reserve_mem(local_alloc_bottom, local_alloc_top - local_alloc_bottom);
 
+	if (RELOC(memory_limit)) {
+		RELOC(tce_alloc_start) = local_alloc_bottom;
+		RELOC(tce_alloc_end) = local_alloc_top;
+	}
+	
 	/* Flag the first invalid entry */
 	prom_debug("ending prom_initialize_tce_table\n");
 }
@@ -1688,6 +1779,15 @@ unsigned long __init prom_init(unsigned 
 		prom_setprop(_prom->chosen, "linux,iommu-off", NULL, 0);
 	if (RELOC(iommu_force_on))
 		prom_setprop(_prom->chosen, "linux,iommu-force-on", NULL, 0);
+	if (RELOC(memory_limit))
+		prom_setprop(_prom->chosen, "linux,memory-limit",
+			PTRRELOC(&memory_limit), sizeof(RELOC(memory_limit)));
+	if (RELOC(tce_alloc_start))
+		prom_setprop(_prom->chosen, "linux,tce-alloc-start",
+			PTRRELOC(&tce_alloc_start), sizeof(RELOC(tce_alloc_start)));
+	if (RELOC(tce_alloc_end))
+		prom_setprop(_prom->chosen, "linux,tce-alloc-end",
+			PTRRELOC(&tce_alloc_end), sizeof(RELOC(tce_alloc_end)));
 
 	/*
 	 * Now finally create the flattened device-tree



More information about the Linuxppc64-dev mailing list