[RFC/PATCH] ppc64: Add mem=X option

Olof Johansson olof at austin.ibm.com
Wed Feb 23 14:40:03 EST 2005


On Tue, Feb 22, 2005 at 07:24:23PM +1100, Michael Ellerman wrote:
> Hi Anton, Ben, and the rest of ya,
> 
> Here is my first take at adding support for the mem=X boot option.
> Please check it out.

Yes, finally! :-)   This has been asked for many a time, but no one has
implemented it before. Good work! I have some comments below, most of
them are nitpicks.

> -#if 0 /* XXX not currently used */
> +/* These three variables are used to save values passed to us by prom_init()
> + * via the device tree. The TCE variables are needed because with a memory_limit
> + * in force we may need to explicitly map the TCE are at the top of RAM. */

I think it's common kernel coding style to put the final */ on a new
line to get all the *'s lined up. There are other comments in the patch
with the same style, I'll only point out this one.

> +unsigned long tce_alloc_start;
> +unsigned long tce_alloc_end;
>  
> +#ifdef CONFIG_PPC_ISERIES
> +/* On iSeries we just parse the mem=X option from the command line.
> + * On pSeries it's a bit more complicated, see prom_init_mem() */
>  static int __init early_parsemem(char *p)
>  {
>  	if (!p)
> @@ -818,7 +824,7 @@ static int __init early_parsemem(char *p
>  	return 0;
>  }
>  early_param("mem", early_parsemem);
> -#endif
> +#endif /* CONFIG_PPC_ISERIES */
>  
>  #ifdef CONFIG_PPC_MULTIPLATFORM
>  static int __init set_preferred_console(void)
> Index: latest/arch/ppc64/kernel/lmb.c
> ===================================================================
> --- latest.orig/arch/ppc64/kernel/lmb.c
> +++ latest/arch/ppc64/kernel/lmb.c
> @@ -344,3 +344,34 @@ lmb_abs_to_phys(unsigned long aa)
>  
>  	return pa;
>  }
> +
> +/* Truncate the lmb list to memory_limit if it's set
> + * You must call lmb_analyze() after this. */
> +void __init lmb_apply_memory_limit(void)
> +{
> +	extern unsigned long memory_limit;
> +	unsigned long i, total = 0, crop;
> +	struct lmb_region *mem = &(lmb.memory);
> +
> +	if (likely(!memory_limit))
> +		return;

No need to worry about likely/unlikely here. It's not a hot call path.

> +
> +	for (i = 0; i < mem->cnt; i++) {
> +		total += mem->region[i].size;
> +		
> +		if (total <= memory_limit)
> +			continue;
> +			
> +		crop = (memory_limit - (total - mem->region[i].size));
> +#ifdef DEBUG
> +		udbg_printf("lmb_truncate(): truncating at region %x\n", i);
> +		udbg_printf("lmb_truncate(): total = %x\n", total);
> +		udbg_printf("lmb_truncate(): size  = %x\n", mem->region[i].size);
> +		udbg_printf("lmb_truncate(): crop = %x\n", crop);
> +#endif
> +
> +		mem->region[i].size = crop;
> +		mem->cnt = i + 1;
> +		break;

With the above tests, there's a chance for the last LMB to be of size 0.
I don't think it matters for things to work, but you might as well skip
that one too.

Changing the test to (total < memory_limit) should take care of it, the
last one will just be cropped by nothing.

Also, if you just reduce the size of memory_limit instead of keep the
rolling total, the crop calculation will be simpler. It took a bit of
thinking to make sure it's right the way it's written now.

> +	}
> +}
> Index: latest/include/asm-ppc64/lmb.h
> ===================================================================
> --- latest.orig/include/asm-ppc64/lmb.h
> +++ latest/include/asm-ppc64/lmb.h
> @@ -53,6 +53,7 @@ extern unsigned long __init lmb_alloc_ba
>  extern unsigned long __init lmb_phys_mem_size(void);
>  extern unsigned long __init lmb_end_of_DRAM(void);
>  extern unsigned long __init lmb_abs_to_phys(unsigned long);
> +extern void __init lmb_apply_memory_limit(void);
>  
>  extern void lmb_dump_all(void);
>  
> Index: latest/arch/ppc64/kernel/iSeries_setup.c
> ===================================================================
> --- latest.orig/arch/ppc64/kernel/iSeries_setup.c
> +++ latest/arch/ppc64/kernel/iSeries_setup.c
> @@ -284,7 +284,7 @@ unsigned long iSeries_process_mainstore_
>  	return mem_blocks;
>  }
>  
> -static void __init iSeries_parse_cmdline(void)
> +static void __init iSeries_get_cmdline(void)
>  {
>  	char *p, *q;
>  
> @@ -304,6 +304,8 @@ static void __init iSeries_parse_cmdline
>  
>  /*static*/ void __init iSeries_init_early(void)
>  {
> +	extern unsigned long memory_limit;
> +
>  	DBG(" -> iSeries_init_early()\n");
>  
>  	ppcdbg_initialize();
> @@ -351,6 +353,29 @@ static void __init iSeries_parse_cmdline
>  	 */
>  	build_iSeries_Memory_Map();
>  
> +	iSeries_get_cmdline();
> +	
> +	/* Save unparsed command line copy for /proc/cmdline */
> +	strlcpy(saved_command_line, cmd_line, COMMAND_LINE_SIZE);
> +	
> +	/* Parse early parameters, in particular mem=x */
> +	parse_early_param();
> +	
> +	if (unlikely(memory_limit)) {
> +		if (memory_limit > systemcfg->physicalMemorySize)
> +			printk("Ignoring 'mem' option, value %lu is too large.\n", memory_limit);
> +		else
> +			systemcfg->physicalMemorySize = memory_limit;
> +	}
> +	
> +	/* Bolt kernel mappings for all of memory */
> +	iSeries_bolt_kernel(0, systemcfg->physicalMemorySize);

Do you want to bolt it all, even if there's a mem= limitation?

> +
> +	lmb_init();
> +	lmb_add(0, systemcfg->physicalMemorySize);
> +	lmb_analyze();	/* ?? */
> +	lmb_reserve(0, __pa(klimit));
> +	
>  	/* Initialize machine-dependency vectors */
>  #ifdef CONFIG_SMP
>  	smp_init_iSeries();
> @@ -376,9 +401,6 @@ static void __init iSeries_parse_cmdline
>  		initrd_start = initrd_end = 0;
>  #endif /* CONFIG_BLK_DEV_INITRD */
>  
> -
> -	iSeries_parse_cmdline();
> -
>  	DBG(" <- iSeries_init_early()\n");
>  }
>  
> @@ -539,14 +561,6 @@ static void __init build_iSeries_Memory_
>  	 *   nextPhysChunk
>  	 */
>  	systemcfg->physicalMemorySize = chunk_to_addr(nextPhysChunk);
> -
> -	/* Bolt kernel mappings for all of memory */
> -	iSeries_bolt_kernel(0, systemcfg->physicalMemorySize);
> -
> -	lmb_init();
> -	lmb_add(0, systemcfg->physicalMemorySize);
> -	lmb_analyze();	/* ?? */
> -	lmb_reserve(0, __pa(klimit));
>  }
>  
>  /*
> Index: latest/arch/ppc64/kernel/prom.c
> ===================================================================
> --- latest.orig/arch/ppc64/kernel/prom.c
> +++ latest/arch/ppc64/kernel/prom.c
> @@ -875,6 +875,8 @@ static int __init early_init_dt_scan_cho
>  					    const char *full_path, void *data)
>  {
>  	u32 *prop;
> +	u64 *prop64;
> +	extern unsigned long memory_limit, tce_alloc_start, tce_alloc_end;
>  
>  	if (strcmp(full_path, "/chosen") != 0)
>  		return 0;
> @@ -891,6 +893,18 @@ static int __init early_init_dt_scan_cho
>  	if (get_flat_dt_prop(node, "linux,iommu-force-on", NULL) != NULL)
>  		iommu_force_on = 1;
>  
> +	prop64 = (u64*)get_flat_dt_prop(node, "linux,memory-limit", NULL);
> +	if (prop64)
> +		memory_limit = *prop64;
> +		
> +	prop64 = (u64*)get_flat_dt_prop(node, "linux,tce-alloc-start", NULL);
> +	if (prop64)
> +		tce_alloc_start = *prop64;
> +		
> +	prop64 = (u64*)get_flat_dt_prop(node, "linux,tce-alloc-end", NULL);
> +	if (prop64)
> +		tce_alloc_end = *prop64;
> +
>  #ifdef CONFIG_PPC_PSERIES
>  	/* To help early debugging via the front panel, we retreive a minimal
>  	 * set of RTAS infos now if available
> @@ -1030,6 +1044,7 @@ void __init early_init_devtree(void *par
>  	lmb_init();
>  	scan_flat_dt(early_init_dt_scan_root, NULL);
>  	scan_flat_dt(early_init_dt_scan_memory, NULL);
> +	lmb_apply_memory_limit();
>  	lmb_analyze();
>  	systemcfg->physicalMemorySize = lmb_phys_mem_size();
>  	lmb_reserve(0, __pa(klimit));
> Index: latest/arch/ppc64/mm/hash_utils.c
> ===================================================================
> --- latest.orig/arch/ppc64/mm/hash_utils.c
> +++ latest/arch/ppc64/mm/hash_utils.c
> @@ -140,6 +140,8 @@ void __init htab_initialize(void)
>  	unsigned long pteg_count;
>  	unsigned long mode_rw;
>  	int i, use_largepages = 0;
> +	unsigned long base = 0, size = 0;
> +	extern unsigned long memory_limit, tce_alloc_start, tce_alloc_end;
>  
>  	DBG(" -> htab_initialize()\n");
>  
> @@ -195,8 +197,6 @@ void __init htab_initialize(void)
>  
>  	/* create bolted the linear mapping in the hash table */
>  	for (i=0; i < lmb.memory.cnt; i++) {
> -		unsigned long base, size;
> -
>  		base = lmb.memory.region[i].physbase + KERNELBASE;
>  		size = lmb.memory.region[i].size;
>  
> @@ -225,6 +225,21 @@ void __init htab_initialize(void)
>  #endif /* CONFIG_U3_DART */
>  		create_pte_mapping(base, base + size, mode_rw, use_largepages);
>  	}
> +
> +	/* If we have a memory_limit and we've allocated TCEs then we need to
> +	 * explicitly map the TCE area at the top of RAM. We also cope with the
> +	 * case that the TCEs start below memory_limit. */
> +	if (unlikely(memory_limit && tce_alloc_start && tce_alloc_end)) {

Same here, not really a hot path, you can take out the unlikely()

Do you need to check both tce_alloc_start and tce_alloc_end? Won't both
be set if one is?

> +		tce_alloc_start += KERNELBASE;
> +		tce_alloc_end += KERNELBASE;
> +		
> +		if (base + size >= tce_alloc_start)
> +			tce_alloc_start = base + size + 1;
> +		
> +		create_pte_mapping(tce_alloc_start, tce_alloc_end,
> +			mode_rw, use_largepages);

Could tce_alloc_end ever be below memory_limit too?

You might need to check tce_alloc_start and end to make sure you can use
16MB pages, or if you need 4K because of alignment/size constraints.

Even if you don't have to, a comment related to it could be warranted.

> +	}
> +	
>  	DBG(" <- htab_initialize()\n");
>  }
>  #undef KB
> Index: latest/arch/ppc64/mm/numa.c
> ===================================================================
> --- latest.orig/arch/ppc64/mm/numa.c
> +++ latest/arch/ppc64/mm/numa.c
> @@ -270,6 +270,7 @@ static int __init parse_numa_properties(
>  	int max_domain = 0;
>  	long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
>  	unsigned long i;
> +	extern unsigned long memory_limit;
>  
>  	if (numa_enabled == 0) {
>  		printk(KERN_WARNING "NUMA disabled by user\n");
> @@ -378,7 +379,7 @@ new_range:
>  				size / PAGE_SIZE;
>  		}
>  
> -		for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
> +		for (i = start; i < (start+size) && i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT)
>  			numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
>  				numa_domain;
>  
> @@ -387,8 +388,33 @@ new_range:
>  			goto new_range;
>  	}
>  
> -	for (i = 0; i <= max_domain; i++)
> -		node_set_online(i);
> +	if (unlikely(memory_limit)) {

#include <std-not-a-hot-path-comment> :-)

> +		unsigned long size, total = 0;
> +
> +		for (i = 0; i <= max_domain; i++) {
> +			size = init_node_data[i].node_spanned_pages * PAGE_SIZE;
> +			total += size;
> +
> +			if (total <= memory_limit)
> +				continue;
> +
> +			size = (memory_limit - (total - size)) / PAGE_SIZE;
> +			dbg("NUMA: truncating node %ld to %ld pages\n", i, size);
> +			init_node_data[i].node_spanned_pages = size;
> +			break;
> +		}
> +
> +		for (i++; i <= max_domain; i++) {

I think I would change this to a regular while (++i <= max_domain) loop instead.

> +			dbg("NUMA: offlining node %ld for memory_limit\n", i);
> +			node_set_offline(i);

Just because the node doesn't get memory allocated we can't set it
offline. I.e. the cpus will still be online.

> +			init_node_data[i].node_start_pfn = 0;
> +			init_node_data[i].node_spanned_pages = 0;
> +		}
> +	} else {
> +		/* FIXME do we need this? haven't we already done it in the else above? */
> +		for (i = 0; i <= max_domain; i++)
> +			node_set_online(i);

Good question, I think it can go. That code was added by Matt Dobson earlier
this year.

> +	}
>  
>  	return 0;
>  err:
> Index: latest/arch/ppc64/kernel/prom_init.c
> ===================================================================
> --- latest.orig/arch/ppc64/kernel/prom_init.c
> +++ latest/arch/ppc64/kernel/prom_init.c
> @@ -178,6 +178,9 @@ static int __initdata of_platform;
>  
>  static char __initdata prom_cmd_line[COMMAND_LINE_SIZE];
>  
> +static unsigned long __initdata memory_limit;
> +static unsigned long __initdata tce_alloc_start;
> +static unsigned long __initdata tce_alloc_end;

A little confusing to have the same variable names here as local
statics. Maybe rename them? Or use the global.

>  static unsigned long __initdata alloc_top;
>  static unsigned long __initdata alloc_top_high;
>  static unsigned long __initdata alloc_bottom;
> @@ -385,10 +388,64 @@ static int __init prom_setprop(phandle n
>  			 (u32)(unsigned long) value, (u32) valuelen);
>  }
>  
> +/* We can't use the standard versions because of RELOC headaches. */
> +#define isxdigit(c)	(('0' <= (c) && (c) <= '9') \
> +			 || ('a' <= (c) && (c) <= 'f') \
> +			 || ('A' <= (c) && (c) <= 'F'))
> +			 
> +#define isdigit(c)	('0' <= (c) && (c) <= '9')
> +#define islower(c)	('a' <= (c) && (c) <= 'z')
> +#define toupper(c)	(islower(c) ? ((c) - 'a' + 'A') : (c))

#define a tonum(c,base) or something like that, and use that below:

> +
> +unsigned long prom_strtoul(const char *cp, const char **endp)
> +{
> +	unsigned long result = 0, base = 10, value;
> +	
> +	if (*cp == '0') {
> +		base = 8;
> +		cp++;
> +		if (toupper(*cp) == 'X') {
> +			cp++;
> +			base = 16;
> +		}
> +	}
> +
> +	while (isxdigit(*cp) &&
> +	       (value = isdigit(*cp) ? *cp - '0' : toupper(*cp) - 'A' + 10) < base) {
> +		result = result * base + value;
> +		cp++;
> +	}

Hmm. Would you mind breaking that up? It'll be a few more lines but much
easier to read.

> +	
> +	if (endp)
> +		*endp = cp;
> +	
> +	return result;
> +}
> +
> +unsigned long prom_memparse(const char *ptr, const char **retptr)
> +{
> +	unsigned long ret = prom_strtoul(ptr, retptr);
> +
> +	switch (**retptr) {
> +	case 'G':
> +	case 'g':
> +		ret <<= 10;
> +	case 'M':
> +	case 'm':
> +		ret <<= 10;
> +	case 'K':
> +	case 'k':
> +		ret <<= 10;
> +		(*retptr)++;

Do other architectures swallow/tolerate a b/B after the unit? Could be
nice.

> +	default:
> +		break;
> +	}
> +	return ret;
> +}
>  
>  /*
>   * Early parsing of the command line passed to the kernel, used for
> - * the options that affect the iommu
> + * "mem=x" and the options that affect the iommu
>   */
>  static void __init early_cmdline_parse(void)
>  {
> @@ -419,6 +476,14 @@ static void __init early_cmdline_parse(v
>  		else if (!strncmp(opt, RELOC("force"), 5))
>  			RELOC(iommu_force_on) = 1;
>  	}
> +	
> +	opt = strstr(RELOC(prom_cmd_line), RELOC("mem="));
> +	if (opt) {
> +		opt += 4;
> +		RELOC(memory_limit) = prom_memparse(opt, (const char **)&opt);
> +		/* Align to 16 MB == size of large page */
> +		RELOC(memory_limit) = ALIGN(RELOC(memory_limit), 0x1000000);

Maybe a printk to say that it's been rounded up, so we don't surprise
the user? 

> +	}
>  }
>  
>  /*
> @@ -665,15 +730,7 @@ static void __init prom_init_mem(void)
>  		}
>  	}
>  
> -	/* Setup our top/bottom alloc points, that is top of RMO or top of
> -	 * segment 0 when running non-LPAR
> -	 */
> -	if ( RELOC(of_platform) == PLATFORM_PSERIES_LPAR )
> -		RELOC(alloc_top) = RELOC(rmo_top);
> -	else
> -		RELOC(alloc_top) = RELOC(rmo_top) = min(0x40000000ul, RELOC(ram_top));
>  	RELOC(alloc_bottom) = PAGE_ALIGN(RELOC(klimit) - offset + 0x4000);
> -	RELOC(alloc_top_high) = RELOC(ram_top);
>  
>  	/* Check if we have an initrd after the kernel, if we do move our bottom
>  	 * point to after it
> @@ -683,8 +740,37 @@ static void __init prom_init_mem(void)
>  		    > RELOC(alloc_bottom))
>  			RELOC(alloc_bottom) = PAGE_ALIGN(RELOC(prom_initrd_end));
>  	}
> +	
> +	/* If memory_limit is set we reduce the upper limits *except* for
> +	 * alloc_top_high. This must be the real top of RAM so we can put
> +	 * TCE's up there.  */
> +	
> +	RELOC(alloc_top_high) = RELOC(ram_top);
> +	
> +	if (unlikely(RELOC(memory_limit))) {
> +		if (RELOC(memory_limit) <= RELOC(alloc_bottom)) {
> +			prom_printf("Ignoring mem=%x <= alloc_bottom.\n",
> +				RELOC(memory_limit));
> +			RELOC(memory_limit) = 0;

...or should it just be bumped up to include alloc_bottom instead?

> +		} else if (RELOC(memory_limit) >= RELOC(ram_top)) {
> +			prom_printf("Ignoring mem=%x >= ram_top.\n",
> +				RELOC(memory_limit));
> +			RELOC(memory_limit) = 0;
> +		} else {
> +			RELOC(ram_top) = RELOC(memory_limit);
> +			RELOC(rmo_top) = min(RELOC(rmo_top), RELOC(memory_limit));
> +		}
> +	}
> +
> +	/* Setup our top alloc point, that is top of RMO or top of
> +	 * segment 0 when running non-LPAR. */
> +	if ( RELOC(of_platform) == PLATFORM_PSERIES_LPAR )
> +		RELOC(alloc_top) = RELOC(rmo_top);
> +	else
> +		RELOC(alloc_top) = RELOC(rmo_top) = min(0x40000000ul, RELOC(ram_top));
>  
>  	prom_printf("memory layout at init:\n");
> +	prom_printf("  memory_limit : %x\n", RELOC(memory_limit));
>  	prom_printf("  alloc_bottom : %x\n", RELOC(alloc_bottom));
>  	prom_printf("  alloc_top    : %x\n", RELOC(alloc_top));
>  	prom_printf("  alloc_top_hi : %x\n", RELOC(alloc_top_high));
> @@ -873,6 +959,11 @@ static void __init prom_initialize_tce_t
>  
>  	reserve_mem(local_alloc_bottom, local_alloc_top - local_alloc_bottom);
>  
> +	if (RELOC(memory_limit)) {
> +		RELOC(tce_alloc_start) = local_alloc_bottom;
> +		RELOC(tce_alloc_end) = local_alloc_top;
> +	}
> +	
>  	/* Flag the first invalid entry */
>  	prom_debug("ending prom_initialize_tce_table\n");
>  }
> @@ -1688,6 +1779,15 @@ unsigned long __init prom_init(unsigned 
>  		prom_setprop(_prom->chosen, "linux,iommu-off", NULL, 0);
>  	if (RELOC(iommu_force_on))
>  		prom_setprop(_prom->chosen, "linux,iommu-force-on", NULL, 0);
> +	if (RELOC(memory_limit))
> +		prom_setprop(_prom->chosen, "linux,memory-limit",
> +			PTRRELOC(&memory_limit), sizeof(RELOC(memory_limit)));
> +	if (RELOC(tce_alloc_start))
> +		prom_setprop(_prom->chosen, "linux,tce-alloc-start",
> +			PTRRELOC(&tce_alloc_start), sizeof(RELOC(tce_alloc_start)));
> +	if (RELOC(tce_alloc_end))
> +		prom_setprop(_prom->chosen, "linux,tce-alloc-end",
> +			PTRRELOC(&tce_alloc_end), sizeof(RELOC(tce_alloc_end)));
>  
>  	/*
>  	 * Now finally create the flattened device-tree
> _______________________________________________
> Linuxppc64-dev mailing list
> Linuxppc64-dev at ozlabs.org
> https://ozlabs.org/cgi-bin/mailman/listinfo/linuxppc64-dev



More information about the Linuxppc64-dev mailing list