[RFC/PATCH] ppc64: Add mem=X option
Olof Johansson
olof at austin.ibm.com
Wed Feb 23 14:40:03 EST 2005
On Tue, Feb 22, 2005 at 07:24:23PM +1100, Michael Ellerman wrote:
> Hi Anton, Ben, and the rest of ya,
>
> Here is my first take at adding support for the mem=X boot option.
> Please check it out.
Yes, finally! :-) This has been asked for many a time, but no one has
implemented it before. Good work! I have some comments below, most of
them nitpicks.
> -#if 0 /* XXX not currently used */
> +/* These three variables are used to save values passed to us by prom_init()
> + * via the device tree. The TCE variables are needed because with a memory_limit
> + * in force we may need to explicitly map the TCE area at the top of RAM. */
I think it's common kernel coding style to put the final */ on a line
of its own, so all the *'s line up. There are other comments in the
patch with the same style; I'll only point out this one.
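I.e. something like:

/* These three variables are used to save values passed to us by
 * prom_init() via the device tree. The TCE variables are needed because
 * with a memory_limit in force we may need to explicitly map the TCE
 * area at the top of RAM.
 */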
> +unsigned long tce_alloc_start;
> +unsigned long tce_alloc_end;
>
> +#ifdef CONFIG_PPC_ISERIES
> +/* On iSeries we just parse the mem=X option from the command line.
> + * On pSeries it's a bit more complicated, see prom_init_mem() */
> static int __init early_parsemem(char *p)
> {
> if (!p)
> @@ -818,7 +824,7 @@ static int __init early_parsemem(char *p
> return 0;
> }
> early_param("mem", early_parsemem);
> -#endif
> +#endif /* CONFIG_PPC_ISERIES */
>
> #ifdef CONFIG_PPC_MULTIPLATFORM
> static int __init set_preferred_console(void)
> Index: latest/arch/ppc64/kernel/lmb.c
> ===================================================================
> --- latest.orig/arch/ppc64/kernel/lmb.c
> +++ latest/arch/ppc64/kernel/lmb.c
> @@ -344,3 +344,34 @@ lmb_abs_to_phys(unsigned long aa)
>
> return pa;
> }
> +
> +/* Truncate the lmb list to memory_limit if it's set
> + * You must call lmb_analyze() after this. */
> +void __init lmb_apply_memory_limit(void)
> +{
> + extern unsigned long memory_limit;
> + unsigned long i, total = 0, crop;
> + struct lmb_region *mem = &(lmb.memory);
> +
> + if (likely(!memory_limit))
> + return;
No need to worry about likely/unlikely here. It's not a hot call path.
> +
> + for (i = 0; i < mem->cnt; i++) {
> + total += mem->region[i].size;
> +
> + if (total <= memory_limit)
> + continue;
> +
> + crop = (memory_limit - (total - mem->region[i].size));
> +#ifdef DEBUG
> + udbg_printf("lmb_truncate(): truncating at region %x\n", i);
> + udbg_printf("lmb_truncate(): total = %x\n", total);
> + udbg_printf("lmb_truncate(): size = %x\n", mem->region[i].size);
> + udbg_printf("lmb_truncate(): crop = %x\n", crop);
> +#endif
> +
> + mem->region[i].size = crop;
> + mem->cnt = i + 1;
> + break;
With the above tests, there's a chance that the last LMB ends up with
size 0. I don't think it breaks anything, but you might as well skip
that one too. Changing the test to (total < memory_limit) should take
care of it; the last region will just be cropped by nothing.

Also, if you count down from memory_limit instead of keeping a rolling
total, the crop calculation gets simpler. It took a bit of thinking to
convince myself it's right the way it's written now.
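Untested, but something along these lines is what I had in mind
(keeping your early return for !memory_limit):

	unsigned long limit = memory_limit;

	for (i = 0; i < mem->cnt; i++) {
		if (mem->region[i].size < limit) {
			/* whole region fits below the limit */
			limit -= mem->region[i].size;
			continue;
		}

		/* crop this region, drop the rest of the list */
		mem->region[i].size = limit;
		mem->cnt = i + 1;
		break;
	}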
> + }
> +}
> Index: latest/include/asm-ppc64/lmb.h
> ===================================================================
> --- latest.orig/include/asm-ppc64/lmb.h
> +++ latest/include/asm-ppc64/lmb.h
> @@ -53,6 +53,7 @@ extern unsigned long __init lmb_alloc_ba
> extern unsigned long __init lmb_phys_mem_size(void);
> extern unsigned long __init lmb_end_of_DRAM(void);
> extern unsigned long __init lmb_abs_to_phys(unsigned long);
> +extern void __init lmb_apply_memory_limit(void);
>
> extern void lmb_dump_all(void);
>
> Index: latest/arch/ppc64/kernel/iSeries_setup.c
> ===================================================================
> --- latest.orig/arch/ppc64/kernel/iSeries_setup.c
> +++ latest/arch/ppc64/kernel/iSeries_setup.c
> @@ -284,7 +284,7 @@ unsigned long iSeries_process_mainstore_
> return mem_blocks;
> }
>
> -static void __init iSeries_parse_cmdline(void)
> +static void __init iSeries_get_cmdline(void)
> {
> char *p, *q;
>
> @@ -304,6 +304,8 @@ static void __init iSeries_parse_cmdline
>
> /*static*/ void __init iSeries_init_early(void)
> {
> + extern unsigned long memory_limit;
> +
> DBG(" -> iSeries_init_early()\n");
>
> ppcdbg_initialize();
> @@ -351,6 +353,29 @@ static void __init iSeries_parse_cmdline
> */
> build_iSeries_Memory_Map();
>
> + iSeries_get_cmdline();
> +
> + /* Save unparsed command line copy for /proc/cmdline */
> + strlcpy(saved_command_line, cmd_line, COMMAND_LINE_SIZE);
> +
> + /* Parse early parameters, in particular mem=x */
> + parse_early_param();
> +
> + if (unlikely(memory_limit)) {
> + if (memory_limit > systemcfg->physicalMemorySize)
> + printk("Ignoring 'mem' option, value %lu is too large.\n", memory_limit);
> + else
> + systemcfg->physicalMemorySize = memory_limit;
> + }
> +
> + /* Bolt kernel mappings for all of memory */
> + iSeries_bolt_kernel(0, systemcfg->physicalMemorySize);
Do you want to bolt it all, even if there's a mem= limitation?
> +
> + lmb_init();
> + lmb_add(0, systemcfg->physicalMemorySize);
> + lmb_analyze(); /* ?? */
> + lmb_reserve(0, __pa(klimit));
> +
> /* Initialize machine-dependency vectors */
> #ifdef CONFIG_SMP
> smp_init_iSeries();
> @@ -376,9 +401,6 @@ static void __init iSeries_parse_cmdline
> initrd_start = initrd_end = 0;
> #endif /* CONFIG_BLK_DEV_INITRD */
>
> -
> - iSeries_parse_cmdline();
> -
> DBG(" <- iSeries_init_early()\n");
> }
>
> @@ -539,14 +561,6 @@ static void __init build_iSeries_Memory_
> * nextPhysChunk
> */
> systemcfg->physicalMemorySize = chunk_to_addr(nextPhysChunk);
> -
> - /* Bolt kernel mappings for all of memory */
> - iSeries_bolt_kernel(0, systemcfg->physicalMemorySize);
> -
> - lmb_init();
> - lmb_add(0, systemcfg->physicalMemorySize);
> - lmb_analyze(); /* ?? */
> - lmb_reserve(0, __pa(klimit));
> }
>
> /*
> Index: latest/arch/ppc64/kernel/prom.c
> ===================================================================
> --- latest.orig/arch/ppc64/kernel/prom.c
> +++ latest/arch/ppc64/kernel/prom.c
> @@ -875,6 +875,8 @@ static int __init early_init_dt_scan_cho
> const char *full_path, void *data)
> {
> u32 *prop;
> + u64 *prop64;
> + extern unsigned long memory_limit, tce_alloc_start, tce_alloc_end;
>
> if (strcmp(full_path, "/chosen") != 0)
> return 0;
> @@ -891,6 +893,18 @@ static int __init early_init_dt_scan_cho
> if (get_flat_dt_prop(node, "linux,iommu-force-on", NULL) != NULL)
> iommu_force_on = 1;
>
> + prop64 = (u64*)get_flat_dt_prop(node, "linux,memory-limit", NULL);
> + if (prop64)
> + memory_limit = *prop64;
> +
> + prop64 = (u64*)get_flat_dt_prop(node, "linux,tce-alloc-start", NULL);
> + if (prop64)
> + tce_alloc_start = *prop64;
> +
> + prop64 = (u64*)get_flat_dt_prop(node, "linux,tce-alloc-end", NULL);
> + if (prop64)
> + tce_alloc_end = *prop64;
> +
> #ifdef CONFIG_PPC_PSERIES
> /* To help early debugging via the front panel, we retreive a minimal
> * set of RTAS infos now if available
> @@ -1030,6 +1044,7 @@ void __init early_init_devtree(void *par
> lmb_init();
> scan_flat_dt(early_init_dt_scan_root, NULL);
> scan_flat_dt(early_init_dt_scan_memory, NULL);
> + lmb_apply_memory_limit();
> lmb_analyze();
> systemcfg->physicalMemorySize = lmb_phys_mem_size();
> lmb_reserve(0, __pa(klimit));
> Index: latest/arch/ppc64/mm/hash_utils.c
> ===================================================================
> --- latest.orig/arch/ppc64/mm/hash_utils.c
> +++ latest/arch/ppc64/mm/hash_utils.c
> @@ -140,6 +140,8 @@ void __init htab_initialize(void)
> unsigned long pteg_count;
> unsigned long mode_rw;
> int i, use_largepages = 0;
> + unsigned long base = 0, size = 0;
> + extern unsigned long memory_limit, tce_alloc_start, tce_alloc_end;
>
> DBG(" -> htab_initialize()\n");
>
> @@ -195,8 +197,6 @@ void __init htab_initialize(void)
>
> /* create bolted the linear mapping in the hash table */
> for (i=0; i < lmb.memory.cnt; i++) {
> - unsigned long base, size;
> -
> base = lmb.memory.region[i].physbase + KERNELBASE;
> size = lmb.memory.region[i].size;
>
> @@ -225,6 +225,21 @@ void __init htab_initialize(void)
> #endif /* CONFIG_U3_DART */
> create_pte_mapping(base, base + size, mode_rw, use_largepages);
> }
> +
> + /* If we have a memory_limit and we've allocated TCEs then we need to
> + * explicitly map the TCE area at the top of RAM. We also cope with the
> + * case that the TCEs start below memory_limit. */
> + if (unlikely(memory_limit && tce_alloc_start && tce_alloc_end)) {
Same here; not really a hot path, so you can take out the unlikely().
Do you need to check both tce_alloc_start and tce_alloc_end? Won't both
be set if one is?
> + tce_alloc_start += KERNELBASE;
> + tce_alloc_end += KERNELBASE;
> +
> + if (base + size >= tce_alloc_start)
> + tce_alloc_start = base + size + 1;
> +
> + create_pte_mapping(tce_alloc_start, tce_alloc_end,
> + mode_rw, use_largepages);
Could tce_alloc_end ever be below memory_limit too?

You might need to check tce_alloc_start and tce_alloc_end to make sure
you can use 16MB pages, or whether you need 4K pages because of
alignment/size constraints. Even if you don't have to, a comment about
it would be warranted.
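Off the top of my head, something like this; the 16MB alignment test is
my guess at the constraint, so please double-check it:

	/* Fall back to 4K pages unless the TCE area is 16MB aligned */
	if ((tce_alloc_start | tce_alloc_end) & (0x1000000 - 1))
		create_pte_mapping(tce_alloc_start, tce_alloc_end,
				   mode_rw, 0);
	else
		create_pte_mapping(tce_alloc_start, tce_alloc_end,
				   mode_rw, use_largepages);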
> + }
> +
> DBG(" <- htab_initialize()\n");
> }
> #undef KB
> Index: latest/arch/ppc64/mm/numa.c
> ===================================================================
> --- latest.orig/arch/ppc64/mm/numa.c
> +++ latest/arch/ppc64/mm/numa.c
> @@ -270,6 +270,7 @@ static int __init parse_numa_properties(
> int max_domain = 0;
> long entries = lmb_end_of_DRAM() >> MEMORY_INCREMENT_SHIFT;
> unsigned long i;
> + extern unsigned long memory_limit;
>
> if (numa_enabled == 0) {
> printk(KERN_WARNING "NUMA disabled by user\n");
> @@ -378,7 +379,7 @@ new_range:
> size / PAGE_SIZE;
> }
>
> - for (i = start ; i < (start+size); i += MEMORY_INCREMENT)
> + for (i = start; i < (start+size) && i < lmb_end_of_DRAM(); i += MEMORY_INCREMENT)
> numa_memory_lookup_table[i >> MEMORY_INCREMENT_SHIFT] =
> numa_domain;
>
> @@ -387,8 +388,33 @@ new_range:
> goto new_range;
> }
>
> - for (i = 0; i <= max_domain; i++)
> - node_set_online(i);
> + if (unlikely(memory_limit)) {
#include <std-not-a-hot-path-comment> :-)
> + unsigned long size, total = 0;
> +
> + for (i = 0; i <= max_domain; i++) {
> + size = init_node_data[i].node_spanned_pages * PAGE_SIZE;
> + total += size;
> +
> + if (total <= memory_limit)
> + continue;
> +
> + size = (memory_limit - (total - size)) / PAGE_SIZE;
> + dbg("NUMA: truncating node %ld to %ld pages\n", i, size);
> + init_node_data[i].node_spanned_pages = size;
> + break;
> + }
> +
> + for (i++; i <= max_domain; i++) {
I think I would change this to a regular while (++i <= max_domain) loop instead.
> + dbg("NUMA: offlining node %ld for memory_limit\n", i);
> + node_set_offline(i);
Just because a node doesn't get any memory allocated doesn't mean we
can set it offline. I.e. the cpus on it will still be online.
> + init_node_data[i].node_start_pfn = 0;
> + init_node_data[i].node_spanned_pages = 0;
> + }
> + } else {
> + /* FIXME do we need this? haven't we already done it in the else above? */
> + for (i = 0; i <= max_domain; i++)
> + node_set_online(i);
Good question, I think it can go. That code was added by Matt Dobson
earlier this year.
> + }
>
> return 0;
> err:
> Index: latest/arch/ppc64/kernel/prom_init.c
> ===================================================================
> --- latest.orig/arch/ppc64/kernel/prom_init.c
> +++ latest/arch/ppc64/kernel/prom_init.c
> @@ -178,6 +178,9 @@ static int __initdata of_platform;
>
> static char __initdata prom_cmd_line[COMMAND_LINE_SIZE];
>
> +static unsigned long __initdata memory_limit;
> +static unsigned long __initdata tce_alloc_start;
> +static unsigned long __initdata tce_alloc_end;
It's a little confusing to have local statics with the same names as
the globals. Maybe rename them? Or just use the globals.
> static unsigned long __initdata alloc_top;
> static unsigned long __initdata alloc_top_high;
> static unsigned long __initdata alloc_bottom;
> @@ -385,10 +388,64 @@ static int __init prom_setprop(phandle n
> (u32)(unsigned long) value, (u32) valuelen);
> }
>
> +/* We can't use the standard versions because of RELOC headaches. */
> +#define isxdigit(c) (('0' <= (c) && (c) <= '9') \
> + || ('a' <= (c) && (c) <= 'f') \
> + || ('A' <= (c) && (c) <= 'F'))
> +
> +#define isdigit(c) ('0' <= (c) && (c) <= '9')
> +#define islower(c) ('a' <= (c) && (c) <= 'z')
> +#define toupper(c) (islower(c) ? ((c) - 'a' + 'A') : (c))
Maybe #define a tonum(c) helper or something like that, and use it below:
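E.g. (untested):

	#define tonum(c)  (isdigit(c) ? (c) - '0' : toupper(c) - 'A' + 10)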
> +
> +unsigned long prom_strtoul(const char *cp, const char **endp)
> +{
> + unsigned long result = 0, base = 10, value;
> +
> + if (*cp == '0') {
> + base = 8;
> + cp++;
> + if (toupper(*cp) == 'X') {
> + cp++;
> + base = 16;
> + }
> + }
> +
> + while (isxdigit(*cp) &&
> + (value = isdigit(*cp) ? *cp - '0' : toupper(*cp) - 'A' + 10) < base) {
> + result = result * base + value;
> + cp++;
> + }
Hmm. Would you mind breaking that up? It'll be a few more lines but much
easier to read.
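With a tonum() helper as above, something like:

	while (isxdigit(*cp)) {
		value = tonum(*cp);
		if (value >= base)
			break;
		result = result * base + value;
		cp++;
	}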
> +
> + if (endp)
> + *endp = cp;
> +
> + return result;
> +}
> +
> +unsigned long prom_memparse(const char *ptr, const char **retptr)
> +{
> + unsigned long ret = prom_strtoul(ptr, retptr);
> +
> + switch (**retptr) {
> + case 'G':
> + case 'g':
> + ret <<= 10;
> + case 'M':
> + case 'm':
> + ret <<= 10;
> + case 'K':
> + case 'k':
> + ret <<= 10;
> + (*retptr)++;
Do other architectures swallow/tolerate a b/B after the unit? Could be
nice.
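A couple of lines after the existing (*retptr)++ should do it, untested:

	/* swallow an optional 'b'/'B' after the unit, e.g. "128MB" */
	if (**retptr == 'b' || **retptr == 'B')
		(*retptr)++;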
> + default:
> + break;
> + }
> + return ret;
> +}
>
> /*
> * Early parsing of the command line passed to the kernel, used for
> - * the options that affect the iommu
> + * "mem=x" and the options that affect the iommu
> */
> static void __init early_cmdline_parse(void)
> {
> @@ -419,6 +476,14 @@ static void __init early_cmdline_parse(v
> else if (!strncmp(opt, RELOC("force"), 5))
> RELOC(iommu_force_on) = 1;
> }
> +
> + opt = strstr(RELOC(prom_cmd_line), RELOC("mem="));
> + if (opt) {
> + opt += 4;
> + RELOC(memory_limit) = prom_memparse(opt, (const char **)&opt);
> + /* Align to 16 MB == size of large page */
> + RELOC(memory_limit) = ALIGN(RELOC(memory_limit), 0x1000000);
Maybe a printk to say that it's been rounded up, so we don't surprise
the user?
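Something like this, with prom_printf rather than printk of course:

	unsigned long mem = prom_memparse(opt, (const char **)&opt);

	/* Align to 16 MB == size of large page */
	RELOC(memory_limit) = ALIGN(mem, 0x1000000);

	if (RELOC(memory_limit) != mem)
		prom_printf("mem=%x rounded up to %x for 16MB alignment\n",
			    mem, RELOC(memory_limit));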
> + }
> }
>
> /*
> @@ -665,15 +730,7 @@ static void __init prom_init_mem(void)
> }
> }
>
> - /* Setup our top/bottom alloc points, that is top of RMO or top of
> - * segment 0 when running non-LPAR
> - */
> - if ( RELOC(of_platform) == PLATFORM_PSERIES_LPAR )
> - RELOC(alloc_top) = RELOC(rmo_top);
> - else
> - RELOC(alloc_top) = RELOC(rmo_top) = min(0x40000000ul, RELOC(ram_top));
> RELOC(alloc_bottom) = PAGE_ALIGN(RELOC(klimit) - offset + 0x4000);
> - RELOC(alloc_top_high) = RELOC(ram_top);
>
> /* Check if we have an initrd after the kernel, if we do move our bottom
> * point to after it
> @@ -683,8 +740,37 @@ static void __init prom_init_mem(void)
> > RELOC(alloc_bottom))
> RELOC(alloc_bottom) = PAGE_ALIGN(RELOC(prom_initrd_end));
> }
> +
> + /* If memory_limit is set we reduce the upper limits *except* for
> + * alloc_top_high. This must be the real top of RAM so we can put
> + * TCE's up there. */
> +
> + RELOC(alloc_top_high) = RELOC(ram_top);
> +
> + if (unlikely(RELOC(memory_limit))) {
> + if (RELOC(memory_limit) <= RELOC(alloc_bottom)) {
> + prom_printf("Ignoring mem=%x <= alloc_bottom.\n",
> + RELOC(memory_limit));
> + RELOC(memory_limit) = 0;
...or should it just be bumped up to include alloc_bottom instead?
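I.e. instead of zeroing it, something like (just a thought, not sure
it's what you want):

	if (RELOC(memory_limit) <= RELOC(alloc_bottom)) {
		prom_printf("Bumping mem=%x up to alloc_bottom\n",
			    RELOC(memory_limit));
		RELOC(memory_limit) = RELOC(alloc_bottom);
	}

with the ram_top check kept as-is after it.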
> + } else if (RELOC(memory_limit) >= RELOC(ram_top)) {
> + prom_printf("Ignoring mem=%x >= ram_top.\n",
> + RELOC(memory_limit));
> + RELOC(memory_limit) = 0;
> + } else {
> + RELOC(ram_top) = RELOC(memory_limit);
> + RELOC(rmo_top) = min(RELOC(rmo_top), RELOC(memory_limit));
> + }
> + }
> +
> + /* Setup our top alloc point, that is top of RMO or top of
> + * segment 0 when running non-LPAR. */
> + if ( RELOC(of_platform) == PLATFORM_PSERIES_LPAR )
> + RELOC(alloc_top) = RELOC(rmo_top);
> + else
> + RELOC(alloc_top) = RELOC(rmo_top) = min(0x40000000ul, RELOC(ram_top));
>
> prom_printf("memory layout at init:\n");
> + prom_printf(" memory_limit : %x\n", RELOC(memory_limit));
> prom_printf(" alloc_bottom : %x\n", RELOC(alloc_bottom));
> prom_printf(" alloc_top : %x\n", RELOC(alloc_top));
> prom_printf(" alloc_top_hi : %x\n", RELOC(alloc_top_high));
> @@ -873,6 +959,11 @@ static void __init prom_initialize_tce_t
>
> reserve_mem(local_alloc_bottom, local_alloc_top - local_alloc_bottom);
>
> + if (RELOC(memory_limit)) {
> + RELOC(tce_alloc_start) = local_alloc_bottom;
> + RELOC(tce_alloc_end) = local_alloc_top;
> + }
> +
> /* Flag the first invalid entry */
> prom_debug("ending prom_initialize_tce_table\n");
> }
> @@ -1688,6 +1779,15 @@ unsigned long __init prom_init(unsigned
> prom_setprop(_prom->chosen, "linux,iommu-off", NULL, 0);
> if (RELOC(iommu_force_on))
> prom_setprop(_prom->chosen, "linux,iommu-force-on", NULL, 0);
> + if (RELOC(memory_limit))
> + prom_setprop(_prom->chosen, "linux,memory-limit",
> + PTRRELOC(&memory_limit), sizeof(RELOC(memory_limit)));
> + if (RELOC(tce_alloc_start))
> + prom_setprop(_prom->chosen, "linux,tce-alloc-start",
> + PTRRELOC(&tce_alloc_start), sizeof(RELOC(tce_alloc_start)));
> + if (RELOC(tce_alloc_end))
> + prom_setprop(_prom->chosen, "linux,tce-alloc-end",
> + PTRRELOC(&tce_alloc_end), sizeof(RELOC(tce_alloc_end)));
>
> /*
> * Now finally create the flattened device-tree