[PATCH] Fix boot freeze on machine with empty memory node
Jon Tollefson
kniht at us.ibm.com
Fri Dec 5 15:10:49 EST 2008
Dave Hansen wrote:
> I got a bug report about a distro kernel not booting on a particular
> machine. It would freeze during boot:
>
>
>> ...
>> Could not find start_pfn for node 1
>> [boot]0015 Setup Done
>> Built 2 zonelists in Node order, mobility grouping on. Total pages: 123783
>> Policy zone: DMA
>> Kernel command line:
>> [boot]0020 XICS Init
>> [boot]0021 XICS Done
>> PID hash table entries: 4096 (order: 12, 32768 bytes)
>> clocksource: timebase mult[7d0000] shift[22] registered
>> Console: colour dummy device 80x25
>> console handover: boot [udbg0] -> real [hvc0]
>> Dentry cache hash table entries: 1048576 (order: 7, 8388608 bytes)
>> Inode-cache hash table entries: 524288 (order: 6, 4194304 bytes)
>> freeing bootmem node 0
>>
>
> I've reproduced this on 2.6.27.7. I'm pretty sure it is caused by this
> patch:
>
> http://git.kernel.org/gitweb.cgi?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=8f64e1f2d1e09267ac926e15090fd505c1c0cbcb
>
> The problem is that Jon took a loop which was (in psuedocode):
>
> for_each_node(nid)
> NODE_DATA(nid) = careful_alloc(nid);
> setup_bootmem(nid);
> reserve_node_bootmem(nid);
>
> and broke it up into:
>
> for_each_node(nid)
> NODE_DATA(nid) = careful_alloc(nid);
> setup_bootmem(nid);
> for_each_node(nid)
> reserve_node_bootmem(nid);
>
> The issue comes in when the 'careful_alloc()' is called on a node with
> no memory. It falls back to using bootmem from a previously-initialized
> node. But, bootmem has not yet been reserved when Jon's patch is
> applied. It gives back bogus memory (0xc000000000000000) and pukes
> later in boot.
>
> The following patch collapses the loop back together. It also breaks
> the mark_reserved_regions_for_nid() code out into a function and adds
> some comments. I think a huge part of introducing this bug is because
> for loop was too long and hard to read.
>
> The actual bug fix here is the:
>
> + if (end_pfn <= node->node_start_pfn ||
> + start_pfn >= node_end_pfn)
> + continue;
>
> Signed-off-by: Dave Hansen <dave at linux.vnet.ibm.com>
>
> diff -ru linux-2.6.27.7.orig/arch/powerpc//mm/numa.c linux-2.6.27.7/arch/powerpc//mm/numa.c
> --- linux-2.6.27.7.orig/arch/powerpc//mm/numa.c 2008-11-20 17:02:37.000000000 -0600
> +++ linux-2.6.27.7/arch/powerpc//mm/numa.c 2008-11-24 15:53:35.000000000 -0600
> @@ -822,6 +822,67 @@
> .priority = 1 /* Must run before sched domains notifier. */
> };
>
> +static void mark_reserved_regions_for_nid(int nid)
> +{
> + struct pglist_data *node = NODE_DATA(nid);
> + int i;
> +
> + for (i = 0; i < lmb.reserved.cnt; i++) {
> + unsigned long physbase = lmb.reserved.region[i].base;
> + unsigned long size = lmb.reserved.region[i].size;
> + unsigned long start_pfn = physbase >> PAGE_SHIFT;
> + unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT);
> + struct node_active_region node_ar;
> + unsigned long node_end_pfn = node->node_start_pfn +
> + node->node_spanned_pages;
> +
> + /*
> + * Check to make sure that this lmb.reserved area is
> + * within the bounds of the node that we care about.
> + * Checking the nid of the start and end points is not
> + * sufficient because the reserved area could span the
> + * entire node.
> + */
> + if (end_pfn <= node->node_start_pfn ||
> + start_pfn >= node_end_pfn)
> + continue;
> +
> + get_node_active_region(start_pfn, &node_ar);
> + while (start_pfn < end_pfn &&
> + node_ar.start_pfn < node_ar.end_pfn) {
> + unsigned long reserve_size = size;
> + /*
> + * if reserved region extends past active region
> + * then trim size to active region
> + */
> + if (end_pfn > node_ar.end_pfn)
> + reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
> + - (start_pfn << PAGE_SHIFT);
> + dbg("reserve_bootmem %lx %lx nid=%d\n", physbase,
> + reserve_size, node_ar.nid);
> + reserve_bootmem_node(NODE_DATA(node_ar.nid), physbase,
> + reserve_size, BOOTMEM_DEFAULT);
> + /*
> + * if reserved region is contained in the active region
> + * then done.
> + */
> + if (end_pfn <= node_ar.end_pfn)
> + break;
> +
> + /*
> + * reserved region extends past the active region
> + * get next active region that contains this
> + * reserved region
> + */
> + start_pfn = node_ar.end_pfn;
> + physbase = start_pfn << PAGE_SHIFT;
> + size = size - reserve_size;
> + get_node_active_region(start_pfn, &node_ar);
> + }
> + }
> +}
> +
> +
> void __init do_init_bootmem(void)
> {
> int nid;
> @@ -847,7 +908,13 @@
>
> get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
>
> - /* Allocate the node structure node local if possible */
> + /*
> + * Allocate the node structure node local if possible
> + *
> + * Be careful moving this around, as it relies on all
> + * previous nodes' bootmem to be initialized and have
> + * all reserved areas marked.
> + */
> NODE_DATA(nid) = careful_allocation(nid,
> sizeof(struct pglist_data),
> SMP_CACHE_BYTES, end_pfn);
> @@ -879,53 +946,14 @@
> start_pfn, end_pfn);
>
> free_bootmem_with_active_regions(nid, end_pfn);
> - }
> -
> - /* Mark reserved regions */
> - for (i = 0; i < lmb.reserved.cnt; i++) {
> - unsigned long physbase = lmb.reserved.region[i].base;
> - unsigned long size = lmb.reserved.region[i].size;
> - unsigned long start_pfn = physbase >> PAGE_SHIFT;
> - unsigned long end_pfn = ((physbase + size) >> PAGE_SHIFT);
> - struct node_active_region node_ar;
> -
> - get_node_active_region(start_pfn, &node_ar);
> - while (start_pfn < end_pfn &&
> - node_ar.start_pfn < node_ar.end_pfn) {
> - unsigned long reserve_size = size;
> - /*
> - * if reserved region extends past active region
> - * then trim size to active region
> - */
> - if (end_pfn > node_ar.end_pfn)
> - reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
> - - (start_pfn << PAGE_SHIFT);
> - dbg("reserve_bootmem %lx %lx nid=%d\n", physbase,
> - reserve_size, node_ar.nid);
> - reserve_bootmem_node(NODE_DATA(node_ar.nid), physbase,
> - reserve_size, BOOTMEM_DEFAULT);
> - /*
> - * if reserved region is contained in the active region
> - * then done.
> - */
> - if (end_pfn <= node_ar.end_pfn)
> - break;
> -
> - /*
> - * reserved region extends past the active region
> - * get next active region that contains this
> - * reserved region
> - */
> - start_pfn = node_ar.end_pfn;
> - physbase = start_pfn << PAGE_SHIFT;
> - size = size - reserve_size;
> - get_node_active_region(start_pfn, &node_ar);
> - }
> -
> - }
> -
> - for_each_online_node(nid)
> + /*
> + * Be very careful about moving this around. Future
> + * calls to careful_allocation() depend on this getting
> + * done correctly.
> + */
> + mark_reserved_regions_for_nid(nid);
> sparse_memory_present_with_active_regions(nid);
> + }
> }
>
> void __init paging_init(void)
>
>
> -- Dave
>
>
Sorry I was out for a while. I just tried testing the patch on a
machine with 3 gigantic pages and it crashes.
This first log snippet is from the machine WITHOUT the patch.
...
NUMA associativity depth for CPU/Memory: 3
Node 0 Memory: 0x0-0x288000000 0x500000000-0x1000000000
Node 1 Memory: 0x288000000-0x500000000 0x1000000000-0x1400000000
adding cpu 0 to node 0
node 0
NODE_DATA() = c0000004ffff1480
start_paddr = 0
end_paddr = 1000000000
bootmap_paddr = 4fffd0000
node 1
NODE_DATA() = c0000004fffc8980
start_paddr = 288000000
end_paddr = 1400000000
bootmap_paddr = 4fff90000
reserve_bootmem 0 c30000 nid=0
reserve_bootmem 2f00000 b86400 nid=0
reserve_bootmem 2f00000 b90000 nid=0
reserve_bootmem 3b90000 30000 nid=0
reserve_bootmem f530000 ad0000 nid=0
reserve_bootmem 4fff90000 30000 nid=1
reserve_bootmem 4fffc8980 27680 nid=1
reserve_bootmem 4ffff8b58 74a8 nid=1
reserve_bootmem 800000000 800000000 nid=0
reserve_bootmem 1000000000 400000000 nid=1
EEH: No capable adapters found
PPC64 nvram contains 15360 bytes
Using shared processor idle loop
Zone PFN ranges:
DMA 0x00000000 -> 0x00140000
Normal 0x00140000 -> 0x00140000
Movable zone start PFN for each node
early_node_map[4] active PFN ranges
0: 0x00000000 -> 0x00028800
1: 0x00028800 -> 0x00050000
0: 0x00080000 -> 0x00100000
1: 0x00100000 -> 0x00140000
On node 0 totalpages: 690176
DMA zone: 689152 pages, LIFO batch:1
On node 1 totalpages: 423936
DMA zone: 422818 pages, LIFO batch:1
[boot]0015 Setup Done
--------------------------------------
This is a log snippet when the patch has been applied.
--------------------------------------
NUMA associativity depth for CPU/Memory: 3
Node 0 Memory: 0x0-0x288000000 0x500000000-0x1000000000
Node 1 Memory: 0x288000000-0x500000000 0x1000000000-0x1400000000
adding cpu 0 to node 0
node 0
NODE_DATA() = c0000004ffff1480
start_paddr = 0
end_paddr = 1000000000
bootmap_paddr = 4fffd0000
reserve_bootmem 0 c30000 nid=0
reserve_bootmem 2f00000 b86800 nid=0
reserve_bootmem 2f00000 b90000 nid=0
reserve_bootmem 3b90000 30000 nid=0
reserve_bootmem f530000 ad0000 nid=0
reserve_bootmem 4fffd0000 20000 nid=1
Unable to handle kernel paging request for data at address 0x00007618
Faulting instruction address: 0xc0000000007279fc
Oops: Kernel access of bad area, sig: 11 [#1]
SMP NR_CPUS=1024 NUMA pSeries
Modules linked in:
Supported: Yes
NIP: c0000000007279fc LR: c000000000711b3c CTR: 8000000000f7cdec
REGS: c000000000963aa0 TRAP: 0300 Not tainted (2.6.27.7-4-ppc64)
MSR: 8000000000001032 <ME,IR,DR> CR: 24022082 XER: 00000001
DAR: 0000000000007618, DSISR: 0000000040000000
TASK = c0000000008b7fa0[0] 'swapper' THREAD: c000000000960000 CPU: 0
GPR00: 0000000000000008 c000000000963d20 c0000000009615c8 0000000000000000
GPR04: 00000004fffd0000 0000000000020000 0000000000000000 ffffffffffffffff
GPR08: 0000000000000000 00000004fffe0000 0000000000d3a20b 000000000000068c
GPR12: 0000000024022082 c000000000a22c80 c000000000743378 c000000000666106
GPR16: c000000000963d98 c000000000aa91d0 c000000000963d90 c000000000963d98
GPR20: c000000000963da0 c000000000c025d0 c0000004ffff1480 c000000000000000
GPR24: 0000000000000005 0000000000020000 c000000000c02e60 000000000004ffff
GPR28: 0000000000020000 000000000004fffd 0000000000000000 00000004fffd0000
NIP [c0000000007279fc] .reserve_bootmem_node+0x4/0x24
LR [c000000000711b3c] .do_init_bootmem+0xc10/0xd4c
Call Trace:
[c000000000963d20] [c000000000711b14] .do_init_bootmem+0xbe8/0xd4c
(unreliable)
[c000000000963e40] [c000000000706294] .setup_arch+0x1a4/0x21c
[c000000000963ee0] [c000000000700888] .start_kernel+0xdc/0x554
[c000000000963f90] [c000000000008568] .start_here_common+0x3c/0x54
Instruction dump:
38a00001 79248402 4bfffe84 3d230001 7c841a14 3929ffff 38a00000 78848402
79238402 38c00000 4bfffe64 3d240001 <e8637618> 7cc73378 3929ffff 78848402
---[ end trace 31fd0ba7d8756001 ]---
Kernel panic - not syncing: Attempted to kill the idle task!
------------[ cut here ]------------
Badness at kernel/smp.c:331
NIP: c0000000000b9734 LR: c0000000000b9a30 CTR: 8000000000f7cdec
REGS: c0000000009631a0 TRAP: 0700 Tainted: G D (2.6.27.7-4-ppc64)
MSR: 8000000000021032 <ME,IR,DR> CR: 28022084 XER: 00000001
TASK = c0000000008b7fa0[0] 'swapper' THREAD: c000000000960000 CPU: 0
GPR00: 0000000000000001 c000000000963420 c0000000009615c8 0000000000000001
GPR04: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR08: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
GPR12: 0000000028022082 c000000000a22c80 c000000000743378 c000000000666106
GPR16: c000000000963d98 c000000000aa91d0 c000000000963d90 c000000000963d98
GPR20: c000000000963da0 c000000000c025d0 c0000004ffff1480 c000000000000000
GPR24: 0000000000000000 c0000000008ff0d0 0000000000000000 0000000000000000
GPR28: 0000000000000000 0000000000000000 c0000000008e8138 000000000000000b
NIP [c0000000000b9734] .smp_call_function_mask+0x68/0x2c4
LR [c0000000000b9a30] .smp_call_function+0xa0/0xcc
Call Trace:
[c000000000963420] [c0000000004d6718] ._spin_unlock_irqrestore+0x3c/0x50
(unreliable)
[c000000000963600] [c0000000000b9a30] .smp_call_function+0xa0/0xcc
[c000000000963710] [c00000000002d3a0] .smp_send_stop+0x24/0x3c
[c000000000963790] [c0000000004e1284] .panic+0xa8/0x1b8
[c000000000963820] [c0000000000903ac] .do_exit+0xa0/0x980
[c000000000963910] [c000000000027ffc] .die+0x274/0x278
[c0000000009639b0] [c000000000030e98] .bad_page_fault+0xb8/0xd4
[c000000000963a30] [c000000000005698] handle_page_fault+0x3c/0x5c
--- Exception: 300 at .reserve_bootmem_node+0x4/0x24
LR = .do_init_bootmem+0xc10/0xd4c
[c000000000963d20] [c000000000711b14] .do_init_bootmem+0xbe8/0xd4c
(unreliable)
[c000000000963e40] [c000000000706294] .setup_arch+0x1a4/0x21c
[c000000000963ee0] [c000000000700888] .start_kernel+0xdc/0x554
[c000000000963f90] [c000000000008568] .start_here_common+0x3c/0x54
Instruction dump:
eb010298 eb6102a6 f8810218 f8a10220 f8c10228 f8e10230 f9010238 f9210240
f9410248 880d01da 7c000074 7800d182 <0b000000> 3be10210 a3ad000a e8be8028
More information about the Linuxppc-dev
mailing list