[PATCH v3 3/3] powerpc: Book3S 64-bit "heavyweight" KASAN support

Daniel Axtens dja at axtens.net
Wed Dec 18 15:32:04 AEDT 2019


Daniel Axtens <dja at axtens.net> writes:

> Hi Christophe,
>
> I'm working through your feedback, thank you. Regarding this one:
>
>>> --- a/arch/powerpc/kernel/process.c
>>> +++ b/arch/powerpc/kernel/process.c
>>> @@ -2081,7 +2081,14 @@ void show_stack(struct task_struct *tsk, unsigned long *stack)
>>>   		/*
>>>   		 * See if this is an exception frame.
>>>   		 * We look for the "regshere" marker in the current frame.
>>> +		 *
>>> +		 * KASAN may complain about this. If it is an exception frame,
>>> +		 * we won't have unpoisoned the stack in asm when we set the
>>> +		 * exception marker. If it's not an exception frame, who knows
>>> +		 * how things are laid out - the shadow could be in any state
>>> +		 * at all. Just disable KASAN reporting for now.
>>>   		 */
>>> +		kasan_disable_current();
>>>   		if (validate_sp(sp, tsk, STACK_INT_FRAME_SIZE)
>>>   		    && stack[STACK_FRAME_MARKER] == STACK_FRAME_REGS_MARKER) {
>>>   			struct pt_regs *regs = (struct pt_regs *)
>>> @@ -2091,6 +2098,7 @@ void show_stack(struct task_struct *tsk, unsigned long *stack)
>>>   			       regs->trap, (void *)regs->nip, (void *)lr);
>>>   			firstframe = 1;
>>>   		}
>>> +		kasan_enable_current();
>>
>> If this is really a concern for all targets including PPC32, should it 
>> be a separate patch with a Fixes: tag to be applied back in stable as well ?
>
> I've managed to repro this by commenting out the kasan_disable/enable
> lines, and just booting in qemu without a disk attached:
>
> sudo qemu-system-ppc64 -accel kvm -m 2G -M pseries -cpu power9  -kernel ./vmlinux  -nographic -chardev stdio,id=charserial0,mux=on -device spapr-vty,chardev=charserial0,reg=0x30000000  -mon chardev=charserial0,mode=readline -nodefaults -smp 2 
>
> ...
>
> [    0.210740] Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(0,0)
> [    0.210789] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.5.0-rc1-next-20191213-16824-g469a24fbdb34 #12
> [    0.210844] Call Trace:
> [    0.210866] [c00000006a4839b0] [c000000001f74f48] dump_stack+0xfc/0x154 (unreliable)
> [    0.210915] [c00000006a483a00] [c00000000025411c] panic+0x258/0x59c
> [    0.210958] [c00000006a483aa0] [c0000000024870b0] mount_block_root+0x648/0x7ac
> [    0.211005] ==================================================================
> [    0.211054] BUG: KASAN: stack-out-of-bounds in show_stack+0x438/0x580
> [    0.211095] Read of size 8 at addr c00000006a483b00 by task swapper/0/1
> [    0.211134] 
> [    0.211152] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.5.0-rc1-next-20191213-16824-g469a24fbdb34 #12
> [    0.211207] Call Trace:
> [    0.211225] [c00000006a483680] [c000000001f74f48] dump_stack+0xfc/0x154 (unreliable)
> [    0.211274] [c00000006a4836d0] [c0000000008f877c] print_address_description.isra.10+0x7c/0x470
> [    0.211330] [c00000006a483760] [c0000000008f8e7c] __kasan_report+0x1bc/0x244
> [    0.211380] [c00000006a483830] [c0000000008f6eb8] kasan_report+0x18/0x30
> [    0.211422] [c00000006a483850] [c0000000008fa5d4] __asan_report_load8_noabort+0x24/0x40
> [    0.211471] [c00000006a483870] [c00000000003d448] show_stack+0x438/0x580
> [    0.211512] [c00000006a4839b0] [c000000001f74f48] dump_stack+0xfc/0x154
> [    0.211553] [c00000006a483a00] [c00000000025411c] panic+0x258/0x59c
> [    0.211595] [c00000006a483aa0] [c0000000024870b0] mount_block_root+0x648/0x7ac
> [    0.211644] [c00000006a483be0] [c000000002487784] prepare_namespace+0x1ec/0x240
> [    0.211694] [c00000006a483c60] [c00000000248669c] kernel_init_freeable+0x7f4/0x870
> [    0.211745] [c00000006a483da0] [c000000000011f30] kernel_init+0x3c/0x15c
> [    0.211787] [c00000006a483e20] [c00000000000bebc] ret_from_kernel_thread+0x5c/0x80
> [    0.211834] 
> [    0.211851] Allocated by task 0:
> [    0.211878]  save_stack+0x2c/0xe0
> [    0.211904]  __kasan_kmalloc.isra.16+0x11c/0x150
> [    0.211937]  kmem_cache_alloc_node+0x114/0x3b0
> [    0.211971]  copy_process+0x5b8/0x6410
> [    0.211996]  _do_fork+0x130/0xbf0
> [    0.212022]  kernel_thread+0xdc/0x130
> [    0.212047]  rest_init+0x44/0x184
> [    0.212072]  start_kernel+0x77c/0x7dc
> [    0.212098]  start_here_common+0x1c/0x20
> [    0.212122] 
> [    0.212139] Freed by task 0:
> [    0.212163] (stack is not available)
> [    0.212187] 
> [    0.212205] The buggy address belongs to the object at c00000006a480000
> [    0.212205]  which belongs to the cache thread_stack of size 16384
> [    0.212285] The buggy address is located 15104 bytes inside of
> [    0.212285]  16384-byte region [c00000006a480000, c00000006a484000)
> [    0.212356] The buggy address belongs to the page:
> [    0.212391] page:c00c0000001a9200 refcount:1 mapcount:0 mapping:c00000006a019e00 index:0x0 compound_mapcount: 0
> [    0.212455] raw: 007ffff000010200 5deadbeef0000100 5deadbeef0000122 c00000006a019e00
> [    0.212504] raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000
> [    0.212551] page dumped because: kasan: bad access detected
> [    0.212583] 
> [    0.212600] addr c00000006a483b00 is located in stack of task swapper/0/1 at offset 0 in frame:
> [    0.212656]  mount_block_root+0x0/0x7ac
> [    0.212681] 
> [    0.212698] this frame has 1 object:
> [    0.212722]  [32, 64) 'b'
> [    0.212723] 
> [    0.212755] Memory state around the buggy address:
> [    0.212788]  c00000006a483a00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> [    0.212836]  c00000006a483a80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> [    0.212884] >c00000006a483b00: f1 f1 f1 f1 00 00 00 00 f3 f3 f3 f3 00 00 00 00
> [    0.212931]                    ^
> [    0.212957]  c00000006a483b80: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> [    0.213005]  c00000006a483c00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> [    0.213052] ==================================================================
> [    0.213100] Disabling lock debugging due to kernel taint
> [    0.213134] [c00000006a483be0] [c000000002487784] prepare_namespace+0x1ec/0x240
> [    0.213182] [c00000006a483c60] [c00000000248669c] kernel_init_freeable+0x7f4/0x870
> [    0.213231] [c00000006a483da0] [c000000000011f30] kernel_init+0x3c/0x15c
> [    0.213272] [c00000006a483e20] [c00000000000bebc] ret_from_kernel_thread+0x5c/0x80
>
> Is that something that reproduces on ppc32?
>
> I don't see it running the test_kasan tests, so I guess that matches up
> with your experience.

I've debugged this a bit further. If I put a dump_stack() in
kernel_init() right before I call kernel_init_freeable(), I don't see
the splat. But if I put a dump_stack() immediately inside
kernel_init_freeable() I do see the splat. I wonder if some early init
code isn't setting up the stack quite right?

I don't see this when walking stacks that contain an interrupt frame, so I
think the correct thing is to tear out this code (the
kasan_disable_current()/kasan_enable_current() pair in show_stack()) and
debug the weird stack frame stuff around kernel_init_freeable in parallel.

Thanks for your attention to detail.

Regards,
Daniel

>
> Regards,
> Daniel
>
>
>
>>
>>>   
>>>   		sp = newsp;
>>>   	} while (count++ < kstack_depth_to_print);
>>> diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
>>> index 6620f37abe73..d994c7c39c8d 100644
>>> --- a/arch/powerpc/kernel/prom.c
>>> +++ b/arch/powerpc/kernel/prom.c
>>> @@ -72,6 +72,7 @@ unsigned long tce_alloc_start, tce_alloc_end;
>>>   u64 ppc64_rma_size;
>>>   #endif
>>>   static phys_addr_t first_memblock_size;
>>> +static phys_addr_t top_phys_addr;
>>>   static int __initdata boot_cpu_count;
>>>   
>>>   static int __init early_parse_mem(char *p)
>>> @@ -449,6 +450,26 @@ static bool validate_mem_limit(u64 base, u64 *size)
>>>   {
>>>   	u64 max_mem = 1UL << (MAX_PHYSMEM_BITS);
>>>   
>>> +	/*
>>> +	 * To handle the NUMA/discontiguous memory case, don't allow a block
>>> +	 * to be added if it falls completely beyond the configured physical
>>> +	 * memory. Print an informational message.
>>> +	 *
>>> +	 * Frustratingly we also see this with qemu - it seems to split the
>>> +	 * specified memory into a number of smaller blocks. If this happens
>>> +	 * under qemu, it probably represents misconfiguration. So we want
>>> +	 * the message to be noticeable, but not shouty.
>>> +	 *
>>> +	 * See Documentation/powerpc/kasan.txt
>>> +	 */
>>> +	if (IS_ENABLED(CONFIG_KASAN) &&
>>> +	    (base >= ((u64)CONFIG_PHYS_MEM_SIZE_FOR_KASAN << 20))) {
>>> +		pr_warn("KASAN: not adding memory block at %llx (size %llx)\n"
>>> +			"This could be due to discontiguous memory or kernel misconfiguration.",
>>> +			base, *size);
>>> +		return false;
>>> +	}
>>> +
>>>   	if (base >= max_mem)
>>>   		return false;
>>>   	if ((base + *size) > max_mem)
>>> @@ -572,8 +593,11 @@ void __init early_init_dt_add_memory_arch(u64 base, u64 size)
>>>   
>>>   	/* Add the chunk to the MEMBLOCK list */
>>>   	if (add_mem_to_memblock) {
>>> -		if (validate_mem_limit(base, &size))
>>> +		if (validate_mem_limit(base, &size)) {
>>>   			memblock_add(base, size);
>>> +			if (base + size > top_phys_addr)
>>> +				top_phys_addr = base + size;
>>> +		}
>>
>> Can we use max() here ? Something like
>>
>> top_phys_addr = max(base + size, top_phys_addr);
>>
>>>   	}
>>>   }
>>>   
>>> @@ -613,6 +637,8 @@ static void __init early_reserve_mem_dt(void)
>>>   static void __init early_reserve_mem(void)
>>>   {
>>>   	__be64 *reserve_map;
>>> +	phys_addr_t kasan_shadow_start;
>>> +	phys_addr_t kasan_memory_size;
>>>   
>>>   	reserve_map = (__be64 *)(((unsigned long)initial_boot_params) +
>>>   			fdt_off_mem_rsvmap(initial_boot_params));
>>> @@ -651,6 +677,42 @@ static void __init early_reserve_mem(void)
>>>   		return;
>>>   	}
>>>   #endif
>>> +
>>> +	if (IS_ENABLED(CONFIG_KASAN) && IS_ENABLED(CONFIG_PPC_BOOK3S_64)) {
>>> +		kasan_memory_size =
>>> +			((phys_addr_t)CONFIG_PHYS_MEM_SIZE_FOR_KASAN << 20);
>>> +
>>> +		if (top_phys_addr < kasan_memory_size) {
>>> +			/*
>>> +			 * We are doomed. We shouldn't even be able to get this
>>> +			 * far, but we do in qemu. If we continue and turn
>>> +			 * relocations on, we'll take fatal page faults for
>>> +			 * memory that's not physically present. Instead,
>>> +			 * panic() here: it will be saved to __log_buf even if
>>> +			 * it doesn't get printed to the console.
>>> +			 */
>>> +			panic("Tried to book a KASAN kernel configured for %u MB with only %llu MB! Aborting.",
>>
>> book ==> boot ?
>>
>>> +			      CONFIG_PHYS_MEM_SIZE_FOR_KASAN,
>>> +			      (u64)(top_phys_addr >> 20));
>>> +		} else if (top_phys_addr > kasan_memory_size) {
>>> +			/* print a biiiig warning in hopes people notice */
>>> +			pr_err("===========================================\n"
>>> +				"Physical memory exceeds compiled-in maximum!\n"
>>> +				"This kernel was compiled for KASAN with %u MB physical memory.\n"
>>> +				"The physical memory detected is at least %llu MB.\n"
>>> +				"Memory above the compiled limit will not be used!\n"
>>> +				"===========================================\n",
>>> +				CONFIG_PHYS_MEM_SIZE_FOR_KASAN,
>>> +				(u64)(top_phys_addr >> 20));
>>> +		}
>>> +
>>> +		kasan_shadow_start = _ALIGN_DOWN(kasan_memory_size * 7 / 8,
>>> +						 PAGE_SIZE);
>>
>> Can't this fit on a single line ? powerpc allows 90 chars.
>>
>>> +		DBG("reserving %llx -> %llx for KASAN",
>>> +		    kasan_shadow_start, top_phys_addr);
>>> +		memblock_reserve(kasan_shadow_start,
>>> +				 top_phys_addr - kasan_shadow_start);
>>
>> Same ?
>>
>>> +	}
>>>   }
>>>   
>>>   #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
>>> diff --git a/arch/powerpc/mm/kasan/Makefile b/arch/powerpc/mm/kasan/Makefile
>>> index 6577897673dd..f02b15c78e4d 100644
>>> --- a/arch/powerpc/mm/kasan/Makefile
>>> +++ b/arch/powerpc/mm/kasan/Makefile
>>> @@ -2,4 +2,5 @@
>>>   
>>>   KASAN_SANITIZE := n
>>>   
>>> -obj-$(CONFIG_PPC32)           += kasan_init_32.o
>>> +obj-$(CONFIG_PPC32)           += init_32.o
>>
>> Shouldn't we do ppc32 name change in another patch ?
>>
>>> +obj-$(CONFIG_PPC_BOOK3S_64)   += init_book3s_64.o
>>> diff --git a/arch/powerpc/mm/kasan/kasan_init_32.c b/arch/powerpc/mm/kasan/init_32.c
>>> similarity index 100%
>>> rename from arch/powerpc/mm/kasan/kasan_init_32.c
>>> rename to arch/powerpc/mm/kasan/init_32.c
>>> diff --git a/arch/powerpc/mm/kasan/init_book3s_64.c b/arch/powerpc/mm/kasan/init_book3s_64.c
>>> new file mode 100644
>>> index 000000000000..f961e96be136
>>> --- /dev/null
>>> +++ b/arch/powerpc/mm/kasan/init_book3s_64.c
>>> @@ -0,0 +1,72 @@
>>> +// SPDX-License-Identifier: GPL-2.0
>>> +/*
>>> + * KASAN for 64-bit Book3S powerpc
>>> + *
>>> + * Copyright (C) 2019 IBM Corporation
>>> + * Author: Daniel Axtens <dja at axtens.net>
>>> + */
>>> +
>>> +#define DISABLE_BRANCH_PROFILING
>>> +
>>> +#include <linux/kasan.h>
>>> +#include <linux/printk.h>
>>> +#include <linux/sched/task.h>
>>> +#include <asm/pgalloc.h>
>>> +
>>> +void __init kasan_init(void)
>>> +{
>>> +	int i;
>>> +	void *k_start = kasan_mem_to_shadow((void *)RADIX_KERN_VIRT_START);
>>> +	void *k_end = kasan_mem_to_shadow((void *)RADIX_VMEMMAP_END);
>>> +
>>> +	pte_t pte = __pte(__pa(kasan_early_shadow_page) |
>>> +			  pgprot_val(PAGE_KERNEL) | _PAGE_PTE);
>>
>> Can't we do something with existing helpers ? Something like:
>>
>> pte = pte_mkpte(pfn_pte(virt_to_pfn(kasan_early_shadow_page), PAGE_KERNEL));
>>
>>> +
>>> +	if (!early_radix_enabled())
>>> +		panic("KASAN requires radix!");
>>> +
>>> +	for (i = 0; i < PTRS_PER_PTE; i++)
>>> +		__set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page,
>>> +			     &kasan_early_shadow_pte[i], pte, 0);
>>> +
>>> +	for (i = 0; i < PTRS_PER_PMD; i++)
>>> +		pmd_populate_kernel(&init_mm, &kasan_early_shadow_pmd[i],
>>> +				    kasan_early_shadow_pte);
>>> +
>>> +	for (i = 0; i < PTRS_PER_PUD; i++)
>>> +		pud_populate(&init_mm, &kasan_early_shadow_pud[i],
>>> +			     kasan_early_shadow_pmd);
>>> +
>>> +	memset(kasan_mem_to_shadow((void *)PAGE_OFFSET), KASAN_SHADOW_INIT,
>>> +	       KASAN_SHADOW_SIZE);
>>> +
>>> +	kasan_populate_early_shadow(
>>> +		kasan_mem_to_shadow((void *)RADIX_KERN_VIRT_START),
>>> +		kasan_mem_to_shadow((void *)RADIX_VMALLOC_START));
>>> +
>>> +	/* leave a hole here for vmalloc */
>>> +
>>> +	kasan_populate_early_shadow(
>>> +		kasan_mem_to_shadow((void *)RADIX_VMALLOC_END),
>>> +		kasan_mem_to_shadow((void *)RADIX_VMEMMAP_END));
>>> +
>>> +	flush_tlb_kernel_range((unsigned long)k_start, (unsigned long)k_end);
>>> +
>>> +	/* mark early shadow region as RO and wipe */
>>> +	pte = __pte(__pa(kasan_early_shadow_page) |
>>> +		    pgprot_val(PAGE_KERNEL_RO) | _PAGE_PTE);
>>
>> Same comment as above, use helpers ?
>>
>>> +	for (i = 0; i < PTRS_PER_PTE; i++)
>>> +		__set_pte_at(&init_mm, (unsigned long)kasan_early_shadow_page,
>>> +			     &kasan_early_shadow_pte[i], pte, 0);
>>> +
>>> +	/*
>>> +	 * clear_page relies on some cache info that hasn't been set up yet.
>>> +	 * It ends up looping ~forever and blows up other data.
>>> +	 * Use memset instead.
>>> +	 */
>>> +	memset(kasan_early_shadow_page, 0, PAGE_SIZE);
>>> +
>>> +	/* Enable error messages */
>>> +	init_task.kasan_depth = 0;
>>> +	pr_info("KASAN init done (64-bit Book3S heavyweight mode)\n");
>>> +}
>>> 
>>
>> Christophe


More information about the Linuxppc-dev mailing list