[PATCH] powerpc/pagetable: Add option to dump kernel pagetable

Rashmica rashmicy at gmail.com
Tue Feb 23 09:27:58 AEDT 2016


Hi Anshuman,

Thanks for the feedback!

On 22/02/16 21:13, Anshuman Khandual wrote:
> On 02/22/2016 11:32 AM, Rashmica Gupta wrote:
>> Useful to be able to dump the kernel page tables to check permissions and
>> memory types - derived from arm64's implementation.
>>
>> Add a debugfs file to check the page tables. To use this the PPC_PTDUMP
>> config option must be selected.
>>
>> Tested on 64BE and 64LE with both 4K and 64K page sizes.
>> ---
> This statement above must be after the ---- line else it will be part of
> the commit message or you wanted the test note as part of commit message
> itself ?
>
> The patch seems to contain some white space problems. Please clean them up.
Will do!
>>   arch/powerpc/Kconfig.debug |  12 ++
>>   arch/powerpc/mm/Makefile   |   1 +
>>   arch/powerpc/mm/dump.c     | 364 +++++++++++++++++++++++++++++++++++++++++++++
>>   3 files changed, 377 insertions(+)
>>   create mode 100644 arch/powerpc/mm/dump.c
>>
>> diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug
>> index 638f9ce740f5..e4883880abe3 100644
>> --- a/arch/powerpc/Kconfig.debug
>> +++ b/arch/powerpc/Kconfig.debug
>> @@ -344,4 +344,16 @@ config FAIL_IOMMU
>>   
>>   	  If you are unsure, say N.
>>   
>> +config PPC_PTDUMP
>> +        bool "Export kernel pagetable layout to userspace via debugfs"
>> +        depends on DEBUG_KERNEL
>> +        select DEBUG_FS
>> +        help
>> +          This options dumps the state of the kernel pagetables in a debugfs
>> +          file. This is only useful for kernel developers who are working in
>> +          architecture specific areas of the kernel - probably not a good idea to
>> +          enable this feature in a production kernel.
> Just clean the paragraph alignment here ......................................^^^^^^^^
>
>> +
>> +          If you are unsure, say N.
>> +
>>   endmenu
>> diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile
>> index 1ffeda85c086..16f84bdd7597 100644
>> --- a/arch/powerpc/mm/Makefile
>> +++ b/arch/powerpc/mm/Makefile
>> @@ -40,3 +40,4 @@ obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
>>   obj-$(CONFIG_HIGHMEM)		+= highmem.o
>>   obj-$(CONFIG_PPC_COPRO_BASE)	+= copro_fault.o
>>   obj-$(CONFIG_SPAPR_TCE_IOMMU)	+= mmu_context_iommu.o
>> +obj-$(CONFIG_PPC_PTDUMP)	+= dump.o
> File name like "[kernel_]pgtable_dump.c" will sound better ? Or
> just use like the X86 one "dump_pagetables.c". "dump.c" sounds
> very generic.
Yup, good point.
>> diff --git a/arch/powerpc/mm/dump.c b/arch/powerpc/mm/dump.c
>> new file mode 100644
>> index 000000000000..937b10fc40cc
>> --- /dev/null
>> +++ b/arch/powerpc/mm/dump.c
>> @@ -0,0 +1,364 @@
>> +/*
>> + * Copyright 2016, Rashmica Gupta, IBM Corp.
>> + *
>> + * Debug helper to dump the current kernel pagetables of the system
>> + * so that we can see what the various memory ranges are set to.
>> + *
>> + * Derived from the arm64 implementation:
>> + * Copyright (c) 2014, The Linux Foundation, Laura Abbott.
>> + * (C) Copyright 2008 Intel Corporation, Arjan van de Ven.
>> + *
>> + * This program is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU General Public License
>> + * as published by the Free Software Foundation; version 2
>> + * of the License.
>> + */
>> +#include <linux/debugfs.h>
>> +#include <linux/fs.h>
>> +#include <linux/io.h>
>> +#include <linux/mm.h>
>> +#include <linux/sched.h>
>> +#include <linux/seq_file.h>
>> +#include <asm/fixmap.h>
>> +#include <asm/pgtable.h>
>> +#include <linux/const.h>
>> +#include <asm/page.h>
>> +
>> +#define PUD_TYPE_MASK           (_AT(u64, 3) << 0)
>> +#define PUD_TYPE_SECT           (_AT(u64, 1) << 0)
>> +#define PMD_TYPE_MASK           (_AT(u64, 3) << 0)
>> +#define PMD_TYPE_SECT           (_AT(u64, 1) << 0)
>> +
>> +
>> +#if CONFIG_PGTABLE_LEVELS == 2
>> +#include <asm-generic/pgtable-nopmd.h>
>> +#elif CONFIG_PGTABLE_LEVELS == 3
>> +#include <asm-generic/pgtable-nopud.h>
>> +#endif
> Really ? Do we have any platform with only 2 level of page table ?
>   
I'm not sure - was trying to cover all the bases. If you're
confident that we don't, I can remove it.
>> +
>> +#define pmd_sect(pmd)  ((pmd_val(pmd) & PMD_TYPE_MASK) == PMD_TYPE_SECT)
>> +#ifdef CONFIG_PPC_64K_PAGES
>> +#define pud_sect(pud)           (0)
>> +#else
>> +#define pud_sect(pud)           ((pud_val(pud) & PUD_TYPE_MASK) == \
>> +                                               PUD_TYPE_SECT)
>> +#endif
> Can you please explain the use of pmd_sect() and pud_sect() defines ?
>
>> +	
>> +
>> +struct addr_marker {
>> +	unsigned long start_address;
>> +	const char *name;
>> +};
> All the architectures are using the same structure addr_marker.
> Cannot we just move it to a generic header file ? There are
> other such common structures like these in the file which are
> used across architectures and can be moved to some where common ?
Could do that. Where do you think would be the appropriate place
for such a header file?
>> +
>> +enum address_markers_idx {
>> +	VMALLOC_START_NR = 0,
>> +	VMALLOC_END_NR,
>> +	ISA_IO_START_NR,
>> +	ISA_IO_END_NR,
>> +	PHB_IO_START_NR,
>> +	PHB_IO_END_NR,
>> +	IOREMAP_START_NR,
>> +	IOREMP_END_NR,
>> +};
> Where these are used ? ^^^^^^^^^ I dont see any where.
Whoops, yes those are not used anymore.
>
> Also as it dumps only the kernel virtual mapping, should not we
> mention about it some where.
See my question below...
>> +
>> +static struct addr_marker address_markers[] = {
>> +	{ VMALLOC_START,	"vmalloc() Area" },
>> +	{ VMALLOC_END,		"vmalloc() End" },
>> +	{ ISA_IO_BASE,		"isa I/O start" },
>> +	{ ISA_IO_END,		"isa I/O end" },
>> +	{ PHB_IO_BASE,		"phb I/O start" },
>> +	{ PHB_IO_END,		"phb I/O end" },
>> +	{ IOREMAP_BASE,		"I/O remap start" },
>> +	{ IOREMAP_END,		"I/O remap end" },
>> +	{ -1,			NULL },
>> +};
>
> I understand that VMEMMAP range is not covered under the kernel virtual
> mapping page table. But we can hook it up looking into the linked list
> we track for the VA-PA mappings in there. Just a suggestion.
I'm not sure that I understand, can you please explain why we
would want to do that?
>
>> +
>> +/*
>> + * The page dumper groups page table entries of the same type into a single
>> + * description. It uses pg_state to track the range information while
>> + * iterating over the pte entries. When the continuity is broken it then
>> + * dumps out a description of the range.
>> + */
> This needs to be more detailed. Some where at the starting point of the
> file we need to write detailed description about what all information it
> is dumping and how.
Will do!
>
>> +struct pg_state {
>> +	struct seq_file *seq;
>> +	const struct addr_marker *marker;
>> +	unsigned long start_address;
>> +	unsigned level;
>> +	u64 current_prot;
>> +};
>> +
>> +struct prot_bits {
>> +	u64		mask;
>> +	u64		val;
>> +	const char	*set;
>> +	const char	*clear;
>> +};
> This structure encapsulates various PTE flags bit information.
> Then why it is named as "prot_bits" ?
Yup, a lazy oversight on my part.
>> +
>> +static const struct prot_bits pte_bits[] = {
>> +	{
>> +		.mask	= _PAGE_USER,
>> +		.val	= _PAGE_USER,
>> +		.set	= "user",
>> +		.clear	= "    ",
>> +	}, {
>> +		.mask	= _PAGE_RW,
>> +		.val	= _PAGE_RW,
>> +		.set	= "rw",
>> +		.clear	= "ro",
>> +	}, {
>> +		.mask	= _PAGE_EXEC,
>> +		.val	= _PAGE_EXEC,
>> +		.set	= " X ",
>> +		.clear	= "   ",
>> +	}, {
>> +		.mask	= _PAGE_PTE,
>> +		.val	= _PAGE_PTE,
>> +		.set	= "pte",
>> +		.clear	= "   ",
>> +	}, {
>> +		.mask	= _PAGE_PRESENT,
>> +		.val	= _PAGE_PRESENT,
>> +		.set	= "present",
>> +		.clear	= "       ",
>> +	}, {
>> +		.mask	= _PAGE_HASHPTE,
>> +		.val	= _PAGE_HASHPTE,
>> +		.set	= "htpe",
>> +		.clear	= "    ",
>> +	}, {
>> +		.mask	= _PAGE_GUARDED,
>> +		.val	= _PAGE_GUARDED,
>> +		.set	= "guarded",
>> +		.clear	= "       ",
>> +	}, {
>> +		.mask	= _PAGE_DIRTY,
>> +		.val	= _PAGE_DIRTY,
>> +		.set	= "dirty",
>> +		.clear	= "     ",
>> +	}, {
>> +		.mask	= _PAGE_ACCESSED,
>> +		.val	= _PAGE_ACCESSED,
>> +		.set	= "accessed",
>> +		.clear	= "        ",
>> +	}, {
>> +		.mask	= _PAGE_WRITETHRU,
>> +		.val	= _PAGE_WRITETHRU,
>> +		.set	= "write through",
>> +		.clear	= "             ",
>> +	}, {
>> +		.mask	= _PAGE_NO_CACHE,
>> +		.val	= _PAGE_NO_CACHE,
>> +		.set	= "no cache",
>> +		.clear	= "        ",
>> +	}, {
>> +		.mask	= _PAGE_BUSY,
>> +		.val	= _PAGE_BUSY,
>> +		.set	= "busy",
>> +	}, {
>> +		.mask	= _PAGE_F_GIX,
>> +		.val	= _PAGE_F_GIX,
>> +		.set	= "gix",
>> +	}, {
>> +		.mask	= _PAGE_F_SECOND,
>> +		.val	= _PAGE_F_SECOND,
>> +		.set	= "second",
>> +	}, {
>> +		.mask	= _PAGE_SPECIAL,
>> +		.val	= _PAGE_SPECIAL,
>> +		.set	= "special",
>> +	}
>> +};
>> +
>> +struct pg_level {
>> +	const struct prot_bits *bits;
>> +	size_t num;
>> +	u64 mask;
>> +};
>> +
> It describes each level of the page table and what all kind of
> PTE flags can be there at each of these levels and their combined
> mask. The structure and it's elements must be named accordingly.
> Its confusing right now.
>
>> +static struct pg_level pg_level[] = {
>> +	{
>> +	}, { /* pgd */
>> +		.bits	= pte_bits,
>> +		.num	= ARRAY_SIZE(pte_bits),
>> +	}, { /* pud */
>> +		.bits	= pte_bits,
>> +		.num	= ARRAY_SIZE(pte_bits),
>> +	}, { /* pmd */
>> +		.bits	= pte_bits,
>> +		.num	= ARRAY_SIZE(pte_bits),
>> +	}, { /* pte */
>> +		.bits	= pte_bits,
>> +		.num	= ARRAY_SIZE(pte_bits),
>> +	},
>> +};
>> +
>> +static void dump_prot(struct pg_state *st, const struct prot_bits *bits,
>> +			size_t num)
>> +{
>> +	unsigned i;
>> +
>> +	for (i = 0; i < num; i++, bits++) {
>> +		const char *s;
>> +
>> +		if ((st->current_prot & bits->mask) == bits->val)
>> +			s = bits->set;
>> +		else
>> +			s = bits->clear;
>> +
>> +		if (s)
>> +			seq_printf(st->seq, " %s", s);
>> +	}
>> +}
>> +
>> +static void note_page(struct pg_state *st, unsigned long addr, unsigned level,
>> +				u64 val)
>> +{
>> +	static const char units[] = "KMGTPE";
>> +	u64 prot = val & pg_level[level].mask;
>> +
>> +	/* At first no level is set */
>> +	if (!st->level) {
>> +		st->level = level;
>> +		st->current_prot = prot;
>> +		st->start_address = addr;
>> +		seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
>> +	/* We are only interested in dumping when something (protection,
>> +	 *  level of PTE or the section of vmalloc) has changed */
> Again, these are PTE flags not protection bits. Please change these
> comments accordingly. Also this function does a lot of things, can
> we split it up ?
Do you think that it would make it easier to understand by
splitting it up?
>> +	} else if (prot != st->current_prot || level != st->level ||
>> +		   addr >= st->marker[1].start_address) {
>> +		const char *unit = units;
>> +		unsigned long delta;
>> +
>> +		/* Check protection on PTE */
>> +		if (st->current_prot) {
>> +			seq_printf(st->seq, "0x%016lx-0x%016lx   ",
>> +				   st->start_address, addr-1);
>> +
>> +			delta = (addr - st->start_address) >> 10;
>> +			/* Work out what appropriate unit to use */
>> +			while (!(delta & 1023) && unit[1]) {
>> +				delta >>= 10;
>> +				unit++;
>> +			}
>> +			seq_printf(st->seq, "%9lu%c", delta, *unit);
>> +			/* Dump all the protection flags */
>> +			if (pg_level[st->level].bits)
>> +				dump_prot(st, pg_level[st->level].bits,
>> +					  pg_level[st->level].num);
>> +			seq_puts(st->seq, "\n");
>> +		}
>> +		/* Address indicates we have passed the end of the
>> +		 * current section of vmalloc */
>> +		while (addr >= st->marker[1].start_address) {
>> +			st->marker++;
>> +			seq_printf(st->seq, "---[ %s ]---\n", st->marker->name);
>> +		}
> Right now with this patch, it does not print anything after [ vmalloc() End ].
> But I would expect it to go over all other sections one by one. The above
> while loop just goes over the list and prints the remaining address range
> names. Why ?
>
> 0xd00007fffff00000-0xd00007fffff0ffff          64K      rw     pte present htpe         dirty accessed
> 0xd00007fffff10000-0xd00007fffff3ffff         192K      rw     pte present              dirty accessed
> 0xd00007fffff40000-0xd00007fffff4ffff          64K      rw     pte present htpe         dirty accessed
> 0xd00007fffff50000-0xd00007fffff7ffff         192K      rw     pte present              dirty accessed
> 0xd00007fffff80000-0xd00007fffff8ffff          64K      rw     pte present htpe         dirty accessed
> 0xd00007fffff90000-0xd00007fffffbffff         192K      rw     pte present              dirty accessed
> 0xd00007fffffc0000-0xd00007fffffcffff          64K      rw     pte present htpe         dirty accessed
> 0xd00007fffffd0000-0xd00007ffffffffff         192K      rw     pte present              dirty accessed
> ---[ vmalloc() End ]---
> ---[ isa I/O start ]---
> ---[ isa I/O end ]---
> ---[ phb I/O start ]---
> ---[ phb I/O end ]---
> ---[ I/O remap start ]---
> ---[ I/O remap end ]---
>
What setup are you using? My output looked something similar to this for 
all BE and LE
with both 4K and 64K pages:
0xd00007fffff60000-0xd00007fffff7ffff         128K      rw     pte 
present              dirty
accessed
0xd00007fffff80000-0xd00007fffff9ffff         128K      rw     pte 
present htpe         dirty
accessed
0xd00007fffffa0000-0xd00007fffffbffff         128K      rw     pte 
present              dirty
accessed
0xd00007fffffc0000-0xd00007fffffdffff         128K      rw     pte 
present htpe         dirty
accessed
0xd00007fffffe0000-0xd00007ffffffffff         128K      rw     pte 
present              dirty
accessed
---[ vmalloc() End ]---
---[ isa I/O start ]---
---[ isa I/O end ]---
---[ phb I/O start ]---
0xd000080000010000-0xd00008000001ffff          64K      rw     pte 
present htpe guarded
dirty accessed               no cache
---[ phb I/O end ]---
---[ I/O remap start ]---
0xd000080080020000-0xd00008008002ffff          64K      rw     pte 
present htpe guarded
dirty accessed               no cache
  0xd000080080040000-0xd00008008004ffff          64K      rw     pte 
present htpe guarded
  dirty accessed               no cache
0xd000080080060000-0xd00008008006ffff          64K      rw     pte 
present htpe guarded
dirty accessed               no cache
---[ I/O remap end ]---

>> +
>> +		st->start_address = addr;
>> +		st->current_prot = prot;
>> +		st->level = level;
>> +	}
>> +
>> +}
>> +
>> +static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start)
>> +{
>> +	pte_t *pte = pte_offset_kernel(pmd, 0);
>> +	unsigned long addr;
>> +	unsigned i;
>> +
>> +	for (i = 0; i < PTRS_PER_PTE; i++, pte++) {
>> +		addr = start + i * PAGE_SIZE;
>> +		note_page(st, addr, 4, pte_val(*pte));
>> +	}
>> +}
>> +
>> +static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start)
>> +{
>> +	pmd_t *pmd = pmd_offset(pud, 0);
>> +	unsigned long addr;
>> +	unsigned i;
>> +
>> +	for (i = 0; i < PTRS_PER_PMD; i++, pmd++) {
>> +		addr = start + i * PMD_SIZE;
>> +		if (!pmd_none(*pmd) && !pmd_sect(*pmd))
>> +			/* pmd exists */
>> +			walk_pte(st, pmd, addr);
>> +		else
>> +			note_page(st, addr, 3, pmd_val(*pmd));
>> +	}
>> +}
>> +
>> +static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start)
>> +{
>> +	pud_t *pud = pud_offset(pgd, 0);
>> +	unsigned long addr = 0UL;
>> +	unsigned i;
>> +
>> +	for (i = 0; i < PTRS_PER_PUD; i++, pud++) {
>> +		addr = start + i * PUD_SIZE;
>> +		if (!pud_none(*pud) && !pud_sect(*pud))
>> +			/* pud exists */
>> +			walk_pmd(st, pud, addr);
>> +		else
>> +			note_page(st, addr, 2, pud_val(*pud));
>> +	}
>> +}
>> +
>> +static void walk_pgd(struct pg_state *st, unsigned long start)
>> +{
>> +	pgd_t *pgd = pgd_offset_k( 0UL);
>> +	unsigned i;
>> +	unsigned long addr;
>> +
>> +	for (i = 0; i < PTRS_PER_PGD; i++, pgd++) {
>> +		addr = start + i * PGDIR_SIZE;
>> +		if(!pgd_none(*pgd))
>> +			/* pgd exists */
>> +			walk_pud(st, pgd, addr);
>> +		else
>> +			note_page(st, addr, 1, pgd_val(*pgd));
>> +
>> +	}
>> +}
>> +
>> +static int ptdump_show(struct seq_file *m, void *v)
>> +{
>> +	struct pg_state st = {
>> +		.seq = m,
>> +		.start_address = KERN_VIRT_START,
>> +		.marker = address_markers,
>> +	};
>> +	/* Traverse kernel page tables */
>> +	walk_pgd(&st, KERN_VIRT_START);
>> +	note_page(&st, 0, 0, 0);
>> +	return 0;
>> +}
>> +
>> +static int ptdump_open(struct inode *inode, struct file *file)
>> +{
>> +	return single_open(file, ptdump_show, NULL);
>> +}
>> +
>> +static const struct file_operations ptdump_fops = {
>> +	.open		= ptdump_open,
>> +	.read		= seq_read,
>> +	.llseek		= seq_lseek,
>> +	.release	= single_release,
>> +};
>> +
>> +static int ptdump_init(void)
>> +{
>> +	struct dentry *pe;
> 'pe' as a pointer ? Have a better name like 'pgtable' or simple
> pointer like name 'ptr' will be better.
>
Agreed.
>> +	unsigned i, j;
>> +
>> +	for (i = 0; i < ARRAY_SIZE(pg_level); i++)
>> +		if (pg_level[i].bits)
>> +			for (j = 0; j < pg_level[i].num; j++)
>> +				pg_level[i].mask |= pg_level[i].bits[j].mask;
> This is iterating over two data structures at a time. Cant we put
> this into a separate function like build_pagetable_complete_mask() ?
>



More information about the Linuxppc-dev mailing list