Four level pagetables, now with hugepages

David Gibson david at gibson.dropbear.id.au
Mon May 16 14:43:53 EST 2005


Ok, here's a new, revised version of the four-level pagetable patch,
with hugepages working again.  However, it now needs to be applied on
top of the hugepage consolidation patch which is in the -mm tree.  I
think this is now close to ready for merging.

This patch implements full four-level page tables for ppc64.  It uses
a full page for the tables at the bottom and top levels, and a quarter
page for the intermediate levels.  This gives a total usable address
space of 44 bits (16TB).  The patch also tweaks the VSID allocation to
give user addresses a matching range (thereby halving the number of
available contexts) and adds some #if and BUILD_BUG_ON sanity checks.
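
(For reference, here's how the 44 bits fall out of the index sizes used
below.  This is just a standalone sanity-check sketch, not part of the
patch, and it assumes the 4K base page size, i.e. PAGE_SHIFT == 12, that
ppc64 uses here.)

#include <stdio.h>

/* Index sizes as defined in the patch; 4K base pages assumed */
#define PAGE_SHIFT	12
#define PTE_INDEX_SIZE	9
#define PMD_INDEX_SIZE	7
#define PUD_INDEX_SIZE	7
#define PGD_INDEX_SIZE	9

int main(void)
{
	int bits = PAGE_SHIFT + PTE_INDEX_SIZE + PMD_INDEX_SIZE
		   + PUD_INDEX_SIZE + PGD_INDEX_SIZE;

	/* 12 + 9 + 7 + 7 + 9 = 44 bits of mappable effective address */
	printf("pagetable range: %d bits (%lluTB)\n",
	       bits, (1ULL << bits) >> 40);
	return 0;
}

The quarter-page intermediate levels come from 2^7 entries of 8 bytes
each, i.e. 1K of a 4K page.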

Index: working-2.6/include/asm-ppc64/pgtable.h
===================================================================
--- working-2.6.orig/include/asm-ppc64/pgtable.h	2005-05-13 17:57:38.000000000 +1000
+++ working-2.6/include/asm-ppc64/pgtable.h	2005-05-13 17:57:39.000000000 +1000
@@ -15,19 +15,24 @@
 #include <asm/tlbflush.h>
 #endif /* __ASSEMBLY__ */
 
-#include <asm-generic/pgtable-nopud.h>
-
 /*
  * Entries per page directory level.  The PTE level must use a 64b record
  * for each page table entry.  The PMD and PGD level use a 32b record for 
  * each entry by assuming that each entry is page aligned.
  */
 #define PTE_INDEX_SIZE  9
-#define PMD_INDEX_SIZE  10
-#define PGD_INDEX_SIZE  10
+#define PMD_INDEX_SIZE  7
+#define PUD_INDEX_SIZE  7
+#define PGD_INDEX_SIZE  9
+
+#define PTE_TABLE_SIZE	(sizeof(pte_t) << PTE_INDEX_SIZE)
+#define PMD_TABLE_SIZE	(sizeof(pmd_t) << PMD_INDEX_SIZE)
+#define PUD_TABLE_SIZE	(sizeof(pud_t) << PUD_INDEX_SIZE)
+#define PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)
 
 #define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
 #define PTRS_PER_PMD	(1 << PMD_INDEX_SIZE)
+#define PTRS_PER_PUD	(1 << PUD_INDEX_SIZE)
 #define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
 
 /* PMD_SHIFT determines what a second-level page table entry can map */
@@ -35,8 +40,13 @@
 #define PMD_SIZE	(1UL << PMD_SHIFT)
 #define PMD_MASK	(~(PMD_SIZE-1))
 
-/* PGDIR_SHIFT determines what a third-level page table entry can map */
-#define PGDIR_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
+/* PUD_SHIFT determines what a third-level page table entry can map */
+#define PUD_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
+#define PUD_SIZE	(1UL << PUD_SHIFT)
+#define PUD_MASK	(~(PUD_SIZE-1))
+
+/* PGDIR_SHIFT determines what a fourth-level page table entry can map */
+#define PGDIR_SHIFT	(PUD_SHIFT + PUD_INDEX_SIZE)
 #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 
@@ -45,15 +55,23 @@
 /*
  * Size of EA range mapped by our pagetables.
  */
-#define EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \
-                    PGD_INDEX_SIZE + PAGE_SHIFT)
-#define EADDR_MASK ((1UL << EADDR_SIZE) - 1)
+#define PGTABLE_EADDR_SIZE (PTE_INDEX_SIZE + PMD_INDEX_SIZE + \
+                	    PUD_INDEX_SIZE + PGD_INDEX_SIZE + PAGE_SHIFT)
+#define PGTABLE_RANGE (1UL << PGTABLE_EADDR_SIZE)
+
+#if TASK_SIZE_USER64 > PGTABLE_RANGE
+#error TASK_SIZE_USER64 exceeds pagetable range
+#endif
+
+#if TASK_SIZE_USER64 > (1UL << (USER_ESID_BITS + SID_SHIFT))
+#error TASK_SIZE_USER64 exceeds user VSID range
+#endif
 
 /*
  * Define the address range of the vmalloc VM area.
  */
 #define VMALLOC_START (0xD000000000000000ul)
-#define VMALLOC_SIZE  (0x10000000000UL)
+#define VMALLOC_SIZE  (0x80000000000UL)
 #define VMALLOC_END   (VMALLOC_START + VMALLOC_SIZE)
 
 /*
@@ -154,8 +172,6 @@
 #ifndef __ASSEMBLY__
 int hash_huge_page(struct mm_struct *mm, unsigned long access,
 		   unsigned long ea, unsigned long vsid, int local);
-
-void hugetlb_mm_free_pgd(struct mm_struct *mm);
 #endif /* __ASSEMBLY__ */
 
 #define HAVE_ARCH_UNMAPPED_AREA
@@ -163,7 +179,6 @@
 #else
 
 #define hash_huge_page(mm,a,ea,vsid,local)	-1
-#define hugetlb_mm_free_pgd(mm)			do {} while (0)
 
 #endif
 
@@ -197,39 +212,45 @@
 #define pte_pfn(x)		((unsigned long)((pte_val(x) >> PTE_SHIFT)))
 #define pte_page(x)		pfn_to_page(pte_pfn(x))
 
-#define pmd_set(pmdp, ptep) 	\
-	(pmd_val(*(pmdp)) = __ba_to_bpn(ptep))
+#define pmd_set(pmdp, ptep) 	(pmd_val(*(pmdp)) = (unsigned long)(ptep))
 #define pmd_none(pmd)		(!pmd_val(pmd))
 #define	pmd_bad(pmd)		(pmd_val(pmd) == 0)
 #define	pmd_present(pmd)	(pmd_val(pmd) != 0)
 #define	pmd_clear(pmdp)		(pmd_val(*(pmdp)) = 0)
-#define pmd_page_kernel(pmd)	(__bpn_to_ba(pmd_val(pmd)))
+#define pmd_page_kernel(pmd)	(pmd_val(pmd))
 #define pmd_page(pmd)		virt_to_page(pmd_page_kernel(pmd))
 
-#define pud_set(pudp, pmdp)	(pud_val(*(pudp)) = (__ba_to_bpn(pmdp)))
+#define pud_set(pudp, pmdp)	(pud_val(*(pudp)) = (unsigned long)(pmdp))
 #define pud_none(pud)		(!pud_val(pud))
-#define pud_bad(pud)		((pud_val(pud)) == 0UL)
-#define pud_present(pud)	(pud_val(pud) != 0UL)
-#define pud_clear(pudp)		(pud_val(*(pudp)) = 0UL)
-#define pud_page(pud)		(__bpn_to_ba(pud_val(pud)))
+#define pud_bad(pud)		((pud_val(pud)) == 0)
+#define pud_present(pud)	(pud_val(pud) != 0)
+#define pud_clear(pudp)		(pud_val(*(pudp)) = 0)
+#define pud_page(pud)		(pud_val(pud))
+
+#define pgd_set(pgdp, pudp)	({pgd_val(*(pgdp)) = (unsigned long)(pudp);})
+#define pgd_none(pgd)		(!pgd_val(pgd))
+#define pgd_bad(pgd)		(pgd_val(pgd) == 0)
+#define pgd_present(pgd)	(pgd_val(pgd) != 0)
+#define pgd_clear(pgdp)		(pgd_val(*(pgdp)) = 0)
+#define pgd_page(pgd)		(pgd_val(pgd))
 
 /* 
  * Find an entry in a page-table-directory.  We combine the address region 
  * (the high order N bits) and the pgd portion of the address.
  */
 /* to avoid overflow in free_pgtables we don't use PTRS_PER_PGD here */
-#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & 0x7ff)
+#define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & 0x1ff)
 
 #define pgd_offset(mm, address)	 ((mm)->pgd + pgd_index(address))
 
-/* Find an entry in the second-level page table.. */
+#define pud_offset(pgdp, addr)	\
+  (((pud_t *) pgd_page(*(pgdp))) + (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
+
 #define pmd_offset(pudp,addr) \
-  ((pmd_t *) pud_page(*(pudp)) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)))
+  (((pmd_t *) pud_page(*(pudp))) + (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)))
 
-/* Find an entry in the third-level page table.. */
 #define pte_offset_kernel(dir,addr) \
-  ((pte_t *) pmd_page_kernel(*(dir)) \
- + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)))
+  (((pte_t *) pmd_page_kernel(*(dir))) + (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)))
 
 #define pte_offset_map(dir,addr)	pte_offset_kernel((dir), (addr))
 #define pte_offset_map_nested(dir,addr)	pte_offset_kernel((dir), (addr))
@@ -458,9 +479,11 @@
 #define pte_same(A,B)	(((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0)
 
 #define pmd_ERROR(e) \
-	printk("%s:%d: bad pmd %08x.\n", __FILE__, __LINE__, pmd_val(e))
+	printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e))
+#define pud_ERROR(e) \
+	printk("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pud_val(e))
 #define pgd_ERROR(e) \
-	printk("%s:%d: bad pgd %08x.\n", __FILE__, __LINE__, pgd_val(e))
+	printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
 
 extern pgd_t swapper_pg_dir[];
 
Index: working-2.6/include/asm-ppc64/page.h
===================================================================
--- working-2.6.orig/include/asm-ppc64/page.h	2005-05-13 17:57:38.000000000 +1000
+++ working-2.6/include/asm-ppc64/page.h	2005-05-13 17:57:39.000000000 +1000
@@ -46,6 +46,7 @@
 
 #define ARCH_HAS_HUGEPAGE_ONLY_RANGE
 #define ARCH_HAS_PREPARE_HUGEPAGE_RANGE
+#define ARCH_HAS_SETCLEAR_HUGE_PTE
 
 #define touches_hugepage_low_range(mm, addr, len) \
 	(LOW_ESID_MASK((addr), (len)) & mm->context.htlb_segs)
@@ -125,36 +126,42 @@
  * Entries in the pte table are 64b, while entries in the pgd & pmd are 32b.
  */
 typedef struct { unsigned long pte; } pte_t;
-typedef struct { unsigned int  pmd; } pmd_t;
-typedef struct { unsigned int  pgd; } pgd_t;
+typedef struct { unsigned long pmd; } pmd_t;
+typedef struct { unsigned long pud; } pud_t;
+typedef struct { unsigned long pgd; } pgd_t;
 typedef struct { unsigned long pgprot; } pgprot_t;
 
 #define pte_val(x)	((x).pte)
 #define pmd_val(x)	((x).pmd)
+#define pud_val(x)	((x).pud)
 #define pgd_val(x)	((x).pgd)
 #define pgprot_val(x)	((x).pgprot)
 
-#define __pte(x)	((pte_t) { (x) } )
-#define __pmd(x)	((pmd_t) { (x) } )
-#define __pgd(x)	((pgd_t) { (x) } )
-#define __pgprot(x)	((pgprot_t) { (x) } )
+#define __pte(x)	((pte_t) { (x) })
+#define __pmd(x)	((pmd_t) { (x) })
+#define __pud(x)	((pud_t) { (x) })
+#define __pgd(x)	((pgd_t) { (x) })
+#define __pgprot(x)	((pgprot_t) { (x) })
 
 #else
 /*
  * .. while these make it easier on the compiler
  */
 typedef unsigned long pte_t;
-typedef unsigned int  pmd_t;
-typedef unsigned int  pgd_t;
+typedef unsigned long pmd_t;
+typedef unsigned long pud_t;
+typedef unsigned long pgd_t;
 typedef unsigned long pgprot_t;
 
 #define pte_val(x)	(x)
 #define pmd_val(x)	(x)
+#define pud_val(x)	(x)
 #define pgd_val(x)	(x)
 #define pgprot_val(x)	(x)
 
 #define __pte(x)	(x)
 #define __pmd(x)	(x)
+#define __pud(x)	(x)
 #define __pgd(x)	(x)
 #define __pgprot(x)	(x)
 
@@ -208,9 +215,6 @@
 #define USER_REGION_ID     (0UL)
 #define REGION_ID(ea)	   (((unsigned long)(ea)) >> REGION_SHIFT)
 
-#define __bpn_to_ba(x) ((((unsigned long)(x)) << PAGE_SHIFT) + KERNELBASE)
-#define __ba_to_bpn(x) ((((unsigned long)(x)) & ~REGION_MASK) >> PAGE_SHIFT)
-
 #define __va(x) ((void *)((unsigned long)(x) + KERNELBASE))
 
 #ifdef CONFIG_DISCONTIGMEM
Index: working-2.6/include/asm-ppc64/pgalloc.h
===================================================================
--- working-2.6.orig/include/asm-ppc64/pgalloc.h	2005-05-13 17:57:38.000000000 +1000
+++ working-2.6/include/asm-ppc64/pgalloc.h	2005-05-13 17:57:39.000000000 +1000
@@ -6,7 +6,7 @@
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
 
-extern kmem_cache_t *zero_cache;
+extern kmem_cache_t *pmd_cache;
 
 /*
  * This program is free software; you can redistribute it and/or
@@ -18,13 +18,31 @@
 static inline pgd_t *
 pgd_alloc(struct mm_struct *mm)
 {
-	return kmem_cache_alloc(zero_cache, GFP_KERNEL);
+	return (pgd_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO);
 }
 
 static inline void
 pgd_free(pgd_t *pgd)
 {
-	kmem_cache_free(zero_cache, pgd);
+	free_page((unsigned long)pgd);
+}
+
+#define pgd_populate(MM, PGD, PUD)	pgd_set(PGD, PUD)
+
+static inline pud_t *
+pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+	pud_t *pudp;
+
+	pudp = kmem_cache_alloc(pmd_cache, GFP_KERNEL|__GFP_REPEAT);
+	memset(pudp, 0, PUD_TABLE_SIZE);
+	return pudp;
+}
+
+static inline void
+pud_free(pud_t *pud)
+{
+	kmem_cache_free(pmd_cache, pud);
 }
 
 #define pud_populate(MM, PUD, PMD)	pud_set(PUD, PMD)
@@ -32,13 +50,17 @@
 static inline pmd_t *
 pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT);
+	pmd_t *pmdp;
+
+	pmdp = kmem_cache_alloc(pmd_cache, GFP_KERNEL|__GFP_REPEAT);
+	memset(pmdp, 0, PMD_TABLE_SIZE);
+	return pmdp;
 }
 
 static inline void
 pmd_free(pmd_t *pmd)
 {
-	kmem_cache_free(zero_cache, pmd);
+	kmem_cache_free(pmd_cache, pmd);
 }
 
 #define pmd_populate_kernel(mm, pmd, pte) pmd_set(pmd, pte)
@@ -47,44 +69,54 @@
 
 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
-	return kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT);
+	return (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
 }
 
 static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
 {
-	pte_t *pte = kmem_cache_alloc(zero_cache, GFP_KERNEL|__GFP_REPEAT);
-	if (pte)
-		return virt_to_page(pte);
-	return NULL;
+	return alloc_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO);
 }
 		
 static inline void pte_free_kernel(pte_t *pte)
 {
-	kmem_cache_free(zero_cache, pte);
+	free_page((unsigned long)pte);
 }
 
 static inline void pte_free(struct page *ptepage)
 {
-	kmem_cache_free(zero_cache, page_address(ptepage));
+	__free_page(ptepage);
 }
 
-struct pte_freelist_batch
+typedef struct pgtable_free {
+	unsigned long val;
+} pgtable_free_t;
+
+static inline pgtable_free_t pgtable_free_page(struct page *page)
 {
-	struct rcu_head	rcu;
-	unsigned int	index;
-	struct page *	pages[0];
-};
+	return (pgtable_free_t){.val = (unsigned long) page};
+}
 
-#define PTE_FREELIST_SIZE	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) / \
-				  sizeof(struct page *))
+static inline pgtable_free_t pgtable_free_cache(void *p)
+{
+	return (pgtable_free_t){.val = ((unsigned long) p) | 1};
+}
 
-extern void pte_free_now(struct page *ptepage);
-extern void pte_free_submit(struct pte_freelist_batch *batch);
+static inline void pgtable_free(pgtable_free_t pgf)
+{
+	if (pgf.val & 1)
+		kmem_cache_free(pmd_cache, (void *)(pgf.val & ~1));
+	else
+		__free_page((struct page *)pgf.val);
+}
 
-DECLARE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
+void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf);
 
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage);
-#define __pmd_free_tlb(tlb, pmd)	__pte_free_tlb(tlb, virt_to_page(pmd))
+#define __pte_free_tlb(tlb, ptepage)	\
+	pgtable_free_tlb(tlb, pgtable_free_page(ptepage))
+#define __pmd_free_tlb(tlb, pmd)	\
+	pgtable_free_tlb(tlb, pgtable_free_cache(pmd))
+#define __pud_free_tlb(tlb, pud)	\
+	pgtable_free_tlb(tlb, pgtable_free_cache(pud))
 
 #define check_pgt_cache()	do { } while (0)
 
Index: working-2.6/arch/ppc64/mm/init.c
===================================================================
--- working-2.6.orig/arch/ppc64/mm/init.c	2005-05-13 17:57:38.000000000 +1000
+++ working-2.6/arch/ppc64/mm/init.c	2005-05-13 17:57:39.000000000 +1000
@@ -66,6 +66,14 @@
 #include <asm/vdso.h>
 #include <asm/imalloc.h>
 
+#if PGTABLE_RANGE > USER_VSID_RANGE
+#warning Limited user VSID range means pagetable space is wasted
+#endif
+
+#if (TASK_SIZE_USER64 < PGTABLE_RANGE) && (TASK_SIZE_USER64 < USER_VSID_RANGE)
+#warning TASK_SIZE is smaller than it needs to be.
+#endif
+
 int mem_init_done;
 unsigned long ioremap_bot = IMALLOC_BASE;
 static unsigned long phbs_io_bot = PHBS_IO_BASE;
@@ -225,7 +233,7 @@
 	 * Before that, we map using addresses going
 	 * up from ioremap_bot.  imalloc will use
 	 * the addresses from ioremap_bot through
-	 * IMALLOC_END (0xE000001fffffffff)
+	 * IMALLOC_END
 	 * 
 	 */
 	pa = addr & PAGE_MASK;
@@ -416,12 +424,6 @@
 	int index;
 	int err;
 
-#ifdef CONFIG_HUGETLB_PAGE
-	/* We leave htlb_segs as it was, but for a fork, we need to
-	 * clear the huge_pgdir. */
-	mm->context.huge_pgdir = NULL;
-#endif
-
 again:
 	if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
 		return -ENOMEM;
@@ -452,8 +454,6 @@
 	spin_unlock(&mmu_context_lock);
 
 	mm->context.id = NO_CONTEXT;
-
-	hugetlb_mm_free_pgd(mm);
 }
 
 /*
@@ -824,23 +824,18 @@
 	return virt_addr;
 }
 
-kmem_cache_t *zero_cache;
-
-static void zero_ctor(void *pte, kmem_cache_t *cache, unsigned long flags)
-{
-	memset(pte, 0, PAGE_SIZE);
-}
+kmem_cache_t *pmd_cache;
 
 void pgtable_cache_init(void)
 {
-	zero_cache = kmem_cache_create("zero",
-				PAGE_SIZE,
-				0,
-				SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
-				zero_ctor,
-				NULL);
-	if (!zero_cache)
-		panic("pgtable_cache_init(): could not create zero_cache!\n");
+	BUILD_BUG_ON(PTE_TABLE_SIZE != PAGE_SIZE);
+	BUILD_BUG_ON(PMD_TABLE_SIZE != PUD_TABLE_SIZE);
+	BUILD_BUG_ON(PGD_TABLE_SIZE != PAGE_SIZE);
+
+	pmd_cache = kmem_cache_create("pmd", PMD_TABLE_SIZE, PMD_TABLE_SIZE,
+				      0, NULL, NULL);
+	if (! pmd_cache)
+		panic("pmd_pud_cache_init(): could not create pmd_pud_cache!\n");
 }
 
 pgprot_t phys_mem_access_prot(struct file *file, unsigned long addr,
Index: working-2.6/include/asm-ppc64/processor.h
===================================================================
--- working-2.6.orig/include/asm-ppc64/processor.h	2005-05-13 17:57:38.000000000 +1000
+++ working-2.6/include/asm-ppc64/processor.h	2005-05-13 18:03:37.000000000 +1000
@@ -530,8 +530,8 @@
 extern struct task_struct *last_task_used_math;
 extern struct task_struct *last_task_used_altivec;
 
-/* 64-bit user address space is 41-bits (2TBs user VM) */
-#define TASK_SIZE_USER64 (0x0000020000000000UL)
+/* 64-bit user address space is 44-bits (16TB user VM) */
+#define TASK_SIZE_USER64 (0x0000100000000000UL)
 
 /* 
  * 32-bit user address space is 4GB - 1 page 
Index: working-2.6/arch/ppc64/kernel/head.S
===================================================================
--- working-2.6.orig/arch/ppc64/kernel/head.S	2005-05-13 17:57:38.000000000 +1000
+++ working-2.6/arch/ppc64/kernel/head.S	2005-05-13 17:57:39.000000000 +1000
@@ -38,6 +38,7 @@
 #include <asm/cputable.h>
 #include <asm/setup.h>
 #include <asm/hvcall.h>
+#include <asm/pgtable.h>
 
 #ifdef CONFIG_PPC_ISERIES
 #define DO_SOFT_DISABLE
@@ -2117,17 +2118,17 @@
 empty_zero_page:
 	.space	4096
 
-	.globl	swapper_pg_dir
-swapper_pg_dir:
-	.space	4096
-
 #ifdef CONFIG_SMP
 /* 1 page segment table per cpu (max 48, cpu0 allocated at STAB0_PHYS_ADDR) */
 	.globl	stab_array
 stab_array:
 	.space	4096 * 48
 #endif
-	
+
+	.globl	swapper_pg_dir
+swapper_pg_dir:
+	.space	PAGE_SIZE
+
 /*
  * This space gets a copy of optional info passed to us by the bootstrap
  * Used to pass parameters into the kernel like root=/dev/sda1, etc.
Index: working-2.6/arch/ppc64/mm/imalloc.c
===================================================================
--- working-2.6.orig/arch/ppc64/mm/imalloc.c	2005-05-13 17:57:38.000000000 +1000
+++ working-2.6/arch/ppc64/mm/imalloc.c	2005-05-13 17:57:39.000000000 +1000
@@ -31,7 +31,7 @@
 			break;
 		if ((unsigned long)tmp->addr >= ioremap_bot)
 			addr = tmp->size + (unsigned long) tmp->addr;
-		if (addr > IMALLOC_END-size) 
+		if (addr >= IMALLOC_END-size)
 			return 1;
 	}
 	*im_addr = addr;
Index: working-2.6/arch/ppc64/mm/hash_utils.c
===================================================================
--- working-2.6.orig/arch/ppc64/mm/hash_utils.c	2005-05-13 17:57:38.000000000 +1000
+++ working-2.6/arch/ppc64/mm/hash_utils.c	2005-05-13 17:57:39.000000000 +1000
@@ -298,7 +298,7 @@
 	int local = 0;
 	cpumask_t tmp;
 
-	if ((ea & ~REGION_MASK) > EADDR_MASK)
+	if ((ea & ~REGION_MASK) >= PGTABLE_RANGE)
 		return 1;
 
  	switch (REGION_ID(ea)) {
Index: working-2.6/include/asm-ppc64/mmu.h
===================================================================
--- working-2.6.orig/include/asm-ppc64/mmu.h	2005-05-13 17:57:38.000000000 +1000
+++ working-2.6/include/asm-ppc64/mmu.h	2005-05-13 17:57:39.000000000 +1000
@@ -259,8 +259,10 @@
 #define VSID_BITS	36
 #define VSID_MODULUS	((1UL<<VSID_BITS)-1)
 
-#define CONTEXT_BITS	20
-#define USER_ESID_BITS	15
+#define CONTEXT_BITS	19
+#define USER_ESID_BITS	16
+
+#define USER_VSID_RANGE	(1UL << (USER_ESID_BITS + SID_SHIFT))
 
 /*
  * This macro generates asm code to compute the VSID scramble
@@ -302,7 +304,6 @@
 typedef struct {
 	mm_context_id_t id;
 #ifdef CONFIG_HUGETLB_PAGE
-	pgd_t *huge_pgdir;
 	u16 htlb_segs; /* bitmask */
 #endif
 } mm_context_t;
Index: working-2.6/include/asm-ppc64/imalloc.h
===================================================================
--- working-2.6.orig/include/asm-ppc64/imalloc.h	2005-05-13 17:57:38.000000000 +1000
+++ working-2.6/include/asm-ppc64/imalloc.h	2005-05-13 17:57:39.000000000 +1000
@@ -6,7 +6,7 @@
  */
 #define PHBS_IO_BASE  	  VMALLOC_END
 #define IMALLOC_BASE      (PHBS_IO_BASE + 0x80000000ul)	/* Reserve 2 gigs for PHBs */
-#define IMALLOC_END       (VMALLOC_START + EADDR_MASK)
+#define IMALLOC_END       (VMALLOC_START + PGTABLE_RANGE)
 
 
 /* imalloc region types */
Index: working-2.6/arch/ppc64/mm/tlb.c
===================================================================
--- working-2.6.orig/arch/ppc64/mm/tlb.c	2005-05-13 17:57:38.000000000 +1000
+++ working-2.6/arch/ppc64/mm/tlb.c	2005-05-13 17:57:39.000000000 +1000
@@ -41,7 +41,58 @@
 DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
 unsigned long pte_freelist_forced_free;
 
-void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage)
+struct pte_freelist_batch
+{
+	struct rcu_head	rcu;
+	unsigned int	index;
+	pgtable_free_t	tables[0];
+};
+
+DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
+unsigned long pte_freelist_forced_free;
+
+#define PTE_FREELIST_SIZE \
+	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) \
+	  / sizeof(pgtable_free_t))
+
+#ifdef CONFIG_SMP
+static void pte_free_smp_sync(void *arg)
+{
+	/* Do nothing, just ensure we sync with all CPUs */
+}
+#endif
+
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+static void pgtable_free_now(pgtable_free_t pgf)
+{
+	pte_freelist_forced_free++;
+
+	smp_call_function(pte_free_smp_sync, NULL, 0, 1);
+
+	pgtable_free(pgf);
+}
+
+static void pte_free_rcu_callback(struct rcu_head *head)
+{
+	struct pte_freelist_batch *batch =
+		container_of(head, struct pte_freelist_batch, rcu);
+	unsigned int i;
+
+	for (i = 0; i < batch->index; i++)
+		pgtable_free(batch->tables[i]);
+
+	free_page((unsigned long)batch);
+}
+
+static void pte_free_submit(struct pte_freelist_batch *batch)
+{
+	INIT_RCU_HEAD(&batch->rcu);
+	call_rcu(&batch->rcu, pte_free_rcu_callback);
+}
+
+void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf)
 {
 	/* This is safe as we are holding page_table_lock */
         cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
@@ -49,19 +100,19 @@
 
 	if (atomic_read(&tlb->mm->mm_users) < 2 ||
 	    cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask)) {
-		pte_free(ptepage);
+		pgtable_free(pgf);
 		return;
 	}
 
 	if (*batchp == NULL) {
 		*batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
 		if (*batchp == NULL) {
-			pte_free_now(ptepage);
+			pgtable_free_now(pgf);
 			return;
 		}
 		(*batchp)->index = 0;
 	}
-	(*batchp)->pages[(*batchp)->index++] = ptepage;
+	(*batchp)->tables[(*batchp)->index++] = pgf;
 	if ((*batchp)->index == PTE_FREELIST_SIZE) {
 		pte_free_submit(*batchp);
 		*batchp = NULL;
@@ -132,42 +183,6 @@
 	put_cpu();
 }
 
-#ifdef CONFIG_SMP
-static void pte_free_smp_sync(void *arg)
-{
-	/* Do nothing, just ensure we sync with all CPUs */
-}
-#endif
-
-/* This is only called when we are critically out of memory
- * (and fail to get a page in pte_free_tlb).
- */
-void pte_free_now(struct page *ptepage)
-{
-	pte_freelist_forced_free++;
-
-	smp_call_function(pte_free_smp_sync, NULL, 0, 1);
-
-	pte_free(ptepage);
-}
-
-static void pte_free_rcu_callback(struct rcu_head *head)
-{
-	struct pte_freelist_batch *batch =
-		container_of(head, struct pte_freelist_batch, rcu);
-	unsigned int i;
-
-	for (i = 0; i < batch->index; i++)
-		pte_free(batch->pages[i]);
-	free_page((unsigned long)batch);
-}
-
-void pte_free_submit(struct pte_freelist_batch *batch)
-{
-	INIT_RCU_HEAD(&batch->rcu);
-	call_rcu(&batch->rcu, pte_free_rcu_callback);
-}
-
 void pte_free_finish(void)
 {
 	/* This is safe as we are holding page_table_lock */
Index: working-2.6/arch/ppc64/mm/hugetlbpage.c
===================================================================
--- working-2.6.orig/arch/ppc64/mm/hugetlbpage.c	2005-05-13 17:57:38.000000000 +1000
+++ working-2.6/arch/ppc64/mm/hugetlbpage.c	2005-05-13 17:57:39.000000000 +1000
@@ -27,124 +27,93 @@
 
 #include <linux/sysctl.h>
 
-#define	HUGEPGDIR_SHIFT		(HPAGE_SHIFT + PAGE_SHIFT - 3)
-#define HUGEPGDIR_SIZE		(1UL << HUGEPGDIR_SHIFT)
-#define HUGEPGDIR_MASK		(~(HUGEPGDIR_SIZE-1))
-
-#define HUGEPTE_INDEX_SIZE	9
-#define HUGEPGD_INDEX_SIZE	10
-
-#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
-#define PTRS_PER_HUGEPGD	(1 << HUGEPGD_INDEX_SIZE)
-
-static inline int hugepgd_index(unsigned long addr)
+/* Modelled after find_linux_pte() */
+pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
 {
-	return (addr & ~REGION_MASK) >> HUGEPGDIR_SHIFT;
-}
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	pte_t *pt;
 
-static pud_t *hugepgd_offset(struct mm_struct *mm, unsigned long addr)
-{
-	int index;
+	BUG_ON(! in_hugepage_area(mm->context, addr));
 
-	if (! mm->context.huge_pgdir)
-		return NULL;
+	addr &= HPAGE_MASK;
 
+	pg = pgd_offset(mm, addr);
+	if (!pgd_none(*pg)) {
+		pu = pud_offset(pg, addr);
+		if (!pud_none(*pu)) {
+			pm = pmd_offset(pu, addr);
+			pt = (pte_t *)pm;
+			BUG_ON(!pmd_none(*pm)
+			       && !(pte_present(*pt) && pte_huge(*pt)));
+			return pt;
+		}
+	}
 
-	index = hugepgd_index(addr);
-	BUG_ON(index >= PTRS_PER_HUGEPGD);
-	return (pud_t *)(mm->context.huge_pgdir + index);
+	return NULL;
 }
 
-static inline pte_t *hugepte_offset(pud_t *dir, unsigned long addr)
+pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
-	int index;
-
-	if (pud_none(*dir))
-		return NULL;
-
-	index = (addr >> HPAGE_SHIFT) % PTRS_PER_HUGEPTE;
-	return (pte_t *)pud_page(*dir) + index;
-}
+	pgd_t *pg;
+	pud_t *pu;
+	pmd_t *pm;
+	pte_t *pt;
 
-static pud_t *hugepgd_alloc(struct mm_struct *mm, unsigned long addr)
-{
 	BUG_ON(! in_hugepage_area(mm->context, addr));
 
-	if (! mm->context.huge_pgdir) {
-		pgd_t *new;
-		spin_unlock(&mm->page_table_lock);
-		/* Don't use pgd_alloc(), because we want __GFP_REPEAT */
-		new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
-		BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
-		spin_lock(&mm->page_table_lock);
-
-		/*
-		 * Because we dropped the lock, we should re-check the
-		 * entry, as somebody else could have populated it..
-		 */
-		if (mm->context.huge_pgdir)
-			pgd_free(new);
-		else
-			mm->context.huge_pgdir = new;
-	}
-	return hugepgd_offset(mm, addr);
-}
+	addr &= HPAGE_MASK;
 
-static pte_t *hugepte_alloc(struct mm_struct *mm, pud_t *dir, unsigned long addr)
-{
-	if (! pud_present(*dir)) {
-		pte_t *new;
+	pg = pgd_offset(mm, addr);
+	pu = pud_alloc(mm, pg, addr);
 
-		spin_unlock(&mm->page_table_lock);
-		new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
-		BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
-		spin_lock(&mm->page_table_lock);
-		/*
-		 * Because we dropped the lock, we should re-check the
-		 * entry, as somebody else could have populated it..
-		 */
-		if (pud_present(*dir)) {
-			if (new)
-				kmem_cache_free(zero_cache, new);
-		} else {
-			struct page *ptepage;
-
-			if (! new)
-				return NULL;
-			ptepage = virt_to_page(new);
-			ptepage->mapping = (void *) mm;
-			ptepage->index = addr & HUGEPGDIR_MASK;
-			pud_populate(mm, dir, new);
+	if (pu) {
+		pm = pmd_alloc(mm, pu, addr);
+		if (pm) {
+			pt = (pte_t *)pm;
+			BUG_ON(!pmd_none(*pm)
+			       && !(pte_present(*pt) && pte_huge(*pt)));
+			return pt;
 		}
 	}
 
-	return hugepte_offset(dir, addr);
+	return NULL;
 }
 
-pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
-{
-	pud_t *pud;
+#define HUGEPTE_BATCH_SIZE	(HPAGE_SIZE / PMD_SIZE)
 
-	BUG_ON(! in_hugepage_area(mm->context, addr));
+void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
+		     pte_t *ptep, pte_t pte)
+{
+	int i;
 
-	pud = hugepgd_offset(mm, addr);
-	if (! pud)
-		return NULL;
+	if (pte_present(*ptep)) {
+		pte_clear(mm, addr, ptep);
+		flush_tlb_pending();
+	}
 
-	return hugepte_offset(pud, addr);
+	for (i = 0; i < HUGEPTE_BATCH_SIZE; i++) {
+		*ptep = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS);
+		ptep++;
+	}
 }
 
-pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
+pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
+			      pte_t *ptep)
 {
-	pud_t *pud;
+	unsigned long old = pte_update(ptep, ~0UL);
+	int i;
 
-	BUG_ON(! in_hugepage_area(mm->context, addr));
+	if (old & _PAGE_HASHPTE)
+		hpte_update(mm, addr, old, 0);
 
-	pud = hugepgd_alloc(mm, addr);
-	if (! pud)
-		return NULL;
+	for (i = 1; i < HUGEPTE_BATCH_SIZE; i++) {
+		*ptep = __pte(0);
+		ptep++;
+	}
 
-	return hugepte_alloc(mm, pud, addr);
+	return __pte(old);
 }
 
 /*
@@ -517,42 +486,6 @@
 	}
 }
 
-void hugetlb_mm_free_pgd(struct mm_struct *mm)
-{
-	int i;
-	pgd_t *pgdir;
-
-	spin_lock(&mm->page_table_lock);
-
-	pgdir = mm->context.huge_pgdir;
-	if (! pgdir)
-		goto out;
-
-	mm->context.huge_pgdir = NULL;
-
-	/* cleanup any hugepte pages leftover */
-	for (i = 0; i < PTRS_PER_HUGEPGD; i++) {
-		pud_t *pud = (pud_t *)(pgdir + i);
-
-		if (! pud_none(*pud)) {
-			pte_t *pte = (pte_t *)pud_page(*pud);
-			struct page *ptepage = virt_to_page(pte);
-
-			ptepage->mapping = NULL;
-
-			BUG_ON(memcmp(pte, empty_zero_page, PAGE_SIZE));
-			kmem_cache_free(zero_cache, pte);
-		}
-		pud_clear(pud);
-	}
-
-	BUG_ON(memcmp(pgdir, empty_zero_page, PAGE_SIZE));
-	kmem_cache_free(zero_cache, pgdir);
-
- out:
-	spin_unlock(&mm->page_table_lock);
-}
-
 int hash_huge_page(struct mm_struct *mm, unsigned long access,
 		   unsigned long ea, unsigned long vsid, int local)
 {


-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/people/dgibson


