RFC: PPC64 hugepage rework

David Gibson david at gibson.dropbear.id.au
Tue Sep 21 10:37:25 EST 2004


On Mon, Sep 20, 2004 at 11:30:24AM -0500, Joel Schopp wrote:
> >The patch below reworks the ppc64 hugepage code.  Instead of using
> >specially marked pmd entries in the normal pagetables to represent
> >hugepages, use normal pte_t entries, in a special set of pagetables
> >used for hugepages only.
> >
> >Using pte_t instead of a special hugepte_t makes the code more similar
> >to that for other architecturess, allowing more possibilities for
> >consolidating the hugepage code.
> 
> Excellent!
> 
> 
> >-static void flush_hash_hugepage(mm_context_t context, unsigned long ea,
> >-				hugepte_t pte, int local);
> >-
> 
> If you remove this you might fix up flush_hash_page so it works with 
> huge pages.  Or did you do that and I just missed it?

Did that - hpte_update() and flush_hash_page() now understand huge
pages.

> >-int
> >-follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> >-		    struct page **pages, struct vm_area_struct **vmas,
> >-		    unsigned long *position, int *length, int i)
> >+int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
> >+			struct page **pages, struct vm_area_struct **vmas,
> >+			unsigned long *position, int *length, int i)
> > {
> 
> This seems like unnecessary churn.

True enough, removed.

> > {
> >-	struct page *page;
> >-
> >-	BUG_ON(! pmd_hugepage(*pmd));
> >-
> >-	page = hugepte_page(*(hugepte_t *)pmd);
> >-	if (page)
> >-		page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
> >-	return page;
> >+	BUG();
> >+	return NULL;
> > }
> 
> Why not just remove the function and let the compiler catch any errors 
> instead of catching them at runtime?

That function is referenced from generic code.  It should never be
called, because our pmd_huge() always returns zero, but that's not
inline, so I don't think the compiler can know that.

Below is a new version of the patch which also includes some other
changes to make the code more similar to other architectures, in
preparation for more consolidation work:

Rework the ppc64 hugepage code.  Instead of using specially marked pmd
entries in the normal pagetables to represent hugepages, use normal
pte_t entries, in a special set of pagetables used for hugepages only.

Using pte_t instead of a special hugepte_t makes the code more similar
to that for other architectures, allowing more possibilities for
consolidating the hugepage code.

Using independent pagetables for the hugepages is also a prerequisite
for moving the hugepages into their own region well outside the normal
user address space.  The restrictions imposed by the powerpc mmu's
segment design mean we probably want to do that in the fairly near
future.

Index: working-2.6/include/asm-ppc64/pgtable.h
===================================================================
--- working-2.6.orig/include/asm-ppc64/pgtable.h	2004-09-15 10:53:53.000000000 +1000
+++ working-2.6/include/asm-ppc64/pgtable.h	2004-09-20 14:15:57.000000000 +1000
@@ -98,6 +98,7 @@
 #define _PAGE_BUSY	0x0800 /* software: PTE & hash are busy */ 
 #define _PAGE_SECONDARY 0x8000 /* software: HPTE is in secondary group */
 #define _PAGE_GROUP_IX  0x7000 /* software: HPTE index within group */
+#define _PAGE_HUGE	0x10000 /* 16MB page */
 /* Bits 0x7000 identify the index within an HPT Group */
 #define _PAGE_HPTEFLAGS (_PAGE_BUSY | _PAGE_HASHPTE | _PAGE_SECONDARY | _PAGE_GROUP_IX)
 /* PAGE_MASK gives the right answer below, but only by accident */
@@ -157,19 +158,19 @@
 #endif /* __ASSEMBLY__ */
 
 /* shift to put page number into pte */
-#define PTE_SHIFT (16)
+#define PTE_SHIFT (17)
 
 /* We allow 2^41 bytes of real memory, so we need 29 bits in the PMD
  * to give the PTE page number.  The bottom two bits are for flags. */
 #define PMD_TO_PTEPAGE_SHIFT (2)
 
 #ifdef CONFIG_HUGETLB_PAGE
-#define _PMD_HUGEPAGE	0x00000001U
-#define HUGEPTE_BATCH_SIZE (1<<(HPAGE_SHIFT-PMD_SHIFT))
 
 #ifndef __ASSEMBLY__
 int hash_huge_page(struct mm_struct *mm, unsigned long access,
 		   unsigned long ea, unsigned long vsid, int local);
+
+void hugetlb_mm_free_pgd(struct mm_struct *mm);
 #endif /* __ASSEMBLY__ */
 
 #define HAVE_ARCH_UNMAPPED_AREA
@@ -177,7 +178,7 @@
 #else
 
 #define hash_huge_page(mm,a,ea,vsid,local)	-1
-#define _PMD_HUGEPAGE	0
+#define hugetlb_mm_free_pgd(mm)			do {} while (0)
 
 #endif
 
@@ -213,10 +214,8 @@
 #define pmd_set(pmdp, ptep) 	\
 	(pmd_val(*(pmdp)) = (__ba_to_bpn(ptep) << PMD_TO_PTEPAGE_SHIFT))
 #define pmd_none(pmd)		(!pmd_val(pmd))
-#define	pmd_hugepage(pmd)	(!!(pmd_val(pmd) & _PMD_HUGEPAGE))
-#define	pmd_bad(pmd)		(((pmd_val(pmd)) == 0) || pmd_hugepage(pmd))
-#define	pmd_present(pmd)	((!pmd_hugepage(pmd)) \
-				 && (pmd_val(pmd) & ~_PMD_HUGEPAGE) != 0)
+#define	pmd_bad(pmd)		(pmd_val(pmd) == 0)
+#define	pmd_present(pmd)	(pmd_val(pmd) != 0)
 #define	pmd_clear(pmdp)		(pmd_val(*(pmdp)) = 0)
 #define pmd_page_kernel(pmd)	\
 	(__bpn_to_ba(pmd_val(pmd) >> PMD_TO_PTEPAGE_SHIFT))
@@ -269,6 +268,7 @@
 static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY;}
 static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED;}
 static inline int pte_file(pte_t pte) { return pte_val(pte) & _PAGE_FILE;}
+static inline int pte_huge(pte_t pte) { return pte_val(pte) & _PAGE_HUGE;}
 
 static inline void pte_uncache(pte_t pte) { pte_val(pte) |= _PAGE_NO_CACHE; }
 static inline void pte_cache(pte_t pte)   { pte_val(pte) &= ~_PAGE_NO_CACHE; }
@@ -294,6 +294,8 @@
 	pte_val(pte) |= _PAGE_DIRTY; return pte; }
 static inline pte_t pte_mkyoung(pte_t pte) {
 	pte_val(pte) |= _PAGE_ACCESSED; return pte; }
+static inline pte_t pte_mkhuge(pte_t pte) {
+	pte_val(pte) |= _PAGE_HUGE; return pte; }
 
 /* Atomic PTE updates */
 static inline unsigned long pte_update(pte_t *p, unsigned long clr)
@@ -464,6 +466,10 @@
 
 extern void paging_init(void);
 
+struct mmu_gather;
+void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
+			   unsigned long start, unsigned long end);
+
 /*
  * This gets called at the end of handling a page fault, when
  * the kernel has put a new PTE into the page table for the process.
Index: working-2.6/arch/ppc64/mm/hugetlbpage.c
===================================================================
--- working-2.6.orig/arch/ppc64/mm/hugetlbpage.c	2004-09-20 10:12:50.000000000 +1000
+++ working-2.6/arch/ppc64/mm/hugetlbpage.c	2004-09-21 10:34:46.384789688 +1000
@@ -27,116 +27,143 @@
 
 #include <linux/sysctl.h>
 
-/* HugePTE layout:
- *
- * 31 30 ... 15 14 13 12 10 9  8  7   6    5    4    3    2    1    0
- * PFN>>12..... -  -  -  -  -  -  HASH_IX....   2ND  HASH RW   -    HG=1
- */
+#define	HUGEPGDIR_SHIFT		(HPAGE_SHIFT + PAGE_SHIFT - 3)
+#define HUGEPGDIR_SIZE		(1UL << HUGEPGDIR_SHIFT)
+#define HUGEPGDIR_MASK		(~(HUGEPGDIR_SIZE-1))
+
+#define HUGEPTE_INDEX_SIZE	9
+#define HUGEPGD_INDEX_SIZE	10
+
+#define PTRS_PER_HUGEPTE	(1 << HUGEPTE_INDEX_SIZE)
+#define PTRS_PER_HUGEPGD	(1 << HUGEPGD_INDEX_SIZE)
 
-#define HUGEPTE_SHIFT	15
-#define _HUGEPAGE_PFN		0xffff8000
-#define _HUGEPAGE_BAD		0x00007f00
-#define _HUGEPAGE_HASHPTE	0x00000008
-#define _HUGEPAGE_SECONDARY	0x00000010
-#define _HUGEPAGE_GROUP_IX	0x000000e0
-#define _HUGEPAGE_HPTEFLAGS	(_HUGEPAGE_HASHPTE | _HUGEPAGE_SECONDARY | \
-				 _HUGEPAGE_GROUP_IX)
-#define _HUGEPAGE_RW		0x00000004
-
-typedef struct {unsigned int val;} hugepte_t;
-#define hugepte_val(hugepte)	((hugepte).val)
-#define __hugepte(x)		((hugepte_t) { (x) } )
-#define hugepte_pfn(x)		\
-	((unsigned long)(hugepte_val(x)>>HUGEPTE_SHIFT) << HUGETLB_PAGE_ORDER)
-#define mk_hugepte(page,wr)	__hugepte( \
-	((page_to_pfn(page)>>HUGETLB_PAGE_ORDER) << HUGEPTE_SHIFT ) \
-	| (!!(wr) * _HUGEPAGE_RW) | _PMD_HUGEPAGE )
-
-#define hugepte_bad(x)	( !(hugepte_val(x) & _PMD_HUGEPAGE) || \
-			  (hugepte_val(x) & _HUGEPAGE_BAD) )
-#define hugepte_page(x)	pfn_to_page(hugepte_pfn(x))
-#define hugepte_none(x)	(!(hugepte_val(x) & _HUGEPAGE_PFN))
-
-
-static void flush_hash_hugepage(mm_context_t context, unsigned long ea,
-				hugepte_t pte, int local);
-
-static inline unsigned int hugepte_update(hugepte_t *p, unsigned int clr,
-					  unsigned int set)
-{
-	unsigned int old, tmp;
-
-	__asm__ __volatile__(
-	"1:	lwarx	%0,0,%3		# pte_update\n\
-	andc	%1,%0,%4 \n\
-	or	%1,%1,%5 \n\
-	stwcx.	%1,0,%3 \n\
-	bne-	1b"
-	: "=&r" (old), "=&r" (tmp), "=m" (*p)
-	: "r" (p), "r" (clr), "r" (set), "m" (*p)
-	: "cc" );
-	return old;
+static inline int hugepgd_index(unsigned long addr)
+{
+	return (addr & ~REGION_MASK) >> HUGEPGDIR_SHIFT;
 }
 
-static inline void set_hugepte(hugepte_t *ptep, hugepte_t pte)
+static pgd_t *hugepgd_offset(struct mm_struct *mm, unsigned long addr)
 {
-	hugepte_update(ptep, ~_HUGEPAGE_HPTEFLAGS,
-		       hugepte_val(pte) & ~_HUGEPAGE_HPTEFLAGS);
+	int index;
+
+	if (! mm->context.huge_pgdir)
+		return NULL;
+
+
+	index = hugepgd_index(addr);
+	BUG_ON(index >= PTRS_PER_HUGEPGD);
+	return mm->context.huge_pgdir + index;
 }
 
-static hugepte_t *hugepte_alloc(struct mm_struct *mm, unsigned long addr)
+static inline pte_t *hugepte_offset(pgd_t *dir, unsigned long addr)
 {
-	pgd_t *pgd;
-	pmd_t *pmd = NULL;
+	int index;
 
-	BUG_ON(!in_hugepage_area(mm->context, addr));
+	if (pgd_none(*dir))
+		return NULL;
 
-	pgd = pgd_offset(mm, addr);
-	pmd = pmd_alloc(mm, pgd, addr);
+	index = (addr >> HPAGE_SHIFT) % PTRS_PER_HUGEPTE;
+	return (pte_t *)pgd_page(*dir) + index;
+}
 
-	/* We shouldn't find a (normal) PTE page pointer here */
-	BUG_ON(!pmd_none(*pmd) && !pmd_hugepage(*pmd));
-	
-	return (hugepte_t *)pmd;
+static pgd_t *hugepgd_alloc(struct mm_struct *mm, unsigned long addr)
+{
+	BUG_ON(! in_hugepage_area(mm->context, addr));
+
+	if (! mm->context.huge_pgdir) {
+		pgd_t *new;
+		spin_unlock(&mm->page_table_lock);
+		/* Don't use pgd_alloc(), because we want __GFP_REPEAT */
+		new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
+		BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
+		spin_lock(&mm->page_table_lock);
+
+		/*
+		 * Because we dropped the lock, we should re-check the
+		 * entry, as somebody else could have populated it..
+		 */
+		if (mm->context.huge_pgdir)
+			pgd_free(new);
+		else
+			mm->context.huge_pgdir = new;
+	}
+	return hugepgd_offset(mm, addr);
 }
 
-static hugepte_t *hugepte_offset(struct mm_struct *mm, unsigned long addr)
+static pte_t *hugepte_alloc(struct mm_struct *mm, pgd_t *dir,
+			    unsigned long addr)
 {
-	pgd_t *pgd;
-	pmd_t *pmd = NULL;
+	if (! pgd_present(*dir)) {
+		pte_t *new;
 
-	BUG_ON(!in_hugepage_area(mm->context, addr));
+		spin_unlock(&mm->page_table_lock);
+		new = kmem_cache_alloc(zero_cache, GFP_KERNEL | __GFP_REPEAT);
+		BUG_ON(memcmp(new, empty_zero_page, PAGE_SIZE));
+		spin_lock(&mm->page_table_lock);
+		/*
+		 * Because we dropped the lock, we should re-check the
+		 * entry, as somebody else could have populated it..
+		 */
+		if (pgd_present(*dir)) {
+			if (new)
+				kmem_cache_free(zero_cache, new);
+		} else {
+			struct page *ptepage;
 
-	pgd = pgd_offset(mm, addr);
-	if (pgd_none(*pgd))
-		return NULL;
+			if (! new)
+				return NULL;
+			ptepage = virt_to_page(new);
+			ptepage->mapping = (void *) mm;
+			ptepage->index = addr & HUGEPGDIR_MASK;
+			pgd_populate(mm, dir, new);
+		}
+	}
 
-	pmd = pmd_offset(pgd, addr);
+	return hugepte_offset(dir, addr);
+}
+
+static pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
+{
+	pgd_t *pgd;
 
-	/* We shouldn't find a (normal) PTE page pointer here */
-	BUG_ON(!pmd_none(*pmd) && !pmd_hugepage(*pmd));
+	BUG_ON(! in_hugepage_area(mm->context, addr));
 
-	return (hugepte_t *)pmd;
+	pgd = hugepgd_offset(mm, addr);
+	if (! pgd)
+		return NULL;
+
+	return hugepte_offset(pgd, addr);
 }
 
-static void setup_huge_pte(struct mm_struct *mm, struct page *page,
-			   hugepte_t *ptep, int write_access)
+static pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
-	hugepte_t entry;
-	int i;
+	pgd_t *pgd;
 
-	mm->rss += (HPAGE_SIZE / PAGE_SIZE);
-	entry = mk_hugepte(page, write_access);
-	for (i = 0; i < HUGEPTE_BATCH_SIZE; i++)
-		set_hugepte(ptep+i, entry);
+	BUG_ON(! in_hugepage_area(mm->context, addr));
+
+	pgd = hugepgd_alloc(mm, addr);
+	if (! pgd)
+		return NULL;
+
+	return hugepte_alloc(mm, pgd, addr);
 }
 
-static void teardown_huge_pte(hugepte_t *ptep)
+static void set_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+			 struct page *page, pte_t *ptep, int write_access)
 {
-	int i;
+	pte_t entry;
 
-	for (i = 0; i < HUGEPTE_BATCH_SIZE; i++)
-		pmd_clear((pmd_t *)(ptep+i));
+	mm->rss += (HPAGE_SIZE / PAGE_SIZE);
+	if (write_access) {
+		entry =
+		    pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
+	} else {
+		entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot));
+	}
+	entry = pte_mkyoung(entry);
+	entry = pte_mkhuge(entry);
+
+	set_pte(ptep, entry);
 }
 
 /*
@@ -267,34 +294,31 @@
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			struct vm_area_struct *vma)
 {
-	hugepte_t *src_pte, *dst_pte, entry;
+	pte_t *src_pte, *dst_pte, entry;
 	struct page *ptepage;
 	unsigned long addr = vma->vm_start;
 	unsigned long end = vma->vm_end;
+	int err = -ENOMEM;
 
 	while (addr < end) {
-		BUG_ON(! in_hugepage_area(src->context, addr));
-		BUG_ON(! in_hugepage_area(dst->context, addr));
-
-		dst_pte = hugepte_alloc(dst, addr);
+		dst_pte = huge_pte_alloc(dst, addr);
 		if (!dst_pte)
-			return -ENOMEM;
+			goto out;
 
-		src_pte = hugepte_offset(src, addr);
+		src_pte = huge_pte_offset(src, addr);
 		entry = *src_pte;
 		
-		if ((addr % HPAGE_SIZE) == 0) {
-			/* This is the first hugepte in a batch */
-			ptepage = hugepte_page(entry);
-			get_page(ptepage);
-			dst->rss += (HPAGE_SIZE / PAGE_SIZE);
-		}
-		set_hugepte(dst_pte, entry);
-
+		ptepage = pte_page(entry);
+		get_page(ptepage);
+		dst->rss += (HPAGE_SIZE / PAGE_SIZE);
+		set_pte(dst_pte, entry);
 
-		addr += PMD_SIZE;
+		addr += HPAGE_SIZE;
 	}
-	return 0;
+
+	err = 0;
+ out:
+	return err;
 }
 
 int
@@ -309,18 +333,16 @@
 
 	vpfn = vaddr/PAGE_SIZE;
 	while (vaddr < vma->vm_end && remainder) {
-		BUG_ON(!in_hugepage_area(mm->context, vaddr));
-
 		if (pages) {
-			hugepte_t *pte;
+			pte_t *pte;
 			struct page *page;
 
-			pte = hugepte_offset(mm, vaddr);
+			pte = huge_pte_offset(mm, vaddr);
 
 			/* hugetlb should be locked, and hence, prefaulted */
-			WARN_ON(!pte || hugepte_none(*pte));
+			WARN_ON(!pte || pte_none(*pte));
 
-			page = &hugepte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
 
 			WARN_ON(!PageCompound(page));
 
@@ -346,26 +368,31 @@
 struct page *
 follow_huge_addr(struct mm_struct *mm, unsigned long address, int write)
 {
-	return ERR_PTR(-EINVAL);
+	pte_t *ptep;
+	struct page *page;
+
+	if (! in_hugepage_area(mm->context, address))
+		return ERR_PTR(-EINVAL);
+
+	ptep = huge_pte_offset(mm, address);
+	page = pte_page(*ptep);
+	if (page)
+		page += (address % HPAGE_SIZE) / PAGE_SIZE;
+
+	return page;
 }
 
 int pmd_huge(pmd_t pmd)
 {
-	return pmd_hugepage(pmd);
+	return 0;
 }
 
 struct page *
 follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 		pmd_t *pmd, int write)
 {
-	struct page *page;
-
-	BUG_ON(! pmd_hugepage(*pmd));
-
-	page = hugepte_page(*(hugepte_t *)pmd);
-	if (page)
-		page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
-	return page;
+	BUG();
+	return NULL;
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma,
@@ -373,44 +400,38 @@
 {
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long addr;
-	hugepte_t *ptep;
+	pte_t *ptep;
 	struct page *page;
-	int cpu;
-	int local = 0;
-	cpumask_t tmp;
 
 	WARN_ON(!is_vm_hugetlb_page(vma));
 	BUG_ON((start % HPAGE_SIZE) != 0);
 	BUG_ON((end % HPAGE_SIZE) != 0);
 
-	/* XXX are there races with checking cpu_vm_mask? - Anton */
-	cpu = get_cpu();
-	tmp = cpumask_of_cpu(cpu);
-	if (cpus_equal(vma->vm_mm->cpu_vm_mask, tmp))
-		local = 1;
-
 	for (addr = start; addr < end; addr += HPAGE_SIZE) {
-		hugepte_t pte;
-
-		BUG_ON(!in_hugepage_area(mm->context, addr));
+		pte_t pte;
 
-		ptep = hugepte_offset(mm, addr);
-		if (!ptep || hugepte_none(*ptep))
+		ptep = huge_pte_offset(mm, addr);
+		if (!ptep || pte_none(*ptep))
 			continue;
 
 		pte = *ptep;
-		page = hugepte_page(pte);
-		teardown_huge_pte(ptep);
-		
-		if (hugepte_val(pte) & _HUGEPAGE_HASHPTE)
-			flush_hash_hugepage(mm->context, addr,
-					    pte, local);
+		page = pte_page(pte);
+		pte_clear(ptep);
 
 		put_page(page);
 	}
-	put_cpu();
-
 	mm->rss -= (end - start) >> PAGE_SHIFT;
+	flush_tlb_pending();
+}
+
+void hugetlb_free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *prev,
+			   unsigned long start, unsigned long end)
+{
+	/* Because the huge pgtables are only 2 level, they can take
+	 * at most around 4M, much less than one hugepage which the
+	 * process is presumably entitled to use.  So we don't bother
+	 * freeing up the pagetables on unmap, and wait until
+	 * destroy_context() to clean up the lot. */
 }
 
 int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
@@ -426,16 +447,14 @@
 	spin_lock(&mm->page_table_lock);
 	for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
 		unsigned long idx;
-		hugepte_t *pte = hugepte_alloc(mm, addr);
+		pte_t *pte = huge_pte_alloc(mm, addr);
 		struct page *page;
 
-		BUG_ON(!in_hugepage_area(mm->context, addr));
-
 		if (!pte) {
 			ret = -ENOMEM;
 			goto out;
 		}
-		if (!hugepte_none(*pte))
+		if (! pte_none(*pte))
 			continue;
 
 		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
@@ -462,7 +481,7 @@
 				goto out;
 			}
 		}
-		setup_huge_pte(mm, page, pte, vma->vm_flags & VM_WRITE);
+		set_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
 	}
 out:
 	spin_unlock(&mm->page_table_lock);
@@ -716,20 +735,55 @@
 	}
 }
 
+void hugetlb_mm_free_pgd(struct mm_struct *mm)
+{
+	int i;
+	pgd_t *pgdir;
+
+	spin_lock(&mm->page_table_lock);
+
+	pgdir = mm->context.huge_pgdir;
+	if (! pgdir)
+		return;
+
+	mm->context.huge_pgdir = NULL;
+
+	/* cleanup any hugepte pages leftover */
+	for (i = 0; i < PTRS_PER_HUGEPGD; i++) {
+		pgd_t *pgd = pgdir + i;
+
+		if (! pgd_none(*pgd)) {
+			pte_t *pte = (pte_t *)pgd_page(*pgd);
+			struct page *ptepage = virt_to_page(pte);
+
+			ptepage->mapping = NULL;
+
+			BUG_ON(memcmp(pte, empty_zero_page, PAGE_SIZE));
+			kmem_cache_free(zero_cache, pte);
+		}
+		pgd_clear(pgd);
+	}
+
+	BUG_ON(memcmp(pgdir, empty_zero_page, PAGE_SIZE));
+	kmem_cache_free(zero_cache, pgdir);
+
+	spin_unlock(&mm->page_table_lock);
+}
+
 int hash_huge_page(struct mm_struct *mm, unsigned long access,
 		   unsigned long ea, unsigned long vsid, int local)
 {
-	hugepte_t *ptep;
+	pte_t *ptep;
 	unsigned long va, vpn;
 	int is_write;
-	hugepte_t old_pte, new_pte;
-	unsigned long hpteflags, prpn, flags;
+	pte_t old_pte, new_pte;
+	unsigned long hpteflags, prpn;
 	long slot;
+	int err = 1;
+
+	spin_lock(&mm->page_table_lock);
 
-	/* We have to find the first hugepte in the batch, since
-	 * that's the one that will store the HPTE flags */
-	ea &= HPAGE_MASK;
-	ptep = hugepte_offset(mm, ea);
+	ptep = huge_pte_offset(mm, ea);
 
 	/* Search the Linux page table for a match with va */
 	va = (vsid << 28) | (ea & 0x0fffffff);
@@ -739,19 +793,18 @@
 	 * If no pte found or not present, send the problem up to
 	 * do_page_fault
 	 */
-	if (unlikely(!ptep || hugepte_none(*ptep)))
-		return 1;
+	if (unlikely(!ptep || pte_none(*ptep)))
+		goto out;
 
-	BUG_ON(hugepte_bad(*ptep));
+/* 	BUG_ON(pte_bad(*ptep)); */
 
 	/* 
 	 * Check the user's access rights to the page.  If access should be
 	 * prevented then send the problem up to do_page_fault.
 	 */
 	is_write = access & _PAGE_RW;
-	if (unlikely(is_write && !(hugepte_val(*ptep) & _HUGEPAGE_RW)))
-		return 1;
-
+	if (unlikely(is_write && !(pte_val(*ptep) & _PAGE_RW)))
+		goto out;
 	/*
 	 * At this point, we have a pte (old_pte) which can be used to build
 	 * or update an HPTE. There are 2 cases:
@@ -764,41 +817,40 @@
 	 *	page is currently not DIRTY. 
 	 */
 
-	spin_lock_irqsave(&mm->page_table_lock, flags);
 
 	old_pte = *ptep;
 	new_pte = old_pte;
 
-	hpteflags = 0x2 | (! (hugepte_val(new_pte) & _HUGEPAGE_RW));
+	hpteflags = 0x2 | (! (pte_val(new_pte) & _PAGE_RW));
 
 	/* Check if pte already has an hpte (case 2) */
-	if (unlikely(hugepte_val(old_pte) & _HUGEPAGE_HASHPTE)) {
+	if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
 		/* There MIGHT be an HPTE for this pte */
 		unsigned long hash, slot;
 
 		hash = hpt_hash(vpn, 1);
-		if (hugepte_val(old_pte) & _HUGEPAGE_SECONDARY)
+		if (pte_val(old_pte) & _PAGE_SECONDARY)
 			hash = ~hash;
 		slot = (hash & htab_data.htab_hash_mask) * HPTES_PER_GROUP;
-		slot += (hugepte_val(old_pte) & _HUGEPAGE_GROUP_IX) >> 5;
+		slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;
 
 		if (ppc_md.hpte_updatepp(slot, hpteflags, va, 1, local) == -1)
-			hugepte_val(old_pte) &= ~_HUGEPAGE_HPTEFLAGS;
+			pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
 	}
 
-	if (likely(!(hugepte_val(old_pte) & _HUGEPAGE_HASHPTE))) {
+	if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) {
 		unsigned long hash = hpt_hash(vpn, 1);
 		unsigned long hpte_group;
 
-		prpn = hugepte_pfn(old_pte);
+		prpn = pte_pfn(old_pte);
 
 repeat:
 		hpte_group = ((hash & htab_data.htab_hash_mask) *
 			      HPTES_PER_GROUP) & ~0x7UL;
 
 		/* Update the linux pte with the HPTE slot */
-		hugepte_val(new_pte) &= ~_HUGEPAGE_HPTEFLAGS;
-		hugepte_val(new_pte) |= _HUGEPAGE_HASHPTE;
+		pte_val(new_pte) &= ~_PAGE_HPTEFLAGS;
+		pte_val(new_pte) |= _PAGE_HASHPTE;
 
 		/* Add in WIMG bits */
 		/* XXX We should store these in the pte */
@@ -809,7 +861,7 @@
 
 		/* Primary is full, try the secondary */
 		if (unlikely(slot == -1)) {
-			hugepte_val(new_pte) |= _HUGEPAGE_SECONDARY;
+			pte_val(new_pte) |= _PAGE_SECONDARY;
 			hpte_group = ((~hash & htab_data.htab_hash_mask) *
 				      HPTES_PER_GROUP) & ~0x7UL; 
 			slot = ppc_md.hpte_insert(hpte_group, va, prpn,
@@ -826,39 +878,20 @@
 		if (unlikely(slot == -2))
 			panic("hash_huge_page: pte_insert failed\n");
 
-		hugepte_val(new_pte) |= (slot<<5) & _HUGEPAGE_GROUP_IX;
+		pte_val(new_pte) |= (slot<<12) & _PAGE_GROUP_IX;
 
 		/* 
 		 * No need to use ldarx/stdcx here because all who
 		 * might be updating the pte will hold the
-		 * page_table_lock or the hash_table_lock
-		 * (we hold both)
+		 * page_table_lock 
 		 */
 		*ptep = new_pte;
 	}
 
-	spin_unlock_irqrestore(&mm->page_table_lock, flags);
-
-	return 0;
-}
-
-static void flush_hash_hugepage(mm_context_t context, unsigned long ea,
-				hugepte_t pte, int local)
-{
-	unsigned long vsid, vpn, va, hash, slot;
-
-	BUG_ON(hugepte_bad(pte));
-	BUG_ON(!in_hugepage_area(context, ea));
-
-	vsid = get_vsid(context.id, ea);
+	err = 0;
 
-	va = (vsid << 28) | (ea & 0x0fffffff);
-	vpn = va >> HPAGE_SHIFT;
-	hash = hpt_hash(vpn, 1);
-	if (hugepte_val(pte) & _HUGEPAGE_SECONDARY)
-		hash = ~hash;
-	slot = (hash & htab_data.htab_hash_mask) * HPTES_PER_GROUP;
-	slot += (hugepte_val(pte) & _HUGEPAGE_GROUP_IX) >> 5;
+ out:
+	spin_unlock(&mm->page_table_lock);
 
-	ppc_md.hpte_invalidate(slot, va, 1, local);
+	return err;
 }
Index: working-2.6/include/asm-ppc64/mmu.h
===================================================================
--- working-2.6.orig/include/asm-ppc64/mmu.h	2004-09-20 10:12:50.000000000 +1000
+++ working-2.6/include/asm-ppc64/mmu.h	2004-09-20 14:15:57.000000000 +1000
@@ -24,6 +24,7 @@
 typedef struct {
 	mm_context_id_t id;
 #ifdef CONFIG_HUGETLB_PAGE
+	pgd_t *huge_pgdir;
 	u16 htlb_segs; /* bitmask */
 #endif
 } mm_context_t;
Index: working-2.6/include/asm-ppc64/page.h
===================================================================
--- working-2.6.orig/include/asm-ppc64/page.h	2004-09-20 10:12:50.000000000 +1000
+++ working-2.6/include/asm-ppc64/page.h	2004-09-20 14:15:57.000000000 +1000
@@ -64,7 +64,6 @@
 #define is_hugepage_only_range(addr, len) \
 	(touches_hugepage_high_range((addr), (len)) || \
 	  touches_hugepage_low_range((addr), (len)))
-#define hugetlb_free_pgtables free_pgtables
 #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 
 #define in_hugepage_area(context, addr) \
Index: working-2.6/arch/ppc64/mm/init.c
===================================================================
--- working-2.6.orig/arch/ppc64/mm/init.c	2004-09-20 10:12:50.000000000 +1000
+++ working-2.6/arch/ppc64/mm/init.c	2004-09-20 14:15:57.000000000 +1000
@@ -484,6 +484,12 @@
 	int index;
 	int err;
 
+#ifdef CONFIG_HUGETLB_PAGE
+	/* We leave htlb_segs as it was, but for a fork, we need to
+	 * clear the huge_pgdir. */
+	mm->context.huge_pgdir = NULL;
+#endif
+
 again:
 	if (!idr_pre_get(&mmu_context_idr, GFP_KERNEL))
 		return -ENOMEM;
@@ -514,6 +520,8 @@
 	spin_unlock(&mmu_context_lock);
 
 	mm->context.id = NO_CONTEXT;
+
+	hugetlb_mm_free_pgd(mm);
 }
 
 static int __init mmu_context_init(void)
Index: working-2.6/arch/ppc64/mm/hash_utils.c
===================================================================
--- working-2.6.orig/arch/ppc64/mm/hash_utils.c	2004-09-20 10:12:50.000000000 +1000
+++ working-2.6/arch/ppc64/mm/hash_utils.c	2004-09-20 14:15:57.000000000 +1000
@@ -321,9 +321,7 @@
 		     int local)
 {
 	unsigned long vsid, vpn, va, hash, secondary, slot;
-
-	/* XXX fix for large ptes */
-	unsigned long large = 0;
+	unsigned long huge = pte_huge(pte);
 
 	if ((ea >= USER_START) && (ea <= USER_END))
 		vsid = get_vsid(context, ea);
@@ -331,18 +329,18 @@
 		vsid = get_kernel_vsid(ea);
 
 	va = (vsid << 28) | (ea & 0x0fffffff);
-	if (large)
+	if (huge)
 		vpn = va >> HPAGE_SHIFT;
 	else
 		vpn = va >> PAGE_SHIFT;
-	hash = hpt_hash(vpn, large);
+	hash = hpt_hash(vpn, huge);
 	secondary = (pte_val(pte) & _PAGE_SECONDARY) >> 15;
 	if (secondary)
 		hash = ~hash;
 	slot = (hash & htab_data.htab_hash_mask) * HPTES_PER_GROUP;
 	slot += (pte_val(pte) & _PAGE_GROUP_IX) >> 12;
 
-	ppc_md.hpte_invalidate(slot, va, large, local);
+	ppc_md.hpte_invalidate(slot, va, huge, local);
 }
 
 void flush_hash_range(unsigned long context, unsigned long number, int local)


-- 
David Gibson			| For every complex problem there is a
david AT gibson.dropbear.id.au	| solution which is simple, neat and
				| wrong.
http://www.ozlabs.org/people/dgibson



More information about the Linuxppc64-dev mailing list