hugetlbfs for ppc64 (POWER4)
David Gibson
david at gibson.dropbear.id.au
Fri Jun 6 16:30:51 EST 2003
The patch attached (also up on the LTC patch repository) adds
hugetlbfs support to ppc64, using the 16MB pages available on Power4
hardware. This seems to more or less work, although it has some
limitations and is only lightly tested so far.
The patch is against 2.5.70.
Known problems/limitations:
- as yet untested for 64-bit user programs
- hugepages are marked not-executable; if we change that, we'll
need to ensure that we have dcache/icache flushes where
necessary
- a fixed virtual range is used for mapping hugepages: 2-3G
in 32-bit processes and 1-1.5T in 64-bit processes; see the
usage sketch after this list for a mapping placed in that range.
(removing this restriction entirely would be extremely
difficult because of the PPC's segmented memory model, but
we may be able to add a little more flexibility here).
- attempting to allocate more hugepages than are available
will cause a kernel crash. I'm working on this now, and I
suspect it will be easily fixed.
- testing, testing, testing
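
For reference, here is a minimal userspace sketch (not part of the
patch) of how an application might use the 16MB pages through
hugetlbfs. It assumes hugetlbfs is already mounted at a hypothetical
/mnt/huge and that some hugepages have been reserved, e.g. with the
hugepages= boot option the patch adds; with this patch the kernel
then places the mapping inside the fixed hugepage range described
above.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#define HPAGE_SIZE	(16UL * 1024 * 1024)	/* one POWER4 hugepage */

int main(void)
{
	/* /mnt/huge is a hypothetical hugetlbfs mount point */
	int fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* length must be a multiple of HPAGE_SIZE; no PROT_EXEC, since
	 * the patch maps user hugepages no-execute for now */
	void *p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
		       MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	printf("hugepage mapped at %p\n", p);
	memset(p, 0, HPAGE_SIZE);		/* touch the whole page */

	munmap(p, HPAGE_SIZE);
	close(fd);
	unlink("/mnt/huge/example");
	return 0;
}

While the mapping exists the allocated page shows up as a drop in
HugePages_Free in /proc/meminfo (reported by hugetlb_report_meminfo
in the patch).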
--
David Gibson | For every complex problem there is a
david at gibson.dropbear.id.au | solution which is simple, neat and
| wrong.
http://www.ozlabs.org/people/dgibson
-------------- next part --------------
diff -urN /scratch/anton/export/arch/ppc64/Kconfig linux-gogogo/arch/ppc64/Kconfig
--- /scratch/anton/export/arch/ppc64/Kconfig 2003-06-03 17:49:13.000000000 +1000
+++ linux-gogogo/arch/ppc64/Kconfig 2003-06-04 12:41:52.000000000 +1000
@@ -69,6 +69,17 @@
bool
default y
+config HUGETLB_PAGE
+ bool "Huge TLB Page Support"
+ help
+ This enables support for huge pages. User space applications
+ can make use of this support with the sys_alloc_hugepages and
+ sys_free_hugepages system calls. If your applications are
+ huge page aware and your processor supports this (only POWER4),
+ then say Y here.
+
+ Otherwise, say N.
+
config SMP
bool "Symmetric multi-processing support"
---help---
diff -urN /scratch/anton/export/arch/ppc64/kernel/htab.c linux-gogogo/arch/ppc64/kernel/htab.c
--- /scratch/anton/export/arch/ppc64/kernel/htab.c 2003-04-24 18:54:33.000000000 +1000
+++ linux-gogogo/arch/ppc64/kernel/htab.c 2003-06-03 15:44:09.000000000 +1000
@@ -195,7 +195,7 @@
if (!pgd_none(*pg)) {
pm = pmd_offset(pg, ea);
- if (!pmd_none(*pm)) {
+ if (pmd_present(*pm)) {
pt = pte_offset_kernel(pm, ea);
pte = *pt;
if (!pte_present(pte))
@@ -431,8 +431,12 @@
if (user_region && (mm->cpu_vm_mask == (1 << smp_processor_id())))
local = 1;
- ptep = find_linux_pte(pgdir, ea);
- ret = __hash_page(ea, access, vsid, ptep, trap, local);
+ ret = hash_huge_page(mm, access, ea, vsid, local);
+ if (ret < 0) {
+ ptep = find_linux_pte(pgdir, ea);
+ ret = __hash_page(ea, access, vsid, ptep, trap, local);
+ }
+
spin_unlock(&mm->page_table_lock);
return ret;
diff -urN /scratch/anton/export/arch/ppc64/kernel/stab.c linux-gogogo/arch/ppc64/kernel/stab.c
--- /scratch/anton/export/arch/ppc64/kernel/stab.c 2003-05-06 07:49:37.000000000 +1000
+++ linux-gogogo/arch/ppc64/kernel/stab.c 2003-06-06 14:58:26.000000000 +1000
@@ -204,6 +204,12 @@
vsid_data.data.kp = 1;
if (large)
vsid_data.data.l = 1;
+ /* FIXME: hack alert! we make user hugepages noexec to
+ * sidestep icache/dcache coherence issues for now. We should
+ * fix this properly. */
+ if (large &&
+ (REGION_ID(esid << SID_SHIFT) == USER_REGION_ID))
+ vsid_data.data.n = 1;
if (kernel_segment)
vsid_data.data.c = 1;
@@ -220,7 +226,7 @@
}
static inline void __ste_allocate(unsigned long esid, unsigned long vsid,
- int kernel_segment)
+ int kernel_segment, int hugepage)
{
if (cpu_has_slb()) {
#ifndef CONFIG_PPC_ISERIES
@@ -228,7 +234,7 @@
make_slbe(esid, vsid, 1, kernel_segment);
else
#endif
- make_slbe(esid, vsid, 0, kernel_segment);
+ make_slbe(esid, vsid, hugepage, kernel_segment);
} else {
unsigned char top_entry, stab_entry, *segments;
@@ -254,6 +260,7 @@
{
unsigned long vsid, esid;
int kernel_segment = 0;
+ int hugepage = 0;
PMC_SW_PROCESSOR(stab_faults);
@@ -271,10 +278,12 @@
vsid = get_vsid(mm->context, ea);
else
return 1;
+
+ hugepage = in_hugepage_area(mm->context, ea);
}
esid = GET_ESID(ea);
- __ste_allocate(esid, vsid, kernel_segment);
+ __ste_allocate(esid, vsid, kernel_segment, hugepage);
if (!cpu_has_slb()) {
/* Order update */
asm volatile("sync":::"memory");
@@ -301,7 +310,8 @@
for (esid = 0; esid < 16; esid++) {
unsigned long ea = esid << SID_SHIFT;
vsid = get_vsid(mm->context, ea);
- __ste_allocate(esid, vsid, 0);
+ __ste_allocate(esid, vsid, 0,
+ in_hugepage_area(mm->context, ea));
}
} else {
unsigned long pc = KSTK_EIP(tsk);
@@ -310,12 +320,17 @@
unsigned long stack_segment = stack & ~SID_MASK;
unsigned long vsid;
+ BUG_ON(in_hugepage_area(mm->context, pc));
+ BUG_ON(in_hugepage_area(mm->context, stack));
+ /* FIXME: Should we try to deal with the case where pc
+ * or stack is hugepage? */
+
if (pc) {
if (!IS_VALID_EA(pc) ||
(REGION_ID(pc) >= KERNEL_REGION_ID))
return;
vsid = get_vsid(mm->context, pc);
- __ste_allocate(GET_ESID(pc), vsid, 0);
+ __ste_allocate(GET_ESID(pc), vsid, 0, 0);
}
if (stack && (pc_segment != stack_segment)) {
@@ -323,7 +338,7 @@
(REGION_ID(stack) >= KERNEL_REGION_ID))
return;
vsid = get_vsid(mm->context, stack);
- __ste_allocate(GET_ESID(stack), vsid, 0);
+ __ste_allocate(GET_ESID(stack), vsid, 0, 0);
}
}
diff -urN /scratch/anton/export/arch/ppc64/mm/Makefile linux-gogogo/arch/ppc64/mm/Makefile
--- /scratch/anton/export/arch/ppc64/mm/Makefile 2003-02-13 00:02:23.000000000 +1100
+++ linux-gogogo/arch/ppc64/mm/Makefile 2003-06-03 15:44:09.000000000 +1000
@@ -6,3 +6,4 @@
obj-y := fault.o init.o extable.o imalloc.o
obj-$(CONFIG_DISCONTIGMEM) += numa.o
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff -urN /scratch/anton/export/arch/ppc64/mm/hugetlbpage.c linux-gogogo/arch/ppc64/mm/hugetlbpage.c
--- /scratch/anton/export/arch/ppc64/mm/hugetlbpage.c Thu Jan 01 10:00:00 1970
+++ linux-gogogo/arch/ppc64/mm/hugetlbpage.c Fri Jun 06 16:01:47 2003
@@ -0,0 +1,744 @@
+/*
+ * PPC64 (POWER4) Huge TLB Page Support for Kernel.
+ *
+ * Copyright (C) 2003 David Gibson, IBM Corporation.
+ *
+ * Based on the IA-32 version:
+ * Copyright (C) 2002, Rohit Seth <rohit.seth at intel.com>
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/sysctl.h>
+#include <asm/mman.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/machdep.h>
+
+#include <linux/sysctl.h>
+
+static long htlbpagemem;
+int htlbpage_max;
+static long htlbzone_pages;
+
+static LIST_HEAD(htlbpage_freelist);
+static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
+
+/* HugePTE layout:
+ *
+ * 31 30 ... 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+ * PFN>>12..... - - - - - - HASH_IX.... 2ND HASH RW - HG=1
+ */
+
+#define HUGEPTE_SHIFT 15
+#define _HUGEPAGE_PFN 0xffff8000
+#define _HUGEPAGE_BAD 0x00007f00
+#define _HUGEPAGE_HASHPTE 0x00000008
+#define _HUGEPAGE_SECONDARY 0x00000010
+#define _HUGEPAGE_GROUP_IX 0x000000e0
+#define _HUGEPAGE_HPTEFLAGS (_HUGEPAGE_HASHPTE | _HUGEPAGE_SECONDARY | \
+ _HUGEPAGE_GROUP_IX)
+#define _HUGEPAGE_RW 0x00000004
+
+typedef struct {unsigned int val;} hugepte_t;
+#define hugepte_val(hugepte) ((hugepte).val)
+#define __hugepte(x) ((hugepte_t) { (x) } )
+#define hugepte_pfn(x) \
+ ((unsigned long)(hugepte_val(x)>>HUGEPTE_SHIFT) << HUGETLB_PAGE_ORDER)
+#define mk_hugepte(page,wr) __hugepte( \
+ ((page_to_pfn(page)>>HUGETLB_PAGE_ORDER) << HUGEPTE_SHIFT ) \
+ | (!!(wr) * _HUGEPAGE_RW) | _PMD_HUGEPAGE )
+
+#define hugepte_bad(x) ( !(hugepte_val(x) & _PMD_HUGEPAGE) || \
+ (hugepte_val(x) & _HUGEPAGE_BAD) )
+#define hugepte_page(x) pfn_to_page(hugepte_pfn(x))
+#define hugepte_none(x) (!(hugepte_val(x) & _HUGEPAGE_PFN))
+
+
+static void free_huge_page(struct page *page);
+static void flush_hash_hugepage(mm_context_t context, unsigned long ea,
+ hugepte_t pte, int local);
+
+static inline unsigned int hugepte_update(hugepte_t *p, unsigned int clr,
+ unsigned int set)
+{
+ unsigned int old, tmp;
+
+ __asm__ __volatile__(
+ "1: lwarx %0,0,%3 # pte_update\n\
+ andc %1,%0,%4 \n\
+ or %1,%1,%5 \n\
+ stwcx. %1,0,%3 \n\
+ bne- 1b"
+ : "=&r" (old), "=&r" (tmp), "=m" (*p)
+ : "r" (p), "r" (clr), "r" (set), "m" (*p)
+ : "cc" );
+ return old;
+}
+
+static inline void set_hugepte(hugepte_t *ptep, hugepte_t pte)
+{
+ hugepte_update(ptep, ~_HUGEPAGE_HPTEFLAGS,
+ hugepte_val(pte) & ~_HUGEPAGE_HPTEFLAGS);
+}
+
+static struct page *alloc_hugetlb_page(void)
+{
+ int i;
+ struct page *page;
+
+ spin_lock(&htlbpage_lock);
+ if (list_empty(&htlbpage_freelist)) {
+ spin_unlock(&htlbpage_lock);
+ return NULL;
+ }
+
+ page = list_entry(htlbpage_freelist.next, struct page, list);
+ list_del(&page->list);
+ htlbpagemem--;
+ spin_unlock(&htlbpage_lock);
+ set_page_count(page, 1);
+ page->lru.prev = (void *)free_huge_page;
+ for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
+ clear_highpage(&page[i]);
+ return page;
+}
+
+static hugepte_t *hugepte_alloc(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgd;
+ pmd_t *pmd = NULL;
+
+ BUG_ON(!in_hugepage_area(mm->context, addr));
+
+ pgd = pgd_offset(mm, addr);
+ pmd = pmd_alloc(mm, pgd, addr);
+
+ /* We shouldn't find a (normal) PTE page pointer here */
+ BUG_ON(!pmd_none(*pmd) && !pmd_hugepage(*pmd));
+
+ return (hugepte_t *) pmd;
+}
+
+static hugepte_t *hugepte_offset(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgd;
+ hugepte_t *hugepte = NULL;
+
+ BUG_ON(!in_hugepage_area(mm->context, addr));
+
+ pgd = pgd_offset(mm, addr);
+ hugepte = (hugepte_t *)pmd_offset(pgd, addr);
+
+ BUG_ON(hugepte_bad(*hugepte));
+
+ return hugepte;
+}
+
+static void setup_huge_pte(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page *page, hugepte_t *ptep,
+ int write_access)
+{
+ hugepte_t entry;
+ int i;
+
+ mm->rss += (HPAGE_SIZE / PAGE_SIZE);
+ entry = mk_hugepte(page, write_access);
+ for (i = 0; i < HUGEPTE_BATCH_SIZE; i++)
+ set_hugepte(ptep+i, entry);
+}
+
+static void teardown_huge_pte(hugepte_t *ptep)
+{
+ int i;
+
+ for (i = 0; i < HUGEPTE_BATCH_SIZE; i++)
+ pmd_clear((pmd_t *)(ptep+i));
+}
+
+/*
+ * This function checks for proper alignment of input addr and len parameters.
+ */
+int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
+{
+ if (len & ~HPAGE_MASK)
+ return -EINVAL;
+ if (addr & ~HPAGE_MASK)
+ return -EINVAL;
+ if (! is_hugepage_only_range(addr, len))
+ return -EINVAL;
+ return 0;
+}
+
+int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
+ struct vm_area_struct *vma)
+{
+ hugepte_t *src_pte, *dst_pte, entry;
+ struct page *ptepage;
+ unsigned long addr = vma->vm_start;
+ unsigned long end = vma->vm_end;
+
+ while (addr < end) {
+ BUG_ON(! in_hugepage_area(src->context, addr));
+ BUG_ON(! in_hugepage_area(dst->context, addr));
+
+ dst_pte = hugepte_alloc(dst, addr);
+ if (!dst_pte)
+ return -ENOMEM;
+
+ src_pte = hugepte_offset(src, addr);
+ entry = *src_pte;
+
+ if ((addr % HPAGE_SIZE) == 0) {
+ /* First hugepte in the batch referring to
+ * this page */
+ ptepage = hugepte_page(entry);
+ get_page(ptepage);
+ dst->rss += (HPAGE_SIZE / PAGE_SIZE);
+ }
+ set_hugepte(dst_pte, entry);
+
+
+ addr += PMD_SIZE;
+ }
+ return 0;
+}
+
+int
+follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page **pages, struct vm_area_struct **vmas,
+ unsigned long *position, int *length, int i)
+{
+ unsigned long vpfn, vaddr = *position;
+ int remainder = *length;
+
+ WARN_ON(!is_vm_hugetlb_page(vma));
+
+ vpfn = vaddr/PAGE_SIZE;
+ while (vaddr < vma->vm_end && remainder) {
+ BUG_ON(!in_hugepage_area(mm->context, vaddr));
+
+ if (pages) {
+ hugepte_t *pte;
+ struct page *page;
+
+ pte = hugepte_offset(mm, vaddr);
+
+ /* hugetlb should be locked, and hence, prefaulted */
+ WARN_ON(!pte || hugepte_none(*pte));
+
+ page = &hugepte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+
+ WARN_ON(!PageCompound(page));
+
+ get_page(page);
+ pages[i] = page;
+ }
+
+ if (vmas)
+ vmas[i] = vma;
+
+ vaddr += PAGE_SIZE;
+ ++vpfn;
+ --remainder;
+ ++i;
+ }
+
+ *length = remainder;
+ *position = vaddr;
+
+ return i;
+}
+
+struct page *
+follow_huge_addr(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address, int write)
+{
+ return NULL;
+}
+
+struct vm_area_struct *hugepage_vma(struct mm_struct *mm, unsigned long addr)
+{
+ return NULL;
+}
+
+int pmd_huge(pmd_t pmd)
+{
+ return pmd_hugepage(pmd);
+}
+
+struct page *
+follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+ pmd_t *pmd, int write)
+{
+ struct page *page;
+
+ BUG_ON(! pmd_hugepage(*pmd));
+
+ page = hugepte_page(*(hugepte_t *)pmd);
+ if (page) {
+ page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
+ get_page(page);
+ }
+ return page;
+}
+
+static void free_huge_page(struct page *page)
+{
+ BUG_ON(page_count(page));
+ BUG_ON(page->mapping);
+
+ INIT_LIST_HEAD(&page->list);
+
+ spin_lock(&htlbpage_lock);
+ list_add(&page->list, &htlbpage_freelist);
+ htlbpagemem++;
+ spin_unlock(&htlbpage_lock);
+}
+
+void huge_page_release(struct page *page)
+{
+ if (!put_page_testzero(page))
+ return;
+
+ free_huge_page(page);
+}
+
+void unmap_hugepage_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr;
+ hugepte_t *ptep;
+ struct page *page;
+ int local = 0;
+
+ BUG_ON((start % HPAGE_SIZE) != 0);
+ BUG_ON((end % HPAGE_SIZE) != 0);
+ BUG_ON(!in_hugepage_area(mm->context, start));
+ BUG_ON(!in_hugepage_area(mm->context, end));
+
+ /* XXX are there races with checking cpu_vm_mask? - Anton */
+ if (vma->vm_mm->cpu_vm_mask == (1 << smp_processor_id()))
+ local = 1;
+
+ for (addr = start; addr < end; addr += HPAGE_SIZE) {
+ hugepte_t pte;
+
+ ptep = hugepte_offset(mm, addr);
+ if (!ptep || hugepte_none(*ptep))
+ continue;
+
+ pte = *ptep;
+ page = hugepte_page(pte);
+ teardown_huge_pte(ptep);
+
+ if (hugepte_val(pte) & _HUGEPAGE_HASHPTE)
+ flush_hash_hugepage(mm->context, addr,
+ pte, local);
+
+ huge_page_release(page);
+ }
+
+ mm->rss -= (end - start) >> PAGE_SHIFT;
+}
+
+void
+zap_hugepage_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long length)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ spin_lock(&mm->page_table_lock);
+ unmap_hugepage_range(vma, start, start + length);
+ spin_unlock(&mm->page_table_lock);
+}
+
+int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr;
+ int ret = 0;
+
+ BUG_ON(vma->vm_start & ~HPAGE_MASK);
+ BUG_ON(vma->vm_end & ~HPAGE_MASK);
+
+ spin_lock(&mm->page_table_lock);
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+ unsigned long idx;
+ hugepte_t *pte = hugepte_alloc(mm, addr);
+ struct page *page;
+
+ if (!pte) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ if (!hugepte_none(*pte))
+ continue;
+
+ idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+ page = find_get_page(mapping, idx);
+ if (!page) {
+ page = alloc_hugetlb_page();
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
+ unlock_page(page);
+ if (ret) {
+ free_huge_page(page);
+ goto out;
+ }
+ }
+ setup_huge_pte(mm, vma, page, pte, vma->vm_flags & VM_WRITE);
+ }
+out:
+ spin_unlock(&mm->page_table_lock);
+ return ret;
+}
+
+unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct vm_area_struct *vma;
+
+ if (len & ~HPAGE_MASK)
+ return -EINVAL;
+
+ if (!in_hugepage_area(current->mm->context, addr)
+ || (addr & (HPAGE_SIZE - 1)))
+ addr = TASK_HPAGE_BASE;
+
+ for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
+ /* At this point: (!vma || addr < vma->vm_end). */
+ if (addr + len > TASK_HPAGE_END)
+ return -ENOMEM;
+ if (!vma || (addr + len) <= vma->vm_start)
+ return addr;
+ addr = ALIGN(vma->vm_end, HPAGE_SIZE);
+
+ /* Because we're in an exclusively hugepage region,
+ * this alignment shouldn't have skipped over any
+ * other vmas */
+ }
+}
+
+static inline unsigned long computeHugeHptePP(unsigned int hugepte)
+{
+ unsigned long flags = 0x2;
+
+ if (! (hugepte & _HUGEPAGE_RW))
+ flags |= 0x1;
+ return flags;
+}
+
+int hash_huge_page(struct mm_struct *mm, unsigned long access,
+ unsigned long ea, unsigned long vsid, int local)
+{
+ hugepte_t *ptep;
+ unsigned long va, vpn;
+ int is_write;
+ hugepte_t old_pte, new_pte;
+ unsigned long hpteflags, prpn;
+ long slot;
+
+ /* Is this for us? */
+ if (!in_hugepage_area(mm->context, ea))
+ return -1;
+
+ /* We have to find the first hugepte in the batch, since
+ * that's the one that will store the HPTE flags */
+ ptep = hugepte_offset(mm, ea & ~(HPAGE_SIZE-1));
+
+ /* Search the Linux page table for a match with va */
+ va = (vsid << 28) | (ea & 0x0fffffff);
+ vpn = va >> HPAGE_SHIFT;
+
+ BUG_ON(hugepte_bad(*ptep));
+
+ /*
+ * If no pte found or not present, send the problem up to
+ * do_page_fault
+ */
+ if (unlikely(!ptep || hugepte_none(*ptep)))
+ return 1;
+
+ /*
+ * Check the user's access rights to the page. If access should be
+ * prevented then send the problem up to do_page_fault.
+ */
+ is_write = access & _PAGE_RW;
+ if (unlikely(is_write && !(hugepte_val(*ptep) & _HUGEPAGE_RW)))
+ return 1;
+
+ /*
+ * At this point, we have a pte (old_pte) which can be used to build
+ * or update an HPTE. There are 2 cases:
+ *
+ * 1. There is a valid (present) pte with no associated HPTE (this is
+ * the most common case)
+ * 2. There is a valid (present) pte with an associated HPTE. The
+ * current values of the pp bits in the HPTE prevent access
+ * because we are doing software DIRTY bit management and the
+ * page is currently not DIRTY.
+ */
+
+ old_pte = *ptep;
+ new_pte = old_pte;
+
+ hpteflags = computeHugeHptePP(hugepte_val(new_pte));
+
+ /* Check if pte already has an hpte (case 2) */
+ if (unlikely(hugepte_val(old_pte) & _HUGEPAGE_HASHPTE)) {
+ /* There MIGHT be an HPTE for this pte */
+ unsigned long hash, slot;
+
+ hash = hpt_hash(vpn, 1);
+ if (hugepte_val(old_pte) & _HUGEPAGE_SECONDARY)
+ hash = ~hash;
+ slot = (hash & htab_data.htab_hash_mask) * HPTES_PER_GROUP;
+ slot += (hugepte_val(old_pte) & _HUGEPAGE_GROUP_IX) >> 5;
+
+ if (ppc_md.hpte_updatepp(slot, hpteflags, va, 1, local) == -1)
+ hugepte_val(old_pte) &= ~_HUGEPAGE_HPTEFLAGS;
+ }
+
+ if (likely(!(hugepte_val(old_pte) & _HUGEPAGE_HASHPTE))) {
+ unsigned long hash = hpt_hash(vpn, 1);
+ unsigned long hpte_group;
+
+ prpn = hugepte_pfn(old_pte);
+
+repeat:
+ hpte_group = ((hash & htab_data.htab_hash_mask) *
+ HPTES_PER_GROUP) & ~0x7UL;
+
+ /* Update the linux pte with the HPTE slot */
+ hugepte_val(new_pte) &= ~_HUGEPAGE_HPTEFLAGS;
+ hugepte_val(new_pte) |= _HUGEPAGE_HASHPTE;
+
+ slot = ppc_md.hpte_insert(hpte_group, va, prpn, 0,
+ hpteflags, 0, 1);
+
+ /* Primary is full, try the secondary */
+ if (unlikely(slot == -1)) {
+ hugepte_val(new_pte) |= _HUGEPAGE_SECONDARY;
+ hpte_group = ((~hash & htab_data.htab_hash_mask) *
+ HPTES_PER_GROUP) & ~0x7UL;
+ slot = ppc_md.hpte_insert(hpte_group, va, prpn,
+ 1, hpteflags, 0, 1);
+ if (slot == -1) {
+ if (mftb() & 0x1)
+ hpte_group = ((hash & htab_data.htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+
+ ppc_md.hpte_remove(hpte_group);
+ goto repeat;
+ }
+ }
+
+ if (unlikely(slot == -2))
+ panic("hash_page: pte_insert failed\n");
+
+ hugepte_val(new_pte) |= (slot<<5) & _HUGEPAGE_GROUP_IX;
+
+ /*
+ * No need to use ldarx/stdcx here because all who
+ * might be updating the pte will hold the
+ * page_table_lock or the hash_table_lock
+ * (we hold both)
+ */
+ *ptep = new_pte;
+ }
+
+ return 0;
+}
+
+static void flush_hash_hugepage(mm_context_t context, unsigned long ea,
+ hugepte_t pte, int local)
+{
+ unsigned long vsid, vpn, va, hash, secondary, slot;
+
+ BUG_ON(hugepte_bad(pte));
+ BUG_ON(!in_hugepage_area(context, ea));
+
+ vsid = get_vsid(context, ea);
+
+ va = (vsid << 28) | (ea & 0x0fffffff);
+ vpn = va >> LARGE_PAGE_SHIFT;
+ hash = hpt_hash(vpn, 1);
+ secondary = !!(hugepte_val(pte) & _HUGEPAGE_SECONDARY);
+ if (secondary)
+ hash = ~hash;
+ slot = (hash & htab_data.htab_hash_mask) * HPTES_PER_GROUP;
+ slot += (hugepte_val(pte) & _HUGEPAGE_GROUP_IX) >> 5;
+
+ ppc_md.hpte_invalidate(slot, va, 1, local);
+}
+
+static void update_and_free_page(struct page *page)
+{
+ int j;
+ struct page *map;
+
+ map = page;
+ htlbzone_pages--;
+ for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
+ map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
+ 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
+ 1 << PG_private | 1<< PG_writeback);
+ set_page_count(map, 0);
+ map++;
+ }
+ set_page_count(page, 1);
+ __free_pages(page, HUGETLB_PAGE_ORDER);
+}
+
+static int try_to_free_low(int count)
+{
+ struct list_head *p;
+ struct page *page, *map;
+
+ map = NULL;
+ spin_lock(&htlbpage_lock);
+ list_for_each(p, &htlbpage_freelist) {
+ if (map) {
+ list_del(&map->list);
+ update_and_free_page(map);
+ htlbpagemem--;
+ map = NULL;
+ if (++count == 0)
+ break;
+ }
+ page = list_entry(p, struct page, list);
+ if (!PageHighMem(page))
+ map = page;
+ }
+ if (map) {
+ list_del(&map->list);
+ update_and_free_page(map);
+ htlbpagemem--;
+ count++;
+ }
+ spin_unlock(&htlbpage_lock);
+ return count;
+}
+
+int set_hugetlb_mem_size(int count)
+{
+ int lcount;
+ struct page *page;
+ extern long htlbzone_pages;
+ extern struct list_head htlbpage_freelist;
+
+ if (count < 0)
+ lcount = count;
+ else
+ lcount = count - htlbzone_pages;
+
+ if (lcount == 0)
+ return (int)htlbzone_pages;
+ if (lcount > 0) { /* Increase the mem size. */
+ while (lcount--) {
+ page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER);
+ if (page == NULL)
+ break;
+ spin_lock(&htlbpage_lock);
+ list_add(&page->list, &htlbpage_freelist);
+ htlbpagemem++;
+ htlbzone_pages++;
+ spin_unlock(&htlbpage_lock);
+ }
+ return (int) htlbzone_pages;
+ }
+ /* Shrink the memory size. */
+ lcount = try_to_free_low(lcount);
+ while (lcount++) {
+ page = alloc_hugetlb_page();
+ if (page == NULL)
+ break;
+ spin_lock(&htlbpage_lock);
+ update_and_free_page(page);
+ spin_unlock(&htlbpage_lock);
+ }
+ return (int) htlbzone_pages;
+}
+
+int hugetlb_sysctl_handler(ctl_table *table, int write,
+ struct file *file, void *buffer, size_t *length)
+{
+ proc_dointvec(table, write, file, buffer, length);
+ htlbpage_max = set_hugetlb_mem_size(htlbpage_max);
+ return 0;
+}
+
+static int __init hugetlb_setup(char *s)
+{
+ if (sscanf(s, "%d", &htlbpage_max) <= 0)
+ htlbpage_max = 0;
+ return 1;
+}
+__setup("hugepages=", hugetlb_setup);
+
+static int __init hugetlb_init(void)
+{
+ int i;
+ struct page *page;
+
+ for (i = 0; i < htlbpage_max; ++i) {
+ page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER);
+ if (!page)
+ break;
+ spin_lock(&htlbpage_lock);
+ list_add(&page->list, &htlbpage_freelist);
+ spin_unlock(&htlbpage_lock);
+ }
+ htlbpage_max = htlbpagemem = htlbzone_pages = i;
+ printk("Total HugeTLB memory allocated, %ld\n", htlbpagemem);
+ return 0;
+}
+module_init(hugetlb_init);
+
+int hugetlb_report_meminfo(char *buf)
+{
+ return sprintf(buf,
+ "HugePages_Total: %5lu\n"
+ "HugePages_Free: %5lu\n"
+ "Hugepagesize: %5lu kB\n",
+ htlbzone_pages,
+ htlbpagemem,
+ HPAGE_SIZE/1024);
+}
+
+int is_hugepage_mem_enough(size_t size)
+{
+ return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpagemem;
+}
+
+/*
+ * We cannot handle pagefaults against hugetlb pages at all. They cause
+ * handle_mm_fault() to try to instantiate regular-sized pages in the
+ * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get
+ * this far.
+ */
+static struct page *hugetlb_nopage(struct vm_area_struct *vma,
+ unsigned long address, int unused)
+{
+ BUG();
+ return NULL;
+}
+
+struct vm_operations_struct hugetlb_vm_ops = {
+ .nopage = hugetlb_nopage,
+};
diff -urN /scratch/anton/export/arch/ppc64/mm/init.c linux-gogogo/arch/ppc64/mm/init.c
--- /scratch/anton/export/arch/ppc64/mm/init.c 2003-06-04 11:16:24.000000000 +1000
+++ linux-gogogo/arch/ppc64/mm/init.c 2003-06-06 12:51:54.000000000 +1000
@@ -293,7 +293,7 @@
if (!pgd_none(*pgd)) {
pmd = pmd_offset(pgd, vmaddr);
- if (!pmd_none(*pmd)) {
+ if (pmd_present(*pmd)) {
ptep = pte_offset_kernel(pmd, vmaddr);
/* Check if HPTE might exist and flush it if so */
pte = __pte(pte_update(ptep, _PAGE_HPTEFLAGS, 0));
@@ -301,6 +301,7 @@
flush_hash_page(context, vmaddr, pte, local);
}
}
+ WARN_ON(pmd_hugepage(*pmd));
}
}
@@ -349,7 +350,7 @@
pmd_end = (start + PMD_SIZE) & PMD_MASK;
if (pmd_end > end)
pmd_end = end;
- if (!pmd_none(*pmd)) {
+ if (pmd_present(*pmd)) {
ptep = pte_offset_kernel(pmd, start);
do {
if (pte_val(*ptep) & _PAGE_HASHPTE) {
@@ -368,6 +369,7 @@
++ptep;
} while (start < pmd_end);
} else {
+ WARN_ON(pmd_hugepage(*pmd));
start = pmd_end;
}
++pmd;
diff -urN /scratch/anton/export/include/asm-ppc64/mmu_context.h linux-gogogo/include/asm-ppc64/mmu_context.h
--- /scratch/anton/export/include/asm-ppc64/mmu_context.h 2003-02-13 00:02:43.000000000 +1100
+++ linux-gogogo/include/asm-ppc64/mmu_context.h 2003-06-03 15:44:09.000000000 +1000
@@ -36,6 +36,12 @@
#define LAST_USER_CONTEXT 0x8000 /* Same as PID_MAX for now... */
#define NUM_USER_CONTEXT (LAST_USER_CONTEXT-FIRST_USER_CONTEXT)
+#ifdef CONFIG_HUGETLB_PAGE
+#define CONTEXT_32BIT (1UL<<63)
+#else
+#define CONTEXT_32BIT 0
+#endif
+
/* Choose whether we want to implement our context
* number allocator as a LIFO or FIFO queue.
*/
@@ -90,6 +96,8 @@
head = mmu_context_queue.head;
mm->context = mmu_context_queue.elements[head];
+ if (tsk->thread_info->flags & _TIF_32BIT)
+ mm->context |= CONTEXT_32BIT;
head = (head < LAST_USER_CONTEXT-1) ? head+1 : 0;
mmu_context_queue.head = head;
@@ -189,6 +197,8 @@
{
unsigned long ordinal, vsid;
+ context &= ~CONTEXT_32BIT;
+
ordinal = (((ea >> 28) & 0x1fffff) * LAST_USER_CONTEXT) | context;
vsid = (ordinal * VSID_RANDOMIZER) & VSID_MASK;
diff -urN /scratch/anton/export/include/asm-ppc64/page.h linux-gogogo/include/asm-ppc64/page.h
--- /scratch/anton/export/include/asm-ppc64/page.h 2003-04-24 18:54:37.000000000 +1000
+++ linux-gogogo/include/asm-ppc64/page.h 2003-06-03 15:44:09.000000000 +1000
@@ -22,6 +22,40 @@
#define PAGE_MASK (~(PAGE_SIZE-1))
#define PAGE_OFFSET_MASK (PAGE_SIZE-1)
+#ifdef CONFIG_HUGETLB_PAGE
+
+#define HPAGE_SHIFT 24
+#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)
+#define HPAGE_MASK (~(HPAGE_SIZE - 1))
+#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
+
+/* For 64-bit processes the hugepage range is 1T-1.5T */
+#define TASK_HPAGE_BASE_64 (0x0000010000000000UL)
+#define TASK_HPAGE_END_64 (0x0000018000000000UL)
+/* For 32-bit processes the hugepage range is 2-3G */
+#define TASK_HPAGE_BASE_32 (0x80000000UL)
+#define TASK_HPAGE_END_32 (0xc0000000UL)
+
+#define TASK_HPAGE_BASE (test_thread_flag(TIF_32BIT) ? \
+ TASK_HPAGE_BASE_32 : TASK_HPAGE_BASE_64)
+#define TASK_HPAGE_END (test_thread_flag(TIF_32BIT) ? \
+ TASK_HPAGE_END_32 : TASK_HPAGE_END_64)
+
+#define ARCH_HAS_HUGEPAGE_ONLY_RANGE
+#define is_hugepage_only_range(addr, len) \
+ ((addr > (TASK_HPAGE_BASE-len)) && (addr < TASK_HPAGE_END))
+#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+
+#define in_hugepage_area(context, addr) (((context) & CONTEXT_32BIT) ? \
+ (((addr) >= TASK_HPAGE_BASE_32) && ((addr) < TASK_HPAGE_END_32)) : \
+ (((addr) >= TASK_HPAGE_BASE_64) && ((addr) < TASK_HPAGE_END_64)))
+
+#else /* !CONFIG_HUGETLB_PAGE */
+
+#define in_hugepage_area(mm, addr) 0
+
+#endif /* !CONFIG_HUGETLB_PAGE */
+
#define SID_SHIFT 28
#define SID_MASK 0xfffffffff
#define GET_ESID(x) (((x) >> SID_SHIFT) & SID_MASK)
diff -urN /scratch/anton/export/include/asm-ppc64/pgtable.h linux-gogogo/include/asm-ppc64/pgtable.h
--- /scratch/anton/export/include/asm-ppc64/pgtable.h 2003-05-30 01:22:36.000000000 +1000
+++ linux-gogogo/include/asm-ppc64/pgtable.h 2003-06-06 12:52:23.000000000 +1000
@@ -149,6 +149,22 @@
/* shift to put page number into pte */
#define PTE_SHIFT (16)
+/* We allow 2^41 bytes of real memory, so we need 29 bits in the PMD
+ * to give the PTE page number. The bottom two bits are for flags. */
+#define PMD_TO_PTEPAGE_SHIFT (2)
+#ifdef CONFIG_HUGETLB_PAGE
+#define _PMD_HUGEPAGE 0x00000001U
+#define HUGEPTE_BATCH_SIZE (1<<(HPAGE_SHIFT-PMD_SHIFT))
+
+int hash_huge_page(struct mm_struct *mm, unsigned long access,
+ unsigned long ea, unsigned long vsid, int local);
+#else
+
+#define hash_huge_page(mm,a,ea,vsid,local) -1
+#define _PMD_HUGEPAGE 0
+
+#endif
+
#ifndef __ASSEMBLY__
/*
@@ -178,12 +194,16 @@
#define pte_pfn(x) ((unsigned long)((pte_val(x) >> PTE_SHIFT)))
#define pte_page(x) pfn_to_page(pte_pfn(x))
-#define pmd_set(pmdp, ptep) (pmd_val(*(pmdp)) = (__ba_to_bpn(ptep)))
+#define pmd_set(pmdp, ptep) \
+ (pmd_val(*(pmdp)) = (__ba_to_bpn(ptep) << PMD_TO_PTEPAGE_SHIFT))
#define pmd_none(pmd) (!pmd_val(pmd))
-#define pmd_bad(pmd) ((pmd_val(pmd)) == 0)
-#define pmd_present(pmd) ((pmd_val(pmd)) != 0)
+#define pmd_hugepage(pmd) (!!(pmd_val(pmd) & _PMD_HUGEPAGE))
+#define pmd_bad(pmd) (((pmd_val(pmd)) == 0) || pmd_hugepage(pmd))
+#define pmd_present(pmd) ((!pmd_hugepage(pmd)) \
+ && (pmd_val(pmd) & ~_PMD_HUGEPAGE) != 0)
#define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0)
-#define pmd_page_kernel(pmd) (__bpn_to_ba(pmd_val(pmd)))
+#define pmd_page_kernel(pmd) \
+ (__bpn_to_ba(pmd_val(pmd) >> PMD_TO_PTEPAGE_SHIFT))
#define pmd_page(pmd) virt_to_page(pmd_page_kernel(pmd))
#define pgd_set(pgdp, pmdp) (pgd_val(*(pgdp)) = (__ba_to_bpn(pmdp)))
#define pgd_none(pgd) (!pgd_val(pgd))