Improved ppc64 hugepage patch
David Gibson
david at gibson.dropbear.id.au
Fri Jul 4 15:02:49 EST 2003
The patch below is an improved version of the hugepage support for
ppc64. As well as fixing a couple of bugs, it removes some
restrictions on hugepage mappings:
- Hugepages are now executable (since huge pages are never COWed,
there aren't actually any cache flushing issues)
- The "low" hugepage region (2-3G for 32-bit processes) is now more
flexible. When a process is started the region is not reserved for
hugepages and can be used freely for normal mappings. The region is
only activated when the process attempts a hugepage mapping. If there
are normal mappings in the area the hugepage mapping will fail
(-EBUSY). Once the region is activated, normal mappings cannot be
made there for the life of the process.
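
For concreteness, here is a rough user-space sketch of how a 32-bit
process ends up activating the low region. It is illustration only,
not part of the patch; it assumes hugetlbfs is mounted at /mnt/huge
(the mount point and file name are invented):

/* Illustration only -- NOT part of the patch.  Sketch of a 32-bit
 * process mapping one hugepage through an assumed hugetlbfs mount. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

#define HPAGE_SIZE	(16UL * 1024 * 1024)	/* 16M hugepages on POWER4 */

int main(void)
{
	char *p;
	int fd;

	fd = open("/mnt/huge/example", O_CREAT | O_RDWR, 0600);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* The first hugepage mmap() by a 32-bit process is what opens
	 * the 2-3G window; it fails with EBUSY if normal mappings are
	 * already sitting in that range. */
	p = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	p[0] = 1;		/* the page was prefaulted at mmap() time */

	munmap(p, HPAGE_SIZE);
	close(fd);
	unlink("/mnt/huge/example");
	return 0;
}
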
Known problems:
- Semantics of when the low range is activated are a little strange:
it is activated only when a 32-bit process makes a hugepage mapping
*without* MAP_FIXED. A hugepage MAP_FIXED mapping will
counter-intuitively fail if the region has not already been
activated, even if there are no non-huge mappings in the region.
This should be fixed, although I can see no obvious applications for
hugepage MAP_FIXED. The snippet below illustrates the quirk.
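
Again illustration only (same invented hugetlbfs mount; fd is an open
file on it), showing the ordering that currently matters for a 32-bit
process:

/* Illustration only -- NOT part of the patch. */
#include <sys/mman.h>

#define HPAGE_SIZE	(16UL * 1024 * 1024)
#define LOW_HPAGE_BASE	0x80000000UL		/* TASK_HPAGE_BASE_32 */

void demo(int fd)
{
	void *p, *q;

	/* Done first, this fails: the low region has not been
	 * activated yet, even though nothing else is mapped there. */
	p = mmap((void *)LOW_HPAGE_BASE, HPAGE_SIZE,
		 PROT_READ | PROT_WRITE, MAP_SHARED | MAP_FIXED, fd, 0);

	/* A non-MAP_FIXED hugepage mmap() activates the region... */
	q = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	/* ...after which a MAP_FIXED mapping in the region would work. */
	(void)p;
	(void)q;
}
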
The patch is against 2.5.74 (Linus' BK tree).
diff -urN linux-2.5/arch/ppc64/Kconfig linux-gogogo/arch/ppc64/Kconfig
--- linux-2.5/arch/ppc64/Kconfig 2003-07-04 14:42:34.000000000 +1000
+++ linux-gogogo/arch/ppc64/Kconfig 2003-07-04 14:45:12.000000000 +1000
@@ -69,6 +69,17 @@
bool
default y
+config HUGETLB_PAGE
+ bool "Huge TLB Page Support"
+ help
+ This enables support for huge pages. User space applications
+ can make use of this support with the sys_alloc_hugepages and
+ sys_free_hugepages system calls. If your applications are
+ huge page aware and your processor supports this (only POWER4),
+ then say Y here.
+
+ Otherwise, say N.
+
config SMP
bool "Symmetric multi-processing support"
---help---
diff -urN linux-2.5/arch/ppc64/kernel/htab.c linux-gogogo/arch/ppc64/kernel/htab.c
--- linux-2.5/arch/ppc64/kernel/htab.c 2003-07-04 14:42:34.000000000 +1000
+++ linux-gogogo/arch/ppc64/kernel/htab.c 2003-07-04 14:45:07.000000000 +1000
@@ -197,7 +197,7 @@
if (!pgd_none(*pg)) {
pm = pmd_offset(pg, ea);
- if (!pmd_none(*pm)) {
+ if (pmd_present(*pm)) {
pt = pte_offset_kernel(pm, ea);
pte = *pt;
if (!pte_present(pte))
@@ -434,8 +434,12 @@
if (user_region && (mm->cpu_vm_mask == (1 << smp_processor_id())))
local = 1;
- ptep = find_linux_pte(pgdir, ea);
- ret = __hash_page(ea, access, vsid, ptep, trap, local);
+ ret = hash_huge_page(mm, access, ea, vsid, local);
+ if (ret < 0) {
+ ptep = find_linux_pte(pgdir, ea);
+ ret = __hash_page(ea, access, vsid, ptep, trap, local);
+ }
+
spin_unlock(&mm->page_table_lock);
return ret;
diff -urN linux-2.5/arch/ppc64/kernel/stab.c linux-gogogo/arch/ppc64/kernel/stab.c
--- linux-2.5/arch/ppc64/kernel/stab.c 2003-07-04 14:42:34.000000000 +1000
+++ linux-gogogo/arch/ppc64/kernel/stab.c 2003-07-04 14:44:40.000000000 +1000
@@ -221,15 +221,18 @@
}
static inline void __ste_allocate(unsigned long esid, unsigned long vsid,
- int kernel_segment)
+ int kernel_segment, mm_context_t context)
{
if (cur_cpu_spec->cpu_features & CPU_FTR_SLB) {
+ int large = 0;
+
#ifndef CONFIG_PPC_ISERIES
if (REGION_ID(esid << SID_SHIFT) == KERNEL_REGION_ID)
- make_slbe(esid, vsid, 1, kernel_segment);
- else
+ large = 1;
+ else if (REGION_ID(esid << SID_SHIFT) == USER_REGION_ID)
+ large = in_hugepage_area(context, esid << SID_SHIFT);
#endif
- make_slbe(esid, vsid, 0, kernel_segment);
+ make_slbe(esid, vsid, large, kernel_segment);
} else {
unsigned char top_entry, stab_entry, *segments;
@@ -255,6 +258,7 @@
{
unsigned long vsid, esid;
int kernel_segment = 0;
+ mm_context_t context;
PMC_SW_PROCESSOR(stab_faults);
@@ -266,16 +270,18 @@
if (REGION_ID(ea) >= KERNEL_REGION_ID) {
kernel_segment = 1;
vsid = get_kernel_vsid(ea);
+ context = REGION_ID(ea);
} else {
- struct mm_struct *mm = current->mm;
- if (mm)
- vsid = get_vsid(mm->context, ea);
- else
+ if (! current->mm)
return 1;
+
+ context = current->mm->context;
+
+ vsid = get_vsid(context, ea);
}
esid = GET_ESID(ea);
- __ste_allocate(esid, vsid, kernel_segment);
+ __ste_allocate(esid, vsid, kernel_segment, context);
if (!(cur_cpu_spec->cpu_features & CPU_FTR_SLB)) {
/* Order update */
asm volatile("sync":::"memory");
@@ -302,7 +308,7 @@
for (esid = 0; esid < 16; esid++) {
unsigned long ea = esid << SID_SHIFT;
vsid = get_vsid(mm->context, ea);
- __ste_allocate(esid, vsid, 0);
+ __ste_allocate(esid, vsid, 0, mm->context);
}
} else {
unsigned long pc = KSTK_EIP(tsk);
@@ -316,7 +322,7 @@
(REGION_ID(pc) >= KERNEL_REGION_ID))
return;
vsid = get_vsid(mm->context, pc);
- __ste_allocate(GET_ESID(pc), vsid, 0);
+ __ste_allocate(GET_ESID(pc), vsid, 0, mm->context);
}
if (stack && (pc_segment != stack_segment)) {
@@ -324,7 +330,7 @@
(REGION_ID(stack) >= KERNEL_REGION_ID))
return;
vsid = get_vsid(mm->context, stack);
- __ste_allocate(GET_ESID(stack), vsid, 0);
+ __ste_allocate(GET_ESID(stack), vsid, 0, mm->context);
}
}
diff -urN linux-2.5/arch/ppc64/mm/Makefile linux-gogogo/arch/ppc64/mm/Makefile
--- linux-2.5/arch/ppc64/mm/Makefile 2003-07-04 14:42:34.000000000 +1000
+++ linux-gogogo/arch/ppc64/mm/Makefile 2003-07-04 14:45:02.000000000 +1000
@@ -6,3 +6,4 @@
obj-y := fault.o init.o extable.o imalloc.o
obj-$(CONFIG_DISCONTIGMEM) += numa.o
+obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff -urN linux-2.5/arch/ppc64/mm/hugetlbpage.c linux-gogogo/arch/ppc64/mm/hugetlbpage.c
--- linux-2.5/arch/ppc64/mm/hugetlbpage.c Thu Jan 01 10:00:00 1970
+++ linux-gogogo/arch/ppc64/mm/hugetlbpage.c Fri Jul 04 14:00:11 2003
@@ -0,0 +1,866 @@
+/*
+ * PPC64 (POWER4) Huge TLB Page Support for Kernel.
+ *
+ * Copyright (C) 2003 David Gibson, IBM Corporation.
+ *
+ * Based on the IA-32 version:
+ * Copyright (C) 2002, Rohit Seth <rohit.seth at intel.com>
+ */
+
+#include <linux/config.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/hugetlb.h>
+#include <linux/pagemap.h>
+#include <linux/smp_lock.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/sysctl.h>
+#include <asm/mman.h>
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+#include <asm/tlbflush.h>
+#include <asm/mmu_context.h>
+#include <asm/machdep.h>
+#include <asm/cputable.h>
+#include <asm/tlb.h>
+#include <asm/rmap.h>
+
+#include <linux/sysctl.h>
+
+int htlbpage_max;
+
+/* This lock protects the two counters and list below */
+static spinlock_t htlbpage_lock = SPIN_LOCK_UNLOCKED;
+
+static int htlbpage_free; /* = 0 */
+static int htlbpage_total; /* = 0 */
+static LIST_HEAD(htlbpage_freelist);
+
+/* HugePTE layout:
+ *
+ * 31 30 ... 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0
+ * PFN>>12..... - - - - - - HASH_IX.... 2ND HASH RW - HG=1
+ */
+
+#define HUGEPTE_SHIFT 15
+#define _HUGEPAGE_PFN 0xffff8000
+#define _HUGEPAGE_BAD 0x00007f00
+#define _HUGEPAGE_HASHPTE 0x00000008
+#define _HUGEPAGE_SECONDARY 0x00000010
+#define _HUGEPAGE_GROUP_IX 0x000000e0
+#define _HUGEPAGE_HPTEFLAGS (_HUGEPAGE_HASHPTE | _HUGEPAGE_SECONDARY | \
+ _HUGEPAGE_GROUP_IX)
+#define _HUGEPAGE_RW 0x00000004
+
+typedef struct {unsigned int val;} hugepte_t;
+#define hugepte_val(hugepte) ((hugepte).val)
+#define __hugepte(x) ((hugepte_t) { (x) } )
+#define hugepte_pfn(x) \
+ ((unsigned long)(hugepte_val(x)>>HUGEPTE_SHIFT) << HUGETLB_PAGE_ORDER)
+#define mk_hugepte(page,wr) __hugepte( \
+ ((page_to_pfn(page)>>HUGETLB_PAGE_ORDER) << HUGEPTE_SHIFT ) \
+ | (!!(wr) * _HUGEPAGE_RW) | _PMD_HUGEPAGE )
+
+#define hugepte_bad(x) ( !(hugepte_val(x) & _PMD_HUGEPAGE) || \
+ (hugepte_val(x) & _HUGEPAGE_BAD) )
+#define hugepte_page(x) pfn_to_page(hugepte_pfn(x))
+#define hugepte_none(x) (!(hugepte_val(x) & _HUGEPAGE_PFN))
+
+
+static void free_huge_page(struct page *page);
+static void flush_hash_hugepage(mm_context_t context, unsigned long ea,
+ hugepte_t pte, int local);
+
+static inline unsigned int hugepte_update(hugepte_t *p, unsigned int clr,
+ unsigned int set)
+{
+ unsigned int old, tmp;
+
+ __asm__ __volatile__(
+ "1: lwarx %0,0,%3 # pte_update\n\
+ andc %1,%0,%4 \n\
+ or %1,%1,%5 \n\
+ stwcx. %1,0,%3 \n\
+ bne- 1b"
+ : "=&r" (old), "=&r" (tmp), "=m" (*p)
+ : "r" (p), "r" (clr), "r" (set), "m" (*p)
+ : "cc" );
+ return old;
+}
+
+static inline void set_hugepte(hugepte_t *ptep, hugepte_t pte)
+{
+ hugepte_update(ptep, ~_HUGEPAGE_HPTEFLAGS,
+ hugepte_val(pte) & ~_HUGEPAGE_HPTEFLAGS);
+}
+
+static struct page *alloc_hugetlb_page(void)
+{
+ int i;
+ struct page *page;
+
+ spin_lock(&htlbpage_lock);
+ if (list_empty(&htlbpage_freelist)) {
+ spin_unlock(&htlbpage_lock);
+ return NULL;
+ }
+
+ page = list_entry(htlbpage_freelist.next, struct page, list);
+ list_del(&page->list);
+ htlbpage_free--;
+ spin_unlock(&htlbpage_lock);
+ set_page_count(page, 1);
+ page->lru.prev = (void *)free_huge_page;
+ for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
+ clear_highpage(&page[i]);
+ return page;
+}
+
+static hugepte_t *hugepte_alloc(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgd;
+ pmd_t *pmd = NULL;
+
+ BUG_ON(!in_hugepage_area(mm->context, addr));
+
+ pgd = pgd_offset(mm, addr);
+ pmd = pmd_alloc(mm, pgd, addr);
+
+ /* We shouldn't find a (normal) PTE page pointer here */
+ BUG_ON(!pmd_none(*pmd) && !pmd_hugepage(*pmd));
+
+ return (hugepte_t *)pmd;
+}
+
+static hugepte_t *hugepte_offset(struct mm_struct *mm, unsigned long addr)
+{
+ pgd_t *pgd;
+ pmd_t *pmd = NULL;
+
+ BUG_ON(!in_hugepage_area(mm->context, addr));
+
+ pgd = pgd_offset(mm, addr);
+ pmd = pmd_offset(pgd, addr);
+
+ /* We shouldn't find a (normal) PTE page pointer here */
+ BUG_ON(!pmd_none(*pmd) && !pmd_hugepage(*pmd));
+
+ return (hugepte_t *)pmd;
+}
+
+static void setup_huge_pte(struct mm_struct *mm, struct page *page,
+ hugepte_t *ptep, int write_access)
+{
+ hugepte_t entry;
+ int i;
+
+ mm->rss += (HPAGE_SIZE / PAGE_SIZE);
+ entry = mk_hugepte(page, write_access);
+ for (i = 0; i < HUGEPTE_BATCH_SIZE; i++)
+ set_hugepte(ptep+i, entry);
+}
+
+static void teardown_huge_pte(hugepte_t *ptep)
+{
+ int i;
+
+ for (i = 0; i < HUGEPTE_BATCH_SIZE; i++)
+ pmd_clear((pmd_t *)(ptep+i));
+}
+
+/*
+ * This function checks for proper alignment of input addr and len parameters.
+ */
+int is_aligned_hugepage_range(unsigned long addr, unsigned long len)
+{
+ if (len & ~HPAGE_MASK)
+ return -EINVAL;
+ if (addr & ~HPAGE_MASK)
+ return -EINVAL;
+ if (! is_hugepage_only_range(addr, len))
+ return -EINVAL;
+ return 0;
+}
+
+static void do_slbia(void *unused)
+{
+ asm volatile ("isync; slbia; isync":::"memory");
+}
+
+/* Activate the low hpage region for 32-bit processes. mmap_sem must
+ * be held. */
+static int open_32bit_htlbpage_range(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ unsigned long addr;
+
+ if (mm->context & CONTEXT_LOW_HPAGES)
+ return 0; /* The window is already open */
+
+ /* Check no VMAs are in the region */
+ vma = find_vma(mm, TASK_HPAGE_BASE_32);
+
+ if (vma && (vma->vm_start < TASK_HPAGE_END_32))
+ return -EBUSY;
+
+ /* Clean up any leftover PTE pages in the region */
+ spin_lock(&mm->page_table_lock);
+ for (addr = TASK_HPAGE_BASE_32; addr < TASK_HPAGE_END_32;
+ addr += PMD_SIZE) {
+ pgd_t *pgd = pgd_offset(mm, addr);
+ pmd_t *pmd = pmd_offset(pgd, addr);
+
+ if (! pmd_none(*pmd)) {
+ struct page *page = pmd_page(*pmd);
+ pte_t *pte = (pte_t *)pmd_page_kernel(*pmd);
+ int i;
+
+ /* No VMAs, so there should be no PTEs, check
+ * just in case. */
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ BUG_ON(! pte_none(*pte));
+ pte++;
+ }
+
+ pmd_clear(pmd);
+ pgtable_remove_rmap(page);
+ pte_free(page);
+ }
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ /* FIXME: do we need to scan for PTEs too? */
+
+ mm->context |= CONTEXT_LOW_HPAGES;
+
+ /* the context change must make it to memory before the slbia,
+ * so that further SLB misses do the right thing. */
+ mb();
+
+ on_each_cpu(do_slbia, NULL, 0, 1);
+
+ return 0;
+}
+
+int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
+ struct vm_area_struct *vma)
+{
+ hugepte_t *src_pte, *dst_pte, entry;
+ struct page *ptepage;
+ unsigned long addr = vma->vm_start;
+ unsigned long end = vma->vm_end;
+
+ while (addr < end) {
+ BUG_ON(! in_hugepage_area(src->context, addr));
+ BUG_ON(! in_hugepage_area(dst->context, addr));
+
+ dst_pte = hugepte_alloc(dst, addr);
+ if (!dst_pte)
+ return -ENOMEM;
+
+ src_pte = hugepte_offset(src, addr);
+ entry = *src_pte;
+
+ if ((addr % HPAGE_SIZE) == 0) {
+ /* This is the first hugepte in a batch */
+ ptepage = hugepte_page(entry);
+ get_page(ptepage);
+ dst->rss += (HPAGE_SIZE / PAGE_SIZE);
+ }
+ set_hugepte(dst_pte, entry);
+
+
+ addr += PMD_SIZE;
+ }
+ return 0;
+}
+
+int
+follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ struct page **pages, struct vm_area_struct **vmas,
+ unsigned long *position, int *length, int i)
+{
+ unsigned long vpfn, vaddr = *position;
+ int remainder = *length;
+
+ WARN_ON(!is_vm_hugetlb_page(vma));
+
+ vpfn = vaddr/PAGE_SIZE;
+ while (vaddr < vma->vm_end && remainder) {
+ BUG_ON(!in_hugepage_area(mm->context, vaddr));
+
+ if (pages) {
+ hugepte_t *pte;
+ struct page *page;
+
+ pte = hugepte_offset(mm, vaddr);
+
+ /* hugetlb should be locked, and hence, prefaulted */
+ WARN_ON(!pte || hugepte_none(*pte));
+
+ page = &hugepte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
+
+ WARN_ON(!PageCompound(page));
+
+ get_page(page);
+ pages[i] = page;
+ }
+
+ if (vmas)
+ vmas[i] = vma;
+
+ vaddr += PAGE_SIZE;
+ ++vpfn;
+ --remainder;
+ ++i;
+ }
+
+ *length = remainder;
+ *position = vaddr;
+
+ return i;
+}
+
+struct page *
+follow_huge_addr(struct mm_struct *mm,
+ struct vm_area_struct *vma, unsigned long address, int write)
+{
+ return NULL;
+}
+
+struct vm_area_struct *hugepage_vma(struct mm_struct *mm, unsigned long addr)
+{
+ return NULL;
+}
+
+int pmd_huge(pmd_t pmd)
+{
+ return pmd_hugepage(pmd);
+}
+
+struct page *
+follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+ pmd_t *pmd, int write)
+{
+ struct page *page;
+
+ BUG_ON(! pmd_hugepage(*pmd));
+
+ page = hugepte_page(*(hugepte_t *)pmd);
+ if (page) {
+ page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT);
+ get_page(page);
+ }
+ return page;
+}
+
+static void free_huge_page(struct page *page)
+{
+ BUG_ON(page_count(page));
+ BUG_ON(page->mapping);
+
+ INIT_LIST_HEAD(&page->list);
+
+ spin_lock(&htlbpage_lock);
+ list_add(&page->list, &htlbpage_freelist);
+ htlbpage_free++;
+ spin_unlock(&htlbpage_lock);
+}
+
+void huge_page_release(struct page *page)
+{
+ if (!put_page_testzero(page))
+ return;
+
+ free_huge_page(page);
+}
+
+void unmap_hugepage_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr;
+ hugepte_t *ptep;
+ struct page *page;
+ int local = 0;
+
+ WARN_ON(!is_vm_hugetlb_page(vma));
+ BUG_ON((start % HPAGE_SIZE) != 0);
+ BUG_ON((end % HPAGE_SIZE) != 0);
+
+ /* XXX are there races with checking cpu_vm_mask? - Anton */
+ if (vma->vm_mm->cpu_vm_mask == (1 << smp_processor_id()))
+ local = 1;
+
+ for (addr = start; addr < end; addr += HPAGE_SIZE) {
+ hugepte_t pte;
+
+ BUG_ON(!in_hugepage_area(mm->context, addr));
+
+ ptep = hugepte_offset(mm, addr);
+ if (!ptep || hugepte_none(*ptep))
+ continue;
+
+ pte = *ptep;
+ page = hugepte_page(pte);
+ teardown_huge_pte(ptep);
+
+ if (hugepte_val(pte) & _HUGEPAGE_HASHPTE)
+ flush_hash_hugepage(mm->context, addr,
+ pte, local);
+
+ huge_page_release(page);
+ }
+
+ mm->rss -= (end - start) >> PAGE_SHIFT;
+}
+
+void zap_hugepage_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long length)
+{
+ struct mm_struct *mm = vma->vm_mm;
+
+ spin_lock(&mm->page_table_lock);
+ unmap_hugepage_range(vma, start, start + length);
+ spin_unlock(&mm->page_table_lock);
+}
+
+int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
+{
+ struct mm_struct *mm = current->mm;
+ unsigned long addr;
+ int ret = 0;
+
+ WARN_ON(!is_vm_hugetlb_page(vma));
+ BUG_ON((vma->vm_start % HPAGE_SIZE) != 0);
+ BUG_ON((vma->vm_end % HPAGE_SIZE) != 0);
+
+ spin_lock(&mm->page_table_lock);
+ for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
+ unsigned long idx;
+ hugepte_t *pte = hugepte_alloc(mm, addr);
+ struct page *page;
+
+ BUG_ON(!in_hugepage_area(mm->context, addr));
+
+ if (!pte) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ if (!hugepte_none(*pte))
+ continue;
+
+ idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+ page = find_get_page(mapping, idx);
+ if (!page) {
+ page = alloc_hugetlb_page();
+ if (!page) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
+ unlock_page(page);
+ if (ret) {
+ free_huge_page(page);
+ goto out;
+ }
+ }
+ setup_huge_pte(mm, page, pte, vma->vm_flags & VM_WRITE);
+ }
+out:
+ spin_unlock(&mm->page_table_lock);
+ return ret;
+}
+
+/* Because we have an exclusive hugepage region which lies within the
+ * normal user address space, we have to take special measures to make
+ * non-huge mmap()s evade the hugepage reserved region. */
+unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long start_addr;
+
+ if (len > TASK_SIZE)
+ return -ENOMEM;
+
+ if (addr) {
+ addr = PAGE_ALIGN(addr);
+ vma = find_vma(mm, addr);
+ if (TASK_SIZE - len >= addr &&
+ (!vma || addr + len <= vma->vm_start) &&
+ !is_hugepage_only_range(addr,len))
+ return addr;
+ }
+ start_addr = addr = mm->free_area_cache;
+
+full_search:
+ for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+ /* At this point: (!vma || addr < vma->vm_end). */
+ if (TASK_SIZE - len < addr) {
+ /*
+ * Start a new search - just in case we missed
+ * some holes.
+ */
+ if (start_addr != TASK_UNMAPPED_BASE) {
+ start_addr = addr = TASK_UNMAPPED_BASE;
+ goto full_search;
+ }
+ return -ENOMEM;
+ }
+ if (!vma || addr + len <= vma->vm_start) {
+ if (is_hugepage_only_range(addr, len)) {
+ if (addr < TASK_HPAGE_END_32)
+ addr = TASK_HPAGE_END_32;
+ else
+ addr = TASK_HPAGE_END;
+
+ continue;
+ }
+ /*
+ * Remember the place where we stopped the search:
+ */
+ mm->free_area_cache = addr + len;
+ return addr;
+ }
+ addr = vma->vm_end;
+ }
+}
+
+
+unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
+ struct vm_area_struct *vma;
+ unsigned long base, end;
+
+ if (len & ~HPAGE_MASK)
+ return -EINVAL;
+
+ if (!(cur_cpu_spec->cpu_features & CPU_FTR_16M_PAGE))
+ return -EINVAL;
+
+ if (test_thread_flag(TIF_32BIT)) {
+ int err;
+
+ err = open_32bit_htlbpage_range(current->mm);
+ if (err)
+ return err; /* Should this just be EINVAL? */
+
+ base = TASK_HPAGE_BASE_32;
+ end = TASK_HPAGE_END_32;
+ } else {
+ base = TASK_HPAGE_BASE;
+ end = TASK_HPAGE_END;
+ }
+
+ if (!in_hugepage_area(current->mm->context, addr)
+ || (addr & (HPAGE_SIZE - 1)))
+ addr = base;
+
+ for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
+ /* At this point: (!vma || addr < vma->vm_end). */
+ if (addr + len > end)
+ return -ENOMEM;
+ if (!vma || (addr + len) <= vma->vm_start)
+ return addr;
+ addr = ALIGN(vma->vm_end, HPAGE_SIZE);
+
+ /* Because we're in an exclusively hugepage region,
+ * this alignment shouldn't have skipped over any
+ * other vmas */
+ }
+}
+
+static inline unsigned long computeHugeHptePP(unsigned int hugepte)
+{
+ unsigned long flags = 0x2;
+
+ if (! (hugepte & _HUGEPAGE_RW))
+ flags |= 0x1;
+ return flags;
+}
+
+int hash_huge_page(struct mm_struct *mm, unsigned long access,
+ unsigned long ea, unsigned long vsid, int local)
+{
+ hugepte_t *ptep;
+ unsigned long va, vpn;
+ int is_write;
+ hugepte_t old_pte, new_pte;
+ unsigned long hpteflags, prpn;
+ long slot;
+
+ /* Is this for us? */
+ if (!in_hugepage_area(mm->context, ea))
+ return -1;
+
+ /* We have to find the first hugepte in the batch, since
+ * that's the one that will store the HPTE flags */
+ ptep = hugepte_offset(mm, ea & ~(HPAGE_SIZE-1));
+
+ /* Search the Linux page table for a match with va */
+ va = (vsid << 28) | (ea & 0x0fffffff);
+ vpn = va >> HPAGE_SHIFT;
+
+ /*
+ * If no pte found or not present, send the problem up to
+ * do_page_fault
+ */
+ if (unlikely(!ptep || hugepte_none(*ptep)))
+ return 1;
+
+ BUG_ON(hugepte_bad(*ptep));
+
+ /*
+ * Check the user's access rights to the page. If access should be
+ * prevented then send the problem up to do_page_fault.
+ */
+ is_write = access & _PAGE_RW;
+ if (unlikely(is_write && !(hugepte_val(*ptep) & _HUGEPAGE_RW)))
+ return 1;
+
+ /*
+ * At this point, we have a pte (old_pte) which can be used to build
+ * or update an HPTE. There are 2 cases:
+ *
+ * 1. There is a valid (present) pte with no associated HPTE (this is
+ * the most common case)
+ * 2. There is a valid (present) pte with an associated HPTE. The
+ * current values of the pp bits in the HPTE prevent access
+ * because we are doing software DIRTY bit management and the
+ * page is currently not DIRTY.
+ */
+
+ old_pte = *ptep;
+ new_pte = old_pte;
+
+ hpteflags = computeHugeHptePP(hugepte_val(new_pte));
+
+ /* Check if pte already has an hpte (case 2) */
+ if (unlikely(hugepte_val(old_pte) & _HUGEPAGE_HASHPTE)) {
+ /* There MIGHT be an HPTE for this pte */
+ unsigned long hash, slot;
+
+ hash = hpt_hash(vpn, 1);
+ if (hugepte_val(old_pte) & _HUGEPAGE_SECONDARY)
+ hash = ~hash;
+ slot = (hash & htab_data.htab_hash_mask) * HPTES_PER_GROUP;
+ slot += (hugepte_val(old_pte) & _HUGEPAGE_GROUP_IX) >> 5;
+
+ if (ppc_md.hpte_updatepp(slot, hpteflags, va, 1, local) == -1)
+ hugepte_val(old_pte) &= ~_HUGEPAGE_HPTEFLAGS;
+ }
+
+ if (likely(!(hugepte_val(old_pte) & _HUGEPAGE_HASHPTE))) {
+ unsigned long hash = hpt_hash(vpn, 1);
+ unsigned long hpte_group;
+
+ prpn = hugepte_pfn(old_pte);
+
+repeat:
+ hpte_group = ((hash & htab_data.htab_hash_mask) *
+ HPTES_PER_GROUP) & ~0x7UL;
+
+ /* Update the linux pte with the HPTE slot */
+ hugepte_val(new_pte) &= ~_HUGEPAGE_HPTEFLAGS;
+ hugepte_val(new_pte) |= _HUGEPAGE_HASHPTE;
+
+ slot = ppc_md.hpte_insert(hpte_group, va, prpn, 0,
+ hpteflags, 0, 1);
+
+ /* Primary is full, try the secondary */
+ if (unlikely(slot == -1)) {
+ hugepte_val(new_pte) |= _HUGEPAGE_SECONDARY;
+ hpte_group = ((~hash & htab_data.htab_hash_mask) *
+ HPTES_PER_GROUP) & ~0x7UL;
+ slot = ppc_md.hpte_insert(hpte_group, va, prpn,
+ 1, hpteflags, 0, 1);
+ if (slot == -1) {
+ if (mftb() & 0x1)
+ hpte_group = ((hash & htab_data.htab_hash_mask) * HPTES_PER_GROUP) & ~0x7UL;
+
+ ppc_md.hpte_remove(hpte_group);
+ goto repeat;
+ }
+ }
+
+ if (unlikely(slot == -2))
+ panic("hash_page: pte_insert failed\n");
+
+ hugepte_val(new_pte) |= (slot<<5) & _HUGEPAGE_GROUP_IX;
+
+ /*
+ * No need to use ldarx/stdcx here because all who
+ * might be updating the pte will hold the
+ * page_table_lock or the hash_table_lock
+ * (we hold both)
+ */
+ *ptep = new_pte;
+ }
+
+ return 0;
+}
+
+static void flush_hash_hugepage(mm_context_t context, unsigned long ea,
+ hugepte_t pte, int local)
+{
+ unsigned long vsid, vpn, va, hash, secondary, slot;
+
+ BUG_ON(hugepte_bad(pte));
+ BUG_ON(!in_hugepage_area(context, ea));
+
+ vsid = get_vsid(context, ea);
+
+ va = (vsid << 28) | (ea & 0x0fffffff);
+ vpn = va >> LARGE_PAGE_SHIFT;
+ hash = hpt_hash(vpn, 1);
+ secondary = !!(hugepte_val(pte) & _HUGEPAGE_SECONDARY);
+ if (secondary)
+ hash = ~hash;
+ slot = (hash & htab_data.htab_hash_mask) * HPTES_PER_GROUP;
+ slot += (hugepte_val(pte) & _HUGEPAGE_GROUP_IX) >> 5;
+
+ ppc_md.hpte_invalidate(slot, va, 1, local);
+}
+
+static void split_and_free_hugepage(struct page *page)
+{
+ int j;
+ struct page *map;
+
+ map = page;
+ htlbpage_total--;
+ for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
+ map->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
+ 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
+ 1 << PG_private | 1<< PG_writeback);
+ set_page_count(map, 0);
+ map++;
+ }
+ set_page_count(page, 1);
+ __free_pages(page, HUGETLB_PAGE_ORDER);
+}
+
+int set_hugetlb_mem_size(int count)
+{
+ int lcount;
+ struct page *page;
+
+ if (!(cur_cpu_spec->cpu_features & CPU_FTR_16M_PAGE))
+ return 0;
+
+ if (count < 0)
+ lcount = count;
+ else
+ lcount = count - htlbpage_total;
+
+ if (lcount == 0)
+ return htlbpage_total;
+ if (lcount > 0) { /* Increase the mem size. */
+ while (lcount--) {
+ page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER);
+ if (page == NULL)
+ break;
+ spin_lock(&htlbpage_lock);
+ list_add(&page->list, &htlbpage_freelist);
+ htlbpage_free++;
+ htlbpage_total++;
+ spin_unlock(&htlbpage_lock);
+ }
+ return htlbpage_total;
+ }
+ /* Shrink the memory size. */
+ while (lcount++) {
+ page = alloc_hugetlb_page();
+ if (page == NULL)
+ break;
+ spin_lock(&htlbpage_lock);
+ split_and_free_hugepage(page);
+ spin_unlock(&htlbpage_lock);
+ }
+ return htlbpage_total;
+}
+
+int hugetlb_sysctl_handler(ctl_table *table, int write,
+ struct file *file, void *buffer, size_t *length)
+{
+ proc_dointvec(table, write, file, buffer, length);
+ htlbpage_max = set_hugetlb_mem_size(htlbpage_max);
+ return 0;
+}
+
+static int __init hugetlb_setup(char *s)
+{
+ if (sscanf(s, "%d", &htlbpage_max) <= 0)
+ htlbpage_max = 0;
+ return 1;
+}
+__setup("hugepages=", hugetlb_setup);
+
+static int __init hugetlb_init(void)
+{
+ int i;
+ struct page *page;
+
+ if (cur_cpu_spec->cpu_features & CPU_FTR_16M_PAGE) {
+ for (i = 0; i < htlbpage_max; ++i) {
+ page = alloc_pages(__GFP_HIGHMEM, HUGETLB_PAGE_ORDER);
+ if (!page)
+ break;
+ spin_lock(&htlbpage_lock);
+ list_add(&page->list, &htlbpage_freelist);
+ spin_unlock(&htlbpage_lock);
+ }
+ htlbpage_max = htlbpage_free = htlbpage_total = i;
+ printk("Total HugeTLB memory allocated, %d\n", htlbpage_free);
+ } else {
+ htlbpage_max = 0;
+ printk("CPU does not support HugeTLB\n");
+ }
+
+ return 0;
+}
+module_init(hugetlb_init);
+
+int hugetlb_report_meminfo(char *buf)
+{
+ return sprintf(buf,
+ "HugePages_Total: %5d\n"
+ "HugePages_Free: %5d\n"
+ "Hugepagesize: %5lu kB\n",
+ htlbpage_total,
+ htlbpage_free,
+ HPAGE_SIZE/1024);
+}
+
+/* This is advisory only, so we can get away with accessing
+ * htlbpage_free without taking the lock. */
+int is_hugepage_mem_enough(size_t size)
+{
+ return (size + ~HPAGE_MASK)/HPAGE_SIZE <= htlbpage_free;
+}
+
+/*
+ * We cannot handle pagefaults against hugetlb pages at all. They cause
+ * handle_mm_fault() to try to instantiate regular-sized pages in the
+ * hugepage VMA. do_page_fault() is supposed to trap this, so BUG if we get
+ * this far.
+ */
+static struct page *hugetlb_nopage(struct vm_area_struct *vma,
+ unsigned long address, int unused)
+{
+ BUG();
+ return NULL;
+}
+
+struct vm_operations_struct hugetlb_vm_ops = {
+ .nopage = hugetlb_nopage,
+};
diff -urN linux-2.5/arch/ppc64/mm/init.c linux-gogogo/arch/ppc64/mm/init.c
--- linux-2.5/arch/ppc64/mm/init.c 2003-07-04 14:42:34.000000000 +1000
+++ linux-gogogo/arch/ppc64/mm/init.c 2003-07-04 14:45:20.000000000 +1000
@@ -294,7 +294,7 @@
if (!pgd_none(*pgd)) {
pmd = pmd_offset(pgd, vmaddr);
- if (!pmd_none(*pmd)) {
+ if (pmd_present(*pmd)) {
ptep = pte_offset_kernel(pmd, vmaddr);
/* Check if HPTE might exist and flush it if so */
pte = __pte(pte_update(ptep, _PAGE_HPTEFLAGS, 0));
@@ -302,6 +302,7 @@
flush_hash_page(context, vmaddr, pte, local);
}
}
+ WARN_ON(pmd_hugepage(*pmd));
}
}
@@ -350,7 +351,7 @@
pmd_end = (start + PMD_SIZE) & PMD_MASK;
if (pmd_end > end)
pmd_end = end;
- if (!pmd_none(*pmd)) {
+ if (pmd_present(*pmd)) {
ptep = pte_offset_kernel(pmd, start);
do {
if (pte_val(*ptep) & _PAGE_HASHPTE) {
@@ -369,6 +370,7 @@
++ptep;
} while (start < pmd_end);
} else {
+ WARN_ON(pmd_hugepage(*pmd));
start = pmd_end;
}
++pmd;
diff -urN linux-2.5/include/asm-ppc64/mmu.h linux-gogogo/include/asm-ppc64/mmu.h
--- linux-2.5/include/asm-ppc64/mmu.h 2003-07-04 14:42:56.000000000 +1000
+++ linux-gogogo/include/asm-ppc64/mmu.h 2003-07-04 14:45:38.000000000 +1000
@@ -18,6 +18,12 @@
/* Default "unsigned long" context */
typedef unsigned long mm_context_t;
+#ifdef CONFIG_HUGETLB_PAGE
+#define CONTEXT_LOW_HPAGES (1UL<<63)
+#else
+#define CONTEXT_LOW_HPAGES 0
+#endif
+
/*
* Define the size of the cache used for segment table entries. The first
* entry is used as a cache pointer, therefore the actual number of entries
diff -urN linux-2.5/include/asm-ppc64/mmu_context.h linux-gogogo/include/asm-ppc64/mmu_context.h
--- linux-2.5/include/asm-ppc64/mmu_context.h 2003-07-04 14:42:56.000000000 +1000
+++ linux-gogogo/include/asm-ppc64/mmu_context.h 2003-07-04 14:46:03.000000000 +1000
@@ -127,7 +127,8 @@
#endif
mmu_context_queue.size++;
- mmu_context_queue.elements[index] = mm->context;
+ mmu_context_queue.elements[index] =
+ mm->context & ~CONTEXT_LOW_HPAGES;
spin_unlock_irqrestore(&mmu_context_queue.lock, flags);
}
@@ -189,6 +190,8 @@
{
unsigned long ordinal, vsid;
+ context &= ~CONTEXT_LOW_HPAGES;
+
ordinal = (((ea >> 28) & 0x1fffff) * LAST_USER_CONTEXT) | context;
vsid = (ordinal * VSID_RANDOMIZER) & VSID_MASK;
diff -urN linux-2.5/include/asm-ppc64/page.h linux-gogogo/include/asm-ppc64/page.h
--- linux-2.5/include/asm-ppc64/page.h 2003-07-04 14:42:56.000000000 +1000
+++ linux-gogogo/include/asm-ppc64/page.h 2003-07-04 14:46:10.000000000 +1000
@@ -22,6 +22,39 @@
#define PAGE_MASK (~(PAGE_SIZE-1))
#define PAGE_OFFSET_MASK (PAGE_SIZE-1)
+#ifdef CONFIG_HUGETLB_PAGE
+
+#define HPAGE_SHIFT 24
+#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT)
+#define HPAGE_MASK (~(HPAGE_SIZE - 1))
+#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
+
+/* For 64-bit processes the hugepage range is 1T-1.5T */
+#define TASK_HPAGE_BASE (0x0000010000000000UL)
+#define TASK_HPAGE_END (0x0000018000000000UL)
+/* For 32-bit processes the hugepage range is 2-3G */
+#define TASK_HPAGE_BASE_32 (0x80000000UL)
+#define TASK_HPAGE_END_32 (0xc0000000UL)
+
+#define ARCH_HAS_HUGEPAGE_ONLY_RANGE
+#define is_hugepage_only_range(addr, len) \
+ ( ((addr > (TASK_HPAGE_BASE-len)) && (addr < TASK_HPAGE_END)) || \
+ ((current->mm->context & CONTEXT_LOW_HPAGES) && \
+ (addr > (TASK_HPAGE_BASE_32-len)) && (addr < TASK_HPAGE_END_32)) )
+#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+
+#define in_hugepage_area(context, addr) \
+ ((cur_cpu_spec->cpu_features & CPU_FTR_16M_PAGE) && \
+ ((((addr) >= TASK_HPAGE_BASE) && ((addr) < TASK_HPAGE_END)) || \
+ (((context) & CONTEXT_LOW_HPAGES) && \
+ (((addr) >= TASK_HPAGE_BASE_32) && ((addr) < TASK_HPAGE_END_32)))))
+
+#else /* !CONFIG_HUGETLB_PAGE */
+
+#define in_hugepage_area(mm, addr) 0
+
+#endif /* !CONFIG_HUGETLB_PAGE */
+
#define SID_SHIFT 28
#define SID_MASK 0xfffffffff
#define GET_ESID(x) (((x) >> SID_SHIFT) & SID_MASK)
diff -urN linux-2.5/include/asm-ppc64/pgtable.h linux-gogogo/include/asm-ppc64/pgtable.h
--- linux-2.5/include/asm-ppc64/pgtable.h 2003-07-04 14:42:56.000000000 +1000
+++ linux-gogogo/include/asm-ppc64/pgtable.h 2003-07-04 14:46:24.000000000 +1000
@@ -149,6 +149,25 @@
/* shift to put page number into pte */
#define PTE_SHIFT (16)
+/* We allow 2^41 bytes of real memory, so we need 29 bits in the PMD
+ * to give the PTE page number. The bottom two bits are for flags. */
+#define PMD_TO_PTEPAGE_SHIFT (2)
+
+#ifdef CONFIG_HUGETLB_PAGE
+#define _PMD_HUGEPAGE 0x00000001U
+#define HUGEPTE_BATCH_SIZE (1<<(HPAGE_SHIFT-PMD_SHIFT))
+
+int hash_huge_page(struct mm_struct *mm, unsigned long access,
+ unsigned long ea, unsigned long vsid, int local);
+
+#define HAVE_ARCH_UNMAPPED_AREA
+#else
+
+#define hash_huge_page(mm,a,ea,vsid,local) -1
+#define _PMD_HUGEPAGE 0
+
+#endif
+
#ifndef __ASSEMBLY__
/*
@@ -178,12 +197,16 @@
#define pte_pfn(x) ((unsigned long)((pte_val(x) >> PTE_SHIFT)))
#define pte_page(x) pfn_to_page(pte_pfn(x))
-#define pmd_set(pmdp, ptep) (pmd_val(*(pmdp)) = (__ba_to_bpn(ptep)))
+#define pmd_set(pmdp, ptep) \
+ (pmd_val(*(pmdp)) = (__ba_to_bpn(ptep) << PMD_TO_PTEPAGE_SHIFT))
#define pmd_none(pmd) (!pmd_val(pmd))
-#define pmd_bad(pmd) ((pmd_val(pmd)) == 0)
-#define pmd_present(pmd) ((pmd_val(pmd)) != 0)
+#define pmd_hugepage(pmd) (!!(pmd_val(pmd) & _PMD_HUGEPAGE))
+#define pmd_bad(pmd) (((pmd_val(pmd)) == 0) || pmd_hugepage(pmd))
+#define pmd_present(pmd) ((!pmd_hugepage(pmd)) \
+ && (pmd_val(pmd) & ~_PMD_HUGEPAGE) != 0)
#define pmd_clear(pmdp) (pmd_val(*(pmdp)) = 0)
-#define pmd_page_kernel(pmd) (__bpn_to_ba(pmd_val(pmd)))
+#define pmd_page_kernel(pmd) \
+ (__bpn_to_ba(pmd_val(pmd) >> PMD_TO_PTEPAGE_SHIFT))
#define pmd_page(pmd) virt_to_page(pmd_page_kernel(pmd))
#define pgd_set(pgdp, pmdp) (pgd_val(*(pgdp)) = (__ba_to_bpn(pmdp)))
#define pgd_none(pgd) (!pgd_val(pgd))
--
David Gibson | For every complex problem there is a
david at gibson.dropbear.id.au | solution which is simple, neat and
| wrong.
http://www.ozlabs.org/people/dgibson
** Sent via the linuxppc64-dev mail list. See http://lists.linuxppc.org/