[PATCH] [2.4] [RHEL] Backport of benh's PTE mgmt changes
olof at austin.ibm.com
Wed Jan 7 11:01:09 EST 2004
Below is a 2.4 backport of parts of benh's 2.6 pte_free rewrite. It's
different in a few ways:
1. 2.4 has no RCU. Instead I just send a synchronous IPI to all processors.
Since the IPI won't be delivered until a processor is out of hash_page, it
can be used as a barrier between new and old traversals (see the first
sketch after this list).
2. There's no batching of TLB shootdowns like there is in 2.6, so I had to hijack
do_check_pgt_cache(). This is ugly, and I'm not too happy about it, but
I think RedHat would be more likely to accept this than a change in
generic code (at this point in the product cycle). Julie, feel free to
prove me wrong. :-)
3. For the same reason, I had to add an extra per-cpu lock for the
pte_freelist_batch structures.
4. The __hash_page locking is rougher than in 2.6. I left the hash locks
there, since I believe they are still needed.
5. I recycled _PAGE_HPTENOIX as _PAGE_BUSY, since it's never used. There were
no other free bits available... (see the second sketch after this list)
(6. RedHat disabled the fast PTE/PMD/PGD allocator, so the patch won't
apply cleanly to an ameslab or marcelo 2.4 tree, but the differences are
pretty obvious.)
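Since the batching pieces end up spread across htab.c, init.c and pgalloc.h
below, here's the free-side flow in one place. This is only a simplified
sketch of what the patch does (it leaves out the per-cpu lock, the GFP_ATOMIC
batch allocation and the pte_free_now() fallback that the real diff has):

	#include <linux/mm.h>
	#include <linux/smp.h>

	/* Sketch only: simplified from the patch below. */
	static void pte_free_smp_sync(void *arg)
	{
		/* Empty; delivery of the IPI is the synchronization point. */
	}

	void pte_free_batch(struct pte_freelist_batch *batch)
	{
		unsigned int i;

		/* Wait until every CPU has taken a no-op IPI. Per point 1, a
		 * CPU only takes it once it's out of hash_page, so by the
		 * time this returns nobody can still be traversing the PTE
		 * pages queued in this batch, and they can safely go back to
		 * the allocator.
		 */
		smp_call_function(pte_free_smp_sync, NULL, 0, 1);

		for (i = 0; i < batch->index; i++)
			pte_free_kernel(page_address(batch->pages[i]));
		free_page((unsigned long)batch);
	}

__pte_free() (in pgalloc.h below) just queues pages into the current CPU's
batch, and do_check_pgt_cache() flushes whatever is pending so the generic
VM's assumption that PTEs are really freed on return still holds.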
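For point 5, here's a rough C rendering of the ldarx/stdcx. sequence that the
__hash_page() hunk adds (the real thing has to be assembly so the access check
and the _PAGE_BUSY update happen under a single reservation). The helper name
is made up for this sketch, and cmpxchg() only stands in for the stdcx.
conditional store; the patch itself doesn't use it:

	/* Sketch: what the new __hash_page() assembly does, in C. */
	static int hash_pte_trylock(pte_t *ptep, unsigned long access)
	{
		unsigned long pte;

		for (;;) {
			pte = pte_val(*ptep);             /* ldarx */
			if (access & ~pte)                /* andc.: access denied */
				return 0;                 /* caller returns 1 to do_page_fault */
			if (pte & _PAGE_BUSY)             /* andi.: another CPU owns the PTE */
				continue;                 /* spin until _PAGE_BUSY clears */
			if (cmpxchg(&pte_val(*ptep), pte, /* stdcx.: set _PAGE_BUSY, */
				    pte | _PAGE_BUSY) == pte) /* retry on lost reservation */
				return 1;
		}
	}

The out_unlock path at the end of __hash_page() clears _PAGE_BUSY with the
same ldarx/stdcx. pattern, and pte_update() now spins while _PAGE_BUSY is set,
so the fault path and pte_update() can't both modify a PTE at the same time.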
I think that's it. Please provide feedback. We're working on a deadline
with RedHat, so sooner is better than later. I'll be beating on this with
the specweb benchmark over the next couple of days as well. :-)
Thanks,
Olof
Olof Johansson Office: 4E002/905
Linux on Power Development IBM Systems Group
Email: olof at austin.ibm.com Phone: 512-838-9858
All opinions are my own and not those of IBM
Index: arch/ppc64/kernel/htab.c
===================================================================
RCS file: /cvs/local/rhel/arch/ppc64/kernel/htab.c,v
retrieving revision 1.1.1.2
diff -w -p -u -r1.1.1.2 htab.c
--- arch/ppc64/kernel/htab.c 5 Sep 2003 18:57:01 -0000 1.1.1.2
+++ arch/ppc64/kernel/htab.c 6 Jan 2004 22:31:58 -0000
@@ -320,8 +320,10 @@ int __hash_page(unsigned long ea, unsign
unsigned long va, vpn;
unsigned long newpp, prpn;
unsigned long hpteflags, lock_slot;
+ unsigned long access_ok, tmp;
long slot;
pte_t old_pte, new_pte;
+ int ret = 0;
/* Search the Linux page table for a match with va */
va = (vsid << 28) | (ea & 0x0fffffff);
@@ -337,21 +339,52 @@ int __hash_page(unsigned long ea, unsign
* Check the user's access rights to the page. If access should be
* prevented then send the problem up to do_page_fault.
*/
-#ifdef CONFIG_SHARED_MEMORY_ADDRESSING
+
+ /*
+ * Check the user's access rights to the page. If access should be
+ * prevented then send the problem up to do_page_fault.
+ */
+
access |= _PAGE_PRESENT;
- if (unlikely(access & ~(pte_val(*ptep)))) {
+
+ /* We'll do access checking and _PAGE_BUSY setting in assembly, since
+ * it needs to be atomic.
+ */
+
+ __asm__ __volatile__ ("\n
+ 1: ldarx %0,0,%3\n
+ # Check access rights (access & ~(pte_val(*ptep)))\n
+ andc. %1,%2,%0\n
+ bne- 2f\n
+ # Check if PTE is busy\n
+ andi. %1,%0,%4\n
+ bne- 1b\n
+ ori %0,%0,%4\n
+ # Write the linux PTE atomically (setting busy)\n
+ stdcx. %0,0,%3\n
+ bne- 1b\n
+ li %1,1\n
+ b 3f\n
+ 2: stdcx. %0,0,%3 # to clear the reservation\n
+ li %1,0\n
+ 3:"
+ : "=r" (old_pte), "=r" (access_ok)
+ : "r" (access), "r" (ptep), "i" (_PAGE_BUSY)
+ : "cc", "memory");
+
+#ifdef CONFIG_SHARED_MEMORY_ADDRESSING
+ if (unlikely(!access_ok)) {
if(!(((ea >> SMALLOC_EA_SHIFT) ==
(SMALLOC_START >> SMALLOC_EA_SHIFT)) &&
((current->thread.flags) & PPC_FLAG_SHARED))) {
- spin_unlock(&hash_table_lock[lock_slot].lock);
- return 1;
+ ret = 1;
+ goto out_unlock;
}
}
#else
- access |= _PAGE_PRESENT;
- if (unlikely(access & ~(pte_val(*ptep)))) {
- spin_unlock(&hash_table_lock[lock_slot].lock);
- return 1;
+ if (unlikely(!access_ok)) {
+ ret = 1;
+ goto out_unlock;
}
#endif
@@ -428,9 +461,22 @@ int __hash_page(unsigned long ea, unsign
*ptep = new_pte;
}
+out_unlock:
+ tmp = _PAGE_BUSY;
+
+ /* Clear _PAGE_BUSY flag atomically. */
+ __asm__ __volatile__ ("
+ 1: ldarx %0,0,%2\n
+ andc. %0,%0,%1\n
+ stdcx. %0,0,%2\n
+ bne- 1b\n"
+ : "=r" (new_pte)
+ : "r" (tmp), "r" (ptep)
+ : "cc", "memory");
+
spin_unlock(&hash_table_lock[lock_slot].lock);
- return 0;
+ return ret;
}
/*
@@ -497,12 +543,6 @@ int hash_page(unsigned long ea, unsigned
pgdir = mm->pgd;
if (pgdir == NULL) return 1;
- /*
- * Lock the Linux page table to prevent mmap and kswapd
- * from modifying entries while we search and update
- */
- spin_lock(&mm->page_table_lock);
-
ptep = find_linux_pte(pgdir, ea);
/*
* If no pte found or not present, send the problem up to
@@ -515,8 +555,6 @@ int hash_page(unsigned long ea, unsigned
ret = 1;
}
- spin_unlock(&mm->page_table_lock);
-
return ret;
}
Index: arch/ppc64/mm/init.c
===================================================================
RCS file: /cvs/local/rhel/arch/ppc64/mm/init.c,v
retrieving revision 1.1.1.1
diff -w -p -u -r1.1.1.1 init.c
--- arch/ppc64/mm/init.c 7 Aug 2003 03:21:44 -0000 1.1.1.1
+++ arch/ppc64/mm/init.c 6 Jan 2004 22:42:55 -0000
@@ -104,9 +104,72 @@ unsigned long __max_memory;
*/
mmu_gather_t mmu_gathers[NR_CPUS];
+/* PTE free batching structures. We need a lock since not all
+ * operations take place under page_table_lock. Keep it per-CPU
+ * to avoid bottlenecks.
+ */
+
+spinlock_t pte_freelist_lock[NR_CPUS] = { [0 ... NR_CPUS-1] = SPIN_LOCK_UNLOCKED};
+struct pte_freelist_batch *pte_freelist_cur[NR_CPUS];
+
+unsigned long pte_freelist_forced_free;
+
+static void pte_free_smp_sync(void *arg)
+{
+ /* Do nothing, just ensure we sync with all CPUs */
+}
+
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+void pte_free_now(struct page *ptepage)
+{
+ pte_freelist_forced_free++;
+
+ smp_call_function(pte_free_smp_sync, NULL, 0, 1);
+
+ pte_free_kernel(page_address(ptepage));
+}
+
+
+void pte_free_batch(struct pte_freelist_batch *batch)
+{
+ unsigned int i;
+
+ /* A sync is good enough: It will ensure that no other
+ * CPU is currently traversing down to a free'd pte.
+ */
+
+ smp_call_function(pte_free_smp_sync, NULL, 0, 1);
+
+ for (i = 0; i < batch->index; i++)
+ pte_free_kernel(page_address(batch->pages[i]));
+ free_page((unsigned long)batch);
+}
+
+
int do_check_pgt_cache(int low, int high)
{
int freed = 0;
+ struct pte_freelist_batch **batchp;
+ spinlock_t *lock = &pte_freelist_lock[smp_processor_id()];
+
+ /* We use this function to push the current pte free batch to be
+ * deallocated, since do_check_pgt_cache() is called at the end of each
+ * free_one_pgd() and other parts of the VM rely on all PTEs being
+ * properly freed upon return from that function.
+ */
+
+ spin_lock(lock);
+
+ batchp = &pte_freelist_cur[smp_processor_id()];
+
+ if(*batchp) {
+ pte_free_batch(*batchp);
+ *batchp = NULL;
+ }
+
+ spin_unlock(lock);
#if 0
if (pgtable_cache_size > high) {
@@ -120,6 +183,7 @@ int do_check_pgt_cache(int low, int high
} while (pgtable_cache_size > low);
}
#endif
+
return freed;
}
Index: include/asm-ppc64/mmu.h
===================================================================
RCS file: /cvs/local/rhel/include/asm-ppc64/mmu.h,v
retrieving revision 1.1.1.1
diff -w -p -u -r1.1.1.1 mmu.h
Index: include/asm-ppc64/pgalloc.h
===================================================================
RCS file: /cvs/local/rhel/include/asm-ppc64/pgalloc.h,v
retrieving revision 1.1.1.2
diff -w -p -u -r1.1.1.2 pgalloc.h
--- include/asm-ppc64/pgalloc.h 26 Sep 2003 14:42:15 -0000 1.1.1.2
+++ include/asm-ppc64/pgalloc.h 6 Jan 2004 22:34:39 -0000
@@ -112,7 +112,51 @@ pte_alloc_one(struct mm_struct *mm, unsi
return NULL;
}
-#define pte_free(pte_page) pte_free_kernel(page_address(pte_page))
+#define pte_free(pte_page) __pte_free(pte_page)
+
+struct pte_freelist_batch
+{
+ unsigned int index;
+ struct page * pages[0];
+};
+
+#define PTE_FREELIST_SIZE ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) / \
+ sizeof(struct page *))
+
+extern void pte_free_now(struct page *ptepage);
+extern void pte_free_batch(struct pte_freelist_batch *batch);
+
+extern struct pte_freelist_batch *pte_freelist_cur[];
+extern spinlock_t pte_freelist_lock[];
+
+static inline void __pte_free(struct page *ptepage)
+{
+ spinlock_t *lock = &pte_freelist_lock[smp_processor_id()];
+ struct pte_freelist_batch **batchp;
+
+ spin_lock(lock);
+
+ batchp = &pte_freelist_cur[smp_processor_id()];
+
+ if (*batchp == NULL) {
+ *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
+ if (*batchp == NULL) {
+ spin_unlock(lock);
+ pte_free_now(ptepage);
+ return;
+ }
+ (*batchp)->index = 0;
+ }
+
+ (*batchp)->pages[(*batchp)->index++] = ptepage;
+ if ((*batchp)->index == PTE_FREELIST_SIZE) {
+ pte_free_batch(*batchp);
+ *batchp = NULL;
+ }
+
+ spin_unlock(lock);
+}
+
extern int do_check_pgt_cache(int, int);
Index: include/asm-ppc64/pgtable.h
===================================================================
RCS file: /cvs/local/rhel/include/asm-ppc64/pgtable.h,v
retrieving revision 1.1.1.1
diff -w -p -u -r1.1.1.1 pgtable.h
--- include/asm-ppc64/pgtable.h 7 Aug 2003 03:21:59 -0000 1.1.1.1
+++ include/asm-ppc64/pgtable.h 6 Jan 2004 22:34:23 -0000
@@ -88,22 +88,22 @@
* Bits in a linux-style PTE. These match the bits in the
* (hardware-defined) PowerPC PTE as closely as possible.
*/
-#define _PAGE_PRESENT 0x001UL /* software: pte contains a translation */
-#define _PAGE_USER 0x002UL /* matches one of the PP bits */
-#define _PAGE_RW 0x004UL /* software: user write access allowed */
-#define _PAGE_GUARDED 0x008UL
-#define _PAGE_COHERENT 0x010UL /* M: enforce memory coherence (SMP systems) */
-#define _PAGE_NO_CACHE 0x020UL /* I: cache inhibit */
-#define _PAGE_WRITETHRU 0x040UL /* W: cache write-through */
-#define _PAGE_DIRTY 0x080UL /* C: page changed */
-#define _PAGE_ACCESSED 0x100UL /* R: page referenced */
-#define _PAGE_HPTENOIX 0x200UL /* software: pte HPTE slot unknown */
-#define _PAGE_HASHPTE 0x400UL /* software: pte has an associated HPTE */
-#define _PAGE_EXEC 0x800UL /* software: i-cache coherence required */
-#define _PAGE_SECONDARY 0x8000UL /* software: HPTE is in secondary group */
-#define _PAGE_GROUP_IX 0x7000UL /* software: HPTE index within group */
+#define _PAGE_PRESENT 0x0001 /* software: pte contains a translation */
+#define _PAGE_USER 0x0002 /* matches one of the PP bits */
+#define _PAGE_RW 0x0004 /* software: user write access allowed */
+#define _PAGE_GUARDED 0x0008
+#define _PAGE_COHERENT 0x0010 /* M: enforce memory coherence (SMP systems) */
+#define _PAGE_NO_CACHE 0x0020 /* I: cache inhibit */
+#define _PAGE_WRITETHRU 0x0040 /* W: cache write-through */
+#define _PAGE_DIRTY 0x0080 /* C: page changed */
+#define _PAGE_ACCESSED 0x0100 /* R: page referenced */
+#define _PAGE_BUSY 0x0200 /* software: pte & hash are busy */
+#define _PAGE_HASHPTE 0x0400 /* software: pte has an associated HPTE */
+#define _PAGE_EXEC 0x0800 /* software: i-cache coherence required */
+#define _PAGE_GROUP_IX 0x7000 /* software: HPTE index within group */
+#define _PAGE_SECONDARY 0x8000 /* software: HPTE is in secondary group */
/* Bits 0x7000 identify the index within an HPT Group */
-#define _PAGE_HPTEFLAGS (_PAGE_HASHPTE | _PAGE_HPTENOIX | _PAGE_SECONDARY | _PAGE_GROUP_IX)
+#define _PAGE_HPTEFLAGS (_PAGE_HASHPTE | _PAGE_SECONDARY | _PAGE_GROUP_IX)
/* PAGE_MASK gives the right answer below, but only by accident */
/* It should be preserving the high 48 bits and then specifically */
/* preserving _PAGE_SECONDARY | _PAGE_GROUP_IX */
@@ -290,12 +290,14 @@ static inline unsigned long pte_update(
__asm__ __volatile__("\n\
1: ldarx %0,0,%3 \n\
+ andi. %1,%0,%7 # loop on _PAGE_BUSY set\n\
+ bne- 1b \n\
andc %1,%0,%4 \n\
or %1,%1,%5 \n\
stdcx. %1,0,%3 \n\
bne- 1b"
: "=&r" (old), "=&r" (tmp), "=m" (*p)
- : "r" (p), "r" (clr), "r" (set), "m" (*p)
+ : "r" (p), "r" (clr), "r" (set), "m" (*p), "i" (_PAGE_BUSY)
: "cc" );
return old;
}