[PATCH] [2.4] [RHEL] Backport of benh's PTE mgmt changes
olof at austin.ibm.com
Wed Jan 7 11:01:09 EST 2004
Below is a 2.4 backport of parts of benh's 2.6 pte_free rewrite. It's
different in a few ways:
1. 2.4 has no RCU. Instead I just send a synchronous IPI to all processors.
Since the IPI won't be delivered until a processor is out of hash_page, it
can be used as a barrier between new and old traversals (see the first
sketch after this list).
2. There's no batching of TLB shootdowns like there is in 2.6, so I had to hijack
do_check_pgt_cache(). This is ugly, and I'm not too happy about it, but
I think RedHat would be more likely to accept this than a change in
generic code (at this point in the product cycle). Julie, feel free to
prove me wrong. :-)
3. For the same reason, I had to add an extra per-cpu lock for the
pte_freelist_batch structures.
4. The __hash_page locking is rougher than in 2.6. I left the hash locks
there, since I believe they are still needed.
5. I recycled _PAGE_HPTENOIX as _PAGE_BUSY, since it's never used. There were
no other free bits available... (see the second sketch after this list)
(6. RedHat disabled the fast PTE/PMD/PGD allocator, so the patch won't
apply cleanly to an ameslab or marcelo 2.4 tree, but the differences are
pretty obvious.)
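Since the batching pieces end up spread across htab.c, init.c and pgalloc.h
below, here's the free-side flow in one place. This is only a simplified
sketch of what the patch does (it leaves out the per-cpu lock, the GFP_ATOMIC
batch allocation and the pte_free_now() fallback that the real diff has):

	#include <linux/mm.h>
	#include <linux/smp.h>

	/* Sketch only: simplified from the patch below. */
	static void pte_free_smp_sync(void *arg)
	{
		/* Empty; delivery of the IPI is the synchronization point. */
	}

	void pte_free_batch(struct pte_freelist_batch *batch)
	{
		unsigned int i;

		/* Wait until every CPU has taken a no-op IPI. Per point 1, a
		 * CPU only takes it once it's out of hash_page, so by the
		 * time this returns nobody can still be traversing the PTE
		 * pages queued in this batch, and they can safely go back to
		 * the allocator.
		 */
		smp_call_function(pte_free_smp_sync, NULL, 0, 1);

		for (i = 0; i < batch->index; i++)
			pte_free_kernel(page_address(batch->pages[i]));
		free_page((unsigned long)batch);
	}

__pte_free() (in pgalloc.h below) just queues pages into the current CPU's
batch, and do_check_pgt_cache() flushes whatever is pending so the generic
VM's assumption that PTEs are really freed on return still holds.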
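For point 5, here's a rough C rendering of the ldarx/stdcx. sequence that the
__hash_page() hunk adds (the real thing has to be assembly so the access check
and the _PAGE_BUSY update happen under a single reservation). The helper name
is made up for this sketch, and cmpxchg() only stands in for the stdcx.
conditional store; the patch itself doesn't use it:

	/* Sketch: what the new __hash_page() assembly does, in C. */
	static int hash_pte_trylock(pte_t *ptep, unsigned long access)
	{
		unsigned long pte;

		for (;;) {
			pte = pte_val(*ptep);             /* ldarx */
			if (access & ~pte)                /* andc.: access denied */
				return 0;                 /* caller returns 1 to do_page_fault */
			if (pte & _PAGE_BUSY)             /* andi.: another CPU owns the PTE */
				continue;                 /* spin until _PAGE_BUSY clears */
			if (cmpxchg(&pte_val(*ptep), pte, /* stdcx.: set _PAGE_BUSY, */
				    pte | _PAGE_BUSY) == pte) /* retry on lost reservation */
				return 1;
		}
	}

The out_unlock path at the end of __hash_page() clears _PAGE_BUSY with the
same ldarx/stdcx. pattern, and pte_update() now spins while _PAGE_BUSY is set,
so the fault path and pte_update() can't both modify a PTE at the same time.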
I think that's it. Please provide feedback. We're working on a deadline
with RedHat, so sooner is better than later. I'll be beating on this with
the specweb benchmark over the next couple of days as well. :-)
Thanks,
Olof
Olof Johansson Office: 4E002/905
Linux on Power Development IBM Systems Group
Email: olof at austin.ibm.com Phone: 512-838-9858
All opinions are my own and not those of IBM
Index: arch/ppc64/kernel/htab.c
===================================================================
RCS file: /cvs/local/rhel/arch/ppc64/kernel/htab.c,v
retrieving revision 1.1.1.2
diff -w -p -u -r1.1.1.2 htab.c
--- arch/ppc64/kernel/htab.c 5 Sep 2003 18:57:01 -0000 1.1.1.2
+++ arch/ppc64/kernel/htab.c 6 Jan 2004 22:31:58 -0000
@@ -320,8 +320,10 @@ int __hash_page(unsigned long ea, unsign
unsigned long va, vpn;
unsigned long newpp, prpn;
unsigned long hpteflags, lock_slot;
+ unsigned long access_ok, tmp;
long slot;
pte_t old_pte, new_pte;
+ int ret = 0;
/* Search the Linux page table for a match with va */
va = (vsid << 28) | (ea & 0x0fffffff);
@@ -337,21 +339,52 @@ int __hash_page(unsigned long ea, unsign
* Check the user's access rights to the page. If access should be
* prevented then send the problem up to do_page_fault.
*/
-#ifdef CONFIG_SHARED_MEMORY_ADDRESSING
+
+ /*
+ * Check the user's access rights to the page. If access should be
+ * prevented then send the problem up to do_page_fault.
+ */
+
access |= _PAGE_PRESENT;
- if (unlikely(access & ~(pte_val(*ptep)))) {
+
+ /* We'll do access checking and _PAGE_BUSY setting in assembly, since
+ * it needs to be atomic.
+ */
+
+ __asm__ __volatile__ ("\n
+ 1: ldarx %0,0,%3\n
+ # Check access rights (access & ~(pte_val(*ptep)))\n
+ andc. %1,%2,%0\n
+ bne- 2f\n
+ # Check if PTE is busy\n
+ andi. %1,%0,%4\n
+ bne- 1b\n
+ ori %0,%0,%4\n
+ # Write the linux PTE atomically (setting busy)\n
+ stdcx. %0,0,%3\n
+ bne- 1b\n
+ li %1,1\n
+ b 3f\n
+ 2: stdcx. %0,0,%3 # to clear the reservation\n
+ li %1,0\n
+ 3:"
+ : "=r" (old_pte), "=r" (access_ok)
+ : "r" (access), "r" (ptep), "i" (_PAGE_BUSY)
+ : "cc", "memory");
+
+#ifdef CONFIG_SHARED_MEMORY_ADDRESSING
+ if (unlikely(!access_ok)) {
if(!(((ea >> SMALLOC_EA_SHIFT) ==
(SMALLOC_START >> SMALLOC_EA_SHIFT)) &&
((current->thread.flags) & PPC_FLAG_SHARED))) {
- spin_unlock(&hash_table_lock[lock_slot].lock);
- return 1;
+ ret = 1;
+ goto out_unlock;
}
}
#else
- access |= _PAGE_PRESENT;
- if (unlikely(access & ~(pte_val(*ptep)))) {
- spin_unlock(&hash_table_lock[lock_slot].lock);
- return 1;
+ if (unlikely(!access_ok)) {
+ ret = 1;
+ goto out_unlock;
}
#endif
@@ -428,9 +461,22 @@ int __hash_page(unsigned long ea, unsign
*ptep = new_pte;
}
+out_unlock:
+ tmp = _PAGE_BUSY;
+
+ /* Clear _PAGE_BUSY flag atomically. */
+ __asm__ __volatile__ ("
+ 1: ldarx %0,0,%2\n
+ andc. %0,%0,%1\n
+ stdcx. %0,0,%2\n
+ bne- 1b\n"
+ : "=r" (new_pte)
+ : "r" (tmp), "r" (ptep)
+ : "cc", "memory");
+
spin_unlock(&hash_table_lock[lock_slot].lock);
- return 0;
+ return ret;
}
/*
@@ -497,12 +543,6 @@ int hash_page(unsigned long ea, unsigned
pgdir = mm->pgd;
if (pgdir == NULL) return 1;
- /*
- * Lock the Linux page table to prevent mmap and kswapd
- * from modifying entries while we search and update
- */
- spin_lock(&mm->page_table_lock);
-
ptep = find_linux_pte(pgdir, ea);
/*
* If no pte found or not present, send the problem up to
@@ -515,8 +555,6 @@ int hash_page(unsigned long ea, unsigned
ret = 1;
}
- spin_unlock(&mm->page_table_lock);
-
return ret;
}
Index: arch/ppc64/mm/init.c
===================================================================
RCS file: /cvs/local/rhel/arch/ppc64/mm/init.c,v
retrieving revision 1.1.1.1
diff -w -p -u -r1.1.1.1 init.c
--- arch/ppc64/mm/init.c 7 Aug 2003 03:21:44 -0000 1.1.1.1
+++ arch/ppc64/mm/init.c 6 Jan 2004 22:42:55 -0000
@@ -104,9 +104,72 @@ unsigned long __max_memory;
*/
mmu_gather_t mmu_gathers[NR_CPUS];
+/* PTE free batching structures. We need a lock since not all
+ * operations take place under page_table_lock. Keep it per-CPU
+ * to avoid bottlenecks.
+ */
+
+spinlock_t pte_freelist_lock[NR_CPUS] = { [0 ... NR_CPUS-1] = SPIN_LOCK_UNLOCKED};
+struct pte_freelist_batch *pte_freelist_cur[NR_CPUS];
+
+unsigned long pte_freelist_forced_free;
+
+static void pte_free_smp_sync(void *arg)
+{
+ /* Do nothing, just ensure we sync with all CPUs */
+}
+
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+void pte_free_now(struct page *ptepage)
+{
+ pte_freelist_forced_free++;
+
+ smp_call_function(pte_free_smp_sync, NULL, 0, 1);
+
+ pte_free_kernel(page_address(ptepage));
+}
+
+
+void pte_free_batch(struct pte_freelist_batch *batch)
+{
+ unsigned int i;
+
+ /* A sync is good enough: It will ensure that no other
+ * CPU is currently traversing down to a free'd pte.
+ */
+
+ smp_call_function(pte_free_smp_sync, NULL, 0, 1);
+
+ for (i = 0; i < batch->index; i++)
+ pte_free_kernel(page_address(batch->pages[i]));
+ free_page((unsigned long)batch);
+}
+
+
int do_check_pgt_cache(int low, int high)
{
int freed = 0;
+ struct pte_freelist_batch **batchp;
+ spinlock_t *lock = &pte_freelist_lock[smp_processor_id()];
+
+ /* We use this function to push the current pte free batch to be
+ * deallocated, since do_check_pgt_cache() is called at the end of each
+ * free_one_pgd() and other parts of the VM rely on all PTEs being
+ * properly freed upon return from that function.
+ */
+
+ spin_lock(lock);
+
+ batchp = &pte_freelist_cur[smp_processor_id()];
+
+ if(*batchp) {
+ pte_free_batch(*batchp);
+ *batchp = NULL;
+ }
+
+ spin_unlock(lock);
#if 0
if (pgtable_cache_size > high) {
@@ -120,6 +183,7 @@ int do_check_pgt_cache(int low, int high
} while (pgtable_cache_size > low);
}
#endif
+
return freed;
}
Index: include/asm-ppc64/mmu.h
===================================================================
RCS file: /cvs/local/rhel/include/asm-ppc64/mmu.h,v
retrieving revision 1.1.1.1
diff -w -p -u -r1.1.1.1 mmu.h
Index: include/asm-ppc64/pgalloc.h
===================================================================
RCS file: /cvs/local/rhel/include/asm-ppc64/pgalloc.h,v
retrieving revision 1.1.1.2
diff -w -p -u -r1.1.1.2 pgalloc.h
--- include/asm-ppc64/pgalloc.h 26 Sep 2003 14:42:15 -0000 1.1.1.2
+++ include/asm-ppc64/pgalloc.h 6 Jan 2004 22:34:39 -0000
@@ -112,7 +112,51 @@ pte_alloc_one(struct mm_struct *mm, unsi
return NULL;
}
-#define pte_free(pte_page) pte_free_kernel(page_address(pte_page))
+#define pte_free(pte_page) __pte_free(pte_page)
+
+struct pte_freelist_batch
+{
+ unsigned int index;
+ struct page * pages[0];
+};
+
+#define PTE_FREELIST_SIZE ((PAGE_SIZE - sizeof(struct pte_freelist_batch)) / \
+ sizeof(struct page *))
+
+extern void pte_free_now(struct page *ptepage);
+extern void pte_free_batch(struct pte_freelist_batch *batch);
+
+extern struct pte_freelist_batch *pte_freelist_cur[];
+extern spinlock_t pte_freelist_lock[];
+
+static inline void __pte_free(struct page *ptepage)
+{
+ spinlock_t *lock = &pte_freelist_lock[smp_processor_id()];
+ struct pte_freelist_batch **batchp;
+
+ spin_lock(lock);
+
+ batchp = &pte_freelist_cur[smp_processor_id()];
+
+ if (*batchp == NULL) {
+ *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
+ if (*batchp == NULL) {
+ spin_unlock(lock);
+ pte_free_now(ptepage);
+ return;
+ }
+ (*batchp)->index = 0;
+ }
+
+ (*batchp)->pages[(*batchp)->index++] = ptepage;
+ if ((*batchp)->index == PTE_FREELIST_SIZE) {
+ pte_free_batch(*batchp);
+ *batchp = NULL;
+ }
+
+ spin_unlock(lock);
+}
+
extern int do_check_pgt_cache(int, int);
Index: include/asm-ppc64/pgtable.h
===================================================================
RCS file: /cvs/local/rhel/include/asm-ppc64/pgtable.h,v
retrieving revision 1.1.1.1
diff -w -p -u -r1.1.1.1 pgtable.h
--- include/asm-ppc64/pgtable.h 7 Aug 2003 03:21:59 -0000 1.1.1.1
+++ include/asm-ppc64/pgtable.h 6 Jan 2004 22:34:23 -0000
@@ -88,22 +88,22 @@
* Bits in a linux-style PTE. These match the bits in the
* (hardware-defined) PowerPC PTE as closely as possible.
*/
-#define _PAGE_PRESENT 0x001UL /* software: pte contains a translation */
-#define _PAGE_USER 0x002UL /* matches one of the PP bits */
-#define _PAGE_RW 0x004UL /* software: user write access allowed */
-#define _PAGE_GUARDED 0x008UL
-#define _PAGE_COHERENT 0x010UL /* M: enforce memory coherence (SMP systems) */
-#define _PAGE_NO_CACHE 0x020UL /* I: cache inhibit */
-#define _PAGE_WRITETHRU 0x040UL /* W: cache write-through */
-#define _PAGE_DIRTY 0x080UL /* C: page changed */
-#define _PAGE_ACCESSED 0x100UL /* R: page referenced */
-#define _PAGE_HPTENOIX 0x200UL /* software: pte HPTE slot unknown */
-#define _PAGE_HASHPTE 0x400UL /* software: pte has an associated HPTE */
-#define _PAGE_EXEC 0x800UL /* software: i-cache coherence required */
-#define _PAGE_SECONDARY 0x8000UL /* software: HPTE is in secondary group */
-#define _PAGE_GROUP_IX 0x7000UL /* software: HPTE index within group */
+#define _PAGE_PRESENT 0x0001 /* software: pte contains a translation */
+#define _PAGE_USER 0x0002 /* matches one of the PP bits */
+#define _PAGE_RW 0x0004 /* software: user write access allowed */
+#define _PAGE_GUARDED 0x0008
+#define _PAGE_COHERENT 0x0010 /* M: enforce memory coherence (SMP systems) */
+#define _PAGE_NO_CACHE 0x0020 /* I: cache inhibit */
+#define _PAGE_WRITETHRU 0x0040 /* W: cache write-through */
+#define _PAGE_DIRTY 0x0080 /* C: page changed */
+#define _PAGE_ACCESSED 0x0100 /* R: page referenced */
+#define _PAGE_BUSY 0x0200 /* software: pte & hash are busy */
+#define _PAGE_HASHPTE 0x0400 /* software: pte has an associated HPTE */
+#define _PAGE_EXEC 0x0800 /* software: i-cache coherence required */
+#define _PAGE_GROUP_IX 0x7000 /* software: HPTE index within group */
+#define _PAGE_SECONDARY 0x8000 /* software: HPTE is in secondary group */
/* Bits 0x7000 identify the index within an HPT Group */
-#define _PAGE_HPTEFLAGS (_PAGE_HASHPTE | _PAGE_HPTENOIX | _PAGE_SECONDARY | _PAGE_GROUP_IX)
+#define _PAGE_HPTEFLAGS (_PAGE_HASHPTE | _PAGE_SECONDARY | _PAGE_GROUP_IX)
/* PAGE_MASK gives the right answer below, but only by accident */
/* It should be preserving the high 48 bits and then specifically */
/* preserving _PAGE_SECONDARY | _PAGE_GROUP_IX */
@@ -290,12 +290,14 @@ static inline unsigned long pte_update(
__asm__ __volatile__("\n\
1: ldarx %0,0,%3 \n\
+ andi. %1,%0,%7 # loop on _PAGE_BUSY set\n\
+ bne- 1b \n\
andc %1,%0,%4 \n\
or %1,%1,%5 \n\
stdcx. %1,0,%3 \n\
bne- 1b"
: "=&r" (old), "=&r" (tmp), "=m" (*p)
- : "r" (p), "r" (clr), "r" (set), "m" (*p)
+ : "r" (p), "r" (clr), "r" (set), "m" (*p), "i" (_PAGE_BUSY)
: "cc" );
return old;
}