[PATCH] [2.4] [RHEL] Backport of benh's PTE mgmt changes
olof at austin.ibm.com
Thu Jan 8 08:43:22 EST 2004
On Tue, 6 Jan 2004 olof at forte.austin.ibm.com wrote:
> I'll try to see how visible it is with workloads. I'm not sure how
> synchronization could be achieved without either RCU or IPI support, so
> hopefully it won't be a big hit.
No visible impact on SPECweb, as far as I can tell. SDET didn't show much
of a difference either, but it's hard to tell since the numbers vary quite a bit.
But anyway: the solution turned out to be simple: an rwlock, with hash_page
taking it for reading and pte_freelist_batch taking it for writing momentarily
to synchronize with the readers. This way, a batch free only has to wait for
all in-flight hash_page() calls to complete, and no IPI is needed. The hash_page
code is largely unaltered compared to the old page_table_lock spin_locks.
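For illustration, here is a minimal userspace analogue of that scheme. It is
a sketch only: pthread rwlocks stand in for the kernel rwlock_t, and reader()
and batch_free() are made-up names playing the roles of hash_page() and
pte_free_batch() respectively.

    /* Userspace sketch of the rwlock scheme, for illustration only. */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_rwlock_t pte_hash_lock = PTHREAD_RWLOCK_INITIALIZER;

    static void *reader(void *arg)
    {
            /* hash_page(): hold the lock for reading while walking the
             * page tables and using the PTE. */
            pthread_rwlock_rdlock(&pte_hash_lock);
            /* ... find_linux_pte() + __hash_page() would run here ... */
            pthread_rwlock_unlock(&pte_hash_lock);
            return NULL;
    }

    static void batch_free(void)
    {
            /* pte_free_batch(): take the lock for writing momentarily.
             * Once we get it, every reader that could still be looking
             * at the stale PTE pages has finished, so freeing is safe. */
            pthread_rwlock_wrlock(&pte_hash_lock);
            pthread_rwlock_unlock(&pte_hash_lock);
            /* ... free the batched pages here ... */
    }

    int main(void)
    {
            pthread_t t;
            pthread_create(&t, NULL, reader, NULL);
            pthread_join(t, NULL);
            batch_free();
            puts("batch freed after all readers drained");
            return 0;
    }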
I also moved to a nonatomic clearing of _PAGE_BUSY, and the PMD frees are now
managed by the freelist code as well, just as in 2.6.
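To show why the nonatomic clear is safe, here is a rough userspace model of
the _PAGE_BUSY handshake. GCC atomic builtins stand in for ldarx/stdcx., and
the function names are invented for the example: the bit is set with an
atomic reserve/conditional-store loop, and once a CPU owns it, no other
updater touches the PTE, so a write barrier plus a plain store is enough to
drop it (that is the smp_wmb() + plain clear in the patch).

    #include <stdio.h>

    #define _PAGE_BUSY 0x0200UL

    static unsigned long pte;                /* stands in for the Linux PTE */

    static int hash_page_lock(unsigned long access)
    {
            unsigned long old;
            for (;;) {
                    old = __atomic_load_n(&pte, __ATOMIC_RELAXED);
                    if (access & ~old)
                            return 0;        /* access denied: bail out */
                    if (old & _PAGE_BUSY)
                            continue;        /* someone else is updating it */
                    if (__atomic_compare_exchange_n(&pte, &old,
                                                    old | _PAGE_BUSY, 0,
                                                    __ATOMIC_ACQUIRE,
                                                    __ATOMIC_RELAXED))
                            return 1;        /* we now own _PAGE_BUSY */
            }
    }

    static void hash_page_unlock(void)
    {
            /* Nonatomic clear: a write barrier then a plain store is
             * enough, because only the _PAGE_BUSY owner may modify the
             * PTE in the meantime. */
            __atomic_thread_fence(__ATOMIC_RELEASE);   /* ~ smp_wmb() */
            pte &= ~_PAGE_BUSY;
    }

    int main(void)
    {
            pte = 0x0001;                    /* _PAGE_PRESENT */
            if (hash_page_lock(0x0001)) {
                    /* ... update the hash table and the PTE here ... */
                    hash_page_unlock();
            }
            printf("pte = %#lx\n", pte);
            return 0;
    }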
So, bottom line: this patch gives no particular performance benefit over
baseline, but it does remove the deadlock hangs.
New patch below.
-Olof
Index: arch/ppc64/kernel/htab.c
===================================================================
RCS file: /cvs/local/rhel/arch/ppc64/kernel/htab.c,v
retrieving revision 1.1.1.2
diff -p -u -r1.1.1.2 htab.c
--- arch/ppc64/kernel/htab.c 5 Sep 2003 18:57:01 -0000 1.1.1.2
+++ arch/ppc64/kernel/htab.c 7 Jan 2004 20:25:52 -0000
@@ -64,6 +64,7 @@ HTAB htab_data = {NULL, 0, 0, 0, 0};
extern unsigned long _SDR1;
extern unsigned long klimit;
+extern rwlock_t pte_hash_lock;
void make_pte(HPTE *htab, unsigned long va, unsigned long pa,
int mode, unsigned long hash_mask, int large);
@@ -320,8 +321,10 @@ int __hash_page(unsigned long ea, unsign
unsigned long va, vpn;
unsigned long newpp, prpn;
unsigned long hpteflags, lock_slot;
+ unsigned long access_ok, tmp;
long slot;
pte_t old_pte, new_pte;
+ int ret = 0;
/* Search the Linux page table for a match with va */
va = (vsid << 28) | (ea & 0x0fffffff);
@@ -337,21 +340,52 @@ int __hash_page(unsigned long ea, unsign
* Check the user's access rights to the page. If access should be
* prevented then send the problem up to do_page_fault.
*/
-#ifdef CONFIG_SHARED_MEMORY_ADDRESSING
+
+ /*
+ * Check the user's access rights to the page. If access should be
+ * prevented then send the problem up to do_page_fault.
+ */
+
access |= _PAGE_PRESENT;
- if (unlikely(access & ~(pte_val(*ptep)))) {
+
+ /* We'll do access checking and _PAGE_BUSY setting in assembly, since
+ * it needs to be atomic.
+ */
+
+	__asm__ __volatile__ ("\n\
+	1:	ldarx	%0,0,%3\n\
+		# Check access rights (access & ~(pte_val(*ptep)))\n\
+		andc.	%1,%2,%0\n\
+		bne-	2f\n\
+		# Check if PTE is busy\n\
+		andi.	%1,%0,%4\n\
+		bne-	1b\n\
+		ori	%0,%0,%4\n\
+		# Write the linux PTE atomically (setting busy)\n\
+		stdcx.	%0,0,%3\n\
+		bne-	1b\n\
+		li	%1,1\n\
+		b	3f\n\
+	2:	stdcx.	%0,0,%3	# to clear the reservation\n\
+		li	%1,0\n\
+	3:"
+	: "=&r" (old_pte), "=&r" (access_ok)
+	: "r" (access), "r" (ptep), "i" (_PAGE_BUSY)
+	: "cr0", "memory");
+
+#ifdef CONFIG_SHARED_MEMORY_ADDRESSING
+ if (unlikely(!access_ok)) {
if(!(((ea >> SMALLOC_EA_SHIFT) ==
(SMALLOC_START >> SMALLOC_EA_SHIFT)) &&
((current->thread.flags) & PPC_FLAG_SHARED))) {
- spin_unlock(&hash_table_lock[lock_slot].lock);
- return 1;
+ ret = 1;
+ goto out_unlock;
}
}
#else
- access |= _PAGE_PRESENT;
- if (unlikely(access & ~(pte_val(*ptep)))) {
- spin_unlock(&hash_table_lock[lock_slot].lock);
- return 1;
+ if (unlikely(!access_ok)) {
+ ret = 1;
+ goto out_unlock;
}
#endif
@@ -428,9 +462,14 @@ int __hash_page(unsigned long ea, unsign
*ptep = new_pte;
}
+out_unlock:
+ smp_wmb();
+
+ pte_val(*ptep) &= ~_PAGE_BUSY;
+
spin_unlock(&hash_table_lock[lock_slot].lock);
- return 0;
+ return ret;
}
/*
@@ -497,11 +536,14 @@ int hash_page(unsigned long ea, unsigned
pgdir = mm->pgd;
if (pgdir == NULL) return 1;
- /*
- * Lock the Linux page table to prevent mmap and kswapd
- * from modifying entries while we search and update
+ /* The pte_hash_lock is used to block any PTE deallocations
+ * while we walk the tree and use the entry. While technically
+ * we both read and write the PTE entry while holding the read
+ * lock, the _PAGE_BUSY bit will block pte_update()s to the
+ * specific entry.
*/
- spin_lock(&mm->page_table_lock);
+
+ read_lock(&pte_hash_lock);
ptep = find_linux_pte(pgdir, ea);
/*
@@ -514,8 +556,7 @@ int hash_page(unsigned long ea, unsigned
/* If no pte, send the problem up to do_page_fault */
ret = 1;
}
-
- spin_unlock(&mm->page_table_lock);
+ read_unlock(&pte_hash_lock);
return ret;
}
Index: arch/ppc64/mm/init.c
===================================================================
RCS file: /cvs/local/rhel/arch/ppc64/mm/init.c,v
retrieving revision 1.1.1.1
diff -p -u -r1.1.1.1 init.c
--- arch/ppc64/mm/init.c 7 Aug 2003 03:21:44 -0000 1.1.1.1
+++ arch/ppc64/mm/init.c 7 Jan 2004 20:45:14 -0000
@@ -104,9 +104,78 @@ unsigned long __max_memory;
*/
mmu_gather_t mmu_gathers[NR_CPUS];
+/* PTE free batching structures. We need a lock since not all
+ * operations take place under page_table_lock. Keep it per-CPU
+ * to avoid bottlenecks.
+ */
+
+spinlock_t pte_freelist_lock[NR_CPUS] = { [0 ... NR_CPUS-1] = SPIN_LOCK_UNLOCKED};
+struct pte_freelist_batch *pte_freelist_cur[NR_CPUS];
+rwlock_t pte_hash_lock = RW_LOCK_UNLOCKED;
+
+unsigned long pte_freelist_forced_free;
+
+static inline void pte_free_sync(void)
+{
+ unsigned long flags;
+
+ /* A sync (lock/unlock) is good enough: It will ensure that no
+ * other CPU is in hash_page, currently traversing down to a
+	 * freed pte.
+ */
+
+ write_lock_irqsave(&pte_hash_lock, flags);
+ write_unlock_irqrestore(&pte_hash_lock, flags);
+}
+
+
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+void pte_free_now(pte_t *pte)
+{
+ pte_freelist_forced_free++;
+
+ pte_free_sync();
+
+ pte_free_kernel(pte);
+}
+
+
+void pte_free_batch(struct pte_freelist_batch *batch)
+{
+ unsigned int i;
+
+ pte_free_sync();
+
+ for (i = 0; i < batch->index; i++)
+ pte_free_kernel(batch->entry[i]);
+ free_page((unsigned long)batch);
+}
+
+
int do_check_pgt_cache(int low, int high)
{
int freed = 0;
+ struct pte_freelist_batch **batchp;
+ spinlock_t *lock = &pte_freelist_lock[smp_processor_id()];
+
+ /* We use this function to push the current pte free batch to be
+ * deallocated, since do_check_pgt_cache() is called at the end of each
+	 * free_one_pgd(), and other parts of the VM rely on all PTEs being
+	 * properly freed upon return from that function.
+ */
+
+ spin_lock(lock);
+
+ batchp = &pte_freelist_cur[smp_processor_id()];
+
+ if(*batchp) {
+ pte_free_batch(*batchp);
+ *batchp = NULL;
+ }
+
+ spin_unlock(lock);
#if 0
if (pgtable_cache_size > high) {
Index: include/asm-ppc64/pgalloc.h
===================================================================
RCS file: /cvs/local/rhel/include/asm-ppc64/pgalloc.h,v
retrieving revision 1.1.1.2
diff -p -u -r1.1.1.2 pgalloc.h
--- include/asm-ppc64/pgalloc.h 26 Sep 2003 14:42:15 -0000 1.1.1.2
+++ include/asm-ppc64/pgalloc.h 7 Jan 2004 20:46:01 -0000
@@ -93,12 +93,6 @@ pte_free_kernel(pte_t *pte)
}
-static inline void
-pmd_free (pmd_t *pmd)
-{
- free_page((unsigned long)pmd);
-}
-
#define pte_alloc_one_fast(mm, address) (0)
static inline struct page *
@@ -112,7 +106,57 @@ pte_alloc_one(struct mm_struct *mm, unsi
return NULL;
}
-#define pte_free(pte_page) pte_free_kernel(page_address(pte_page))
+/* Use the PTE functions for freeing PMDs as well, since the same
+ * problem with tree traversals applies. Since pmd pointers are always
+ * virtual, no need for a page_address() translation.
+ */
+
+#define pte_free(pte_page) __pte_free(page_address(pte_page))
+#define pmd_free(pmd) __pte_free(pmd)
+
+struct pte_freelist_batch
+{
+ unsigned int index;
+ void* entry[0];
+};
+
+#define PTE_FREELIST_SIZE	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) / \
+				  sizeof(struct page *))
+
+extern void pte_free_now(pte_t *pte);
+extern void pte_free_batch(struct pte_freelist_batch *batch);
+
+extern struct pte_freelist_batch *pte_freelist_cur[];
+extern spinlock_t pte_freelist_lock[];
+
+static inline void __pte_free(pte_t *pte)
+{
+ spinlock_t *lock = &pte_freelist_lock[smp_processor_id()];
+ struct pte_freelist_batch **batchp;
+
+ spin_lock(lock);
+
+ batchp = &pte_freelist_cur[smp_processor_id()];
+
+ if (*batchp == NULL) {
+ *batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
+ if (*batchp == NULL) {
+ spin_unlock(lock);
+ pte_free_now(pte);
+ return;
+ }
+ (*batchp)->index = 0;
+ }
+
+ (*batchp)->entry[(*batchp)->index++] = pte;
+ if ((*batchp)->index == PTE_FREELIST_SIZE) {
+ pte_free_batch(*batchp);
+ *batchp = NULL;
+ }
+
+ spin_unlock(lock);
+}
+
extern int do_check_pgt_cache(int, int);
Index: include/asm-ppc64/pgtable.h
===================================================================
RCS file: /cvs/local/rhel/include/asm-ppc64/pgtable.h,v
retrieving revision 1.1.1.1
diff -p -u -r1.1.1.1 pgtable.h
--- include/asm-ppc64/pgtable.h 7 Aug 2003 03:21:59 -0000 1.1.1.1
+++ include/asm-ppc64/pgtable.h 7 Jan 2004 20:32:57 -0000
@@ -88,22 +88,22 @@
* Bits in a linux-style PTE. These match the bits in the
* (hardware-defined) PowerPC PTE as closely as possible.
*/
-#define _PAGE_PRESENT 0x001UL /* software: pte contains a translation */
-#define _PAGE_USER 0x002UL /* matches one of the PP bits */
-#define _PAGE_RW 0x004UL /* software: user write access allowed */
-#define _PAGE_GUARDED 0x008UL
-#define _PAGE_COHERENT 0x010UL /* M: enforce memory coherence (SMP systems) */
-#define _PAGE_NO_CACHE 0x020UL /* I: cache inhibit */
-#define _PAGE_WRITETHRU 0x040UL /* W: cache write-through */
-#define _PAGE_DIRTY 0x080UL /* C: page changed */
-#define _PAGE_ACCESSED 0x100UL /* R: page referenced */
-#define _PAGE_HPTENOIX 0x200UL /* software: pte HPTE slot unknown */
-#define _PAGE_HASHPTE 0x400UL /* software: pte has an associated HPTE */
-#define _PAGE_EXEC 0x800UL /* software: i-cache coherence required */
-#define _PAGE_SECONDARY 0x8000UL /* software: HPTE is in secondary group */
-#define _PAGE_GROUP_IX 0x7000UL /* software: HPTE index within group */
+#define _PAGE_PRESENT 0x0001 /* software: pte contains a translation */
+#define _PAGE_USER 0x0002 /* matches one of the PP bits */
+#define _PAGE_RW 0x0004 /* software: user write access allowed */
+#define _PAGE_GUARDED 0x0008
+#define _PAGE_COHERENT 0x0010 /* M: enforce memory coherence (SMP systems) */
+#define _PAGE_NO_CACHE 0x0020 /* I: cache inhibit */
+#define _PAGE_WRITETHRU 0x0040 /* W: cache write-through */
+#define _PAGE_DIRTY 0x0080 /* C: page changed */
+#define _PAGE_ACCESSED 0x0100 /* R: page referenced */
+#define _PAGE_BUSY 0x0200 /* software: pte & hash are busy */
+#define _PAGE_HASHPTE 0x0400 /* software: pte has an associated HPTE */
+#define _PAGE_EXEC 0x0800 /* software: i-cache coherence required */
+#define _PAGE_GROUP_IX 0x7000 /* software: HPTE index within group */
+#define _PAGE_SECONDARY 0x8000 /* software: HPTE is in secondary group */
/* Bits 0x7000 identify the index within an HPT Group */
-#define _PAGE_HPTEFLAGS (_PAGE_HASHPTE | _PAGE_HPTENOIX | _PAGE_SECONDARY | _PAGE_GROUP_IX)
+#define _PAGE_HPTEFLAGS (_PAGE_HASHPTE | _PAGE_SECONDARY | _PAGE_GROUP_IX)
/* PAGE_MASK gives the right answer below, but only by accident */
/* It should be preserving the high 48 bits and then specifically */
/* preserving _PAGE_SECONDARY | _PAGE_GROUP_IX */
@@ -289,13 +289,15 @@ static inline unsigned long pte_update(
unsigned long old, tmp;
__asm__ __volatile__("\n\
-1: ldarx %0,0,%3 \n\
+1: ldarx %0,0,%3 \n\
+ andi. %1,%0,%7 # loop on _PAGE_BUSY set\n\
+ bne- 1b \n\
andc %1,%0,%4 \n\
or %1,%1,%5 \n\
stdcx. %1,0,%3 \n\
bne- 1b"
: "=&r" (old), "=&r" (tmp), "=m" (*p)
- : "r" (p), "r" (clr), "r" (set), "m" (*p)
+ : "r" (p), "r" (clr), "r" (set), "m" (*p), "i" (_PAGE_BUSY)
: "cc" );
return old;
}
** Sent via the linuxppc64-dev mail list. See http://lists.linuxppc.org/