[PATCH] [2.4] [RHEL] Backport of benh's PTE mgmt changes

olof at austin.ibm.com olof at austin.ibm.com
Thu Jan 8 08:43:22 EST 2004


On Tue, 6 Jan 2004 olof at forte.austin.ibm.com wrote:

> I'll try to see how visible it is with workloads. I'm not sure how
> synchronization could be achieved without either RCU or IPI support, so
> hopefully it won't be a big hit.

No visible impact on SPECweb, as far as I can tell. SDET didn't show much
of a difference either, though that's hard to judge since its numbers vary
quite a bit between runs.

But anyway: the solution is almost too obvious: an rwlock, with hash_page
taking it for reading, and pte_freelist_batch taking it for writing
momentarily to synchronize with the readers. This way, a batch free only
has to wait for all in-flight hash_page()s to complete, and no IPI is
needed. The hash_page code is largely unchanged compared to the old
page_table_lock spin_locks.
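In outline, the locking looks like this (only a sketch of the idea, with
illustrative function names; the real code is in the diff below):

	#include <linux/spinlock.h>

	rwlock_t pte_hash_lock = RW_LOCK_UNLOCKED;

	/* Reader side (hash_page): held across the Linux page table walk so
	 * that no PTE page can be freed out from under us while we use it.
	 */
	static void walker_side(void)
	{
		read_lock(&pte_hash_lock);
		/* find_linux_pte() and the HPTE update happen here */
		read_unlock(&pte_hash_lock);
	}

	/* Writer side (batch free): take and drop the write lock purely as a
	 * barrier.  Once we have held it, every hash_page() that could still
	 * see the queued PTE pages has finished, so the batch can be freed
	 * without sending any IPIs.
	 */
	static void batch_free_side(void)
	{
		unsigned long flags;

		write_lock_irqsave(&pte_hash_lock, flags);
		write_unlock_irqrestore(&pte_hash_lock, flags);
		/* now safe to pte_free_kernel() everything in the batch */
	}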

I also moved to a nonatomic clearing of _PAGE_BUSY, and the PMD frees are
now handled through the same freelist batching, just as in 2.6.
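
The release half of the _PAGE_BUSY protocol then reduces to two lines (this
is the out_unlock path in the htab.c hunk below; the set half is done
atomically with ldarx/stdcx. in assembly):

	/* Make the PTE/HPTE updates visible before releasing the bit, then
	 * clear it with a plain store.  Nonatomic is fine here: only the CPU
	 * that set _PAGE_BUSY can clear it, and pte_update() spins while the
	 * bit is set, so nobody else can touch the PTE underneath us.
	 */
	smp_wmb();
	pte_val(*ptep) &= ~_PAGE_BUSY;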

So, bottom line: this patch gives no particular performance benefit over
baseline, but it does remove the deadlock hangs.

New patch below.

-Olof


Index: arch/ppc64/kernel/htab.c
===================================================================
RCS file: /cvs/local/rhel/arch/ppc64/kernel/htab.c,v
retrieving revision 1.1.1.2
diff -p -u -r1.1.1.2 htab.c
--- arch/ppc64/kernel/htab.c	5 Sep 2003 18:57:01 -0000	1.1.1.2
+++ arch/ppc64/kernel/htab.c	7 Jan 2004 20:25:52 -0000
@@ -64,6 +64,7 @@ HTAB htab_data = {NULL, 0, 0, 0, 0};

 extern unsigned long _SDR1;
 extern unsigned long klimit;
+extern rwlock_t pte_hash_lock;

 void make_pte(HPTE *htab, unsigned long va, unsigned long pa,
 	      int mode, unsigned long hash_mask, int large);
@@ -320,8 +321,10 @@ int __hash_page(unsigned long ea, unsign
 	unsigned long va, vpn;
 	unsigned long newpp, prpn;
 	unsigned long hpteflags, lock_slot;
+	unsigned long access_ok, tmp;
 	long slot;
 	pte_t old_pte, new_pte;
+	int ret = 0;

 	/* Search the Linux page table for a match with va */
 	va = (vsid << 28) | (ea & 0x0fffffff);
@@ -337,21 +340,52 @@ int __hash_page(unsigned long ea, unsign
 	 * Check the user's access rights to the page.  If access should be
 	 * prevented then send the problem up to do_page_fault.
 	 */
-#ifdef CONFIG_SHARED_MEMORY_ADDRESSING
+
+	/*
+	 * Check the user's access rights to the page.  If access should be
+	 * prevented then send the problem up to do_page_fault.
+	 */
+
 	access |= _PAGE_PRESENT;
-	if (unlikely(access & ~(pte_val(*ptep)))) {
+
+	/* We'll do access checking and _PAGE_BUSY setting in assembly, since
+	 * it needs to be atomic.
+	 */
+
+	__asm__ __volatile__ ("\n\
+	1:	ldarx	%0,0,%3\n\
+		# Check access rights (access & ~(pte_val(*ptep)))\n\
+		andc.	%1,%2,%0\n\
+		bne-	2f\n\
+		# Check if PTE is busy\n\
+		andi.	%1,%0,%4\n\
+		bne-	1b\n\
+		ori	%0,%0,%4\n\
+		# Write the linux PTE atomically (setting busy)\n\
+		stdcx.	%0,0,%3\n\
+		bne-	1b\n\
+		li	%1,1\n\
+		b	3f\n\
+	2:	stdcx.	%0,0,%3	# to clear the reservation\n\
+		li	%1,0\n\
+	3:"
+	: "=&r" (old_pte), "=&r" (access_ok)
+	: "r" (access), "r" (ptep), "i" (_PAGE_BUSY)
+	: "cr0", "memory");
+
+#ifdef CONFIG_SHARED_MEMORY_ADDRESSING
+	if (unlikely(!access_ok)) {
 		if(!(((ea >> SMALLOC_EA_SHIFT) ==
 		      (SMALLOC_START >> SMALLOC_EA_SHIFT)) &&
 		     ((current->thread.flags) & PPC_FLAG_SHARED))) {
-			spin_unlock(&hash_table_lock[lock_slot].lock);
-			return 1;
+			ret = 1;
+			goto out_unlock;
 		}
 	}
 #else
-	access |= _PAGE_PRESENT;
-	if (unlikely(access & ~(pte_val(*ptep)))) {
-		spin_unlock(&hash_table_lock[lock_slot].lock);
-		return 1;
+	if (unlikely(!access_ok)) {
+		ret = 1;
+		goto out_unlock;
 	}
 #endif

@@ -428,9 +462,14 @@ int __hash_page(unsigned long ea, unsign
 		*ptep = new_pte;
 	}

+out_unlock:
+	smp_wmb();
+
+	pte_val(*ptep) &= ~_PAGE_BUSY;
+
 	spin_unlock(&hash_table_lock[lock_slot].lock);

-	return 0;
+	return ret;
 }

 /*
@@ -497,11 +536,14 @@ int hash_page(unsigned long ea, unsigned
 	pgdir = mm->pgd;
 	if (pgdir == NULL) return 1;

-	/*
-	 * Lock the Linux page table to prevent mmap and kswapd
-	 * from modifying entries while we search and update
+	/* The pte_hash_lock is used to block any PTE deallocations
+	 * while we walk the tree and use the entry. While technically
+	 * we both read and write the PTE entry while holding the read
+	 * lock, the _PAGE_BUSY bit will block pte_update()s to the
+	 * specific entry.
 	 */
-	spin_lock(&mm->page_table_lock);
+
+	read_lock(&pte_hash_lock);

 	ptep = find_linux_pte(pgdir, ea);
 	/*
@@ -514,8 +556,7 @@ int hash_page(unsigned long ea, unsigned
 		/* If no pte, send the problem up to do_page_fault */
 		ret = 1;
 	}
-
-	spin_unlock(&mm->page_table_lock);
+	read_unlock(&pte_hash_lock);

 	return ret;
 }
Index: arch/ppc64/mm/init.c
===================================================================
RCS file: /cvs/local/rhel/arch/ppc64/mm/init.c,v
retrieving revision 1.1.1.1
diff -p -u -r1.1.1.1 init.c
--- arch/ppc64/mm/init.c	7 Aug 2003 03:21:44 -0000	1.1.1.1
+++ arch/ppc64/mm/init.c	7 Jan 2004 20:45:14 -0000
@@ -104,9 +104,78 @@ unsigned long __max_memory;
  */
 mmu_gather_t     mmu_gathers[NR_CPUS];

+/* PTE free batching structures. We need a lock since not all
+ * operations take place under page_table_lock. Keep it per-CPU
+ * to avoid bottlenecks.
+ */
+
+spinlock_t pte_freelist_lock[NR_CPUS] = { [0 ... NR_CPUS-1] = SPIN_LOCK_UNLOCKED};
+struct pte_freelist_batch *pte_freelist_cur[NR_CPUS];
+rwlock_t pte_hash_lock = RW_LOCK_UNLOCKED;
+
+unsigned long pte_freelist_forced_free;
+
+static inline void pte_free_sync(void)
+{
+	unsigned long flags;
+
+	/* A sync (lock/unlock) is good enough: It will ensure that no
+	 * other CPU is in hash_page, currently traversing down to a
+	 * free'd pte.
+	 */
+
+	write_lock_irqsave(&pte_hash_lock, flags);
+	write_unlock_irqrestore(&pte_hash_lock, flags);
+}
+
+
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+void pte_free_now(pte_t *pte)
+{
+	pte_freelist_forced_free++;
+
+	pte_free_sync();
+
+	pte_free_kernel(pte);
+}
+
+
+void pte_free_batch(struct pte_freelist_batch *batch)
+{
+	unsigned int i;
+
+	pte_free_sync();
+
+	for (i = 0; i < batch->index; i++)
+		pte_free_kernel(batch->entry[i]);
+	free_page((unsigned long)batch);
+}
+
+
 int do_check_pgt_cache(int low, int high)
 {
 	int freed = 0;
+	struct pte_freelist_batch **batchp;
+	spinlock_t *lock = &pte_freelist_lock[smp_processor_id()];
+
+	/* We use this function to push the current pte free batch to be
+	 * deallocated, since do_check_pgt_cache() is called at the end of each
+	 * free_one_pgd(), and other parts of the VM rely on all PTEs being
+	 * properly freed upon return from that function.
+	 */
+
+	spin_lock(lock);
+
+	batchp = &pte_freelist_cur[smp_processor_id()];
+
+	if(*batchp) {
+		pte_free_batch(*batchp);
+		*batchp = NULL;
+	}
+
+	spin_unlock(lock);

 #if 0
 	if (pgtable_cache_size > high) {
Index: include/asm-ppc64/pgalloc.h
===================================================================
RCS file: /cvs/local/rhel/include/asm-ppc64/pgalloc.h,v
retrieving revision 1.1.1.2
diff -p -u -r1.1.1.2 pgalloc.h
--- include/asm-ppc64/pgalloc.h	26 Sep 2003 14:42:15 -0000	1.1.1.2
+++ include/asm-ppc64/pgalloc.h	7 Jan 2004 20:46:01 -0000
@@ -93,12 +93,6 @@ pte_free_kernel(pte_t *pte)
 }


-static inline void
-pmd_free (pmd_t *pmd)
-{
-        free_page((unsigned long)pmd);
-}
-
 #define pte_alloc_one_fast(mm, address)		(0)

 static inline struct page *
@@ -112,7 +106,57 @@ pte_alloc_one(struct mm_struct *mm, unsi
         return NULL;
 }

-#define pte_free(pte_page)      pte_free_kernel(page_address(pte_page))
+/* Use the PTE free functions for freeing PMDs as well, since the same
+ * problem with tree traversals applies. Since pmd pointers are always
+ * virtual, no need for a page_address() translation.
+ */
+
+#define pte_free(pte_page)      __pte_free(page_address(pte_page))
+#define pmd_free(pmd)           __pte_free(pmd)
+
+struct pte_freelist_batch
+{
+	unsigned int	index;
+	void*		entry[0];
+};
+
+#define PTE_FREELIST_SIZE	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) / \
+				  sizeof(struct page *))
+
+extern void pte_free_now(pte_t *pte);
+extern void pte_free_batch(struct pte_freelist_batch *batch);
+
+extern struct pte_freelist_batch *pte_freelist_cur[];
+extern spinlock_t pte_freelist_lock[];
+
+static inline void __pte_free(pte_t *pte)
+{
+	spinlock_t *lock = &pte_freelist_lock[smp_processor_id()];
+	struct pte_freelist_batch **batchp;
+
+	spin_lock(lock);
+
+	batchp = &pte_freelist_cur[smp_processor_id()];
+
+	if (*batchp == NULL) {
+		*batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
+		if (*batchp == NULL) {
+			spin_unlock(lock);
+			pte_free_now(pte);
+			return;
+		}
+		(*batchp)->index = 0;
+	}
+
+	(*batchp)->entry[(*batchp)->index++] = pte;
+	if ((*batchp)->index == PTE_FREELIST_SIZE) {
+		pte_free_batch(*batchp);
+		*batchp = NULL;
+	}
+
+	spin_unlock(lock);
+}
+

 extern int do_check_pgt_cache(int, int);

Index: include/asm-ppc64/pgtable.h
===================================================================
RCS file: /cvs/local/rhel/include/asm-ppc64/pgtable.h,v
retrieving revision 1.1.1.1
diff -p -u -r1.1.1.1 pgtable.h
--- include/asm-ppc64/pgtable.h	7 Aug 2003 03:21:59 -0000	1.1.1.1
+++ include/asm-ppc64/pgtable.h	7 Jan 2004 20:32:57 -0000
@@ -88,22 +88,22 @@
  * Bits in a linux-style PTE.  These match the bits in the
  * (hardware-defined) PowerPC PTE as closely as possible.
  */
-#define _PAGE_PRESENT	0x001UL	/* software: pte contains a translation */
-#define _PAGE_USER	0x002UL	/* matches one of the PP bits */
-#define _PAGE_RW	0x004UL	/* software: user write access allowed */
-#define _PAGE_GUARDED	0x008UL
-#define _PAGE_COHERENT	0x010UL	/* M: enforce memory coherence (SMP systems) */
-#define _PAGE_NO_CACHE	0x020UL	/* I: cache inhibit */
-#define _PAGE_WRITETHRU	0x040UL	/* W: cache write-through */
-#define _PAGE_DIRTY	0x080UL	/* C: page changed */
-#define _PAGE_ACCESSED	0x100UL	/* R: page referenced */
-#define _PAGE_HPTENOIX	0x200UL /* software: pte HPTE slot unknown */
-#define _PAGE_HASHPTE	0x400UL	/* software: pte has an associated HPTE */
-#define _PAGE_EXEC	0x800UL	/* software: i-cache coherence required */
-#define _PAGE_SECONDARY 0x8000UL /* software: HPTE is in secondary group */
-#define _PAGE_GROUP_IX  0x7000UL /* software: HPTE index within group */
+#define _PAGE_PRESENT	0x0001 /* software: pte contains a translation */
+#define _PAGE_USER	0x0002 /* matches one of the PP bits */
+#define _PAGE_RW	0x0004 /* software: user write access allowed */
+#define _PAGE_GUARDED	0x0008
+#define _PAGE_COHERENT	0x0010 /* M: enforce memory coherence (SMP systems) */
+#define _PAGE_NO_CACHE	0x0020 /* I: cache inhibit */
+#define _PAGE_WRITETHRU	0x0040 /* W: cache write-through */
+#define _PAGE_DIRTY	0x0080 /* C: page changed */
+#define _PAGE_ACCESSED	0x0100 /* R: page referenced */
+#define _PAGE_BUSY	0x0200 /* software: pte & hash are busy */
+#define _PAGE_HASHPTE	0x0400 /* software: pte has an associated HPTE */
+#define _PAGE_EXEC	0x0800 /* software: i-cache coherence required */
+#define _PAGE_GROUP_IX  0x7000 /* software: HPTE index within group */
+#define _PAGE_SECONDARY 0x8000 /* software: HPTE is in secondary group */
 /* Bits 0x7000 identify the index within an HPT Group */
-#define _PAGE_HPTEFLAGS (_PAGE_HASHPTE | _PAGE_HPTENOIX | _PAGE_SECONDARY | _PAGE_GROUP_IX)
+#define _PAGE_HPTEFLAGS (_PAGE_HASHPTE | _PAGE_SECONDARY | _PAGE_GROUP_IX)
 /* PAGE_MASK gives the right answer below, but only by accident */
 /* It should be preserving the high 48 bits and then specifically */
 /* preserving _PAGE_SECONDARY | _PAGE_GROUP_IX */
@@ -289,13 +289,15 @@ static inline unsigned long pte_update(
 	unsigned long old, tmp;

 	__asm__ __volatile__("\n\
-1:	ldarx	%0,0,%3	\n\
+1:	ldarx	%0,0,%3 \n\
+	andi.	%1,%0,%7 # loop on _PAGE_BUSY set\n\
+	bne-	1b \n\
 	andc	%1,%0,%4 \n\
 	or	%1,%1,%5 \n\
 	stdcx.	%1,0,%3 \n\
 	bne-	1b"
 	: "=&r" (old), "=&r" (tmp), "=m" (*p)
-	: "r" (p), "r" (clr), "r" (set), "m" (*p)
+	: "r" (p), "r" (clr), "r" (set), "m" (*p), "i" (_PAGE_BUSY)
 	: "cc" );
 	return old;
 }

