[PATCH] [2.4] Remove page_table_lock in hash_page for kernel faults

olof at austin.ibm.com olof at austin.ibm.com
Wed Sep 24 07:09:46 EST 2003


The attached patch stops hash_page() from taking the page_table_lock for
faults in the vmalloc and io-mapped regions.

We've seen cases where someone calls vmalloc(), which takes the
page_table_lock without disabling interrupts; an interrupt comes in, and
the interrupt code faults -> hash_page() -> deadlock on the same lock.
It seems to be more common when the driver is a module (i.e. it lives in
the vmalloc region).
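
To see the shape of the problem outside the kernel, here is a contrived
userspace sketch (my own toy example, not kernel code): a signal handler
stands in for the interrupt and tries to take a lock the interrupted code
already holds, so it spins forever. Taking a lock in a signal handler is
exactly the kind of thing you must not do, which is the point. Build with
gcc -pthread; it hangs by design.

#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

/* Stand-in for the page_table_lock. */
static pthread_spinlock_t page_table_lock;

/* Stand-in for the interrupt path faulting into hash_page(). */
static void fake_interrupt(int sig)
{
	(void)sig;
	pthread_spin_lock(&page_table_lock);	/* already held: spins forever */
	pthread_spin_unlock(&page_table_lock);
}

int main(void)
{
	pthread_spin_init(&page_table_lock, PTHREAD_PROCESS_PRIVATE);
	signal(SIGALRM, fake_interrupt);

	/* Stand-in for vmalloc(): takes the lock without blocking "interrupts" */
	pthread_spin_lock(&page_table_lock);
	alarm(1);	/* the "interrupt" arrives while the lock is held */
	pause();	/* SIGALRM fires here; the handler never returns */
	pthread_spin_unlock(&page_table_lock);

	printf("never reached\n");
	return 0;
}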

The tricky part is making the update atomic. I essentially copied Paul
Mackerras's 32-bit code, where it checks access and sets the
ACCESSED/DIRTY bits. ppc(32) has it all in assembler; I kept most of it
in C to keep the changes smaller. Rewriting selected parts into larger
asm blocks is a todo item for the future (it'd shorten the path length a
bit).
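
For reference, here is the shape of that read/check/conditional-store
loop as a portable userspace C sketch (C11 atomics standing in for
ldarx/stdcx.; the flag values are made up for illustration and are not
the real ppc64 PTE bits):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define PTE_PRESENT	0x001UL
#define PTE_RW		0x002UL
#define PTE_DIRTY	0x080UL
#define PTE_ACCESSED	0x100UL

/*
 * Load the entry, verify the access is allowed, then publish the updated
 * entry only if nobody changed it in the meantime; otherwise retry.
 * Returns 0 on success, 1 if the fault should go up to do_page_fault().
 */
static int update_pte_atomically(_Atomic uint64_t *ptep, uint64_t access)
{
	uint64_t old, new;
	uint64_t set_bits = (access & PTE_RW) ? (PTE_ACCESSED | PTE_DIRTY)
					      : PTE_ACCESSED;

	access |= PTE_PRESENT;
	old = atomic_load(ptep);
	do {
		if (access & ~old)	/* insufficient permissions */
			return 1;
		new = old | set_bits;
		/* On failure 'old' is reloaded and we go around again,
		   much like retrying after a lost stdcx. reservation. */
	} while (!atomic_compare_exchange_weak(ptep, &old, new));
	return 0;
}

int main(void)
{
	_Atomic uint64_t pte = PTE_PRESENT | PTE_RW;

	printf("refused=%d pte=%#llx\n",
	       update_pte_atomically(&pte, PTE_RW),
	       (unsigned long long)atomic_load(&pte));
	return 0;
}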

We've given this patch a few days of testing on one of the large SPECweb
servers, where we'd otherwise see the deadlock every now and then, and we
haven't seen any problems with it. But more eyes on the code couldn't
hurt.

Comments/questions anyone?


Thanks,

-Olof

Olof Johansson                                        Office: 4E002/905
pSeries Linux Development                             IBM Systems Group
Email: olof at austin.ibm.com                          Phone: 512-838-9858
All opinions are my own and not those of IBM
-------------- next part --------------
===== arch/ppc64/kernel/htab.c 1.8 vs edited =====
--- 1.8/arch/ppc64/kernel/htab.c	Mon Aug 25 23:47:44 2003
+++ edited/arch/ppc64/kernel/htab.c	Tue Sep 23 13:46:05 2003
@@ -310,6 +310,25 @@
 		  1));
 }
 
+#define READ_PTE(addr, ret) __asm__ __volatile__	\
+		("ldarx %0,0,%1"			\
+		: "=r"(ret)				\
+		: "r"(addr)				\
+		: "memory")
+
+/* Returns 0 if store failed due to lost reservation */
+
+#define WRITE_PTE(addr, val) ({				\
+	long r;						\
+	__asm__ __volatile__ ("				\
+		stdcx.	%1,0,%2\n			\
+		mfcr	%0\n				\
+		rlwinm	%0,%0,3,31,31"			\
+		: "=r" (r)				\
+		: "r"(val), "r"(addr)			\
+		: "cc", "memory");			\
+	r; })
+
 /*
  * Handle a fault by adding an HPTE. If the address can't be determined
  * to be valid via Linux page tables, return 1. If handled return 0
@@ -321,7 +340,9 @@
 	unsigned long newpp, prpn;
 	unsigned long hpteflags, lock_slot;
 	long slot;
-	pte_t old_pte, new_pte;
+	pte_t pte;
+	long set_bits, clear_bits;
+	int shared_mem_area = 0;
 
 	/* Search the Linux page table for a match with va */
 	va = (vsid << 28) | (ea & 0x0fffffff);
@@ -333,26 +354,10 @@
 	 */
 	spin_lock(&hash_table_lock[lock_slot].lock);
 	
-	/* 
-	 * Check the user's access rights to the page.  If access should be
-	 * prevented then send the problem up to do_page_fault.
-	 */
 #ifdef CONFIG_SHARED_MEMORY_ADDRESSING
-	access |= _PAGE_PRESENT;
-	if (unlikely(access & ~(pte_val(*ptep)))) {
-		if(!(((ea >> SMALLOC_EA_SHIFT) == 
+	shared_mem_area = (((ea >> SMALLOC_EA_SHIFT) == 
 		      (SMALLOC_START >> SMALLOC_EA_SHIFT)) &&
-		     ((current->thread.flags) & PPC_FLAG_SHARED))) {
-			spin_unlock(&hash_table_lock[lock_slot].lock);
-			return 1;
-		}
-	}
-#else
-	access |= _PAGE_PRESENT;
-	if (unlikely(access & ~(pte_val(*ptep)))) {
-		spin_unlock(&hash_table_lock[lock_slot].lock);
-		return 1;
-	}
+				((current->thread.flags) & PPC_FLAG_SHARED));
 #endif
 
 	/* 
@@ -360,12 +365,44 @@
 	 * The spinlocks prevent this status from changing
 	 * The hash_table_lock prevents the _PAGE_HASHPTE status
 	 * from changing (RPN, DIRTY and ACCESSED too)
-	 * The page_table_lock prevents the pte from being 
-	 * invalidated or modified
+	 *
+	 * For VMALLOC/IO regions the page_table_lock is not held when
+	 * we're executing here so all PTE updates must be atomic.
 	 */
 
+	set_bits = 0;
+	clear_bits = 0;
+
+	/* _PAGE_RW is set for store accesses */
+	if (access & _PAGE_RW)
+		set_bits |= _PAGE_ACCESSED|_PAGE_DIRTY;
+	else
+		set_bits |= _PAGE_ACCESSED;
+
+	access |= _PAGE_PRESENT;
+
+retry:
+
+	READ_PTE(ptep, pte);
+	/* 
+	 * Check the user's access rights to the page.  If access should be
+	 * prevented then send the problem up to do_page_fault.
+	 */
+
+	if (unlikely(access & ~(pte_val(pte)))) {
+		if (!shared_mem_area) {
+			spin_unlock(&hash_table_lock[lock_slot].lock);
+			return 1;
+		}
+	}
+
+	/* Update the entry. If it's been modified we need to start over. */
+
+	if (!WRITE_PTE(ptep, pte_val(pte) | set_bits))
+		goto retry;
+
 	/*
-	 * At this point, we have a pte (old_pte) which can be used to build
+	 * At this point, we have a pte (pte) which can be used to build
 	 * or update an HPTE. There are 2 cases:
 	 *
 	 * 1. There is a valid (present) pte with no associated HPTE (this is 
@@ -376,56 +413,42 @@
 	 *	page is currently not DIRTY. 
 	 */
 
-	old_pte = *ptep;
-	new_pte = old_pte;
-
-	/* If the attempted access was a store */
-	if (access & _PAGE_RW)
-		pte_val(new_pte) |= _PAGE_ACCESSED | _PAGE_DIRTY;
-	else
-		pte_val(new_pte) |= _PAGE_ACCESSED;
-
-	newpp = computeHptePP(pte_val(new_pte));
+	newpp = computeHptePP(pte_val(pte) | set_bits);
 	
 	/* Check if pte already has an hpte (case 2) */
-	if (unlikely(pte_val(old_pte) & _PAGE_HASHPTE)) {
+	if (unlikely(pte_val(pte) & _PAGE_HASHPTE)) {
 		/* There MIGHT be an HPTE for this pte */
-		unsigned long hash, slot, secondary;
+		unsigned long hash, secondary;
 
 		/* XXX fix large pte flag */
 		hash = hpt_hash(vpn, 0);
-		secondary = (pte_val(old_pte) & _PAGE_SECONDARY) >> 15;
+		secondary = (pte_val(pte) & _PAGE_SECONDARY) >> 15;
 		if (secondary)
 			hash = ~hash;
 		slot = (hash & htab_data.htab_hash_mask) * HPTES_PER_GROUP;
-		slot += (pte_val(old_pte) & _PAGE_GROUP_IX) >> 12;
+		slot += (pte_val(pte) & _PAGE_GROUP_IX) >> 12;
 
 		/* XXX fix large pte flag */
 		if (ppc_md.hpte_updatepp(slot, secondary, 
 					 newpp, va, 0) == -1) {
-			pte_val(old_pte) &= ~_PAGE_HPTEFLAGS;
-		} else {
-			if (!pte_same(old_pte, new_pte)) {
-				*ptep = new_pte;
-			}
+			pte_val(pte) &= ~_PAGE_HPTEFLAGS;
 		}
 	}
 
-	if (likely(!(pte_val(old_pte) & _PAGE_HASHPTE))) {
+	if (likely(!(pte_val(pte) & _PAGE_HASHPTE))) {
 		/* Update the linux pte with the HPTE slot */
-		pte_val(new_pte) &= ~_PAGE_HPTEFLAGS;
-		pte_val(new_pte) |= _PAGE_HASHPTE;
-		prpn = pte_val(old_pte) >> PTE_SHIFT;
+		clear_bits |= _PAGE_HPTEFLAGS;
+		set_bits |= _PAGE_HASHPTE;
+		prpn = pte_val(pte) >> PTE_SHIFT;
 
 		/* copy appropriate flags from linux pte */
-		hpteflags = (pte_val(new_pte) & 0x1f8) | newpp;
+		hpteflags = (((pte_val(pte) & ~clear_bits) | set_bits) & 0x1f8) | newpp;
 
 		slot = ppc_md.hpte_insert(vpn, prpn, hpteflags, 0, 0);
 
-		pte_val(new_pte) |= ((slot<<12) & 
-				     (_PAGE_GROUP_IX | _PAGE_SECONDARY));
+		set_bits |= ((slot<<12) & (_PAGE_GROUP_IX | _PAGE_SECONDARY));
 
-		*ptep = new_pte;
+		pte_update(ptep, clear_bits, set_bits);
 	}
 
 	spin_unlock(&hash_table_lock[lock_slot].lock);
@@ -444,6 +467,7 @@
 	struct mm_struct *mm;
 	pte_t *ptep;
 	int ret;
+	spinlock_t *lock = NULL;
 
 	/* Check for invalid addresses. */
 	if (!IS_VALID_EA(ea)) return 1;
@@ -451,15 +475,18 @@
  	switch (REGION_ID(ea)) {
 	case USER_REGION_ID:
 		mm = current->mm;
 		if (mm == NULL) return 1;
+		lock = &mm->page_table_lock;
 		vsid = get_vsid(mm->context, ea);
 		break;
 	case IO_REGION_ID:
 		mm = &ioremap_mm;
+		/* no locking for IO regions */
 		vsid = get_kernel_vsid(ea);
 		break;
 	case VMALLOC_REGION_ID:
 		mm = &init_mm;
+		/* no locking for VMALLOC regions */
 		vsid = get_kernel_vsid(ea);
 #ifdef CONFIG_SHARED_MEMORY_ADDRESSING
                 /*
@@ -501,7 +528,8 @@
 	 * Lock the Linux page table to prevent mmap and kswapd
 	 * from modifying entries while we search and update
 	 */
-	spin_lock(&mm->page_table_lock);
+	if (lock)
+		spin_lock(lock);
 
 	ptep = find_linux_pte(pgdir, ea);
 	/*
@@ -515,7 +543,8 @@
 		ret = 1;
 	}
 
-	spin_unlock(&mm->page_table_lock);
+	if (lock)
+		spin_unlock(lock);
 
 	return ret;
 }

