[PATCH] Fix race between pte_free and hash_page

Benjamin Herrenschmidt benh at kernel.crashing.org
Fri Dec 12 17:12:03 EST 2003


Hi !

As discussed earlier with Olof, there is (and has always been, afaik)
a race between freeing a PTE page and dereferencing PTEs in that page
from hash_page on another CPU. It can typically happen with a threaded
application when one thread unmaps something that another thread is
still using.
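
For what it's worth, the kind of userspace pattern that exposes the
window looks roughly like the sketch below (an illustrative trigger
only, not part of the patch; the toucher thread is expected to take a
SIGSEGV once the unmap wins, and actually hitting the window is timing
dependent -- the point is simply that one CPU can be inside hash_page()
walking the shared page tables while the other CPU is tearing them
down):

#include <pthread.h>
#include <sys/mman.h>
#include <unistd.h>

#define LEN	(16 * 1024 * 1024)

static char *region;

static void *toucher(void *arg)
{
	int i;

	/* Keep faulting pages in; on ppc64 each fault goes through
	 * hash_page(), which dereferences PTEs in the shared page tables. */
	for (i = 0; i < LEN; i += getpagesize())
		region[i] = 1;
	return NULL;
}

int main(void)
{
	pthread_t t;

	region = mmap(NULL, LEN, PROT_READ | PROT_WRITE,
		      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (region == MAP_FAILED)
		return 1;

	pthread_create(&t, NULL, toucher, NULL);
	/* Unmap while the sibling thread is still faulting the region in:
	 * this is the pte_free vs. hash_page window described above. */
	munmap(region, LEN);
	pthread_join(&t, NULL);
	return 0;
}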

After much thinking, at least 2 different implementations, and some
wise advice from Rusty, here is an implementation that should be both
fast and scalable. The idea is to defer the freeing until a point
where we know the PTE page can no longer be referenced: since the PMD
has already been cleared, that is any point by which all hash_page
invocations that were in flight at clear time have completed. This is
exactly what RCU gives us, so let's use it.
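
To make that concrete, here is a stripped-down, non-batched sketch of
the deferral (illustrative names, not part of the patch; the real code
below batches pages per CPU to amortise the call_rcu() cost and falls
back to an IPI-synchronised immediate free when it cannot allocate a
batch). It uses the same 3-argument call_rcu() and INIT_RCU_HEAD()
interfaces as the patch itself:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <asm/pgalloc.h>

struct pte_page_deferred {
	struct rcu_head rcu;
	struct page *ptepage;
};

static void pte_page_free_rcu(void *arg)
{
	struct pte_page_deferred *d = arg;

	/* A full grace period has elapsed since the PMD was cleared, so
	 * no CPU can still be inside hash_page() walking this PTE page. */
	pte_free(d->ptepage);
	kfree(d);
}

/* Caller has already cleared the PMD entry pointing to ptepage. */
static void pte_page_free_deferred(struct page *ptepage)
{
	struct pte_page_deferred *d = kmalloc(sizeof(*d), GFP_ATOMIC);

	if (!d)
		return;	/* sketch: give up; the patch instead syncs and frees now */

	d->ptepage = ptepage;
	INIT_RCU_HEAD(&d->rcu);
	call_rcu(&d->rcu, pte_page_free_rcu, d);
}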

Unless I missed something, this should probably be applied to
ameslab-2.5 now.

===== include/asm/pgalloc.h 1.11 vs edited =====
--- 1.11/include/asm-ppc64/pgalloc.h	Fri Sep 19 16:55:11 2003
+++ edited/include/asm/pgalloc.h	Fri Dec 12 17:07:17 2003
@@ -3,7 +3,10 @@

 #include <linux/mm.h>
 #include <linux/slab.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
 #include <asm/processor.h>
+#include <asm/tlb.h>

 extern kmem_cache_t *zero_cache;

@@ -62,15 +65,54 @@

 	return NULL;
 }
-
-static inline void
-pte_free_kernel(pte_t *pte)
+
+static inline void pte_free_kernel(pte_t *pte)
 {
 	kmem_cache_free(zero_cache, pte);
 }

 #define pte_free(pte_page)	pte_free_kernel(page_address(pte_page))
-#define __pte_free_tlb(tlb, pte)	pte_free(pte)
+
+struct pte_freelist_batch
+{
+	struct rcu_head	rcu;
+	unsigned int	index;
+	struct page *	pages[0];
+};
+
+#define PTE_FREELIST_SIZE	((PAGE_SIZE - sizeof(struct pte_freelist_batch)) / \
+				  sizeof(struct page *))
+
+extern void pte_free_now(struct page *ptepage);
+extern void pte_free_submit(struct pte_freelist_batch *batch);
+
+DECLARE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
+
+static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *ptepage)
+{
+	/* This is safe as we are holding page_table_lock */
+	cpumask_t local_cpumask = cpumask_of_cpu(smp_processor_id());
+	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+
+	if (cpus_equal(tlb->mm->cpu_vm_mask, local_cpumask) ||
+	    cpus_equal(tlb->mm->cpu_vm_mask, CPU_MASK_NONE)) {
+		pte_free(ptepage);
+		return;
+	}
+
+	if (*batchp == NULL) {
+		*batchp = (struct pte_freelist_batch *)__get_free_page(GFP_ATOMIC);
+		if (*batchp == NULL) {
+			pte_free_now(ptepage);
+			return;
+		}
+	}
+	(*batchp)->pages[(*batchp)->index++] = ptepage;
+	if ((*batchp)->index == PTE_FREELIST_SIZE) {
+		pte_free_submit(*batchp);
+		*batchp = NULL;
+	}
+}

 #define check_pgt_cache()	do { } while (0)

===== include/asm/tlb.h 1.9 vs edited =====
--- 1.9/include/asm-ppc64/tlb.h	Tue Aug 19 12:46:23 2003
+++ edited/include/asm/tlb.h	Fri Dec 12 13:48:28 2003
@@ -74,6 +74,8 @@
 	batch->index = i;
 }

+extern void pte_free_finish(void);
+
 static inline void tlb_flush(struct mmu_gather *tlb)
 {
 	int cpu = smp_processor_id();
@@ -86,6 +88,8 @@

 	flush_hash_range(tlb->mm->context, batch->index, local);
 	batch->index = 0;
+
+	pte_free_finish();
 }

 #endif /* _PPC64_TLB_H */
===== arch/ppc64/mm/init.c 1.52 vs edited =====
--- 1.52/arch/ppc64/mm/init.c	Fri Oct 24 00:10:29 2003
+++ edited/arch/ppc64/mm/init.c	Fri Dec 12 17:09:58 2003
@@ -94,6 +94,52 @@
  * include/asm-ppc64/tlb.h file -- tgall
  */
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+DEFINE_PER_CPU(struct pte_freelist_batch *, pte_freelist_cur);
+unsigned long pte_freelist_forced_free;
+
+static void pte_free_smp_sync(void *arg)
+{
+	/* Do nothing, just ensure we sync with all CPUs */
+}
+
+/* This is only called when we are critically out of memory
+ * (and fail to get a page in pte_free_tlb).
+ */
+void pte_free_now(struct page *ptepage)
+{
+	pte_freelist_forced_free++;
+
+	smp_call_function(pte_free_smp_sync, NULL, 0, 1);
+
+	pte_free(ptepage);
+}
+
+static void pte_free_rcu_callback(void *arg)
+{
+	struct pte_freelist_batch *batch = arg;
+	unsigned int i;
+
+	for (i = 0; i < batch->index; i++)
+		pte_free(batch->pages[i]);
+	free_page((unsigned long)batch);
+}
+
+void pte_free_submit(struct pte_freelist_batch *batch)
+{
+	INIT_RCU_HEAD(&batch->rcu);
+	call_rcu(&batch->rcu, pte_free_rcu_callback, batch);
+}
+
+void pte_free_finish(void)
+{
+	/* This is safe as we are holding page_table_lock */
+	struct pte_freelist_batch **batchp = &__get_cpu_var(pte_freelist_cur);
+
+	if (*batchp == NULL)
+		return;
+	pte_free_submit(*batchp);
+	*batchp = NULL;
+}

 void show_mem(void)
 {

