[PATCH 2/2] ppc: lazy flush_tlb_mm for nohash architectures
Dave Kleikamp
shaggy at linux.vnet.ibm.com
Sat Sep 25 04:01:37 EST 2010
On PPC_MMU_NOHASH processors that support a large number of contexts,
implement a lazy flush_tlb_mm() that switches the mm to a free context
and marks the old one stale. The TLB is only flushed when no free
contexts are available.

Lazy TLB flushing is controlled by the global variable tlb_lazy_flush,
which is set during init when the MMU_FTR_TYPE_47x feature is present.
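
In short, local_flush_tlb_mm() becomes a wrapper that takes the lazy
path when it is enabled. A condensed sketch of the intent (locking, the
SMP IPI path and the context-recycling details are handled in the patch
below):

void local_flush_tlb_mm(struct mm_struct *mm)
{
	if (tlb_lazy_flush)
		/* Mark mm's context stale and switch to a fresh one; the
		 * TLB is only invalidated once nr_free_contexts hits zero. */
		lazy_flush_context(mm);
	else
		/* Original behaviour: invalidate the TLB by PID. */
		__local_flush_tlb_mm(mm);
}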
Signed-off-by: Dave Kleikamp <shaggy at linux.vnet.ibm.com>
---
arch/powerpc/mm/mmu_context_nohash.c |  154 +++++++++++++++++++++++++++++++---
arch/powerpc/mm/mmu_decl.h           |    8 ++
arch/powerpc/mm/tlb_nohash.c         |   28 +++++-
3 files changed, 174 insertions(+), 16 deletions(-)
diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c
index ddfd7ad..87c7dc2 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -17,10 +17,6 @@
* TODO:
*
* - The global context lock will not scale very well
- * - The maps should be dynamically allocated to allow for processors
- * that support more PID bits at runtime
- * - Implement flush_tlb_mm() by making the context stale and picking
- * a new one
* - More aggressively clear stale map bits and maybe find some way to
* also clear mm->cpu_vm_mask bits when processes are migrated
*/
@@ -52,6 +48,8 @@
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
+#include "mmu_decl.h"
+
static unsigned int first_context, last_context;
static unsigned int next_context, nr_free_contexts;
static unsigned long *context_map;
@@ -59,9 +57,31 @@ static unsigned long *stale_map[NR_CPUS];
static struct mm_struct **context_mm;
static DEFINE_RAW_SPINLOCK(context_lock);
+int tlb_lazy_flush;
+static int tlb_needs_flush[NR_CPUS];
+static unsigned long *context_available_map;
+static unsigned int nr_stale_contexts;
+
#define CTX_MAP_SIZE \
(sizeof(unsigned long) * (last_context / BITS_PER_LONG + 1))
+/*
+ * If another CPU recycled the stale contexts, we need to flush
+ * the local TLB so that we may re-use those contexts.
+ */
+void flush_recycled_contexts(int cpu)
+{
+ int i;
+
+ if (tlb_needs_flush[cpu]) {
+ pr_hard("[%d] flushing tlb\n", cpu);
+ _tlbil_all();
+ for (i = cpu_first_thread_in_core(cpu);
+ i <= cpu_last_thread_in_core(cpu); i++) {
+ tlb_needs_flush[i] = 0;
+ }
+ }
+}
/* Steal a context from a task that has one at the moment.
*
@@ -147,7 +167,7 @@ static unsigned int steal_context_up(unsigned int id)
pr_hardcont(" | steal %d from 0x%p", id, mm);
/* Flush the TLB for that context */
- local_flush_tlb_mm(mm);
+ __local_flush_tlb_mm(mm);
/* Mark this mm has having no context anymore */
mm->context.id = MMU_NO_CONTEXT;
@@ -161,13 +181,19 @@ static unsigned int steal_context_up(unsigned int id)
#ifdef DEBUG_MAP_CONSISTENCY
static void context_check_map(void)
{
- unsigned int id, nrf, nact;
+ unsigned int id, nrf, nact, nstale;
- nrf = nact = 0;
+ nrf = nact = nstale = 0;
for (id = first_context; id <= last_context; id++) {
int used = test_bit(id, context_map);
- if (!used)
- nrf++;
+ int allocated = tlb_lazy_flush &&
+ test_bit(id, context_available_map);
+ if (!used) {
+ if (allocated)
+ nstale++;
+ else
+ nrf++;
+ }
if (used != (context_mm[id] != NULL))
pr_err("MMU: Context %d is %s and MM is %p !\n",
id, used ? "used" : "free", context_mm[id]);
@@ -179,6 +205,11 @@ static void context_check_map(void)
nr_free_contexts, nrf);
nr_free_contexts = nrf;
}
+ if (nstale != nr_stale_contexts) {
+ pr_err("MMU: Stale context count out of sync ! (%d vs %d)\n",
+ nr_stale_contexts, nstale);
+ nr_stale_contexts = nstale;
+ }
if (nact > num_online_cpus())
pr_err("MMU: More active contexts than CPUs ! (%d vs %d)\n",
nact, num_online_cpus());
@@ -189,6 +220,38 @@ static void context_check_map(void)
static void context_check_map(void) { }
#endif
+/*
+ * On architectures that support a large number of contexts, the TLB
+ * can be flushed lazily by picking a new context and making the stale
+ * context unusable until a lazy TLB flush has been issued.
+ *
+ * context_available_map keeps track of both active and stale contexts,
+ * while context_map continues to track only active contexts. When the
+ * lazy TLB flush is triggered, context_map is copied to
+ * context_available_map, making the once-stale contexts available again.
+ */
+static void recycle_stale_contexts(void)
+{
+ if (nr_free_contexts == 0 && nr_stale_contexts > 0) {
+ unsigned int cpu = smp_processor_id();
+ unsigned int i;
+
+ pr_hard("[%d] recycling stale contexts\n", cpu);
+ /* Time to flush the TLBs */
+ memcpy(context_available_map, context_map, CTX_MAP_SIZE);
+ nr_free_contexts = nr_stale_contexts;
+ nr_stale_contexts = 0;
+ for_each_online_cpu(i) {
+ if ((i < cpu_first_thread_in_core(cpu)) ||
+ (i > cpu_last_thread_in_core(cpu)))
+ tlb_needs_flush[i] = 1;
+ else
+ tlb_needs_flush[i] = 0; /* This core */
+ }
+ _tlbil_all();
+ }
+}
+
void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
{
unsigned int i, id, cpu = smp_processor_id();
@@ -197,6 +260,8 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
/* No lockless fast path .. yet */
raw_spin_lock(&context_lock);
+ flush_recycled_contexts(cpu);
+
pr_hard("[%d] activating context for mm @%p, active=%d, id=%d",
cpu, next, next->context.active, next->context.id);
@@ -227,7 +292,12 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
id = next_context;
if (id > last_context)
id = first_context;
- map = context_map;
+
+ if (tlb_lazy_flush) {
+ recycle_stale_contexts();
+ map = context_available_map;
+ } else
+ map = context_map;
/* No more free contexts, let's try to steal one */
if (nr_free_contexts == 0) {
@@ -250,6 +320,13 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
if (id > last_context)
id = first_context;
}
+ if (tlb_lazy_flush)
+ /*
+ * In the while loop above, we set the bit in
+ * context_available_map; it also needs to be set in
+ * context_map.
+ */
+ __set_bit(id, context_map);
stolen:
next_context = id + 1;
context_mm[id] = next;
@@ -267,7 +344,7 @@ void switch_mmu_context(struct mm_struct *prev, struct mm_struct *next)
id, cpu_first_thread_in_core(cpu),
cpu_last_thread_in_core(cpu));
- local_flush_tlb_mm(next);
+ __local_flush_tlb_mm(next);
/* XXX This clear should ultimately be part of local_flush_tlb_mm */
for (i = cpu_first_thread_in_core(cpu);
@@ -317,11 +394,61 @@ void destroy_context(struct mm_struct *mm)
mm->context.active = 0;
#endif
context_mm[id] = NULL;
- nr_free_contexts++;
+
+ if (tlb_lazy_flush)
+ nr_stale_contexts++;
+ else
+ nr_free_contexts++;
}
raw_spin_unlock_irqrestore(&context_lock, flags);
}
+/*
+ * This is called from flush_tlb_mm() and local_flush_tlb_mm(). Mark the
+ * current context as stale and grab an available one. The TLB will be
+ * flushed when no more contexts are available.
+ */
+void lazy_flush_context(struct mm_struct *mm)
+{
+ unsigned int id;
+ unsigned long flags;
+ unsigned long *map;
+
+ raw_spin_lock_irqsave(&context_lock, flags);
+
+ id = mm->context.id;
+ if (unlikely(id == MMU_NO_CONTEXT))
+ goto no_context;
+
+ /*
+ * Make the existing context stale. It remains in
+ * context_available_map as long as nr_free_contexts remains non-zero.
+ */
+ __clear_bit(id, context_map);
+ context_mm[id] = NULL;
+ nr_stale_contexts++;
+
+ recycle_stale_contexts();
+ BUG_ON(nr_free_contexts == 0);
+
+ nr_free_contexts--;
+ id = last_context;
+ map = context_available_map;
+ while (__test_and_set_bit(id, map)) {
+ id = find_next_zero_bit(map, last_context+1, id);
+ if (id > last_context)
+ id = first_context;
+ }
+ set_bit(id, context_map);
+ next_context = id + 1;
+ context_mm[id] = mm;
+ mm->context.id = id;
+ if (current->active_mm == mm)
+ set_context(id, mm->pgd);
+no_context:
+ raw_spin_unlock_irqrestore(&context_lock, flags);
+}
+
#ifdef CONFIG_SMP
static int __cpuinit mmu_context_cpu_notify(struct notifier_block *self,
@@ -407,6 +534,7 @@ void __init mmu_context_init(void)
} else if (mmu_has_feature(MMU_FTR_TYPE_47x)) {
first_context = 1;
last_context = 65535;
+ tlb_lazy_flush = 1;
} else {
first_context = 1;
last_context = 255;
@@ -419,6 +547,8 @@ void __init mmu_context_init(void)
* Allocate the maps used by context management
*/
context_map = alloc_bootmem(CTX_MAP_SIZE);
+ if (tlb_lazy_flush)
+ context_available_map = alloc_bootmem(CTX_MAP_SIZE);
context_mm = alloc_bootmem(sizeof(void *) * (last_context + 1));
stale_map[0] = alloc_bootmem(CTX_MAP_SIZE);
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 63b84a0..64240f1 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -25,6 +25,14 @@
#ifdef CONFIG_PPC_MMU_NOHASH
/*
+ * Lazy tlb flush
+ */
+extern int tlb_lazy_flush;
+extern void flush_recycled_contexts(int);
+void lazy_flush_context(struct mm_struct *mm);
+void __local_flush_tlb_mm(struct mm_struct *mm);
+
+/*
* On 40x and 8xx, we directly inline tlbia and tlbivax
*/
#if defined(CONFIG_40x) || defined(CONFIG_8xx)
diff --git a/arch/powerpc/mm/tlb_nohash.c b/arch/powerpc/mm/tlb_nohash.c
index fe391e9..264d0ea 100644
--- a/arch/powerpc/mm/tlb_nohash.c
+++ b/arch/powerpc/mm/tlb_nohash.c
@@ -36,6 +36,7 @@
#include <linux/spinlock.h>
#include <linux/memblock.h>
+#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include <asm/code-patching.h>
@@ -117,7 +118,7 @@ unsigned long linear_map_top; /* Top of linear mapping */
/*
* These are the base non-SMP variants of page and mm flushing
*/
-void local_flush_tlb_mm(struct mm_struct *mm)
+void __local_flush_tlb_mm(struct mm_struct *mm)
{
unsigned int pid;
@@ -127,6 +128,14 @@ void local_flush_tlb_mm(struct mm_struct *mm)
_tlbil_pid(pid);
preempt_enable();
}
+
+void local_flush_tlb_mm(struct mm_struct *mm)
+{
+ if (tlb_lazy_flush)
+ lazy_flush_context(mm);
+ else
+ __local_flush_tlb_mm(mm);
+}
EXPORT_SYMBOL(local_flush_tlb_mm);
void __local_flush_tlb_page(struct mm_struct *mm, unsigned long vmaddr,
@@ -166,13 +175,19 @@ struct tlb_flush_param {
unsigned int pid;
unsigned int tsize;
unsigned int ind;
+ struct mm_struct *mm;
};
static void do_flush_tlb_mm_ipi(void *param)
{
struct tlb_flush_param *p = param;
- _tlbil_pid(p ? p->pid : 0);
+ if (tlb_lazy_flush && p) {
+ flush_recycled_contexts(smp_processor_id());
+ if (current->active_mm == p->mm)
+ set_context(p->pid, p->mm->pgd);
+ } else
+ _tlbil_pid(p ? p->pid : 0);
}
static void do_flush_tlb_page_ipi(void *param)
@@ -207,13 +222,18 @@ void flush_tlb_mm(struct mm_struct *mm)
pid = mm->context.id;
if (unlikely(pid == MMU_NO_CONTEXT))
goto no_context;
+ if (tlb_lazy_flush) {
+ lazy_flush_context(mm);
+ pid = mm->context.id;
+ }
if (!mm_is_core_local(mm)) {
- struct tlb_flush_param p = { .pid = pid };
+ struct tlb_flush_param p = { .pid = pid, .mm = mm };
/* Ignores smp_processor_id() even if set. */
smp_call_function_many(mm_cpumask(mm),
do_flush_tlb_mm_ipi, &p, 1);
}
- _tlbil_pid(pid);
+ if (!tlb_lazy_flush)
+ _tlbil_pid(pid);
no_context:
preempt_enable();
}
--
1.7.2.2