[PATCH 11/16] powerpc/mm: Add SMP support to no-hash TLB handling v3

Kumar Gala kumar.gala at freescale.com
Tue Dec 16 07:19:48 EST 2008


On Dec 14, 2008, at 11:44 PM, Benjamin Herrenschmidt wrote:

> This patch moves the whole no-hash TLB handling out of line into a
> new tlb_nohash.c file, and implements some basic SMP support using
> IPIs and/or broadcast tlbivax instructions.
>
> Note that I'm using local invalidations for D->I cache coherency.
>
> At worst, if another processor is trying to execute the same page and
> has the old entry in its TLB, it will just take a fault and re-do the
> TLB flush locally (it won't re-do the cache flush in any case).
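To illustrate the scheme (my sketch, not code from this patch; the
function name is made up, though flush_dcache_icache_page() and
local_flush_tlb_page() are the real primitives):

	/* hypothetical call site making a page I-cache coherent */
	static void example_make_coherent(struct vm_area_struct *vma,
					  unsigned long addr, struct page *pg)
	{
		flush_dcache_icache_page(pg);	/* cache flush: done once */
		/* drop only this CPU's stale TLB entry; a remote CPU that
		 * still holds the old entry takes a fault and redoes the
		 * local TLB flush, but never the cache flush */
		local_flush_tlb_page(vma, addr);
	}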
>
> Signed-off-by: Benjamin Herrenschmidt <benh at kernel.crashing.org>
> ---
>
> v2. This variant fixes usage of linux/spinlock.h instead of asm/spinlock.h
> v3. Inadvertently un-EXPORT_SYMBOL'ed some cache flush calls on ppc64
> v4. Fix differences in local_* flush variants between CPU types and
>    corresponding clash with highmem code. Remove remaining _tlbie calls
>    from nohash code.
>
> arch/powerpc/include/asm/highmem.h  |    4
> arch/powerpc/include/asm/mmu.h      |    3
> arch/powerpc/include/asm/tlbflush.h |   84 ++++++--------
> arch/powerpc/kernel/misc_32.S       |    9 +
> arch/powerpc/kernel/ppc_ksyms.c     |    6 -
> arch/powerpc/mm/Makefile            |    2
> arch/powerpc/mm/fault.c             |    2
> arch/powerpc/mm/mem.c               |    2
> arch/powerpc/mm/tlb_hash32.c        |    4
> arch/powerpc/mm/tlb_nohash.c        |  209 ++++++++++++++++++++++++++++++++++++
> 10 files changed, 268 insertions(+), 57 deletions(-)
>

> Index: linux-work/arch/powerpc/mm/tlb_nohash.c
> ===================================================================
> --- /dev/null	1970-01-01 00:00:00.000000000 +0000
> +++ linux-work/arch/powerpc/mm/tlb_nohash.c	2008-12-15 14:36:20.000000000 +1100
> @@ -0,0 +1,209 @@
> +/*
> + * This file contains the routines for TLB flushing.
> + * On machines where the MMU does not use a hash table to store virtual to
> + * physical translations (i.e., SW loaded TLBs or Book3E compliant
> + * processors; this does -not- include 603, however, which shares the
> + * implementation with hash based processors)
> + *
> + *  -- BenH
> + *
> + * Copyright 2008 Ben Herrenschmidt <benh at kernel.crashing.org>
> + *                IBM Corp.
> + *
> + *  Derived from arch/ppc/mm/init.c:
> + *    Copyright (C) 1995-1996 Gary Thomas (gdt at linuxppc.org)
> + *
> + *  Modifications by Paul Mackerras (PowerMac) (paulus at cs.anu.edu.au)
> + *  and Cort Dougan (PReP) (cort at cs.nmt.edu)
> + *    Copyright (C) 1996 Paul Mackerras
> + *
> + *  Derived from "arch/i386/mm/init.c"
> + *    Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
> + *
> + *  This program is free software; you can redistribute it and/or
> + *  modify it under the terms of the GNU General Public License
> + *  as published by the Free Software Foundation; either version
> + *  2 of the License, or (at your option) any later version.
> + *
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/mm.h>
> +#include <linux/init.h>
> +#include <linux/highmem.h>
> +#include <linux/pagemap.h>
> +#include <linux/preempt.h>
> +#include <linux/spinlock.h>
> +
> +#include <asm/tlbflush.h>
> +#include <asm/tlb.h>
> +
> +#include "mmu_decl.h"
> +
> +/*
> + * Basse TLB flushing operations:

One 's'

>
> + *
> + *  - flush_tlb_mm(mm) flushes the specified mm context's TLB entries
> + *  - flush_tlb_page(vma, vmaddr) flushes one page
> + *  - flush_tlb_range(vma, start, end) flushes a range of pages
> + *  - flush_tlb_kernel_range(start, end) flushes kernel pages
> + *
> + *  - local_* variants of page and mm only apply to the current
> + *    processor
> + */
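For context (my examples, not part of the patch), typical call sites
for these entry points look like:

	/* generic mm code, after changing a user PTE: */
	ptep_get_and_clear(mm, addr, ptep);
	flush_tlb_page(vma, addr);

	/* on address space teardown: */
	flush_tlb_mm(mm);

	/* after editing kernel page tables (vmalloc/vmap): */
	flush_tlb_kernel_range(start, end);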
> +
> +/*
> + * These are the base non-SMP variants of page and mm flushing
> + */
> +void local_flush_tlb_mm(struct mm_struct *mm)
> +{
> +	unsigned int pid;
> +
> +	preempt_disable();
> +	pid = mm->context.id;
> +	if (pid != MMU_NO_CONTEXT)
> +		_tlbil_pid(pid);
> +	preempt_enable();
> +}
> +EXPORT_SYMBOL(local_flush_tlb_mm);

Do these really get called w/MMU_NO_CONTEXT?  What is the calling code
trying to flush in that situation?

> +
> +void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
> +{
> +	unsigned int pid;
> +
> +	preempt_disable();
> +	pid = vma ? vma->vm_mm->context.id : 0;
> +	if (pid != MMU_NO_CONTEXT)
> +		_tlbil_va(vmaddr, pid);
> +	preempt_enable();
> +}
> +EXPORT_SYMBOL(local_flush_tlb_page);
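The v4 note about the highmem clash presumably refers to uses like
this one (roughly what the kunmap_atomic() path does; written from
memory, so treat it as a sketch):

	/* tear down a short-lived atomic kernel mapping; vma == NULL
	 * means kernel context, which the code above maps to pid 0 */
	pte_clear(&init_mm, vaddr, kmap_pte - idx);
	local_flush_tlb_page(NULL, vaddr);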
> +
> +
> +/*
> + * And here are the SMP non-local implementations
> + */
> +#ifdef CONFIG_SMP
> +
> +static DEFINE_SPINLOCK(tlbivax_lock);
> +
> +struct tlb_flush_param {
> +	unsigned long addr;
> +	unsigned int pid;
> +};
> +
> +static void do_flush_tlb_mm_ipi(void *param)
> +{
> +	struct tlb_flush_param *p = param;
> +
> +	_tlbil_pid(p ? p->pid : 0);
> +}
> +
> +static void do_flush_tlb_page_ipi(void *param)
> +{
> +	struct tlb_flush_param *p = param;
> +
> +	_tlbil_va(p->addr, p->pid);
> +}
> +
> +
> +/* Note on invalidations and PID:
> + *
> + * We snapshot the PID with preempt disabled. At this point, it can still
> + * change either because:
> + * - our context is being stolen (PID -> NO_CONTEXT) on another CPU
> + * - we are invalidating some target that isn't currently running here
> + *   and is concurrently acquiring a new PID on another CPU
> + * - some other CPU is re-acquiring a lost PID for this mm
> + * etc...
> + *
> + * However, this shouldn't be a problem as we only guarantee
> + * invalidation of TLB entries present prior to this call, so we
> + * don't care about the PID changing, and invalidating a stale PID
> + * is generally harmless.
> + */
> +
> +void flush_tlb_mm(struct mm_struct *mm)
> +{
> +	cpumask_t cpu_mask;
> +	unsigned int pid;
> +
> +	preempt_disable();
> +	pid = mm->context.id;
> +	if (unlikely(pid == MMU_NO_CONTEXT))
> +		goto no_context;
> +	cpu_mask = mm->cpu_vm_mask;
> +	cpu_clear(smp_processor_id(), cpu_mask);
> +	if (!cpus_empty(cpu_mask)) {
> +		struct tlb_flush_param p = { .pid = pid };
> +		smp_call_function_mask(cpu_mask, do_flush_tlb_mm_ipi, &p, 1);
> +	}
> +	_tlbil_pid(pid);
> + no_context:
> +	preempt_enable();
> +}
> +EXPORT_SYMBOL(flush_tlb_mm);
> +
> +void flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr)
> +{
> +	cpumask_t cpu_mask;
> +	unsigned int pid;
> +
> +	preempt_disable();
> +	pid = vma ? vma->vm_mm->context.id : 0;
> +	if (unlikely(pid == MMU_NO_CONTEXT))
> +		goto bail;
> +	cpu_mask = vma->vm_mm->cpu_vm_mask;
> +	cpu_clear(smp_processor_id(), cpu_mask);
> +	if (!cpus_empty(cpu_mask)) {
> +		/* If broadcast tlbivax is supported, use it */
> +		if (mmu_has_feature(MMU_FTR_HAS_TLBIVAX_BCAST)) {
> +			int lock = mmu_has_feature(MMU_FTR_TLBIVAX_NEED_LOCK);
> +			if (lock)
> +				spin_lock(&tlbivax_lock);
> +			_tlbivax_bcast(vmaddr, pid);
> +			if (lock)
> +				spin_unlock(&tlbivax_lock);
> +			goto bail;
> +		} else {
> +			struct tlb_flush_param p = { .pid = pid, .addr = vmaddr };
> +			smp_call_function_mask(cpu_mask,
> +					       do_flush_tlb_page_ipi, &p, 1);
> +		}
> +	}
> +	_tlbil_va(vmaddr, pid);
> + bail:
> +	preempt_enable();
> +}
> +EXPORT_SYMBOL(flush_tlb_page);
> +
> +#endif /* CONFIG_SMP */
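Presumably the UP build gets the global names aliased to the local
variants in asm/tlbflush.h -- something like this (that hunk isn't
quoted here, so this is a sketch):

	#ifndef CONFIG_SMP
	#define flush_tlb_mm(mm)		local_flush_tlb_mm(mm)
	#define flush_tlb_page(vma, addr)	local_flush_tlb_page(vma, addr)
	#endif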
> +
> +/*
> + * Flush kernel TLB entries in the given range
> + */
> +void flush_tlb_kernel_range(unsigned long start, unsigned long end)
> +{
> +#ifdef CONFIG_SMP
> +	preempt_disable();
> +	smp_call_function(do_flush_tlb_mm_ipi, NULL, 1);
> +	_tlbil_pid(0);
> +	preempt_enable();
> +#else
> +	_tlbil_pid(0);
> +#endif
> +}
> +EXPORT_SYMBOL(flush_tlb_kernel_range);
> +
> +/*
> + * Currently, for range flushing, we just do a full mm flush. This should
> + * be optimized based on a threshold on the size of the range, since
> + * some implementations can stack multiple tlbivax before a tlbsync, but
> + * for now, we keep it that way.
> + */
> +void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
> +		     unsigned long end)
> +{
> +	flush_tlb_mm(vma->vm_mm);
> +}
> +EXPORT_SYMBOL(flush_tlb_range);
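The threshold optimization the comment alludes to might look like this
(my sketch; the constant and its value are made up):

	#define TLB_RANGE_FLUSH_THRESHOLD	32	/* pages; made-up value */

	void flush_tlb_range(struct vm_area_struct *vma, unsigned long start,
			     unsigned long end)
	{
		unsigned long addr;

		if (((end - start) >> PAGE_SHIFT) > TLB_RANGE_FLUSH_THRESHOLD)
			flush_tlb_mm(vma->vm_mm);	/* big range: flush the whole pid */
		else
			for (addr = start; addr < end; addr += PAGE_SIZE)
				flush_tlb_page(vma, addr);	/* small: per page */
	}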

[snip]

> Index: linux-work/arch/powerpc/include/asm/mmu.h
> ===================================================================
> --- linux-work.orig/arch/powerpc/include/asm/mmu.h	2008-12-15  
> 14:36:20.000000000 +1100
> +++ linux-work/arch/powerpc/include/asm/mmu.h	2008-12-15  
> 14:36:20.000000000 +1100
> @@ -15,6 +15,9 @@
> #define MMU_FTR_TYPE_FSL_E		ASM_CONST(0x00000010)
> #define MMU_FTR_HAS_HIGH_BATS		ASM_CONST(0x00010000)
> #define MMU_FTR_BIG_PHYS		ASM_CONST(0x00020000)
> +#define MMU_FTR_HAS_TLBIVAX_BCAST	ASM_CONST(0x00040000)
> +#define MMU_FTR_HAS_TLBILX_PID		ASM_CONST(0x00080000)

Can we make these FTR_USE_ instead of FTR_HAS_?  On e500 we have
TLBIVAX_BCAST but don't plan to use it.  I'd prefer not to have to
answer questions about that.
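Something like this is what I have in mind (illustrative only; the
policy knob is made up):

	/* e500 setup: the hardware has tlbivax, but we choose the IPI
	 * path, so the USE_ flag simply stays clear */
	if (want_tlbivax_bcast)		/* hypothetical policy knob */
		cur_cpu_spec->mmu_features |= MMU_FTR_USE_TLBIVAX_BCAST;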

> +#define MMU_FTR_TLBIVAX_NEED_LOCK	ASM_CONST(0x00100000)

Is this really ivax lock or sync lock?

- k


