[PATCH] powerpc/64: pseudo-NMI/SMP watchdog
Balbir Singh
bsingharora at gmail.com
Sat Dec 10 16:22:13 AEDT 2016
On 10/12/16 02:52, Nicholas Piggin wrote:
> Rather than use perf / PMU interrupts and the generic hardlockup
> detector, this takes the decrementer interrupt as an "NMI" when
> interrupts are soft disabled (XXX: will this do the right thing with a
> large decrementer?). This will work even if we start soft-disabling PMU
> interrupts.
>
> This does not solve the hardlockup problem completely however, because
> interrupts can often become hard disabled when soft disabled for long
> periods. And they can be hard disabled for other reasons.
>
Ben/Paul suggested a way to work around this with XICS. The idea was to
have MSR_EE set and use XICS to stash away the current
interrupt and acknowledge it/replay it later. Decrementer interrupts would
not trigger timers, but trigger a special NMI watchdog, like you've
implemented.
> To make up for the lack of a periodic true NMI, this also has an SMP
> hard lockup detector where all CPUs can observe lockups on others.
>
> This still needs a bit more polishing, testing, comments, config
> options, and boot parameters, etc., so it's RFC quality only.
>
> Thanks,
> Nick
> ---
> arch/powerpc/Kconfig | 2 +
> arch/powerpc/include/asm/nmi.h | 5 +
> arch/powerpc/kernel/Makefile | 1 +
> arch/powerpc/kernel/exceptions-64s.S | 14 +-
> arch/powerpc/kernel/kvm.c | 3 +
> arch/powerpc/kernel/nmi.c | 288 +++++++++++++++++++++++++++++++++++
> arch/powerpc/kernel/setup_64.c | 18 ---
> arch/powerpc/kernel/time.c | 2 +
> arch/sparc/kernel/nmi.c | 2 +-
> include/linux/nmi.h | 14 ++
> init/main.c | 3 +
> kernel/watchdog.c | 16 +-
> 12 files changed, 341 insertions(+), 27 deletions(-)
> create mode 100644 arch/powerpc/kernel/nmi.c
>
> diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
> index 65fba4c..adb3387 100644
> --- a/arch/powerpc/Kconfig
> +++ b/arch/powerpc/Kconfig
> @@ -124,6 +124,8 @@ config PPC
> select HAVE_CBPF_JIT if !PPC64
> select HAVE_EBPF_JIT if PPC64
> select HAVE_ARCH_JUMP_LABEL
> + select HAVE_NMI
> + select HAVE_NMI_WATCHDOG if PPC64
> select ARCH_HAVE_NMI_SAFE_CMPXCHG
> select ARCH_HAS_GCOV_PROFILE_ALL
> select GENERIC_SMP_IDLE_THREAD
> diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h
> index ff1ccb3..d00e29b 100644
> --- a/arch/powerpc/include/asm/nmi.h
> +++ b/arch/powerpc/include/asm/nmi.h
> @@ -1,4 +1,9 @@
> #ifndef _ASM_NMI_H
> #define _ASM_NMI_H
>
> +#define arch_nmi_init powerpc_nmi_init
> +void __init powerpc_nmi_init(void);
> +void touch_nmi_watchdog(void);
> +void soft_nmi_interrupt(struct pt_regs *regs);
> +
> #endif /* _ASM_NMI_H */
> diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
> index 1925341..77f199f 100644
> --- a/arch/powerpc/kernel/Makefile
> +++ b/arch/powerpc/kernel/Makefile
> @@ -42,6 +42,7 @@ obj-$(CONFIG_PPC64) += setup_64.o sys_ppc32.o \
> signal_64.o ptrace32.o \
> paca.o nvram_64.o firmware.o
> obj-$(CONFIG_VDSO32) += vdso32/
> +obj-$(CONFIG_HAVE_NMI_WATCHDOG) += nmi.o
> obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
> obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o
> obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index 1ba82ea..b159d02 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -1295,7 +1295,7 @@ masked_##_H##interrupt: \
> lis r10,0x7fff; \
> ori r10,r10,0xffff; \
> mtspr SPRN_DEC,r10; \
> - b 2f; \
> + b masked_decrementer_##_H##interrupt; \
> 1: cmpwi r10,PACA_IRQ_DBELL; \
> beq 2f; \
> cmpwi r10,PACA_IRQ_HMI; \
> @@ -1312,6 +1312,16 @@ masked_##_H##interrupt: \
> ##_H##rfid; \
> b .
>
> +#define MASKED_NMI(_H) \
> +masked_decrementer_##_H##interrupt: \
> + std r12,PACA_EXGEN+EX_R12(r13); \
> + GET_SCRATCH0(r10); \
> + std r10,PACA_EXGEN+EX_R13(r13); \
> + EXCEPTION_PROLOG_PSERIES_1(soft_nmi_common, _H)
> +
> +EXC_COMMON(soft_nmi_common, 0x900, soft_nmi_interrupt)
> +
> +
> /*
> * Real mode exceptions actually use this too, but alternate
> * instruction code patches (which end up in the common .text area)
> @@ -1319,7 +1329,9 @@ masked_##_H##interrupt: \
> */
> USE_FIXED_SECTION(virt_trampolines)
> MASKED_INTERRUPT()
> + MASKED_NMI()
> MASKED_INTERRUPT(H)
> + MASKED_NMI(H)
>
> #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
> TRAMP_REAL_BEGIN(kvmppc_skip_interrupt)
> diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
> index 9ad37f8..f0d215c 100644
> --- a/arch/powerpc/kernel/kvm.c
> +++ b/arch/powerpc/kernel/kvm.c
> @@ -25,6 +25,7 @@
> #include <linux/kvm_para.h>
> #include <linux/slab.h>
> #include <linux/of.h>
> +#include <linux/nmi.h>
>
> #include <asm/reg.h>
> #include <asm/sections.h>
> @@ -718,6 +719,8 @@ static __init void kvm_free_tmp(void)
>
> static int __init kvm_guest_init(void)
> {
> + /* XXX: disable hardlockup watchdog? */
> +
You mean the hypervisor watchdog? Did your testing
catch anything here?
> if (!kvm_para_available())
> goto free_tmp;
>
> diff --git a/arch/powerpc/kernel/nmi.c b/arch/powerpc/kernel/nmi.c
> new file mode 100644
> index 0000000..8e89db4
> --- /dev/null
> +++ b/arch/powerpc/kernel/nmi.c
> @@ -0,0 +1,288 @@
> +/*
> + * NMI support on powerpc systems.
> + *
> + * Copyright 2016, IBM Corporation.
> + *
> + * This uses code from arch/sparc/kernel/nmi.c and kernel/watchdog.c
> + */
> +#include <linux/kernel.h>
> +#include <linux/param.h>
> +#include <linux/init.h>
> +#include <linux/percpu.h>
> +#include <linux/cpu.h>
> +#include <linux/nmi.h>
> +#include <linux/module.h>
> +#include <linux/export.h>
> +#include <linux/kprobes.h>
> +#include <linux/hardirq.h>
> +#include <linux/reboot.h>
> +#include <linux/slab.h>
> +#include <linux/kdebug.h>
> +#include <linux/delay.h>
> +#include <linux/smp.h>
> +
> +#include <asm/paca.h>
> +
> +/*
> + * Powerpc NMI is implemented using either the 0x100 system reset interrupt
> + * which is a real NMI, or using the soft-disable mechanism which can still
> + * be hard masked.
> + */
> +static int nmi_panic_timeout __read_mostly = 30; /* sec to panic */
> +static u64 nmi_panic_timeout_tb __read_mostly; /* timebase ticks */
> +
> +static int nmi_timer_period __read_mostly = 10; /* sec between activity checks */
> +static u64 nmi_timer_period_tb __read_mostly; /* timebase ticks */
> +
> +static DEFINE_PER_CPU(struct timer_list, nmi_timer);
> +static DEFINE_PER_CPU(u64, last_activity_tb);
> +
> +static cpumask_t nmi_cpus_enabled __read_mostly;
> +
> +/*
> + * Cross-CPU watchdog checker
> + */
> +#ifdef CONFIG_SMP
> +static int nmi_global_panic_timeout __read_mostly = 60; /* sec to panic other */
> +static u64 nmi_global_panic_timeout_tb __read_mostly; /* timebase ticks */
> +static cpumask_t nmi_global_cpus_pending;
> +static u64 nmi_global_last_reset_tb;
> +#endif
> +
> +static void watchdog_panic(struct pt_regs *regs, int cpu)
> +{
> + static unsigned long lock = 0;
> +
> + while (unlikely(test_and_set_bit_lock(0, &lock)))
> + cpu_relax();
> +
> + if (cpu < 0) {
> + pr_emerg("Watchdog detected hard LOCKUP other cpus %*pbl\n",
> + cpumask_pr_args(&nmi_global_cpus_pending));
> + } else {
> + pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu);
> + }
> + print_modules();
> + print_irqtrace_events(current);
> + if (regs)
> + show_regs(regs);
> + else
> + dump_stack();
> +
> + /* XXX: trigger NMI IPI? */
> +
> + nmi_panic(regs, "Hard LOCKUP");
> +}
> +
> +#ifdef CONFIG_SMP
> +static void nmi_global_clear_cpu_pending(int cpu, u64 tb)
> +{
> + static unsigned long lock = 0;
> +
> + while (unlikely(test_and_set_bit_lock(0, &lock)))
> + cpu_relax();
> + cpumask_clear_cpu(cpu, &nmi_global_cpus_pending);
> + if (cpumask_empty(&nmi_global_cpus_pending)) {
> + nmi_global_last_reset_tb = tb;
> + cpumask_copy(&nmi_global_cpus_pending, &nmi_cpus_enabled);
> + }
> + clear_bit_unlock(0, &lock);
> +}
> +#endif
> +
> +static void watchdog_timer_interrupt(int cpu)
> +{
> + u64 tb = get_tb();
> +
> + raw_cpu_write(last_activity_tb, tb);
> +
> +#ifdef CONFIG_SMP
> + if (tb - nmi_global_last_reset_tb < nmi_timer_period_tb)
> + return;
> +
> + if (cpumask_test_cpu(cpu, &nmi_global_cpus_pending))
> + nmi_global_clear_cpu_pending(cpu, tb);
> +
> + if (tb - nmi_global_last_reset_tb >= nmi_global_panic_timeout_tb)
> + watchdog_panic(NULL, -1);
> +#endif
> +}
> +
> +static void nmi_timer_fn(unsigned long data)
> +{
> + struct timer_list *t = this_cpu_ptr(&nmi_timer);
> + int cpu = smp_processor_id();
> +
> + watchdog_timer_interrupt(cpu);
> +
> + t->expires = round_jiffies(jiffies + nmi_timer_period * HZ);
> + add_timer_on(t, cpu);
> +}
Do we have to have this running all the time? Can we do an on-demand
version of NMI where we do periodic decrementers, without any reliance
on timers, to implement the NMI watchdog?
> +
> +void touch_nmi_watchdog(void)
> +{
> + int cpu = smp_processor_id();
> +
> + if (cpumask_test_cpu(cpu, &nmi_cpus_enabled))
> + watchdog_timer_interrupt(cpu);
> +
> + touch_softlockup_watchdog();
> +}
> +EXPORT_SYMBOL(touch_nmi_watchdog);
> +
> +static void watchdog_nmi_interrupt(struct pt_regs *regs, u64 tb)
> +{
> + u64 interval = tb - __this_cpu_read(last_activity_tb);
> +
> + if (interval >= nmi_panic_timeout_tb)
> + watchdog_panic(regs, smp_processor_id());
> +}
> +
> +/*
> + * This is the soft-mask nmi handler (called by masked_interrupt() in
> + * low level exception entry).
> + */
> +static DEFINE_PER_CPU(u64, soft_nmi_last_tb);
> +
> +/*
> + * Soft nmi interrupts only occur when linux irqs are disabled (but
> + * powerpc hardware irqs are enabled), so they can not be relied
> + * upon to be timely or delivered at all. Their only real use is
> + * the nmi watchdog.
> + */
> +void soft_nmi_interrupt(struct pt_regs *regs)
> +{
> + u64 tb;
> +
> + if (!cpumask_test_cpu(smp_processor_id(), &nmi_cpus_enabled))
> + return;
> +
> + tb = get_tb();
> + if (tb - __this_cpu_read(soft_nmi_last_tb) < nmi_timer_period_tb)
> + return;
> + __this_cpu_write(soft_nmi_last_tb, tb);
> +
> + nmi_enter();
> + watchdog_nmi_interrupt(regs, tb);
> + nmi_exit();
> +}
> +
> +static void start_nmi_on_cpu(int cpu)
> +{
> + struct timer_list *t = per_cpu_ptr(&nmi_timer, cpu);
> + u64 *last_tb = per_cpu_ptr(&soft_nmi_last_tb, cpu);
> +
> + *last_tb = get_tb();
> + cpumask_set_cpu(cpu, &nmi_cpus_enabled);
> +
> + setup_pinned_timer(t, nmi_timer_fn, cpu);
> + t->expires = round_jiffies(jiffies + nmi_timer_period * HZ);
> + add_timer_on(t, cpu);
> +}
> +
> +static void stop_nmi_on_cpu(int cpu)
> +{
> + struct timer_list *t = per_cpu_ptr(&nmi_timer, cpu);
> +
> + del_timer_sync(t);
> +
> + cpumask_clear_cpu(cpu, &nmi_cpus_enabled);
> +#ifdef CONFIG_SMP
> + nmi_global_clear_cpu_pending(cpu, get_tb());
> +#endif
> +}
> +
> +
> +static int nmi_startup(void)
> +{
> + if (num_online_cpus() != 1) {
> + WARN_ON(1);
> + return -EINVAL;
> + }
> +
> +#ifdef CONFIG_SMP
> + nmi_global_last_reset_tb = get_tb();
> +#endif
> + start_nmi_on_cpu(smp_processor_id());
> +#ifdef CONFIG_SMP
> + cpumask_copy(&nmi_global_cpus_pending, &nmi_cpus_enabled);
> +#endif
> +
> + pr_info("NMI Watchdog started on cpus %*pbl\n",
> + cpumask_pr_args(&nmi_cpus_enabled));
> +
> + return 0;
> +}
> +
> +static int nmi_cpu_notify(struct notifier_block *self,
> + unsigned long action, void *hcpu)
> +{
> + int cpu = (unsigned long)hcpu;
> +
> + switch (action & ~CPU_TASKS_FROZEN) {
> + case CPU_ONLINE:
> + case CPU_DOWN_FAILED:
> + start_nmi_on_cpu(cpu);
> + pr_info("NMI Watchdog running on cpus %*pbl\n",
> + cpumask_pr_args(&nmi_cpus_enabled));
> + break;
> + case CPU_DOWN_PREPARE:
> + stop_nmi_on_cpu(cpu);
> + pr_info("NMI Watchdog running on cpus %*pbl\n",
> + cpumask_pr_args(&nmi_cpus_enabled));
> + break;
> + }
> + return NOTIFY_OK;
> +}
FYI: These bits are changing in linux-next.
> +
> +static int nmi_shutdown(struct notifier_block *nb, unsigned long cmd, void *p)
> +{
> + stop_nmi_on_cpu(smp_processor_id());
> + return 0;
> +}
> +
> +static struct notifier_block nmi_reboot_notifier = {
> + .notifier_call = nmi_shutdown,
> +};
> +
> +void __init powerpc_nmi_init(void)
> +{
> + int err;
> +
> + if (!nmi_panic_timeout)
> + return;
> +
> + nmi_panic_timeout_tb = nmi_panic_timeout * ppc_tb_freq;
> + nmi_timer_period_tb = nmi_timer_period * ppc_tb_freq;
> +#ifdef CONFIG_SMP
> + nmi_global_panic_timeout_tb = nmi_global_panic_timeout * ppc_tb_freq;
> +#endif
> +
> + err = register_reboot_notifier(&nmi_reboot_notifier);
> + if (err)
> + return;
> +
> + err = nmi_startup();
> + if (err) {
> + unregister_reboot_notifier(&nmi_reboot_notifier);
> + return;
> + }
> +
> + cpu_notifier(nmi_cpu_notify, 0);
> +}
> +
> +static int __init setup_nmi_watchdog(char *str)
> +{
> + if (!strncmp(str, "0", 1))
> + nmi_panic_timeout = 0;
> + return 0;
> +}
> +__setup("nmi_watchdog=", setup_nmi_watchdog);
> +
> +static int __init nowatchdog_setup(char *str)
> +{
> + nmi_panic_timeout = 0;
> + return 1;
> +}
> +__setup("nowatchdog", nowatchdog_setup);
> +
> diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
> index 8d586cf..6a403a9 100644
> --- a/arch/powerpc/kernel/setup_64.c
> +++ b/arch/powerpc/kernel/setup_64.c
> @@ -652,21 +652,3 @@ struct ppc_pci_io ppc_pci_io;
> EXPORT_SYMBOL(ppc_pci_io);
> #endif
>
> -#ifdef CONFIG_HARDLOCKUP_DETECTOR
> -u64 hw_nmi_get_sample_period(int watchdog_thresh)
> -{
> - return ppc_proc_freq * watchdog_thresh;
> -}
> -
> -/*
> - * The hardlockup detector breaks PMU event based branches and is likely
> - * to get false positives in KVM guests, so disable it by default.
> - */
> -static int __init disable_hardlockup_detector(void)
> -{
> - hardlockup_detector_disable();
> -
> - return 0;
> -}
> -early_initcall(disable_hardlockup_detector);
> -#endif
> diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
> index bc3f7d0..5aec1ad 100644
> --- a/arch/powerpc/kernel/time.c
> +++ b/arch/powerpc/kernel/time.c
> @@ -52,6 +52,7 @@
> #include <linux/jiffies.h>
> #include <linux/posix-timers.h>
> #include <linux/irq.h>
> +#include <linux/nmi.h>
> #include <linux/delay.h>
> #include <linux/irq_work.h>
> #include <linux/clk-provider.h>
> @@ -66,6 +67,7 @@
> #include <asm/machdep.h>
> #include <asm/uaccess.h>
> #include <asm/time.h>
> +#include <asm/nmi.h>
> #include <asm/prom.h>
> #include <asm/irq.h>
> #include <asm/div64.h>
> diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c
> index a9973bb..ae3a4d5 100644
> --- a/arch/sparc/kernel/nmi.c
> +++ b/arch/sparc/kernel/nmi.c
> @@ -244,7 +244,7 @@ static struct notifier_block nmi_reboot_notifier = {
> .notifier_call = nmi_shutdown,
> };
>
> -int __init nmi_init(void)
> +void __init nmi_init(void)
> {
> int err;
>
> diff --git a/include/linux/nmi.h b/include/linux/nmi.h
> index a78c35c..9552ab0 100644
> --- a/include/linux/nmi.h
> +++ b/include/linux/nmi.h
> @@ -18,6 +18,9 @@
> #include <asm/nmi.h>
> extern void touch_nmi_watchdog(void);
> #else
> +static inline void nmi_init(void)
> +{
> +}
> static inline void touch_nmi_watchdog(void)
> {
> touch_softlockup_watchdog();
> @@ -30,6 +33,17 @@ extern void hardlockup_detector_disable(void);
> static inline void hardlockup_detector_disable(void) {}
> #endif
>
> +#ifdef arch_nmi_init
> +static inline void nmi_init(void)
> +{
> + arch_nmi_init();
> +}
> +#else
> +static inline void nmi_init(void)
> +{
> +}
> +#endif
> +
> /*
> * Create trigger_all_cpu_backtrace() out of the arch-provided
> * base function. Return whether such support was available,
> diff --git a/init/main.c b/init/main.c
> index 2858be7..36fd7e7 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -33,6 +33,7 @@
> #include <linux/start_kernel.h>
> #include <linux/security.h>
> #include <linux/smp.h>
> +#include <linux/nmi.h>
> #include <linux/profile.h>
> #include <linux/rcupdate.h>
> #include <linux/moduleparam.h>
> @@ -579,6 +580,8 @@ asmlinkage __visible void __init start_kernel(void)
>
> kmem_cache_init_late();
>
> + nmi_init();
How did you test these?
Balbir
More information about the Linuxppc-dev
mailing list