[PATCH] powerpc/64: pseudo-NMI/SMP watchdog

Nicholas Piggin npiggin at gmail.com
Sat Dec 10 02:52:52 AEDT 2016


Rather than using perf / PMU interrupts and the generic hardlockup
detector, this takes the decrementer interrupt as an "NMI" when
interrupts are soft-disabled (XXX: will this do the right thing with a
large decrementer?).  This will keep working even if we start
soft-disabling PMU interrupts.
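
For background, on 64-bit powerpc local_irq_disable() only
soft-disables interrupts: it clears a flag in the paca rather than
clearing MSR[EE], so the hardware can still take the decrementer
exception while Linux considers interrupts off.  A sketch of that
side, not code from this patch (the real implementation lives in
arch/powerpc/include/asm/hw_irq.h):

	/*
	 * Sketch only: soft-disable leaves MSR[EE] set, so a
	 * decrementer exception can still arrive and be diverted
	 * to the soft-NMI handler added below.
	 */
	static inline void soft_irq_disable_sketch(void)
	{
		get_paca()->soft_enabled = 0;	/* MSR[EE] untouched */
	}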

This does not completely solve the hardlockup problem, however,
because interrupts often end up hard-disabled when they have been
soft-disabled for long periods, and they can be hard-disabled for
other reasons as well.

To make up for the lack of a true, periodic NMI, this also adds an SMP
hard lockup detector in which all CPUs can observe lockups on the
others.
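
The SMP checker works roughly like the following, condensed from the
nmi.c code below (locking and cpu-hotplug details are elided, and the
helper name here is made up for the sketch):

	/*
	 * Each CPU's periodic timer clears its own bit in a shared
	 * pending mask.  When the mask empties, refill it from the
	 * enabled mask and restamp the timebase.  If the stamp gets
	 * too stale, some CPU never checked in, so a live CPU
	 * reports the hard lockup on its behalf.
	 */
	static void smp_watchdog_check(int cpu, u64 tb)
	{
		cpumask_clear_cpu(cpu, &nmi_global_cpus_pending);
		if (cpumask_empty(&nmi_global_cpus_pending)) {
			nmi_global_last_reset_tb = tb;
			cpumask_copy(&nmi_global_cpus_pending,
				     &nmi_cpus_enabled);
		} else if (tb - nmi_global_last_reset_tb >=
				nmi_global_panic_timeout_tb) {
			watchdog_panic(NULL, -1);
		}
	}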

This still needs a bit more polishing, testing, comments, config
options, boot parameters, and so on, so it's RFC quality only.
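
For now the only boot-time knobs are the two __setup() hooks at the
bottom of nmi.c, both of which just zero nmi_panic_timeout to disable
the detector:

	nmi_watchdog=0
	nowatchdog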

Thanks,
Nick
---
 arch/powerpc/Kconfig                 |   2 +
 arch/powerpc/include/asm/nmi.h       |   5 +
 arch/powerpc/kernel/Makefile         |   1 +
 arch/powerpc/kernel/exceptions-64s.S |  14 +-
 arch/powerpc/kernel/kvm.c            |   3 +
 arch/powerpc/kernel/nmi.c            | 288 +++++++++++++++++++++++++++++++++++
 arch/powerpc/kernel/setup_64.c       |  18 ---
 arch/powerpc/kernel/time.c           |   2 +
 arch/sparc/kernel/nmi.c              |   2 +-
 include/linux/nmi.h                  |  14 ++
 init/main.c                          |   3 +
 kernel/watchdog.c                    |  16 +-
 12 files changed, 341 insertions(+), 27 deletions(-)
 create mode 100644 arch/powerpc/kernel/nmi.c

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 65fba4c..adb3387 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -124,6 +124,8 @@ config PPC
 	select HAVE_CBPF_JIT if !PPC64
 	select HAVE_EBPF_JIT if PPC64
 	select HAVE_ARCH_JUMP_LABEL
+	select HAVE_NMI
+	select HAVE_NMI_WATCHDOG if PPC64
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select GENERIC_SMP_IDLE_THREAD
diff --git a/arch/powerpc/include/asm/nmi.h b/arch/powerpc/include/asm/nmi.h
index ff1ccb3..d00e29b 100644
--- a/arch/powerpc/include/asm/nmi.h
+++ b/arch/powerpc/include/asm/nmi.h
@@ -1,4 +1,9 @@
 #ifndef _ASM_NMI_H
 #define _ASM_NMI_H
 
+#define arch_nmi_init powerpc_nmi_init
+void __init powerpc_nmi_init(void);
+void touch_nmi_watchdog(void);
+void soft_nmi_interrupt(struct pt_regs *regs);
+
 #endif /* _ASM_NMI_H */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 1925341..77f199f 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_PPC64)		+= setup_64.o sys_ppc32.o \
 				   signal_64.o ptrace32.o \
 				   paca.o nvram_64.o firmware.o
 obj-$(CONFIG_VDSO32)		+= vdso32/
+obj-$(CONFIG_HAVE_NMI_WATCHDOG)	+= nmi.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT)	+= hw_breakpoint.o
 obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_ppc970.o cpu_setup_pa6t.o
 obj-$(CONFIG_PPC_BOOK3S_64)	+= cpu_setup_power.o
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 1ba82ea..b159d02 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1295,7 +1295,7 @@ masked_##_H##interrupt:					\
 	lis	r10,0x7fff;				\
 	ori	r10,r10,0xffff;				\
 	mtspr	SPRN_DEC,r10;				\
-	b	2f;					\
+	b	masked_decrementer_##_H##interrupt;	\
 1:	cmpwi	r10,PACA_IRQ_DBELL;			\
 	beq	2f;					\
 	cmpwi	r10,PACA_IRQ_HMI;			\
@@ -1312,6 +1312,16 @@ masked_##_H##interrupt:					\
 	##_H##rfid;					\
 	b	.
 
+#define MASKED_NMI(_H)					\
+masked_decrementer_##_H##interrupt:			\
+	std	r12,PACA_EXGEN+EX_R12(r13);		\
+	GET_SCRATCH0(r10);				\
+	std	r10,PACA_EXGEN+EX_R13(r13);		\
+	EXCEPTION_PROLOG_PSERIES_1(soft_nmi_common, _H)
+
+EXC_COMMON(soft_nmi_common, 0x900, soft_nmi_interrupt)
+
+
 /*
  * Real mode exceptions actually use this too, but alternate
  * instruction code patches (which end up in the common .text area)
@@ -1319,7 +1329,9 @@ masked_##_H##interrupt:					\
  */
 USE_FIXED_SECTION(virt_trampolines)
 	MASKED_INTERRUPT()
+	MASKED_NMI()
 	MASKED_INTERRUPT(H)
+	MASKED_NMI(H)
 
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
 TRAMP_REAL_BEGIN(kvmppc_skip_interrupt)
diff --git a/arch/powerpc/kernel/kvm.c b/arch/powerpc/kernel/kvm.c
index 9ad37f8..f0d215c 100644
--- a/arch/powerpc/kernel/kvm.c
+++ b/arch/powerpc/kernel/kvm.c
@@ -25,6 +25,7 @@
 #include <linux/kvm_para.h>
 #include <linux/slab.h>
 #include <linux/of.h>
+#include <linux/nmi.h>
 
 #include <asm/reg.h>
 #include <asm/sections.h>
@@ -718,6 +719,8 @@ static __init void kvm_free_tmp(void)
 
 static int __init kvm_guest_init(void)
 {
+	/* XXX: disable hardlockup watchdog? */
+
 	if (!kvm_para_available())
 		goto free_tmp;
 
diff --git a/arch/powerpc/kernel/nmi.c b/arch/powerpc/kernel/nmi.c
new file mode 100644
index 0000000..8e89db4
--- /dev/null
+++ b/arch/powerpc/kernel/nmi.c
@@ -0,0 +1,288 @@
+/*
+ * NMI support on powerpc systems.
+ *
+ * Copyright 2016, IBM Corporation.
+ *
+ * This uses code from arch/sparc/kernel/nmi.c and kernel/watchdog.c
+ */
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/init.h>
+#include <linux/percpu.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/module.h>
+#include <linux/export.h>
+#include <linux/kprobes.h>
+#include <linux/hardirq.h>
+#include <linux/reboot.h>
+#include <linux/slab.h>
+#include <linux/kdebug.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+
+#include <asm/paca.h>
+
+/*
+ * Powerpc NMI is implemented using either the 0x100 system reset interrupt
+ * which is a real NMI, or using the soft-disable mechanism which can still
+ * be hard masked.
+ */
+static int nmi_panic_timeout __read_mostly = 30; /* sec to panic */
+static u64 nmi_panic_timeout_tb __read_mostly; /* timebase ticks */
+
+static int nmi_timer_period __read_mostly = 10; /* sec between activity checks */
+static u64 nmi_timer_period_tb __read_mostly; /* timebase ticks */
+
+static DEFINE_PER_CPU(struct timer_list, nmi_timer);
+static DEFINE_PER_CPU(u64, last_activity_tb);
+
+static cpumask_t nmi_cpus_enabled __read_mostly;
+
+/*
+ * Cross-CPU watchdog checker
+ */
+#ifdef CONFIG_SMP
+static int nmi_global_panic_timeout __read_mostly = 60; /* sec to panic other */
+static u64 nmi_global_panic_timeout_tb __read_mostly; /* timebase ticks */
+static cpumask_t nmi_global_cpus_pending;
+static u64 nmi_global_last_reset_tb;
+#endif
+
+static void watchdog_panic(struct pt_regs *regs, int cpu)
+{
+	static unsigned long lock = 0;
+
+	while (unlikely(test_and_set_bit_lock(0, &lock)))
+		cpu_relax();
+
+	if (cpu < 0) {
+		pr_emerg("Watchdog detected hard LOCKUP other cpus %*pbl\n",
+				cpumask_pr_args(&nmi_global_cpus_pending));
+	} else {
+		pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n", cpu);
+	}
+	print_modules();
+	print_irqtrace_events(current);
+	if (regs)
+		show_regs(regs);
+	else
+		dump_stack();
+
+	/* XXX: trigger NMI IPI? */
+
+	nmi_panic(regs, "Hard LOCKUP");
+}
+
+#ifdef CONFIG_SMP
+static void nmi_global_clear_cpu_pending(int cpu, u64 tb)
+{
+	static unsigned long lock = 0;
+
+	while (unlikely(test_and_set_bit_lock(0, &lock)))
+		cpu_relax();
+	cpumask_clear_cpu(cpu, &nmi_global_cpus_pending);
+	if (cpumask_empty(&nmi_global_cpus_pending)) {
+		nmi_global_last_reset_tb = tb;
+		cpumask_copy(&nmi_global_cpus_pending, &nmi_cpus_enabled);
+	}
+	clear_bit_unlock(0, &lock);
+}
+#endif
+
+static void watchdog_timer_interrupt(int cpu)
+{
+	u64 tb = get_tb();
+
+	raw_cpu_write(last_activity_tb, tb);
+
+#ifdef CONFIG_SMP
+	if (tb - nmi_global_last_reset_tb < nmi_timer_period_tb)
+		return;
+
+	if (cpumask_test_cpu(cpu, &nmi_global_cpus_pending))
+		nmi_global_clear_cpu_pending(cpu, tb);
+
+	if (tb - nmi_global_last_reset_tb >= nmi_global_panic_timeout_tb)
+		watchdog_panic(NULL, -1);
+#endif
+}
+
+static void nmi_timer_fn(unsigned long data)
+{
+	struct timer_list *t = this_cpu_ptr(&nmi_timer);
+	int cpu = smp_processor_id();
+
+	watchdog_timer_interrupt(cpu);
+
+	t->expires = round_jiffies(jiffies + nmi_timer_period * HZ);
+	add_timer_on(t, cpu);
+}
+
+void touch_nmi_watchdog(void)
+{
+	int cpu = smp_processor_id();
+
+	if (cpumask_test_cpu(cpu, &nmi_cpus_enabled))
+		watchdog_timer_interrupt(cpu);
+
+	touch_softlockup_watchdog();
+}
+EXPORT_SYMBOL(touch_nmi_watchdog);
+
+static void watchdog_nmi_interrupt(struct pt_regs *regs, u64 tb)
+{
+	u64 interval = tb - __this_cpu_read(last_activity_tb);
+
+	if (interval >= nmi_panic_timeout_tb)
+		watchdog_panic(regs, smp_processor_id());
+}
+
+/*
+ * This is the soft-mask nmi handler (called by masked_interrupt() in
+ * low level exception entry).
+ */
+static DEFINE_PER_CPU(u64, soft_nmi_last_tb);
+
+/*
+ * Soft nmi interrupts only occur when linux irqs are disabled (but
+ * powerpc hardware irqs are enabled), so they can not be relied
+ * upon to be timely or delivered at all. Their only real use is
+ * the nmi watchdog.
+ */
+void soft_nmi_interrupt(struct pt_regs *regs)
+{
+	u64 tb;
+
+	if (!cpumask_test_cpu(smp_processor_id(), &nmi_cpus_enabled))
+		return;
+
+	tb = get_tb();
+	if (tb - __this_cpu_read(soft_nmi_last_tb) < nmi_timer_period_tb)
+		return;
+	__this_cpu_write(soft_nmi_last_tb, tb);
+
+	nmi_enter();
+	watchdog_nmi_interrupt(regs, tb);
+	nmi_exit();
+}
+
+static void start_nmi_on_cpu(int cpu)
+{
+	struct timer_list *t = per_cpu_ptr(&nmi_timer, cpu);
+	u64 *last_tb = per_cpu_ptr(&soft_nmi_last_tb, cpu);
+
+	*last_tb = get_tb();
+	cpumask_set_cpu(cpu, &nmi_cpus_enabled);
+
+	setup_pinned_timer(t, nmi_timer_fn, cpu);
+	t->expires = round_jiffies(jiffies + nmi_timer_period * HZ);
+	add_timer_on(t, cpu);
+}
+
+static void stop_nmi_on_cpu(int cpu)
+{
+	struct timer_list *t = per_cpu_ptr(&nmi_timer, cpu);
+
+	del_timer_sync(t);
+
+	cpumask_clear_cpu(cpu, &nmi_cpus_enabled);
+#ifdef CONFIG_SMP
+	nmi_global_clear_cpu_pending(cpu, get_tb());
+#endif
+}
+
+
+static int nmi_startup(void)
+{
+	if (num_online_cpus() != 1) {
+		WARN_ON(1);
+		return -EINVAL;
+	}
+
+#ifdef CONFIG_SMP
+	nmi_global_last_reset_tb = get_tb();
+#endif
+	start_nmi_on_cpu(smp_processor_id());
+#ifdef CONFIG_SMP
+	cpumask_copy(&nmi_global_cpus_pending, &nmi_cpus_enabled);
+#endif
+
+	pr_info("NMI Watchdog started on cpus %*pbl\n",
+			cpumask_pr_args(&nmi_cpus_enabled));
+
+	return 0;
+}
+
+static int nmi_cpu_notify(struct notifier_block *self,
+				 unsigned long action, void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_ONLINE:
+	case CPU_DOWN_FAILED:
+		start_nmi_on_cpu(cpu);
+		pr_info("NMI Watchdog running on cpus %*pbl\n",
+				cpumask_pr_args(&nmi_cpus_enabled));
+		break;
+	case CPU_DOWN_PREPARE:
+		stop_nmi_on_cpu(cpu);
+		pr_info("NMI Watchdog running on cpus %*pbl\n",
+				cpumask_pr_args(&nmi_cpus_enabled));
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static int nmi_shutdown(struct notifier_block *nb, unsigned long cmd, void *p)
+{
+	stop_nmi_on_cpu(smp_processor_id());
+	return 0;
+}
+
+static struct notifier_block nmi_reboot_notifier = {
+	.notifier_call = nmi_shutdown,
+};
+
+void __init powerpc_nmi_init(void)
+{
+	int err;
+
+	if (!nmi_panic_timeout)
+		return;
+
+	nmi_panic_timeout_tb = nmi_panic_timeout * ppc_tb_freq;
+	nmi_timer_period_tb = nmi_timer_period * ppc_tb_freq;
+#ifdef CONFIG_SMP
+	nmi_global_panic_timeout_tb = nmi_global_panic_timeout * ppc_tb_freq;
+#endif
+
+	err = register_reboot_notifier(&nmi_reboot_notifier);
+	if (err)
+		return;
+
+	err = nmi_startup();
+	if (err) {
+		unregister_reboot_notifier(&nmi_reboot_notifier);
+		return;
+	}
+
+	cpu_notifier(nmi_cpu_notify, 0);
+}
+
+static int __init setup_nmi_watchdog(char *str)
+{
+	if (!strncmp(str, "0", 1))
+		nmi_panic_timeout = 0;
+	return 0;
+}
+__setup("nmi_watchdog=", setup_nmi_watchdog);
+
+static int __init nowatchdog_setup(char *str)
+{
+	nmi_panic_timeout = 0;
+	return 1;
+}
+__setup("nowatchdog", nowatchdog_setup);
+
diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c
index 8d586cf..6a403a9 100644
--- a/arch/powerpc/kernel/setup_64.c
+++ b/arch/powerpc/kernel/setup_64.c
@@ -652,21 +652,3 @@ struct ppc_pci_io ppc_pci_io;
 EXPORT_SYMBOL(ppc_pci_io);
 #endif
 
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-u64 hw_nmi_get_sample_period(int watchdog_thresh)
-{
-	return ppc_proc_freq * watchdog_thresh;
-}
-
-/*
- * The hardlockup detector breaks PMU event based branches and is likely
- * to get false positives in KVM guests, so disable it by default.
- */
-static int __init disable_hardlockup_detector(void)
-{
-	hardlockup_detector_disable();
-
-	return 0;
-}
-early_initcall(disable_hardlockup_detector);
-#endif
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index bc3f7d0..5aec1ad 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -52,6 +52,7 @@
 #include <linux/jiffies.h>
 #include <linux/posix-timers.h>
 #include <linux/irq.h>
+#include <linux/nmi.h>
 #include <linux/delay.h>
 #include <linux/irq_work.h>
 #include <linux/clk-provider.h>
@@ -66,6 +67,7 @@
 #include <asm/machdep.h>
 #include <asm/uaccess.h>
 #include <asm/time.h>
+#include <asm/nmi.h>
 #include <asm/prom.h>
 #include <asm/irq.h>
 #include <asm/div64.h>
diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c
index a9973bb..ae3a4d5 100644
--- a/arch/sparc/kernel/nmi.c
+++ b/arch/sparc/kernel/nmi.c
@@ -244,7 +244,7 @@ static struct notifier_block nmi_reboot_notifier = {
 	.notifier_call = nmi_shutdown,
 };
 
-int __init nmi_init(void)
+void __init nmi_init(void)
 {
 	int err;
 
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index a78c35c..9552ab0 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -18,6 +18,9 @@
 #include <asm/nmi.h>
 extern void touch_nmi_watchdog(void);
 #else
+static inline void nmi_init(void)
+{
+}
 static inline void touch_nmi_watchdog(void)
 {
 	touch_softlockup_watchdog();
@@ -30,6 +33,17 @@ extern void hardlockup_detector_disable(void);
 static inline void hardlockup_detector_disable(void) {}
 #endif
 
+#ifdef arch_nmi_init
+static inline void nmi_init(void)
+{
+	arch_nmi_init();
+}
+#else
+static inline void nmi_init(void)
+{
+}
+#endif
+
 /*
  * Create trigger_all_cpu_backtrace() out of the arch-provided
  * base function. Return whether such support was available,
diff --git a/init/main.c b/init/main.c
index 2858be7..36fd7e7 100644
--- a/init/main.c
+++ b/init/main.c
@@ -33,6 +33,7 @@
 #include <linux/start_kernel.h>
 #include <linux/security.h>
 #include <linux/smp.h>
+#include <linux/nmi.h>
 #include <linux/profile.h>
 #include <linux/rcupdate.h>
 #include <linux/moduleparam.h>
@@ -579,6 +580,8 @@ asmlinkage __visible void __init start_kernel(void)
 
 	kmem_cache_init_late();
 
+	nmi_init();
+
 	/*
 	 * HACK ALERT! This is early. We're enabling the console before
 	 * we've done PCI setups etc, and console_init() must be aware of
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 9acb29f..309a588 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -142,6 +142,13 @@ static int __init hardlockup_panic_setup(char *str)
 	return 1;
 }
 __setup("nmi_watchdog=", hardlockup_panic_setup);
+
+static int __init nowatchdog_setup(char *str)
+{
+	watchdog_enabled = 0;
+	return 1;
+}
+__setup("nowatchdog", nowatchdog_setup);
 #endif
 
 unsigned int __read_mostly softlockup_panic =
@@ -155,13 +162,6 @@ static int __init softlockup_panic_setup(char *str)
 }
 __setup("softlockup_panic=", softlockup_panic_setup);
 
-static int __init nowatchdog_setup(char *str)
-{
-	watchdog_enabled = 0;
-	return 1;
-}
-__setup("nowatchdog", nowatchdog_setup);
-
 static int __init nosoftlockup_setup(char *str)
 {
 	watchdog_enabled &= ~SOFT_WATCHDOG_ENABLED;
@@ -177,6 +177,7 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str)
 	return 1;
 }
 __setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
 static int __init hardlockup_all_cpu_backtrace_setup(char *str)
 {
 	sysctl_hardlockup_all_cpu_backtrace =
@@ -185,6 +186,7 @@ static int __init hardlockup_all_cpu_backtrace_setup(char *str)
 }
 __setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
 #endif
+#endif
 
 /*
  * Hard-lockup warnings should be triggered after just a few seconds. Soft-
-- 
2.10.2


