[PATCH 2/2] powerpc/watchdog: Avoid holding wd_smp_lock over printk and smp_send_nmi_ipi
Laurent Dufour
ldufour at linux.ibm.com
Fri Oct 29 23:09:43 AEDT 2021
Le 29/10/2021 à 10:39, Nicholas Piggin a écrit :
> There is a deadlock with the console_owner lock and the wd_smp_lock:
>
> CPU x takes the console_owner lock
> CPU y takes a watchdog timer interrupt and takes __wd_smp_lock
> CPU x takes a watchdog timer interrupt and spins on __wd_smp_lock
> CPU y detects a deadlock, tries to print something and spins on console_owner
> -> deadlock
>
> Change the watchdog locking scheme so wd_smp_lock protects the watchdog
> internal data, but "reporting" (printing, issuing NMI IPIs, taking any
> action outside of watchdog) uses a non-waiting exclusion. If a CPU detects
> a problem but can not take the reporting lock, it just returns because
> something else is already reporting. It will try again at some point.
>
> Typically hard lockup watchdog report usefulness is not impacted due to
> failure to spewing a large enough amount of data in as short a time as
> possible, but by messages getting garbled.
>
> Laurent debugged this and found the deadlock, and this patch is based on
> his general approach to avoid expensive operations while holding the lock.
> With the addition of the reporting exclusion.
>
> Signed-off-by: Laurent Dufour <ldufour at linux.ibm.com>
> [np: rework to add reporting exclusion update changelog]
> Signed-off-by: Nicholas Piggin <npiggin at gmail.com>
> ---
> arch/powerpc/kernel/watchdog.c | 76 ++++++++++++++++++++++++++--------
> 1 file changed, 59 insertions(+), 17 deletions(-)
>
> diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
> index 4bb7c8e371a2..69a475aa0f44 100644
> --- a/arch/powerpc/kernel/watchdog.c
> +++ b/arch/powerpc/kernel/watchdog.c
> @@ -85,10 +85,32 @@ static DEFINE_PER_CPU(u64, wd_timer_tb);
>
> /* SMP checker bits */
> static unsigned long __wd_smp_lock;
> +static unsigned long __wd_reporting;
> static cpumask_t wd_smp_cpus_pending;
> static cpumask_t wd_smp_cpus_stuck;
> static u64 wd_smp_last_reset_tb;
>
> +/*
> + * Try to take the exclusive watchdog action / NMI IPI / printing lock.
> + * wd_smp_lock must be held. If this fails, we should return and wait
> + * for the watchdog to kick in again (or another CPU to trigger it).
> + */
> +static bool wd_try_report(void)
> +{
> + if (__wd_reporting)
> + return false;
> + __wd_reporting = 1;
> + return true;
> +}
> +
> +/* End printing after successful wd_try_report. wd_smp_lock not required. */
> +static void wd_end_reporting(void)
> +{
> + smp_mb(); /* End printing "critical section" */
> + WARN_ON_ONCE(__wd_reporting == 0);
> + WRITE_ONCE(__wd_reporting, 0);
> +}
> +
> static inline void wd_smp_lock(unsigned long *flags)
> {
> /*
> @@ -131,10 +153,10 @@ static void wd_lockup_ipi(struct pt_regs *regs)
> /* Do not panic from here because that can recurse into NMI IPI layer */
> }
>
> -static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
> +static void set_cpu_stuck(int cpu, u64 tb)
> {
> - cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
> - cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);
> + cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
> + cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
> if (cpumask_empty(&wd_smp_cpus_pending)) {
> wd_smp_last_reset_tb = tb;
> cpumask_andnot(&wd_smp_cpus_pending,
> @@ -142,10 +164,6 @@ static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
> &wd_smp_cpus_stuck);
> }
> }
> -static void set_cpu_stuck(int cpu, u64 tb)
> -{
> - set_cpumask_stuck(cpumask_of(cpu), tb);
> -}
>
> static void watchdog_smp_panic(int cpu, u64 tb)
> {
> @@ -160,6 +178,9 @@ static void watchdog_smp_panic(int cpu, u64 tb)
> goto out;
> if (cpumask_weight(&wd_smp_cpus_pending) == 0)
> goto out;
> + if (!wd_try_report())
> + goto out;
> + wd_smp_unlock(&flags);
>
> pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n",
> cpu, cpumask_pr_args(&wd_smp_cpus_pending));
> @@ -172,24 +193,32 @@ static void watchdog_smp_panic(int cpu, u64 tb)
> * Try to trigger the stuck CPUs, unless we are going to
> * get a backtrace on all of them anyway.
> */
> - for_each_cpu(c, &wd_smp_cpus_pending) {
> + for_each_online_cpu(c) {
> if (c == cpu)
> continue;
> + if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
^ c
cpu is the reporting CPU, c is the target here.
> + continue;
> + wd_smp_lock(&flags);
> + if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
^ again c
> + wd_smp_unlock(&flags);
> + continue;
> + }
> + /* Take the stuck CPU out of the watch group */
> + set_cpu_stuck(cpu, tb);
^ c
> + wd_smp_unlock(&flags);
> +
> smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
> }
> }
>
> - /* Take the stuck CPUs out of the watch group */
> - set_cpumask_stuck(&wd_smp_cpus_pending, tb);
> -
> - wd_smp_unlock(&flags);
> -
> if (sysctl_hardlockup_all_cpu_backtrace)
> trigger_allbutself_cpu_backtrace();
>
> if (hardlockup_panic)
> nmi_panic(NULL, "Hard LOCKUP");
>
> + wd_end_reporting();
> +
> return;
>
> out:
> @@ -203,8 +232,6 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
> struct pt_regs *regs = get_irq_regs();
> unsigned long flags;
>
> - wd_smp_lock(&flags);
> -
> pr_emerg("CPU %d became unstuck TB:%lld\n",
> cpu, tb);
> print_irqtrace_events(current);
> @@ -213,6 +240,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
> else
> dump_stack();
>
> + wd_smp_lock(&flags);
> cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
> wd_smp_unlock(&flags);
> } else {
> @@ -291,8 +319,17 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
> wd_smp_unlock(&flags);
> return 0;
> }
> + if (!wd_try_report()) {
> + wd_smp_unlock(&flags);
> + /* Couldn't report, try again in 100ms */
> + mtspr(SPRN_DEC, 100 * tb_ticks_per_usec * 1000);
> + return 0;
> + }
> +
> set_cpu_stuck(cpu, tb);
>
> + wd_smp_unlock(&flags);
> +
> pr_emerg("CPU %d self-detected hard LOCKUP @ %pS\n",
> cpu, (void *)regs->nip);
> pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
> @@ -302,14 +339,19 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
> print_irqtrace_events(current);
> show_regs(regs);
>
> - wd_smp_unlock(&flags);
> -
> if (sysctl_hardlockup_all_cpu_backtrace)
> trigger_allbutself_cpu_backtrace();
>
> if (hardlockup_panic)
> nmi_panic(regs, "Hard LOCKUP");
> +
> + wd_end_reporting();
> }
> + /*
> + * We are okay to change DEC in soft_nmi_interrupt because the masked
> + * handler has marked a DEC as pending, so the timer interrupt will be
> + * replayed as soon as local irqs are enabled again.
> + */
> if (wd_panic_timeout_tb < 0x7fffffff)
> mtspr(SPRN_DEC, wd_panic_timeout_tb);
>
>
More information about the Linuxppc-dev
mailing list