[PATCH 2/2] powerpc/watchdog: Avoid holding wd_smp_lock over printk and smp_send_nmi_ipi

Laurent Dufour ldufour at linux.ibm.com
Fri Oct 29 23:09:43 AEDT 2021


On 29/10/2021 10:39, Nicholas Piggin wrote:
> There is a deadlock with the console_owner lock and the wd_smp_lock:
> 
> CPU x takes the console_owner lock
> CPU y takes a watchdog timer interrupt and takes __wd_smp_lock
> CPU x takes a watchdog timer interrupt and spins on __wd_smp_lock
> CPU y detects a deadlock, tries to print something and spins on console_owner
> -> deadlock
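
As a minimal illustration of that circular wait, here is a userspace
analogue (a sketch only; in the kernel, console_owner is printk's
hand-over spinloop and wd_smp_lock is a bit spinlock, not pthread
mutexes):

        /* build with: cc -pthread deadlock.c */
        #include <pthread.h>

        static pthread_mutex_t console_owner = PTHREAD_MUTEX_INITIALIZER;
        static pthread_mutex_t wd_smp_lock = PTHREAD_MUTEX_INITIALIZER;

        static void *cpu_x(void *unused)
        {
                pthread_mutex_lock(&console_owner);     /* CPU x is printing */
                /* watchdog interrupt arrives while console_owner is held */
                pthread_mutex_lock(&wd_smp_lock);       /* blocks: CPU y holds it */
                pthread_mutex_unlock(&wd_smp_lock);
                pthread_mutex_unlock(&console_owner);
                return NULL;
        }

        static void *cpu_y(void *unused)
        {
                pthread_mutex_lock(&wd_smp_lock);       /* watchdog takes its lock */
                /* lockup detected, printing needs the console */
                pthread_mutex_lock(&console_owner);     /* blocks: CPU x holds it */
                pthread_mutex_unlock(&console_owner);
                pthread_mutex_unlock(&wd_smp_lock);
                return NULL;
        }

        int main(void)
        {
                pthread_t x, y;

                pthread_create(&x, NULL, cpu_x, NULL);
                pthread_create(&y, NULL, cpu_y, NULL);
                pthread_join(x, NULL);  /* hangs when the interleaving above hits */
                pthread_join(y, NULL);
                return 0;
        }
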
> 
> Change the watchdog locking scheme so wd_smp_lock protects the watchdog
> internal data, but "reporting" (printing, issuing NMI IPIs, taking any
> action outside of watchdog) uses a non-waiting exclusion. If a CPU detects
> a problem but can not take the reporting lock, it just returns because
> something else is already reporting. It will try again at some point.
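
Concretely, every reporting path in the patch ends up with this shape
(a distilled sketch of the hunks below, not new code):

        wd_smp_lock(&flags);
        /* ... inspect watchdog state under the lock ... */
        if (!wd_try_report()) {
                wd_smp_unlock(&flags);
                return;                 /* another CPU is already reporting */
        }
        /* update state (e.g. set_cpu_stuck()) while still locked */
        wd_smp_unlock(&flags);

        /* expensive work with no lock held: printk, NMI IPIs, panic */

        wd_end_reporting();
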
> 
> Typically the usefulness of a hard lockup watchdog report is not hurt
> by failing to spew a large amount of data as quickly as possible, but
> by messages getting garbled.
> 
> Laurent debugged this and found the deadlock. This patch is based on
> his general approach of avoiding expensive operations while holding
> the lock, with the addition of the reporting exclusion.
> 
> Signed-off-by: Laurent Dufour <ldufour at linux.ibm.com>
> [np: rework to add reporting exclusion, update changelog]
> Signed-off-by: Nicholas Piggin <npiggin at gmail.com>
> ---
>   arch/powerpc/kernel/watchdog.c | 76 ++++++++++++++++++++++++++--------
>   1 file changed, 59 insertions(+), 17 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
> index 4bb7c8e371a2..69a475aa0f44 100644
> --- a/arch/powerpc/kernel/watchdog.c
> +++ b/arch/powerpc/kernel/watchdog.c
> @@ -85,10 +85,32 @@ static DEFINE_PER_CPU(u64, wd_timer_tb);
>   
>   /* SMP checker bits */
>   static unsigned long __wd_smp_lock;
> +static unsigned long __wd_reporting;
>   static cpumask_t wd_smp_cpus_pending;
>   static cpumask_t wd_smp_cpus_stuck;
>   static u64 wd_smp_last_reset_tb;
>   
> +/*
> + * Try to take the exclusive watchdog action / NMI IPI / printing lock.
> + * wd_smp_lock must be held. If this fails, we should return and wait
> + * for the watchdog to kick in again (or another CPU to trigger it).
> + */
> +static bool wd_try_report(void)
> +{
> +	if (__wd_reporting)
> +		return false;
> +	__wd_reporting = 1;
> +	return true;
> +}
> +
> +/* End printing after successful wd_try_report. wd_smp_lock not required. */
> +static void wd_end_reporting(void)
> +{
> +	smp_mb(); /* End printing "critical section" */
> +	WARN_ON_ONCE(__wd_reporting == 0);
> +	WRITE_ONCE(__wd_reporting, 0);
> +}
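
Worth noting: the plain load and store in wd_try_report() are safe only
because every caller holds wd_smp_lock; wd_end_reporting() runs
unlocked, hence the smp_mb()/WRITE_ONCE() pair to publish the release.
If the exclusion ever had to be taken without wd_smp_lock held, an
atomic bitop would be needed instead, e.g. (hypothetical variant, not
what this patch does):

        static bool wd_try_report(void)
        {
                /* atomic RMW: false if reporting was already claimed */
                return !test_and_set_bit(0, &__wd_reporting);
        }

        static void wd_end_reporting(void)
        {
                smp_mb__before_atomic();        /* order output before release */
                clear_bit(0, &__wd_reporting);
        }
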
> +
>   static inline void wd_smp_lock(unsigned long *flags)
>   {
>   	/*
> @@ -131,10 +153,10 @@ static void wd_lockup_ipi(struct pt_regs *regs)
>   	/* Do not panic from here because that can recurse into NMI IPI layer */
>   }
>   
> -static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
> +static void set_cpu_stuck(int cpu, u64 tb)
>   {
> -	cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
> -	cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);
> +	cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
> +	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
>   	if (cpumask_empty(&wd_smp_cpus_pending)) {
>   		wd_smp_last_reset_tb = tb;
>   		cpumask_andnot(&wd_smp_cpus_pending,
> @@ -142,10 +164,6 @@ static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
>   				&wd_smp_cpus_stuck);
>   	}
>   }
> -static void set_cpu_stuck(int cpu, u64 tb)
> -{
> -	set_cpumask_stuck(cpumask_of(cpu), tb);
> -}
>   
>   static void watchdog_smp_panic(int cpu, u64 tb)
>   {
> @@ -160,6 +178,9 @@ static void watchdog_smp_panic(int cpu, u64 tb)
>   		goto out;
>   	if (cpumask_weight(&wd_smp_cpus_pending) == 0)
>   		goto out;
> +	if (!wd_try_report())
> +		goto out;
> +	wd_smp_unlock(&flags);
>   
>   	pr_emerg("CPU %d detected hard LOCKUP on other CPUs %*pbl\n",
>   		 cpu, cpumask_pr_args(&wd_smp_cpus_pending));
> @@ -172,24 +193,32 @@ static void watchdog_smp_panic(int cpu, u64 tb)
>   		 * Try to trigger the stuck CPUs, unless we are going to
>   		 * get a backtrace on all of them anyway.
>   		 */
> -		for_each_cpu(c, &wd_smp_cpus_pending) {
> +		for_each_online_cpu(c) {
>   			if (c == cpu)
>   				continue;
> +			if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending))
                                               ^ c
cpu is the reporting CPU, c is the target here.

> +				continue;
> +			wd_smp_lock(&flags);
> +			if (!cpumask_test_cpu(cpu, &wd_smp_cpus_pending)) {
                                               ^ again c
> +				wd_smp_unlock(&flags);
> +				continue;
> +			}
> +			/* Take the stuck CPU out of the watch group */
> +			set_cpu_stuck(cpu, tb);
                                       ^ c
> +			wd_smp_unlock(&flags);
> +
>   			smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
>   		}
>   	}
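
With the three cpu -> c fixes above applied, the loop body would read
(a sketch of the intended result, not a replacement patch):

        for_each_online_cpu(c) {
                if (c == cpu)
                        continue;
                /* cheap racy check to skip CPUs that are not pending */
                if (!cpumask_test_cpu(c, &wd_smp_cpus_pending))
                        continue;
                wd_smp_lock(&flags);
                /* re-check under the lock before acting */
                if (!cpumask_test_cpu(c, &wd_smp_cpus_pending)) {
                        wd_smp_unlock(&flags);
                        continue;
                }
                /* Take the stuck CPU out of the watch group */
                set_cpu_stuck(c, tb);
                wd_smp_unlock(&flags);

                smp_send_nmi_ipi(c, wd_lockup_ipi, 1000000);
        }
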
>   
> -	/* Take the stuck CPUs out of the watch group */
> -	set_cpumask_stuck(&wd_smp_cpus_pending, tb);
> -
> -	wd_smp_unlock(&flags);
> -
>   	if (sysctl_hardlockup_all_cpu_backtrace)
>   		trigger_allbutself_cpu_backtrace();
>   
>   	if (hardlockup_panic)
>   		nmi_panic(NULL, "Hard LOCKUP");
>   
> +	wd_end_reporting();
> +
>   	return;
>   
>   out:
> @@ -203,8 +232,6 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
>   			struct pt_regs *regs = get_irq_regs();
>   			unsigned long flags;
>   
> -			wd_smp_lock(&flags);
> -
>   			pr_emerg("CPU %d became unstuck TB:%lld\n",
>   				 cpu, tb);
>   			print_irqtrace_events(current);
> @@ -213,6 +240,7 @@ static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
>   			else
>   				dump_stack();
>   
> +			wd_smp_lock(&flags);
>   			cpumask_clear_cpu(cpu, &wd_smp_cpus_stuck);
>   			wd_smp_unlock(&flags);
>   		} else {
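
Same theme in this hunk: the "became unstuck" message and the stack
dump are printed first, and wd_smp_lock is taken only for the brief
cpumask_clear_cpu() update.
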
> @@ -291,8 +319,17 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
>   			wd_smp_unlock(&flags);
>   			return 0;
>   		}
> +		if (!wd_try_report()) {
> +			wd_smp_unlock(&flags);
> +			/* Couldn't report, try again in 100ms */
> +			mtspr(SPRN_DEC, 100 * tb_ticks_per_usec * 1000);
> +			return 0;
> +		}
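
(For the retry delay: SPRN_DEC counts timebase ticks, so
100 * tb_ticks_per_usec * 1000 is 100ms worth of ticks; on the usual
512MHz timebase, tb_ticks_per_usec = 512, that is 51,200,000 ticks.)
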
> +
>   		set_cpu_stuck(cpu, tb);
>   
> +		wd_smp_unlock(&flags);
> +
>   		pr_emerg("CPU %d self-detected hard LOCKUP @ %pS\n",
>   			 cpu, (void *)regs->nip);
>   		pr_emerg("CPU %d TB:%lld, last heartbeat TB:%lld (%lldms ago)\n",
> @@ -302,14 +339,19 @@ DEFINE_INTERRUPT_HANDLER_NMI(soft_nmi_interrupt)
>   		print_irqtrace_events(current);
>   		show_regs(regs);
>   
> -		wd_smp_unlock(&flags);
> -
>   		if (sysctl_hardlockup_all_cpu_backtrace)
>   			trigger_allbutself_cpu_backtrace();
>   
>   		if (hardlockup_panic)
>   			nmi_panic(regs, "Hard LOCKUP");
> +
> +		wd_end_reporting();
>   	}
> +	/*
> +	 * We are okay to change DEC in soft_nmi_interrupt because the masked
> +	 * handler has marked a DEC as pending, so the timer interrupt will be
> +	 * replayed as soon as local irqs are enabled again.
> +	 */
>   	if (wd_panic_timeout_tb < 0x7fffffff)
>   		mtspr(SPRN_DEC, wd_panic_timeout_tb);
>   
> 
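
As an aside, the 0x7fffffff guard on the final mtspr matches the
decrementer being a signed 32-bit register on these implementations:
a value with the top bit set would be treated as negative and fire
immediately, so larger timeouts simply leave DEC alone.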


