[PATCH 5/5] powerpc/64s: Fix unrecoverable MCE calling async handler from NMI

Cédric Le Goater clg at kaod.org
Tue Oct 5 02:28:52 AEDT 2021


On 10/4/21 16:56, Nicholas Piggin wrote:
> The machine check handler is not considered NMI on 64s. The early
> handler is the true NMI handler, and then it schedules the
> machine_check_exception handler to run when interrupts are enabled.
> 
> This works fine except the case of an unrecoverable MCE, where the true
> NMI is taken when MSR[RI] is clear, it can not recover, so it calls
> machine_check_exception directly so something might be done about it.
> 
> Calling an async handler from NMI context can result in irq state and
> other things getting corrupted. This can also trigger the BUG at
>    arch/powerpc/include/asm/interrupt.h:168
>    BUG_ON(!arch_irq_disabled_regs(regs) && !(regs->msr & MSR_EE));

I was hitting this problem when I rebooted a P8 tuleta system and
this series fixes it.

Tested-by: Cédric Le Goater <clg at kaod.org>

Thanks,

C.
  
> Fix this by making an _async version of the handler which is called
> in the normal case, and a NMI version that is called for unrecoverable
> interrupts.
> 
> Fixes: 2b43dd7653cc ("powerpc/64: enable MSR[EE] in irq replay pt_regs")
> Signed-off-by: Nicholas Piggin <npiggin at gmail.com>> ---
>   arch/powerpc/include/asm/interrupt.h |  5 ++---
>   arch/powerpc/kernel/exceptions-64s.S |  8 +++++--
>   arch/powerpc/kernel/traps.c          | 31 ++++++++++++++++------------
>   3 files changed, 26 insertions(+), 18 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
> index b894b7169706..a1d238255f07 100644
> --- a/arch/powerpc/include/asm/interrupt.h
> +++ b/arch/powerpc/include/asm/interrupt.h
> @@ -528,10 +528,9 @@ static __always_inline long ____##func(struct pt_regs *regs)
>   /* kernel/traps.c */
>   DECLARE_INTERRUPT_HANDLER_NMI(system_reset_exception);
>   #ifdef CONFIG_PPC_BOOK3S_64
> -DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception);
> -#else
> -DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception);
> +DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async);
>   #endif
> +DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception);
>   DECLARE_INTERRUPT_HANDLER(SMIException);
>   DECLARE_INTERRUPT_HANDLER(handle_hmi_exception);
>   DECLARE_INTERRUPT_HANDLER(unknown_exception);
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index 024d9231f88c..eaf1f72131a1 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -1243,7 +1243,7 @@ EXC_COMMON_BEGIN(machine_check_common)
>   	li	r10,MSR_RI
>   	mtmsrd 	r10,1
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
> -	bl	machine_check_exception
> +	bl	machine_check_exception_async
>   	b	interrupt_return_srr
>   
>   
> @@ -1303,7 +1303,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
>   	subi	r12,r12,1
>   	sth	r12,PACA_IN_MCE(r13)
>   
> -	/* Invoke machine_check_exception to print MCE event and panic. */
> +	/*
> +	 * Invoke machine_check_exception to print MCE event and panic.
> +	 * This is the NMI version of the handler because we are called from
> +	 * the early handler which is a true NMI.
> +	 */
>   	addi	r3,r1,STACK_FRAME_OVERHEAD
>   	bl	machine_check_exception
>   
> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> index e453b666613b..11741703d26e 100644
> --- a/arch/powerpc/kernel/traps.c
> +++ b/arch/powerpc/kernel/traps.c
> @@ -796,24 +796,22 @@ void die_mce(const char *str, struct pt_regs *regs, long err)
>   	 * do_exit() checks for in_interrupt() and panics in that case, so
>   	 * exit the irq/nmi before calling die.
>   	 */
> -	if (IS_ENABLED(CONFIG_PPC_BOOK3S_64))
> -		irq_exit();
> -	else
> +	if (in_nmi())
>   		nmi_exit();
> +	else
> +		irq_exit();
>   	die(str, regs, err);
>   }
>   
>   /*
> - * BOOK3S_64 does not call this handler as a non-maskable interrupt
> + * BOOK3S_64 does not usually call this handler as a non-maskable interrupt
>    * (it uses its own early real-mode handler to handle the MCE proper
>    * and then raises irq_work to call this handler when interrupts are
> - * enabled).
> + * enabled). The only time when this is not true is if the early handler
> + * is unrecoverable, then it does call this directly to try to get a
> + * message out.
>    */
> -#ifdef CONFIG_PPC_BOOK3S_64
> -DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception)
> -#else
> -DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
> -#endif
> +static void __machine_check_exception(struct pt_regs *regs)
>   {
>   	int recover = 0;
>   
> @@ -847,12 +845,19 @@ DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
>   	/* Must die if the interrupt is not recoverable */
>   	if (regs_is_unrecoverable(regs))
>   		die_mce("Unrecoverable Machine check", regs, SIGBUS);
> +}
>   
>   #ifdef CONFIG_PPC_BOOK3S_64
> -	return;
> -#else
> -	return 0;
> +DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async)
> +{
> +	__machine_check_exception(regs);
> +}
>   #endif
> +DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
> +{
> +	__machine_check_exception(regs);
> +
> +	return 0;
>   }
>   
>   DEFINE_INTERRUPT_HANDLER(SMIException) /* async? */
> 



More information about the Linuxppc-dev mailing list