[PATCH 5/5] powerpc/64s: Fix unrecoverable MCE calling async handler from NMI
Cédric Le Goater
clg at kaod.org
Tue Oct 5 02:28:52 AEDT 2021
On 10/4/21 16:56, Nicholas Piggin wrote:
> The machine check handler is not considered NMI on 64s. The early
> handler is the true NMI handler, and then it schedules the
> machine_check_exception handler to run when interrupts are enabled.
>
> This works fine except the case of an unrecoverable MCE, where the true
> NMI is taken when MSR[RI] is clear, it can not recover, so it calls
> machine_check_exception directly so something might be done about it.
>
> Calling an async handler from NMI context can result in irq state and
> other things getting corrupted. This can also trigger the BUG at
> arch/powerpc/include/asm/interrupt.h:168
> BUG_ON(!arch_irq_disabled_regs(regs) && !(regs->msr & MSR_EE));
I was hitting this problem when I rebooted a P8 tuleta system and
this series fixes it.
Tested-by: Cédric Le Goater <clg at kaod.org>
Thanks,
C.
> Fix this by making an _async version of the handler which is called
> in the normal case, and a NMI version that is called for unrecoverable
> interrupts.
>
> Fixes: 2b43dd7653cc ("powerpc/64: enable MSR[EE] in irq replay pt_regs")
> Signed-off-by: Nicholas Piggin <npiggin at gmail.com>> ---
> arch/powerpc/include/asm/interrupt.h | 5 ++---
> arch/powerpc/kernel/exceptions-64s.S | 8 +++++--
> arch/powerpc/kernel/traps.c | 31 ++++++++++++++++------------
> 3 files changed, 26 insertions(+), 18 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/interrupt.h b/arch/powerpc/include/asm/interrupt.h
> index b894b7169706..a1d238255f07 100644
> --- a/arch/powerpc/include/asm/interrupt.h
> +++ b/arch/powerpc/include/asm/interrupt.h
> @@ -528,10 +528,9 @@ static __always_inline long ____##func(struct pt_regs *regs)
> /* kernel/traps.c */
> DECLARE_INTERRUPT_HANDLER_NMI(system_reset_exception);
> #ifdef CONFIG_PPC_BOOK3S_64
> -DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception);
> -#else
> -DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception);
> +DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async);
> #endif
> +DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception);
> DECLARE_INTERRUPT_HANDLER(SMIException);
> DECLARE_INTERRUPT_HANDLER(handle_hmi_exception);
> DECLARE_INTERRUPT_HANDLER(unknown_exception);
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index 024d9231f88c..eaf1f72131a1 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -1243,7 +1243,7 @@ EXC_COMMON_BEGIN(machine_check_common)
> li r10,MSR_RI
> mtmsrd r10,1
> addi r3,r1,STACK_FRAME_OVERHEAD
> - bl machine_check_exception
> + bl machine_check_exception_async
> b interrupt_return_srr
>
>
> @@ -1303,7 +1303,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
> subi r12,r12,1
> sth r12,PACA_IN_MCE(r13)
>
> - /* Invoke machine_check_exception to print MCE event and panic. */
> + /*
> + * Invoke machine_check_exception to print MCE event and panic.
> + * This is the NMI version of the handler because we are called from
> + * the early handler which is a true NMI.
> + */
> addi r3,r1,STACK_FRAME_OVERHEAD
> bl machine_check_exception
>
> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> index e453b666613b..11741703d26e 100644
> --- a/arch/powerpc/kernel/traps.c
> +++ b/arch/powerpc/kernel/traps.c
> @@ -796,24 +796,22 @@ void die_mce(const char *str, struct pt_regs *regs, long err)
> * do_exit() checks for in_interrupt() and panics in that case, so
> * exit the irq/nmi before calling die.
> */
> - if (IS_ENABLED(CONFIG_PPC_BOOK3S_64))
> - irq_exit();
> - else
> + if (in_nmi())
> nmi_exit();
> + else
> + irq_exit();
> die(str, regs, err);
> }
>
> /*
> - * BOOK3S_64 does not call this handler as a non-maskable interrupt
> + * BOOK3S_64 does not usually call this handler as a non-maskable interrupt
> * (it uses its own early real-mode handler to handle the MCE proper
> * and then raises irq_work to call this handler when interrupts are
> - * enabled).
> + * enabled). The only time when this is not true is if the early handler
> + * is unrecoverable, then it does call this directly to try to get a
> + * message out.
> */
> -#ifdef CONFIG_PPC_BOOK3S_64
> -DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception)
> -#else
> -DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
> -#endif
> +static void __machine_check_exception(struct pt_regs *regs)
> {
> int recover = 0;
>
> @@ -847,12 +845,19 @@ DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
> /* Must die if the interrupt is not recoverable */
> if (regs_is_unrecoverable(regs))
> die_mce("Unrecoverable Machine check", regs, SIGBUS);
> +}
>
> #ifdef CONFIG_PPC_BOOK3S_64
> - return;
> -#else
> - return 0;
> +DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async)
> +{
> + __machine_check_exception(regs);
> +}
> #endif
> +DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
> +{
> + __machine_check_exception(regs);
> +
> + return 0;
> }
>
> DEFINE_INTERRUPT_HANDLER(SMIException) /* async? */
>
More information about the Linuxppc-dev
mailing list