[PATCH v2 3/3] powerpc: machine check interrupt is a non-maskable interrupt

Christophe Leroy christophe.leroy at c-s.fr
Tue Oct 9 20:36:18 AEDT 2018



On 10/09/2018 05:30 AM, Nicholas Piggin wrote:
> On Tue, 9 Oct 2018 06:46:30 +0200
> Christophe LEROY <christophe.leroy at c-s.fr> wrote:
> 
>> Le 09/10/2018 à 06:32, Nicholas Piggin a écrit :
>>> On Mon, 8 Oct 2018 17:39:11 +0200
>>> Christophe LEROY <christophe.leroy at c-s.fr> wrote:
>>>    
>>>> Hi Nick,
>>>>
>>>> Le 19/07/2017 à 08:59, Nicholas Piggin a écrit :
>>>>> Use nmi_enter similarly to system reset interrupts. This uses the NMI
>>>>> printk buffers and turns off various debugging facilities, which
>>>>> helps avoid tripping on ourselves or other CPUs.
>>>>>
>>>>> Signed-off-by: Nicholas Piggin <npiggin at gmail.com>
>>>>> ---
>>>>>     arch/powerpc/kernel/traps.c | 9 ++++++---
>>>>>     1 file changed, 6 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
>>>>> index 2849c4f50324..6d31f9d7c333 100644
>>>>> --- a/arch/powerpc/kernel/traps.c
>>>>> +++ b/arch/powerpc/kernel/traps.c
>>>>> @@ -789,8 +789,10 @@ int machine_check_generic(struct pt_regs *regs)
>>>>>     
>>>>>     void machine_check_exception(struct pt_regs *regs)
>>>>>     {
>>>>> -	enum ctx_state prev_state = exception_enter();
>>>>>     	int recover = 0;
>>>>> +	bool nested = in_nmi();
>>>>> +	if (!nested)
>>>>> +		nmi_enter();
>>>>
>>>> This alters preempt_count, so when die() is called,
>>>> in_interrupt() returns true although the trap didn't happen in
>>>> interrupt, so oops_end() panics for "fatal exception in interrupt"
>>>> instead of gently sending SIGBUS to the faulting app.
>>>
>>> Thanks for tracking that down.
>>>    
>>>> Any idea on how to fix this ?
>>>
>>> I would say we have to deliver the sigbus by hand.
>>>
>>>       if ((user_mode(regs)))
>>>           _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
>>>       else
>>>           die("Machine check", regs, SIGBUS);
>>>    
>>
>> And what about all the other things done by 'die()' ?
>>
>> And what if it is a kernel thread ?
>>
>> In one of my boards, I have a kernel thread regularly checking the HW,
>> and if it gets a machine check I expect it to gently stop and the die
>> notification to be delivered to all registered notifiers.
>>
>> Before this patch, it was working well.
> 
> I guess the alternative is we could check regs->trap for machine
> check in the die test. Complication is having to account for MCE
> in an interrupt handler.
> 
>         if (in_interrupt()) {
>                  if (!IS_MCHECK_EXC(regs) || (irq_count() - (NMI_OFFSET + HARDIRQ_OFFSET)))
>                      panic("Fatal exception in interrupt");
>         }
> 
> Something like that might work for you? We need a ppc64 macro for the
> MCE, and can probably add something like in_nmi_from_interrupt() for
> the second part of the test.

I don't know. I'm away from home on a business trip, so I won't be able to
test anything before next week. However, it looks more or less like a
hack, doesn't it?

What about the following ?

diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index fd58749b4d6b..1f09033a5103 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -208,7 +208,7 @@ static unsigned long oops_begin(struct pt_regs *regs)
  NOKPROBE_SYMBOL(oops_begin);

  static void oops_end(unsigned long flags, struct pt_regs *regs,
-			       int signr)
+		     int signr, bool is_in_interrupt)
  {
  	bust_spinlocks(0);
  	add_taint(TAINT_DIE, LOCKDEP_NOW_UNRELIABLE);
@@ -247,7 +247,7 @@ static void oops_end(unsigned long flags, struct 
pt_regs *regs,
  		mdelay(MSEC_PER_SEC);
  	}

-	if (in_interrupt())
+	if (is_in_interrupt)
  		panic("Fatal exception in interrupt");
  	if (panic_on_oops)
  		panic("Fatal exception");
@@ -288,7 +288,7 @@ static int __die(const char *str, struct pt_regs 
*regs, long err)
  }
  NOKPROBE_SYMBOL(__die);

-void die(const char *str, struct pt_regs *regs, long err)
+static void nmi_die(const char *str, struct pt_regs *regs, long err, 
bool is_in_interrupt)
  {
  	unsigned long flags;

@@ -303,7 +303,13 @@ void die(const char *str, struct pt_regs *regs, 
long err)
  	flags = oops_begin(regs);
  	if (__die(str, regs, err))
  		err = 0;
-	oops_end(flags, regs, err);
+	oops_end(flags, regs, err, is_in_interrupt);
+}
+NOKPROBE_SYMBOL(nmi_die);
+
+void die(const char *str, struct pt_regs *regs, long err)
+{
+	nmi_die(str, regs, err, in_interrupt());
  }
  NOKPROBE_SYMBOL(die);

@@ -737,6 +743,7 @@ int machine_check_generic(struct pt_regs *regs)
  void machine_check_exception(struct pt_regs *regs)
  {
  	int recover = 0;
+	bool is_in_interrupt = in_interrupt();
  	bool nested = in_nmi();
  	if (!nested)
  		nmi_enter();
@@ -765,7 +772,7 @@ void machine_check_exception(struct pt_regs *regs)
  	if (check_io_access(regs))
  		goto bail;

-	die("Machine check", regs, SIGBUS);
+	nmi_die("Machine check", regs, SIGBUS, is_in_interrupt);

  	/* Must die if the interrupt is not recoverable */
  	if (!(regs->msr & MSR_RI))


Thanks
Christophe


More information about the Linuxppc-dev mailing list