[PATCH v2 2/3] powerpc/powernv: machine check use kernel crash path

Mahesh Jagannath Salgaonkar mahesh at linux.vnet.ibm.com
Thu Jul 20 17:14:52 AEST 2017


On 07/19/2017 12:29 PM, Nicholas Piggin wrote:
> There are quite a few machine check exceptions that can be caused by
> kernel bugs. To make debugging easier, use the kernel crash path in
> cases of synchronous machine checks that occur in kernel mode, if that
> would not result in the machine going straight to panic or crash dump.
> 
> There is a downside here that die()ing the process in kernel mode can
> still leave the system unstable. panic_on_oops will always force the
> system to fail-stop, so systems where that behaviour is important will
> still do the right thing.
> 
> As a test, when triggering an i-side 0111b error (ifetch from foreign
> address) in kernel mode process context on POWER9, the kernel currently
> dies quickly like this:
> 
> Severe Machine check interrupt [Not recovered]
>   NIP [ffff000000000000]: 0xffff000000000000
>   Initiator: CPU
>   Error type: Real address [Instruction fetch (foreign)]
> [  127.426651616,0] OPAL: Reboot requested due to Platform error.
>     Effective[  127.426693712,3] OPAL: Reboot requested due to Platform error. address: ffff000000000000
> opal: Reboot type 1 not supported
> Kernel panic - not syncing: PowerNV Unrecovered Machine Check
> CPU: 56 PID: 4425 Comm: syscall Tainted: G   M            4.12.0-rc1-13857-ga4700a261072-dirty #35
> Call Trace:
> [  128.017988928,4] IPMI: BUG: Dropping ESEL on the floor due to buggy/mising code in OPAL for this BMCRebooting in 10 seconds..
> Trying to free IRQ 496 from IRQ context!
> 
> 
> After this patch, the process is killed and the kernel continues with
> this message, which gives enough information to identify the offending
> branch (i.e., with CFAR):
> 
> Severe Machine check interrupt [Not recovered]
>   NIP [ffff000000000000]: 0xffff000000000000
>   Initiator: CPU
>   Error type: Real address [Instruction fetch (foreign)]
>     Effective address: ffff000000000000
> Oops: Machine check, sig: 7 [#1]
> SMP NR_CPUS=2048
> NUMA
> PowerNV
> Modules linked in: iptable_mangle ipt_MASQUERADE nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 xt_tcpudp tun bridge stp llc kvm_hv kvm iptable_filter binfmt_misc vmx_crypto ip_tables x_tables autofs4 crc32c_vpmsum
> CPU: 22 PID: 4436 Comm: syscall Tainted: G   M            4.12.0-rc1-13857-ga4700a261072-dirty #36
> task: c000000932300000 task.stack: c000000932380000
> NIP: ffff000000000000 LR: 00000000217706a4 CTR: ffff000000000000
> REGS: c00000000fc8fd80 TRAP: 0200   Tainted: G   M             (4.12.0-rc1-13857-ga4700a261072-dirty)
> MSR: 90000000001c1003 <SF,HV,ME,RI,LE>
>   CR: 24000484  XER: 20000000
> CFAR: c000000000004c80 DAR: 0000000021770a90 DSISR: 0a000000 SOFTE: 1
> GPR00: 0000000000001ebe 00007fffce4818b0 0000000021797f00 0000000000000000
> GPR04: 00007fff8007ac24 0000000044000484 0000000000004000 00007fff801405e8
> GPR08: 900000000280f033 0000000024000484 0000000000000000 0000000000000030
> GPR12: 9000000000001003 00007fff801bc370 0000000000000000 0000000000000000
> GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> GPR24: 0000000000000000 0000000000000000 0000000000000000 0000000000000000
> GPR28: 00007fff801b0000 0000000000000000 00000000217707a0 00007fffce481918
> NIP [ffff000000000000] 0xffff000000000000
> LR [00000000217706a4] 0x217706a4
> Call Trace:
> Instruction dump:
> XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
> XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX
> ---[ end trace 32ae1dabb4f8dae6 ]---
> 
> Signed-off-by: Nicholas Piggin <npiggin at gmail.com>

Reviewed-by: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>

Thanks,
-Mahesh.

> ---
>  arch/powerpc/include/asm/bug.h        |  1 +
>  arch/powerpc/include/asm/fadump.h     |  2 ++
>  arch/powerpc/kernel/fadump.c          |  9 ++++++++-
>  arch/powerpc/kernel/traps.c           | 22 ++++++++++++++++++++++
>  arch/powerpc/platforms/powernv/opal.c | 32 ++++++++++++++++++++++++++------
>  5 files changed, 59 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h
> index 0151af6c2a50..9a918b3ca5ee 100644
> --- a/arch/powerpc/include/asm/bug.h
> +++ b/arch/powerpc/include/asm/bug.h
> @@ -133,6 +133,7 @@ extern int do_page_fault(struct pt_regs *, unsigned long, unsigned long);
>  extern void bad_page_fault(struct pt_regs *, unsigned long, int);
>  extern void _exception(int, struct pt_regs *, int, unsigned long);
>  extern void die(const char *, struct pt_regs *, long);
> +extern bool die_will_crash(void);
> 
>  #endif /* !__ASSEMBLY__ */
> 
> diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h
> index ce88bbe1d809..5a23010af600 100644
> --- a/arch/powerpc/include/asm/fadump.h
> +++ b/arch/powerpc/include/asm/fadump.h
> @@ -209,11 +209,13 @@ extern int early_init_dt_scan_fw_dump(unsigned long node,
>  extern int fadump_reserve_mem(void);
>  extern int setup_fadump(void);
>  extern int is_fadump_active(void);
> +extern int should_fadump_crash(void);
>  extern void crash_fadump(struct pt_regs *, const char *);
>  extern void fadump_cleanup(void);
> 
>  #else	/* CONFIG_FA_DUMP */
>  static inline int is_fadump_active(void) { return 0; }
> +static inline int should_fadump_crash(void) { return 0; }
>  static inline void crash_fadump(struct pt_regs *regs, const char *str) { }
>  #endif
>  #endif
> diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
> index da8830e49696..8a3058f5943b 100644
> --- a/arch/powerpc/kernel/fadump.c
> +++ b/arch/powerpc/kernel/fadump.c
> @@ -125,6 +125,13 @@ int is_fadump_boot_memory_area(u64 addr, ulong size)
>  	return (addr + size) > RMA_START && addr <= fw_dump.boot_memory_size;
>  }
> 
> +int should_fadump_crash(void)
> +{
> +	if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr)
> +		return 0;
> +	return 1;
> +}
> +
>  int is_fadump_active(void)
>  {
>  	return fw_dump.dump_active;
> @@ -518,7 +525,7 @@ void crash_fadump(struct pt_regs *regs, const char *str)
>  	struct fadump_crash_info_header *fdh = NULL;
>  	int old_cpu, this_cpu;
> 
> -	if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr)
> +	if (!should_fadump_crash())
>  		return;
> 
>  	/*
> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> index 574e949f8db9..2849c4f50324 100644
> --- a/arch/powerpc/kernel/traps.c
> +++ b/arch/powerpc/kernel/traps.c
> @@ -114,6 +114,28 @@ static void pmac_backlight_unblank(void)
>  static inline void pmac_backlight_unblank(void) { }
>  #endif
> 
> +/*
> + * If oops/die is expected to crash the machine, return true here.
> + *
> + * This should not be expected to be 100% accurate, there may be
> + * notifiers registered or other unexpected conditions that may bring
> + * down the kernel. Or if the current process in the kernel is holding
> + * locks or has other critical state, the kernel may become effectively
> + * unusable anyway.
> + */
> +bool die_will_crash(void)
> +{
> +	if (should_fadump_crash())
> +		return true;
> +	if (kexec_should_crash(current))
> +		return true;
> +	if (in_interrupt() || panic_on_oops ||
> +			!current->pid || is_global_init(current))
> +		return true;
> +
> +	return false;
> +}
> +
>  static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
>  static int die_owner = -1;
>  static unsigned int die_nest_count;
> diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
> index 96436d129684..140350aacca5 100644
> --- a/arch/powerpc/platforms/powernv/opal.c
> +++ b/arch/powerpc/platforms/powernv/opal.c
> @@ -34,6 +34,7 @@
>  #include <asm/opal.h>
>  #include <asm/firmware.h>
>  #include <asm/mce.h>
> +#include <asm/bug.h>
> 
>  #include "powernv.h"
> 
> @@ -426,17 +427,36 @@ static int opal_recover_mce(struct pt_regs *regs,
>  		/* Fatal machine check */
>  		pr_err("Machine check interrupt is fatal\n");
>  		recovered = 0;
> -	} else if ((evt->severity == MCE_SEV_ERROR_SYNC) &&
> -			(user_mode(regs) && !is_global_init(current))) {
> +	}
> +
> +	if (!recovered && evt->severity == MCE_SEV_ERROR_SYNC) {
>  		/*
> -		 * For now, kill the task if we have received exception when
> -		 * in userspace.
> +		 * Try to kill processes if we get a synchronous machine check
> +		 * (e.g., one caused by execution of this instruction). This
> +		 * will devolve into a panic if we try to kill init or are in
> +		 * an interrupt etc.
>  		 *
>  		 * TODO: Queue up this address for hwpoisioning later.
> +		 * TODO: This is not quite right for d-side machine
> +		 *       checks ->nip is not necessarily the important
> +		 *       address.
>  		 */
> -		_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
> -		recovered = 1;
> +		if ((user_mode(regs))) {
> +			_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
> +			recovered = 1;
> +		} else if (die_will_crash()) {
> +			/*
> +			 * die() would kill the kernel, so better to go via
> +			 * the platform reboot code that will log the
> +			 * machine check.
> +			 */
> +			recovered = 0;
> +		} else {
> +			die("Machine check", regs, SIGBUS);
> +			recovered = 1;
> +		}
>  	}
> +
>  	return recovered;
>  }
> 



More information about the Linuxppc-dev mailing list