[RFC PATCH 3/7] powerpc/book3s: mce: Process the MCE event and recover if possible.

Mahesh J Salgaonkar mahesh at linux.vnet.ibm.com
Tue Feb 21 12:52:08 AEDT 2017


From: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>

Once we get a high-level MCE error event from OPAL, process it and figure
out whether it is recoverable. If it is, take corrective action.

TODO:
- Rework the handling of asynchronous MCE errors.
  - Update opal_recover_mce() to ignore async errors.
- Update flush_and_reload_slb() to avoid SLB reload in radix mode.

Signed-off-by: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/mce.h        |    3 +++
 arch/powerpc/kernel/mce.c             |   26 +++++++++++++++++++++++
 arch/powerpc/kernel/mce_power.c       |   38 +++++++++++++++++++++++++++++++++
 arch/powerpc/platforms/powernv/opal.c |    2 ++
 4 files changed, 69 insertions(+)

diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h
index 36db6b0..69e4a42 100644
--- a/arch/powerpc/include/asm/mce.h
+++ b/arch/powerpc/include/asm/mce.h
@@ -88,9 +88,12 @@ extern void save_mce_event(struct pt_regs *regs, long handled,
 			   struct mce_error_info *mce_err, uint64_t nip,
 			   uint64_t addr);
 extern int get_mce_event(struct OpalMachineCheckEvent *mce, bool release);
+extern int set_mce_event(struct OpalMachineCheckEvent *mce);
 extern void release_mce_event(void);
 extern void machine_check_queue_event(void);
 extern void machine_check_print_event_info(struct OpalMachineCheckEvent *evt);
 extern uint64_t get_mce_fault_addr(struct OpalMachineCheckEvent *evt);
+extern long handle_mce_errors(struct pt_regs *regs,
+					struct OpalMachineCheckEvent *evt);
 
 #endif /* __ASM_PPC64_MCE_H__ */
diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c
index 51a7c64..36da14a3 100644
--- a/arch/powerpc/kernel/mce.c
+++ b/arch/powerpc/kernel/mce.c
@@ -166,6 +166,32 @@ int get_mce_event(struct OpalMachineCheckEvent *mce, bool release)
 	return ret;
 }
 
+int set_mce_event(struct OpalMachineCheckEvent *mce) /* returns 1 if a per-CPU slot was within the array limit, else 0 */
+{
+	int index = __this_cpu_inc_return(mce_nest_count) - 1; /* bump the per-CPU nest count; the pre-increment value is the slot index */
+	struct OpalMachineCheckEvent *mc_evt = this_cpu_ptr(&mce_event[index]); /* only computed here; dereferenced after the bounds checks below */
+	int ret = 0;
+
+	/* Sanity check: reject a negative slot index */
+	if (index < 0)
+		return ret; /* note: the nest count is left incremented on this path */
+
+	/* Check if we have MCE info slot within array limit. */
+	if (index < MAX_MC_EVT) {
+		/* Copy the caller's event into this CPU's slot (skipped when mce is NULL) */
+		if (mce) {
+			*mc_evt = *mce;
+			/* Convert the 64-bit big-endian fields to CPU byte order */
+			mc_evt->srr0 = be64_to_cpu(mce->srr0);
+			mc_evt->srr1 = be64_to_cpu(mce->srr1);
+			mc_evt->u.ue_error.effective_address = /* NOTE(review): swapped via the ue_error view regardless of error_type; confirm other union members share this layout */
+				be64_to_cpu(mce->u.ue_error.effective_address);
+		}
+		ret = 1; /* slot claimed, even when mce was NULL and nothing was copied */
+	}
+	return ret; /* 0 here means index >= MAX_MC_EVT; the nest count stays incremented */
+}
+
 void release_mce_event(void)
 {
 	get_mce_event(NULL, true);
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 7353991..91ed2ef 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -372,3 +372,41 @@ long __machine_check_early_realmode_p8(struct pt_regs *regs)
 	save_mce_event(regs, handled, &mce_error_info, nip, addr);
 	return handled;
 }
+
+static long flush_tlb(void) /* returns 1 if the CPU-specific TLB flush hook was invoked, else 0 */
+{
+	long handled = 0;
+
+	if (cur_cpu_spec && cur_cpu_spec->flush_tlb) { /* the hook is optional; without it nothing is flushed and 0 is returned */
+		cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_GLOBAL); /* request a global-scope invalidation */
+		handled = 1;
+	}
+	return handled;
+}
+
+long handle_mce_errors(struct pt_regs *regs, struct OpalMachineCheckEvent *evt) /* returns non-zero if the event is, or already was, recovered; 0 otherwise */
+{
+	long handled = 1;
+
+	if (evt->disposition == MCE_DISPOSITION_RECOVERED) /* already marked recovered: no further action taken */
+		return handled;
+
+	switch (evt->error_type) { /* pick the recovery action from the reported error type */
+	case MCE_ERROR_TYPE_UE:
+		handled = mce_handle_ue_error(regs); /* result of the UE handler decides recovery */
+		break;
+	case MCE_ERROR_TYPE_SLB: /* SLB and ERAT errors share the same recovery action */
+	case MCE_ERROR_TYPE_ERAT:
+		flush_and_reload_slb(); /* no result is checked; always treated as handled */
+		handled = 1;
+		break;
+	case MCE_ERROR_TYPE_TLB:
+		handled = flush_tlb(); /* 0 when no flush hook is available */
+		break;
+	default: /* no recovery action for any other error type */
+		handled = 0;
+	}
+	if (handled) /* record the successful recovery in the caller's event */
+		evt->disposition = MCE_DISPOSITION_RECOVERED;
+	return handled;
+}
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 263c57e..f1115c4 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -501,6 +501,8 @@ int opal_machine_check_early(struct pt_regs *regs, long *handled)
 	if (rc != OPAL_SUCCESS)
 		return -1;
 
+	*handled = handle_mce_errors(regs, &evt);
+	set_mce_event(&evt);
 	return 0;
 }
 



More information about the Linuxppc-dev mailing list