[Skiboot] [PATCH] core/hmi: report processor recovery reason from core FIR bits on P9

Thu Mar 1 18:12:20 AEDT 2018

When an error is encountered that causes processor recovery, an HMI is
generated if the recovery was successful. The reason is recorded in
the core FIR, which gets copied into the WOF.

In this case dump the WOF register and an error string into the OPAL
msglog. A broken init setting led to HMIs reported in Linux as

[    3.591547] Harmless Hypervisor Maintenance interrupt [Recovered]
[    3.591648]  Error detail: Processor Recovery done
[    3.591714]  HMER: 2040000000000000

This patch would have been useful because it tells us exactly that
the problem is in the d-side ERAT:

[  414.489690798,7] HMI: Received HMI interrupt: HMER = 0x2040000000000000
[  414.489693339,7] HMI: [Loc: UOPWR.0000000-Node0-Proc0]: P:0 C:1 T:1: Processor recovery occurred.
[  414.489699837,7] HMI: Core WOF = 0x0000000410000000 recovered error:
[  414.489701543,7] HMI: LSU - SRAM (DCACHE parity, etc)
[  414.489702341,7] HMI: LSU - ERAT multi hit

In future it will be good to unify this reporting, so Linux could
print something more useful. Until then, this gives some good data.

Signed-off-by: Nicholas Piggin <npiggin at gmail.com>
---
 core/hmi.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 59 insertions(+), 3 deletions(-)

diff --git a/core/hmi.c b/core/hmi.c
index 17983a33..846d2b92 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -163,6 +163,9 @@
 #define P8_CORE_FIR		0x10013100
 #define P9_CORE_FIR		0x20010A40
 
+/* And core WOF (Who's On First) */
+#define P9_CORE_WOF		0x20010A48
+
 /* xscom addresses for pMisc Receive Malfunction Alert Register */
 #define P8_MALFUNC_ALERT	0x02020011
 #define P9_MALFUNC_ALERT	0x00090022
@@ -215,6 +218,28 @@ static const struct core_xstop_bit_info {
 	{ 63, CORE_CHECKSTOP_PC_SPRD_HYP_ERR_INJ },
 };
 
+static const struct core_recoverable_bit_info {
+	uint8_t bit;		/* CORE FIR bit number */
+	const char *reason;	/* Text logged when this bit is found set */
+} recoverable_bits[] = {	/* Maps core WOF bits to recovered-error descriptions */
+	{ 0, "IFU - SRAM (ICACHE parity, etc)" },
+	{ 2, "IFU - RegFile" },
+	{ 4, "IFU - Logic" },
+	{ 9, "ISU - RegFile" },
+	{ 11, "ISU - Logic" },
+	{ 13, "ISU - Recoverable due to not in MT window" },
+	{ 24, "VSU - Logic" },
+	{ 27, "VSU - DFU logic" },
+	{ 29, "LSU - SRAM (DCACHE parity, etc)" },
+	{ 31, "LSU - RegFile" },
+	/* The following 3 bits may be set by SRAM errors. */
+	{ 33, "LSU - TLB multi hit" },
+	{ 34, "LSU - SLB multi hit" },
+	{ 35, "LSU - ERAT multi hit" },
+	{ 37, "LSU - Logic" },
+	{ 39, "LSU - Recoverable due to not in MT window" },
+};
+
 static const struct nx_xstop_bit_info {
 	uint8_t bit;		/* NX FIR bit number */
 	enum OpalHMI_NestAccelXstopReason reason;
@@ -313,6 +338,21 @@ static int read_core_fir(uint32_t chip_id, uint32_t core_id, uint64_t *core_fir)
 	return rc;
 }
 
+static int read_core_wof(uint32_t chip_id, uint32_t core_id, uint64_t *core_wof)
+{	/* Read the core WOF register over xscom into *core_wof; returns the xscom rc. */
+	int rc;
+
+	switch (proc_gen) {
+	case proc_gen_p9:
+		rc = xscom_read(chip_id,
+			XSCOM_ADDR_P9_EC(core_id, P9_CORE_WOF), core_wof);
+		break;
+	default:	/* no core WOF address is handled for other generations */
+		rc = OPAL_HARDWARE;
+	}
+	return rc;
+}
+
 static bool decode_core_fir(struct cpu_thread *cpu,
 				struct OpalHMIEvent *hmi_evt)
 {
@@ -1069,6 +1109,7 @@ static void hmi_print_debug(const uint8_t *msg)
 
 int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 {
+	struct cpu_thread *cpu = this_cpu();
 	int recover = 1;
 	uint64_t tfmr;
 
@@ -1084,18 +1125,33 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 	 * looking at TFMR register. TFMR will tell us correct state of
 	 * TB register.
 	 */
-	this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
+	cpu->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
 	prlog(PR_DEBUG, "Received HMI interrupt: HMER = 0x%016llx\n", hmer);
 	if (hmi_evt)
 		hmi_evt->hmer = hmer;
 	if (hmer & SPR_HMER_PROC_RECV_DONE) {
+		uint32_t chip_id = pir_to_chip_id(cpu->pir);
+		uint32_t core_id = pir_to_core_id(cpu->pir);
+		uint64_t core_wof;
+
+		hmi_print_debug("Processor recovery occurred.");
+		if (!read_core_wof(chip_id, core_id, &core_wof)) {
+			int i;
+
+			prlog(PR_DEBUG, "Core WOF = 0x%016llx recovered error:\n", core_wof);
+			for (i = 0; i < ARRAY_SIZE(recoverable_bits); i++) {
+				if (core_wof & PPC_BIT(recoverable_bits[i].bit))
+					prlog(PR_DEBUG, "%s\n",
+						recoverable_bits[i].reason);
+			}
+		}
+
 		hmer &= ~SPR_HMER_PROC_RECV_DONE;
 		if (hmi_evt) {
 			hmi_evt->severity = OpalHMI_SEV_NO_ERROR;
 			hmi_evt->type = OpalHMI_ERROR_PROC_RECOV_DONE;
 			queue_hmi_event(hmi_evt, recover);
 		}
-		hmi_print_debug("Processor recovery Done.");
 	}
 	if (hmer & SPR_HMER_PROC_RECV_ERROR_MASKED) {
 		hmer &= ~SPR_HMER_PROC_RECV_ERROR_MASKED;
@@ -1180,7 +1236,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 	mtspr(SPR_HMER, hmer);
 	hmi_exit();
 	/* Set the TB state looking at TFMR register before we head out. */
-	this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
+	cpu->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
 	unlock(&hmi_lock);
 	return recover;
 }
-- 
2.16.1