[Skiboot] [PATCH v2 5/9] opal: Handle TFMR parity HMI event.

Mahesh J Salgaonkar mahesh at linux.vnet.ibm.com
Wed Mar 11 21:31:13 AEDT 2015


From: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>

Handle TFMR parity errors reported through HMER bit 5 and TFMR bit 60
(i.e. tx_tfmr_corrupt). For recovery, write '1' to TFMR bit 60 to clear it.
Once this error is cleared, check the timebase machine state in TFMR[28:31]
and clear TB errors if the timebase machine state is the error state (9).
Once the timebase machine state has been reset, continue loading TOD into
the core TB.

To inject a TFMR parity error, issue:
	$ putscom pu.ex 10013281 0001080000000000 -all

Signed-off-by: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>
---
 core/hmi.c   |    2 +-
 hw/chiptod.c |   60 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/core/hmi.c b/core/hmi.c
index 8a2889a..5d6d2e8 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -517,7 +517,7 @@ int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt)
 	if (hmer & SPR_HMER_TFMR_PARITY_ERROR) {
 		tfmr = mfspr(SPR_TFMR);		/* save original TFMR */
 		hmer &= ~SPR_HMER_TFMR_PARITY_ERROR;
-		recover = 0;
+		recover = chiptod_recover_tb_errors();
 		if (hmi_evt) {
 			hmi_evt->severity = OpalHMI_SEV_FATAL;
 			hmi_evt->type = OpalHMI_ERROR_TFMR_PARITY;
diff --git a/hw/chiptod.c b/hw/chiptod.c
index 22c265c..b00c3cf 100644
--- a/hw/chiptod.c
+++ b/hw/chiptod.c
@@ -73,6 +73,9 @@
 /* Number of iterations for the various timeouts */
 #define TIMEOUT_LOOPS		20000000
 
+/* Timebase State Machine error state */
+#define TBST_STATE_ERROR	9
+
 static enum chiptod_type {
 	chiptod_unknown,
 	chiptod_p7,
@@ -562,7 +565,6 @@ bool chiptod_wakeup_resync(void)
 	return false;
 }
 
-
 static int chiptod_recover_tod_errors(void)
 {
 	uint64_t terr;
@@ -684,6 +686,47 @@ static bool tfmr_recover_tb_errors(uint64_t tfmr)
 }
 
 /*
+ * TFMR parity error recovery as per pc_workbook:
+ *	MT(TFMR) bits 11 and 60 are b'1'
+ *	MT(HMER) all bits 1 except for bits 4,5
+ */
+static bool chiptod_recover_tfmr_error(void)
+{
+	uint64_t tfmr;
+
+	/* Start from base_tfmr instead of reading the live TFMR */
+	tfmr = base_tfmr;
+
+	/* Set bit 60 to clear TFMR parity error. */
+	tfmr |= SPR_TFMR_TFMR_CORRUPT;
+	mtspr(SPR_TFMR, tfmr);
+
+	/* Write twice to clear the error */
+	mtspr(SPR_TFMR, tfmr);
+
+	/* Re-read TFMR to see whether the corrupt bit cleared */
+	tfmr = mfspr(SPR_TFMR);
+
+	/* Check if TFMR parity error still present. */
+	if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+		prerror("CHIPTOD: TFMR error recovery: corrupt TFMR !\n");
+		return false;
+	}
+
+	/*
+	 * Now that we have a sane value in TFMR, check if the Timebase
+	 * machine state is ERROR. If yes, clear TB errors so that the
+	 * Timebase machine state changes to RESET. Once in RESET state
+	 * we can then load TB with the TOD value.
+	 */
+	if (GETFIELD(SPR_TFMR_TBST_ENCODED, tfmr) == TBST_STATE_ERROR) {
+		if (!chiptod_reset_tb_errors())
+			return false;
+	}
+	return true;
+}
+
+/*
  * Recover from TB and TOD errors.
  * Timebase register is per core and first thread that gets chance to
  * handle interrupt would fix actual TFAC errors and rest of the threads
@@ -710,6 +753,21 @@ int chiptod_recover_tb_errors(void)
 	tfmr = mfspr(SPR_TFMR);
 
 	/*
+	 * Check for TFMR parity error and recover from it.
+	 * We cannot trust any other bits in TFMR if it is corrupt. Fix this
+	 * before we do anything.
+	 */
+	if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+		if (!chiptod_recover_tfmr_error()) {
+			rc = 0;
+			goto error_out;
+		}
+	}
+
+	/* Get fresh copy of TFMR */
+	tfmr = mfspr(SPR_TFMR);
+
+	/*
 	 * Check for TB errors.
 	 * On Sync check error, bit 44 of TFMR is set. Check for it and
 	 * clear it.



More information about the Skiboot mailing list