[Skiboot] [PATCH v2 08/15] opal/hmi: Do not send HMI event if no errors are found.

Mahesh J Salgaonkar mahesh at linux.vnet.ibm.com
Tue Apr 17 03:33:49 AEST 2018


From: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>

For TOD errors, all the cores in the chip get HMIs. Any one thread from any
core can fix the issue, after which TFMR will have its error conditions
cleared. The rest of the threads need not take any action if the TOD errors
are already cleared. Hence thread 0 of every core should get a fresh copy of
TFMR before going ahead with the recovery path. Initialize recover = -1, so
that if no errors are found, that thread need not send an HMI event to Linux.
This helps to stop flooding the host with HMI events from every thread even
when no errors are found.

Signed-off-by: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>
---
 core/hmi.c        |   21 +++++++++++++--------
 hw/chiptod.c      |    6 +++++-
 include/chiptod.h |    2 +-
 3 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/core/hmi.c b/core/hmi.c
index 95ab96cde..eadb75be4 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -955,7 +955,7 @@ static int handle_thread_tfac_error(uint64_t tfmr, uint64_t *out_flags)
 static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
 {
 	struct cpu_thread *t, *t0;
-	int recover = 1;
+	int recover = -1;
 
 	t = this_cpu();
 	t0 = find_cpu_by_pir(cpu_get_thread0(t));
@@ -975,11 +975,15 @@ static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
 	if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
 		/* Check if it's still in error state */
 		if (mfspr(SPR_TFMR) & SPR_TFMR_TFMR_CORRUPT)
-			if (!recover_corrupt_tfmr())
+			if (!recover_corrupt_tfmr()) {
+				unlock(&hmi_lock);
 				recover = 0;
+			}
 
-		if (!recover)
+		if (!recover) {
+			unlock(&hmi_lock);
 			goto error_out;
+		}
 
 		tfmr = mfspr(SPR_TFMR);
 
@@ -988,8 +992,10 @@ static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
 			recover = handle_thread_tfac_error(tfmr, out_flags);
 			tfmr &= ~SPR_TFMR_THREAD_ERRORS;
 		}
-		if (!recover)
+		if (!recover) {
+			unlock(&hmi_lock);
 			goto error_out;
+		}
 	}
 
 	/* Tell the OS ... */
@@ -1023,8 +1029,7 @@ static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
 
 	/* Now perform the actual TB recovery on thread 0 */
 	if (t == t0)
-		recover = chiptod_recover_tb_errors(tfmr,
-						&this_cpu()->tb_resynced);
+		recover = chiptod_recover_tb_errors(&this_cpu()->tb_resynced);
 
 error_out:
 	/* Last rendez-vous */
@@ -1043,7 +1048,7 @@ error_out:
 static int handle_tfac_errors(uint64_t hmer, struct OpalHMIEvent *hmi_evt,
 			      uint64_t *out_flags)
 {
-	int recover = 1;
+	int recover = -1;
 	uint64_t tfmr = mfspr(SPR_TFMR);
 
 	/* A TFMR parity error makes us ignore all the local stuff */
@@ -1106,7 +1111,7 @@ static int handle_tfac_errors(uint64_t hmer, struct OpalHMIEvent *hmi_evt,
 						mfspr(SPR_TFMR));
 	}
 
-	if (hmi_evt) {
+	if (recover != -1 && hmi_evt) {
 		hmi_evt->severity = OpalHMI_SEV_ERROR_SYNC;
 		hmi_evt->type = OpalHMI_ERROR_TFAC;
 		hmi_evt->tfmr = tfmr;
diff --git a/hw/chiptod.c b/hw/chiptod.c
index a160e5a10..f6ef9a469 100644
--- a/hw/chiptod.c
+++ b/hw/chiptod.c
@@ -1505,8 +1505,9 @@ bool tfmr_clear_core_errors(uint64_t tfmr)
  *	1	<= Successfully recovered from errors
  *	-1	<= No errors found. Errors are already been fixed.
  */
-int chiptod_recover_tb_errors(uint64_t tfmr, bool *out_resynced)
+int chiptod_recover_tb_errors(bool *out_resynced)
 {
+	uint64_t tfmr;
 	int rc = -1;
 
 	*out_resynced = false;
@@ -1516,6 +1517,9 @@ int chiptod_recover_tb_errors(uint64_t tfmr, bool *out_resynced)
 
 	lock(&chiptod_lock);
 
+	/* Get fresh copy of TFMR */
+	tfmr = mfspr(SPR_TFMR);
+
 	/*
 	 * Check for TB errors.
 	 * On Sync check error, bit 44 of TFMR is set. Check for it and
diff --git a/include/chiptod.h b/include/chiptod.h
index 7708d4899..667e6fd83 100644
--- a/include/chiptod.h
+++ b/include/chiptod.h
@@ -29,7 +29,7 @@ enum chiptod_topology {
 
 extern void chiptod_init(void);
 extern bool chiptod_wakeup_resync(void);
-extern int chiptod_recover_tb_errors(uint64_t tfmr, bool *out_resynced);
+extern int chiptod_recover_tb_errors(bool *out_resynced);
 extern bool tfmr_recover_local_errors(uint64_t tfmr);
 extern bool recover_corrupt_tfmr(void);
 extern void tfmr_cleanup_core_errors(uint64_t tfmr);



More information about the Skiboot mailing list