[Skiboot] [RFC PATCH 1/6] opal/hmi: Introduce core and thread level error counters

Mahesh Salgaonkar mahesh at linux.vnet.ibm.com
Wed Apr 24 04:12:45 AEST 2019


Introduce counters for each recoverable error that is reported to OPAL
through HMI. These counters will then be used to implement a thresholding
mechanism for HMI errors.

TODO:
- Add counters for chip level errors.

Signed-off-by: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>
---
 core/hmi.c    |   17 +++++++++++++++++
 hw/chiptod.c  |   28 ++++++++++++++++++++++------
 include/cpu.h |    2 ++
 include/hmi.h |   46 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 87 insertions(+), 6 deletions(-)
 create mode 100644 include/hmi.h

diff --git a/core/hmi.c b/core/hmi.c
index e81328600..2fc47ff12 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -956,6 +956,22 @@ static struct cpu_job **hmi_kick_secondaries(void)
 	return hmi_jobs;
 }
 
+void hmi_update_core_counters(enum hmi_core_error error)
+{
+	struct cpu_thread *t, *t0;
+
+	t = this_cpu();
+	t0 = find_cpu_by_pir(cpu_get_thread0(t));
+
+	if (t == t0)
+		t->hmi_counters.core_counters[error]++;
+}
+
+void hmi_update_thread_counters(enum hmi_thread_error error)
+{
+	this_cpu()->hmi_counters.thread_counters[error]++;
+}
+
 static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
 {
 	struct cpu_thread *t, *t0;
@@ -1109,6 +1125,7 @@ static int handle_tfac_errors(struct OpalHMIEvent *hmi_evt, uint64_t *out_flags)
 		 * it up later
 		 */
 		this_cpu()->tb_invalid = true;
+		hmi_update_core_counters(CORE_ERR_TFMR_CORRUPT);
 		goto bad_tfmr;
 	}
 
diff --git a/hw/chiptod.c b/hw/chiptod.c
index 668789ebe..011a8472c 100644
--- a/hw/chiptod.c
+++ b/hw/chiptod.c
@@ -27,6 +27,7 @@
 #include <cpu.h>
 #include <timebase.h>
 #include <opal-api.h>
+#include <hmi.h>
 
 /* TOD chip XSCOM addresses */
 #define TOD_MASTER_PATH_CTRL		0x00040000 /* Master Path ctrl reg */
@@ -1348,24 +1349,34 @@ static bool tfmr_recover_tb_errors(uint64_t tfmr)
 	tfmr_reset_error = base_tfmr | SPR_TFMR_CLEAR_TB_ERRORS;
 
 	/* Additionally pHyp sets these (write-1-to-clear ?) */
-	if (tfmr & SPR_TFMR_TB_MISSING_SYNC)
+	if (tfmr & SPR_TFMR_TB_MISSING_SYNC) {
 		tfmr_reset_error |= SPR_TFMR_TB_MISSING_SYNC;
+		hmi_update_core_counters(CORE_ERR_TB_MISSING_SYNC);
+	}
 
-	if (tfmr & SPR_TFMR_TB_MISSING_STEP)
+	if (tfmr & SPR_TFMR_TB_MISSING_STEP) {
 		tfmr_reset_error |= SPR_TFMR_TB_MISSING_STEP;
+		hmi_update_core_counters(CORE_ERR_TB_MISSING_STEP);
+	}
 
 	/*
 	 * write 1 to bit 45 to clear TB residue the error.
 	 * TB register has already been reset to zero as part pre-recovery.
 	 */
-	if (tfmr & SPR_TFMR_TB_RESIDUE_ERR)
+	if (tfmr & SPR_TFMR_TB_RESIDUE_ERR) {
 		tfmr_reset_error |= SPR_TFMR_TB_RESIDUE_ERR;
+		hmi_update_core_counters(CORE_ERR_TB_RESIDUE);
+	}
 
-	if (tfmr & SPR_TFMR_FW_CONTROL_ERR)
+	if (tfmr & SPR_TFMR_FW_CONTROL_ERR) {
 		tfmr_reset_error |= SPR_TFMR_FW_CONTROL_ERR;
+		hmi_update_core_counters(CORE_ERR_FW_CONTROL);
+	}
 
-	if (tfmr & SPR_TFMR_TBST_CORRUPT)
+	if (tfmr & SPR_TFMR_TBST_CORRUPT) {
 		tfmr_reset_error |= SPR_TFMR_TBST_CORRUPT;
+		hmi_update_core_counters(CORE_ERR_TBST_CORRUPT);
+	}
 
 	mtspr(SPR_TFMR, tfmr_reset_error);
 
@@ -1397,6 +1408,7 @@ bool tfmr_recover_local_errors(uint64_t tfmr)
 
 		/* set bit 59 to clear TFMR DEC parity error. */
 		tfmr_reset_errors |= SPR_TFMR_DEC_PARITY_ERR;
+		hmi_update_thread_counters(THREAD_ERR_DEC_PARITY);
 	}
 
 	/*
@@ -1411,6 +1423,7 @@ bool tfmr_recover_local_errors(uint64_t tfmr)
 
 		/* set bit 57 to clear TFMR PURR parity error. */
 		tfmr_reset_errors |= SPR_TFMR_PURR_PARITY_ERR;
+		hmi_update_thread_counters(THREAD_ERR_PURR_PARITY);
 	}
 
 	if (tfmr & SPR_TFMR_SPURR_PARITY_ERR) {
@@ -1419,6 +1432,7 @@ bool tfmr_recover_local_errors(uint64_t tfmr)
 
 		/* set bit 58 to clear TFMR PURR parity error. */
 		tfmr_reset_errors |= SPR_TFMR_SPURR_PARITY_ERR;
+		hmi_update_thread_counters(THREAD_ERR_SPURR_PARITY);
 	}
 
 	/* Write TFMR twice to clear the error */
@@ -1483,8 +1497,10 @@ void tfmr_cleanup_core_errors(uint64_t tfmr)
 	/* If HDEC is bad, clean it on all threads before we clear the
 	 * error condition.
 	 */
-	if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)
+	if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR) {
 		mtspr(SPR_HDEC, 0);
+		hmi_update_core_counters(CORE_ERR_HDEC_PARITY);
+	}
 
 	/* If TB is invalid, clean it on all threads as well, it will be
 	 * restored after the next rendez-vous
diff --git a/include/cpu.h b/include/cpu.h
index 011b12bb9..c268bfe27 100644
--- a/include/cpu.h
+++ b/include/cpu.h
@@ -24,6 +24,7 @@
 #include <opal.h>
 #include <stack.h>
 #include <timer.h>
+#include <hmi.h>
 
 /*
  * cpu_thread is our internal structure representing each
@@ -100,6 +101,7 @@ struct cpu_thread {
 	uint32_t			*core_hmi_state_ptr;
 	bool				tb_invalid;
 	bool				tb_resynced;
+	struct hmi_counters		hmi_counters;
 
 	/* For use by XICS emulation on XIVE */
 	struct xive_cpu_state		*xstate;
diff --git a/include/hmi.h b/include/hmi.h
new file mode 100644
index 000000000..d21928af7
--- /dev/null
+++ b/include/hmi.h
@@ -0,0 +1,46 @@
+/* Copyright 2013-2019 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *	http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __OPAL_HMI_H
+#define __OPAL_HMI_H
+
+enum hmi_core_error {
+	CORE_ERR_HDEC_PARITY		= 0,
+	CORE_ERR_TB_MISSING_SYNC,
+	CORE_ERR_TB_MISSING_STEP,
+	CORE_ERR_TB_RESIDUE,
+	CORE_ERR_TBST_CORRUPT,
+	CORE_ERR_TFMR_CORRUPT,
+	CORE_ERR_FW_CONTROL,
+	CORE_ERR_MAX
+};
+
+enum hmi_thread_error {
+	THREAD_ERR_PURR_PARITY	= 0,
+	THREAD_ERR_SPURR_PARITY,
+	THREAD_ERR_DEC_PARITY,
+	THREAD_ERR_MAX
+};
+
+struct hmi_counters {
+	uint32_t core_counters[CORE_ERR_MAX];  /* Only primary thread 0 */
+	uint32_t thread_counters[THREAD_ERR_MAX];
+};
+
+void hmi_update_core_counters(enum hmi_core_error error);
+void hmi_update_thread_counters(enum hmi_thread_error error);
+
+#endif /* __OPAL_HMI_H */



More information about the Skiboot mailing list