[Skiboot] [RFC PATCH 2/6] opal/hmi: Introduce thresholding of HMI errors.

Mahesh Salgaonkar mahesh at linux.vnet.ibm.com
Wed Apr 24 04:12:52 AEST 2019


Define two threshold levels similar to what pHyp uses.
Level 1) 100 errors in 100 msec.
     If we get the same HMI error 100 times in 100 msec then we definitely
     have a BAD chip/core.
Level 2) 32 errors in a 24-hour time window.
     If we get the same HMI error 32 times in a 24-hour time window then
     we can also consider that we have a BAD chip/core.

In either of the above cases, when the threshold is reached, log an eSEL
pointing out a BAD chip. Possibly also send an event to the Linux host to
hotplug out the respective core and mark a gard record for the same.

Signed-off-by: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>
---
 core/hmi.c    |  117 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 include/hmi.h |   41 +++++++++++++++++++-
 2 files changed, 153 insertions(+), 5 deletions(-)

diff --git a/core/hmi.c b/core/hmi.c
index 2fc47ff12..cac8505d4 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -33,6 +33,7 @@
 #include <capp.h>
 #include <nvram.h>
 #include <cpu.h>
+#include <timebase.h>
 
 /*
  * HMER register layout:
@@ -310,6 +311,97 @@ static int setup_scom_addresses(void)
 	return 0;
 }
 
+/*
+ * Apply both HMI thresholds to counters[error] at timebase tb_now. If TB is
+ * invalid, mark the counter so the check is redone at end of hmi handler.
+ */
+static void threshold_check(hmi_err_count_t *counters, int error,
+							uint64_t tb_now)
+{
+	uint64_t level1_timer = counters[error].threshold_level1_timer;
+	uint64_t level2_timer = counters[error].threshold_level2_timer;
+	uint32_t level1_count = counters[error].level1_count;
+	uint32_t level2_count = counters[error].level2_count;
+
+	if (!(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID)) {
+		counters[error].flags |= HMI_DELAY_THRESHOLD_CHECK;
+		return;
+	}
+
+	/* Ignore if threshold is already reached. */
+	if (counters[error].flags & (HMI_THRESHOLD_REACHED))
+		return;
+
+	/*
+	 * The callers have already counted this error. If a window has
+	 * expired, restart it with this error as its first; otherwise
+	 * flag the level once the count within the window hits the
+	 * limit.
+	 */
+	if (tb_now > level1_timer) {
+		counters[error].level1_count = 1;
+		counters[error].threshold_level1_timer =
+			tb_now + msecs_to_tb(HMI_THRESHOLD_LEVEL1_TIME);
+	} else if (level1_count >= HMI_THRESHOLD_LEVEL1_LIMIT) {
+		counters[error].flags |= HMI_THRESHOLD_LEVEL1;
+		prlog(PR_DEBUG, "Threshold level1 limit reached for %d\n",
+								error);
+		/* TODO: Generate an errorlog with HW callout details. */
+	}
+
+	if (tb_now > level2_timer) {
+		counters[error].level2_count = 1;
+		counters[error].threshold_level2_timer =
+			tb_now + msecs_to_tb(HMI_THRESHOLD_LEVEL2_TIME);
+	} else if (level2_count >= HMI_THRESHOLD_LEVEL2_LIMIT) {
+		counters[error].flags |= HMI_THRESHOLD_LEVEL2;
+		prlog(PR_DEBUG, "Threshold level2 limit reached for %d\n",
+								error);
+		/* TODO: Generate an errorlog with HW callout details. */
+	}
+}
+
+/* Re-run threshold_check() on every counter marked for a delayed check. */
+static void process_delayed_counters(hmi_err_count_t *counters, int num_errors,
+				uint64_t tb_now)
+{
+	int idx;
+
+	for (idx = 0; idx < num_errors; idx++) {
+		if (counters[idx].flags & HMI_DELAY_THRESHOLD_CHECK) {
+			counters[idx].flags &= ~HMI_DELAY_THRESHOLD_CHECK;
+			threshold_check(counters, idx, tb_now);
+		}
+	}
+}
+
+/*
+ * Run the threshold checks that threshold_check() deferred because TB was
+ * invalid: always for this thread's counters, and for the core counters
+ * when running on primary thread 0. Does nothing while TB is still invalid.
+ */
+static void delayed_threshold_check(void)
+{
+	uint64_t tb_now = mftb();
+	struct cpu_thread *cpu = this_cpu();
+	struct cpu_thread *primary = find_cpu_by_pir(cpu_get_thread0(cpu));
+	hmi_err_count_t *thread_ctrs = cpu->hmi_counters.thread_counters;
+	hmi_err_count_t *core_ctrs = cpu->hmi_counters.core_counters;
+
+	/*
+	 * if TB is bad then skip the threshold check. We will anyway
+	 * head down to panic.
+	 */
+	if (!(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID))
+		return;
+
+	/* thread level counters check */
+	process_delayed_counters(thread_ctrs, THREAD_ERR_MAX, tb_now);
+
+	/* Core level counters check on primary thread 0 */
+	if (cpu == primary)
+		process_delayed_counters(core_ctrs, CORE_ERR_MAX, tb_now);
+}
+
 static int queue_hmi_event(struct OpalHMIEvent *hmi_evt, int recover, uint64_t *out_flags)
 {
 	size_t num_params;
@@ -959,17 +1051,35 @@ static struct cpu_job **hmi_kick_secondaries(void)
 void hmi_update_core_counters(enum hmi_core_error error)
 {
 	struct cpu_thread *t, *t0;
+	hmi_err_count_t	*counters;
+	uint64_t now = mftb();
 
 	t = this_cpu();
 	t0 = find_cpu_by_pir(cpu_get_thread0(t));
 
-	if (t == t0)
-		t->hmi_counters.core_counters[error]++;
+	/* Only on primary thread 0 */
+	if (t != t0)
+		return;
+
+	counters = t->hmi_counters.core_counters;
+	counters[error].total_count++;
+	counters[error].level1_count++;
+	counters[error].level2_count++;
+	counters[error].flags |= HMI_ERROR_OCCURRED;
+	threshold_check(counters, error, now);
 }
 
 void hmi_update_thread_counters(enum hmi_thread_error error)
 {
-	this_cpu()->hmi_counters.thread_counters[error]++;
+	struct cpu_thread *t = this_cpu();
+	hmi_err_count_t	*counters = t->hmi_counters.thread_counters;
+	uint64_t now = mftb();
+
+	counters[error].total_count++;
+	counters[error].level1_count++;
+	counters[error].level2_count++;
+	counters[error].flags |= HMI_ERROR_OCCURRED;
+	threshold_check(counters, error, now);
 }
 
 static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
@@ -1299,6 +1409,7 @@ static int handle_hmi_exception(uint64_t hmer, struct OpalHMIEvent *hmi_evt,
 		}
 	}
 
+	delayed_threshold_check();
 	if (recover == 0)
 		disable_fast_reboot("Unrecoverable HMI");
 	/*
diff --git a/include/hmi.h b/include/hmi.h
index d21928af7..03f10adcd 100644
--- a/include/hmi.h
+++ b/include/hmi.h
@@ -35,9 +35,46 @@ enum hmi_thread_error {
 	THREAD_ERR_MAX
 };
 
+/*
+ * Define two threshold levels similar to what pHyp uses.
+ * 1. 100 errors in 100 msec.
+ *	If we get the same HMI error 100 times in 100 msec then we definitely
+ *	have a BAD chip/core.
+ * 2. 32 errors in a 24-hour time window.
+ *	If we get the same HMI error 32 times in a 24-hour time window then
+ *	we can also consider that we have a BAD chip/core.
+ *
+ * In either of the above cases, when the threshold is reached, log an eSEL
+ * pointing out a BAD chip. Possibly also send an event to the Linux host to
+ * hotplug out the respective core and mark a gard record for the same.
+ */
+
+#define HMI_THRESHOLD_LEVEL1_TIME	100			/* 100 msecs */
+#define HMI_THRESHOLD_LEVEL1_LIMIT	100			/* 100 errors */
+#define HMI_THRESHOLD_LEVEL2_TIME	(24 * 3600 * 1000)	/* 24hr */
+#define HMI_THRESHOLD_LEVEL2_LIMIT	32			/* 32 errors */
+
+/* Values for hmi_err_count.flags */
+#define HMI_ERROR_OCCURRED		(1 << 0)
+#define HMI_DELAY_THRESHOLD_CHECK	(1 << 1)
+#define HMI_THRESHOLD_LEVEL1		(1 << 2)
+#define HMI_THRESHOLD_LEVEL2		(1 << 3)
+
+#define HMI_THRESHOLD_REACHED		(HMI_THRESHOLD_LEVEL1	\
+					| HMI_THRESHOLD_LEVEL2)
+
+typedef struct hmi_err_count {
+	uint32_t	total_count;	/* bumped on every reported error */
+	uint32_t	flags;		/* bitmask of the HMI_* flag values */
+	uint64_t	threshold_level1_timer;	/* TB when level1 window ends */
+	uint64_t	threshold_level2_timer;	/* TB when level2 window ends */
+	uint32_t	level1_count;	/* errors in current level1 window */
+	uint32_t	level2_count;	/* errors in current level2 window */
+} hmi_err_count_t;
+
 struct hmi_counters {
-	uint32_t core_counters[CORE_ERR_MAX];  /* Only primary thread 0 */
-	uint32_t thread_counters[THREAD_ERR_MAX];
+	hmi_err_count_t core_counters[CORE_ERR_MAX]; /* Only primary thread 0 */
+	hmi_err_count_t thread_counters[THREAD_ERR_MAX];
 };
 
 void hmi_update_core_counters(enum hmi_core_error error);



More information about the Skiboot mailing list