[Skiboot] [PATCH 3/5] centaur: Mark centaur offline after 10 consecutive access errors

Benjamin Herrenschmidt benh at kernel.crashing.org
Tue Sep 13 14:45:26 AEST 2016


This avoids spamming the logs with access errors when the centaur is
dead and PRD keeps trying to access it.

Signed-off-by: Benjamin Herrenschmidt <benh at kernel.crashing.org>
---
 hw/centaur.c       | 28 ++++++++++++++++++++++++++++
 include/centaur.h  |  2 ++
 include/opal-api.h |  1 +
 3 files changed, 31 insertions(+)

diff --git a/hw/centaur.c b/hw/centaur.c
index 1f2b9c4..8b7311f 100644
--- a/hw/centaur.c
+++ b/hw/centaur.c
@@ -39,6 +39,9 @@
 /* Is that correct ? */
 #define MAX_CENTAURS_PER_CHIP	8
 
+/* Mark the centaur offline after this many consecutive errors */
+#define CENTAUR_ERR_OFFLINE_THRESHOLD	10
+
 /*
  * FSI2PIB register definitions (this could be moved out if we were to
  * support FSI master to other chips.
@@ -319,12 +322,24 @@ int64_t centaur_xscom_read(uint32_t id, uint64_t pcb_addr, uint64_t *val)
 
 	if (!centaur)
 		return OPAL_PARAMETER;
+	if (!centaur->online)
+		return OPAL_XSCOM_CTR_OFFLINED;
 
 	lock(&centaur->lock);
 	if (pcb_addr & XSCOM_ADDR_IND_FLAG)
 		rc = centaur_xscom_ind_read(centaur, pcb_addr, val);
 	else
 		rc = centaur_fsiscom_read(centaur, pcb_addr, val);
+
+	/* We mark the centaur offline if we get too many errors on
+	 * consecutive accesses
+	 */
+	if (rc) {
+		centaur->error_count++;
+		if (centaur->error_count > CENTAUR_ERR_OFFLINE_THRESHOLD)
+			centaur->online = false;
+	} else
+		centaur->error_count = 0;
 	unlock(&centaur->lock);
 
 	return rc;
@@ -337,12 +352,24 @@ int64_t centaur_xscom_write(uint32_t id, uint64_t pcb_addr, uint64_t val)
 
 	if (!centaur)
 		return OPAL_PARAMETER;
+	if (!centaur->online)
+		return OPAL_XSCOM_CTR_OFFLINED;
 
 	lock(&centaur->lock);
 	if (pcb_addr & XSCOM_ADDR_IND_FLAG)
 		rc = centaur_xscom_ind_write(centaur, pcb_addr, val);
 	else
 		rc = centaur_fsiscom_write(centaur, pcb_addr, val);
+
+	/* We mark the centaur offline if we get too many errors on
+	 * consecutive accesses
+	 */
+	if (rc) {
+		centaur->error_count++;
+		if (centaur->error_count > CENTAUR_ERR_OFFLINE_THRESHOLD)
+			centaur->online = false;
+	} else
+		centaur->error_count = 0;
 	unlock(&centaur->lock);
 
 	return rc;
@@ -425,6 +452,7 @@ static bool centaur_add(uint32_t part_id, uint32_t mchip, uint32_t meng,
 	centaur->fsi_master_chip_id = mchip;
 	centaur->fsi_master_port = mport;
 	centaur->fsi_master_engine = meng ? MFSI_cMFSI1 : MFSI_cMFSI0;
+	centaur->online = true;
 	init_lock(&centaur->lock);
 	list_head_init(&centaur->i2cms);
 
diff --git a/include/centaur.h b/include/centaur.h
index 6453e13..7a50bee 100644
--- a/include/centaur.h
+++ b/include/centaur.h
@@ -24,6 +24,7 @@
 
 struct centaur_chip {
 	bool			valid;
+	bool			online;
 	uint8_t			ec_level;
 	uint32_t		part_id;
 	uint32_t		fsi_master_chip_id;
@@ -31,6 +32,7 @@ struct centaur_chip {
 	uint32_t		fsi_master_engine;
 	uint32_t		scache_disable_count;
 	bool			scache_was_enabled;
+	uint32_t		error_count;
 	struct lock		lock;
 
 	/* Used by hw/p8-i2c.c */
diff --git a/include/opal-api.h b/include/opal-api.h
index 7422443..4cc1544 100644
--- a/include/opal-api.h
+++ b/include/opal-api.h
@@ -52,6 +52,7 @@
 #define OPAL_XSCOM_CLOCK_ERROR	-27
 #define OPAL_XSCOM_PARITY_ERROR	-28
 #define OPAL_XSCOM_TIMEOUT	-29
+#define OPAL_XSCOM_CTR_OFFLINED	-30
 
 /* API Tokens (in r0) */
 #define OPAL_INVALID_CALL		       -1
-- 
2.7.4



More information about the Skiboot mailing list