[Skiboot] [RESEND PATCH v4] hw/xscom: Reset XSCOM engine after finite number of retries when busy

Vipin K Parashar vipin at linux.vnet.ibm.com
Mon Jun 6 19:26:37 AEST 2016


OPAL retries XSCOM read/write operations forever till it succeeds.
This can cause XSCOM ops to hang forever when XSCOM engine remains
busy for some reason. Changed it to retry XSCOM operations only
XSCOM_BUSY_MAX_RETRIES number of times instead of retrying forever.
Also added logic to reset XSCOM engine after XSCOM_BUSY_RESET_THRESHOLD
number of retries to unblock it when it remains busy.

Cc: stable # 9c2d82394fd2 ("xscom: Return OPAL_WRONG_STATE on XSCOM ops..")
Signed-off-by: Vipin K Parashar <vipin at linux.vnet.ibm.com>
Signed-off-by: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>
---
Changes in v4:
 - Changed xscom operation retries loop counter variable to 'retries'.
 - Added a FIXME comment to replace 10 ms delay with exact HW delay
   or better method to confirm XSCOM reset completion.

Changes in v3:
 - Added delay of 10ms after XSCOM engine reset when XSCOM is found busy.
 - Modified 'if' condition to check return value of xscom_handle_error().

Changes in v2:
 - Changed newly added macro names to slightly more intuitive names.
   Used XSCOM_BUSY_MAX_RETRIES to signify total retries allowed if
   XSCOM remains busy and XSCOM_BUSY_RESET_THRESHOLD to hold threshold
   count for resetting XSCOM before retrying XSCOM operation again.

 hw/xscom.c         | 74 +++++++++++++++++++++++++++++++++++++++++-------------
 include/errorlog.h |  1 +
 include/xscom.h    |  6 +++++
 3 files changed, 64 insertions(+), 17 deletions(-)

diff --git a/hw/xscom.c b/hw/xscom.c
index 4d88ce5..2a91a7b 100644
--- a/hw/xscom.c
+++ b/hw/xscom.c
@@ -23,6 +23,7 @@
 #include <centaur.h>
 #include <errorlog.h>
 #include <opal-api.h>
+#include <timebase.h>
 
 /* Mask of bits to clear in HMER before an access */
 #define HMER_CLR_MASK	(~(SPR_HMER_XSCOM_FAIL | \
@@ -41,6 +42,10 @@ DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_RESET, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
 		OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
 		OPAL_NA);
 
+DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_BUSY, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
+		OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+		OPAL_NA);
+
 /* xscom details to trigger xstop */
 static struct {
 	uint64_t addr;
@@ -118,18 +123,49 @@ static void xscom_reset(uint32_t gcid)
 	 */
 }
 
-static int xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
-			       bool is_write)
+static int64_t xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
+			      bool is_write, int64_t retries)
 {
+	struct timespec ts;
 	unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer);
 
 	/* XXX Figure out error codes from doc and error
 	 * recovery procedures
 	 */
 	switch(stat) {
-	/* XSCOM blocked, just retry */
+	/*
+	 * XSCOM engine is blocked, need to retry. Reset XSCOM engine
+	 * after crossing retry threshold before retrying again.
+	 */
 	case 1:
+		if (retries && !(retries  % XSCOM_BUSY_RESET_THRESHOLD)) {
+			prlog(PR_NOTICE, "XSCOM: Busy even after %d retries, "
+				"resetting XSCOM now. Total retries  = %lld\n",
+				XSCOM_BUSY_RESET_THRESHOLD, retries);
+			xscom_reset(gcid);
+
+			/*
+			 * Its observed that sometimes immediate retry of
+			 * XSCOM operation returns wrong data. Adding a
+			 * delay for XSCOM reset to be effective. Delay of
+			 * 10 ms is found to be working fine experimentally.
+			 * FIXME: Replace 10ms delay by exact delay needed
+			 * or other alternate method to confirm XSCOM reset
+			 * completion, after checking from HW folks.
+			 */
+			ts.tv_sec = 0;
+			ts.tv_nsec = 10 * 1000;
+			nanosleep_nopoll(&ts, NULL);
+		}
+
+		/* Log error if we have retried enough and its still busy */
+		if (retries == XSCOM_BUSY_MAX_RETRIES)
+			log_simple_error(&e_info(OPAL_RC_XSCOM_BUSY),
+				"XSCOM: %s-busy error gcid=0x%x pcb_addr=0x%x "
+				"stat=0x%x\n", is_write ? "write" : "read",
+				gcid, pcb_addr, stat);
 		return OPAL_BUSY;
+
 	/* CPU is asleep, don't retry */
 	case 2:
 		return OPAL_WRONG_STATE;
@@ -178,14 +214,14 @@ static bool xscom_gcid_ok(uint32_t gcid)
 static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
 {
 	uint64_t hmer;
-	int64_t ret;
+	int64_t ret, retries;
 
 	if (!xscom_gcid_ok(gcid)) {
 		prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
 		return OPAL_PARAMETER;
 	}
 
-	for (;;) {
+	for (retries = 0; retries <= XSCOM_BUSY_MAX_RETRIES; retries++) {
 		/* Clear status bits in HMER (HMER is special
 		 * writing to it *ands* bits
 		 */
@@ -199,27 +235,29 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
 
 		/* Check for error */
 		if (!(hmer & SPR_HMER_XSCOM_FAIL))
-			break;
+			return OPAL_SUCCESS;
 
 		/* Handle error and possibly eventually retry */
-		ret = xscom_handle_error(hmer, gcid, pcb_addr, false);
-		if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE)
-			return ret;
+		ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries);
+		if (ret != OPAL_BUSY)
+			break;
 	}
-	return OPAL_SUCCESS;
+
+	prerror("XSCOM: Read failed, ret =  %lld\n", ret);
+	return ret;
 }
 
 static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
 {
 	uint64_t hmer;
-	int64_t ret;
+	int64_t ret, retries = 0;
 
 	if (!xscom_gcid_ok(gcid)) {
 		prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
 		return OPAL_PARAMETER;
 	}
 
-	for (;;) {
+	for (retries = 0; retries <= XSCOM_BUSY_MAX_RETRIES; retries++) {
 		/* Clear status bits in HMER (HMER is special
 		 * writing to it *ands* bits
 		 */
@@ -233,14 +271,16 @@ static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
 
 		/* Check for error */
 		if (!(hmer & SPR_HMER_XSCOM_FAIL))
-			break;
+			return OPAL_SUCCESS;
 
 		/* Handle error and possibly eventually retry */
-		ret = xscom_handle_error(hmer, gcid, pcb_addr, true);
-		if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE)
-			return ret;
+		ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries);
+		if (ret != OPAL_BUSY)
+			break;
 	}
-	return OPAL_SUCCESS;
+
+	prerror("XSCOM: Write failed, ret =  %lld\n", ret);
+	return ret;
 }
 
 /*
diff --git a/include/errorlog.h b/include/errorlog.h
index b8fca7d..0a4237f 100644
--- a/include/errorlog.h
+++ b/include/errorlog.h
@@ -276,6 +276,7 @@ enum opal_reasoncode {
 	OPAL_RC_XSCOM_RW		= OPAL_XS | 0x10,
 	OPAL_RC_XSCOM_INDIRECT_RW	= OPAL_XS | 0x11,
 	OPAL_RC_XSCOM_RESET		= OPAL_XS | 0x12,
+	OPAL_RC_XSCOM_BUSY		= OPAL_XS | 0x13,
 /* PCI */
 	OPAL_RC_PCI_INIT_SLOT   = OPAL_PC | 0x10,
 	OPAL_RC_PCI_ADD_SLOT    = OPAL_PC | 0x11,
diff --git a/include/xscom.h b/include/xscom.h
index 933af6a..1aee40e 100644
--- a/include/xscom.h
+++ b/include/xscom.h
@@ -167,6 +167,12 @@
 /* HB folks say: try 10 time for now */
 #define XSCOM_IND_MAX_RETRIES		10
 
+/* Max number of retries when XSCOM remains busy */
+#define XSCOM_BUSY_MAX_RETRIES		3000
+
+/* Retry count after which to reset XSCOM, if still busy */
+#define XSCOM_BUSY_RESET_THRESHOLD	1000
+
 /*
  * Error handling:
  *
-- 
2.1.4



More information about the Skiboot mailing list