[Skiboot] [PATCH] hw/xscom: Reset XSCOM after finite number of retries when busy
Vipin K Parashar
vipin at linux.vnet.ibm.com
Mon May 16 20:43:52 AEST 2016
OPAL retries XSCOM read/write operations forever until they succeed.
If XSCOM remains busy for some reason, this causes XSCOM ops to hang.
Add logic to retry XSCOM operations at most XSCOM_OPS_MAX_RETRIES
times. Also add logic to reset XSCOM after every XSCOM_BUSY_MAX_RETRIES
retries, to unblock it if it remains busy for some reason.
Signed-off-by: Vipin K Parashar <vipin at linux.vnet.ibm.com>
Signed-off-by: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>
---
hw/xscom.c | 63 ++++++++++++++++++++++++++++++++++++++++--------------
include/errorlog.h | 1 +
include/xscom.h | 6 ++++++
3 files changed, 54 insertions(+), 16 deletions(-)
diff --git a/hw/xscom.c b/hw/xscom.c
index 84f72f5..04f1e33 100644
--- a/hw/xscom.c
+++ b/hw/xscom.c
@@ -41,6 +41,10 @@ DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_RESET, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
OPAL_NA);
+DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_BUSY, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
+ OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
+ OPAL_NA);
+
/* xscom details to trigger xstop */
static struct {
uint64_t addr;
@@ -119,7 +123,7 @@ static void xscom_reset(uint32_t gcid)
}
static int xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
- bool is_write)
+ bool is_write, int64_t retries)
{
unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer);
@@ -127,9 +131,26 @@ static int xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
* recovery procedures
*/
switch(stat) {
- /* XSCOM blocked, just retry */
+ /*
+ * XSCOM blocked, need to retry. Reset XSCOM after
+ * crossing retry threshold before retrying again.
+ */
case 1:
+ if (retries && !(retries % XSCOM_BUSY_MAX_RETRIES)) {
+ prlog(PR_NOTICE, "XSCOM: Busy!! Resetting after %d "
+ "retries, Total retries = %lld\n",
+ XSCOM_BUSY_MAX_RETRIES, retries);
+ xscom_reset(gcid);
+ }
+
+ /* Log error if we have retried enough and its still busy */
+ if (retries == XSCOM_OPS_MAX_RETRIES)
+ log_simple_error(&e_info(OPAL_RC_XSCOM_BUSY),
+ "XSCOM: %s-busy error gcid=0x%x pcb_addr=0x%x "
+ "stat=0x%x\n", is_write ? "write" : "read",
+ gcid, pcb_addr, stat);
return OPAL_BUSY;
+
/* CPU is asleep, don't retry */
case 2:
return OPAL_WRONG_STATE;
@@ -177,15 +198,16 @@ static bool xscom_gcid_ok(uint32_t gcid)
*/
static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
{
+ int i;
uint64_t hmer;
- int64_t ret;
+ int64_t ret, retries = 0;
if (!xscom_gcid_ok(gcid)) {
prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
return OPAL_PARAMETER;
}
- for (;;) {
+ for (i = 0; i <= XSCOM_OPS_MAX_RETRIES; i++) {
/* Clear status bits in HMER (HMER is special
* writing to it *ands* bits
*/
@@ -199,27 +221,32 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
/* Check for error */
if (!(hmer & SPR_HMER_XSCOM_FAIL))
- break;
+ return OPAL_SUCCESS;
/* Handle error and possibly eventually retry */
- ret = xscom_handle_error(hmer, gcid, pcb_addr, false);
- if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE)
- return ret;
+ ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries);
+ if (ret == OPAL_BUSY)
+ retries++;
+ else
+ break;
}
- return OPAL_SUCCESS;
+
+ prerror("XSCOM: Read failed, ret = %lld\n", ret);
+ return ret;
}
static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
{
+ int i;
uint64_t hmer;
- int64_t ret;
+ int64_t ret, retries = 0;
if (!xscom_gcid_ok(gcid)) {
prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
return OPAL_PARAMETER;
}
- for (;;) {
+ for (i = 0; i <= XSCOM_OPS_MAX_RETRIES; i++) {
/* Clear status bits in HMER (HMER is special
* writing to it *ands* bits
*/
@@ -233,14 +260,18 @@ static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
/* Check for error */
if (!(hmer & SPR_HMER_XSCOM_FAIL))
- break;
+ return OPAL_SUCCESS;
/* Handle error and possibly eventually retry */
- ret = xscom_handle_error(hmer, gcid, pcb_addr, true);
- if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE)
- return ret;
+ ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries);
+ if (ret == OPAL_BUSY)
+ retries++;
+ else
+ break;
}
- return OPAL_SUCCESS;
+
+ prerror("XSCOM: Write failed, ret = %lld\n", ret);
+ return ret;
}
/*
diff --git a/include/errorlog.h b/include/errorlog.h
index ed90dab..214aed2 100644
--- a/include/errorlog.h
+++ b/include/errorlog.h
@@ -275,6 +275,7 @@ enum opal_reasoncode {
OPAL_RC_XSCOM_RW = OPAL_XS | 0x10,
OPAL_RC_XSCOM_INDIRECT_RW = OPAL_XS | 0x11,
OPAL_RC_XSCOM_RESET = OPAL_XS | 0x12,
+ OPAL_RC_XSCOM_BUSY = OPAL_XS | 0x13,
/* PCI */
OPAL_RC_PCI_INIT_SLOT = OPAL_PC | 0x10,
OPAL_RC_PCI_ADD_SLOT = OPAL_PC | 0x11,
diff --git a/include/xscom.h b/include/xscom.h
index 933af6a..2055608 100644
--- a/include/xscom.h
+++ b/include/xscom.h
@@ -167,6 +167,12 @@
/* HB folks say: try 10 time for now */
#define XSCOM_IND_MAX_RETRIES 10
+/* Max retry count for XSCOM ops */
+#define XSCOM_OPS_MAX_RETRIES 3000
+
+/* Retry count after which to reset XSCOM, if still busy */
+#define XSCOM_BUSY_MAX_RETRIES 1000
+
/*
* Error handling:
*
--
2.1.4
More information about the Skiboot
mailing list