[Skiboot] [PATCH v4] hw/xscom: Reset XSCOM engine after finite number of retries when busy
Vipin K Parashar
vipin at linux.vnet.ibm.com
Mon Jun 6 19:30:19 AEST 2016
Please discard this one as i missed out
removing unused counter variable "i".
Please use one with subject tag "RESEND PATCH 4".
Thanks,
Vipin
On Monday 06 June 2016 02:44 PM, Vipin K Parashar wrote:
> OPAL retries XSCOM read/write operations forever till it succeeds.
> This can cause XSCOM ops to hang forever when XSCOM engine remains
> busy for some reason. Changed it to retry XSCOM operations only
> XSCOM_BUSY_MAX_RETRIES number of times instead of retrying forever.
> Also added logic to reset XSCOM engine after XSCOM_BUSY_RESET_THRESHOLD
> number of retries to unblock it when it remains busy.
>
> Cc: stable # 9c2d82394fd2 ("xscom: Return OPAL_WRONG_STATE on XSCOM ops..")
> Signed-off-by: Vipin K Parashar <vipin at linux.vnet.ibm.com>
> Signed-off-by: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>
> ---
> Changes in v4:
> - Changed xscom operation retries loop counter variable to 'retries'.
> - Added a FIXME comment to replace 10 ms delay with exact HW delay
> or better method to confirm XSCOM reset completion.
>
> Changes in v3:
> - Added delay of 10ms after XSCOM engine reset when XSCOM is found busy.
> - Modified 'if' condition to check return value of xscom_handle_error().
>
> Changes in v2:
> - Changed newly added macro names to slightly more intuitive names.
> Used XSCOM_BUSY_MAX_RETRIES to signify total retries allowed if
> XSCOM remains busy and XSCOM_BUSY_RESET_THRESHOLD to hold threshold
> count for resetting XSCOM before retrying XSCOM operation again.
>
> hw/xscom.c | 76 ++++++++++++++++++++++++++++++++++++++++++------------
> include/errorlog.h | 1 +
> include/xscom.h | 6 +++++
> 3 files changed, 66 insertions(+), 17 deletions(-)
>
> diff --git a/hw/xscom.c b/hw/xscom.c
> index 4d88ce5..708afb8 100644
> --- a/hw/xscom.c
> +++ b/hw/xscom.c
> @@ -23,6 +23,7 @@
> #include <centaur.h>
> #include <errorlog.h>
> #include <opal-api.h>
> +#include <timebase.h>
>
> /* Mask of bits to clear in HMER before an access */
> #define HMER_CLR_MASK (~(SPR_HMER_XSCOM_FAIL | \
> @@ -41,6 +42,10 @@ DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_RESET, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
> OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
> OPAL_NA);
>
> +DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_BUSY, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
> + OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
> + OPAL_NA);
> +
> /* xscom details to trigger xstop */
> static struct {
> uint64_t addr;
> @@ -118,18 +123,49 @@ static void xscom_reset(uint32_t gcid)
> */
> }
>
> -static int xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
> - bool is_write)
> +static int64_t xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
> + bool is_write, int64_t retries)
> {
> + struct timespec ts;
> unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer);
>
> /* XXX Figure out error codes from doc and error
> * recovery procedures
> */
> switch(stat) {
> - /* XSCOM blocked, just retry */
> + /*
> + * XSCOM engine is blocked, need to retry. Reset XSCOM engine
> + * after crossing retry threshold before retrying again.
> + */
> case 1:
> + if (retries && !(retries % XSCOM_BUSY_RESET_THRESHOLD)) {
> + prlog(PR_NOTICE, "XSCOM: Busy even after %d retries, "
> + "resetting XSCOM now. Total retries = %lld\n",
> + XSCOM_BUSY_RESET_THRESHOLD, retries);
> + xscom_reset(gcid);
> +
> + /*
> + * Its observed that sometimes immediate retry of
> + * XSCOM operation returns wrong data. Adding a
> + * delay for XSCOM reset to be effective. Delay of
> + * 10 ms is found to be working fine experimentally.
> + * FIXME: Replace 10ms delay by exact delay needed
> + * or other alternate method to confirm XSCOM reset
> + * completion, after checking from HW folks.
> + */
> + ts.tv_sec = 0;
> + ts.tv_nsec = 10 * 1000;
> + nanosleep_nopoll(&ts, NULL);
> + }
> +
> + /* Log error if we have retried enough and its still busy */
> + if (retries == XSCOM_BUSY_MAX_RETRIES)
> + log_simple_error(&e_info(OPAL_RC_XSCOM_BUSY),
> + "XSCOM: %s-busy error gcid=0x%x pcb_addr=0x%x "
> + "stat=0x%x\n", is_write ? "write" : "read",
> + gcid, pcb_addr, stat);
> return OPAL_BUSY;
> +
> /* CPU is asleep, don't retry */
> case 2:
> return OPAL_WRONG_STATE;
> @@ -177,15 +213,16 @@ static bool xscom_gcid_ok(uint32_t gcid)
> */
> static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
> {
> + int i;
> uint64_t hmer;
> - int64_t ret;
> + int64_t ret, retries;
>
> if (!xscom_gcid_ok(gcid)) {
> prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
> return OPAL_PARAMETER;
> }
>
> - for (;;) {
> + for (retries = 0; retries <= XSCOM_BUSY_MAX_RETRIES; retries++) {
> /* Clear status bits in HMER (HMER is special
> * writing to it *ands* bits
> */
> @@ -199,27 +236,30 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
>
> /* Check for error */
> if (!(hmer & SPR_HMER_XSCOM_FAIL))
> - break;
> + return OPAL_SUCCESS;
>
> /* Handle error and possibly eventually retry */
> - ret = xscom_handle_error(hmer, gcid, pcb_addr, false);
> - if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE)
> - return ret;
> + ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries);
> + if (ret != OPAL_BUSY)
> + break;
> }
> - return OPAL_SUCCESS;
> +
> + prerror("XSCOM: Read failed, ret = %lld\n", ret);
> + return ret;
> }
>
> static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
> {
> + int i;
> uint64_t hmer;
> - int64_t ret;
> + int64_t ret, retries = 0;
>
> if (!xscom_gcid_ok(gcid)) {
> prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
> return OPAL_PARAMETER;
> }
>
> - for (;;) {
> + for (retries = 0; retries <= XSCOM_BUSY_MAX_RETRIES; retries++) {
> /* Clear status bits in HMER (HMER is special
> * writing to it *ands* bits
> */
> @@ -233,14 +273,16 @@ static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
>
> /* Check for error */
> if (!(hmer & SPR_HMER_XSCOM_FAIL))
> - break;
> + return OPAL_SUCCESS;
>
> /* Handle error and possibly eventually retry */
> - ret = xscom_handle_error(hmer, gcid, pcb_addr, true);
> - if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE)
> - return ret;
> + ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries);
> + if (ret != OPAL_BUSY)
> + break;
> }
> - return OPAL_SUCCESS;
> +
> + prerror("XSCOM: Write failed, ret = %lld\n", ret);
> + return ret;
> }
>
> /*
> diff --git a/include/errorlog.h b/include/errorlog.h
> index b8fca7d..0a4237f 100644
> --- a/include/errorlog.h
> +++ b/include/errorlog.h
> @@ -276,6 +276,7 @@ enum opal_reasoncode {
> OPAL_RC_XSCOM_RW = OPAL_XS | 0x10,
> OPAL_RC_XSCOM_INDIRECT_RW = OPAL_XS | 0x11,
> OPAL_RC_XSCOM_RESET = OPAL_XS | 0x12,
> + OPAL_RC_XSCOM_BUSY = OPAL_XS | 0x13,
> /* PCI */
> OPAL_RC_PCI_INIT_SLOT = OPAL_PC | 0x10,
> OPAL_RC_PCI_ADD_SLOT = OPAL_PC | 0x11,
> diff --git a/include/xscom.h b/include/xscom.h
> index 933af6a..1aee40e 100644
> --- a/include/xscom.h
> +++ b/include/xscom.h
> @@ -167,6 +167,12 @@
> /* HB folks say: try 10 time for now */
> #define XSCOM_IND_MAX_RETRIES 10
>
> +/* Max number of retries when XSCOM remains busy */
> +#define XSCOM_BUSY_MAX_RETRIES 3000
> +
> +/* Retry count after which to reset XSCOM, if still busy */
> +#define XSCOM_BUSY_RESET_THRESHOLD 1000
> +
> /*
> * Error handling:
> *
More information about the Skiboot
mailing list