[Skiboot] [PATCH v2] hw/xscom: Reset XSCOM after finite number of retries when busy
Vipin K Parashar
vipin at linux.vnet.ibm.com
Sun May 22 04:32:58 AEST 2016
On Saturday 21 May 2016 11:27 PM, Mahesh J Salgaonkar wrote:
> On 2016-05-16 21:35:44 Mon, Vipin K Parashar wrote:
>> OPAL retries XSCOM read/write operations forever till it succeeds.
>> It can cause XSCOM ops to hang, if XSCOM remains busy for some reason,
>> Changed it to retry XSCOM operations XSCOM_BUSY_MAX_RETRIES number of
>> times. Also added logic to reset XSCOM after XSCOM_BUSY_RESET_THRESHOLD
>> number of retries to unblock it, if its still busy.
>>
>> Signed-off-by: Vipin K Parashar <vipin at linux.vnet.ibm.com>
>> Signed-off-by: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>
>> ---
>> Changes in v2:
>> - Changed newly added macro names to slightly more intuitive names.
>> Used XSCOM_BUSY_MAX_RETRIES to signify total retries allowed if
>> XSCOM remains busy and XSCOM_BUSY_RESET_THRESHOLD to hold threshold
>> count for resetting XSCOM before retrying XSCOM operation again.
>>
>> hw/xscom.c | 63 ++++++++++++++++++++++++++++++++++++++++--------------
>> include/errorlog.h | 1 +
>> include/xscom.h | 6 ++++++
>> 3 files changed, 54 insertions(+), 16 deletions(-)
>>
>> diff --git a/hw/xscom.c b/hw/xscom.c
>> index 4d88ce5..a9e62d3 100644
>> --- a/hw/xscom.c
>> +++ b/hw/xscom.c
>> @@ -41,6 +41,10 @@ DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_RESET, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
>> OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
>> OPAL_NA);
>>
>> +DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_BUSY, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
>> + OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
>> + OPAL_NA);
>> +
>> /* xscom details to trigger xstop */
>> static struct {
>> uint64_t addr;
>> @@ -119,7 +123,7 @@ static void xscom_reset(uint32_t gcid)
>> }
>>
>> static int xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
>> - bool is_write)
>> + bool is_write, int64_t retries)
>> {
>> unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer);
>>
>> @@ -127,9 +131,26 @@ static int xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
>> * recovery procedures
>> */
>> switch(stat) {
>> - /* XSCOM blocked, just retry */
>> + /*
>> + * XSCOM blocked, need to retry. Reset XSCOM after
>> + * crossing retry threshold before retrying again.
>> + */
>> case 1:
>> + if (retries && !(retries % XSCOM_BUSY_RESET_THRESHOLD)) {
>> + prlog(PR_NOTICE, "XSCOM: Busy!! Resetting after %d "
>> + "retries, Total retries = %lld\n",
>> + XSCOM_BUSY_RESET_THRESHOLD, retries);
>> + xscom_reset(gcid);
>> + }
>> +
>> + /* Log error if we have retried enough and its still busy */
>> + if (retries == XSCOM_BUSY_MAX_RETRIES)
>> + log_simple_error(&e_info(OPAL_RC_XSCOM_BUSY),
>> + "XSCOM: %s-busy error gcid=0x%x pcb_addr=0x%x "
>> + "stat=0x%x\n", is_write ? "write" : "read",
>> + gcid, pcb_addr, stat);
>> return OPAL_BUSY;
>> +
>> /* CPU is asleep, don't retry */
>> case 2:
>> return OPAL_WRONG_STATE;
>> @@ -177,15 +198,16 @@ static bool xscom_gcid_ok(uint32_t gcid)
>> */
>> static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
>> {
>> + int i;
>> uint64_t hmer;
>> - int64_t ret;
>> + int64_t ret, retries = 0;
>>
>> if (!xscom_gcid_ok(gcid)) {
>> prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
>> return OPAL_PARAMETER;
>> }
>>
>> - for (;;) {
>> + for (i = 0; i <= XSCOM_BUSY_MAX_RETRIES; i++) {
>> /* Clear status bits in HMER (HMER is special
>> * writing to it *ands* bits
>> */
>> @@ -199,27 +221,32 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
>>
>> /* Check for error */
>> if (!(hmer & SPR_HMER_XSCOM_FAIL))
>> - break;
>> + return OPAL_SUCCESS;
>>
>> /* Handle error and possibly eventually retry */
>> - ret = xscom_handle_error(hmer, gcid, pcb_addr, false);
>> - if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE)
>> - return ret;
>> + ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries);
>> + if (ret == OPAL_BUSY)
>> + retries++;
>> + else
>> + break;
> How about re-writing above 5 lines as below ?
>
> ret = xscom_handle_error(hmer, gcid, pcb_addr, false, i);
> if (ret != OPAL_BUSY)
> break;
Perfect!! Will change.
Thanks for review Mahesh.
> Thanks,
> -Mahesh.
>
>> }
>> - return OPAL_SUCCESS;
>> +
>> + prerror("XSCOM: Read failed, ret = %lld\n", ret);
>> + return ret;
>> }
>>
>> static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
>> {
>> + int i;
>> uint64_t hmer;
>> - int64_t ret;
>> + int64_t ret, retries = 0;
>>
>> if (!xscom_gcid_ok(gcid)) {
>> prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
>> return OPAL_PARAMETER;
>> }
>>
>> - for (;;) {
>> + for (i = 0; i <= XSCOM_BUSY_MAX_RETRIES; i++) {
>> /* Clear status bits in HMER (HMER is special
>> * writing to it *ands* bits
>> */
>> @@ -233,14 +260,18 @@ static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
>>
>> /* Check for error */
>> if (!(hmer & SPR_HMER_XSCOM_FAIL))
>> - break;
>> + return OPAL_SUCCESS;
>>
>> /* Handle error and possibly eventually retry */
>> - ret = xscom_handle_error(hmer, gcid, pcb_addr, true);
>> - if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE)
>> - return ret;
>> + ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries);
>> + if (ret == OPAL_BUSY)
>> + retries++;
>> + else
>> + break;
>> }
>> - return OPAL_SUCCESS;
>> +
>> + prerror("XSCOM: Write failed, ret = %lld\n", ret);
>> + return ret;
>> }
>>
>> /*
>> diff --git a/include/errorlog.h b/include/errorlog.h
>> index b8fca7d..0a4237f 100644
>> --- a/include/errorlog.h
>> +++ b/include/errorlog.h
>> @@ -276,6 +276,7 @@ enum opal_reasoncode {
>> OPAL_RC_XSCOM_RW = OPAL_XS | 0x10,
>> OPAL_RC_XSCOM_INDIRECT_RW = OPAL_XS | 0x11,
>> OPAL_RC_XSCOM_RESET = OPAL_XS | 0x12,
>> + OPAL_RC_XSCOM_BUSY = OPAL_XS | 0x13,
>> /* PCI */
>> OPAL_RC_PCI_INIT_SLOT = OPAL_PC | 0x10,
>> OPAL_RC_PCI_ADD_SLOT = OPAL_PC | 0x11,
>> diff --git a/include/xscom.h b/include/xscom.h
>> index 933af6a..1aee40e 100644
>> --- a/include/xscom.h
>> +++ b/include/xscom.h
>> @@ -167,6 +167,12 @@
>> /* HB folks say: try 10 time for now */
>> #define XSCOM_IND_MAX_RETRIES 10
>>
>> +/* Max number of retries when XSCOM remains busy */
>> +#define XSCOM_BUSY_MAX_RETRIES 3000
>> +
>> +/* Retry count after which to reset XSCOM, if still busy */
>> +#define XSCOM_BUSY_RESET_THRESHOLD 1000
>> +
>> /*
>> * Error handling:
>> *
>> --
>> 2.1.4
>>
>> _______________________________________________
>> Skiboot mailing list
>> Skiboot at lists.ozlabs.org
>> https://lists.ozlabs.org/listinfo/skiboot
More information about the Skiboot
mailing list