[Skiboot] [PATCH v2] hw/xscom: Reset XSCOM after finite number of retries when busy

Vipin K Parashar vipin at linux.vnet.ibm.com
Sun May 22 04:32:58 AEST 2016


On Saturday 21 May 2016 11:27 PM, Mahesh J Salgaonkar wrote:
> On 2016-05-16 21:35:44 Mon, Vipin K Parashar wrote:
>> OPAL retries XSCOM read/write operations forever till it succeeds.
>> It can cause XSCOM ops to hang, if XSCOM remains busy for some reason,
>> Changed it to retry XSCOM operations XSCOM_BUSY_MAX_RETRIES number of
>> times. Also added logic to reset XSCOM after XSCOM_BUSY_RESET_THRESHOLD
>> number of retries to unblock it, if its still busy.
>>
>> Signed-off-by: Vipin K Parashar <vipin at linux.vnet.ibm.com>
>> Signed-off-by: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>
>> ---
>> Changes in v2:
>>   - Changed newly added macro names to slightly more intuitive names.
>>     Used XSCOM_BUSY_MAX_RETRIES to signify total retries allowed if
>>     XSCOM remains busy and XSCOM_BUSY_RESET_THRESHOLD to hold threshold
>>     count for resetting XSCOM before retrying XSCOM operation again.
>>
>>   hw/xscom.c         | 63 ++++++++++++++++++++++++++++++++++++++++--------------
>>   include/errorlog.h |  1 +
>>   include/xscom.h    |  6 ++++++
>>   3 files changed, 54 insertions(+), 16 deletions(-)
>>
>> diff --git a/hw/xscom.c b/hw/xscom.c
>> index 4d88ce5..a9e62d3 100644
>> --- a/hw/xscom.c
>> +++ b/hw/xscom.c
>> @@ -41,6 +41,10 @@ DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_RESET, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
>>   		OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
>>   		OPAL_NA);
>>   
>> +DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_BUSY, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
>> +		OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
>> +		OPAL_NA);
>> +
>>   /* xscom details to trigger xstop */
>>   static struct {
>>   	uint64_t addr;
>> @@ -119,7 +123,7 @@ static void xscom_reset(uint32_t gcid)
>>   }
>>   
>>   static int xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
>> -			       bool is_write)
>> +			      bool is_write, int64_t retries)
>>   {
>>   	unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer);
>>   
>> @@ -127,9 +131,26 @@ static int xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
>>   	 * recovery procedures
>>   	 */
>>   	switch(stat) {
>> -	/* XSCOM blocked, just retry */
>> +	/*
>> +	 * XSCOM blocked, need to retry. Reset XSCOM after
>> +	 * crossing retry threshold before retrying again.
>> +	 */
>>   	case 1:
>> +		if (retries && !(retries  % XSCOM_BUSY_RESET_THRESHOLD)) {
>> +			prlog(PR_NOTICE, "XSCOM: Busy!! Resetting after %d "
>> +				"retries, Total retries  = %lld\n",
>> +				XSCOM_BUSY_RESET_THRESHOLD, retries);
>> +			xscom_reset(gcid);
>> +		}
>> +
>> +		/* Log error if we have retried enough and its still busy */
>> +		if (retries == XSCOM_BUSY_MAX_RETRIES)
>> +			log_simple_error(&e_info(OPAL_RC_XSCOM_BUSY),
>> +				"XSCOM: %s-busy error gcid=0x%x pcb_addr=0x%x "
>> +				"stat=0x%x\n", is_write ? "write" : "read",
>> +				gcid, pcb_addr, stat);
>>   		return OPAL_BUSY;
>> +
>>   	/* CPU is asleep, don't retry */
>>   	case 2:
>>   		return OPAL_WRONG_STATE;
>> @@ -177,15 +198,16 @@ static bool xscom_gcid_ok(uint32_t gcid)
>>    */
>>   static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
>>   {
>> +	int i;
>>   	uint64_t hmer;
>> -	int64_t ret;
>> +	int64_t ret, retries = 0;
>>   
>>   	if (!xscom_gcid_ok(gcid)) {
>>   		prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
>>   		return OPAL_PARAMETER;
>>   	}
>>   
>> -	for (;;) {
>> +	for (i = 0; i <= XSCOM_BUSY_MAX_RETRIES; i++) {
>>   		/* Clear status bits in HMER (HMER is special
>>   		 * writing to it *ands* bits
>>   		 */
>> @@ -199,27 +221,32 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
>>   
>>   		/* Check for error */
>>   		if (!(hmer & SPR_HMER_XSCOM_FAIL))
>> -			break;
>> +			return OPAL_SUCCESS;
>>   
>>   		/* Handle error and possibly eventually retry */
>> -		ret = xscom_handle_error(hmer, gcid, pcb_addr, false);
>> -		if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE)
>> -			return ret;
>> +		ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries);
>> +		if (ret == OPAL_BUSY)
>> +			retries++;
>> +		else
>> +			break;
> How about re-writing above 5 lines as below ?
>
> 	ret = xscom_handle_error(hmer, gcid, pcb_addr, false, i);
> 	if (ret != OPAL_BUSY)
> 		break;

Perfect!! Will change.
Thanks for review Mahesh.

> Thanks,
> -Mahesh.
>
>>   	}
>> -	return OPAL_SUCCESS;
>> +
>> +	prerror("XSCOM: Read failed, ret =  %lld\n", ret);
>> +	return ret;
>>   }
>>   
>>   static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
>>   {
>> +	int i;
>>   	uint64_t hmer;
>> -	int64_t ret;
>> +	int64_t ret, retries = 0;
>>   
>>   	if (!xscom_gcid_ok(gcid)) {
>>   		prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
>>   		return OPAL_PARAMETER;
>>   	}
>>   
>> -	for (;;) {
>> +	for (i = 0; i <= XSCOM_BUSY_MAX_RETRIES; i++) {
>>   		/* Clear status bits in HMER (HMER is special
>>   		 * writing to it *ands* bits
>>   		 */
>> @@ -233,14 +260,18 @@ static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
>>   
>>   		/* Check for error */
>>   		if (!(hmer & SPR_HMER_XSCOM_FAIL))
>> -			break;
>> +			return OPAL_SUCCESS;
>>   
>>   		/* Handle error and possibly eventually retry */
>> -		ret = xscom_handle_error(hmer, gcid, pcb_addr, true);
>> -		if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE)
>> -			return ret;
>> +		ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries);
>> +		if (ret == OPAL_BUSY)
>> +			retries++;
>> +		else
>> +			break;
>>   	}
>> -	return OPAL_SUCCESS;
>> +
>> +	prerror("XSCOM: Write failed, ret =  %lld\n", ret);
>> +	return ret;
>>   }
>>   
>>   /*
>> diff --git a/include/errorlog.h b/include/errorlog.h
>> index b8fca7d..0a4237f 100644
>> --- a/include/errorlog.h
>> +++ b/include/errorlog.h
>> @@ -276,6 +276,7 @@ enum opal_reasoncode {
>>   	OPAL_RC_XSCOM_RW		= OPAL_XS | 0x10,
>>   	OPAL_RC_XSCOM_INDIRECT_RW	= OPAL_XS | 0x11,
>>   	OPAL_RC_XSCOM_RESET		= OPAL_XS | 0x12,
>> +	OPAL_RC_XSCOM_BUSY		= OPAL_XS | 0x13,
>>   /* PCI */
>>   	OPAL_RC_PCI_INIT_SLOT   = OPAL_PC | 0x10,
>>   	OPAL_RC_PCI_ADD_SLOT    = OPAL_PC | 0x11,
>> diff --git a/include/xscom.h b/include/xscom.h
>> index 933af6a..1aee40e 100644
>> --- a/include/xscom.h
>> +++ b/include/xscom.h
>> @@ -167,6 +167,12 @@
>>   /* HB folks say: try 10 time for now */
>>   #define XSCOM_IND_MAX_RETRIES		10
>>   
>> +/* Max number of retries when XSCOM remains busy */
>> +#define XSCOM_BUSY_MAX_RETRIES		3000
>> +
>> +/* Retry count after which to reset XSCOM, if still busy */
>> +#define XSCOM_BUSY_RESET_THRESHOLD	1000
>> +
>>   /*
>>    * Error handling:
>>    *
>> -- 
>> 2.1.4
>>
>> _______________________________________________
>> Skiboot mailing list
>> Skiboot at lists.ozlabs.org
>> https://lists.ozlabs.org/listinfo/skiboot



More information about the Skiboot mailing list