[Skiboot] [PATCH v3] hw/xscom: Reset XSCOM engine after finite number of retries when busy

Mahesh J Salgaonkar mahesh at linux.vnet.ibm.com
Fri Jun 3 15:46:30 AEST 2016


On 2016-05-31 17:09:01 Tue, Vipin K Parashar wrote:
> OPAL retries XSCOM read/write operations forever till it succeeds.
> This can cause XSCOM ops to hang forever when XSCOM engine remains
> busy for some reason. Changed it to retry XSCOM operations only
> XSCOM_BUSY_MAX_RETRIES number of times instead of retrying forever.
> Also added logic to reset XSCOM engine after XSCOM_BUSY_RESET_THRESHOLD
> number of retries to unblock it when it remains busy.
> 
> Cc: stable # 9c2d82394fd2 ("xscom: Return OPAL_WRONG_STATE on XSCOM ops..")
> Signed-off-by: Vipin K Parashar <vipin at linux.vnet.ibm.com>
> Signed-off-by: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>
> ---
> Changes in v3:
>  - Added delay of 10ms after XSCOM engine reset when XSCOM is found busy.
>  - Modified 'if' condition to check return value of xscom_handle_error().
> 
> Changes in v2:
>  - Changed newly added macro names to slightly more intuitive names.
>    Used XSCOM_BUSY_MAX_RETRIES to signify total retries allowed if
>    XSCOM remains busy and XSCOM_BUSY_RESET_THRESHOLD to hold threshold
>    count for resetting XSCOM before retrying XSCOM operation again.
> 
>  hw/xscom.c         | 76 ++++++++++++++++++++++++++++++++++++++++++------------
>  include/errorlog.h |  1 +
>  include/xscom.h    |  6 +++++
>  3 files changed, 66 insertions(+), 17 deletions(-)
> 
> diff --git a/hw/xscom.c b/hw/xscom.c
> index 84f72f5..2649a50 100644
> --- a/hw/xscom.c
> +++ b/hw/xscom.c
> @@ -23,6 +23,7 @@
>  #include <centaur.h>
>  #include <errorlog.h>
>  #include <opal-api.h>
> +#include <timebase.h>
>  
>  /* Mask of bits to clear in HMER before an access */
>  #define HMER_CLR_MASK	(~(SPR_HMER_XSCOM_FAIL | \
> @@ -41,6 +42,10 @@ DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_RESET, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
>  		OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
>  		OPAL_NA);
>  
> +DEFINE_LOG_ENTRY(OPAL_RC_XSCOM_BUSY, OPAL_PLATFORM_ERR_EVT, OPAL_XSCOM,
> +		OPAL_CEC_HARDWARE, OPAL_PREDICTIVE_ERR_GENERAL,
> +		OPAL_NA);
> +
>  /* xscom details to trigger xstop */
>  static struct {
>  	uint64_t addr;
> @@ -118,18 +123,46 @@ static void xscom_reset(uint32_t gcid)
>  	 */
>  }
>  
> -static int xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
> -			       bool is_write)
> +static int64_t xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
> +			      bool is_write, int64_t retries)
>  {
> +	struct timespec ts;
>  	unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer);
>  
>  	/* XXX Figure out error codes from doc and error
>  	 * recovery procedures
>  	 */
>  	switch(stat) {
> -	/* XSCOM blocked, just retry */
> +	/*
> +	 * XSCOM engine is blocked, need to retry. Reset XSCOM engine
> +	 * after crossing retry threshold before retrying again.
> +	 */
>  	case 1:
> +		if (retries && !(retries  % XSCOM_BUSY_RESET_THRESHOLD)) {
> +			prlog(PR_NOTICE, "XSCOM: Busy even after %d retries, "
> +				"resetting XSCOM now. Total retries  = %lld\n",
> +				XSCOM_BUSY_RESET_THRESHOLD, retries);
> +			xscom_reset(gcid);
> +
> +			/*
> +			 * Its observed that sometimes immediate retry of
> +			 * XSCOM operation returns wrong data. Adding a
> +			 * delay for XSCOM reset to be effective. Delay of
> +			 * 10 ms is found to be working fine experimentally.
> +			 */

I understand that today we are not sure about how to verify whether xscom reset
has succeeded or not and hence this patch adds the delay. But I think we
should add FIXME comment here so that in future we can fix this
properly by talking to hardware folks to find out alternate way to
verify xscom reset completion status.

> +			ts.tv_sec = 0;
> +			ts.tv_nsec = 10 * 1000;
> +			nanosleep_nopoll(&ts, NULL);
> +		}
> +
> +		/* Log error if we have retried enough and its still busy */
> +		if (retries == XSCOM_BUSY_MAX_RETRIES)
> +			log_simple_error(&e_info(OPAL_RC_XSCOM_BUSY),
> +				"XSCOM: %s-busy error gcid=0x%x pcb_addr=0x%x "
> +				"stat=0x%x\n", is_write ? "write" : "read",
> +				gcid, pcb_addr, stat);
>  		return OPAL_BUSY;
> +
>  	/* CPU is asleep, don't retry */
>  	case 2:
>  		return OPAL_WRONG_STATE;
> @@ -177,15 +210,16 @@ static bool xscom_gcid_ok(uint32_t gcid)
>   */
>  static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
>  {
> +	int i;
>  	uint64_t hmer;
> -	int64_t ret;
> +	int64_t ret, retries = 0;
>  
>  	if (!xscom_gcid_ok(gcid)) {
>  		prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
>  		return OPAL_PARAMETER;
>  	}
>  
> -	for (;;) {
> +	for (i = 0; i <= XSCOM_BUSY_MAX_RETRIES; i++) {
>  		/* Clear status bits in HMER (HMER is special
>  		 * writing to it *ands* bits
>  		 */
> @@ -199,27 +233,31 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
>  
>  		/* Check for error */
>  		if (!(hmer & SPR_HMER_XSCOM_FAIL))
> -			break;
> +			return OPAL_SUCCESS;
>  
>  		/* Handle error and possibly eventually retry */
> -		ret = xscom_handle_error(hmer, gcid, pcb_addr, false);
> -		if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE)
> -			return ret;
> +		ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries);
> +		if (ret != OPAL_BUSY)
> +			break;
> +		retries++;

I think we can get rid off extra variable here 'retries'. Just pass
variable 'i' to xscom_handle_error() which anyway gets incremented as
part of for loop.

>  	}
> -	return OPAL_SUCCESS;
> +
> +	prerror("XSCOM: Read failed, ret =  %lld\n", ret);
> +	return ret;
>  }
>  
>  static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
>  {
> +	int i;
>  	uint64_t hmer;
> -	int64_t ret;
> +	int64_t ret, retries = 0;
>  
>  	if (!xscom_gcid_ok(gcid)) {
>  		prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
>  		return OPAL_PARAMETER;
>  	}
>  
> -	for (;;) {
> +	for (i = 0; i <= XSCOM_BUSY_MAX_RETRIES; i++) {
>  		/* Clear status bits in HMER (HMER is special
>  		 * writing to it *ands* bits
>  		 */
> @@ -233,14 +271,18 @@ static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
>  
>  		/* Check for error */
>  		if (!(hmer & SPR_HMER_XSCOM_FAIL))
> -			break;
> +			return OPAL_SUCCESS;
>  
>  		/* Handle error and possibly eventually retry */
> -		ret = xscom_handle_error(hmer, gcid, pcb_addr, true);
> -		if (ret == OPAL_HARDWARE || ret == OPAL_WRONG_STATE)
> -			return ret;
> +		ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries);
> +		if (ret == OPAL_BUSY)
> +			retries++;
> +		else
> +			break;

Same here..


Thanks,
-Mahesh.



More information about the Skiboot mailing list