[Skiboot] [PATCH 2/2] opal/xscom: Add recovery for lost core wakeup scom failures.
Nicholas Piggin
npiggin at gmail.com
Fri Dec 8 14:52:42 AEDT 2017
On Thu, 07 Dec 2017 21:43:06 +0530
Mahesh J Salgaonkar <mahesh at linux.vnet.ibm.com> wrote:
> From: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>
>
> A hardware issue can delay a core's response to a scom while threads are
> being reconfigured, which leaves the SCOM logic in a state where
> subsequent scoms to that core can get errors. This affects the Core
> PC scom registers in the range 20010A80-20010ABF.
>
> The solution: if an xscom timeout occurs on one of the Core PC scom registers
> in the range 20010A80-20010ABF, a clearing scom write is done to
> 0x20010800 with data of '0x00000000'. This write will also time out, but it
> clears the scom logic errors. After the clearing write is done, the original
> scom operation can be retried.
>
> The scom timeout is reported as status 0x4 (Invalid address) in HMER[21-23].
This looks to me like it follows the recipe for the fix.
Reviewed-by: Nicholas Piggin <npiggin at gmail.com>
>
> Signed-off-by: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>
> ---
> hw/xscom.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
> include/xscom.h | 8 ++++++
> 2 files changed, 80 insertions(+), 3 deletions(-)
>
> diff --git a/hw/xscom.c b/hw/xscom.c
> index 2621465..2ad5114 100644
> --- a/hw/xscom.c
> +++ b/hw/xscom.c
> @@ -151,8 +151,64 @@ static void xscom_reset(uint32_t gcid)
> */
> }
>
> +static int xscom_clear_error(uint32_t gcid, uint32_t pcb_addr)
> +{
> + u64 hmer;
> + uint32_t base_xscom_addr;
> + uint32_t xscom_clear_reg = 0x20010800;
> +
> + /* only in case of p9 */
> + if (proc_gen != proc_gen_p9)
> + return 0;
> +
> + /*
> + * Due to a hardware issue where core responding to scom was delayed
> + * due to thread reconfiguration, leaves the scom logic in a state
> + * where the subsequent scom to that core can get errors. This is
> + * affected for Core PC scom registers in the range of
> + * 20010A80-20010ABF.
> + *
> + * The solution is if a xscom timeout occurs to one of Core PC scom
> + * registers in the range of 20010A80-20010ABF, a clearing scom
> + * write is done to 0x20010800 with data of '0x00000000' which will
> + * also get a timeout but clears the scom logic errors. After the
> + * clearing write is done the original scom operation can be retried.
> + *
> + * The scom timeout is reported as status 0x4 (Invalid address)
> + * in HMER[21-23].
> + */
> +
> + base_xscom_addr = pcb_addr & XSCOM_CLEAR_RANGE_MASK;
> + if (!((base_xscom_addr >= XSCOM_CLEAR_RANGE_START) &&
> + (base_xscom_addr <= XSCOM_CLEAR_RANGE_END)))
> + return 0;
> +
> + /* Reset the XSCOM or next scom operation will fail. */
> + xscom_reset(gcid);
> +
> + /* Clear errors in HMER */
> + mtspr(SPR_HMER, HMER_CLR_MASK);
> +
> + /* Write 0 to clear the xscom logic errors on target chip */
> + out_be64(xscom_addr(gcid, xscom_clear_reg), 0);
> + hmer = xscom_wait_done();
> +
> + /*
> + * Above clearing xscom write will timeout and error out with
> + * invalid access as there is no register at that address. This
> + * xscom operation just helps to clear the xscom logic error.
> + *
> + * On failure, reset the XSCOM or we'll hang on the next access
> + */
> + if (hmer & SPR_HMER_XSCOM_FAIL)
> + xscom_reset(gcid);
> +
> + return 1;
> +}
> +
> static int64_t xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
> - bool is_write, int64_t retries)
> + bool is_write, int64_t retries,
> + int64_t *xscom_clear_retries)
> {
> unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer);
> int64_t rc = OPAL_HARDWARE;
> @@ -191,6 +247,15 @@ static int64_t xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_add
> break;
> case 4: /* Invalid address / address error */
> rc = OPAL_XSCOM_ADDR_ERROR;
> + if (xscom_clear_error(gcid, pcb_addr)) {
> + /* return busy if retries still pending. */
> + if ((*xscom_clear_retries)--)
> + return OPAL_XSCOM_BUSY;
> +
> + prlog(PR_DEBUG, "XSCOM: error recovery failed for "
> + "gcid=0x%x pcb_addr=0x%x\n", gcid, pcb_addr);
> +
> + }
> break;
> case 5: /* Clock error */
> rc = OPAL_XSCOM_CLOCK_ERROR;
> @@ -253,6 +318,7 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
> {
> uint64_t hmer;
> int64_t ret, retries;
> + int64_t xscom_clear_retries = XSCOM_CLEAR_MAX_RETRIES;
>
> if (!xscom_gcid_ok(gcid)) {
> prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
> @@ -276,7 +342,8 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
> return OPAL_SUCCESS;
>
> /* Handle error and possibly eventually retry */
> - ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries);
> + ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries,
> + &xscom_clear_retries);
> if (ret != OPAL_BUSY)
> break;
> }
> @@ -303,6 +370,7 @@ static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
> {
> uint64_t hmer;
> int64_t ret, retries = 0;
> + int64_t xscom_clear_retries = XSCOM_CLEAR_MAX_RETRIES;
>
> if (!xscom_gcid_ok(gcid)) {
> prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
> @@ -326,7 +394,8 @@ static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
> return OPAL_SUCCESS;
>
> /* Handle error and possibly eventually retry */
> - ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries);
> + ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries,
> + &xscom_clear_retries);
> if (ret != OPAL_BUSY)
> break;
> }
> diff --git a/include/xscom.h b/include/xscom.h
> index 5a5d0b9..3a1374c 100644
> --- a/include/xscom.h
> +++ b/include/xscom.h
> @@ -206,6 +206,14 @@
> /* Max number of retries when XSCOM remains busy */
> #define XSCOM_BUSY_MAX_RETRIES 3000
>
> +/* Max number of retries after xscom clearing is done */
> +#define XSCOM_CLEAR_MAX_RETRIES 3
> +
> +/* xscom clear address range/mask */
> +#define XSCOM_CLEAR_RANGE_START 0x20010A00
> +#define XSCOM_CLEAR_RANGE_END 0x20010ABF
> +#define XSCOM_CLEAR_RANGE_MASK 0x200FFBFF
> +
> /* Retry count after which to reset XSCOM, if still busy */
> #define XSCOM_BUSY_RESET_THRESHOLD 1000
>
>
> _______________________________________________
> Skiboot mailing list
> Skiboot at lists.ozlabs.org
> https://lists.ozlabs.org/listinfo/skiboot
More information about the Skiboot mailing list