[Skiboot] [PATCH 06/14] opal/hmi: Rework HMI handling of TFAC errors
Stewart Smith
stewart at linux.ibm.com
Fri Apr 13 15:28:27 AEST 2018
Mahesh J Salgaonkar <mahesh at linux.vnet.ibm.com> writes:
> From: Benjamin Herrenschmidt <benh at kernel.crashing.org>
>
> This patch reworks the HMI handling for TFAC errors by introducing
> 4 rendez-vous points to improve the thread synchronization while handling
> timebase errors that require all threads to clear dirty data from the
> TB/HDEC registers before clearing the errors.
>
> Signed-off-by: Benjamin Herrenschmidt <benh at kernel.crashing.org>
> Signed-off-by: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>
> ---
> core/cpu.c | 2
> core/hmi.c | 519 +++++++++++++++++++++++------------------------------
> hw/chiptod.c | 118 ++++--------
> include/chiptod.h | 6 +
> include/cpu.h | 3
> 5 files changed, 276 insertions(+), 372 deletions(-)
>
> diff --git a/core/cpu.c b/core/cpu.c
> index e243344aa..251ca5e88 100644
> --- a/core/cpu.c
> +++ b/core/cpu.c
> @@ -1088,7 +1088,6 @@ void init_all_cpus(void)
> #endif
> t->core_hmi_state = 0;
> t->core_hmi_state_ptr = &t->core_hmi_state;
> - t->thread_mask = 1;
>
> /* Add associativity properties */
> add_core_associativity(t);
> @@ -1116,7 +1115,6 @@ void init_all_cpus(void)
> t->node = cpu;
> t->chip_id = chip_id;
> t->core_hmi_state_ptr = &pt->core_hmi_state;
> - t->thread_mask = 1 << thread;
> }
> prlog(PR_INFO, "CPU: %d secondary threads\n", thread);
> }
> diff --git a/core/hmi.c b/core/hmi.c
> index df8697b75..cd9420ad2 100644
> --- a/core/hmi.c
> +++ b/core/hmi.c
> @@ -184,8 +184,12 @@
> */
> #define NX_HMI_ACTIVE PPC_BIT(54)
>
> -/* Number of iterations for the various timeouts */
> -#define TIMEOUT_LOOPS 20000000
> +/*
> + * Number of iterations for the various timeouts. We can't use the timebase
> + * as it might be broken. We measured experimentally that 40 million loops
> + * of cpu_relax() give us more than 1s. The margin is comfortable enough.
> + */
> +#define TIMEOUT_LOOPS 40000000
>
> /* TFMR other errors. (other than bit 26 and 45) */
> #define SPR_TFMR_OTHER_ERRORS \
> @@ -195,6 +199,18 @@
> SPR_TFMR_DEC_PARITY_ERR | SPR_TFMR_TFMR_CORRUPT | \
> SPR_TFMR_CHIP_TOD_INTERRUPT)
>
> +/* TFMR "all core" errors (sent to all threads) */
> +#define SPR_TFMR_CORE_ERRORS \
> + (SPR_TFMR_TBST_CORRUPT | SPR_TFMR_TB_MISSING_SYNC | \
> + SPR_TFMR_TB_MISSING_STEP | SPR_TFMR_FW_CONTROL_ERR | \
> + SPR_TFMR_TFMR_CORRUPT | SPR_TFMR_TB_RESIDUE_ERR | \
> + SPR_TFMR_HDEC_PARITY_ERROR | SPR_TFMR_CHIP_TOD_INTERRUPT)
> +
> +/* TFMR "thread" errors */
> +#define SPR_TFMR_THREAD_ERRORS \
> + (SPR_TFMR_PURR_PARITY_ERR | SPR_TFMR_SPURR_PARITY_ERR | \
> + SPR_TFMR_DEC_PARITY_ERR)
> +
> static const struct core_xstop_bit_info {
> uint8_t bit; /* CORE FIR bit number */
> enum OpalHMI_CoreXstopReason reason;
> @@ -792,360 +808,279 @@ static void decode_malfunction(struct OpalHMIEvent *hmi_evt, uint64_t *out_flags
> *out_flags |= flags;
> }
>
> -static void wait_for_cleanup_complete(void)
> -{
> - uint64_t timeout = 0;
> -
> - smt_lowest();
> - while (!(*(this_cpu()->core_hmi_state_ptr) & HMI_STATE_CLEANUP_DONE)) {
> - /*
> - * We use a fixed number of TIMEOUT_LOOPS rather
> - * than using the timebase to do a pseudo-wall time
> - * timeout due to the fact that timebase may not actually
> - * work at this point in time.
> - */
> - if (++timeout >= (TIMEOUT_LOOPS*3)) {
> - /*
> - * Break out the loop here and fall through
> - * recovery code. If recovery fails, kernel will get
> - * informed about the failure. This way we can avoid
> - * looping here if other threads are stuck.
> - */
> - prlog(PR_DEBUG, "TB pre-recovery timeout\n");
> - break;
> - }
> - barrier();
> - }
> - smt_medium();
> -}
> -
> /*
> - * For successful recovery of TB residue error, remove dirty data
> - * from TB/HDEC register in each active partition (subcore). Writing
> - * zero's to TB/HDEC will achieve the same.
> + * This will "rendez-vous" all threads on the core to the rendez-vous
> + * id "sig". You need to make sure that "sig" is different from the
> + * previous rendez vous. The sig value must be between 0 and 7 with
> + * boot time being set to 0.
> + *
> + * Note: in theory, we could just use a flip flop "sig" in the thread
> + * structure (binary rendez-vous with no argument). This is a bit more
> + * debuggable and better at handling timeouts (arguably).
> + *
> + * This should be called with no lock held
> */
> -static void timer_facility_do_cleanup(uint64_t tfmr)
> +static void hmi_rendez_vous(uint32_t sig)
> {
> + struct cpu_thread *t = this_cpu();
> + uint32_t my_id = cpu_get_thread_index(t);
> + uint32_t my_shift = my_id << 2;
> + uint32_t *sptr = t->core_hmi_state_ptr;
> + uint32_t val, prev, shift, i;
> + uint64_t timeout;
> +
> + assert(sig <= 0x7);
> +
> /*
> - * Workaround for HW logic bug in Power9. Do not reset the
> - * TB register if TB is valid and running.
> + * Mark ourselves as having reached the rendez vous point with
> + * the exit bit cleared
> */
> - if ((tfmr & SPR_TFMR_TB_RESIDUE_ERR) && !(tfmr & SPR_TFMR_TB_VALID)) {
> + do {
> + val = prev = *sptr;
> + val &= ~(0xfu << my_shift);
> + val |= sig << my_shift;
> + } while (cmpxchg32(sptr, prev, val) != prev);
>
> - /* Reset the TB register to clear the dirty data. */
> - mtspr(SPR_TBWU, 0);
> - mtspr(SPR_TBWL, 0);
> + /*
> + * Wait for everybody else to reach that point, ignore the
> + * exit bit as another thread could have already set it.
> + */
> + for (i = 0; i < cpu_thread_count; i++) {
> + shift = i << 2;
> +
> + timeout = TIMEOUT_LOOPS;
> + while (((*sptr >> shift) & 0x7) != sig && --timeout)
> + cpu_relax();
> + if (!timeout)
> + prlog(PR_ERR, "Rendez-vous stage 1 timeout, CPU 0x%x"
> + " waiting for thread %d\n", t->pir, i);
> }
>
> - if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR) {
> - /* Reset HDEC register */
> - mtspr(SPR_HDEC, 0);
> + /* Set the exit bit */
> + do {
> + val = prev = *sptr;
> + val &= ~(0xfu << my_shift);
> + val |= (sig | 8) << my_shift;
> + } while (cmpxchg32(sptr, prev, val) != prev);
> +
> + /* At this point, we need to wait for everybody else to have a value
> + * that is *not* sig. IE. they either have set the exit bit *or* they
> + * have changed the rendez-vous (meaning they have moved on to another
> + * rendez vous point).
> + */
> + for (i = 0; i < cpu_thread_count; i++) {
> + shift = i << 2;
> +
> + timeout = TIMEOUT_LOOPS;
> + while (((*sptr >> shift) & 0xf) == sig && --timeout)
> + cpu_relax();
> + if (!timeout)
> + prlog(PR_ERR, "Rendez-vous stage 2 timeout, CPU 0x%x"
> + " waiting for thread %d\n", t->pir, i);
> }
> }
>
> -static int get_split_core_mode(void)
> +static void hmi_print_debug(const uint8_t *msg, uint64_t hmer)
> {
> - uint64_t hid0;
> + const char *loc;
> + uint32_t core_id, thread_index;
>
> - hid0 = mfspr(SPR_HID0);
> - if (hid0 & SPR_HID0_POWER8_2LPARMODE)
> - return 2;
> - else if (hid0 & SPR_HID0_POWER8_4LPARMODE)
> - return 4;
> + core_id = pir_to_core_id(this_cpu()->pir);
> + thread_index = cpu_get_thread_index(this_cpu());
>
> - return 1;
> -}
> + loc = chip_loc_code(this_cpu()->chip_id);
> + if (!loc)
> + loc = "Not Available";
>
> + if (hmer & (SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR)) {
> + prlog(PR_DEBUG, "[Loc: %s]: P:%d C:%d T:%d: TFMR(%016lx) %s\n",
> + loc, this_cpu()->chip_id, core_id, thread_index,
> + mfspr(SPR_TFMR), msg);
> + } else {
> + prlog(PR_DEBUG, "[Loc: %s]: P:%d C:%d T:%d: %s\n",
> + loc, this_cpu()->chip_id, core_id, thread_index,
> + msg);
> + }
> +}
>
> -/*
> - * Certain TB/HDEC errors leaves dirty data in timebase and hdec register
> - * which need to cleared before we initiate clear_tb_errors through TFMR[24].
> - * The cleanup has to be done by once by any one thread from core or subcore.
> - *
> - * In split core mode, it is required to clear the dirty data from TB/HDEC
> - * register by all subcores (active partitions) before we clear tb errors
> - * through TFMR[24]. The HMI recovery would fail even if one subcore do
> - * not cleanup the respective TB/HDEC register.
> - *
> - * For un-split core, any one thread can do the cleanup.
> - * For split core, any one thread from each subcore can do the cleanup.
> - *
> - * Errors that required pre-recovery cleanup:
> - * - SPR_TFMR_TB_RESIDUE_ERR
> - * - SPR_TFMR_HDEC_PARITY_ERROR
> - */
> -static void pre_recovery_cleanup_p8(uint64_t *out_flags)
> +static int handle_thread_tfac_error(uint64_t tfmr, uint64_t *out_flags)
> {
> - uint64_t tfmr;
> - uint32_t sibling_thread_mask;
> - int split_core_mode, subcore_id, thread_id, threads_per_core;
> - int i;
> + int recover = 1;
>
> - /*
> - * Exit if it is not the error that leaves dirty data in timebase
> - * or HDEC register. OR this may be the thread which came in very
> - * late and recovery is been already done.
> - *
> - * TFMR is per [sub]core register. If any one thread on the [sub]core
> - * does the recovery it reflects in TFMR register and applicable to
> - * all threads in that [sub]core. Hence take a lock before checking
> - * TFMR errors. Once a thread from a [sub]core completes the
> - * recovery, all other threads on that [sub]core will return from
> - * here.
> - *
> - * If TFMR does not show error that we are looking for, return
> - * from here. We would just fall through recovery code which would
> - * check for other errors on TFMR and fix them.
> - */
> - lock(&hmi_lock);
> - tfmr = mfspr(SPR_TFMR);
> - if (!(tfmr & SPR_TFMR_TB_VALID))
> - *out_flags |= OPAL_HMI_FLAGS_TB_RESYNC;
> if (tfmr & SPR_TFMR_DEC_PARITY_ERR)
> *out_flags |= OPAL_HMI_FLAGS_DEC_LOST;
> - if (!(tfmr & (SPR_TFMR_TB_RESIDUE_ERR | SPR_TFMR_HDEC_PARITY_ERROR))) {
> - unlock(&hmi_lock);
> - return;
> - }
> + if (!tfmr_recover_local_errors(tfmr))
> + recover = 0;
> + tfmr &= ~(SPR_TFMR_PURR_PARITY_ERR |
> + SPR_TFMR_SPURR_PARITY_ERR |
> + SPR_TFMR_DEC_PARITY_ERR);
> + return recover;
> +}
>
> - /* Tell OS about a possible loss of HDEC */
> - if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)
> - *out_flags |= OPAL_HMI_FLAGS_HDEC_LOST;
> +static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
> +{
> + struct cpu_thread *t, *t0;
> + int recover = 1;
>
> - /* Gather split core information. */
> - split_core_mode = get_split_core_mode();
> - threads_per_core = cpu_thread_count / split_core_mode;
> + t = this_cpu();
> + t0 = find_cpu_by_pir(cpu_get_thread0(t));
>
> - /* Prepare core/subcore sibling mask */
> - thread_id = cpu_get_thread_index(this_cpu());
> - subcore_id = thread_id / threads_per_core;
> - sibling_thread_mask = SUBCORE_THREAD_MASK(subcore_id, threads_per_core);
> + /* Rendez vous all threads */
> + hmi_rendez_vous(1);
>
> - /*
> - * First thread on the core ?
> - * if yes, setup the hmi cleanup state to !DONE
> + /* We use a lock here as some of the TFMR bits are shared and I
> + * prefer avoiding doing the cleanup simultaneously.
> */
> - if ((*(this_cpu()->core_hmi_state_ptr) & CORE_THREAD_MASK) == 0)
> - *(this_cpu()->core_hmi_state_ptr) &= ~HMI_STATE_CLEANUP_DONE;
> + lock(&hmi_lock);
>
> - /*
> - * First thread on subcore ?
> - * if yes, do cleanup.
> - *
> - * Clear TB and wait for other threads (one from each subcore) to
> - * finish its cleanup work.
> + /* First handle corrupt TFMR otherwise we can't trust anything.
> + * We'll use a lock here so that the threads don't try to do it at
> + * the same time
> */
> + if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
> + /* Check if it's still in error state */
> + if (mfspr(SPR_TFMR) & SPR_TFMR_TFMR_CORRUPT)
> + if (!recover_corrupt_tfmr())
> + recover = 0;
>
> - if ((*(this_cpu()->core_hmi_state_ptr) & sibling_thread_mask) == 0)
> - timer_facility_do_cleanup(tfmr);
> + if (!recover)
> + goto error_out;
>
> - /*
> - * Mark this thread bit. This bit will stay on until this thread
> - * exit from handle_hmi_exception().
> - */
> - *(this_cpu()->core_hmi_state_ptr) |= this_cpu()->thread_mask;
> + tfmr = mfspr(SPR_TFMR);
>
> - /*
> - * Check if each subcore has completed the cleanup work.
> - * if yes, then notify all the threads that we are done with cleanup.
> - */
> - for (i = 0; i < split_core_mode; i++) {
> - uint32_t subcore_thread_mask =
> - SUBCORE_THREAD_MASK(i, threads_per_core);
> - if (!(*(this_cpu()->core_hmi_state_ptr) & subcore_thread_mask))
> - break;
> + /* We could have got new thread errors in the meantime */
> + if (tfmr & SPR_TFMR_THREAD_ERRORS) {
> + recover = handle_thread_tfac_error(tfmr, out_flags);
> + tfmr &= ~SPR_TFMR_THREAD_ERRORS;
> + }
> + if (!recover)
> + goto error_out;
> }
>
> - if (i == split_core_mode)
> - *(this_cpu()->core_hmi_state_ptr) |= HMI_STATE_CLEANUP_DONE;
> -
> - unlock(&hmi_lock);
> + /* Tell the OS ... */
> + if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)
> + *out_flags |= OPAL_HMI_FLAGS_HDEC_LOST;
>
> - /* Wait for other subcore to complete the cleanup. */
> - wait_for_cleanup_complete();
> -}
> + /* Cleanup bad HDEC or TB on all threads or subcores before we clear
> + * the error conditions
> + */
> + tfmr_cleanup_core_errors(tfmr);
>
> -/*
> - * Certain TB/HDEC errors leaves dirty data in timebase and hdec register
> - * which need to cleared before we initiate clear_tb_errors through TFMR[24].
> - * The cleanup has to be done by all the threads from core in p9.
> - *
> - * On TB/HDEC errors, all 4 threads on the affected receives HMI. On power9,
> - * every thread on the core has its own copy of TB and hence every thread
> - * has to clear the dirty data from its own TB register before we clear tb
> - * errors through TFMR[24]. The HMI recovery would fail even if one thread
> - * do not cleanup the respective TB/HDEC register.
> - *
> - * There is no split core mode in power9.
> - *
> - * Errors that required pre-recovery cleanup:
> - * - SPR_TFMR_TB_RESIDUE_ERR
> - * - SPR_TFMR_HDEC_PARITY_ERROR
> - */
> -static void pre_recovery_cleanup_p9(uint64_t *out_flags)
> -{
> - uint64_t tfmr;
> - int threads_per_core = cpu_thread_count;
> - int i;
> + /* Unlock before next rendez-vous */
> + unlock(&hmi_lock);
>
> - /*
> - * Exit if it is not the error that leaves dirty data in timebase
> - * or HDEC register. OR this may be the thread which came in very
> - * late and recovery is been already done.
> - *
> - * TFMR is per core register. Ideally if any one thread on the core
> - * does the recovery it should reflect in TFMR register and
> - * applicable to all threads in that core. Hence take a lock before
> - * checking TFMR errors. Once a thread from a core completes the
> - * recovery, all other threads on that core will return from
> - * here.
> - *
> - * If TFMR does not show error that we are looking for, return
> - * from here. We would just fall through recovery code which would
> - * check for other errors on TFMR and fix them.
> + /* Second rendez vous, ensure the above cleanups are all done before
> + * we proceed further
> */
> - lock(&hmi_lock);
> - tfmr = mfspr(SPR_TFMR);
> - if (!(tfmr & SPR_TFMR_TB_VALID))
> - *out_flags |= OPAL_HMI_FLAGS_TB_RESYNC;
> - if (tfmr & SPR_TFMR_DEC_PARITY_ERR)
> - *out_flags |= OPAL_HMI_FLAGS_DEC_LOST;
> - if (!(tfmr & (SPR_TFMR_TB_RESIDUE_ERR | SPR_TFMR_HDEC_PARITY_ERROR))) {
> - unlock(&hmi_lock);
> - return;
> - }
> + hmi_rendez_vous(2);
>
> - /*
> - * Due to a HW logic bug in p9, TFMR bit 26 and 45 always set
> - * once TB residue or HDEC errors occurs at first time. Hence for HMI
> - * on subsequent TB errors add additional check as workaround to
> - * identify validity of the errors and decide whether pre-recovery
> - * is required or not. Exit pre-recovery if there are other TB
> - * errors also present on TFMR.
> - */
> - if (tfmr & SPR_TFMR_OTHER_ERRORS) {
> - unlock(&hmi_lock);
> - return;
> + /* We can now clear the error conditions in the core. */
> + if (!tfmr_clear_core_errors(tfmr)) {
> + recover = 0;
> + goto error_out;
> }
>
> - /*
> - * First thread on the core ?
> - * if yes, setup the hmi cleanup state to !DONE
> + /* Third rendez-vous. We could in theory do the timebase resync as
> + * part of the previous one, but I prefer having all the error
> + * conditions cleared before we start trying.
> */
> - if ((*(this_cpu()->core_hmi_state_ptr) & CORE_THREAD_MASK) == 0)
> - *(this_cpu()->core_hmi_state_ptr) &= ~HMI_STATE_CLEANUP_DONE;
> + hmi_rendez_vous(3);
>
> - /* Tell OS about a possible loss of HDEC */
> - if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)
> - *out_flags |= OPAL_HMI_FLAGS_HDEC_LOST;
> + /* Now perform the actual TB recovery on thread 0 */
> + if (t == t0)
> + recover = chiptod_recover_tb_errors(tfmr,
> + &this_cpu()->tb_resynced);
>
> - /*
> - * Clear TB and wait for other threads to finish its cleanup work.
> - */
> - timer_facility_do_cleanup(tfmr);
> -
> - /*
> - * Mark this thread bit. This bit will stay on until this thread
> - * exit from handle_hmi_exception().
> - */
> - *(this_cpu()->core_hmi_state_ptr) |= this_cpu()->thread_mask;
> +error_out:
> + /* Last rendez-vous */
> + hmi_rendez_vous(4);
>
> - /*
> - * Check if each thread has completed the cleanup work.
> - * if yes, then notify all the threads that we are done with cleanup.
> + /* Now all threads have gone past rendez-vous 3 and not yet past another
> + * rendez-vous 1, so the value of tb_resynced of thread 0 of the core
> + * contains an accurate indication as to whether the timebase was lost.
> */
> - for (i = 0; i < threads_per_core; i++) {
> - uint32_t thread_mask = SINGLE_THREAD_MASK(i);
> - if (!(*(this_cpu()->core_hmi_state_ptr) & thread_mask))
> - break;
> - }
> -
> - if (i == threads_per_core)
> - *(this_cpu()->core_hmi_state_ptr) |= HMI_STATE_CLEANUP_DONE;
> -
> - unlock(&hmi_lock);
> -
> - /* Wait for other threads to complete the cleanup. */
> - wait_for_cleanup_complete();
> -}
> + if (t0->tb_resynced)
> + *out_flags |= OPAL_HMI_FLAGS_TB_RESYNC;
>
> -static void pre_recovery_cleanup(uint64_t *out_flags)
> -{
> - if (proc_gen == proc_gen_p9)
> - return pre_recovery_cleanup_p9(out_flags);
> - else
> - return pre_recovery_cleanup_p8(out_flags);
> + return recover;
> }
>
> -static void hmi_print_debug(const uint8_t *msg, uint64_t hmer)
> +static int handle_tfac_errors(uint64_t hmer, struct OpalHMIEvent *hmi_evt,
> + uint64_t *out_flags)
> {
> - const char *loc;
> - uint32_t core_id, thread_index;
> + int recover = 1;
> + uint64_t tfmr = mfspr(SPR_TFMR);
>
> - core_id = pir_to_core_id(this_cpu()->pir);
> - thread_index = cpu_get_thread_index(this_cpu());
> + /* A TFMR parity error makes us ignore all the local stuff */
> + if ((hmer & SPR_HMER_TFMR_PARITY_ERROR) || (tfmr & SPR_TFMR_TFMR_CORRUPT)) {
> + /* Mark TB as invalid for now as we don't trust TFMR, we'll fix
> + * it up later
> + */
> + this_cpu()->tb_invalid = true;
> + goto bad_tfmr;
> + }
>
> - loc = chip_loc_code(this_cpu()->chip_id);
> - if (!loc)
> - loc = "Not Available";
> + this_cpu()->tb_invalid = !(tfmr & SPR_TFMR_TB_VALID);
>
> - if (hmer & (SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR)) {
> - prlog(PR_DEBUG, "[Loc: %s]: P:%d C:%d T:%d: TFMR(%016lx) %s\n",
> - loc, this_cpu()->chip_id, core_id, thread_index,
> - mfspr(SPR_TFMR), msg);
> - } else {
> - prlog(PR_DEBUG, "[Loc: %s]: P:%d C:%d T:%d: %s\n",
> - loc, this_cpu()->chip_id, core_id, thread_index,
> - msg);
> + /* P9 errata: In theory, an HDEC error is sent to all threads. However,
> + * due to an errata on P9 where TFMR bit 26 (HDEC parity) cannot be
> + * cleared on thread 1..3, I am not confident we can do a rendez-vous
> + * in all cases.
> + *
> + * Our current approach is to ignore that error unless no other TFAC
> + * error is present in the TFMR. The error will be re-detected and
> + * re-reported if necessary.
> + */
> + if (proc_gen == proc_gen_p9 && (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)) {
> + if (this_cpu()->tb_invalid || (tfmr & SPR_TFMR_OTHER_ERRORS))
> + tfmr &= ~SPR_TFMR_HDEC_PARITY_ERROR;
> }
> -}
>
> -static int handle_tfac_errors(uint64_t hmer, struct OpalHMIEvent *hmi_evt,
> - uint64_t *out_flags)
> -{
> - int recover = 1;
> - uint64_t tfmr;
> + /* The TB residue error is ignored if TB is valid due to a similar
> + * errata as above
> + */
> + if ((tfmr & SPR_TFMR_TB_RESIDUE_ERR) && !this_cpu()->tb_invalid)
> + tfmr &= ~SPR_TFMR_TB_RESIDUE_ERR;
>
> - pre_recovery_cleanup(out_flags);
> + /* First, handle thread local errors */
> + if (tfmr & SPR_TFMR_THREAD_ERRORS) {
> + recover = handle_thread_tfac_error(tfmr, out_flags);
> + tfmr &= ~SPR_TFMR_THREAD_ERRORS;
> + }
>
> - lock(&hmi_lock);
> - this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
> + bad_tfmr:
>
> - /*
> - * Assert for now for all TOD errors. In future we need to decode
> - * TFMR and take corrective action wherever required.
> + /* Let's see if we still have an all-core error to deal with, if
> + * not, we just bail out
> */
> - if (hmer & SPR_HMER_TFAC_ERROR) {
> - tfmr = mfspr(SPR_TFMR); /* save original TFMR */
> + if (tfmr & SPR_TFMR_CORE_ERRORS) {
> + int recover2;
>
> - hmi_print_debug("Timer Facility Error", hmer);
> -
> - recover = chiptod_recover_tb_errors();
> - if (hmi_evt) {
> - hmi_evt->severity = OpalHMI_SEV_ERROR_SYNC;
> - hmi_evt->type = OpalHMI_ERROR_TFAC;
> - hmi_evt->tfmr = tfmr;
> - queue_hmi_event(hmi_evt, recover, out_flags);
> - }
> + /* Only update "recover" if it's not already 0 (non-recovered)
> + */
> + recover2 = handle_all_core_tfac_error(tfmr, out_flags);
> + if (recover != 0)
> + recover = recover2;
> + } else if (this_cpu()->tb_invalid) {
> + /* This shouldn't happen, TB is invalid and no global error
> + * was reported. We just return for now assuming one will
> + * be. We can't do a rendez vous without a core-global HMI.
> + */
> + // XX log CPU#
> + prlog(PR_ERR, "HMI: TB invalid without core error reported !\n");
Fix up the XX here? Should be easy to print something sensible.
> }
> - if (hmer & SPR_HMER_TFMR_PARITY_ERROR) {
> - tfmr = mfspr(SPR_TFMR); /* save original TFMR */
>
> - hmi_print_debug("TFMR parity Error", hmer);
> - recover = chiptod_recover_tb_errors();
> - if (hmi_evt) {
> - hmi_evt->severity = OpalHMI_SEV_FATAL;
> - hmi_evt->type = OpalHMI_ERROR_TFMR_PARITY;
> - hmi_evt->tfmr = tfmr;
> - queue_hmi_event(hmi_evt, recover, out_flags);
> - }
> + if (hmi_evt) {
> + hmi_evt->severity = OpalHMI_SEV_ERROR_SYNC;
> + hmi_evt->type = OpalHMI_ERROR_TFAC;
> + hmi_evt->tfmr = tfmr;
> + queue_hmi_event(hmi_evt, recover, out_flags);
> }
> - /* Unconditionally unset the thread bit */
> - *(this_cpu()->core_hmi_state_ptr) &= ~(this_cpu()->thread_mask);
>
> /* Set the TB state looking at TFMR register before we head out. */
> this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
> - unlock(&hmi_lock);
> +
> + // XX Warn if !valid
Add something like this:
if(this_cpu()->tb_invalid) prlog(PR_WARNING, "eep");
(I mean, "eep" is probably a bad error message, but you get the point :)
--
Stewart Smith
OPAL Architect, IBM.
More information about the Skiboot
mailing list