[Skiboot] [PATCH v2 06/15] opal/hmi: Rework HMI handling of TFAC errors
Mahesh J Salgaonkar
mahesh at linux.vnet.ibm.com
Tue Apr 17 03:33:31 AEST 2018
From: Benjamin Herrenschmidt <benh at kernel.crashing.org>
This patch reworks the HMI handling for TFAC errors by introducing
4 rendez-vous points improve the thread synchronization while handling
timebase errors that requires all thread to clear dirty data from TB/HDEC
register before clearing the errors.
Signed-off-by: Benjamin Herrenschmidt <benh at kernel.crashing.org>
Signed-off-by: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>
---
core/cpu.c | 2
core/hmi.c | 523 +++++++++++++++++++++++------------------------------
hw/chiptod.c | 118 ++++--------
include/chiptod.h | 6 +
include/cpu.h | 3
5 files changed, 280 insertions(+), 372 deletions(-)
diff --git a/core/cpu.c b/core/cpu.c
index 6826fee0a..b8d31e215 100644
--- a/core/cpu.c
+++ b/core/cpu.c
@@ -1107,7 +1107,6 @@ void init_all_cpus(void)
#endif
t->core_hmi_state = 0;
t->core_hmi_state_ptr = &t->core_hmi_state;
- t->thread_mask = 1;
/* Add associativity properties */
add_core_associativity(t);
@@ -1131,7 +1130,6 @@ void init_all_cpus(void)
t->node = cpu;
t->chip_id = chip_id;
t->core_hmi_state_ptr = &pt->core_hmi_state;
- t->thread_mask = 1 << thread;
}
prlog(PR_INFO, "CPU: %d secondary threads\n", thread);
}
diff --git a/core/hmi.c b/core/hmi.c
index c2b44b9d1..f51512989 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -184,8 +184,12 @@
*/
#define NX_HMI_ACTIVE PPC_BIT(54)
-/* Number of iterations for the various timeouts */
-#define TIMEOUT_LOOPS 20000000
+/*
+ * Number of iterations for the various timeouts. We can't use the timebase
+ * as it might be broken. We measured experimentally that 40 millions loops
+ * of cpu_relax() gives us more than 1s. The margin is comfortable enough.
+ */
+#define TIMEOUT_LOOPS 40000000
/* TFMR other errors. (other than bit 26 and 45) */
#define SPR_TFMR_OTHER_ERRORS \
@@ -195,6 +199,18 @@
SPR_TFMR_DEC_PARITY_ERR | SPR_TFMR_TFMR_CORRUPT | \
SPR_TFMR_CHIP_TOD_INTERRUPT)
+/* TFMR "all core" errors (sent to all threads) */
+#define SPR_TFMR_CORE_ERRORS \
+ (SPR_TFMR_TBST_CORRUPT | SPR_TFMR_TB_MISSING_SYNC | \
+ SPR_TFMR_TB_MISSING_STEP | SPR_TFMR_FW_CONTROL_ERR | \
+ SPR_TFMR_TFMR_CORRUPT | SPR_TFMR_TB_RESIDUE_ERR | \
+ SPR_TFMR_HDEC_PARITY_ERROR | SPR_TFMR_CHIP_TOD_INTERRUPT)
+
+/* TFMR "thread" errors */
+#define SPR_TFMR_THREAD_ERRORS \
+ (SPR_TFMR_PURR_PARITY_ERR | SPR_TFMR_SPURR_PARITY_ERR | \
+ SPR_TFMR_DEC_PARITY_ERR)
+
static const struct core_xstop_bit_info {
uint8_t bit; /* CORE FIR bit number */
enum OpalHMI_CoreXstopReason reason;
@@ -827,360 +843,283 @@ static void decode_malfunction(struct OpalHMIEvent *hmi_evt, uint64_t *out_flags
*out_flags |= flags;
}
-static void wait_for_cleanup_complete(void)
-{
- uint64_t timeout = 0;
-
- smt_lowest();
- while (!(*(this_cpu()->core_hmi_state_ptr) & HMI_STATE_CLEANUP_DONE)) {
- /*
- * We use a fixed number of TIMEOUT_LOOPS rather
- * than using the timebase to do a pseudo-wall time
- * timeout due to the fact that timebase may not actually
- * work at this point in time.
- */
- if (++timeout >= (TIMEOUT_LOOPS*3)) {
- /*
- * Break out the loop here and fall through
- * recovery code. If recovery fails, kernel will get
- * informed about the failure. This way we can avoid
- * looping here if other threads are stuck.
- */
- prlog(PR_DEBUG, "TB pre-recovery timeout\n");
- break;
- }
- barrier();
- }
- smt_medium();
-}
-
/*
- * For successful recovery of TB residue error, remove dirty data
- * from TB/HDEC register in each active partition (subcore). Writing
- * zero's to TB/HDEC will achieve the same.
+ * This will "rendez-vous" all threads on the core to the rendez-vous
+ * id "sig". You need to make sure that "sig" is different from the
+ * previous rendez vous. The sig value must be between 0 and 7 with
+ * boot time being set to 0.
+ *
+ * Note: in theory, we could just use a flip flop "sig" in the thread
+ * structure (binary rendez-vous with no argument). This is a bit more
+ * debuggable and better at handling timeouts (arguably).
+ *
+ * This should be called with the no lock held
*/
-static void timer_facility_do_cleanup(uint64_t tfmr)
+static void hmi_rendez_vous(uint32_t sig)
{
+ struct cpu_thread *t = this_cpu();
+ uint32_t my_id = cpu_get_thread_index(t);
+ uint32_t my_shift = my_id << 2;
+ uint32_t *sptr = t->core_hmi_state_ptr;
+ uint32_t val, prev, shift, i;
+ uint64_t timeout;
+
+ assert(sig <= 0x7);
+
/*
- * Workaround for HW logic bug in Power9. Do not reset the
- * TB register if TB is valid and running.
+ * Mark ourselves as having reached the rendez vous point with
+ * the exit bit cleared
*/
- if ((tfmr & SPR_TFMR_TB_RESIDUE_ERR) && !(tfmr & SPR_TFMR_TB_VALID)) {
+ do {
+ val = prev = *sptr;
+ val &= ~(0xfu << my_shift);
+ val |= sig << my_shift;
+ } while (cmpxchg32(sptr, prev, val) != prev);
- /* Reset the TB register to clear the dirty data. */
- mtspr(SPR_TBWU, 0);
- mtspr(SPR_TBWL, 0);
+ /*
+ * Wait for everybody else to reach that point, ignore the
+ * exit bit as another thread could have already set it.
+ */
+ for (i = 0; i < cpu_thread_count; i++) {
+ shift = i << 2;
+
+ timeout = TIMEOUT_LOOPS;
+ while (((*sptr >> shift) & 0x7) != sig && --timeout)
+ cpu_relax();
+ if (!timeout)
+ prlog(PR_ERR, "Rendez-vous stage 1 timeout, CPU 0x%x"
+ " waiting for thread %d\n", t->pir, i);
}
- if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR) {
- /* Reset HDEC register */
- mtspr(SPR_HDEC, 0);
+ /* Set the exit bit */
+ do {
+ val = prev = *sptr;
+ val &= ~(0xfu << my_shift);
+ val |= (sig | 8) << my_shift;
+ } while (cmpxchg32(sptr, prev, val) != prev);
+
+ /* At this point, we need to wait for everybody else to have a value
+ * that is *not* sig. IE. they either have set the exit bit *or* they
+ * have changed the rendez-vous (meaning they have moved on to another
+ * rendez vous point).
+ */
+ for (i = 0; i < cpu_thread_count; i++) {
+ shift = i << 2;
+
+ timeout = TIMEOUT_LOOPS;
+ while (((*sptr >> shift) & 0xf) == sig && --timeout)
+ cpu_relax();
+ if (!timeout)
+ prlog(PR_ERR, "Rendez-vous stage 2 timeout, CPU 0x%x"
+ " waiting for thread %d\n", t->pir, i);
}
}
-static int get_split_core_mode(void)
+static void hmi_print_debug(const uint8_t *msg, uint64_t hmer)
{
- uint64_t hid0;
+ const char *loc;
+ uint32_t core_id, thread_index;
- hid0 = mfspr(SPR_HID0);
- if (hid0 & SPR_HID0_POWER8_2LPARMODE)
- return 2;
- else if (hid0 & SPR_HID0_POWER8_4LPARMODE)
- return 4;
+ core_id = pir_to_core_id(this_cpu()->pir);
+ thread_index = cpu_get_thread_index(this_cpu());
- return 1;
-}
+ loc = chip_loc_code(this_cpu()->chip_id);
+ if (!loc)
+ loc = "Not Available";
+ if (hmer & (SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR)) {
+ prlog(PR_DEBUG, "[Loc: %s]: P:%d C:%d T:%d: TFMR(%016lx) %s\n",
+ loc, this_cpu()->chip_id, core_id, thread_index,
+ mfspr(SPR_TFMR), msg);
+ } else {
+ prlog(PR_DEBUG, "[Loc: %s]: P:%d C:%d T:%d: %s\n",
+ loc, this_cpu()->chip_id, core_id, thread_index,
+ msg);
+ }
+}
-/*
- * Certain TB/HDEC errors leaves dirty data in timebase and hdec register
- * which need to cleared before we initiate clear_tb_errors through TFMR[24].
- * The cleanup has to be done by once by any one thread from core or subcore.
- *
- * In split core mode, it is required to clear the dirty data from TB/HDEC
- * register by all subcores (active partitions) before we clear tb errors
- * through TFMR[24]. The HMI recovery would fail even if one subcore do
- * not cleanup the respective TB/HDEC register.
- *
- * For un-split core, any one thread can do the cleanup.
- * For split core, any one thread from each subcore can do the cleanup.
- *
- * Errors that required pre-recovery cleanup:
- * - SPR_TFMR_TB_RESIDUE_ERR
- * - SPR_TFMR_HDEC_PARITY_ERROR
- */
-static void pre_recovery_cleanup_p8(uint64_t *out_flags)
+static int handle_thread_tfac_error(uint64_t tfmr, uint64_t *out_flags)
{
- uint64_t tfmr;
- uint32_t sibling_thread_mask;
- int split_core_mode, subcore_id, thread_id, threads_per_core;
- int i;
+ int recover = 1;
- /*
- * Exit if it is not the error that leaves dirty data in timebase
- * or HDEC register. OR this may be the thread which came in very
- * late and recovery is been already done.
- *
- * TFMR is per [sub]core register. If any one thread on the [sub]core
- * does the recovery it reflects in TFMR register and applicable to
- * all threads in that [sub]core. Hence take a lock before checking
- * TFMR errors. Once a thread from a [sub]core completes the
- * recovery, all other threads on that [sub]core will return from
- * here.
- *
- * If TFMR does not show error that we are looking for, return
- * from here. We would just fall through recovery code which would
- * check for other errors on TFMR and fix them.
- */
- lock(&hmi_lock);
- tfmr = mfspr(SPR_TFMR);
- if (!(tfmr & SPR_TFMR_TB_VALID))
- *out_flags |= OPAL_HMI_FLAGS_TB_RESYNC;
if (tfmr & SPR_TFMR_DEC_PARITY_ERR)
*out_flags |= OPAL_HMI_FLAGS_DEC_LOST;
- if (!(tfmr & (SPR_TFMR_TB_RESIDUE_ERR | SPR_TFMR_HDEC_PARITY_ERROR))) {
- unlock(&hmi_lock);
- return;
- }
+ if (!tfmr_recover_local_errors(tfmr))
+ recover = 0;
+ tfmr &= ~(SPR_TFMR_PURR_PARITY_ERR |
+ SPR_TFMR_SPURR_PARITY_ERR |
+ SPR_TFMR_DEC_PARITY_ERR);
+ return recover;
+}
- /* Tell OS about a possible loss of HDEC */
- if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)
- *out_flags |= OPAL_HMI_FLAGS_HDEC_LOST;
+static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
+{
+ struct cpu_thread *t, *t0;
+ int recover = 1;
- /* Gather split core information. */
- split_core_mode = get_split_core_mode();
- threads_per_core = cpu_thread_count / split_core_mode;
+ t = this_cpu();
+ t0 = find_cpu_by_pir(cpu_get_thread0(t));
- /* Prepare core/subcore sibling mask */
- thread_id = cpu_get_thread_index(this_cpu());
- subcore_id = thread_id / threads_per_core;
- sibling_thread_mask = SUBCORE_THREAD_MASK(subcore_id, threads_per_core);
+ /* Rendez vous all threads */
+ hmi_rendez_vous(1);
- /*
- * First thread on the core ?
- * if yes, setup the hmi cleanup state to !DONE
+ /* We use a lock here as some of the TFMR bits are shared and I
+ * prefer avoiding doing the cleanup simultaneously.
*/
- if ((*(this_cpu()->core_hmi_state_ptr) & CORE_THREAD_MASK) == 0)
- *(this_cpu()->core_hmi_state_ptr) &= ~HMI_STATE_CLEANUP_DONE;
+ lock(&hmi_lock);
- /*
- * First thread on subcore ?
- * if yes, do cleanup.
- *
- * Clear TB and wait for other threads (one from each subcore) to
- * finish its cleanup work.
+ /* First handle corrupt TFMR otherwise we can't trust anything.
+ * We'll use a lock here so that the threads don't try to do it at
+ * the same time
*/
+ if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+ /* Check if it's still in error state */
+ if (mfspr(SPR_TFMR) & SPR_TFMR_TFMR_CORRUPT)
+ if (!recover_corrupt_tfmr())
+ recover = 0;
- if ((*(this_cpu()->core_hmi_state_ptr) & sibling_thread_mask) == 0)
- timer_facility_do_cleanup(tfmr);
+ if (!recover)
+ goto error_out;
- /*
- * Mark this thread bit. This bit will stay on until this thread
- * exit from handle_hmi_exception().
- */
- *(this_cpu()->core_hmi_state_ptr) |= this_cpu()->thread_mask;
+ tfmr = mfspr(SPR_TFMR);
- /*
- * Check if each subcore has completed the cleanup work.
- * if yes, then notify all the threads that we are done with cleanup.
- */
- for (i = 0; i < split_core_mode; i++) {
- uint32_t subcore_thread_mask =
- SUBCORE_THREAD_MASK(i, threads_per_core);
- if (!(*(this_cpu()->core_hmi_state_ptr) & subcore_thread_mask))
- break;
+ /* We could have got new thread errors in the meantime */
+ if (tfmr & SPR_TFMR_THREAD_ERRORS) {
+ recover = handle_thread_tfac_error(tfmr, out_flags);
+ tfmr &= ~SPR_TFMR_THREAD_ERRORS;
+ }
+ if (!recover)
+ goto error_out;
}
- if (i == split_core_mode)
- *(this_cpu()->core_hmi_state_ptr) |= HMI_STATE_CLEANUP_DONE;
-
- unlock(&hmi_lock);
+ /* Tell the OS ... */
+ if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)
+ *out_flags |= OPAL_HMI_FLAGS_HDEC_LOST;
- /* Wait for other subcore to complete the cleanup. */
- wait_for_cleanup_complete();
-}
+ /* Cleanup bad HDEC or TB on all threads or subcures before we clear
+ * the error conditions
+ */
+ tfmr_cleanup_core_errors(tfmr);
-/*
- * Certain TB/HDEC errors leaves dirty data in timebase and hdec register
- * which need to cleared before we initiate clear_tb_errors through TFMR[24].
- * The cleanup has to be done by all the threads from core in p9.
- *
- * On TB/HDEC errors, all 4 threads on the affected receives HMI. On power9,
- * every thread on the core has its own copy of TB and hence every thread
- * has to clear the dirty data from its own TB register before we clear tb
- * errors through TFMR[24]. The HMI recovery would fail even if one thread
- * do not cleanup the respective TB/HDEC register.
- *
- * There is no split core mode in power9.
- *
- * Errors that required pre-recovery cleanup:
- * - SPR_TFMR_TB_RESIDUE_ERR
- * - SPR_TFMR_HDEC_PARITY_ERROR
- */
-static void pre_recovery_cleanup_p9(uint64_t *out_flags)
-{
- uint64_t tfmr;
- int threads_per_core = cpu_thread_count;
- int i;
+ /* Unlock before next rendez-vous */
+ unlock(&hmi_lock);
- /*
- * Exit if it is not the error that leaves dirty data in timebase
- * or HDEC register. OR this may be the thread which came in very
- * late and recovery is been already done.
- *
- * TFMR is per core register. Ideally if any one thread on the core
- * does the recovery it should reflect in TFMR register and
- * applicable to all threads in that core. Hence take a lock before
- * checking TFMR errors. Once a thread from a core completes the
- * recovery, all other threads on that core will return from
- * here.
- *
- * If TFMR does not show error that we are looking for, return
- * from here. We would just fall through recovery code which would
- * check for other errors on TFMR and fix them.
+ /* Second rendez vous, ensure the above cleanups are all done before
+ * we proceed further
*/
- lock(&hmi_lock);
- tfmr = mfspr(SPR_TFMR);
- if (!(tfmr & SPR_TFMR_TB_VALID))
- *out_flags |= OPAL_HMI_FLAGS_TB_RESYNC;
- if (tfmr & SPR_TFMR_DEC_PARITY_ERR)
- *out_flags |= OPAL_HMI_FLAGS_DEC_LOST;
- if (!(tfmr & (SPR_TFMR_TB_RESIDUE_ERR | SPR_TFMR_HDEC_PARITY_ERROR))) {
- unlock(&hmi_lock);
- return;
- }
+ hmi_rendez_vous(2);
- /*
- * Due to a HW logic bug in p9, TFMR bit 26 and 45 always set
- * once TB residue or HDEC errors occurs at first time. Hence for HMI
- * on subsequent TB errors add additional check as workaround to
- * identify validity of the errors and decide whether pre-recovery
- * is required or not. Exit pre-recovery if there are other TB
- * errors also present on TFMR.
- */
- if (tfmr & SPR_TFMR_OTHER_ERRORS) {
- unlock(&hmi_lock);
- return;
+ /* We can now clear the error conditions in the core. */
+ if (!tfmr_clear_core_errors(tfmr)) {
+ recover = 0;
+ goto error_out;
}
- /*
- * First thread on the core ?
- * if yes, setup the hmi cleanup state to !DONE
+ /* Third rendez-vous. We could in theory do the timebase resync as
+ * part of the previous one, but I prefer having all the error
+ * conditions cleared before we start trying.
*/
- if ((*(this_cpu()->core_hmi_state_ptr) & CORE_THREAD_MASK) == 0)
- *(this_cpu()->core_hmi_state_ptr) &= ~HMI_STATE_CLEANUP_DONE;
+ hmi_rendez_vous(3);
- /* Tell OS about a possible loss of HDEC */
- if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)
- *out_flags |= OPAL_HMI_FLAGS_HDEC_LOST;
+ /* Now perform the actual TB recovery on thread 0 */
+ if (t == t0)
+ recover = chiptod_recover_tb_errors(tfmr,
+ &this_cpu()->tb_resynced);
- /*
- * Clear TB and wait for other threads to finish its cleanup work.
- */
- timer_facility_do_cleanup(tfmr);
-
- /*
- * Mark this thread bit. This bit will stay on until this thread
- * exit from handle_hmi_exception().
- */
- *(this_cpu()->core_hmi_state_ptr) |= this_cpu()->thread_mask;
+error_out:
+ /* Last rendez-vous */
+ hmi_rendez_vous(4);
- /*
- * Check if each thread has completed the cleanup work.
- * if yes, then notify all the threads that we are done with cleanup.
+ /* Now all threads have gone past rendez-vous 3 and not yet past another
+ * rendez-vous 1, so the value of tb_resynced of thread 0 of the core
+ * contains an accurate indication as to whether the timebase was lost.
*/
- for (i = 0; i < threads_per_core; i++) {
- uint32_t thread_mask = SINGLE_THREAD_MASK(i);
- if (!(*(this_cpu()->core_hmi_state_ptr) & thread_mask))
- break;
- }
-
- if (i == threads_per_core)
- *(this_cpu()->core_hmi_state_ptr) |= HMI_STATE_CLEANUP_DONE;
-
- unlock(&hmi_lock);
-
- /* Wait for other threads to complete the cleanup. */
- wait_for_cleanup_complete();
-}
+ if (t0->tb_resynced)
+ *out_flags |= OPAL_HMI_FLAGS_TB_RESYNC;
-static void pre_recovery_cleanup(uint64_t *out_flags)
-{
- if (proc_gen == proc_gen_p9)
- return pre_recovery_cleanup_p9(out_flags);
- else
- return pre_recovery_cleanup_p8(out_flags);
+ return recover;
}
-static void hmi_print_debug(const uint8_t *msg, uint64_t hmer)
+static int handle_tfac_errors(uint64_t hmer, struct OpalHMIEvent *hmi_evt,
+ uint64_t *out_flags)
{
- const char *loc;
- uint32_t core_id, thread_index;
+ int recover = 1;
+ uint64_t tfmr = mfspr(SPR_TFMR);
- core_id = pir_to_core_id(this_cpu()->pir);
- thread_index = cpu_get_thread_index(this_cpu());
+ /* A TFMR parity error makes us ignore all the local stuff */
+ if ((hmer & SPR_HMER_TFMR_PARITY_ERROR) || (tfmr & SPR_TFMR_TFMR_CORRUPT)) {
+ /* Mark TB as invalid for now as we don't trust TFMR, we'll fix
+ * it up later
+ */
+ this_cpu()->tb_invalid = true;
+ goto bad_tfmr;
+ }
- loc = chip_loc_code(this_cpu()->chip_id);
- if (!loc)
- loc = "Not Available";
+ this_cpu()->tb_invalid = !(tfmr & SPR_TFMR_TB_VALID);
- if (hmer & (SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR)) {
- prlog(PR_DEBUG, "[Loc: %s]: P:%d C:%d T:%d: TFMR(%016lx) %s\n",
- loc, this_cpu()->chip_id, core_id, thread_index,
- mfspr(SPR_TFMR), msg);
- } else {
- prlog(PR_DEBUG, "[Loc: %s]: P:%d C:%d T:%d: %s\n",
- loc, this_cpu()->chip_id, core_id, thread_index,
- msg);
+ /* P9 errata: In theory, an HDEC error is sent to all threads. However,
+ * due to an errata on P9 where TFMR bit 26 (HDEC parity) cannot be
+ * cleared on thread 1..3, I am not confident we can do a rendez-vous
+ * in all cases.
+ *
+ * Our current approach is to ignore that error unless no other TFAC
+ * error is present in the TFMR. The error will be re-detected and
+ * re-reported if necessary.
+ */
+ if (proc_gen == proc_gen_p9 && (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)) {
+ if (this_cpu()->tb_invalid || (tfmr & SPR_TFMR_OTHER_ERRORS))
+ tfmr &= ~SPR_TFMR_HDEC_PARITY_ERROR;
}
-}
-static int handle_tfac_errors(uint64_t hmer, struct OpalHMIEvent *hmi_evt,
- uint64_t *out_flags)
-{
- int recover = 1;
- uint64_t tfmr;
+ /* The TB residue error is ignored if TB is valid due to a similar
+ * errata as above
+ */
+ if ((tfmr & SPR_TFMR_TB_RESIDUE_ERR) && !this_cpu()->tb_invalid)
+ tfmr &= ~SPR_TFMR_TB_RESIDUE_ERR;
- pre_recovery_cleanup(out_flags);
+ /* First, handle thread local errors */
+ if (tfmr & SPR_TFMR_THREAD_ERRORS) {
+ recover = handle_thread_tfac_error(tfmr, out_flags);
+ tfmr &= ~SPR_TFMR_THREAD_ERRORS;
+ }
- lock(&hmi_lock);
- this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
+ bad_tfmr:
- /*
- * Assert for now for all TOD errors. In future we need to decode
- * TFMR and take corrective action wherever required.
+ /* Let's see if we still have a all-core error to deal with, if
+ * not, we just bail out
*/
- if (hmer & SPR_HMER_TFAC_ERROR) {
- tfmr = mfspr(SPR_TFMR); /* save original TFMR */
+ if (tfmr & SPR_TFMR_CORE_ERRORS) {
+ int recover2;
- hmi_print_debug("Timer Facility Error", hmer);
-
- recover = chiptod_recover_tb_errors();
- if (hmi_evt) {
- hmi_evt->severity = OpalHMI_SEV_ERROR_SYNC;
- hmi_evt->type = OpalHMI_ERROR_TFAC;
- hmi_evt->tfmr = tfmr;
- queue_hmi_event(hmi_evt, recover, out_flags);
- }
+ /* Only update "recover" if it's not already 0 (non-recovered)
+ */
+ recover2 = handle_all_core_tfac_error(tfmr, out_flags);
+ if (recover != 0)
+ recover = recover2;
+ } else if (this_cpu()->tb_invalid) {
+ /* This shouldn't happen, TB is invalid and no global error
+ * was reported. We just return for now assuming one will
+ * be. We can't do a rendez vous without a core-global HMI.
+ */
+ prlog(PR_ERR, "HMI: TB invalid without core error reported ! "
+ "CPU=%x, TFMR=0x%016lx\n", this_cpu()->pir,
+ mfspr(SPR_TFMR));
}
- if (hmer & SPR_HMER_TFMR_PARITY_ERROR) {
- tfmr = mfspr(SPR_TFMR); /* save original TFMR */
- hmi_print_debug("TFMR parity Error", hmer);
- recover = chiptod_recover_tb_errors();
- if (hmi_evt) {
- hmi_evt->severity = OpalHMI_SEV_FATAL;
- hmi_evt->type = OpalHMI_ERROR_TFMR_PARITY;
- hmi_evt->tfmr = tfmr;
- queue_hmi_event(hmi_evt, recover, out_flags);
- }
+ if (hmi_evt) {
+ hmi_evt->severity = OpalHMI_SEV_ERROR_SYNC;
+ hmi_evt->type = OpalHMI_ERROR_TFAC;
+ hmi_evt->tfmr = tfmr;
+ queue_hmi_event(hmi_evt, recover, out_flags);
}
- /* Unconditionally unset the thread bit */
- *(this_cpu()->core_hmi_state_ptr) &= ~(this_cpu()->thread_mask);
/* Set the TB state looking at TFMR register before we head out. */
this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
- unlock(&hmi_lock);
+
+ if (this_cpu()->tb_invalid)
+ prlog(PR_WARNING, "Failed to get TB in running state! "
+ "CPU=%x, TFMR=%016lx\n", this_cpu()->pir,
+ mfspr(SPR_TFMR));
return recover;
}
diff --git a/hw/chiptod.c b/hw/chiptod.c
index cacc27340..a160e5a10 100644
--- a/hw/chiptod.c
+++ b/hw/chiptod.c
@@ -1370,17 +1370,10 @@ static bool tfmr_recover_tb_errors(uint64_t tfmr)
return true;
}
-static bool tfmr_recover_non_tb_errors(uint64_t tfmr)
+bool tfmr_recover_local_errors(uint64_t tfmr)
{
uint64_t tfmr_reset_errors = 0;
- /*
- * write 1 to bit 26 to clear TFMR HDEC parity error.
- * HDEC register has already been reset to zero as part pre-recovery.
- */
- if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)
- tfmr_reset_errors |= SPR_TFMR_HDEC_PARITY_ERROR;
-
if (tfmr & SPR_TFMR_DEC_PARITY_ERR) {
/* Set DEC with all ones */
mtspr(SPR_DEC, ~0);
@@ -1390,11 +1383,11 @@ static bool tfmr_recover_non_tb_errors(uint64_t tfmr)
}
/*
- * Reset PURR/SPURR to recover. We also need help from KVM
- * layer to handle this change in PURR/SPURR. That needs
- * to be handled in kernel KVM layer. For now, to recover just
- * reset it.
- */
+ * Reset PURR/SPURR to recover. We also need help from KVM
+ * layer to handle this change in PURR/SPURR. That needs
+ * to be handled in kernel KVM layer. For now, to recover just
+ * reset it.
+ */
if (tfmr & SPR_TFMR_PURR_PARITY_ERR) {
/* set PURR register with sane value or reset it. */
mtspr(SPR_PURR, 0);
@@ -1432,7 +1425,7 @@ static bool tfmr_recover_non_tb_errors(uint64_t tfmr)
* MT(TFMR) bits 11 and 60 are b’1’
* MT(HMER) all bits 1 except for bits 4,5
*/
-static bool chiptod_recover_tfmr_error(void)
+bool recover_corrupt_tfmr(void)
{
uint64_t tfmr;
@@ -1468,6 +1461,37 @@ static bool chiptod_recover_tfmr_error(void)
return true;
}
+void tfmr_cleanup_core_errors(uint64_t tfmr)
+{
+ /* If HDEC is bad, clean it on all threads before we clear the
+ * error condition.
+ */
+ if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)
+ mtspr(SPR_HDEC, 0);
+
+ /* If TB is invalid, clean it on all threads as well, it will be
+ * restored after the next rendez-vous
+ */
+ if (!(tfmr & SPR_TFMR_TB_VALID)) {
+ mtspr(SPR_TBWU, 0);
+ mtspr(SPR_TBWU, 0);
+ }
+}
+
+bool tfmr_clear_core_errors(uint64_t tfmr)
+{
+ uint64_t tfmr_reset_errors = 0;
+
+ if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)
+ tfmr_reset_errors |= SPR_TFMR_HDEC_PARITY_ERROR;
+
+ /* Write TFMR twice to clear the error */
+ mtspr(SPR_TFMR, base_tfmr | tfmr_reset_errors);
+ mtspr(SPR_TFMR, base_tfmr | tfmr_reset_errors);
+
+ return true;
+}
+
/*
* Recover from TB and TOD errors.
* Timebase register is per core and first thread that gets chance to
@@ -1481,46 +1505,17 @@ static bool chiptod_recover_tfmr_error(void)
* 1 <= Successfully recovered from errors
* -1 <= No errors found. Errors are already been fixed.
*/
-int chiptod_recover_tb_errors(void)
+int chiptod_recover_tb_errors(uint64_t tfmr, bool *out_resynced)
{
- uint64_t tfmr;
int rc = -1;
- int thread_id;
+
+ *out_resynced = false;
if (chiptod_primary < 0)
return 0;
lock(&chiptod_lock);
- /* Get fresh copy of TFMR */
- tfmr = mfspr(SPR_TFMR);
-
- /*
- * Check for TFMR parity error and recover from it.
- * We can not trust any other bits in TFMR If it is corrupt. Fix this
- * before we do anything.
- */
- if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
- if (!chiptod_recover_tfmr_error()) {
- rc = 0;
- goto error_out;
- }
- }
-
- /* Get fresh copy of TFMR */
- tfmr = mfspr(SPR_TFMR);
-
- /*
- * Workaround for HW logic bug in Power9
- * Even after clearing TB residue error by one thread it does not
- * get reflected to other threads on same core.
- * Check if TB is already valid and skip the checking of TB errors.
- */
-
- if ((proc_gen == proc_gen_p9) && (tfmr & SPR_TFMR_TB_RESIDUE_ERR)
- && (tfmr & SPR_TFMR_TB_VALID))
- goto skip_tb_error_clear;
-
/*
* Check for TB errors.
* On Sync check error, bit 44 of TFMR is set. Check for it and
@@ -1544,7 +1539,6 @@ int chiptod_recover_tb_errors(void)
}
}
-skip_tb_error_clear:
/*
* Check for TOD sync check error.
* On TOD errors, bit 51 of TFMR is set. If this bit is on then we
@@ -1574,35 +1568,9 @@ skip_tb_error_clear:
if (!chiptod_to_tb())
goto error_out;
- /* We have successfully able to get TB running. */
- rc = 1;
- }
+ *out_resynced = true;
- /*
- * Workaround for HW logic bug in power9.
- * In idea case (without the HW bug) only one thread from the core
- * would have fallen through tfmr_recover_non_tb_errors() to clear
- * HDEC parity error on TFMR.
- *
- * Hence to achieve same behavior, allow only thread 0 to clear the
- * HDEC parity error. And for rest of the threads just reset the bit
- * to avoid other threads to fall through tfmr_recover_non_tb_errors().
- */
- thread_id = cpu_get_thread_index(this_cpu());
- if ((proc_gen == proc_gen_p9) && thread_id)
- tfmr &= ~SPR_TFMR_HDEC_PARITY_ERROR;
-
- /*
- * Now that TB is running, check for TFMR non-TB errors.
- */
- if ((tfmr & SPR_TFMR_HDEC_PARITY_ERROR) ||
- (tfmr & SPR_TFMR_PURR_PARITY_ERR) ||
- (tfmr & SPR_TFMR_SPURR_PARITY_ERR) ||
- (tfmr & SPR_TFMR_DEC_PARITY_ERR)) {
- if (!tfmr_recover_non_tb_errors(tfmr)) {
- rc = 0;
- goto error_out;
- }
+ /* We have successfully able to get TB running. */
rc = 1;
}
diff --git a/include/chiptod.h b/include/chiptod.h
index fd5cd9644..7708d4899 100644
--- a/include/chiptod.h
+++ b/include/chiptod.h
@@ -29,7 +29,11 @@ enum chiptod_topology {
extern void chiptod_init(void);
extern bool chiptod_wakeup_resync(void);
-extern int chiptod_recover_tb_errors(void);
+extern int chiptod_recover_tb_errors(uint64_t tfmr, bool *out_resynced);
+extern bool tfmr_recover_local_errors(uint64_t tfmr);
+extern bool recover_corrupt_tfmr(void);
+extern void tfmr_cleanup_core_errors(uint64_t tfmr);
+extern bool tfmr_clear_core_errors(uint64_t tfmr);
extern void chiptod_reset_tb(void);
extern bool chiptod_adjust_topology(enum chiptod_topology topo, bool enable);
extern bool chiptod_capp_timebase_sync(unsigned int chip_id, uint32_t tfmr_addr,
diff --git a/include/cpu.h b/include/cpu.h
index b7cd588d5..68f246396 100644
--- a/include/cpu.h
+++ b/include/cpu.h
@@ -97,9 +97,8 @@ struct cpu_thread {
*/
uint32_t core_hmi_state; /* primary only */
uint32_t *core_hmi_state_ptr;
- /* Mask to indicate thread id in core. */
- uint8_t thread_mask;
bool tb_invalid;
+ bool tb_resynced;
/* For use by XICS emulation on XIVE */
struct xive_cpu_state *xstate;
More information about the Skiboot
mailing list