[Skiboot] [PATCH v2 06/15] opal/hmi: Rework HMI handling of TFAC errors

Tue Apr 17 03:33:31 AEST 2018

From: Benjamin Herrenschmidt <benh at kernel.crashing.org>

This patch reworks the HMI handling for TFAC errors by introducing
4 rendez-vous points improve the thread synchronization while handling
timebase errors that requires all thread to clear dirty data from TB/HDEC
register before clearing the errors.

Signed-off-by: Benjamin Herrenschmidt <benh at kernel.crashing.org>
Signed-off-by: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>
---
 core/cpu.c        |    2 
 core/hmi.c        |  523 +++++++++++++++++++++++------------------------------
 hw/chiptod.c      |  118 ++++--------
 include/chiptod.h |    6 +
 include/cpu.h     |    3 
 5 files changed, 280 insertions(+), 372 deletions(-)

diff --git a/core/cpu.c b/core/cpu.c
index 6826fee0a..b8d31e215 100644
--- a/core/cpu.c
+++ b/core/cpu.c
@@ -1107,7 +1107,6 @@ void init_all_cpus(void)
 #endif
 		t->core_hmi_state = 0;
 		t->core_hmi_state_ptr = &t->core_hmi_state;
-		t->thread_mask = 1;
 
 		/* Add associativity properties */
 		add_core_associativity(t);
@@ -1131,7 +1130,6 @@ void init_all_cpus(void)
 			t->node = cpu;
 			t->chip_id = chip_id;
 			t->core_hmi_state_ptr = &pt->core_hmi_state;
-			t->thread_mask = 1 << thread;
 		}
 		prlog(PR_INFO, "CPU:  %d secondary threads\n", thread);
 	}
diff --git a/core/hmi.c b/core/hmi.c
index c2b44b9d1..f51512989 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -184,8 +184,12 @@
  */
 #define NX_HMI_ACTIVE		PPC_BIT(54)
 
-/* Number of iterations for the various timeouts */
-#define TIMEOUT_LOOPS		20000000
+/*
+ * Number of iterations for the various timeouts. We can't use the timebase
+ * as it might be broken. We measured experimentally that 40 millions loops
+ * of cpu_relax() gives us more than 1s. The margin is comfortable enough.
+ */
+#define TIMEOUT_LOOPS		40000000
 
 /* TFMR other errors. (other than bit 26 and 45) */
 #define SPR_TFMR_OTHER_ERRORS	\
@@ -195,6 +199,18 @@
 	 SPR_TFMR_DEC_PARITY_ERR | SPR_TFMR_TFMR_CORRUPT |	\
 	 SPR_TFMR_CHIP_TOD_INTERRUPT)
 
+/* TFMR "all core" errors (sent to all threads) */
+#define SPR_TFMR_CORE_ERRORS	\
+	(SPR_TFMR_TBST_CORRUPT | SPR_TFMR_TB_MISSING_SYNC |	\
+	 SPR_TFMR_TB_MISSING_STEP | SPR_TFMR_FW_CONTROL_ERR |	\
+	 SPR_TFMR_TFMR_CORRUPT | SPR_TFMR_TB_RESIDUE_ERR |	\
+	 SPR_TFMR_HDEC_PARITY_ERROR | SPR_TFMR_CHIP_TOD_INTERRUPT)
+
+/* TFMR "thread" errors  */
+#define SPR_TFMR_THREAD_ERRORS \
+	(SPR_TFMR_PURR_PARITY_ERR | SPR_TFMR_SPURR_PARITY_ERR |	\
+	 SPR_TFMR_DEC_PARITY_ERR)
+
 static const struct core_xstop_bit_info {
 	uint8_t bit;		/* CORE FIR bit number */
 	enum OpalHMI_CoreXstopReason reason;
@@ -827,360 +843,283 @@ static void decode_malfunction(struct OpalHMIEvent *hmi_evt, uint64_t *out_flags
 	*out_flags |= flags;
 }
 
-static void wait_for_cleanup_complete(void)
-{
-	uint64_t timeout = 0;
-
-	smt_lowest();
-	while (!(*(this_cpu()->core_hmi_state_ptr) & HMI_STATE_CLEANUP_DONE)) {
-		/*
-		 * We use a fixed number of TIMEOUT_LOOPS rather
-		 * than using the timebase to do a pseudo-wall time
-		 * timeout due to the fact that timebase may not actually
-		 * work at this point in time.
-		 */
-		if (++timeout >= (TIMEOUT_LOOPS*3)) {
-			/*
-			 * Break out the loop here and fall through
-			 * recovery code. If recovery fails, kernel will get
-			 * informed about the failure. This way we can avoid
-			 * looping here if other threads are stuck.
-			 */
-			prlog(PR_DEBUG, "TB pre-recovery timeout\n");
-			break;
-		}
-		barrier();
-	}
-	smt_medium();
-}
-
 /*
- * For successful recovery of TB residue error, remove dirty data
- * from TB/HDEC register in each active partition (subcore). Writing
- * zero's to TB/HDEC will achieve the same.
+ * This will "rendez-vous" all threads on the core to the rendez-vous
+ * id "sig". You need to make sure that "sig" is different from the
+ * previous rendez vous. The sig value must be between 0 and 7 with
+ * boot time being set to 0.
+ *
+ * Note: in theory, we could just use a flip flop "sig" in the thread
+ * structure (binary rendez-vous with no argument). This is a bit more
+ * debuggable and better at handling timeouts (arguably).
+ *
+ * This should be called with the no lock held
  */
-static void timer_facility_do_cleanup(uint64_t tfmr)
+static void hmi_rendez_vous(uint32_t sig)
 {
+	struct cpu_thread *t = this_cpu();
+	uint32_t my_id = cpu_get_thread_index(t);
+	uint32_t my_shift = my_id << 2;
+	uint32_t *sptr = t->core_hmi_state_ptr;
+	uint32_t val, prev, shift, i;
+	uint64_t timeout;
+
+	assert(sig <= 0x7);
+
 	/*
-	 * Workaround for HW logic bug in Power9. Do not reset the
-	 * TB register if TB is valid and running.
+	 * Mark ourselves as having reached the rendez vous point with
+	 * the exit bit cleared
 	 */
-	if ((tfmr & SPR_TFMR_TB_RESIDUE_ERR) && !(tfmr & SPR_TFMR_TB_VALID)) {
+	do {
+		val = prev = *sptr;
+		val &= ~(0xfu << my_shift);
+		val |= sig << my_shift;
+	} while (cmpxchg32(sptr, prev, val) != prev);
 
-		/* Reset the TB register to clear the dirty data. */
-		mtspr(SPR_TBWU, 0);
-		mtspr(SPR_TBWL, 0);
+	/*
+	 * Wait for everybody else to reach that point, ignore the
+	 * exit bit as another thread could have already set it.
+	 */
+	for (i = 0; i < cpu_thread_count; i++) {
+		shift = i << 2;
+
+		timeout = TIMEOUT_LOOPS;
+		while (((*sptr >> shift) & 0x7) != sig && --timeout)
+			cpu_relax();
+		if (!timeout)
+			prlog(PR_ERR, "Rendez-vous stage 1 timeout, CPU 0x%x"
+			      " waiting for thread %d\n", t->pir, i);
 	}
 
-	if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR) {
-		/* Reset HDEC register */
-		mtspr(SPR_HDEC, 0);
+	/* Set the exit bit */
+	do {
+		val = prev = *sptr;
+		val &= ~(0xfu << my_shift);
+		val |= (sig | 8) << my_shift;
+	} while (cmpxchg32(sptr, prev, val) != prev);
+
+	/* At this point, we need to wait for everybody else to have a value
+	 * that is *not* sig. IE. they either have set the exit bit *or* they
+	 * have changed the rendez-vous (meaning they have moved on to another
+	 * rendez vous point).
+	 */
+	for (i = 0; i < cpu_thread_count; i++) {
+		shift = i << 2;
+
+		timeout = TIMEOUT_LOOPS;
+		while (((*sptr >> shift) & 0xf) == sig && --timeout)
+			cpu_relax();
+		if (!timeout)
+			prlog(PR_ERR, "Rendez-vous stage 2 timeout, CPU 0x%x"
+			      " waiting for thread %d\n", t->pir, i);
 	}
 }
 
-static int get_split_core_mode(void)
+static void hmi_print_debug(const uint8_t *msg, uint64_t hmer)
 {
-	uint64_t hid0;
+	const char *loc;
+	uint32_t core_id, thread_index;
 
-	hid0 = mfspr(SPR_HID0);
-	if (hid0 & SPR_HID0_POWER8_2LPARMODE)
-		return 2;
-	else if (hid0 & SPR_HID0_POWER8_4LPARMODE)
-		return 4;
+	core_id = pir_to_core_id(this_cpu()->pir);
+	thread_index = cpu_get_thread_index(this_cpu());
 
-	return 1;
-}
+	loc = chip_loc_code(this_cpu()->chip_id);
+	if (!loc)
+		loc = "Not Available";
 
+	if (hmer & (SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR)) {
+		prlog(PR_DEBUG, "[Loc: %s]: P:%d C:%d T:%d: TFMR(%016lx) %s\n",
+			loc, this_cpu()->chip_id, core_id, thread_index,
+			mfspr(SPR_TFMR), msg);
+	} else {
+		prlog(PR_DEBUG, "[Loc: %s]: P:%d C:%d T:%d: %s\n",
+			loc, this_cpu()->chip_id, core_id, thread_index,
+			msg);
+	}
+}
 
-/*
- * Certain TB/HDEC errors leaves dirty data in timebase and hdec register
- * which need to cleared before we initiate clear_tb_errors through TFMR[24].
- * The cleanup has to be done by once by any one thread from core or subcore.
- *
- * In split core mode, it is required to clear the dirty data from TB/HDEC
- * register by all subcores (active partitions) before we clear tb errors
- * through TFMR[24]. The HMI recovery would fail even if one subcore do
- * not cleanup the respective TB/HDEC register.
- *
- * For un-split core, any one thread can do the cleanup.
- * For split core, any one thread from each subcore can do the cleanup.
- *
- * Errors that required pre-recovery cleanup:
- *	- SPR_TFMR_TB_RESIDUE_ERR
- *	- SPR_TFMR_HDEC_PARITY_ERROR
- */
-static void pre_recovery_cleanup_p8(uint64_t *out_flags)
+static int handle_thread_tfac_error(uint64_t tfmr, uint64_t *out_flags)
 {
-	uint64_t tfmr;
-	uint32_t sibling_thread_mask;
-	int split_core_mode, subcore_id, thread_id, threads_per_core;
-	int i;
+	int recover = 1;
 
-	/*
-	 * Exit if it is not the error that leaves dirty data in timebase
-	 * or HDEC register. OR this may be the thread which came in very
-	 * late and recovery is been already done.
-	 *
-	 * TFMR is per [sub]core register. If any one thread on the [sub]core
-	 * does the recovery it reflects in TFMR register and applicable to
-	 * all threads in that [sub]core. Hence take a lock before checking
-	 * TFMR errors. Once a thread from a [sub]core completes the
-	 * recovery, all other threads on that [sub]core will return from
-	 * here.
-	 *
-	 * If TFMR does not show error that we are looking for, return
-	 * from here. We would just fall through recovery code which would
-	 * check for other errors on TFMR and fix them.
-	 */
-	lock(&hmi_lock);
-	tfmr = mfspr(SPR_TFMR);
-	if (!(tfmr & SPR_TFMR_TB_VALID))
-		*out_flags |= OPAL_HMI_FLAGS_TB_RESYNC;
 	if (tfmr & SPR_TFMR_DEC_PARITY_ERR)
 		*out_flags |= OPAL_HMI_FLAGS_DEC_LOST;
-	if (!(tfmr & (SPR_TFMR_TB_RESIDUE_ERR | SPR_TFMR_HDEC_PARITY_ERROR))) {
-		unlock(&hmi_lock);
-		return;
-	}
+	if (!tfmr_recover_local_errors(tfmr))
+		recover = 0;
+	tfmr &= ~(SPR_TFMR_PURR_PARITY_ERR |
+		  SPR_TFMR_SPURR_PARITY_ERR |
+		  SPR_TFMR_DEC_PARITY_ERR);
+	return recover;
+}
 
-	/* Tell OS about a possible loss of HDEC */
-	if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)
-		*out_flags |= OPAL_HMI_FLAGS_HDEC_LOST;
+static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
+{
+	struct cpu_thread *t, *t0;
+	int recover = 1;
 
-	/* Gather split core information. */
-	split_core_mode = get_split_core_mode();
-	threads_per_core = cpu_thread_count / split_core_mode;
+	t = this_cpu();
+	t0 = find_cpu_by_pir(cpu_get_thread0(t));
 
-	/* Prepare core/subcore sibling mask */
-	thread_id = cpu_get_thread_index(this_cpu());
-	subcore_id = thread_id / threads_per_core;
-	sibling_thread_mask = SUBCORE_THREAD_MASK(subcore_id, threads_per_core);
+	/* Rendez vous all threads */
+	hmi_rendez_vous(1);
 
-	/*
-	 * First thread on the core ?
-	 * if yes, setup the hmi cleanup state to !DONE
+	/* We use a lock here as some of the TFMR bits are shared and I
+	 * prefer avoiding doing the cleanup simultaneously.
 	 */
-	if ((*(this_cpu()->core_hmi_state_ptr) & CORE_THREAD_MASK) == 0)
-		*(this_cpu()->core_hmi_state_ptr) &= ~HMI_STATE_CLEANUP_DONE;
+	lock(&hmi_lock);
 
-	/*
-	 * First thread on subcore ?
-	 * if yes, do cleanup.
-	 *
-	 * Clear TB and wait for other threads (one from each subcore) to
-	 * finish its cleanup work.
+	/* First handle corrupt TFMR otherwise we can't trust anything.
+	 * We'll use a lock here so that the threads don't try to do it at
+	 * the same time
 	 */
+	if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
+		/* Check if it's still in error state */
+		if (mfspr(SPR_TFMR) & SPR_TFMR_TFMR_CORRUPT)
+			if (!recover_corrupt_tfmr())
+				recover = 0;
 
-	if ((*(this_cpu()->core_hmi_state_ptr) & sibling_thread_mask) == 0)
-		timer_facility_do_cleanup(tfmr);
+		if (!recover)
+			goto error_out;
 
-	/*
-	 * Mark this thread bit. This bit will stay on until this thread
-	 * exit from handle_hmi_exception().
-	 */
-	*(this_cpu()->core_hmi_state_ptr) |= this_cpu()->thread_mask;
+		tfmr = mfspr(SPR_TFMR);
 
-	/*
-	 * Check if each subcore has completed the cleanup work.
-	 * if yes, then notify all the threads that we are done with cleanup.
-	 */
-	for (i = 0; i < split_core_mode; i++) {
-		uint32_t subcore_thread_mask =
-				SUBCORE_THREAD_MASK(i, threads_per_core);
-		if (!(*(this_cpu()->core_hmi_state_ptr) & subcore_thread_mask))
-			break;
+		/* We could have got new thread errors in the meantime */
+		if (tfmr & SPR_TFMR_THREAD_ERRORS) {
+			recover = handle_thread_tfac_error(tfmr, out_flags);
+			tfmr &= ~SPR_TFMR_THREAD_ERRORS;
+		}
+		if (!recover)
+			goto error_out;
 	}
 
-	if (i == split_core_mode)
-		*(this_cpu()->core_hmi_state_ptr) |= HMI_STATE_CLEANUP_DONE;
-
-	unlock(&hmi_lock);
+	/* Tell the OS ... */
+	if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)
+		*out_flags |= OPAL_HMI_FLAGS_HDEC_LOST;
 
-	/* Wait for other subcore to complete the cleanup. */
-	wait_for_cleanup_complete();
-}
+	/* Cleanup bad HDEC or TB on all threads or subcures before we clear
+	 * the error conditions
+	 */
+	tfmr_cleanup_core_errors(tfmr);
 
-/*
- * Certain TB/HDEC errors leaves dirty data in timebase and hdec register
- * which need to cleared before we initiate clear_tb_errors through TFMR[24].
- * The cleanup has to be done by all the threads from core in p9.
- *
- * On TB/HDEC errors, all 4 threads on the affected receives HMI. On power9,
- * every thread on the core has its own copy of TB and hence every thread
- * has to clear the dirty data from its own TB register before we clear tb
- * errors through TFMR[24]. The HMI recovery would fail even if one thread
- * do not cleanup the respective TB/HDEC register.
- *
- * There is no split core mode in power9.
- *
- * Errors that required pre-recovery cleanup:
- *	- SPR_TFMR_TB_RESIDUE_ERR
- *	- SPR_TFMR_HDEC_PARITY_ERROR
- */
-static void pre_recovery_cleanup_p9(uint64_t *out_flags)
-{
-	uint64_t tfmr;
-	int threads_per_core = cpu_thread_count;
-	int i;
+	/* Unlock before next rendez-vous */
+	unlock(&hmi_lock);
 
-	/*
-	 * Exit if it is not the error that leaves dirty data in timebase
-	 * or HDEC register. OR this may be the thread which came in very
-	 * late and recovery is been already done.
-	 *
-	 * TFMR is per core register. Ideally if any one thread on the core
-	 * does the recovery it should reflect in TFMR register and
-	 * applicable to all threads in that core. Hence take a lock before
-	 * checking TFMR errors. Once a thread from a core completes the
-	 * recovery, all other threads on that core will return from
-	 * here.
-	 *
-	 * If TFMR does not show error that we are looking for, return
-	 * from here. We would just fall through recovery code which would
-	 * check for other errors on TFMR and fix them.
+	/* Second rendez vous, ensure the above cleanups are all done before
+	 * we proceed further
 	 */
-	lock(&hmi_lock);
-	tfmr = mfspr(SPR_TFMR);
-	if (!(tfmr & SPR_TFMR_TB_VALID))
-		*out_flags |= OPAL_HMI_FLAGS_TB_RESYNC;
-	if (tfmr & SPR_TFMR_DEC_PARITY_ERR)
-		*out_flags |= OPAL_HMI_FLAGS_DEC_LOST;
-	if (!(tfmr & (SPR_TFMR_TB_RESIDUE_ERR | SPR_TFMR_HDEC_PARITY_ERROR))) {
-		unlock(&hmi_lock);
-		return;
-	}
+	hmi_rendez_vous(2);
 
-	/*
-	 * Due to a HW logic bug in p9, TFMR bit 26 and 45 always set
-	 * once TB residue or HDEC errors occurs at first time. Hence for HMI
-	 * on subsequent TB errors add additional check as workaround to
-	 * identify validity of the errors and decide whether pre-recovery
-	 * is required or not. Exit pre-recovery if there are other TB
-	 * errors also present on TFMR.
-	 */
-	if (tfmr & SPR_TFMR_OTHER_ERRORS) {
-		unlock(&hmi_lock);
-		return;
+	/* We can now clear the error conditions in the core. */
+	if (!tfmr_clear_core_errors(tfmr)) {
+		recover = 0;
+		goto error_out;
 	}
 
-	/*
-	 * First thread on the core ?
-	 * if yes, setup the hmi cleanup state to !DONE
+	/* Third rendez-vous. We could in theory do the timebase resync as
+	 * part of the previous one, but I prefer having all the error
+	 * conditions cleared before we start trying.
 	 */
-	if ((*(this_cpu()->core_hmi_state_ptr) & CORE_THREAD_MASK) == 0)
-		*(this_cpu()->core_hmi_state_ptr) &= ~HMI_STATE_CLEANUP_DONE;
+	hmi_rendez_vous(3);
 
-	/* Tell OS about a possible loss of HDEC */
-	if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)
-		*out_flags |= OPAL_HMI_FLAGS_HDEC_LOST;
+	/* Now perform the actual TB recovery on thread 0 */
+	if (t == t0)
+		recover = chiptod_recover_tb_errors(tfmr,
+						&this_cpu()->tb_resynced);
 
-	/*
-	 * Clear TB and wait for other threads to finish its cleanup work.
-	 */
-	timer_facility_do_cleanup(tfmr);
-
-	/*
-	 * Mark this thread bit. This bit will stay on until this thread
-	 * exit from handle_hmi_exception().
-	 */
-	*(this_cpu()->core_hmi_state_ptr) |= this_cpu()->thread_mask;
+error_out:
+	/* Last rendez-vous */
+	hmi_rendez_vous(4);
 
-	/*
-	 * Check if each thread has completed the cleanup work.
-	 * if yes, then notify all the threads that we are done with cleanup.
+	/* Now all threads have gone past rendez-vous 3 and not yet past another
+	 * rendez-vous 1, so the value of tb_resynced of thread 0 of the core
+	 * contains an accurate indication as to whether the timebase was lost.
 	 */
-	for (i = 0; i < threads_per_core; i++) {
-		uint32_t thread_mask = SINGLE_THREAD_MASK(i);
-		if (!(*(this_cpu()->core_hmi_state_ptr) & thread_mask))
-			break;
-	}
-
-	if (i == threads_per_core)
-		*(this_cpu()->core_hmi_state_ptr) |= HMI_STATE_CLEANUP_DONE;
-
-	unlock(&hmi_lock);
-
-	/* Wait for other threads to complete the cleanup. */
-	wait_for_cleanup_complete();
-}
+	if (t0->tb_resynced)
+		*out_flags |= OPAL_HMI_FLAGS_TB_RESYNC;
 
-static void pre_recovery_cleanup(uint64_t *out_flags)
-{
-	if (proc_gen == proc_gen_p9)
-		return pre_recovery_cleanup_p9(out_flags);
-	else
-		return pre_recovery_cleanup_p8(out_flags);
+	return recover;
 }
 
-static void hmi_print_debug(const uint8_t *msg, uint64_t hmer)
+static int handle_tfac_errors(uint64_t hmer, struct OpalHMIEvent *hmi_evt,
+			      uint64_t *out_flags)
 {
-	const char *loc;
-	uint32_t core_id, thread_index;
+	int recover = 1;
+	uint64_t tfmr = mfspr(SPR_TFMR);
 
-	core_id = pir_to_core_id(this_cpu()->pir);
-	thread_index = cpu_get_thread_index(this_cpu());
+	/* A TFMR parity error makes us ignore all the local stuff */
+	if ((hmer & SPR_HMER_TFMR_PARITY_ERROR) || (tfmr & SPR_TFMR_TFMR_CORRUPT)) {
+		/* Mark TB as invalid for now as we don't trust TFMR, we'll fix
+		 * it up later
+		 */
+		this_cpu()->tb_invalid = true;
+		goto bad_tfmr;
+	}
 
-	loc = chip_loc_code(this_cpu()->chip_id);
-	if (!loc)
-		loc = "Not Available";
+	this_cpu()->tb_invalid = !(tfmr & SPR_TFMR_TB_VALID);
 
-	if (hmer & (SPR_HMER_TFAC_ERROR | SPR_HMER_TFMR_PARITY_ERROR)) {
-		prlog(PR_DEBUG, "[Loc: %s]: P:%d C:%d T:%d: TFMR(%016lx) %s\n",
-			loc, this_cpu()->chip_id, core_id, thread_index,
-			mfspr(SPR_TFMR), msg);
-	} else {
-		prlog(PR_DEBUG, "[Loc: %s]: P:%d C:%d T:%d: %s\n",
-			loc, this_cpu()->chip_id, core_id, thread_index,
-			msg);
+	/* P9 errata: In theory, an HDEC error is sent to all threads. However,
+	 * due to an errata on P9 where TFMR bit 26 (HDEC parity) cannot be
+	 * cleared on thread 1..3, I am not confident we can do a rendez-vous
+	 * in all cases.
+	 *
+	 * Our current approach is to ignore that error unless no other TFAC
+	 * error is present in the TFMR. The error will be re-detected and
+	 * re-reported if necessary.
+	 */
+	if (proc_gen == proc_gen_p9 && (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)) {
+		if (this_cpu()->tb_invalid || (tfmr & SPR_TFMR_OTHER_ERRORS))
+			tfmr &= ~SPR_TFMR_HDEC_PARITY_ERROR;
 	}
-}
 
-static int handle_tfac_errors(uint64_t hmer, struct OpalHMIEvent *hmi_evt,
-			      uint64_t *out_flags)
-{
-	int recover = 1;
-	uint64_t tfmr;
+	/* The TB residue error is ignored if TB is valid due to a similar
+	 * errata as above
+	 */
+	if ((tfmr & SPR_TFMR_TB_RESIDUE_ERR) && !this_cpu()->tb_invalid)
+		tfmr &= ~SPR_TFMR_TB_RESIDUE_ERR;
 
-	pre_recovery_cleanup(out_flags);
+	/* First, handle thread local errors */
+	if (tfmr & SPR_TFMR_THREAD_ERRORS) {
+		recover = handle_thread_tfac_error(tfmr, out_flags);
+		tfmr &= ~SPR_TFMR_THREAD_ERRORS;
+	}
 
-	lock(&hmi_lock);
-	this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
+ bad_tfmr:
 
-	/*
-	 * Assert for now for all TOD errors. In future we need to decode
-	 * TFMR and take corrective action wherever required.
+	/* Let's see if we still have a all-core error to deal with, if
+	 * not, we just bail out
 	 */
-	if (hmer & SPR_HMER_TFAC_ERROR) {
-		tfmr = mfspr(SPR_TFMR);		/* save original TFMR */
+	if (tfmr & SPR_TFMR_CORE_ERRORS) {
+		int recover2;
 
-		hmi_print_debug("Timer Facility Error", hmer);
-
-		recover = chiptod_recover_tb_errors();
-		if (hmi_evt) {
-			hmi_evt->severity = OpalHMI_SEV_ERROR_SYNC;
-			hmi_evt->type = OpalHMI_ERROR_TFAC;
-			hmi_evt->tfmr = tfmr;
-			queue_hmi_event(hmi_evt, recover, out_flags);
-		}
+		/* Only update "recover" if it's not already 0 (non-recovered)
+		 */
+		recover2 = handle_all_core_tfac_error(tfmr, out_flags);
+		if (recover != 0)
+			recover = recover2;
+	} else if (this_cpu()->tb_invalid) {
+		/* This shouldn't happen, TB is invalid and no global error
+		 * was reported. We just return for now assuming one will
+		 * be. We can't do a rendez vous without a core-global HMI.
+		 */
+		prlog(PR_ERR, "HMI: TB invalid without core error reported ! "
+			"CPU=%x, TFMR=0x%016lx\n", this_cpu()->pir,
+						mfspr(SPR_TFMR));
 	}
-	if (hmer & SPR_HMER_TFMR_PARITY_ERROR) {
-		tfmr = mfspr(SPR_TFMR);		/* save original TFMR */
 
-		hmi_print_debug("TFMR parity Error", hmer);
-		recover = chiptod_recover_tb_errors();
-		if (hmi_evt) {
-			hmi_evt->severity = OpalHMI_SEV_FATAL;
-			hmi_evt->type = OpalHMI_ERROR_TFMR_PARITY;
-			hmi_evt->tfmr = tfmr;
-			queue_hmi_event(hmi_evt, recover, out_flags);
-		}
+	if (hmi_evt) {
+		hmi_evt->severity = OpalHMI_SEV_ERROR_SYNC;
+		hmi_evt->type = OpalHMI_ERROR_TFAC;
+		hmi_evt->tfmr = tfmr;
+		queue_hmi_event(hmi_evt, recover, out_flags);
 	}
-	/* Unconditionally unset the thread bit */
-	*(this_cpu()->core_hmi_state_ptr) &= ~(this_cpu()->thread_mask);
 
 	/* Set the TB state looking at TFMR register before we head out. */
 	this_cpu()->tb_invalid = !(mfspr(SPR_TFMR) & SPR_TFMR_TB_VALID);
-	unlock(&hmi_lock);
+
+	if (this_cpu()->tb_invalid)
+		prlog(PR_WARNING, "Failed to get TB in running state! "
+			"CPU=%x, TFMR=%016lx\n", this_cpu()->pir,
+					mfspr(SPR_TFMR));
 
 	return recover;
 }
diff --git a/hw/chiptod.c b/hw/chiptod.c
index cacc27340..a160e5a10 100644
--- a/hw/chiptod.c
+++ b/hw/chiptod.c
@@ -1370,17 +1370,10 @@ static bool tfmr_recover_tb_errors(uint64_t tfmr)
 	return true;
 }
 
-static bool tfmr_recover_non_tb_errors(uint64_t tfmr)
+bool tfmr_recover_local_errors(uint64_t tfmr)
 {
 	uint64_t tfmr_reset_errors = 0;
 
-	/*
-	 * write 1 to bit 26 to clear TFMR HDEC parity error.
-	 * HDEC register has already been reset to zero as part pre-recovery.
-	 */
-	if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)
-		tfmr_reset_errors |= SPR_TFMR_HDEC_PARITY_ERROR;
-
 	if (tfmr & SPR_TFMR_DEC_PARITY_ERR) {
 		/* Set DEC with all ones */
 		mtspr(SPR_DEC, ~0);
@@ -1390,11 +1383,11 @@ static bool tfmr_recover_non_tb_errors(uint64_t tfmr)
 	}
 
 	/*
-	 * Reset PURR/SPURR to recover. We also need help from KVM
-	 * layer to handle this change in PURR/SPURR. That needs
-	 * to be handled in kernel KVM layer. For now, to recover just
-	 * reset it.
-	 */
+	* Reset PURR/SPURR to recover. We also need help from KVM
+	* layer to handle this change in PURR/SPURR. That needs
+	* to be handled in kernel KVM layer. For now, to recover just
+	* reset it.
+	*/
 	if (tfmr & SPR_TFMR_PURR_PARITY_ERR) {
 		/* set PURR register with sane value or reset it. */
 		mtspr(SPR_PURR, 0);
@@ -1432,7 +1425,7 @@ static bool tfmr_recover_non_tb_errors(uint64_t tfmr)
  *	MT(TFMR) bits 11 and 60 are b’1’
  *	MT(HMER) all bits 1 except for bits 4,5
  */
-static bool chiptod_recover_tfmr_error(void)
+bool recover_corrupt_tfmr(void)
 {
 	uint64_t tfmr;
 
@@ -1468,6 +1461,37 @@ static bool chiptod_recover_tfmr_error(void)
 	return true;
 }
 
+void tfmr_cleanup_core_errors(uint64_t tfmr)
+{
+	/* If HDEC is bad, clean it on all threads before we clear the
+	 * error condition.
+	 */
+	if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)
+		mtspr(SPR_HDEC, 0);
+
+	/* If TB is invalid, clean it on all threads as well, it will be
+	 * restored after the next rendez-vous
+	 */
+	if (!(tfmr & SPR_TFMR_TB_VALID)) {
+		mtspr(SPR_TBWU, 0);
+		mtspr(SPR_TBWU, 0);
+	}
+}
+
+bool tfmr_clear_core_errors(uint64_t tfmr)
+{
+	uint64_t tfmr_reset_errors = 0;
+
+	if (tfmr & SPR_TFMR_HDEC_PARITY_ERROR)
+		tfmr_reset_errors |= SPR_TFMR_HDEC_PARITY_ERROR;
+
+	/* Write TFMR twice to clear the error */
+	mtspr(SPR_TFMR, base_tfmr | tfmr_reset_errors);
+	mtspr(SPR_TFMR, base_tfmr | tfmr_reset_errors);
+
+	return true;
+}
+
 /*
  * Recover from TB and TOD errors.
  * Timebase register is per core and first thread that gets chance to
@@ -1481,46 +1505,17 @@ static bool chiptod_recover_tfmr_error(void)
  *	1	<= Successfully recovered from errors
  *	-1	<= No errors found. Errors are already been fixed.
  */
-int chiptod_recover_tb_errors(void)
+int chiptod_recover_tb_errors(uint64_t tfmr, bool *out_resynced)
 {
-	uint64_t tfmr;
 	int rc = -1;
-	int thread_id;
+
+	*out_resynced = false;
 
 	if (chiptod_primary < 0)
 		return 0;
 
 	lock(&chiptod_lock);
 
-	/* Get fresh copy of TFMR */
-	tfmr = mfspr(SPR_TFMR);
-
-	/*
-	 * Check for TFMR parity error and recover from it.
-	 * We can not trust any other bits in TFMR If it is corrupt. Fix this
-	 * before we do anything.
-	 */
-	if (tfmr & SPR_TFMR_TFMR_CORRUPT) {
-		if (!chiptod_recover_tfmr_error()) {
-			rc = 0;
-			goto error_out;
-		}
-	}
-
-	/* Get fresh copy of TFMR */
-	tfmr = mfspr(SPR_TFMR);
-
-	/*
-	 * Workaround for HW logic bug in Power9
-	 * Even after clearing TB residue error by one thread it does not
-	 * get reflected to other threads on same core.
-	 * Check if TB is already valid and skip the checking of TB errors.
-	 */
-
-	if ((proc_gen == proc_gen_p9) && (tfmr & SPR_TFMR_TB_RESIDUE_ERR)
-						&& (tfmr & SPR_TFMR_TB_VALID))
-		goto skip_tb_error_clear;
-
 	/*
 	 * Check for TB errors.
 	 * On Sync check error, bit 44 of TFMR is set. Check for it and
@@ -1544,7 +1539,6 @@ int chiptod_recover_tb_errors(void)
 		}
 	}
 
-skip_tb_error_clear:
 	/*
 	 * Check for TOD sync check error.
 	 * On TOD errors, bit 51 of TFMR is set. If this bit is on then we
@@ -1574,35 +1568,9 @@ skip_tb_error_clear:
 		if (!chiptod_to_tb())
 			goto error_out;
 
-		/* We have successfully able to get TB running. */
-		rc = 1;
-	}
+		*out_resynced = true;
 
-	/*
-	 * Workaround for HW logic bug in power9.
-	 * In idea case (without the HW bug) only one thread from the core
-	 * would have fallen through tfmr_recover_non_tb_errors() to clear
-	 * HDEC parity error on TFMR.
-	 *
-	 * Hence to achieve same behavior, allow only thread 0 to clear the
-	 * HDEC parity error. And for rest of the threads just reset the bit
-	 * to avoid other threads to fall through tfmr_recover_non_tb_errors().
-	 */
-	thread_id = cpu_get_thread_index(this_cpu());
-	if ((proc_gen == proc_gen_p9) && thread_id)
-		tfmr &= ~SPR_TFMR_HDEC_PARITY_ERROR;
-
-	/*
-	 * Now that TB is running, check for TFMR non-TB errors.
-	 */
-	if ((tfmr & SPR_TFMR_HDEC_PARITY_ERROR) ||
-		(tfmr & SPR_TFMR_PURR_PARITY_ERR) ||
-		(tfmr & SPR_TFMR_SPURR_PARITY_ERR) ||
-		(tfmr & SPR_TFMR_DEC_PARITY_ERR)) {
-		if (!tfmr_recover_non_tb_errors(tfmr)) {
-			rc = 0;
-			goto error_out;
-		}
+		/* We have successfully able to get TB running. */
 		rc = 1;
 	}
 
diff --git a/include/chiptod.h b/include/chiptod.h
index fd5cd9644..7708d4899 100644
--- a/include/chiptod.h
+++ b/include/chiptod.h
@@ -29,7 +29,11 @@ enum chiptod_topology {
 
 extern void chiptod_init(void);
 extern bool chiptod_wakeup_resync(void);
-extern int chiptod_recover_tb_errors(void);
+extern int chiptod_recover_tb_errors(uint64_t tfmr, bool *out_resynced);
+extern bool tfmr_recover_local_errors(uint64_t tfmr);
+extern bool recover_corrupt_tfmr(void);
+extern void tfmr_cleanup_core_errors(uint64_t tfmr);
+extern bool tfmr_clear_core_errors(uint64_t tfmr);
 extern void chiptod_reset_tb(void);
 extern bool chiptod_adjust_topology(enum chiptod_topology topo, bool enable);
 extern bool chiptod_capp_timebase_sync(unsigned int chip_id, uint32_t tfmr_addr,
diff --git a/include/cpu.h b/include/cpu.h
index b7cd588d5..68f246396 100644
--- a/include/cpu.h
+++ b/include/cpu.h
@@ -97,9 +97,8 @@ struct cpu_thread {
 	 */
 	uint32_t			core_hmi_state; /* primary only */
 	uint32_t			*core_hmi_state_ptr;
-	/* Mask to indicate thread id in core. */
-	uint8_t				thread_mask;
 	bool				tb_invalid;
+	bool				tb_resynced;
 
 	/* For use by XICS emulation on XIVE */
 	struct xive_cpu_state		*xstate;