[Skiboot] [PATCH 1/2] opal/hmi: Fix TB residue and HDEC parity error recovery for power9
Mahesh J Salgaonkar
mahesh at linux.vnet.ibm.com
Mon Oct 23 17:15:45 AEDT 2017
From: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>
On TB/HDEC errors, all 4 threads on the affected core receive an HMI. On power9,
every thread on the core has its own copy of TB/HDEC and hence every thread
has to clear the dirty data from its own TB/HDEC register before we clear tb
errors through TFMR[24]. The HMI recovery would fail if even one thread
does not clean up its TB/HDEC register.
Signed-off-by: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>
---
core/hmi.c | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 102 insertions(+), 3 deletions(-)
diff --git a/core/hmi.c b/core/hmi.c
index c1769e3..7093b73 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -152,6 +152,7 @@
#define CORE_THREAD_MASK 0x0ff
#define SUBCORE_THREAD_MASK(s_id, t_count) \
((((1UL) << (t_count)) - 1) << ((s_id) * (t_count)))
+#define SINGLE_THREAD_MASK(t_id) ((1UL) << (t_id)) /* mask with only bit t_id set */
/* xscom addresses for core FIR (Fault Isolation Register) */
#define P8_CORE_FIR 0x10013100
@@ -619,7 +620,7 @@ static void decode_malfunction(struct OpalHMIEvent *hmi_evt)
}
}
-static void wait_for_subcore_threads(void)
+static void wait_for_cleanup_complete(void)
{
uint64_t timeout = 0;
@@ -696,7 +697,7 @@ static int get_split_core_mode(void)
* - SPR_TFMR_TB_RESIDUE_ERR
* - SPR_TFMR_HDEC_PARITY_ERROR
*/
-static void pre_recovery_cleanup(void)
+static void pre_recovery_cleanup_p8(void)
{
uint64_t hmer;
uint64_t tfmr;
@@ -783,7 +784,105 @@ static void pre_recovery_cleanup(void)
unlock(&hmi_lock);
/* Wait for other subcore to complete the cleanup. */
- wait_for_subcore_threads();
+ wait_for_cleanup_complete();
+}
+
+/*
+ * Certain TB/HDEC errors leave dirty data in the timebase and HDEC registers,
+ * which must be cleared before we initiate clear_tb_errors through TFMR[24].
+ * On p9 the cleanup has to be done by all the threads of the core.
+ *
+ * On TB/HDEC errors, all 4 threads on the affected core receive an HMI. On
+ * power9, every thread on the core has its own copy of TB and hence every
+ * thread has to clear the dirty data from its own TB register before we clear
+ * tb errors through TFMR[24]. The HMI recovery would fail if even one thread
+ * does not clean up its TB/HDEC register.
+ *
+ * There is no split core mode in power9.
+ *
+ * Errors that require pre-recovery cleanup:
+ * - SPR_TFMR_TB_RESIDUE_ERR
+ * - SPR_TFMR_HDEC_PARITY_ERROR
+ */
+static void pre_recovery_cleanup_p9(void)
+{
+uint64_t hmer;
+uint64_t tfmr;
+int threads_per_core = cpu_thread_count;
+int i;
+
+hmer = mfspr(SPR_HMER);
+
+/* Exit if this is not a Time Facility error. */
+if (!(hmer & SPR_HMER_TFAC_ERROR))
+return;
+
+/*
+ * Exit if this is not an error that leaves dirty data in the
+ * timebase or HDEC register, or if this thread came in very late
+ * and the recovery has already been done.
+ *
+ * TFMR is a per-core register. Ideally, if any one thread on the
+ * core does the recovery, it should be reflected in TFMR and apply
+ * to all threads in that core. Hence take a lock before checking
+ * TFMR errors. Once a thread from a core completes the recovery,
+ * all other threads on that core will return from
+ * here.
+ *
+ * If TFMR does not show the error that we are looking for, return
+ * from here. We then just fall through to the recovery code, which
+ * checks for other errors in TFMR and fixes them.
+ */
+lock(&hmi_lock);
+tfmr = mfspr(SPR_TFMR);
+if (!(tfmr & (SPR_TFMR_TB_RESIDUE_ERR | SPR_TFMR_HDEC_PARITY_ERROR))) {
+unlock(&hmi_lock);
+return;
+}
+
+/*
+ * First thread on the core to get here (no thread bits set yet)?
+ * If so, reset the HMI cleanup state to !DONE.
+ */
+if ((*(this_cpu()->core_hmi_state_ptr) & CORE_THREAD_MASK) == 0)
+*(this_cpu()->core_hmi_state_ptr) &= ~HMI_STATE_CLEANUP_DONE;
+
+/*
+ * Clean up this thread's TB/HDEC; the wait for other threads comes later.
+ */
+timer_facility_do_cleanup(tfmr);
+
+/*
+ * Set this thread's bit. The bit will stay on until this thread
+ * exits from handle_hmi_exception().
+ */
+*(this_cpu()->core_hmi_state_ptr) |= this_cpu()->thread_mask;
+
+/*
+ * Check whether every thread has completed its cleanup work.
+ * If so, notify all the threads that we are done with cleanup.
+ */
+for (i = 0; i < threads_per_core; i++) {
+uint32_t thread_mask = SINGLE_THREAD_MASK(i);
+if (!(*(this_cpu()->core_hmi_state_ptr) & thread_mask))
+break;
+}
+
+if (i == threads_per_core)
+*(this_cpu()->core_hmi_state_ptr) |= HMI_STATE_CLEANUP_DONE;
+
+unlock(&hmi_lock);
+
+/* Wait for other threads to complete the cleanup. */
+wait_for_cleanup_complete();
+}
+
+static void pre_recovery_cleanup(void)	/* dispatch on processor generation */
+{
+	if (proc_gen == proc_gen_p9)
+		pre_recovery_cleanup_p9();	/* p9: every thread cleans up */
+	else
+		pre_recovery_cleanup_p8();	/* otherwise: p8 cleanup path */
}
static void hmi_exit(void)
More information about the Skiboot
mailing list