[Skiboot] [PATCH] opal/hmi: Handle early HMIs on thread0 when secondaries are still in OPAL.
Mahesh J Salgaonkar
mahesh at linux.vnet.ibm.com
Fri Sep 21 13:31:34 AEST 2018
From: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>
When the primary thread receives a CORE level HMI for timer facility errors
while the secondaries are still in OPAL, thread 0 ends up in rendez-vous
waiting for the secondaries to enter HMI handling. This is because OPAL
runs with MSR(EE=0), and hence HMIs are delayed on secondary threads until
they are handed over to the Linux OS. Fix this by checking the state of the
secondaries and forcing them into HMI handling by queuing a job on each of them.
I have tested this by injecting HDEC parity error very early during Linux
kernel boot. Recovery works fine for non-TB errors. But if TB is bad at
this very early stage we are already doomed.
Without this patch we see:
[ 285.046347408,7] OPAL: Start CPU 0x0843 (PIR 0x0843) -> 0x000000000000a83c
[ 285.051160609,7] OPAL: Start CPU 0x0844 (PIR 0x0844) -> 0x000000000000a83c
[ 285.055359021,7] HMI: Received HMI interrupt: HMER = 0x0840000000000000
[ 285.055361439,7] HMI: [Loc: U78D3.ND1.WZS004A-P1-C48]: P:8 C:17 T:0: TFMR(2e12002870e14000) Timer Facility Error
[ 286.232183823,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 1 (sptr=0000ccc1)
[ 287.409002056,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 2 (sptr=0000ccc1)
[ 289.073820164,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 3 (sptr=0000ccc1)
[ 290.250638683,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 1 (sptr=0000ccc2)
[ 291.427456821,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 2 (sptr=0000ccc2)
[ 293.092274807,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 3 (sptr=0000ccc2)
[ 294.269092904,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 1 (sptr=0000ccc3)
[ 295.445910944,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 2 (sptr=0000ccc3)
[ 297.110728970,3] HMI: Rendez-vous stage 1 timeout, CPU 0x844 waiting for thread 3 (sptr=0000ccc3)
After this patch:
[ 259.401719351,7] OPAL: Start CPU 0x0841 (PIR 0x0841) -> 0x000000000000a83c
[ 259.406259572,7] OPAL: Start CPU 0x0842 (PIR 0x0842) -> 0x000000000000a83c
[ 259.410615534,7] OPAL: Start CPU 0x0843 (PIR 0x0843) -> 0x000000000000a83c
[ 259.415444519,7] OPAL: Start CPU 0x0844 (PIR 0x0844) -> 0x000000000000a83c
[ 259.419641401,7] HMI: Received HMI interrupt: HMER = 0x0840000000000000
[ 259.419644124,7] HMI: [Loc: U78D3.ND1.WZS004A-P1-C48]: P:8 C:17 T:0: TFMR(2e12002870e04000) Timer Facility Error
[ 259.419650678,7] HMI: Sending hmi job to thread 1
[ 259.419652744,7] HMI: Sending hmi job to thread 2
[ 259.419653051,7] HMI: Received HMI interrupt: HMER = 0x0840000000000000
[ 259.419654725,7] HMI: Sending hmi job to thread 3
[ 259.419654916,7] HMI: Received HMI interrupt: HMER = 0x0840000000000000
[ 259.419658025,7] HMI: Received HMI interrupt: HMER = 0x0840000000000000
[ 259.419658406,7] HMI: [Loc: U78D3.ND1.WZS004A-P1-C48]: P:8 C:17 T:2: TFMR(2e12002870e04000) Timer Facility Error
[ 259.419663095,7] HMI: [Loc: U78D3.ND1.WZS004A-P1-C48]: P:8 C:17 T:3: TFMR(2e12002870e04000) Timer Facility Error
[ 259.419655234,7] HMI: [Loc: U78D3.ND1.WZS004A-P1-C48]: P:8 C:17 T:1: TFMR(2e12002870e04000) Timer Facility Error
[ 259.425109779,7] OPAL: Start CPU 0x0845 (PIR 0x0845) -> 0x000000000000a83c
[ 259.429870681,7] OPAL: Start CPU 0x0846 (PIR 0x0846) -> 0x000000000000a83c
[ 259.434549250,7] OPAL: Start CPU 0x0847 (PIR 0x0847) -> 0x000000000000a83c
Signed-off-by: Mahesh Salgaonkar <mahesh at linux.vnet.ibm.com>
---
core/hmi.c | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 49 insertions(+)
diff --git a/core/hmi.c b/core/hmi.c
index 4d1c3a7de..57f3fdbd6 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -32,6 +32,7 @@
#include <npu.h>
#include <capp.h>
#include <nvram.h>
+#include <cpu.h>
/*
* HMER register layout:
@@ -966,14 +967,54 @@ static int handle_thread_tfac_error(uint64_t tfmr, uint64_t *out_flags)
return recover;
}
+static int64_t opal_handle_hmi(void);
+
+static void opal_handle_hmi_job(void *data __unused)
+{
+ opal_handle_hmi(); /* int64_t result is dropped: this void wrapper exists so the handler can be passed to cpu_queue_job() */
+}
+
+/*
+ * Queue an HMI handling job on each secondary thread that is still in OPAL.
+ * This function is called by thread 0.
+ *
+ * Starting from the calling CPU, the next (cpu_thread_count - 1) CPUs
+ * returned by next_cpu() are examined; a job running opal_handle_hmi_job()
+ * is queued on every one whose state is cpu_state_active.
+ * NOTE(review): this presumes those CPUs are the caller's sibling threads
+ * on the same core - confirm against next_cpu() ordering.
+ *
+ * Returns NULL when no thread was found in cpu_state_active. Otherwise
+ * returns a zalloc()ed array with room for cpu_thread_count job pointers:
+ * slot i holds the cpu_queue_job() result for the i-th CPU after the
+ * caller, and every other slot (including slot 0) stays zeroed. The array
+ * is not freed here.
+ */
+static struct cpu_job **hmi_kick_secondaries(void)
+{
+ struct cpu_thread *ts = this_cpu();
+ struct cpu_job **hmi_jobs = NULL;
+ int job_sz = sizeof(struct cpu_job *) * cpu_thread_count;
+ int i;
+
+ for (i = 1; i < cpu_thread_count; i++) {
+ ts = next_cpu(ts);
+
+ /* Is this thread still in OPAL (state == cpu_state_active)? */
+ if (ts->state == cpu_state_active) {
+ if (!hmi_jobs) {
+ hmi_jobs = zalloc(job_sz);
+ assert(hmi_jobs);
+ }
+
+ prlog(PR_DEBUG, "Sending hmi job to thread %d\n", i);
+ hmi_jobs[i] = cpu_queue_job(ts, "handle_hmi_job",
+ opal_handle_hmi_job, NULL);
+ }
+ }
+ return hmi_jobs;
+}
+
static int handle_all_core_tfac_error(uint64_t tfmr, uint64_t *out_flags)
{
struct cpu_thread *t, *t0;
int recover = -1;
+ struct cpu_job **hmi_jobs = NULL;
t = this_cpu();
t0 = find_cpu_by_pir(cpu_get_thread0(t));
+ if (t == t0 && t0->state == cpu_state_os)
+ hmi_jobs = hmi_kick_secondaries();
+
/* Rendez vous all threads */
hmi_rendez_vous(1);
@@ -1055,6 +1096,14 @@ error_out:
if (t0->tb_resynced)
*out_flags |= OPAL_HMI_FLAGS_TB_RESYNC;
+ if (t == t0 && hmi_jobs) {
+ int i;
+ for (i = 1; i < cpu_thread_count; i++)
+ if (hmi_jobs[i])
+ cpu_wait_job(hmi_jobs[i], true);
+ free(hmi_jobs);
+ }
+
return recover;
}
More information about the Skiboot
mailing list