[Skiboot] [PATCH 00/18] move direct controls out of fast-reboot,

Nicholas Piggin npiggin at gmail.com
Thu Nov 23 15:10:34 AEDT 2017


On Sun, 19 Nov 2017 19:14:50 +1000
Nicholas Piggin <npiggin at gmail.com> wrote:

> Here's a set of patches that move the p8 direct controls out of
> fast-reboot and into direct-controls. The fast reboot APIs are
> implemented on top of direct controls API. This should allow fast
> reboot to use sreset on p9, and cleans things up.
> 
> Also added opal quiescing patches at the end while I was working
> on fast reboot stuff, but they don't logically depend on one another.
> 
> This seems to work okay on a power8 test system. Anybody looking at
> doing the rest of fast reboot on power9 should start with this I
> think.
> 
> Comments?

Here is an incremental patch that goes on top of the series. It
fixes a bug with un-quiescing OPAL when fast reboot fails, and also
adds some more messages and debugging output in some of the failure
paths.

I will split it up and merge the important bits into the series if
they turn out okay, but I'm posting this now because some failures
have been noticed in testing. Hopefully this will solve them or
narrow down the problem.

Thanks,
Nick

---
 asm/head.S         |  1 +
 core/fast-reboot.c | 37 +++++++++++++++++++++++++++----------
 core/opal.c        |  8 ++++----
 3 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/asm/head.S b/asm/head.S
index 4c67ead19..e9d4143c1 100644
--- a/asm/head.S
+++ b/asm/head.S
@@ -985,6 +985,7 @@ opal_entry:
 
 	/* Jump ! */
 	bctrl
+	mr	%r4,%r1
 	bl	opal_exit_check
 1:	ld	%r12,STACK_LR(%r1)
 	mtlr	%r12
diff --git a/core/fast-reboot.c b/core/fast-reboot.c
index 31aabdb2f..7aa0b4725 100644
--- a/core/fast-reboot.c
+++ b/core/fast-reboot.c
@@ -37,8 +37,11 @@ static bool fast_reset(void)
 	struct cpu_thread *cpu;
 	bool ret;
 
-	if (sreset_all_prepare())
+	if (sreset_all_prepare()) {
+		prlog(PR_NOTICE, "RESET: Fast reboot failed to prepare "
+				"secondaries for system reset\n");
 		return false;
+	}
 
 	/* Put everybody in stop except myself */
 	for_each_ungarded_cpu(cpu) {
@@ -53,7 +56,12 @@ static bool fast_reset(void)
 	setup_reset_vector();
 
 	/* Send everyone else to 0x100 */
-	ret = (sreset_all_others() == OPAL_SUCCESS);
+	ret = true;
+	if (sreset_all_others() != OPAL_SUCCESS) {
+		prlog(PR_NOTICE, "RESET: Fast reboot failed to system reset "
+				"secondaries\n");
+		ret = false;
+	}
 
 	prlog(PR_DEBUG, "RESET: Releasing special wakeups...\n");
 	sreset_all_finish();
@@ -120,7 +128,8 @@ void fast_reboot(void)
 	 * Ensure all other CPUs have left OPAL calls.
 	 */
 	if (!opal_quiesce(QUIESCE_HOLD, -1)) {
-		prlog(PR_DEBUG, "RESET: Fast reboot disabled because OPAL quiesce timed out\n");
+		prlog(PR_NOTICE, "RESET: Fast reboot disabled because OPAL "
+				"quiesce timed out\n");
 		return;
 	}
 
@@ -131,22 +140,30 @@ void fast_reboot(void)
 		return;
 	}
 
-	prlog(PR_NOTICE, "RESET: Initiating fast reboot %d...\n", ++fast_reboot_count);
+	prlog(PR_NOTICE, "RESET: PIR=%04lx Initiating fast reboot %d...\n", mfspr(SPR_PIR), ++fast_reboot_count);
+
 	free(fdt);
 
 	fast_boot_release = false;
 	sync();
 
-	if (!fast_reset()) {
-		opal_quiesce(QUIESCE_RESUME, -1);
+	/*
+	 * There is no point un-quiescing due to failure after this point,
+	 * because secondaries could be stopped, and exception vectors might
+	 * be trashed. Nothing useful can be done besides IPL.
+	 */
+	if (!fast_reset())
 		return;
-	}
-
-	opal_quiesce(QUIESCE_RESUME_FAST_REBOOT, -1);
 
 	/* Ensure all the sresets get through */
-	if (!cpu_state_wait_all_others(cpu_state_present, msecs_to_tb(100)))
+	if (!cpu_state_wait_all_others(cpu_state_present, msecs_to_tb(100))) {
+		prlog(PR_NOTICE, "RESET: Fast reboot timed out waiting for "
+				"secondaries to call in\n");
 		return;
+	}
+
+	/* This resets our quiesce state ready to enter the new kernel. */
+	opal_quiesce(QUIESCE_RESUME_FAST_REBOOT, -1);
 
 	asm volatile("ba	0x100\n\t" : : : "memory");
 	for (;;)
diff --git a/core/opal.c b/core/opal.c
index 0b72758c3..b2d204e1b 100644
--- a/core/opal.c
+++ b/core/opal.c
@@ -165,15 +165,15 @@ again:
 	return OPAL_SUCCESS;
 }
 
-void opal_exit_check(void);
+void opal_exit_check(int64_t retval, struct stack_frame *eframe);
 
-void opal_exit_check(void)
+void opal_exit_check(int64_t retval, struct stack_frame *eframe)
 {
 	struct cpu_thread *cpu = this_cpu();
 
 	if (!cpu->in_opal_call) {
-		printf("CPU UN-ACCOUNTED FIRMWARE ENTRY! PIR=%04lx cpu @%p -> pir=%04x\n",
-		       mfspr(SPR_PIR), cpu, cpu->pir);
+		uint64_t token = eframe->gpr[0];
+		printf("CPU UN-ACCOUNTED FIRMWARE ENTRY DETECTED AT EXIT! PIR=%04lx cpu @%p -> pir=%04x token %lld return=%lld\n", mfspr(SPR_PIR), cpu, cpu->pir, token, retval);
 	} else {
 		cpu->in_opal_call--;
 	}
-- 
2.15.0




More information about the Skiboot mailing list