[Skiboot] [RFC/WIP/PATCH] Fast reboot for P8

Benjamin Herrenschmidt benh at kernel.crashing.org
Fri Jul 22 17:17:17 AEST 2016


This is an experimental patch that implements "Fast reboot" on P8
machines.

The basic idea is that when the OS calls OPAL reboot, we gather all
the threads in the system using a combination of patching the reset
vector and soft-resetting them, then clean up a few bits of hardware
(we do re-probe PCIe, for example), and reload & restart the bootloader.

This is very experimental and needs a lot of testing, as well as an audit
of the code for other bits of HW that might need to be cleaned up. I also
need to check whether we are properly PERST'ing PCI devices.

I've successfully fast rebooted a Habanero a few times.

This is partially based on old code I had for doing this on P7. I only
support it on P8 though, as there are issues with the PSI interrupts
on P7 that cannot be reliably solved.

Not-yet-signed-off-by: Benjamin Herrenschmidt <benh at kernel.crashing.org>
---

 asm/head.S           | 125 +++++++--------
 core/device.c        |   8 +
 core/fast-reboot.c   | 438 ++++++++++++++++++++++++++++++---------------------
 core/init.c          |   6 +-
 core/lock.c          |   3 +
 core/pci.c           |  13 +-
 core/platform.c      |   4 +-
 hw/fsp/fsp-console.c |   5 +
 hw/occ.c             |   5 +
 hw/psi.c             |  97 +++++-------
 include/config.h     |   4 +-
 include/device.h     |   2 +
 include/skiboot.h    |   3 +-
 13 files changed, 402 insertions(+), 311 deletions(-)

diff --git a/asm/head.S b/asm/head.S
index e92f9b8..0c5649d 100644
--- a/asm/head.S
+++ b/asm/head.S
@@ -294,7 +294,7 @@ boot_entry:
 	bne	secondary_wait
 
 	/* Initialize per-core SPRs */
-	bl init_shared_sprs
+	bl	init_shared_sprs
 
 	/* Pick a boot CPU, cpu index in r31 */
 	LOAD_IMM32(%r3, boot_sem - __head)
@@ -311,7 +311,7 @@ boot_entry:
 	smt_medium
 
 	/* Initialize thread SPRs */
-	bl init_replicated_sprs
+	bl	init_replicated_sprs
 
 	/* Save the initial offset. The secondary threads will spin on boot_flag
 	 * before relocation so we need to keep track of its location to wake
@@ -410,11 +410,11 @@ secondary_wait:
 	add	%r3,%r3,%r30
 	mtctr	%r3
 	isync
-	bctr	
+	bctr
 1:
 	/* Now wait for cpu_secondary_start to be set */
 	LOAD_ADDR_FROM_TOC(%r3, cpu_secondary_start)
-1:	smt_very_low	
+1:	smt_very_low
 	ld	%r0,0(%r3)
 	cmpdi	%r0,0
 	beq	1b
@@ -457,64 +457,6 @@ call_relocate:
 1:	/* Fatal relocate failure */
 	attn
 
-/* This is a little piece of code that is copied down to
- * 0x100 when doing a "fast reset"
- */
-.global fast_reset_patch_start
-fast_reset_patch_start:	
-	smt_medium
-	LOAD_IMM64(%r30, SKIBOOT_BASE)
-	LOAD_IMM32(%r3, fast_reset_entry - __head)
-	add	%r3,%r30,%r3
-	mtctr	%r3
-	bctr
-.global fast_reset_patch_end
-fast_reset_patch_end:
-
-/* Fast reset code. We clean up the TLB and a few SPRs and
- * return to C code. All CPUs do that, the CPU triggering the
- * reset does it to itself last. The C code will sort out who
- * the master is. We come from the trampoline above with
- * r30 containing SKIBOOT_BASE
- */
-fast_reset_entry:
-	/* Clear out SLB */
-	li	%r6,0
-	slbmte	%r6,%r6
-	slbia
-	ptesync
-
-	/* Get PIR */
-	mfspr	%r31,SPR_PIR
-
-	/* Get a stack and restore r13 */
-	GET_STACK(%r1,%r31)
-	li	%r3,0
-	std	%r3,0(%r1)
-	std	%r3,8(%r1)
-	std	%r3,16(%r1)
-	GET_CPU()
-
-	/* Get our TOC */
-	addis	%r2,%r30,(__toc_start - __head)@ha
-	addi	%r2,%r2,(__toc_start - __head)@l
-
-	/* Go to C ! */
-	bl	fast_reboot
-	b	.
-
-.global cleanup_tlb
-cleanup_tlb:
-	/* Clean the TLB */
-	li	%r3,128
-	mtctr	%r3
-	li	%r4,0x800		/* IS field = 0b10 */
-	ptesync
-1:	tlbiel	%r4
-	addi	%r4,%r4,0x1000
-	bdnz	1b
-	ptesync
-
 #define FIXUP_ENDIAN                                              \
        tdi   0,0,0x48;   /* Reverse endian of b . + 8          */ \
        b     $+36;       /* Skip trampoline if endian is good  */ \
@@ -652,6 +594,65 @@ rvwinkle_restore:
 	mtlr	%r0
 	blr
 
+/* This is a little piece of code that is copied down to
+ * 0x100 when doing a "fast reset"
+ */
+.global fast_reset_patch_start
+fast_reset_patch_start:
+	FIXUP_ENDIAN
+	smt_medium
+	LOAD_IMM64(%r30, SKIBOOT_BASE)
+	LOAD_IMM32(%r3, fast_reset_entry - __head)
+	add	%r3,%r30,%r3
+	mtctr	%r3
+	bctr
+.global fast_reset_patch_end
+fast_reset_patch_end:
+
+/* Fast reset code. We clean up the TLB and a few SPRs and
+ * return to C code. All CPUs do that, the CPU triggering the
+ * reset does it to itself last. The C code will sort out who
+ * the master is. We come from the trampoline above with
+ * r30 containing SKIBOOT_BASE
+ */
+fast_reset_entry:
+	/* Clear out SLB */
+	li	%r6,0
+	slbmte	%r6,%r6
+	slbia
+	ptesync
+
+	/* Get PIR */
+	mfspr	%r31,SPR_PIR
+
+	/* Get a stack and restore r13 */
+	GET_STACK(%r1,%r31)
+	li	%r3,0
+	std	%r3,0(%r1)
+	std	%r3,8(%r1)
+	std	%r3,16(%r1)
+	GET_CPU()
+
+	/* Get our TOC */
+	addis	%r2,%r30,(__toc_start - __head)@ha
+	addi	%r2,%r2,(__toc_start - __head)@l
+
+	/* Go to C ! */
+	bl	fast_reboot_entry
+	b	.
+
+.global cleanup_tlb
+cleanup_tlb:
+	/* Clean the TLB */
+	li	%r3,128
+	mtctr	%r3
+	li	%r4,0x800		/* IS field = 0b10 */
+	ptesync
+1:	tlbiel	%r4
+	addi	%r4,%r4,0x1000
+	bdnz	1b
+	ptesync
+
 /* Functions to initialize replicated and shared SPRs to sane
  * values. This is called at boot and on soft-reset
  */
diff --git a/core/device.c b/core/device.c
index 9e7ef0d..e7b53a8 100644
--- a/core/device.c
+++ b/core/device.c
@@ -581,6 +581,14 @@ const struct dt_property *dt_find_property(const struct dt_node *node,
 	return NULL;
 }
 
+void dt_check_del_prop(struct dt_node *node, const char *name)
+{
+	struct dt_property *p;
+
+	p = __dt_find_property(node, name);
+	if (p)
+		dt_del_property(node, p);
+}
 const struct dt_property *dt_require_property(const struct dt_node *node,
 					      const char *name, int wanted_len)
 {
diff --git a/core/fast-reboot.c b/core/fast-reboot.c
index 30b77e9..1a7f2cc 100644
--- a/core/fast-reboot.c
+++ b/core/fast-reboot.c
@@ -26,179 +26,283 @@
 #include <pci.h>
 #include <chip.h>
 
-/*
- * To get control of all threads, we sreset them via XSCOM after
- * patching the 0x100 vector. This will work as long as the target
- * HRMOR is 0. If Linux ever uses HRMOR, we'll have to consider
- * a more messy approach.
- *
- * The SCOM register we want is called "Core RAS Control" in the doc
- * and EX0.EC.PC.TCTL_GENERATE#0.TCTL.DIRECT_CONTROLS in the SCOM list
- *
- * Bits in there change from CPU rev to CPU rev but the bit we care
- * about, bit 60 "sreset_request" appears to have stuck to the same
- * place in both P7 and P7+. The register also has the same SCOM
- * address
- */
-#define EX0_TCTL_DIRECT_CONTROLS0	0x08010400
-#define EX0_TCTL_DIRECT_CONTROLS1	0x08010440
-#define EX0_TCTL_DIRECT_CONTROLS2	0x08010480
-#define EX0_TCTL_DIRECT_CONTROLS3	0x080104c0
-#define   TCTL_DC_SRESET_REQUEST	PPC_BIT(60)
+#define P8_EX_TCTL_DIRECT_CONTROLS(t)	(0x10013000 + (t) * 0x10)
+#define P8_DIRECT_CTL_STOP		PPC_BIT(63)
+#define P8_DIRECT_CTL_PRENAP		PPC_BIT(47)
+#define P8_DIRECT_CTL_SRESET		PPC_BIT(60)
+
 
 /* Flag tested by the OPAL entry code */
 uint8_t reboot_in_progress;
-static struct cpu_thread *resettor, *resettee;
+static volatile bool fast_boot_release;
+static struct cpu_thread *last_man_standing;
+static struct lock reset_lock = LOCK_UNLOCKED;
 
-static void flush_caches(void)
+static int set_special_wakeup(struct cpu_thread *cpu)
 {
-	uint64_t base = SKIBOOT_BASE;
-	uint64_t end = base + SKIBOOT_SIZE;
+	uint64_t val, poll_target, stamp;
+	uint32_t core_id;
+	int rc;
 
-	/* Not sure what the effect of sreset is on cores, so let's
-	 * shoot a series of dcbf's on all cachelines that make up
-	 * our core memory just in case...
+	/*
+	 * Note: HWP checks for checkstops, but I assume we don't need to
+	 * as we wouldn't be running if one was present
 	 */
-	while(base < end) {
-		asm volatile("dcbf 0,%0" : : "r" (base) : "memory");
-		base += 128;
+
+	/* Grab core ID once */
+	core_id = pir_to_core_id(cpu->pir);
+
+	prlog(PR_DEBUG, "RESET Waking up core 0x%x\n", core_id);
+
+	/*
+	 * The original HWp reads the XSCOM first but ignores the result
+	 * and error, let's do the same until I know for sure that is
+	 * not necessary
+	 */
+	xscom_read(cpu->chip_id,
+		   XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_SPECIAL_WAKEUP_PHYP),
+		   &val);
+
+	/* Then we write special wakeup */
+	rc = xscom_write(cpu->chip_id,
+			 XSCOM_ADDR_P8_EX_SLAVE(core_id,
+						EX_PM_SPECIAL_WAKEUP_PHYP),
+			 PPC_BIT(0));
+	if (rc) {
+		prerror("RESET: XSCOM error %d asserting special"
+			" wakeup on 0x%x\n", rc, cpu->pir);
+		return rc;
 	}
-	sync();
+
+	/*
+	 * HWP uses the history for Perf register here, dunno why it uses
+	 * that one instead of the pHyp one, maybe to avoid clobbering it...
+	 *
+	 * In any case, it does that to check for run/nap vs.sleep/winkle/other
+	 * to decide whether to poll on checkstop or not. Since we don't deal
+	 * with checkstop conditions here, we ignore that part.
+	 */
+
+	/*
+	 * Now poll for completion of special wakeup. The HWP is nasty here,
+	 * it will poll at 5ms intervals for up to 200ms. This is not quite
+	 * acceptable for us at runtime, at least not until we have the
+	 * ability to "context switch" HBRT. In practice, because we don't
+	 * winkle, it will never take that long, so we increase the polling
+	 * frequency to 1us per poll. However we do have to keep the same
+	 * timeout.
+	 *
+	 * We don't use time_wait_ms() either for now as we don't want to
+	 * poll the FSP here.
+	 */
+	stamp = mftb();
+	poll_target = stamp + msecs_to_tb(200);
+	val = 0;
+	while (!(val & EX_PM_GP0_SPECIAL_WAKEUP_DONE)) {
+		/* Wait 1 us */
+		time_wait_us(1);
+
+		/* Read PM state */
+		rc = xscom_read(cpu->chip_id,
+				XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_GP0),
+				&val);
+		if (rc) {
+			prerror("RESET: XSCOM error %d reading PM state on"
+				" 0x%x\n", rc, cpu->pir);
+			return rc;
+		}
+		/* Check timeout */
+		if (mftb() > poll_target)
+			break;
+	}
+
+	/* Success ? */
+	if (val & EX_PM_GP0_SPECIAL_WAKEUP_DONE) {
+		uint64_t now = mftb();
+		prlog(PR_TRACE, "RESET: Special wakeup complete after %ld us\n",
+		      tb_to_usecs(now - stamp));
+		return 0;
+	}
+
+	/*
+	 * We timed out ...
+	 *
+	 * HWP has a complex workaround for HW255321 which affects
+	 * Murano DD1 and Venice DD1. Ignore that for now
+	 *
+	 * Instead we just dump some XSCOMs for error logging
+	 */
+	prerror("RESET: Timeout on special wakeup of 0x%0x\n", cpu->pir);
+	prerror("RESET:      PM0 = 0x%016llx\n", val);
+	val = -1;
+	xscom_read(cpu->chip_id,
+		   XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_SPECIAL_WAKEUP_PHYP),
+		   &val);
+	prerror("RESET: SPC_WKUP = 0x%016llx\n", val);
+	val = -1;
+	xscom_read(cpu->chip_id,
+		   XSCOM_ADDR_P8_EX_SLAVE(core_id,
+					  EX_PM_IDLE_STATE_HISTORY_PHYP),
+		   &val);
+	prerror("RESET:  HISTORY = 0x%016llx\n", val);
+
+	return OPAL_HARDWARE;
 }
 
-static bool do_reset_core_p7(struct cpu_thread *cpu)
+static int clr_special_wakeup(struct cpu_thread *cpu)
 {
-	uint32_t xscom_addr, chip;
-	uint64_t ctl;
+	uint64_t val;
+	uint32_t core_id;
 	int rc;
 
-	/* Add the Core# */
-	xscom_addr = EX0_TCTL_DIRECT_CONTROLS0;
-	xscom_addr |= ((cpu->pir >> 2) & 7) << 24;
+	/*
+	 * Note: HWP checks for checkstops, but I assume we don't need to
+	 * as we wouldn't be running if one was present
+	 */
+
+	/* Grab core ID once */
+	core_id = pir_to_core_id(cpu->pir);
 
-	chip = pir_to_chip_id(cpu->pir);
+	prlog(PR_DEBUG, "RESET: Releasing core 0x%x wakeup\n", core_id);
 
-	ctl = TCTL_DC_SRESET_REQUEST;
-	rc = xscom_write(chip, xscom_addr, ctl);
-	rc |= xscom_write(chip, xscom_addr + 0x40, ctl);
-	rc |= xscom_write(chip, xscom_addr + 0x80, ctl);
-	rc |= xscom_write(chip, xscom_addr + 0xc0, ctl);
+	/*
+	 * The original HWp reads the XSCOM first but ignores the result
+	 * and error, let's do the same until I know for sure that is
+	 * not necessary
+	 */
+	xscom_read(cpu->chip_id,
+		   XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_SPECIAL_WAKEUP_PHYP),
+		   &val);
+
+	/* Then we write special wakeup */
+	rc = xscom_write(cpu->chip_id,
+			 XSCOM_ADDR_P8_EX_SLAVE(core_id,
+						EX_PM_SPECIAL_WAKEUP_PHYP), 0);
 	if (rc) {
-		prerror("RESET: Error %d resetting CPU 0x%04x\n",
-			rc, cpu->pir);
-		return false;
+		prerror("RESET: XSCOM error %d deasserting"
+			" special wakeup on 0x%x\n", rc, cpu->pir);
+		return rc;
 	}
-	return true;
+
+	/*
+	 * The original HWp reads the XSCOM again with the comment
+	 * "This puts an inherent delay in the propagation of the reset
+	 * transition"
+	 */
+	xscom_read(cpu->chip_id,
+		   XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_SPECIAL_WAKEUP_PHYP),
+		   &val);
+
+	return 0;
 }
 
-static void fast_reset_p7(void)
+static void set_direct_ctl(struct cpu_thread *cpu, uint64_t bits)
 {
-	struct cpu_thread *cpu;
+	uint32_t core_id = pir_to_core_id(cpu->pir);
+	uint32_t chip_id = pir_to_chip_id(cpu->pir);
+	uint32_t thread_id = pir_to_thread_id(cpu->pir);
+	uint32_t xscom_addr;
 
-	resettee = this_cpu();
-	resettor = NULL;
+	xscom_addr = XSCOM_ADDR_P8_EX(core_id, P8_EX_TCTL_DIRECT_CONTROLS(thread_id));
 
-	/* Pick up a candidate resettor. We do that before we flush
-	 * the caches
-	 */
-	for_each_cpu(cpu) {
-		/*
-		 * Some threads might still be in skiboot.
-		 *
-		 * But because we deal with entire cores and we don't want
-		 * to special case things, we are just going to reset them
-		 * too making the assumption that this is safe, they are
-		 * holding no locks. This can only be true if they don't
-		 * have jobs scheduled which is hopefully the case.
-		 */
-		if (cpu->state != cpu_state_os &&
-		    cpu->state != cpu_state_active)
-			continue;
+	xscom_write(chip_id, xscom_addr, bits);
+}
 
-		/*
-		 * Only hit cores and only if they aren't on the same core
-		 * as ourselves
-		 */
-		if (cpu_get_thread0(cpu) == cpu_get_thread0(this_cpu()) ||
-		    cpu->pir & 0x3)
-			continue;
+static void patch_reset_vector(void)
+{
+	extern uint32_t fast_reset_patch_start;
+	extern uint32_t fast_reset_patch_end;
+	uint32_t *dst, *src;
+
+	/* Copy reset trampoline */
+	prlog(PR_DEBUG, "RESET: Copying reset trampoline...\n");
+	src = &fast_reset_patch_start;
+	dst = (uint32_t *)0x100;
+	while(src < &fast_reset_patch_end)
+		*(dst++) = *(src++);
+	sync_icache();
+}
+
+static bool fast_reset_p8(void)
+{
+	struct cpu_thread *cpu;
+
+	/* Mark ourselves as last man standing in need of a reset */
+	last_man_standing = this_cpu();
 
-		/* Pick up one of those guys as our "resettor". It will be
-		 * in charge of resetting this CPU. We avoid resetting
-		 * ourselves, not sure how well it would do with SCOM
-		 */
-		resettor = cpu;
-		break;
+	prlog(PR_DEBUG, "RESET: Resetting from cpu: 0x%x (core 0x%x)\n",
+	      this_cpu()->pir, pir_to_core_id(this_cpu()->pir));
+
+	/* Assert special wakeup on all cores */
+	for_each_cpu(cpu) {
+		if (cpu->primary == cpu)
+			if (set_special_wakeup(cpu) != OPAL_SUCCESS)
+				return false;
 	}
 
-	if (!resettor) {
-		printf("RESET: Can't find a resettor !\n");
-		return;
+	prlog(PR_DEBUG, "RESET: Stopping the world...\n");
+
+	/* Put everybody in stop except myself */
+	for_each_cpu(cpu) {
+		if (cpu != this_cpu())
+			set_direct_ctl(cpu, P8_DIRECT_CTL_STOP);
 	}
-	printf("RESET: Resetting from 0x%04x, resettor 0x%04x\n",
-	       this_cpu()->pir, resettor->pir);
 
-	printf("RESET: Flushing caches...\n");
+	/* Patch reset */
+	patch_reset_vector();
 
-	/* Is that necessary ? */
-	flush_caches();
+	prlog(PR_DEBUG, "RESET: Pre-napping all threads but one...\n");
 
-	/* Reset everybody except self and except resettor */
+	/* Put everybody in pre-nap except myself */
 	for_each_cpu(cpu) {
-		if (cpu->state != cpu_state_os &&
-		    cpu->state != cpu_state_active)
-			continue;
-		if (cpu_get_thread0(cpu) == cpu_get_thread0(this_cpu()) ||
-		    cpu->pir & 0x3)
-			continue;
-		if (cpu_get_thread0(cpu) == cpu_get_thread0(resettor))
-			continue;
+		if (cpu != this_cpu())
+			set_direct_ctl(cpu, P8_DIRECT_CTL_PRENAP);
+	}
 
-		printf("RESET: Resetting CPU 0x%04x...\n", cpu->pir);
+	prlog(PR_DEBUG, "RESET: Resetting all threads but one...\n");
 
-		if (!do_reset_core_p7(cpu))
-			return;
+	/* Reset everybody except myself (the other threads of my core included) */
+	for_each_cpu(cpu) {
+		if (cpu != this_cpu())
+			set_direct_ctl(cpu, P8_DIRECT_CTL_SRESET);
 	}
 
-	/* Reset the resettor last because it's going to kill me ! */
-	printf("RESET: Resetting CPU 0x%04x...\n", resettor->pir);
-	if (!do_reset_core_p7(resettor))
-		return;
-
-	/* Don't return */
-	for (;;)
-		;
+	return true;
 }
 
-void fast_reset(void)
+void fast_reboot(void)
 {
-	uint32_t pvr = mfspr(SPR_PVR);
-	extern uint32_t fast_reset_patch_start;
-	extern uint32_t fast_reset_patch_end;
-	uint32_t *dst, *src;
+	bool success;
 
-	printf("RESET: Fast reboot request !\n");
+	if (proc_gen != proc_gen_p8)
+		return;
+
+	prlog(PR_INFO, "RESET: Initiating fast reboot...\n");
 
 	/* XXX We need a way to ensure that no other CPU is in skiboot
 	 * holding locks (via the OPAL APIs) and if they are, we need
-	 * for them to get out
+	 * for them to get out. Hopefully that isn't happening, but...
+	 *
+	 * To fix this properly, we want to keep track of OPAL entry/exit
+	 * on all CPUs.
 	 */
 	reboot_in_progress = 1;
 	time_wait_ms(200);
 
-	/* Copy reset trampoline */
-	printf("RESET: Copying reset trampoline...\n");
-	src = &fast_reset_patch_start;
-	dst = (uint32_t *)0x100;
-	while(src < &fast_reset_patch_end)
-		*(dst++) = *(src++);
-	sync_icache();
+	/* Lock so the new guys coming don't reset us */
+	lock(&reset_lock);
 
-	switch(PVR_TYPE(pvr)) {
-	case PVR_TYPE_P7:
-	case PVR_TYPE_P7P:
-		fast_reset_p7();
-	}
+	fast_boot_release = false;
+
+	success = fast_reset_p8();
+
+	/* Unlock, at this point we go away */
+	unlock(&reset_lock);
+
+	if (success)
+		/* Don't return */
+		for (;;)
+			;
 }
 
 static void cleanup_cpu_state(void)
@@ -211,56 +315,24 @@ static void cleanup_cpu_state(void)
 	reset_cpu_icp();
 }
 
-#ifdef FAST_REBOOT_CLEARS_MEMORY
-static void fast_mem_clear(uint64_t start, uint64_t end)
-{
-	printf("MEMORY: Clearing %llx..%llx\n", start, end);
-
-	while(start < end) {
-		asm volatile("dcbz 0,%0" : : "r" (start) : "memory");
-		start += 128;
-	}
-}
-
-static void memory_reset(void)
-{
-	struct address_range *i;
-	uint64_t skistart = SKIBOOT_BASE;
-	uint64_t skiend = SKIBOOT_BASE + SKIBOOT_SIZE;
-
-	printf("MEMORY: Clearing ...\n");
-
-	list_for_each(&address_ranges, i, list) {
-		uint64_t start = cleanup_addr(i->arange->start);
-		uint64_t end = cleanup_addr(i->arange->end);
-
-		if (start >= skiend || end <= skistart)
-			fast_mem_clear(start, end);
-		else {
-			if (start < skistart)
-				fast_mem_clear(start, skistart);
-			if (end > skiend)
-				fast_mem_clear(skiend, end);
-		}
-	}
-}
-#endif /* FAST_REBOOT_CLEARS_MEMORY */
-
 /* Entry from asm after a fast reset */
-void __noreturn fast_reboot(void);
+void __noreturn fast_reboot_entry(void);
 
-void __noreturn fast_reboot(void)
+void __noreturn fast_reboot_entry(void)
 {
-	static volatile bool fast_boot_release;
 	struct cpu_thread *cpu;
 
-	printf("INIT: CPU PIR 0x%04x reset in\n", this_cpu()->pir);
+	prlog(PR_DEBUG, "RESET: CPU PIR 0x%04x reset in\n", this_cpu()->pir);
+	time_wait_ms(100);
 
-	/* If this CPU was chosen as the resettor, it must reset the
-	 * resettee (the one that initiated the whole process
-	 */
-	if (this_cpu() == resettor)
-		do_reset_core_p7(resettee);
+	lock(&reset_lock);
+	if (last_man_standing) {
+		prlog(PR_DEBUG, "RESET: last man standing fixup...\n");
+		set_direct_ctl(last_man_standing, P8_DIRECT_CTL_PRENAP);
+		set_direct_ctl(last_man_standing, P8_DIRECT_CTL_SRESET);
+		last_man_standing = NULL;
+	}
+	unlock(&reset_lock);
 
 	/* Are we the original boot CPU ? If not, we spin waiting
 	 * for a relase signal from CPU 1, then we clean ourselves
@@ -277,6 +349,8 @@ void __noreturn fast_reboot(void)
 		__secondary_cpu_entry();
 	}
 
+	prlog(PR_INFO, "RESET: Boot CPU waiting for everybody...\n");
+
 	/* We are the original boot CPU, wait for secondaries to
 	 * be captured
 	 */
@@ -292,7 +366,7 @@ void __noreturn fast_reboot(void)
 		smt_medium();
 	}
 
-	printf("INIT: Releasing secondaries...\n");
+	prlog(PR_INFO, "RESET: Releasing secondaries...\n");
 
 	/* Release everybody */
 	fast_boot_release = true;
@@ -310,7 +384,14 @@ void __noreturn fast_reboot(void)
 		}
 	}
 
-	printf("INIT: All done, resetting everything else...\n");
+	prlog(PR_DEBUG, "RESET: Releasing special wakeups...\n");
+
+	for_each_cpu(cpu) {
+		if (cpu->primary == cpu)
+			clr_special_wakeup(cpu);
+	}
+
+	prlog(PR_INFO, "RESET: All done, cleaning up...\n");
 
 	/* Clear release flag for next time */
 	fast_boot_release = false;
@@ -322,6 +403,8 @@ void __noreturn fast_reboot(void)
 	/* Set our state to active */
 	this_cpu()->state = cpu_state_active;
 
+	start_preload_kernel();
+
 	/* Poke the consoles (see comments in the code there) */
 	fsp_console_reset();
 
@@ -331,15 +414,6 @@ void __noreturn fast_reboot(void)
 	/* Remove all PCI devices */
 	pci_reset();
 
-	/* Reset IO Hubs */
-	cec_reset();
-
-	/* Re-Initialize all discovered PCI slots */
-	pci_init_slots();
-
-	/* Clear memory */
-#ifdef FAST_REBOOT_CLEARS_MEMORY
-	memory_reset();
-#endif
+	/* Load and boot payload */
 	load_and_boot_kernel(true);
 }
diff --git a/core/init.c b/core/init.c
index ca3ad55..1a3d741 100644
--- a/core/init.c
+++ b/core/init.c
@@ -287,7 +287,7 @@ extern uint64_t boot_offset;
 static size_t kernel_size;
 static size_t initramfs_size;
 
-static bool start_preload_kernel(void)
+bool start_preload_kernel(void)
 {
 	int loaded;
 
@@ -384,6 +384,9 @@ static void load_initramfs(void)
 {
 	int loaded;
 
+	dt_check_del_prop(dt_chosen, "linux,initrd-start");
+	dt_check_del_prop(dt_chosen, "linux,initrd-end");
+
 	loaded = wait_for_resource_loaded(RESOURCE_ID_INITRAMFS,
 					  RESOURCE_SUBID_NONE);
 
@@ -447,6 +450,7 @@ void __noreturn load_and_boot_kernel(bool is_reboot)
 
 	/* Set kernel command line argument if specified */
 #ifdef KERNEL_COMMAND_LINE
+	dt_check_del_prop(dt_chosen, "bootargs");
 	dt_add_property_string(dt_chosen, "bootargs", KERNEL_COMMAND_LINE);
 #endif
 
diff --git a/core/lock.c b/core/lock.c
index 53cc337..e82048b 100644
--- a/core/lock.c
+++ b/core/lock.c
@@ -110,6 +110,9 @@ void unlock(struct lock *l)
 	this_cpu()->lock_depth--;
 	l->lock_val = 0;
 
+	/* WARNING: On fast reboot, we can be reset right at that
+	 * point, so the reset_lock in there cannot be in the con path
+	 */
 	if (l->in_con_path) {
 		cpu->con_suspend--;
 		if (cpu->con_suspend == 0 && cpu->con_need_flush)
diff --git a/core/pci.c b/core/pci.c
index cbaea35..bbf4583 100644
--- a/core/pci.c
+++ b/core/pci.c
@@ -1456,6 +1456,7 @@ static void __pci_reset(struct list_head *list)
 
 	while ((pd = list_pop(list, struct pci_device, link)) != NULL) {
 		__pci_reset(&pd->children);
+		dt_free(pd->dn);
 		free(pd);
 	}
 }
@@ -1466,16 +1467,22 @@ void pci_reset(void)
 
 	prlog(PR_NOTICE, "PCI: Clearing all devices...\n");
 
-	/* This is a remnant of fast-reboot, not currently used */
 
 	/* XXX Do those in parallel (at least the power up
 	 * state machine could be done in parallel)
 	 */
 	for (i = 0; i < ARRAY_SIZE(phbs); i++) {
-		if (!phbs[i])
+		struct phb *phb = phbs[i];
+		if (!phb)
 			continue;
-		__pci_reset(&phbs[i]->devices);
+		__pci_reset(&phb->devices);
+		if (phb->ops->ioda_reset)
+			phb->ops->ioda_reset(phb, true);
 	}
+
+	/* Re-Initialize all discovered PCI slots */
+	pci_init_slots();
+
 }
 
 static void pci_do_jobs(void (*fn)(void *))
diff --git a/core/platform.c b/core/platform.c
index de6e406..7915857 100644
--- a/core/platform.c
+++ b/core/platform.c
@@ -52,9 +52,9 @@ static int64_t opal_cec_reboot(void)
 
 	console_complete_flush();
 
-#ifdef ENABLE_FAST_RESET
+#ifdef ENABLE_FAST_REBOOT
 	/* Try a fast reset first */
-	fast_reset();
+	fast_reboot();
 #endif
 	if (platform.cec_reboot)
 		return platform.cec_reboot();
diff --git a/hw/fsp/fsp-console.c b/hw/fsp/fsp-console.c
index 87e509d..5e27197 100644
--- a/hw/fsp/fsp-console.c
+++ b/hw/fsp/fsp-console.c
@@ -884,6 +884,9 @@ static void reopen_all_hvsi(void)
 
 void fsp_console_reset(void)
 {
+	if (!fsp_present())
+		return;
+
 	prlog(PR_NOTICE, "FSP: Console reset !\n");
 
 	/* This is called on a fast-reset. To work around issues with HVSI
@@ -985,6 +988,8 @@ void fsp_console_select_stdout(void)
 			 */
 		}
 	}
+	dt_check_del_prop(dt_chosen, "linux,stdout-path");
+
 	if (fsp_serials[1].open && use_serial) {
 		dt_add_property_string(dt_chosen, "linux,stdout-path",
 				       "/ibm,opal/consoles/serial at 1");
diff --git a/hw/occ.c b/hw/occ.c
index b606a67..3d86f7a 100644
--- a/hw/occ.c
+++ b/hw/occ.c
@@ -517,10 +517,14 @@ void occ_pstates_init(void)
 	struct proc_chip *chip;
 	struct cpu_thread *c;
 	s8 pstate_nom;
+	static bool occ_pstates_initialized;
 
 	/* OCC is P8 only */
 	if (proc_gen != proc_gen_p8)
 		return;
+	/* Handle fast reboots */
+	if (occ_pstates_initialized)
+		return;
 
 	chip = next_chip(NULL);
 	if (!chip->homer_base) {
@@ -558,6 +562,7 @@ void occ_pstates_init(void)
 	for_each_chip(chip)
 		chip->throttle = 0;
 	opal_add_poller(occ_throttle_poll, NULL);
+	occ_pstates_initialized = true;
 }
 
 struct occ_load_req {
diff --git a/hw/psi.c b/hw/psi.c
index 3efc177..03527f6 100644
--- a/hw/psi.c
+++ b/hw/psi.c
@@ -432,34 +432,25 @@ static int64_t psi_p7_get_xive(struct irq_source *is, uint32_t isn __unused,
 	return OPAL_SUCCESS;
 }
 
+static const uint32_t psi_p8_irq_to_xivr[P8_IRQ_PSI_ALL_COUNT] = {
+	[P8_IRQ_PSI_FSP]	= PSIHB_XIVR_FSP,
+	[P8_IRQ_PSI_OCC]	= PSIHB_XIVR_OCC,
+	[P8_IRQ_PSI_FSI]	= PSIHB_XIVR_FSI,
+	[P8_IRQ_PSI_LPC]	= PSIHB_XIVR_LPC,
+	[P8_IRQ_PSI_LOCAL_ERR]	= PSIHB_XIVR_LOCAL_ERR,
+	[P8_IRQ_PSI_HOST_ERR]	= PSIHB_XIVR_HOST_ERR,
+};
+
 static int64_t psi_p8_set_xive(struct irq_source *is, uint32_t isn,
 			       uint16_t server, uint8_t priority)
 {
 	struct psi *psi = is->data;
 	uint64_t xivr_p, xivr;
+	uint32_t irq_idx = isn & 7;
 
-	switch(isn & 7) {
-	case P8_IRQ_PSI_FSP:
-		xivr_p = PSIHB_XIVR_FSP;
-		break;
-	case P8_IRQ_PSI_OCC:
-		xivr_p = PSIHB_XIVR_OCC;
-		break;
-	case P8_IRQ_PSI_FSI:
-		xivr_p = PSIHB_XIVR_FSI;
-		break;
-	case P8_IRQ_PSI_LPC:
-		xivr_p = PSIHB_XIVR_LPC;
-		break;
-	case P8_IRQ_PSI_LOCAL_ERR:
-		xivr_p = PSIHB_XIVR_LOCAL_ERR;
-		break;
-	case P8_IRQ_PSI_HOST_ERR:
-		xivr_p = PSIHB_XIVR_HOST_ERR;
-		break;
-	default:
+	if (irq_idx >= P8_IRQ_PSI_ALL_COUNT)
 		return OPAL_PARAMETER;
-	}
+	xivr_p = psi_p8_irq_to_xivr[irq_idx];
 
 	/* Populate the XIVR */
 	xivr  = (uint64_t)server << 40;
@@ -476,29 +467,11 @@ static int64_t psi_p8_get_xive(struct irq_source *is, uint32_t isn __unused,
 {
 	struct psi *psi = is->data;
 	uint64_t xivr_p, xivr;
+	uint32_t irq_idx = isn & 7;
 
-	switch(isn & 7) {
-	case P8_IRQ_PSI_FSP:
-		xivr_p = PSIHB_XIVR_FSP;
-		break;
-	case P8_IRQ_PSI_OCC:
-		xivr_p = PSIHB_XIVR_OCC;
-		break;
-	case P8_IRQ_PSI_FSI:
-		xivr_p = PSIHB_XIVR_FSI;
-		break;
-	case P8_IRQ_PSI_LPC:
-		xivr_p = PSIHB_XIVR_LPC;
-		break;
-	case P8_IRQ_PSI_LOCAL_ERR:
-		xivr_p = PSIHB_XIVR_LOCAL_ERR;
-		break;
-	case P8_IRQ_PSI_HOST_ERR:
-		xivr_p = PSIHB_XIVR_HOST_ERR;
-		break;
-	default:
+	if (irq_idx >= P8_IRQ_PSI_ALL_COUNT)
 		return OPAL_PARAMETER;
-	}
+	xivr_p = psi_p8_irq_to_xivr[irq_idx];
 
 	/* Read & decode the XIVR */
 	xivr = in_be64(psi->regs + xivr_p);
@@ -509,33 +482,41 @@ static int64_t psi_p8_get_xive(struct irq_source *is, uint32_t isn __unused,
 	return OPAL_SUCCESS;
 }
 
+static void psi_cleanup_irq(struct psi *psi)
+{
+	uint32_t irq;
+	uint64_t xivr, xivr_p;
+
+	for (irq = 0; irq < P8_IRQ_PSI_ALL_COUNT; irq++) {
+		printf("PSI[0x%03x]: Cleaning up IRQ %d\n",
+		       psi->chip_id, irq);
+
+		xivr_p = psi_p8_irq_to_xivr[irq];
+		xivr = in_be64(psi->regs + xivr_p);
+		xivr |= (0xffull << 32);
+		out_be64(psi->regs + xivr_p, xivr);
+		time_wait_ms_nopoll(10);
+		xivr = in_be64(psi->regs + xivr_p);
+		if (xivr & PPC_BIT(39)) {
+			printf(" Need EOI !\n");
+			icp_send_eoi(psi->interrupt + irq);
+		}
+	}
+}
+
 /* Called on a fast reset, make sure we aren't stuck with
  * an accepted and never EOId PSI interrupt
  */
 void psi_irq_reset(void)
 {
 	struct psi *psi;
-	uint64_t xivr;
 
 	printf("PSI: Hot reset!\n");
 
-	assert(proc_gen == proc_gen_p7);
+	assert(proc_gen == proc_gen_p8);
 
 	list_for_each(&psis, psi, list) {
-		/* Mask the interrupt & clean the XIVR */
-		xivr = 0x000000ff00000000UL;
-		xivr |=	P7_IRQ_BUID(psi->interrupt) << 16;
-		out_be64(psi->regs + PSIHB_XIVR, xivr);
-
-#if 0 /* Seems to checkstop ... */
-		/*
-		 * Maybe not anymore; we were just blindly sending
-		 * this on all iopaths, not just the active one;
-		 * We don't even know if those psis are even correct.
-		 */
-		/* Send a dummy EOI to make sure the ICP is clear */
-		icp_send_eoi(psi->interrupt);
-#endif
+		psi_cleanup_irq(psi);
 	}
 }
 
diff --git a/include/config.h b/include/config.h
index 2524570..3163c65 100644
--- a/include/config.h
+++ b/include/config.h
@@ -72,8 +72,8 @@
  */
 //#define FORCE_DUMMY_CONSOLE 1
 
-/* Enable this to do fast resets. Currently unreliable... */
-//#define ENABLE_FAST_RESET	1
+/* Enable this to do fast reboots. Currently unreliable... */
+#define ENABLE_FAST_REBOOT	1
 
 /* Enable this to make fast reboot clear memory */
 //#define FAST_REBOOT_CLEARS_MEMORY	1
diff --git a/include/device.h b/include/device.h
index ed4fc46..4198a41 100644
--- a/include/device.h
+++ b/include/device.h
@@ -119,6 +119,8 @@ static inline struct dt_property *dt_add_property_u64(struct dt_node *node,
 
 void dt_del_property(struct dt_node *node, struct dt_property *prop);
 
+void dt_check_del_prop(struct dt_node *node, const char *name);
+
 /* Warning: moves *prop! */
 void dt_resize_property(struct dt_property **prop, size_t len);
 
diff --git a/include/skiboot.h b/include/skiboot.h
index 72cda14..d073cf5 100644
--- a/include/skiboot.h
+++ b/include/skiboot.h
@@ -190,12 +190,13 @@ extern unsigned long get_symbol(unsigned long addr,
 				char **sym, char **sym_end);
 
 /* Fast reboot support */
-extern void fast_reset(void);
+extern void fast_reboot(void);
 extern void __noreturn __secondary_cpu_entry(void);
 extern void __noreturn load_and_boot_kernel(bool is_reboot);
 extern void cleanup_tlb(void);
 extern void init_shared_sprs(void);
 extern void init_replicated_sprs(void);
+extern bool start_preload_kernel(void);
 
 /* Various probe routines, to replace with an initcall system */
 extern void probe_p7ioc(void);



More information about the Skiboot mailing list