[Skiboot] [PATCH] [v3] Fast reboot for P8

Benjamin Herrenschmidt benh at kernel.crashing.org
Sun Jul 24 09:32:10 AEST 2016


This is an experimental patch that implements "Fast reboot" on P8
machines.

The basic idea is that when the OS calls OPAL reboot, we gather all
the threads in the system using a combination of patching the reset
vector and soft-resetting them, then clean up a few bits of hardware
(we do re-probe PCIe for example), and reload & restart the bootloader.

This is very experimental and needs a lot of testing and also auditing
code for other bits of HW that might need to be cleaned up. I also need
to check if we are properly PERST'ing PCI devices.

I've successfully fast rebooted a Habanero a few times.

This is partially based on old code I had to do that on P7. I only
support it on P8 though as there are issues with the PSI interrupts
on P7 that cannot be reliably solved.

Not-yet-signed-off-by: Benjamin Herrenschmidt <benh at kernel.crashing.org>
---

v2. Properly clean up the TLB
    Handle when OS had cores in fast sleep (restore TB and undo workaround)
    Untested attempt at dealing with split cores (unsplit them)
    Additional cleanups

v3. Rebased on top of the NAP series
    More random fixes I didn't keep track of

TODO: We need to disable fast reboot under some circumstances:

  - When something checkstopped (NPU, NX, ...), we should check FIRs

  - When CAPI is enabled on a PHB since we can't currently switch
    it back to normal PCI and Linux won't deal with a PHB coming
    up already in CAPI mode

  - Split the patch more
---
 asm/head.S           | 122 ++++++------
 core/fast-reboot.c   | 521 +++++++++++++++++++++++++++++++++------------------
 core/init.c          |   8 +-
 core/lock.c          |   3 +
 core/pci.c           |  12 +-
 core/platform.c      |   4 +-
 hw/fsp/fsp-console.c |   5 +
 hw/fsp/fsp-leds.c    |   3 +
 hw/occ.c             |   5 +
 hw/psi.c             |  98 ++++------
 hw/slw.c             |   7 +-
 include/config.h     |   2 +-
 include/cpu.h        |   1 +
 include/processor.h  |   1 +
 include/skiboot.h    |   9 +-
 15 files changed, 494 insertions(+), 307 deletions(-)

diff --git a/asm/head.S b/asm/head.S
index 6b180ca..e74d3a5 100644
--- a/asm/head.S
+++ b/asm/head.S
@@ -462,64 +462,6 @@ call_relocate:
 1:	/* Fatal relocate failure */
 	attn
 
-/* This is a little piece of code that is copied down to
- * 0x100 when doing a "fast reset"
- */
-.global fast_reset_patch_start
-fast_reset_patch_start:	
-	smt_medium
-	LOAD_IMM64(%r30, SKIBOOT_BASE)
-	LOAD_IMM32(%r3, fast_reset_entry - __head)
-	add	%r3,%r30,%r3
-	mtctr	%r3
-	bctr
-.global fast_reset_patch_end
-fast_reset_patch_end:
-
-/* Fast reset code. We clean up the TLB and a few SPRs and
- * return to C code. All CPUs do that, the CPU triggering the
- * reset does it to itself last. The C code will sort out who
- * the master is. We come from the trampoline above with
- * r30 containing SKIBOOT_BASE
- */
-fast_reset_entry:
-	/* Clear out SLB */
-	li	%r6,0
-	slbmte	%r6,%r6
-	slbia
-	ptesync
-
-	/* Get PIR */
-	mfspr	%r31,SPR_PIR
-
-	/* Get a stack and restore r13 */
-	GET_STACK(%r1,%r31)
-	li	%r3,0
-	std	%r3,0(%r1)
-	std	%r3,8(%r1)
-	std	%r3,16(%r1)
-	GET_CPU()
-
-	/* Get our TOC */
-	addis	%r2,%r30,(__toc_start - __head)@ha
-	addi	%r2,%r2,(__toc_start - __head)@l
-
-	/* Go to C ! */
-	bl	fast_reboot
-	b	.
-
-.global cleanup_tlb
-cleanup_tlb:
-	/* Clean the TLB */
-	li	%r3,128
-	mtctr	%r3
-	li	%r4,0x800		/* IS field = 0b10 */
-	ptesync
-1:	tlbiel	%r4
-	addi	%r4,%r4,0x1000
-	bdnz	1b
-	ptesync
-
 #define FIXUP_ENDIAN                                              \
        tdi   0,0,0x48;   /* Reverse endian of b . + 8          */ \
        b     $+36;       /* Skip trampoline if endian is good  */ \
@@ -628,7 +570,12 @@ reset_wakeup:
 	GET_CPU()
 
 	/* Restore original stack pointer */
-	ld	%r1,CPUTHREAD_SAVE_R1(%r13)
+	ld	%r3,CPUTHREAD_SAVE_R1(%r13)
+
+	/* If it's 0, we are doing a fast reboot */
+	cmpldi	%r3,0
+	beq	fast_reset_entry
+	mr	%r1,%r3
 
 	/* Restore more stuff */
 	lwz	%r3,STACK_CR(%r1)
@@ -665,6 +612,46 @@ reset_wakeup:
 	mtlr	%r0
 	blr
 
+/* Fast reset code. We clean up the TLB and a few SPRs and
+ * return to C code. All CPUs do that, the CPU triggering the
+ * reset does it to itself last. The C code will sort out who
+ * the master is. We come from reset_wakeup above (saved r1 is 0)
+ * with r30 containing SKIBOOT_BASE
+ */
+fast_reset_entry:
+	/* Clear out SLB */
+	li	%r6,0
+	slbmte	%r6,%r6
+	slbia
+	ptesync
+
+	/* Dummy stack frame */
+	li	%r3,0
+	std	%r3,0(%r1)
+	std	%r3,8(%r1)
+	std	%r3,16(%r1)
+
+	/* Get our TOC */
+	addis	%r2,%r30,(__toc_start - __head)@ha
+	addi	%r2,%r2,(__toc_start - __head)@l
+
+	/* Go to C ! */
+	bl	fast_reboot_entry
+	b	.
+
+.global cleanup_tlb
+cleanup_tlb:
+	/* Clean the TLB */
+	li	%r3,512
+	mtctr	%r3
+	li	%r4,0xc00		/* IS field = 0b11 */
+	ptesync
+1:	tlbiel	%r4
+	addi	%r4,%r4,0x1000
+	bdnz	1b
+	ptesync
+	blr
+
 /* Functions to initialize replicated and shared SPRs to sane
  * values. This is called at boot and on soft-reset
  */
@@ -712,10 +699,14 @@ init_shared_sprs:
 	mtspr	SPR_LPCR,%r3
 	sync
 	isync
-	/* HID0: Clear bit 13 (enable core recovery) */
+	/* HID0: Clear bit 13 (enable core recovery)
+	 *       Clear bit 19 (HILE)
+	 */
 	mfspr	%r3,SPR_HID0
 	li	%r0,1
-	sldi	%r0,%r0,(63-13)
+	sldi	%r4,%r0,(63-13)
+	sldi	%r5,%r0,(63-19)
+	or	%r0,%r4,%r5,
 	andc	%r3,%r3,%r0
 	sync
 	mtspr	SPR_HID0,%r3
@@ -747,6 +738,15 @@ init_replicated_sprs:
 	/* XXX TODO: Add more */
 	blr
 
+	.global enter_nap
+enter_nap:
+	std	%r0,0(%r1)
+	ptesync
+	ld	%r0,0(%r1)
+1:	cmp	%cr0,%r0,%r0
+	bne	1b
+	nap
+	b	.
 /*
  *
  * NACA structure, accessed by the FPS to find the SPIRA
diff --git a/core/fast-reboot.c b/core/fast-reboot.c
index 30b77e9..1caaec3 100644
--- a/core/fast-reboot.c
+++ b/core/fast-reboot.c
@@ -25,242 +25,401 @@
 #include <timebase.h>
 #include <pci.h>
 #include <chip.h>
+#include <chiptod.h>
+
+#define P8_EX_TCTL_DIRECT_CONTROLS(t)	(0x10013000 + (t) * 0x10)
+#define P8_DIRECT_CTL_STOP		PPC_BIT(63)
+#define P8_DIRECT_CTL_PRENAP		PPC_BIT(47)
+#define P8_DIRECT_CTL_SRESET		PPC_BIT(60)
 
-/*
- * To get control of all threads, we sreset them via XSCOM after
- * patching the 0x100 vector. This will work as long as the target
- * HRMOR is 0. If Linux ever uses HRMOR, we'll have to consider
- * a more messy approach.
- *
- * The SCOM register we want is called "Core RAS Control" in the doc
- * and EX0.EC.PC.TCTL_GENERATE#0.TCTL.DIRECT_CONTROLS in the SCOM list
- *
- * Bits in there change from CPU rev to CPU rev but the bit we care
- * about, bit 60 "sreset_request" appears to have stuck to the same
- * place in both P7 and P7+. The register also has the same SCOM
- * address
- */
-#define EX0_TCTL_DIRECT_CONTROLS0	0x08010400
-#define EX0_TCTL_DIRECT_CONTROLS1	0x08010440
-#define EX0_TCTL_DIRECT_CONTROLS2	0x08010480
-#define EX0_TCTL_DIRECT_CONTROLS3	0x080104c0
-#define   TCTL_DC_SRESET_REQUEST	PPC_BIT(60)
 
 /* Flag tested by the OPAL entry code */
 uint8_t reboot_in_progress;
-static struct cpu_thread *resettor, *resettee;
+static volatile bool fast_boot_release;
+static struct cpu_thread *last_man_standing;
+static struct lock reset_lock = LOCK_UNLOCKED;
 
-static void flush_caches(void)
+static int set_special_wakeup(struct cpu_thread *cpu)
 {
-	uint64_t base = SKIBOOT_BASE;
-	uint64_t end = base + SKIBOOT_SIZE;
+	uint64_t val, poll_target, stamp;
+	uint32_t core_id;
+	int rc;
+
+	/*
+	 * Note: HWP checks for checkstops, but I assume we don't need to
+	 * as we wouldn't be running if one was present
+	 */
 
-	/* Not sure what the effect of sreset is on cores, so let's
-	 * shoot a series of dcbf's on all cachelines that make up
-	 * our core memory just in case...
+	/* Grab core ID once */
+	core_id = pir_to_core_id(cpu->pir);
+
+	prlog(PR_DEBUG, "RESET Waking up core 0x%x\n", core_id);
+
+	/*
+	 * The original HWp reads the XSCOM first but ignores the result
+	 * and error, let's do the same until I know for sure that is
+	 * not necessary
 	 */
-	while(base < end) {
-		asm volatile("dcbf 0,%0" : : "r" (base) : "memory");
-		base += 128;
+	xscom_read(cpu->chip_id,
+		   XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_SPECIAL_WAKEUP_PHYP),
+		   &val);
+
+	/* Then we write special wakeup */
+	rc = xscom_write(cpu->chip_id,
+			 XSCOM_ADDR_P8_EX_SLAVE(core_id,
+						EX_PM_SPECIAL_WAKEUP_PHYP),
+			 PPC_BIT(0));
+	if (rc) {
+		prerror("RESET: XSCOM error %d asserting special"
+			" wakeup on 0x%x\n", rc, cpu->pir);
+		return rc;
 	}
-	sync();
+
+	/*
+	 * HWP uses the history for Perf register here, dunno why it uses
+	 * that one instead of the pHyp one, maybe to avoid clobbering it...
+	 *
+	 * In any case, it does that to check for run/nap vs.sleep/winkle/other
+	 * to decide whether to poll on checkstop or not. Since we don't deal
+	 * with checkstop conditions here, we ignore that part.
+	 */
+
+	/*
+	 * Now poll for completion of special wakeup. The HWP is nasty here,
+	 * it will poll at 5ms intervals for up to 200ms. This is not quite
+	 * acceptable for us at runtime, at least not until we have the
+	 * ability to "context switch" HBRT. In practice, because we don't
+	 * winkle, it will never take that long, so we increase the polling
+	 * frequency to 1us per poll. However we do have to keep the same
+	 * timeout.
+	 *
+	 * We don't use time_wait_ms() either for now as we don't want to
+	 * poll the FSP here.
+	 */
+	stamp = mftb();
+	poll_target = stamp + msecs_to_tb(200);
+	val = 0;
+	while (!(val & EX_PM_GP0_SPECIAL_WAKEUP_DONE)) {
+		/* Wait 1 us */
+		time_wait_us(1);
+
+		/* Read PM state */
+		rc = xscom_read(cpu->chip_id,
+				XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_GP0),
+				&val);
+		if (rc) {
+			prerror("RESET: XSCOM error %d reading PM state on"
+				" 0x%x\n", rc, cpu->pir);
+			return rc;
+		}
+		/* Check timeout */
+		if (mftb() > poll_target)
+			break;
+	}
+
+	/* Success ? */
+	if (val & EX_PM_GP0_SPECIAL_WAKEUP_DONE) {
+		uint64_t now = mftb();
+		prlog(PR_TRACE, "RESET: Special wakeup complete after %ld us\n",
+		      tb_to_usecs(now - stamp));
+		return 0;
+	}
+
+	/*
+	 * We timed out ...
+	 *
+	 * HWP has a complex workaround for HW255321 which affects
+	 * Murano DD1 and Venice DD1. Ignore that for now
+	 *
+	 * Instead we just dump some XSCOMs for error logging
+	 */
+	prerror("RESET: Timeout on special wakeup of 0x%0x\n", cpu->pir);
+	prerror("RESET:      PM0 = 0x%016llx\n", val);
+	val = -1;
+	xscom_read(cpu->chip_id,
+		   XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_SPECIAL_WAKEUP_PHYP),
+		   &val);
+	prerror("RESET: SPC_WKUP = 0x%016llx\n", val);
+	val = -1;
+	xscom_read(cpu->chip_id,
+		   XSCOM_ADDR_P8_EX_SLAVE(core_id,
+					  EX_PM_IDLE_STATE_HISTORY_PHYP),
+		   &val);
+	prerror("RESET:  HISTORY = 0x%016llx\n", val);
+
+	return OPAL_HARDWARE;
 }
 
-static bool do_reset_core_p7(struct cpu_thread *cpu)
+static int clr_special_wakeup(struct cpu_thread *cpu)
 {
-	uint32_t xscom_addr, chip;
-	uint64_t ctl;
+	uint64_t val;
+	uint32_t core_id;
 	int rc;
 
-	/* Add the Core# */
-	xscom_addr = EX0_TCTL_DIRECT_CONTROLS0;
-	xscom_addr |= ((cpu->pir >> 2) & 7) << 24;
+	/*
+	 * Note: HWP checks for checkstops, but I assume we don't need to
+	 * as we wouldn't be running if one was present
+	 */
+
+	/* Grab core ID once */
+	core_id = pir_to_core_id(cpu->pir);
 
-	chip = pir_to_chip_id(cpu->pir);
+	prlog(PR_DEBUG, "RESET: Releasing core 0x%x wakeup\n", core_id);
 
-	ctl = TCTL_DC_SRESET_REQUEST;
-	rc = xscom_write(chip, xscom_addr, ctl);
-	rc |= xscom_write(chip, xscom_addr + 0x40, ctl);
-	rc |= xscom_write(chip, xscom_addr + 0x80, ctl);
-	rc |= xscom_write(chip, xscom_addr + 0xc0, ctl);
+	/*
+	 * The original HWp reads the XSCOM first but ignores the result
+	 * and error, let's do the same until I know for sure that is
+	 * not necessary
+	 */
+	xscom_read(cpu->chip_id,
+		   XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_SPECIAL_WAKEUP_PHYP),
+		   &val);
+
+	/* Then we write special wakeup */
+	rc = xscom_write(cpu->chip_id,
+			 XSCOM_ADDR_P8_EX_SLAVE(core_id,
+						EX_PM_SPECIAL_WAKEUP_PHYP), 0);
 	if (rc) {
-		prerror("RESET: Error %d resetting CPU 0x%04x\n",
-			rc, cpu->pir);
-		return false;
+		prerror("RESET: XSCOM error %d deasserting"
+			" special wakeup on 0x%x\n", rc, cpu->pir);
+		return rc;
 	}
-	return true;
+
+	/*
+	 * The original HWp reads the XSCOM again with the comment
+	 * "This puts an inherent delay in the propagation of the reset
+	 * transition"
+	 */
+	xscom_read(cpu->chip_id,
+		   XSCOM_ADDR_P8_EX_SLAVE(core_id, EX_PM_SPECIAL_WAKEUP_PHYP),
+		   &val);
+
+	return 0;
 }
 
-static void fast_reset_p7(void)
+static void set_direct_ctl(struct cpu_thread *cpu, uint64_t bits)
+{
+	uint32_t core_id = pir_to_core_id(cpu->pir);
+	uint32_t chip_id = pir_to_chip_id(cpu->pir);
+	uint32_t thread_id = pir_to_thread_id(cpu->pir);
+	uint32_t xscom_addr;
+
+	xscom_addr = XSCOM_ADDR_P8_EX(core_id,
+				      P8_EX_TCTL_DIRECT_CONTROLS(thread_id));
+
+	xscom_write(chip_id, xscom_addr, bits);
+}
+
+static bool fast_reset_p8(void)
 {
 	struct cpu_thread *cpu;
 
-	resettee = this_cpu();
-	resettor = NULL;
+	/* Mark ourselves as last man standing in need of a reset */
+	last_man_standing = this_cpu();
 
-	/* Pick up a candidate resettor. We do that before we flush
-	 * the caches
-	 */
+	prlog(PR_DEBUG, "RESET: Resetting from cpu: 0x%x (core 0x%x)\n",
+	      this_cpu()->pir, pir_to_core_id(this_cpu()->pir));
+
+	/* Assert special wakeup on all cores */
 	for_each_cpu(cpu) {
-		/*
-		 * Some threads might still be in skiboot.
-		 *
-		 * But because we deal with entire cores and we don't want
-		 * to special case things, we are just going to reset them
-		 * too making the assumption that this is safe, they are
-		 * holding no locks. This can only be true if they don't
-		 * have jobs scheduled which is hopefully the case.
-		 */
-		if (cpu->state != cpu_state_os &&
-		    cpu->state != cpu_state_active)
-			continue;
+		if (cpu->primary == cpu)
+			if (set_special_wakeup(cpu) != OPAL_SUCCESS)
+				return false;
+	}
 
-		/*
-		 * Only hit cores and only if they aren't on the same core
-		 * as ourselves
-		 */
-		if (cpu_get_thread0(cpu) == cpu_get_thread0(this_cpu()) ||
-		    cpu->pir & 0x3)
-			continue;
+	prlog(PR_DEBUG, "RESET: Stopping the world...\n");
 
-		/* Pick up one of those guys as our "resettor". It will be
-		 * in charge of resetting this CPU. We avoid resetting
-		 * ourselves, not sure how well it would do with SCOM
-		 */
-		resettor = cpu;
-		break;
-	}
+	/* Put everybody in stop except myself */
+	for_each_cpu(cpu) {
+		if (cpu != this_cpu())
+			set_direct_ctl(cpu, P8_DIRECT_CTL_STOP);
 
-	if (!resettor) {
-		printf("RESET: Can't find a resettor !\n");
-		return;
+		/* Also make sure that saved_r1 is 0 ! That's what will
+		 * make our reset vector jump to fast_reboot_entry
+		 */
+		cpu->save_r1 = 0;
 	}
-	printf("RESET: Resetting from 0x%04x, resettor 0x%04x\n",
-	       this_cpu()->pir, resettor->pir);
 
-	printf("RESET: Flushing caches...\n");
+	/* Restore skiboot vectors  */
+	copy_exception_vectors();
+	setup_reset_vector();
 
-	/* Is that necessary ? */
-	flush_caches();
+	prlog(PR_DEBUG, "RESET: Pre-napping all threads but one...\n");
 
-	/* Reset everybody except self and except resettor */
+	/* Put everybody in pre-nap except myself */
 	for_each_cpu(cpu) {
-		if (cpu->state != cpu_state_os &&
-		    cpu->state != cpu_state_active)
-			continue;
-		if (cpu_get_thread0(cpu) == cpu_get_thread0(this_cpu()) ||
-		    cpu->pir & 0x3)
-			continue;
-		if (cpu_get_thread0(cpu) == cpu_get_thread0(resettor))
-			continue;
+		if (cpu != this_cpu())
+			set_direct_ctl(cpu, P8_DIRECT_CTL_PRENAP);
+	}
 
-		printf("RESET: Resetting CPU 0x%04x...\n", cpu->pir);
+	prlog(PR_DEBUG, "RESET: Resetting all threads but one...\n");
 
-		if (!do_reset_core_p7(cpu))
-			return;
+	/* Reset everybody except my own core threads */
+	for_each_cpu(cpu) {
+		if (cpu != this_cpu())
+			set_direct_ctl(cpu, P8_DIRECT_CTL_SRESET);
 	}
 
-	/* Reset the resettor last because it's going to kill me ! */
-	printf("RESET: Resetting CPU 0x%04x...\n", resettor->pir);
-	if (!do_reset_core_p7(resettor))
-		return;
-
-	/* Don't return */
-	for (;;)
-		;
+	return true;
 }
 
-void fast_reset(void)
+void fast_reboot(void)
 {
-	uint32_t pvr = mfspr(SPR_PVR);
-	extern uint32_t fast_reset_patch_start;
-	extern uint32_t fast_reset_patch_end;
-	uint32_t *dst, *src;
+	bool success;
 
-	printf("RESET: Fast reboot request !\n");
+	if (proc_gen != proc_gen_p8) {
+		prlog(PR_DEBUG,
+		      "RESET: Fast reboot not available on this CPU\n");
+		return;
+	}
+	if (chip_quirk(QUIRK_NO_DIRECT_CTL)) {
+		prlog(PR_DEBUG,
+		      "RESET: Fast reboot disabled by quirk\n");
+		return;
+	}
+
+	prlog(PR_INFO, "RESET: Initiating fast reboot...\n");
 
 	/* XXX We need a way to ensure that no other CPU is in skiboot
 	 * holding locks (via the OPAL APIs) and if they are, we need
-	 * for them to get out
+	 * for them to get out. Hopefully that isn't happening, but...
+	 *
+	 * To fix this properly, we want to keep track of OPAL entry/exit
+	 * on all CPUs.
 	 */
 	reboot_in_progress = 1;
 	time_wait_ms(200);
 
-	/* Copy reset trampoline */
-	printf("RESET: Copying reset trampoline...\n");
-	src = &fast_reset_patch_start;
-	dst = (uint32_t *)0x100;
-	while(src < &fast_reset_patch_end)
-		*(dst++) = *(src++);
-	sync_icache();
-
-	switch(PVR_TYPE(pvr)) {
-	case PVR_TYPE_P7:
-	case PVR_TYPE_P7P:
-		fast_reset_p7();
-	}
+	/* Lock so the new guys coming don't reset us */
+	lock(&reset_lock);
+
+	fast_boot_release = false;
+
+	success = fast_reset_p8();
+
+	/* Unlock, at this point we go away */
+	unlock(&reset_lock);
+
+	if (success)
+		/* Don't return */
+		for (;;)
+			;
 }
 
 static void cleanup_cpu_state(void)
 {
-	if (cpu_is_thread0(this_cpu())) {
-		cleanup_tlb();
+	struct cpu_thread *cpu = this_cpu();
+
+	cpu->current_hile = false;
+
+	/* Per core cleanup */
+	if (cpu_is_thread0(cpu)) {
+		/* Shared SPRs whacked back to normal */
+
+		/* XXX Update the SLW copies ! Also dbl check HIDs etc... */
 		init_shared_sprs();
+
+		/* If somebody was in fast_sleep, we may have a workaround
+		 * to undo
+		 */
+		if (cpu->in_fast_sleep) {
+			prlog(PR_DEBUG, "RESET: CPU 0x%04x in fast sleep"
+			      " undoing workarounds...\n", cpu->pir);
+			fast_sleep_exit();
+		}
+
+		/* And we might have lost TB sync */
+		chiptod_wakeup_resync();
+
+		/* The TLB surely contains garbage */
+		cleanup_tlb();
 	}
+
+	/* Per-thread additional cleanup */
 	init_replicated_sprs();
-	reset_cpu_icp();
+
+	// XXX Cleanup SLW, check HIDs ...
 }
 
-#ifdef FAST_REBOOT_CLEARS_MEMORY
-static void fast_mem_clear(uint64_t start, uint64_t end)
+void __noreturn enter_nap(void);
+
+static void check_split_core(void)
 {
-	printf("MEMORY: Clearing %llx..%llx\n", start, end);
+	struct cpu_thread *cpu;
+	u64 mask, hid0;
+
+        hid0 = mfspr(SPR_HID0);
+	mask = SPR_HID0_POWER8_4LPARMODE | SPR_HID0_POWER8_2LPARMODE;
 
-	while(start < end) {
-		asm volatile("dcbz 0,%0" : : "r" (start) : "memory");
-		start += 128;
+	if ((hid0 & mask) == 0)
+		return;
+
+	prlog(PR_INFO, "RESET: CPU 0x%04x is split !\n", this_cpu()->pir);
+
+	/* If it's a secondary thread, just send it to nap */
+	if (this_cpu()->pir & 7) {
+		/* Prepare to be woken up */
+		icp_prep_for_pm();
+		/* Setup LPCR to wakeup on external interrupts only */
+		mtspr(SPR_LPCR, ((mfspr(SPR_LPCR) & ~SPR_LPCR_P8_PECE) |
+				 SPR_LPCR_P8_PECE2));
+		/* Go to nap (doesn't return) */
+		enter_nap();
 	}
-}
 
-static void memory_reset(void)
-{
-	struct address_range *i;
-	uint64_t skistart = SKIBOOT_BASE;
-	uint64_t skiend = SKIBOOT_BASE + SKIBOOT_SIZE;
-
-	printf("MEMORY: Clearing ...\n");
-
-	list_for_each(&address_ranges, i, list) {
-		uint64_t start = cleanup_addr(i->arange->start);
-		uint64_t end = cleanup_addr(i->arange->end);
-
-		if (start >= skiend || end <= skistart)
-			fast_mem_clear(start, end);
-		else {
-			if (start < skistart)
-				fast_mem_clear(start, skistart);
-			if (end > skiend)
-				fast_mem_clear(skiend, end);
-		}
+	prlog(PR_INFO, "RESET: Primary, unsplitting... \n");
+
+	/* Trigger unsplit operation and update SLW image */
+	hid0 &= ~SPR_HID0_POWER8_DYNLPARDIS;
+	set_hid0(hid0);
+	opal_slw_set_reg(this_cpu()->pir, SPR_HID0, hid0);
+
+	/* Wait for unsplit */
+	while (mfspr(SPR_HID0) & mask)
+		cpu_relax();
+
+	/* Now the guys are sleeping, wake'em up. They will come back
+	 * via reset and continue the fast reboot process normally.
+	 * No need to wait.
+	 */
+	prlog(PR_INFO, "RESET: Waking unsplit secondaries... \n");
+
+	for_each_cpu(cpu) {
+		if (!cpu_is_sibling(cpu, this_cpu()) || (cpu == this_cpu()))
+			continue;
+		icp_kick_cpu(cpu);
 	}
 }
-#endif /* FAST_REBOOT_CLEARS_MEMORY */
+
 
 /* Entry from asm after a fast reset */
-void __noreturn fast_reboot(void);
+void __noreturn fast_reboot_entry(void);
 
-void __noreturn fast_reboot(void)
+void __noreturn fast_reboot_entry(void)
 {
-	static volatile bool fast_boot_release;
 	struct cpu_thread *cpu;
 
-	printf("INIT: CPU PIR 0x%04x reset in\n", this_cpu()->pir);
+	prlog(PR_DEBUG, "RESET: CPU 0x%04x reset in\n", this_cpu()->pir);
+	time_wait_ms(100);
+
+	lock(&reset_lock);
+	if (last_man_standing) {
+		prlog(PR_DEBUG, "RESET: last man standing fixup...\n");
+		set_direct_ctl(last_man_standing, P8_DIRECT_CTL_PRENAP);
+		set_direct_ctl(last_man_standing, P8_DIRECT_CTL_SRESET);
+		last_man_standing = NULL;
+	}
+	unlock(&reset_lock);
+
+	/* We reset our ICP first ! Otherwise we might get stray interrupts
+	 * when unsplitting
+	 */
+	reset_cpu_icp();
 
-	/* If this CPU was chosen as the resettor, it must reset the
-	 * resettee (the one that initiated the whole process
+	/* If we are split, we need to unsplit. Since that can send us
+	 * to NAP, which will come back via reset, we do it now
 	 */
-	if (this_cpu() == resettor)
-		do_reset_core_p7(resettee);
+	check_split_core();
 
 	/* Are we the original boot CPU ? If not, we spin waiting
 	 * for a relase signal from CPU 1, then we clean ourselves
@@ -277,8 +436,10 @@ void __noreturn fast_reboot(void)
 		__secondary_cpu_entry();
 	}
 
+	prlog(PR_INFO, "RESET: Boot CPU waiting for everybody...\n");
+
 	/* We are the original boot CPU, wait for secondaries to
-	 * be captured
+	 * be captured.
 	 */
 	for_each_cpu(cpu) {
 		if (cpu == this_cpu())
@@ -292,7 +453,7 @@ void __noreturn fast_reboot(void)
 		smt_medium();
 	}
 
-	printf("INIT: Releasing secondaries...\n");
+	prlog(PR_INFO, "RESET: Releasing secondaries...\n");
 
 	/* Release everybody */
 	fast_boot_release = true;
@@ -310,7 +471,14 @@ void __noreturn fast_reboot(void)
 		}
 	}
 
-	printf("INIT: All done, resetting everything else...\n");
+	prlog(PR_DEBUG, "RESET: Releasing special wakeups...\n");
+
+	for_each_cpu(cpu) {
+		if (cpu->primary == cpu)
+			clr_special_wakeup(cpu);
+	}
+
+	prlog(PR_INFO, "RESET: All done, cleaning up...\n");
 
 	/* Clear release flag for next time */
 	fast_boot_release = false;
@@ -322,6 +490,12 @@ void __noreturn fast_reboot(void)
 	/* Set our state to active */
 	this_cpu()->state = cpu_state_active;
 
+	/* We can now do NAP mode */
+	cpu_set_pm_enable(true);
+
+	/* Start preloading kernel and ramdisk */
+	start_preload_kernel();
+
 	/* Poke the consoles (see comments in the code there) */
 	fsp_console_reset();
 
@@ -331,15 +505,6 @@ void __noreturn fast_reboot(void)
 	/* Remove all PCI devices */
 	pci_reset();
 
-	/* Reset IO Hubs */
-	cec_reset();
-
-	/* Re-Initialize all discovered PCI slots */
-	pci_init_slots();
-
-	/* Clear memory */
-#ifdef FAST_REBOOT_CLEARS_MEMORY
-	memory_reset();
-#endif
+	/* Load and boot payload */
 	load_and_boot_kernel(true);
 }
diff --git a/core/init.c b/core/init.c
index 16bae50..e15240d 100644
--- a/core/init.c
+++ b/core/init.c
@@ -293,7 +293,7 @@ extern uint64_t boot_offset;
 static size_t kernel_size;
 static size_t initramfs_size;
 
-static bool start_preload_kernel(void)
+bool start_preload_kernel(void)
 {
 	int loaded;
 
@@ -392,6 +392,9 @@ static void load_initramfs(void)
 {
 	int loaded;
 
+	dt_check_del_prop(dt_chosen, "linux,initrd-start");
+	dt_check_del_prop(dt_chosen, "linux,initrd-end");
+
 	loaded = wait_for_resource_loaded(RESOURCE_ID_INITRAMFS,
 					  RESOURCE_SUBID_NONE);
 
@@ -454,6 +457,7 @@ void __noreturn load_and_boot_kernel(bool is_reboot)
 	occ_pstates_init();
 
 	/* Set kernel command line argument if specified */
+	dt_check_del_prop(dt_chosen, "bootargs");
 #ifdef KERNEL_COMMAND_LINE
 	dt_add_property_string(dt_chosen, "bootargs", KERNEL_COMMAND_LINE);
 #else
@@ -598,7 +602,7 @@ void setup_reset_vector(void)
 		*(dst++) = *(src++);
 }
 
-static void copy_exception_vectors(void)
+void copy_exception_vectors(void)
 {
 	/* Backup previous vectors as this could contain a kernel
 	 * image.
diff --git a/core/lock.c b/core/lock.c
index 53cc337..e82048b 100644
--- a/core/lock.c
+++ b/core/lock.c
@@ -110,6 +110,9 @@ void unlock(struct lock *l)
 	this_cpu()->lock_depth--;
 	l->lock_val = 0;
 
+	/* WARNING: On fast reboot, we can be reset right at that
+	 * point, so the reset_lock in there cannot be in the con path
+	 */
 	if (l->in_con_path) {
 		cpu->con_suspend--;
 		if (cpu->con_suspend == 0 && cpu->con_need_flush)
diff --git a/core/pci.c b/core/pci.c
index cbaea35..3ab8daa 100644
--- a/core/pci.c
+++ b/core/pci.c
@@ -1456,6 +1456,7 @@ static void __pci_reset(struct list_head *list)
 
 	while ((pd = list_pop(list, struct pci_device, link)) != NULL) {
 		__pci_reset(&pd->children);
+		dt_free(pd->dn);
 		free(pd);
 	}
 }
@@ -1472,10 +1473,17 @@ void pci_reset(void)
 	 * state machine could be done in parallel)
 	 */
 	for (i = 0; i < ARRAY_SIZE(phbs); i++) {
-		if (!phbs[i])
+		struct phb *phb = phbs[i];
+		if (!phb)
 			continue;
-		__pci_reset(&phbs[i]->devices);
+		__pci_reset(&phb->devices);
+		if (phb->ops->ioda_reset)
+			phb->ops->ioda_reset(phb, true);
 	}
+
+	/* Re-Initialize all discovered PCI slots */
+	pci_init_slots();
+
 }
 
 static void pci_do_jobs(void (*fn)(void *))
diff --git a/core/platform.c b/core/platform.c
index de6e406..7915857 100644
--- a/core/platform.c
+++ b/core/platform.c
@@ -52,9 +52,9 @@ static int64_t opal_cec_reboot(void)
 
 	console_complete_flush();
 
-#ifdef ENABLE_FAST_RESET
+#ifdef ENABLE_FAST_REBOOT
 	/* Try a fast reset first */
-	fast_reset();
+	fast_reboot();
 #endif
 	if (platform.cec_reboot)
 		return platform.cec_reboot();
diff --git a/hw/fsp/fsp-console.c b/hw/fsp/fsp-console.c
index 87e509d..5e27197 100644
--- a/hw/fsp/fsp-console.c
+++ b/hw/fsp/fsp-console.c
@@ -884,6 +884,9 @@ static void reopen_all_hvsi(void)
 
 void fsp_console_reset(void)
 {
+	if (!fsp_present())
+		return;
+
 	prlog(PR_NOTICE, "FSP: Console reset !\n");
 
 	/* This is called on a fast-reset. To work around issues with HVSI
@@ -985,6 +988,8 @@ void fsp_console_select_stdout(void)
 			 */
 		}
 	}
+	dt_check_del_prop(dt_chosen, "linux,stdout-path");
+
 	if (fsp_serials[1].open && use_serial) {
 		dt_add_property_string(dt_chosen, "linux,stdout-path",
 				       "/ibm,opal/consoles/serial at 1");
diff --git a/hw/fsp/fsp-leds.c b/hw/fsp/fsp-leds.c
index 50e82b5..b5a32ad 100644
--- a/hw/fsp/fsp-leds.c
+++ b/hw/fsp/fsp-leds.c
@@ -1570,6 +1570,9 @@ void create_led_device_nodes(void)
 	if (!pled)
 		return;
 
+	/* Check if already populated (fast-reboot) */
+	if (dt_has_node_property(pled, "compatible", NULL))
+		return;
 	dt_add_property_strings(pled, "compatible", DT_PROPERTY_LED_COMPATIBLE);
 
 	led_mode = dt_prop_get(pled, DT_PROPERTY_LED_MODE);
diff --git a/hw/occ.c b/hw/occ.c
index b606a67..3d86f7a 100644
--- a/hw/occ.c
+++ b/hw/occ.c
@@ -517,10 +517,14 @@ void occ_pstates_init(void)
 	struct proc_chip *chip;
 	struct cpu_thread *c;
 	s8 pstate_nom;
+	static bool occ_pstates_initialized;
 
 	/* OCC is P8 only */
 	if (proc_gen != proc_gen_p8)
 		return;
+	/* Handle fast reboots */
+	if (occ_pstates_initialized)
+		return;
 
 	chip = next_chip(NULL);
 	if (!chip->homer_base) {
@@ -558,6 +562,7 @@ void occ_pstates_init(void)
 	for_each_chip(chip)
 		chip->throttle = 0;
 	opal_add_poller(occ_throttle_poll, NULL);
+	occ_pstates_initialized = true;
 }
 
 struct occ_load_req {
diff --git a/hw/psi.c b/hw/psi.c
index 3efc177..bb55c10 100644
--- a/hw/psi.c
+++ b/hw/psi.c
@@ -432,34 +432,25 @@ static int64_t psi_p7_get_xive(struct irq_source *is, uint32_t isn __unused,
 	return OPAL_SUCCESS;
 }
 
+static const uint32_t psi_p8_irq_to_xivr[P8_IRQ_PSI_ALL_COUNT] = {
+	[P8_IRQ_PSI_FSP]	= PSIHB_XIVR_FSP,
+	[P8_IRQ_PSI_OCC]	= PSIHB_XIVR_OCC,
+	[P8_IRQ_PSI_FSI]	= PSIHB_XIVR_FSI,
+	[P8_IRQ_PSI_LPC]	= PSIHB_XIVR_LPC,
+	[P8_IRQ_PSI_LOCAL_ERR]	= PSIHB_XIVR_LOCAL_ERR,
+	[P8_IRQ_PSI_HOST_ERR]	= PSIHB_XIVR_HOST_ERR,
+};
+
 static int64_t psi_p8_set_xive(struct irq_source *is, uint32_t isn,
 			       uint16_t server, uint8_t priority)
 {
 	struct psi *psi = is->data;
 	uint64_t xivr_p, xivr;
+	uint32_t irq_idx = isn & 7;
 
-	switch(isn & 7) {
-	case P8_IRQ_PSI_FSP:
-		xivr_p = PSIHB_XIVR_FSP;
-		break;
-	case P8_IRQ_PSI_OCC:
-		xivr_p = PSIHB_XIVR_OCC;
-		break;
-	case P8_IRQ_PSI_FSI:
-		xivr_p = PSIHB_XIVR_FSI;
-		break;
-	case P8_IRQ_PSI_LPC:
-		xivr_p = PSIHB_XIVR_LPC;
-		break;
-	case P8_IRQ_PSI_LOCAL_ERR:
-		xivr_p = PSIHB_XIVR_LOCAL_ERR;
-		break;
-	case P8_IRQ_PSI_HOST_ERR:
-		xivr_p = PSIHB_XIVR_HOST_ERR;
-		break;
-	default:
+	if (irq_idx >= P8_IRQ_PSI_ALL_COUNT)
 		return OPAL_PARAMETER;
-	}
+	xivr_p = psi_p8_irq_to_xivr[irq_idx];
 
 	/* Populate the XIVR */
 	xivr  = (uint64_t)server << 40;
@@ -476,29 +467,11 @@ static int64_t psi_p8_get_xive(struct irq_source *is, uint32_t isn __unused,
 {
 	struct psi *psi = is->data;
 	uint64_t xivr_p, xivr;
+	uint32_t irq_idx = isn & 7;
 
-	switch(isn & 7) {
-	case P8_IRQ_PSI_FSP:
-		xivr_p = PSIHB_XIVR_FSP;
-		break;
-	case P8_IRQ_PSI_OCC:
-		xivr_p = PSIHB_XIVR_OCC;
-		break;
-	case P8_IRQ_PSI_FSI:
-		xivr_p = PSIHB_XIVR_FSI;
-		break;
-	case P8_IRQ_PSI_LPC:
-		xivr_p = PSIHB_XIVR_LPC;
-		break;
-	case P8_IRQ_PSI_LOCAL_ERR:
-		xivr_p = PSIHB_XIVR_LOCAL_ERR;
-		break;
-	case P8_IRQ_PSI_HOST_ERR:
-		xivr_p = PSIHB_XIVR_HOST_ERR;
-		break;
-	default:
+	if (irq_idx >= P8_IRQ_PSI_ALL_COUNT)
 		return OPAL_PARAMETER;
-	}
+	xivr_p = psi_p8_irq_to_xivr[irq_idx];
 
 	/* Read & decode the XIVR */
 	xivr = in_be64(psi->regs + xivr_p);
@@ -509,33 +482,41 @@ static int64_t psi_p8_get_xive(struct irq_source *is, uint32_t isn __unused,
 	return OPAL_SUCCESS;
 }
 
+static void psi_cleanup_irq(struct psi *psi)
+{
+	uint32_t irq;
+	uint64_t xivr, xivr_p;
+
+	for (irq = 0; irq < P8_IRQ_PSI_ALL_COUNT; irq++) {
+		prlog(PR_DEBUG, "PSI[0x%03x]: Cleaning up IRQ %d\n",
+		      psi->chip_id, irq);
+
+		xivr_p = psi_p8_irq_to_xivr[irq];
+		xivr = in_be64(psi->regs + xivr_p);
+		xivr |= (0xffull << 32);
+		out_be64(psi->regs + xivr_p, xivr);
+		time_wait_ms_nopoll(10);
+		xivr = in_be64(psi->regs + xivr_p);
+		if (xivr & PPC_BIT(39)) {
+			printf(" Need EOI !\n");
+			icp_send_eoi(psi->interrupt + irq);
+		}
+	}
+}
+
 /* Called on a fast reset, make sure we aren't stuck with
  * an accepted and never EOId PSI interrupt
  */
 void psi_irq_reset(void)
 {
 	struct psi *psi;
-	uint64_t xivr;
 
 	printf("PSI: Hot reset!\n");
 
-	assert(proc_gen == proc_gen_p7);
+	assert(proc_gen == proc_gen_p8);
 
 	list_for_each(&psis, psi, list) {
-		/* Mask the interrupt & clean the XIVR */
-		xivr = 0x000000ff00000000UL;
-		xivr |=	P7_IRQ_BUID(psi->interrupt) << 16;
-		out_be64(psi->regs + PSIHB_XIVR, xivr);
-
-#if 0 /* Seems to checkstop ... */
-		/*
-		 * Maybe not anymore; we were just blindly sending
-		 * this on all iopaths, not just the active one;
-		 * We don't even know if those psis are even correct.
-		 */
-		/* Send a dummy EOI to make sure the ICP is clear */
-		icp_send_eoi(psi->interrupt);
-#endif
+		psi_cleanup_irq(psi);
 	}
 }
 
@@ -920,3 +901,4 @@ void psi_init(void)
 		psi_init_psihb(np);
 }
 
+
diff --git a/hw/slw.c b/hw/slw.c
index e3ee9e7..80d295d 100644
--- a/hw/slw.c
+++ b/hw/slw.c
@@ -1060,6 +1060,8 @@ static void fast_sleep_enter(void)
 	}
 
 	primary_thread->save_l2_fir_action1 = tmp;
+	primary_thread->in_fast_sleep = true;
+
 	tmp = tmp & ~0x0200000000000000ULL;
 	rc = xscom_write(chip_id, XSCOM_ADDR_P8_EX(core, L2_FIR_ACTION1),
 			 tmp);
@@ -1082,7 +1084,7 @@ static void fast_sleep_enter(void)
 
 /* Workarounds while exiting fast-sleep */
 
-static void fast_sleep_exit(void)
+void fast_sleep_exit(void)
 {
 	uint32_t core = pir_to_core_id(this_cpu()->pir);
 	uint32_t chip_id = this_cpu()->chip_id;
@@ -1090,6 +1092,7 @@ static void fast_sleep_exit(void)
 	int rc;
 
 	primary_thread = this_cpu()->primary;
+	primary_thread->in_fast_sleep = false;
 
 	rc = xscom_write(chip_id, XSCOM_ADDR_P8_EX(core, L2_FIR_ACTION1),
 			primary_thread->save_l2_fir_action1);
@@ -1131,7 +1134,7 @@ static int64_t opal_config_cpu_idle_state(uint64_t state, uint64_t enter)
 opal_call(OPAL_CONFIG_CPU_IDLE_STATE, opal_config_cpu_idle_state, 2);
 
 #ifdef __HAVE_LIBPORE__
-static int64_t opal_slw_set_reg(uint64_t cpu_pir, uint64_t sprn, uint64_t val)
+int64_t opal_slw_set_reg(uint64_t cpu_pir, uint64_t sprn, uint64_t val)
 {
 
 	struct cpu_thread *c = find_cpu_by_pir(cpu_pir);
diff --git a/include/config.h b/include/config.h
index 2524570..b1b2aeb 100644
--- a/include/config.h
+++ b/include/config.h
@@ -73,7 +73,7 @@
 //#define FORCE_DUMMY_CONSOLE 1
 
 /* Enable this to do fast resets. Currently unreliable... */
-//#define ENABLE_FAST_RESET	1
+#define ENABLE_FAST_REBOOT	1
 
 /* Enable this to make fast reboot clear memory */
 //#define FAST_REBOOT_CLEARS_MEMORY	1
diff --git a/include/cpu.h b/include/cpu.h
index 341e73d..f649a13 100644
--- a/include/cpu.h
+++ b/include/cpu.h
@@ -66,6 +66,7 @@ struct cpu_thread {
 	bool				in_mcount;
 	bool				in_poller;
 	bool				in_reinit;
+	bool				in_fast_sleep;
 	bool				in_sleep;
 	bool				in_idle;
 	uint32_t			hbrt_spec_wakeup; /* primary only */
diff --git a/include/processor.h b/include/processor.h
index caca804..fe4487b 100644
--- a/include/processor.h
+++ b/include/processor.h
@@ -163,6 +163,7 @@
 /* Bits in HID0 */
 #define SPR_HID0_POWER8_4LPARMODE	PPC_BIT(2)
 #define SPR_HID0_POWER8_2LPARMODE	PPC_BIT(6)
+#define SPR_HID0_POWER8_DYNLPARDIS	PPC_BIT(15)
 #define SPR_HID0_POWER8_HILE		PPC_BIT(19)
 #define SPR_HID0_POWER9_HILE		PPC_BIT(4)
 #define SPR_HID0_POWER8_ENABLE_ATTN	PPC_BIT(31)
diff --git a/include/skiboot.h b/include/skiboot.h
index 1dbe38f..ec4b957 100644
--- a/include/skiboot.h
+++ b/include/skiboot.h
@@ -190,12 +190,14 @@ extern unsigned long get_symbol(unsigned long addr,
 				char **sym, char **sym_end);
 
 /* Fast reboot support */
-extern void fast_reset(void);
+extern void fast_reboot(void);
 extern void __noreturn __secondary_cpu_entry(void);
 extern void __noreturn load_and_boot_kernel(bool is_reboot);
 extern void cleanup_tlb(void);
 extern void init_shared_sprs(void);
 extern void init_replicated_sprs(void);
+extern bool start_preload_kernel(void);
+extern void copy_exception_vectors(void);
 extern void setup_reset_vector(void);
 
 /* Various probe routines, to replace with an initcall system */
@@ -267,6 +269,11 @@ extern void slw_update_timer_expiry(uint64_t new_target);
 /* Is SLW timer available ? */
 extern bool slw_timer_ok(void);
 
+/* Patch SPR in SLW image */
+extern int64_t opal_slw_set_reg(uint64_t cpu_pir, uint64_t sprn, uint64_t val);
+
+extern void fast_sleep_exit(void);
+
 /* Fallback fake RTC */
 extern void fake_rtc_init(void);
 



More information about the Skiboot mailing list