[Skiboot] [RFC/WIP/PATCH] Fast reboot for P8

Benjamin Herrenschmidt benh at kernel.crashing.org
Fri Jul 22 18:52:51 AEST 2016


On Fri, 2016-07-22 at 17:17 +1000, Benjamin Herrenschmidt wrote:
> This is an experimental patch that implements "Fast reboot" on P8
> machines.

Found bugs .... TLB not flushed among others. Monitor github benh-wip
branch for now, I'll re-post when I've done more testing.

> The basic idea is that when the OS calls OPAL reboot, we gather all
> the threads in the system using a combination of patching the reset
> vector and soft-resetting them, then cleanup a few bits of hardware
> (we do re-probe PCIe for example), and reload & restart the
> bootloader.
> 
> This is very experimental and needs a lot of testing and also
> auditing
> code for other bits of HW that might need to be cleaned up. I also
> need
> to check if we are properly PERST'ing PCI devices.
> 
> I've successfully fast rebooted a Habanero a few times.
> 
> This is partially based on old code I had to do that on P7. I only
> support it on P8 though as there are issues with the PSI interrupts
> on P7 that cannot be reliably solved.
> 
> Not-yet-signed-off-by: Benjamin Herrenschmidt <benh at kernel.crashing.o
> rg>
> ---
> 
>  asm/head.S           | 125 +++++++--------
>  core/device.c        |   8 +
>  core/fast-reboot.c   | 438 ++++++++++++++++++++++++++++++-----------
> ----------
>  core/init.c          |   6 +-
>  core/lock.c          |   3 +
>  core/pci.c           |  13 +-
>  core/platform.c      |   4 +-
>  hw/fsp/fsp-console.c |   5 +
>  hw/occ.c             |   5 +
>  hw/psi.c             |  97 +++++-------
>  include/config.h     |   4 +-
>  include/device.h     |   2 +
>  include/skiboot.h    |   3 +-
>  13 files changed, 402 insertions(+), 311 deletions(-)
> 
> diff --git a/asm/head.S b/asm/head.S
> index e92f9b8..0c5649d 100644
> --- a/asm/head.S
> +++ b/asm/head.S
> @@ -294,7 +294,7 @@ boot_entry:
>  	bne	secondary_wait
>  
>  	/* Initialize per-core SPRs */
> -	bl init_shared_sprs
> +	bl	init_shared_sprs
>  
>  	/* Pick a boot CPU, cpu index in r31 */
>  	LOAD_IMM32(%r3, boot_sem - __head)
> @@ -311,7 +311,7 @@ boot_entry:
>  	smt_medium
>  
>  	/* Initialize thread SPRs */
> -	bl init_replicated_sprs
> +	bl	init_replicated_sprs
>  
>  	/* Save the initial offset. The secondary threads will spin
> on boot_flag
>  	 * before relocation so we need to keep track of its
> location to wake
> @@ -410,11 +410,11 @@ secondary_wait:
>  	add	%r3,%r3,%r30
>  	mtctr	%r3
>  	isync
> -	bctr	
> +	bctr
>  1:
>  	/* Now wait for cpu_secondary_start to be set */
>  	LOAD_ADDR_FROM_TOC(%r3, cpu_secondary_start)
> -1:	smt_very_low	
> +1:	smt_very_low
>  	ld	%r0,0(%r3)
>  	cmpdi	%r0,0
>  	beq	1b
> @@ -457,64 +457,6 @@ call_relocate:
>  1:	/* Fatal relocate failure */
>  	attn
>  
> -/* This is a little piece of code that is copied down to
> - * 0x100 when doing a "fast reset"
> - */
> -.global fast_reset_patch_start
> -fast_reset_patch_start:	
> -	smt_medium
> -	LOAD_IMM64(%r30, SKIBOOT_BASE)
> -	LOAD_IMM32(%r3, fast_reset_entry - __head)
> -	add	%r3,%r30,%r3
> -	mtctr	%r3
> -	bctr
> -.global fast_reset_patch_end
> -fast_reset_patch_end:
> -
> -/* Fast reset code. We clean up the TLB and a few SPRs and
> - * return to C code. All CPUs do that, the CPU triggering the
> - * reset does it to itself last. The C code will sort out who
> - * the master is. We come from the trampoline above with
> - * r30 containing SKIBOOT_BASE
> - */
> -fast_reset_entry:
> -	/* Clear out SLB */
> -	li	%r6,0
> -	slbmte	%r6,%r6
> -	slbia
> -	ptesync
> -
> -	/* Get PIR */
> -	mfspr	%r31,SPR_PIR
> -
> -	/* Get a stack and restore r13 */
> -	GET_STACK(%r1,%r31)
> -	li	%r3,0
> -	std	%r3,0(%r1)
> -	std	%r3,8(%r1)
> -	std	%r3,16(%r1)
> -	GET_CPU()
> -
> -	/* Get our TOC */
> -	addis	%r2,%r30,(__toc_start - __head)@ha
> -	addi	%r2,%r2,(__toc_start - __head)@l
> -
> -	/* Go to C ! */
> -	bl	fast_reboot
> -	b	.
> -
> -.global cleanup_tlb
> -cleanup_tlb:
> -	/* Clean the TLB */
> -	li	%r3,128
> -	mtctr	%r3
> -	li	%r4,0x800		/* IS field = 0b10 */
> -	ptesync
> -1:	tlbiel	%r4
> -	addi	%r4,%r4,0x1000
> -	bdnz	1b
> -	ptesync
> -
>  #define FIXUP_ENDIAN                                              \
>         tdi   0,0,0x48;   /* Reverse endian of b . + 8          */ \
>         b     $+36;       /* Skip trampoline if endian is good  */ \
> @@ -652,6 +594,65 @@ rvwinkle_restore:
>  	mtlr	%r0
>  	blr
>  
> +/* This is a little piece of code that is copied down to
> + * 0x100 when doing a "fast reset"
> + */
> +.global fast_reset_patch_start
> +fast_reset_patch_start:
> +	FIXUP_ENDIAN
> +	smt_medium
> +	LOAD_IMM64(%r30, SKIBOOT_BASE)
> +	LOAD_IMM32(%r3, fast_reset_entry - __head)
> +	add	%r3,%r30,%r3
> +	mtctr	%r3
> +	bctr
> +.global fast_reset_patch_end
> +fast_reset_patch_end:
> +
> +/* Fast reset code. We clean up the TLB and a few SPRs and
> + * return to C code. All CPUs do that, the CPU triggering the
> + * reset does it to itself last. The C code will sort out who
> + * the master is. We come from the trampoline above with
> + * r30 containing SKIBOOT_BASE
> + */
> +fast_reset_entry:
> +	/* Clear out SLB */
> +	li	%r6,0
> +	slbmte	%r6,%r6
> +	slbia
> +	ptesync
> +
> +	/* Get PIR */
> +	mfspr	%r31,SPR_PIR
> +
> +	/* Get a stack and restore r13 */
> +	GET_STACK(%r1,%r31)
> +	li	%r3,0
> +	std	%r3,0(%r1)
> +	std	%r3,8(%r1)
> +	std	%r3,16(%r1)
> +	GET_CPU()
> +
> +	/* Get our TOC */
> +	addis	%r2,%r30,(__toc_start - __head)@ha
> +	addi	%r2,%r2,(__toc_start - __head)@l
> +
> +	/* Go to C ! */
> +	bl	fast_reboot_entry
> +	b	.
> +
> +.global cleanup_tlb
> +cleanup_tlb:
> +	/* Clean the TLB */
> +	li	%r3,128
> +	mtctr	%r3
> +	li	%r4,0x800		/* IS field = 0b10 */
> +	ptesync
> +1:	tlbiel	%r4
> +	addi	%r4,%r4,0x1000
> +	bdnz	1b
> +	ptesync
> +
>  /* Functions to initialize replicated and shared SPRs to sane
>   * values. This is called at boot and on soft-reset
>   */
> diff --git a/core/device.c b/core/device.c
> index 9e7ef0d..e7b53a8 100644
> --- a/core/device.c
> +++ b/core/device.c
> @@ -581,6 +581,14 @@ const struct dt_property *dt_find_property(const
> struct dt_node *node,
>  	return NULL;
>  }
>  
> +void dt_check_del_prop(struct dt_node *node, const char *name)
> +{
> +	struct dt_property *p;
> +
> +	p = __dt_find_property(node, name);
> +	if (p)
> +		dt_del_property(node, p);
> +}
>  const struct dt_property *dt_require_property(const struct dt_node
> *node,
>  					      const char *name, int
> wanted_len)
>  {
> diff --git a/core/fast-reboot.c b/core/fast-reboot.c
> index 30b77e9..1a7f2cc 100644
> --- a/core/fast-reboot.c
> +++ b/core/fast-reboot.c
> @@ -26,179 +26,283 @@
>  #include <pci.h>
>  #include <chip.h>
>  
> -/*
> - * To get control of all threads, we sreset them via XSCOM after
> - * patching the 0x100 vector. This will work as long as the target
> - * HRMOR is 0. If Linux ever uses HRMOR, we'll have to consider
> - * a more messy approach.
> - *
> - * The SCOM register we want is called "Core RAS Control" in the doc
> - * and EX0.EC.PC.TCTL_GENERATE#0.TCTL.DIRECT_CONTROLS in the SCOM
> list
> - *
> - * Bits in there change from CPU rev to CPU rev but the bit we care
> - * about, bit 60 "sreset_request" appears to have stuck to the same
> - * place in both P7 and P7+. The register also has the same SCOM
> - * address
> - */
> -#define EX0_TCTL_DIRECT_CONTROLS0	0x08010400
> -#define EX0_TCTL_DIRECT_CONTROLS1	0x08010440
> -#define EX0_TCTL_DIRECT_CONTROLS2	0x08010480
> -#define EX0_TCTL_DIRECT_CONTROLS3	0x080104c0
> -#define   TCTL_DC_SRESET_REQUEST	PPC_BIT(60)
> +#define P8_EX_TCTL_DIRECT_CONTROLS(t)	(0x10013000 + (t) *
> 0x10)
> +#define P8_DIRECT_CTL_STOP		PPC_BIT(63)
> +#define P8_DIRECT_CTL_PRENAP		PPC_BIT(47)
> +#define P8_DIRECT_CTL_SRESET		PPC_BIT(60)
> +
>  
>  /* Flag tested by the OPAL entry code */
>  uint8_t reboot_in_progress;
> -static struct cpu_thread *resettor, *resettee;
> +static volatile bool fast_boot_release;
> +static struct cpu_thread *last_man_standing;
> +static struct lock reset_lock = LOCK_UNLOCKED;
>  
> -static void flush_caches(void)
> +static int set_special_wakeup(struct cpu_thread *cpu)
>  {
> -	uint64_t base = SKIBOOT_BASE;
> -	uint64_t end = base + SKIBOOT_SIZE;
> +	uint64_t val, poll_target, stamp;
> +	uint32_t core_id;
> +	int rc;
>  
> -	/* Not sure what the effect of sreset is on cores, so let's
> -	 * shoot a series of dcbf's on all cachelines that make up
> -	 * our core memory just in case...
> +	/*
> +	 * Note: HWP checks for checkstops, but I assume we don't
> need to
> +	 * as we wouldn't be running if one was present
>  	 */
> -	while(base < end) {
> -		asm volatile("dcbf 0,%0" : : "r" (base) : "memory");
> -		base += 128;
> +
> +	/* Grab core ID once */
> +	core_id = pir_to_core_id(cpu->pir);
> +
> +	prlog(PR_DEBUG, "RESET Waking up core 0x%x\n", core_id);
> +
> +	/*
> +	 * The original HWp reads the XSCOM first but ignores the
> result
> +	 * and error, let's do the same until I know for sure that
> is
> +	 * not necessary
> +	 */
> +	xscom_read(cpu->chip_id,
> +		   XSCOM_ADDR_P8_EX_SLAVE(core_id,
> EX_PM_SPECIAL_WAKEUP_PHYP),
> +		   &val);
> +
> +	/* Then we write special wakeup */
> +	rc = xscom_write(cpu->chip_id,
> +			 XSCOM_ADDR_P8_EX_SLAVE(core_id,
> +						EX_PM_SPECIAL_WAKEUP
> _PHYP),
> +			 PPC_BIT(0));
> +	if (rc) {
> +		prerror("RESET: XSCOM error %d asserting special"
> +			" wakeup on 0x%x\n", rc, cpu->pir);
> +		return rc;
>  	}
> -	sync();
> +
> +	/*
> +	 * HWP uses the history for Perf register here, dunno why it
> uses
> +	 * that one instead of the pHyp one, maybe to avoid
> clobbering it...
> +	 *
> +	 * In any case, it does that to check for run/nap
> vs.sleep/winkle/other
> +	 * to decide whether to poll on checkstop or not. Since we
> don't deal
> +	 * with checkstop conditions here, we ignore that part.
> +	 */
> +
> +	/*
> +	 * Now poll for completion of special wakeup. The HWP is
> nasty here,
> +	 * it will poll at 5ms intervals for up to 200ms. This is
> not quite
> +	 * acceptable for us at runtime, at least not until we have
> the
> +	 * ability to "context switch" HBRT. In practice, because we
> don't
> +	 * winkle, it will never take that long, so we increase the
> polling
> +	 * frequency to 1us per poll. However we do have to keep the
> same
> +	 * timeout.
> +	 *
> +	 * We don't use time_wait_ms() either for now as we don't
> want to
> +	 * poll the FSP here.
> +	 */
> +	stamp = mftb();
> +	poll_target = stamp + msecs_to_tb(200);
> +	val = 0;
> +	while (!(val & EX_PM_GP0_SPECIAL_WAKEUP_DONE)) {
> +		/* Wait 1 us */
> +		time_wait_us(1);
> +
> +		/* Read PM state */
> +		rc = xscom_read(cpu->chip_id,
> +				XSCOM_ADDR_P8_EX_SLAVE(core_id,
> EX_PM_GP0),
> +				&val);
> +		if (rc) {
> +			prerror("RESET: XSCOM error %d reading PM
> state on"
> +				" 0x%x\n", rc, cpu->pir);
> +			return rc;
> +		}
> +		/* Check timeout */
> +		if (mftb() > poll_target)
> +			break;
> +	}
> +
> +	/* Success ? */
> +	if (val & EX_PM_GP0_SPECIAL_WAKEUP_DONE) {
> +		uint64_t now = mftb();
> +		prlog(PR_TRACE, "RESET: Special wakeup complete
> after %ld us\n",
> +		      tb_to_usecs(now - stamp));
> +		return 0;
> +	}
> +
> +	/*
> +	 * We timed out ...
> +	 *
> +	 * HWP has a complex workaround for HW255321 which affects
> +	 * Murano DD1 and Venice DD1. Ignore that for now
> +	 *
> +	 * Instead we just dump some XSCOMs for error logging
> +	 */
> +	prerror("RESET: Timeout on special wakeup of 0x%0x\n", cpu-
> >pir);
> +	prerror("RESET:      PM0 = 0x%016llx\n", val);
> +	val = -1;
> +	xscom_read(cpu->chip_id,
> +		   XSCOM_ADDR_P8_EX_SLAVE(core_id,
> EX_PM_SPECIAL_WAKEUP_PHYP),
> +		   &val);
> +	prerror("RESET: SPC_WKUP = 0x%016llx\n", val);
> +	val = -1;
> +	xscom_read(cpu->chip_id,
> +		   XSCOM_ADDR_P8_EX_SLAVE(core_id,
> +					  EX_PM_IDLE_STATE_HISTORY_P
> HYP),
> +		   &val);
> +	prerror("RESET:  HISTORY = 0x%016llx\n", val);
> +
> +	return OPAL_HARDWARE;
>  }
>  
> -static bool do_reset_core_p7(struct cpu_thread *cpu)
> +static int clr_special_wakeup(struct cpu_thread *cpu)
>  {
> -	uint32_t xscom_addr, chip;
> -	uint64_t ctl;
> +	uint64_t val;
> +	uint32_t core_id;
>  	int rc;
>  
> -	/* Add the Core# */
> -	xscom_addr = EX0_TCTL_DIRECT_CONTROLS0;
> -	xscom_addr |= ((cpu->pir >> 2) & 7) << 24;
> +	/*
> +	 * Note: HWP checks for checkstops, but I assume we don't
> need to
> +	 * as we wouldn't be running if one was present
> +	 */
> +
> +	/* Grab core ID once */
> +	core_id = pir_to_core_id(cpu->pir);
>  
> -	chip = pir_to_chip_id(cpu->pir);
> +	prlog(PR_DEBUG, "RESET: Releasing core 0x%x wakeup\n",
> core_id);
>  
> -	ctl = TCTL_DC_SRESET_REQUEST;
> -	rc = xscom_write(chip, xscom_addr, ctl);
> -	rc |= xscom_write(chip, xscom_addr + 0x40, ctl);
> -	rc |= xscom_write(chip, xscom_addr + 0x80, ctl);
> -	rc |= xscom_write(chip, xscom_addr + 0xc0, ctl);
> +	/*
> +	 * The original HWp reads the XSCOM first but ignores the
> result
> +	 * and error, let's do the same until I know for sure that
> is
> +	 * not necessary
> +	 */
> +	xscom_read(cpu->chip_id,
> +		   XSCOM_ADDR_P8_EX_SLAVE(core_id,
> EX_PM_SPECIAL_WAKEUP_PHYP),
> +		   &val);
> +
> +	/* Then we write special wakeup */
> +	rc = xscom_write(cpu->chip_id,
> +			 XSCOM_ADDR_P8_EX_SLAVE(core_id,
> +						EX_PM_SPECIAL_WAKEUP
> _PHYP), 0);
>  	if (rc) {
> -		prerror("RESET: Error %d resetting CPU 0x%04x\n",
> -			rc, cpu->pir);
> -		return false;
> +		prerror("RESET: XSCOM error %d deasserting"
> +			" special wakeup on 0x%x\n", rc, cpu->pir);
> +		return rc;
>  	}
> -	return true;
> +
> +	/*
> +	 * The original HWp reads the XSCOM again with the comment
> +	 * "This puts an inherent delay in the propagation of the
> reset
> +	 * transition"
> +	 */
> +	xscom_read(cpu->chip_id,
> +		   XSCOM_ADDR_P8_EX_SLAVE(core_id,
> EX_PM_SPECIAL_WAKEUP_PHYP),
> +		   &val);
> +
> +	return 0;
>  }
>  
> -static void fast_reset_p7(void)
> +static void set_direct_ctl(struct cpu_thread *cpu, uint64_t bits)
>  {
> -	struct cpu_thread *cpu;
> +	uint32_t core_id = pir_to_core_id(cpu->pir);
> +	uint32_t chip_id = pir_to_chip_id(cpu->pir);
> +	uint32_t thread_id = pir_to_thread_id(cpu->pir);
> +	uint32_t xscom_addr;
>  
> -	resettee = this_cpu();
> -	resettor = NULL;
> +	xscom_addr = XSCOM_ADDR_P8_EX(core_id,
> P8_EX_TCTL_DIRECT_CONTROLS(thread_id));
>  
> -	/* Pick up a candidate resettor. We do that before we flush
> -	 * the caches
> -	 */
> -	for_each_cpu(cpu) {
> -		/*
> -		 * Some threads might still be in skiboot.
> -		 *
> -		 * But because we deal with entire cores and we
> don't want
> -		 * to special case things, we are just going to
> reset them
> -		 * too making the assumption that this is safe, they
> are
> -		 * holding no locks. This can only be true if they
> don't
> -		 * have jobs scheduled which is hopefully the case.
> -		 */
> -		if (cpu->state != cpu_state_os &&
> -		    cpu->state != cpu_state_active)
> -			continue;
> +	xscom_write(chip_id, xscom_addr, bits);
> +}
>  
> -		/*
> -		 * Only hit cores and only if they aren't on the
> same core
> -		 * as ourselves
> -		 */
> -		if (cpu_get_thread0(cpu) ==
> cpu_get_thread0(this_cpu()) ||
> -		    cpu->pir & 0x3)
> -			continue;
> +static void patch_reset_vector(void)
> +{
> +	extern uint32_t fast_reset_patch_start;
> +	extern uint32_t fast_reset_patch_end;
> +	uint32_t *dst, *src;
> +
> +	/* Copy reset trampoline */
> +	prlog(PR_DEBUG, "RESET: Copying reset trampoline...\n");
> +	src = &fast_reset_patch_start;
> +	dst = (uint32_t *)0x100;
> +	while(src < &fast_reset_patch_end)
> +		*(dst++) = *(src++);
> +	sync_icache();
> +}
> +
> +static bool fast_reset_p8(void)
> +{
> +	struct cpu_thread *cpu;
> +
> +	/* Mark ourselves as last man standing in need of a reset */
> +	last_man_standing = this_cpu();
>  
> -		/* Pick up one of those guys as our "resettor". It
> will be
> -		 * in charge of resetting this CPU. We avoid
> resetting
> -		 * ourselves, not sure how well it would do with
> SCOM
> -		 */
> -		resettor = cpu;
> -		break;
> +	prlog(PR_DEBUG, "RESET: Resetting from cpu: 0x%x (core
> 0x%x)\n",
> +	      this_cpu()->pir, pir_to_core_id(this_cpu()->pir));
> +
> +	/* Assert special wakeup on all cores */
> +	for_each_cpu(cpu) {
> +		if (cpu->primary == cpu)
> +			if (set_special_wakeup(cpu) != OPAL_SUCCESS)
> +				return false;
>  	}
>  
> -	if (!resettor) {
> -		printf("RESET: Can't find a resettor !\n");
> -		return;
> +	prlog(PR_DEBUG, "RESET: Stopping the world...\n");
> +
> +	/* Put everybody in stop except myself */
> +	for_each_cpu(cpu) {
> +		if (cpu != this_cpu())
> +			set_direct_ctl(cpu, P8_DIRECT_CTL_STOP);
>  	}
> -	printf("RESET: Resetting from 0x%04x, resettor 0x%04x\n",
> -	       this_cpu()->pir, resettor->pir);
>  
> -	printf("RESET: Flushing caches...\n");
> +	/* Patch reset */
> +	patch_reset_vector();
>  
> -	/* Is that necessary ? */
> -	flush_caches();
> +	prlog(PR_DEBUG, "RESET: Pre-napping all threads but
> one...\n");
>  
> -	/* Reset everybody except self and except resettor */
> +	/* Put everybody in pre-nap except myself */
>  	for_each_cpu(cpu) {
> -		if (cpu->state != cpu_state_os &&
> -		    cpu->state != cpu_state_active)
> -			continue;
> -		if (cpu_get_thread0(cpu) ==
> cpu_get_thread0(this_cpu()) ||
> -		    cpu->pir & 0x3)
> -			continue;
> -		if (cpu_get_thread0(cpu) ==
> cpu_get_thread0(resettor))
> -			continue;
> +		if (cpu != this_cpu())
> +			set_direct_ctl(cpu, P8_DIRECT_CTL_PRENAP);
> +	}
>  
> -		printf("RESET: Resetting CPU 0x%04x...\n", cpu-
> >pir);
> +	prlog(PR_DEBUG, "RESET: Resetting all threads but
> one...\n");
>  
> -		if (!do_reset_core_p7(cpu))
> -			return;
> +	/* Reset everybody except my own core threads */
> +	for_each_cpu(cpu) {
> +		if (cpu != this_cpu())
> +			set_direct_ctl(cpu, P8_DIRECT_CTL_SRESET);
>  	}
>  
> -	/* Reset the resettor last because it's going to kill me !
> */
> -	printf("RESET: Resetting CPU 0x%04x...\n", resettor->pir);
> -	if (!do_reset_core_p7(resettor))
> -		return;
> -
> -	/* Don't return */
> -	for (;;)
> -		;
> +	return true;
>  }
>  
> -void fast_reset(void)
> +void fast_reboot(void)
>  {
> -	uint32_t pvr = mfspr(SPR_PVR);
> -	extern uint32_t fast_reset_patch_start;
> -	extern uint32_t fast_reset_patch_end;
> -	uint32_t *dst, *src;
> +	bool success;
>  
> -	printf("RESET: Fast reboot request !\n");
> +	if (proc_gen != proc_gen_p8)
> +		return;
> +
> +	prlog(PR_INFO, "RESET: Initiating fast reboot...\n");
>  
>  	/* XXX We need a way to ensure that no other CPU is in
> skiboot
>  	 * holding locks (via the OPAL APIs) and if they are, we
> need
> -	 * for them to get out
> +	 * for them to get out. Hopefully that isn't happening,
> but...
> +	 *
> +	 * To fix this properly, we want to keep track of OPAL
> entry/exit
> +	 * on all CPUs.
>  	 */
>  	reboot_in_progress = 1;
>  	time_wait_ms(200);
>  
> -	/* Copy reset trampoline */
> -	printf("RESET: Copying reset trampoline...\n");
> -	src = &fast_reset_patch_start;
> -	dst = (uint32_t *)0x100;
> -	while(src < &fast_reset_patch_end)
> -		*(dst++) = *(src++);
> -	sync_icache();
> +	/* Lock so the new guys coming don't reset us */
> +	lock(&reset_lock);
>  
> -	switch(PVR_TYPE(pvr)) {
> -	case PVR_TYPE_P7:
> -	case PVR_TYPE_P7P:
> -		fast_reset_p7();
> -	}
> +	fast_boot_release = false;
> +
> +	success = fast_reset_p8();
> +
> +	/* Unlock, at this point we go away */
> +	unlock(&reset_lock);
> +
> +	if (success)
> +		/* Don't return */
> +		for (;;)
> +			;
>  }
>  
>  static void cleanup_cpu_state(void)
> @@ -211,56 +315,24 @@ static void cleanup_cpu_state(void)
>  	reset_cpu_icp();
>  }
>  
> -#ifdef FAST_REBOOT_CLEARS_MEMORY
> -static void fast_mem_clear(uint64_t start, uint64_t end)
> -{
> -	printf("MEMORY: Clearing %llx..%llx\n", start, end);
> -
> -	while(start < end) {
> -		asm volatile("dcbz 0,%0" : : "r" (start) :
> "memory");
> -		start += 128;
> -	}
> -}
> -
> -static void memory_reset(void)
> -{
> -	struct address_range *i;
> -	uint64_t skistart = SKIBOOT_BASE;
> -	uint64_t skiend = SKIBOOT_BASE + SKIBOOT_SIZE;
> -
> -	printf("MEMORY: Clearing ...\n");
> -
> -	list_for_each(&address_ranges, i, list) {
> -		uint64_t start = cleanup_addr(i->arange->start);
> -		uint64_t end = cleanup_addr(i->arange->end);
> -
> -		if (start >= skiend || end <= skistart)
> -			fast_mem_clear(start, end);
> -		else {
> -			if (start < skistart)
> -				fast_mem_clear(start, skistart);
> -			if (end > skiend)
> -				fast_mem_clear(skiend, end);
> -		}
> -	}
> -}
> -#endif /* FAST_REBOOT_CLEARS_MEMORY */
> -
>  /* Entry from asm after a fast reset */
> -void __noreturn fast_reboot(void);
> +void __noreturn fast_reboot_entry(void);
>  
> -void __noreturn fast_reboot(void)
> +void __noreturn fast_reboot_entry(void)
>  {
> -	static volatile bool fast_boot_release;
>  	struct cpu_thread *cpu;
>  
> -	printf("INIT: CPU PIR 0x%04x reset in\n", this_cpu()->pir);
> +	prlog(PR_DEBUG, "RESET: CPU PIR 0x%04x reset in\n",
> this_cpu()->pir);
> +	time_wait_ms(100);
>  
> -	/* If this CPU was chosen as the resettor, it must reset the
> -	 * resettee (the one that initiated the whole process
> -	 */
> -	if (this_cpu() == resettor)
> -		do_reset_core_p7(resettee);
> +	lock(&reset_lock);
> +	if (last_man_standing) {
> +		prlog(PR_DEBUG, "RESET: last man standing
> fixup...\n");
> +		set_direct_ctl(last_man_standing,
> P8_DIRECT_CTL_PRENAP);
> +		set_direct_ctl(last_man_standing,
> P8_DIRECT_CTL_SRESET);
> +		last_man_standing = NULL;
> +	}
> +	unlock(&reset_lock);
>  
>  	/* Are we the original boot CPU ? If not, we spin waiting
>  	 * for a relase signal from CPU 1, then we clean ourselves
> @@ -277,6 +349,8 @@ void __noreturn fast_reboot(void)
>  		__secondary_cpu_entry();
>  	}
>  
> +	prlog(PR_INFO, "RESET: Boot CPU waiting for
> everybody...\n");
> +
>  	/* We are the original boot CPU, wait for secondaries to
>  	 * be captured
>  	 */
> @@ -292,7 +366,7 @@ void __noreturn fast_reboot(void)
>  		smt_medium();
>  	}
>  
> -	printf("INIT: Releasing secondaries...\n");
> +	prlog(PR_INFO, "RESET: Releasing secondaries...\n");
>  
>  	/* Release everybody */
>  	fast_boot_release = true;
> @@ -310,7 +384,14 @@ void __noreturn fast_reboot(void)
>  		}
>  	}
>  
> -	printf("INIT: All done, resetting everything else...\n");
> +	prlog(PR_DEBUG, "RESET: Releasing special wakeups...\n");
> +
> +	for_each_cpu(cpu) {
> +		if (cpu->primary == cpu)
> +			clr_special_wakeup(cpu);
> +	}
> +
> +	prlog(PR_INFO, "RESET: All done, cleaning up...\n");
>  
>  	/* Clear release flag for next time */
>  	fast_boot_release = false;
> @@ -322,6 +403,8 @@ void __noreturn fast_reboot(void)
>  	/* Set our state to active */
>  	this_cpu()->state = cpu_state_active;
>  
> +	start_preload_kernel();
> +
>  	/* Poke the consoles (see comments in the code there) */
>  	fsp_console_reset();
>  
> @@ -331,15 +414,6 @@ void __noreturn fast_reboot(void)
>  	/* Remove all PCI devices */
>  	pci_reset();
>  
> -	/* Reset IO Hubs */
> -	cec_reset();
> -
> -	/* Re-Initialize all discovered PCI slots */
> -	pci_init_slots();
> -
> -	/* Clear memory */
> -#ifdef FAST_REBOOT_CLEARS_MEMORY
> -	memory_reset();
> -#endif
> +	/* Load and boot payload */
>  	load_and_boot_kernel(true);
>  }
> diff --git a/core/init.c b/core/init.c
> index ca3ad55..1a3d741 100644
> --- a/core/init.c
> +++ b/core/init.c
> @@ -287,7 +287,7 @@ extern uint64_t boot_offset;
>  static size_t kernel_size;
>  static size_t initramfs_size;
>  
> -static bool start_preload_kernel(void)
> +bool start_preload_kernel(void)
>  {
>  	int loaded;
>  
> @@ -384,6 +384,9 @@ static void load_initramfs(void)
>  {
>  	int loaded;
>  
> +	dt_check_del_prop(dt_chosen, "linux,initrd-start");
> +	dt_check_del_prop(dt_chosen, "linux,initrd-end");
> +
>  	loaded = wait_for_resource_loaded(RESOURCE_ID_INITRAMFS,
>  					  RESOURCE_SUBID_NONE);
>  
> @@ -447,6 +450,7 @@ void __noreturn load_and_boot_kernel(bool
> is_reboot)
>  
>  	/* Set kernel command line argument if specified */
>  #ifdef KERNEL_COMMAND_LINE
> +	dt_check_del_prop(dt_chosen, "bootargs");
>  	dt_add_property_string(dt_chosen, "bootargs",
> KERNEL_COMMAND_LINE);
>  #endif
>  
> diff --git a/core/lock.c b/core/lock.c
> index 53cc337..e82048b 100644
> --- a/core/lock.c
> +++ b/core/lock.c
> @@ -110,6 +110,9 @@ void unlock(struct lock *l)
>  	this_cpu()->lock_depth--;
>  	l->lock_val = 0;
>  
> +	/* WARNING: On fast reboot, we can be reset right at that
> +	 * point, so the reset_lock in there cannot be in the con
> path
> +	 */
>  	if (l->in_con_path) {
>  		cpu->con_suspend--;
>  		if (cpu->con_suspend == 0 && cpu->con_need_flush)
> diff --git a/core/pci.c b/core/pci.c
> index cbaea35..bbf4583 100644
> --- a/core/pci.c
> +++ b/core/pci.c
> @@ -1456,6 +1456,7 @@ static void __pci_reset(struct list_head *list)
>  
>  	while ((pd = list_pop(list, struct pci_device, link)) !=
> NULL) {
>  		__pci_reset(&pd->children);
> +		dt_free(pd->dn);
>  		free(pd);
>  	}
>  }
> @@ -1466,16 +1467,22 @@ void pci_reset(void)
>  
>  	prlog(PR_NOTICE, "PCI: Clearing all devices...\n");
>  
> -	/* This is a remnant of fast-reboot, not currently used */
>  
>  	/* XXX Do those in parallel (at least the power up
>  	 * state machine could be done in parallel)
>  	 */
>  	for (i = 0; i < ARRAY_SIZE(phbs); i++) {
> -		if (!phbs[i])
> +		struct phb *phb = phbs[i];
> +		if (!phb)
>  			continue;
> -		__pci_reset(&phbs[i]->devices);
> +		__pci_reset(&phb->devices);
> +		if (phb->ops->ioda_reset)
> +			phb->ops->ioda_reset(phb, true);
>  	}
> +
> +	/* Re-Initialize all discovered PCI slots */
> +	pci_init_slots();
> +
>  }
>  
>  static void pci_do_jobs(void (*fn)(void *))
> diff --git a/core/platform.c b/core/platform.c
> index de6e406..7915857 100644
> --- a/core/platform.c
> +++ b/core/platform.c
> @@ -52,9 +52,9 @@ static int64_t opal_cec_reboot(void)
>  
>  	console_complete_flush();
>  
> -#ifdef ENABLE_FAST_RESET
> +#ifdef ENABLE_FAST_REBOOT
>  	/* Try a fast reset first */
> -	fast_reset();
> +	fast_reboot();
>  #endif
>  	if (platform.cec_reboot)
>  		return platform.cec_reboot();
> diff --git a/hw/fsp/fsp-console.c b/hw/fsp/fsp-console.c
> index 87e509d..5e27197 100644
> --- a/hw/fsp/fsp-console.c
> +++ b/hw/fsp/fsp-console.c
> @@ -884,6 +884,9 @@ static void reopen_all_hvsi(void)
>  
>  void fsp_console_reset(void)
>  {
> +	if (!fsp_present())
> +		return;
> +
>  	prlog(PR_NOTICE, "FSP: Console reset !\n");
>  
>  	/* This is called on a fast-reset. To work around issues
> with HVSI
> @@ -985,6 +988,8 @@ void fsp_console_select_stdout(void)
>  			 */
>  		}
>  	}
> +	dt_check_del_prop(dt_chosen, "linux,stdout-path");
> +
>  	if (fsp_serials[1].open && use_serial) {
>  		dt_add_property_string(dt_chosen, "linux,stdout-
> path",
>  				       "/ibm,opal/consoles/serial at 1"
> );
> diff --git a/hw/occ.c b/hw/occ.c
> index b606a67..3d86f7a 100644
> --- a/hw/occ.c
> +++ b/hw/occ.c
> @@ -517,10 +517,14 @@ void occ_pstates_init(void)
>  	struct proc_chip *chip;
>  	struct cpu_thread *c;
>  	s8 pstate_nom;
> +	static bool occ_pstates_initialized;
>  
>  	/* OCC is P8 only */
>  	if (proc_gen != proc_gen_p8)
>  		return;
> +	/* Handle fast reboots */
> +	if (occ_pstates_initialized)
> +		return;
>  
>  	chip = next_chip(NULL);
>  	if (!chip->homer_base) {
> @@ -558,6 +562,7 @@ void occ_pstates_init(void)
>  	for_each_chip(chip)
>  		chip->throttle = 0;
>  	opal_add_poller(occ_throttle_poll, NULL);
> +	occ_pstates_initialized = true;
>  }
>  
>  struct occ_load_req {
> diff --git a/hw/psi.c b/hw/psi.c
> index 3efc177..03527f6 100644
> --- a/hw/psi.c
> +++ b/hw/psi.c
> @@ -432,34 +432,25 @@ static int64_t psi_p7_get_xive(struct
> irq_source *is, uint32_t isn __unused,
>  	return OPAL_SUCCESS;
>  }
>  
> +static const uint32_t psi_p8_irq_to_xivr[P8_IRQ_PSI_ALL_COUNT] = {
> +	[P8_IRQ_PSI_FSP]	= PSIHB_XIVR_FSP,
> +	[P8_IRQ_PSI_OCC]	= PSIHB_XIVR_OCC,
> +	[P8_IRQ_PSI_FSI]	= PSIHB_XIVR_FSI,
> +	[P8_IRQ_PSI_LPC]	= PSIHB_XIVR_LPC,
> +	[P8_IRQ_PSI_LOCAL_ERR]	= PSIHB_XIVR_LOCAL_ERR,
> +	[P8_IRQ_PSI_HOST_ERR]	= PSIHB_XIVR_HOST_ERR,
> +};
> +
>  static int64_t psi_p8_set_xive(struct irq_source *is, uint32_t isn,
>  			       uint16_t server, uint8_t priority)
>  {
>  	struct psi *psi = is->data;
>  	uint64_t xivr_p, xivr;
> +	uint32_t irq_idx = isn & 7;
>  
> -	switch(isn & 7) {
> -	case P8_IRQ_PSI_FSP:
> -		xivr_p = PSIHB_XIVR_FSP;
> -		break;
> -	case P8_IRQ_PSI_OCC:
> -		xivr_p = PSIHB_XIVR_OCC;
> -		break;
> -	case P8_IRQ_PSI_FSI:
> -		xivr_p = PSIHB_XIVR_FSI;
> -		break;
> -	case P8_IRQ_PSI_LPC:
> -		xivr_p = PSIHB_XIVR_LPC;
> -		break;
> -	case P8_IRQ_PSI_LOCAL_ERR:
> -		xivr_p = PSIHB_XIVR_LOCAL_ERR;
> -		break;
> -	case P8_IRQ_PSI_HOST_ERR:
> -		xivr_p = PSIHB_XIVR_HOST_ERR;
> -		break;
> -	default:
> +	if (irq_idx >= P8_IRQ_PSI_ALL_COUNT)
>  		return OPAL_PARAMETER;
> -	}
> +	xivr_p = psi_p8_irq_to_xivr[irq_idx];
>  
>  	/* Populate the XIVR */
>  	xivr  = (uint64_t)server << 40;
> @@ -476,29 +467,11 @@ static int64_t psi_p8_get_xive(struct
> irq_source *is, uint32_t isn __unused,
>  {
>  	struct psi *psi = is->data;
>  	uint64_t xivr_p, xivr;
> +	uint32_t irq_idx = isn & 7;
>  
> -	switch(isn & 7) {
> -	case P8_IRQ_PSI_FSP:
> -		xivr_p = PSIHB_XIVR_FSP;
> -		break;
> -	case P8_IRQ_PSI_OCC:
> -		xivr_p = PSIHB_XIVR_OCC;
> -		break;
> -	case P8_IRQ_PSI_FSI:
> -		xivr_p = PSIHB_XIVR_FSI;
> -		break;
> -	case P8_IRQ_PSI_LPC:
> -		xivr_p = PSIHB_XIVR_LPC;
> -		break;
> -	case P8_IRQ_PSI_LOCAL_ERR:
> -		xivr_p = PSIHB_XIVR_LOCAL_ERR;
> -		break;
> -	case P8_IRQ_PSI_HOST_ERR:
> -		xivr_p = PSIHB_XIVR_HOST_ERR;
> -		break;
> -	default:
> +	if (irq_idx >= P8_IRQ_PSI_ALL_COUNT)
>  		return OPAL_PARAMETER;
> -	}
> +	xivr_p = psi_p8_irq_to_xivr[irq_idx];
>  
>  	/* Read & decode the XIVR */
>  	xivr = in_be64(psi->regs + xivr_p);
> @@ -509,33 +482,41 @@ static int64_t psi_p8_get_xive(struct
> irq_source *is, uint32_t isn __unused,
>  	return OPAL_SUCCESS;
>  }
>  
> +static void psi_cleanup_irq(struct psi *psi)
> +{
> +	uint32_t irq;
> +	uint64_t xivr, xivr_p;
> +
> +	for (irq = 0; irq < P8_IRQ_PSI_ALL_COUNT; irq++) {
> +		printf("PSI[0x%03x]: Cleaning up IRQ %d\n",
> +		       psi->chip_id, irq);
> +
> +		xivr_p = psi_p8_irq_to_xivr[irq];
> +		xivr = in_be64(psi->regs + xivr_p);
> +		xivr |= (0xffull << 32);
> +		out_be64(psi->regs + xivr_p, xivr);
> +		time_wait_ms_nopoll(10);
> +		xivr = in_be64(psi->regs + xivr_p);
> +		if (xivr & PPC_BIT(39)) {
> +			printf(" Need EOI !\n");
> +			icp_send_eoi(psi->interrupt + irq);
> +		}
> +	}
> +}
> +
>  /* Called on a fast reset, make sure we aren't stuck with
>   * an accepted and never EOId PSI interrupt
>   */
>  void psi_irq_reset(void)
>  {
>  	struct psi *psi;
> -	uint64_t xivr;
>  
>  	printf("PSI: Hot reset!\n");
>  
> -	assert(proc_gen == proc_gen_p7);
> +	assert(proc_gen == proc_gen_p8);
>  
>  	list_for_each(&psis, psi, list) {
> -		/* Mask the interrupt & clean the XIVR */
> -		xivr = 0x000000ff00000000UL;
> -		xivr |=	P7_IRQ_BUID(psi->interrupt) << 16;
> -		out_be64(psi->regs + PSIHB_XIVR, xivr);
> -
> -#if 0 /* Seems to checkstop ... */
> -		/*
> -		 * Maybe not anymore; we were just blindly sending
> -		 * this on all iopaths, not just the active one;
> -		 * We don't even know if those psis are even
> correct.
> -		 */
> -		/* Send a dummy EOI to make sure the ICP is clear */
> -		icp_send_eoi(psi->interrupt);
> -#endif
> +		psi_cleanup_irq(psi);
>  	}
>  }
>  
> diff --git a/include/config.h b/include/config.h
> index 2524570..3163c65 100644
> --- a/include/config.h
> +++ b/include/config.h
> @@ -72,8 +72,8 @@
>   */
>  //#define FORCE_DUMMY_CONSOLE 1
>  
> -/* Enable this to do fast resets. Currently unreliable... */
> -//#define ENABLE_FAST_RESET	1
> +/* Enable this to do fast reboots. Currently unreliable... */
> +#define ENABLE_FAST_REBOOT	1
>  
>  /* Enable this to make fast reboot clear memory */
>  //#define FAST_REBOOT_CLEARS_MEMORY	1
> diff --git a/include/device.h b/include/device.h
> index ed4fc46..4198a41 100644
> --- a/include/device.h
> +++ b/include/device.h
> @@ -119,6 +119,8 @@ static inline struct dt_property
> *dt_add_property_u64(struct dt_node *node,
>  
>  void dt_del_property(struct dt_node *node, struct dt_property
> *prop);
>  
> +void dt_check_del_prop(struct dt_node *node, const char *name);
> +
>  /* Warning: moves *prop! */
>  void dt_resize_property(struct dt_property **prop, size_t len);
>  
> diff --git a/include/skiboot.h b/include/skiboot.h
> index 72cda14..d073cf5 100644
> --- a/include/skiboot.h
> +++ b/include/skiboot.h
> @@ -190,12 +190,13 @@ extern unsigned long get_symbol(unsigned long
> addr,
>  				char **sym, char **sym_end);
>  
>  /* Fast reboot support */
> -extern void fast_reset(void);
> +extern void fast_reboot(void);
>  extern void __noreturn __secondary_cpu_entry(void);
>  extern void __noreturn load_and_boot_kernel(bool is_reboot);
>  extern void cleanup_tlb(void);
>  extern void init_shared_sprs(void);
>  extern void init_replicated_sprs(void);
> +extern bool start_preload_kernel(void);
>  
>  /* Various probe routines, to replace with an initcall system */
>  extern void probe_p7ioc(void);


More information about the Skiboot mailing list