[Skiboot] [RFC/WIP/PATCH] Fast reboot for P8
Benjamin Herrenschmidt
benh at kernel.crashing.org
Fri Jul 22 18:52:51 AEST 2016
On Fri, 2016-07-22 at 17:17 +1000, Benjamin Herrenschmidt wrote:
> This is an experimental patch that implements "Fast reboot" on P8
> machines.
Found bugs — the TLB is not flushed, among others. Monitor the github benh-wip
branch for now; I'll re-post when I've done more testing.
> The basic idea is that when the OS calls OPAL reboot, we gather all
> the threads in the system using a combination of patching the reset
> vector and soft-resetting them, then cleanup a few bits of hardware
> (we do re-probe PCIe for example), and reload & restart the
> bootloader.
>
> This is very experimental and needs a lot of testing and also
> auditing
> code for other bits of HW that might need to be cleaned up. I also
> need
> to check if we are properly PERST'ing PCI devices.
>
> I've successfully fast rebooted a Habanero a few times.
>
> This is partially based on old code I had to do that on P7. I only
> support it on P8 though as there are issues with the PSI interrupts
> on P7 that cannot be reliably solved.
>
> Not-yet-signed-off-by: Benjamin Herrenschmidt <benh at kernel.crashing.o
> rg>
> ---
>
> asm/head.S | 125 +++++++--------
> core/device.c | 8 +
> core/fast-reboot.c | 438 ++++++++++++++++++++++++++++++-----------
> ----------
> core/init.c | 6 +-
> core/lock.c | 3 +
> core/pci.c | 13 +-
> core/platform.c | 4 +-
> hw/fsp/fsp-console.c | 5 +
> hw/occ.c | 5 +
> hw/psi.c | 97 +++++-------
> include/config.h | 4 +-
> include/device.h | 2 +
> include/skiboot.h | 3 +-
> 13 files changed, 402 insertions(+), 311 deletions(-)
>
> diff --git a/asm/head.S b/asm/head.S
> index e92f9b8..0c5649d 100644
> --- a/asm/head.S
> +++ b/asm/head.S
> @@ -294,7 +294,7 @@ boot_entry:
> bne secondary_wait
>
> /* Initialize per-core SPRs */
> - bl init_shared_sprs
> + bl init_shared_sprs
>
> /* Pick a boot CPU, cpu index in r31 */
> LOAD_IMM32(%r3, boot_sem - __head)
> @@ -311,7 +311,7 @@ boot_entry:
> smt_medium
>
> /* Initialize thread SPRs */
> - bl init_replicated_sprs
> + bl init_replicated_sprs
>
> /* Save the initial offset. The secondary threads will spin
> on boot_flag
> * before relocation so we need to keep track of its
> location to wake
> @@ -410,11 +410,11 @@ secondary_wait:
> add %r3,%r3,%r30
> mtctr %r3
> isync
> - bctr
> + bctr
> 1:
> /* Now wait for cpu_secondary_start to be set */
> LOAD_ADDR_FROM_TOC(%r3, cpu_secondary_start)
> -1: smt_very_low
> +1: smt_very_low
> ld %r0,0(%r3)
> cmpdi %r0,0
> beq 1b
> @@ -457,64 +457,6 @@ call_relocate:
> 1: /* Fatal relocate failure */
> attn
>
> -/* This is a little piece of code that is copied down to
> - * 0x100 when doing a "fast reset"
> - */
> -.global fast_reset_patch_start
> -fast_reset_patch_start:
> - smt_medium
> - LOAD_IMM64(%r30, SKIBOOT_BASE)
> - LOAD_IMM32(%r3, fast_reset_entry - __head)
> - add %r3,%r30,%r3
> - mtctr %r3
> - bctr
> -.global fast_reset_patch_end
> -fast_reset_patch_end:
> -
> -/* Fast reset code. We clean up the TLB and a few SPRs and
> - * return to C code. All CPUs do that, the CPU triggering the
> - * reset does it to itself last. The C code will sort out who
> - * the master is. We come from the trampoline above with
> - * r30 containing SKIBOOT_BASE
> - */
> -fast_reset_entry:
> - /* Clear out SLB */
> - li %r6,0
> - slbmte %r6,%r6
> - slbia
> - ptesync
> -
> - /* Get PIR */
> - mfspr %r31,SPR_PIR
> -
> - /* Get a stack and restore r13 */
> - GET_STACK(%r1,%r31)
> - li %r3,0
> - std %r3,0(%r1)
> - std %r3,8(%r1)
> - std %r3,16(%r1)
> - GET_CPU()
> -
> - /* Get our TOC */
> - addis %r2,%r30,(__toc_start - __head)@ha
> - addi %r2,%r2,(__toc_start - __head)@l
> -
> - /* Go to C ! */
> - bl fast_reboot
> - b .
> -
> -.global cleanup_tlb
> -cleanup_tlb:
> - /* Clean the TLB */
> - li %r3,128
> - mtctr %r3
> - li %r4,0x800 /* IS field = 0b10 */
> - ptesync
> -1: tlbiel %r4
> - addi %r4,%r4,0x1000
> - bdnz 1b
> - ptesync
> -
> #define FIXUP_ENDIAN \
> tdi 0,0,0x48; /* Reverse endian of b . + 8 */ \
> b $+36; /* Skip trampoline if endian is good */ \
> @@ -652,6 +594,65 @@ rvwinkle_restore:
> mtlr %r0
> blr
>
> +/* This is a little piece of code that is copied down to
> + * 0x100 when doing a "fast reset"
> + */
> +.global fast_reset_patch_start
> +fast_reset_patch_start:
> + FIXUP_ENDIAN
> + smt_medium
> + LOAD_IMM64(%r30, SKIBOOT_BASE)
> + LOAD_IMM32(%r3, fast_reset_entry - __head)
> + add %r3,%r30,%r3
> + mtctr %r3
> + bctr
> +.global fast_reset_patch_end
> +fast_reset_patch_end:
> +
> +/* Fast reset code. We clean up the TLB and a few SPRs and
> + * return to C code. All CPUs do that, the CPU triggering the
> + * reset does it to itself last. The C code will sort out who
> + * the master is. We come from the trampoline above with
> + * r30 containing SKIBOOT_BASE
> + */
> +fast_reset_entry:
> + /* Clear out SLB */
> + li %r6,0
> + slbmte %r6,%r6
> + slbia
> + ptesync
> +
> + /* Get PIR */
> + mfspr %r31,SPR_PIR
> +
> + /* Get a stack and restore r13 */
> + GET_STACK(%r1,%r31)
> + li %r3,0
> + std %r3,0(%r1)
> + std %r3,8(%r1)
> + std %r3,16(%r1)
> + GET_CPU()
> +
> + /* Get our TOC */
> + addis %r2,%r30,(__toc_start - __head)@ha
> + addi %r2,%r2,(__toc_start - __head)@l
> +
> + /* Go to C ! */
> + bl fast_reboot_entry
> + b .
> +
> +.global cleanup_tlb
> +cleanup_tlb:
> + /* Clean the TLB */
> + li %r3,128
> + mtctr %r3
> + li %r4,0x800 /* IS field = 0b10 */
> + ptesync
> +1: tlbiel %r4
> + addi %r4,%r4,0x1000
> + bdnz 1b
> + ptesync
> +
> /* Functions to initialize replicated and shared SPRs to sane
> * values. This is called at boot and on soft-reset
> */
> diff --git a/core/device.c b/core/device.c
> index 9e7ef0d..e7b53a8 100644
> --- a/core/device.c
> +++ b/core/device.c
> @@ -581,6 +581,14 @@ const struct dt_property *dt_find_property(const
> struct dt_node *node,
> return NULL;
> }
>
> +void dt_check_del_prop(struct dt_node *node, const char *name)
> +{
> + struct dt_property *p;
> +
> + p = __dt_find_property(node, name);
> + if (p)
> + dt_del_property(node, p);
> +}
> const struct dt_property *dt_require_property(const struct dt_node
> *node,
> const char *name, int
> wanted_len)
> {
> diff --git a/core/fast-reboot.c b/core/fast-reboot.c
> index 30b77e9..1a7f2cc 100644
> --- a/core/fast-reboot.c
> +++ b/core/fast-reboot.c
> @@ -26,179 +26,283 @@
> #include <pci.h>
> #include <chip.h>
>
> -/*
> - * To get control of all threads, we sreset them via XSCOM after
> - * patching the 0x100 vector. This will work as long as the target
> - * HRMOR is 0. If Linux ever uses HRMOR, we'll have to consider
> - * a more messy approach.
> - *
> - * The SCOM register we want is called "Core RAS Control" in the doc
> - * and EX0.EC.PC.TCTL_GENERATE#0.TCTL.DIRECT_CONTROLS in the SCOM
> list
> - *
> - * Bits in there change from CPU rev to CPU rev but the bit we care
> - * about, bit 60 "sreset_request" appears to have stuck to the same
> - * place in both P7 and P7+. The register also has the same SCOM
> - * address
> - */
> -#define EX0_TCTL_DIRECT_CONTROLS0 0x08010400
> -#define EX0_TCTL_DIRECT_CONTROLS1 0x08010440
> -#define EX0_TCTL_DIRECT_CONTROLS2 0x08010480
> -#define EX0_TCTL_DIRECT_CONTROLS3 0x080104c0
> -#define TCTL_DC_SRESET_REQUEST PPC_BIT(60)
> +#define P8_EX_TCTL_DIRECT_CONTROLS(t) (0x10013000 + (t) *
> 0x10)
> +#define P8_DIRECT_CTL_STOP PPC_BIT(63)
> +#define P8_DIRECT_CTL_PRENAP PPC_BIT(47)
> +#define P8_DIRECT_CTL_SRESET PPC_BIT(60)
> +
>
> /* Flag tested by the OPAL entry code */
> uint8_t reboot_in_progress;
> -static struct cpu_thread *resettor, *resettee;
> +static volatile bool fast_boot_release;
> +static struct cpu_thread *last_man_standing;
> +static struct lock reset_lock = LOCK_UNLOCKED;
>
> -static void flush_caches(void)
> +static int set_special_wakeup(struct cpu_thread *cpu)
> {
> - uint64_t base = SKIBOOT_BASE;
> - uint64_t end = base + SKIBOOT_SIZE;
> + uint64_t val, poll_target, stamp;
> + uint32_t core_id;
> + int rc;
>
> - /* Not sure what the effect of sreset is on cores, so let's
> - * shoot a series of dcbf's on all cachelines that make up
> - * our core memory just in case...
> + /*
> + * Note: HWP checks for checkstops, but I assume we don't
> need to
> + * as we wouldn't be running if one was present
> */
> - while(base < end) {
> - asm volatile("dcbf 0,%0" : : "r" (base) : "memory");
> - base += 128;
> +
> + /* Grab core ID once */
> + core_id = pir_to_core_id(cpu->pir);
> +
> + prlog(PR_DEBUG, "RESET Waking up core 0x%x\n", core_id);
> +
> + /*
> + * The original HWp reads the XSCOM first but ignores the
> result
> + * and error, let's do the same until I know for sure that
> is
> + * not necessary
> + */
> + xscom_read(cpu->chip_id,
> + XSCOM_ADDR_P8_EX_SLAVE(core_id,
> EX_PM_SPECIAL_WAKEUP_PHYP),
> + &val);
> +
> + /* Then we write special wakeup */
> + rc = xscom_write(cpu->chip_id,
> + XSCOM_ADDR_P8_EX_SLAVE(core_id,
> + EX_PM_SPECIAL_WAKEUP
> _PHYP),
> + PPC_BIT(0));
> + if (rc) {
> + prerror("RESET: XSCOM error %d asserting special"
> + " wakeup on 0x%x\n", rc, cpu->pir);
> + return rc;
> }
> - sync();
> +
> + /*
> + * HWP uses the history for Perf register here, dunno why it
> uses
> + * that one instead of the pHyp one, maybe to avoid
> clobbering it...
> + *
> + * In any case, it does that to check for run/nap
> vs.sleep/winkle/other
> + * to decide whether to poll on checkstop or not. Since we
> don't deal
> + * with checkstop conditions here, we ignore that part.
> + */
> +
> + /*
> + * Now poll for completion of special wakeup. The HWP is
> nasty here,
> + * it will poll at 5ms intervals for up to 200ms. This is
> not quite
> + * acceptable for us at runtime, at least not until we have
> the
> + * ability to "context switch" HBRT. In practice, because we
> don't
> + * winkle, it will never take that long, so we increase the
> polling
> + * frequency to 1us per poll. However we do have to keep the
> same
> + * timeout.
> + *
> + * We don't use time_wait_ms() either for now as we don't
> want to
> + * poll the FSP here.
> + */
> + stamp = mftb();
> + poll_target = stamp + msecs_to_tb(200);
> + val = 0;
> + while (!(val & EX_PM_GP0_SPECIAL_WAKEUP_DONE)) {
> + /* Wait 1 us */
> + time_wait_us(1);
> +
> + /* Read PM state */
> + rc = xscom_read(cpu->chip_id,
> + XSCOM_ADDR_P8_EX_SLAVE(core_id,
> EX_PM_GP0),
> + &val);
> + if (rc) {
> + prerror("RESET: XSCOM error %d reading PM
> state on"
> + " 0x%x\n", rc, cpu->pir);
> + return rc;
> + }
> + /* Check timeout */
> + if (mftb() > poll_target)
> + break;
> + }
> +
> + /* Success ? */
> + if (val & EX_PM_GP0_SPECIAL_WAKEUP_DONE) {
> + uint64_t now = mftb();
> + prlog(PR_TRACE, "RESET: Special wakeup complete
> after %ld us\n",
> + tb_to_usecs(now - stamp));
> + return 0;
> + }
> +
> + /*
> + * We timed out ...
> + *
> + * HWP has a complex workaround for HW255321 which affects
> + * Murano DD1 and Venice DD1. Ignore that for now
> + *
> + * Instead we just dump some XSCOMs for error logging
> + */
> + prerror("RESET: Timeout on special wakeup of 0x%0x\n", cpu-
> >pir);
> + prerror("RESET: PM0 = 0x%016llx\n", val);
> + val = -1;
> + xscom_read(cpu->chip_id,
> + XSCOM_ADDR_P8_EX_SLAVE(core_id,
> EX_PM_SPECIAL_WAKEUP_PHYP),
> + &val);
> + prerror("RESET: SPC_WKUP = 0x%016llx\n", val);
> + val = -1;
> + xscom_read(cpu->chip_id,
> + XSCOM_ADDR_P8_EX_SLAVE(core_id,
> + EX_PM_IDLE_STATE_HISTORY_P
> HYP),
> + &val);
> + prerror("RESET: HISTORY = 0x%016llx\n", val);
> +
> + return OPAL_HARDWARE;
> }
>
> -static bool do_reset_core_p7(struct cpu_thread *cpu)
> +static int clr_special_wakeup(struct cpu_thread *cpu)
> {
> - uint32_t xscom_addr, chip;
> - uint64_t ctl;
> + uint64_t val;
> + uint32_t core_id;
> int rc;
>
> - /* Add the Core# */
> - xscom_addr = EX0_TCTL_DIRECT_CONTROLS0;
> - xscom_addr |= ((cpu->pir >> 2) & 7) << 24;
> + /*
> + * Note: HWP checks for checkstops, but I assume we don't
> need to
> + * as we wouldn't be running if one was present
> + */
> +
> + /* Grab core ID once */
> + core_id = pir_to_core_id(cpu->pir);
>
> - chip = pir_to_chip_id(cpu->pir);
> + prlog(PR_DEBUG, "RESET: Releasing core 0x%x wakeup\n",
> core_id);
>
> - ctl = TCTL_DC_SRESET_REQUEST;
> - rc = xscom_write(chip, xscom_addr, ctl);
> - rc |= xscom_write(chip, xscom_addr + 0x40, ctl);
> - rc |= xscom_write(chip, xscom_addr + 0x80, ctl);
> - rc |= xscom_write(chip, xscom_addr + 0xc0, ctl);
> + /*
> + * The original HWp reads the XSCOM first but ignores the
> result
> + * and error, let's do the same until I know for sure that
> is
> + * not necessary
> + */
> + xscom_read(cpu->chip_id,
> + XSCOM_ADDR_P8_EX_SLAVE(core_id,
> EX_PM_SPECIAL_WAKEUP_PHYP),
> + &val);
> +
> + /* Then we write special wakeup */
> + rc = xscom_write(cpu->chip_id,
> + XSCOM_ADDR_P8_EX_SLAVE(core_id,
> + EX_PM_SPECIAL_WAKEUP
> _PHYP), 0);
> if (rc) {
> - prerror("RESET: Error %d resetting CPU 0x%04x\n",
> - rc, cpu->pir);
> - return false;
> + prerror("RESET: XSCOM error %d deasserting"
> + " special wakeup on 0x%x\n", rc, cpu->pir);
> + return rc;
> }
> - return true;
> +
> + /*
> + * The original HWp reads the XSCOM again with the comment
> + * "This puts an inherent delay in the propagation of the
> reset
> + * transition"
> + */
> + xscom_read(cpu->chip_id,
> + XSCOM_ADDR_P8_EX_SLAVE(core_id,
> EX_PM_SPECIAL_WAKEUP_PHYP),
> + &val);
> +
> + return 0;
> }
>
> -static void fast_reset_p7(void)
> +static void set_direct_ctl(struct cpu_thread *cpu, uint64_t bits)
> {
> - struct cpu_thread *cpu;
> + uint32_t core_id = pir_to_core_id(cpu->pir);
> + uint32_t chip_id = pir_to_chip_id(cpu->pir);
> + uint32_t thread_id = pir_to_thread_id(cpu->pir);
> + uint32_t xscom_addr;
>
> - resettee = this_cpu();
> - resettor = NULL;
> + xscom_addr = XSCOM_ADDR_P8_EX(core_id,
> P8_EX_TCTL_DIRECT_CONTROLS(thread_id));
>
> - /* Pick up a candidate resettor. We do that before we flush
> - * the caches
> - */
> - for_each_cpu(cpu) {
> - /*
> - * Some threads might still be in skiboot.
> - *
> - * But because we deal with entire cores and we
> don't want
> - * to special case things, we are just going to
> reset them
> - * too making the assumption that this is safe, they
> are
> - * holding no locks. This can only be true if they
> don't
> - * have jobs scheduled which is hopefully the case.
> - */
> - if (cpu->state != cpu_state_os &&
> - cpu->state != cpu_state_active)
> - continue;
> + xscom_write(chip_id, xscom_addr, bits);
> +}
>
> - /*
> - * Only hit cores and only if they aren't on the
> same core
> - * as ourselves
> - */
> - if (cpu_get_thread0(cpu) ==
> cpu_get_thread0(this_cpu()) ||
> - cpu->pir & 0x3)
> - continue;
> +static void patch_reset_vector(void)
> +{
> + extern uint32_t fast_reset_patch_start;
> + extern uint32_t fast_reset_patch_end;
> + uint32_t *dst, *src;
> +
> + /* Copy reset trampoline */
> + prlog(PR_DEBUG, "RESET: Copying reset trampoline...\n");
> + src = &fast_reset_patch_start;
> + dst = (uint32_t *)0x100;
> + while(src < &fast_reset_patch_end)
> + *(dst++) = *(src++);
> + sync_icache();
> +}
> +
> +static bool fast_reset_p8(void)
> +{
> + struct cpu_thread *cpu;
> +
> + /* Mark ourselves as last man standing in need of a reset */
> + last_man_standing = this_cpu();
>
> - /* Pick up one of those guys as our "resettor". It
> will be
> - * in charge of resetting this CPU. We avoid
> resetting
> - * ourselves, not sure how well it would do with
> SCOM
> - */
> - resettor = cpu;
> - break;
> + prlog(PR_DEBUG, "RESET: Resetting from cpu: 0x%x (core
> 0x%x)\n",
> + this_cpu()->pir, pir_to_core_id(this_cpu()->pir));
> +
> + /* Assert special wakup on all cores */
> + for_each_cpu(cpu) {
> + if (cpu->primary == cpu)
> + if (set_special_wakeup(cpu) != OPAL_SUCCESS)
> + return false;
> }
>
> - if (!resettor) {
> - printf("RESET: Can't find a resettor !\n");
> - return;
> + prlog(PR_DEBUG, "RESET: Stopping the world...\n");
> +
> + /* Put everybody in stop except myself */
> + for_each_cpu(cpu) {
> + if (cpu != this_cpu())
> + set_direct_ctl(cpu, P8_DIRECT_CTL_STOP);
> }
> - printf("RESET: Resetting from 0x%04x, resettor 0x%04x\n",
> - this_cpu()->pir, resettor->pir);
>
> - printf("RESET: Flushing caches...\n");
> + /* Patch reset */
> + patch_reset_vector();
>
> - /* Is that necessary ? */
> - flush_caches();
> + prlog(PR_DEBUG, "RESET: Pre-napping all threads but
> one...\n");
>
> - /* Reset everybody except self and except resettor */
> + /* Put everybody in pre-nap except myself */
> for_each_cpu(cpu) {
> - if (cpu->state != cpu_state_os &&
> - cpu->state != cpu_state_active)
> - continue;
> - if (cpu_get_thread0(cpu) ==
> cpu_get_thread0(this_cpu()) ||
> - cpu->pir & 0x3)
> - continue;
> - if (cpu_get_thread0(cpu) ==
> cpu_get_thread0(resettor))
> - continue;
> + if (cpu != this_cpu())
> + set_direct_ctl(cpu, P8_DIRECT_CTL_PRENAP);
> + }
>
> - printf("RESET: Resetting CPU 0x%04x...\n", cpu-
> >pir);
> + prlog(PR_DEBUG, "RESET: Resetting all threads but
> one...\n");
>
> - if (!do_reset_core_p7(cpu))
> - return;
> + /* Reset everybody except my own core threads */
> + for_each_cpu(cpu) {
> + if (cpu != this_cpu())
> + set_direct_ctl(cpu, P8_DIRECT_CTL_SRESET);
> }
>
> - /* Reset the resettor last because it's going to kill me !
> */
> - printf("RESET: Resetting CPU 0x%04x...\n", resettor->pir);
> - if (!do_reset_core_p7(resettor))
> - return;
> -
> - /* Don't return */
> - for (;;)
> - ;
> + return true;
> }
>
> -void fast_reset(void)
> +void fast_reboot(void)
> {
> - uint32_t pvr = mfspr(SPR_PVR);
> - extern uint32_t fast_reset_patch_start;
> - extern uint32_t fast_reset_patch_end;
> - uint32_t *dst, *src;
> + bool success;
>
> - printf("RESET: Fast reboot request !\n");
> + if (proc_gen != proc_gen_p8)
> + return;
> +
> + prlog(PR_INFO, "RESET: Initiating fast reboot...\n");
>
> /* XXX We need a way to ensure that no other CPU is in
> skiboot
> * holding locks (via the OPAL APIs) and if they are, we
> need
> - * for them to get out
> + * for them to get out. Hopefully that isn't happening,
> but...
> + *
> + * To fix this properly, we want to keep track of OPAL
> entry/exit
> + * on all CPUs.
> */
> reboot_in_progress = 1;
> time_wait_ms(200);
>
> - /* Copy reset trampoline */
> - printf("RESET: Copying reset trampoline...\n");
> - src = &fast_reset_patch_start;
> - dst = (uint32_t *)0x100;
> - while(src < &fast_reset_patch_end)
> - *(dst++) = *(src++);
> - sync_icache();
> + /* Lock so the new guys coming don't reset us */
> + lock(&reset_lock);
>
> - switch(PVR_TYPE(pvr)) {
> - case PVR_TYPE_P7:
> - case PVR_TYPE_P7P:
> - fast_reset_p7();
> - }
> + fast_boot_release = false;
> +
> + success = fast_reset_p8();
> +
> + /* Unlock, at this point we go away */
> + unlock(&reset_lock);
> +
> + if (success)
> + /* Don't return */
> + for (;;)
> + ;
> }
>
> static void cleanup_cpu_state(void)
> @@ -211,56 +315,24 @@ static void cleanup_cpu_state(void)
> reset_cpu_icp();
> }
>
> -#ifdef FAST_REBOOT_CLEARS_MEMORY
> -static void fast_mem_clear(uint64_t start, uint64_t end)
> -{
> - printf("MEMORY: Clearing %llx..%llx\n", start, end);
> -
> - while(start < end) {
> - asm volatile("dcbz 0,%0" : : "r" (start) :
> "memory");
> - start += 128;
> - }
> -}
> -
> -static void memory_reset(void)
> -{
> - struct address_range *i;
> - uint64_t skistart = SKIBOOT_BASE;
> - uint64_t skiend = SKIBOOT_BASE + SKIBOOT_SIZE;
> -
> - printf("MEMORY: Clearing ...\n");
> -
> - list_for_each(&address_ranges, i, list) {
> - uint64_t start = cleanup_addr(i->arange->start);
> - uint64_t end = cleanup_addr(i->arange->end);
> -
> - if (start >= skiend || end <= skistart)
> - fast_mem_clear(start, end);
> - else {
> - if (start < skistart)
> - fast_mem_clear(start, skistart);
> - if (end > skiend)
> - fast_mem_clear(skiend, end);
> - }
> - }
> -}
> -#endif /* FAST_REBOOT_CLEARS_MEMORY */
> -
> /* Entry from asm after a fast reset */
> -void __noreturn fast_reboot(void);
> +void __noreturn fast_reboot_entry(void);
>
> -void __noreturn fast_reboot(void)
> +void __noreturn fast_reboot_entry(void)
> {
> - static volatile bool fast_boot_release;
> struct cpu_thread *cpu;
>
> - printf("INIT: CPU PIR 0x%04x reset in\n", this_cpu()->pir);
> + prlog(PR_DEBUG, "RESET: CPU PIR 0x%04x reset in\n",
> this_cpu()->pir);
> + time_wait_ms(100);
>
> - /* If this CPU was chosen as the resettor, it must reset the
> - * resettee (the one that initiated the whole process
> - */
> - if (this_cpu() == resettor)
> - do_reset_core_p7(resettee);
> + lock(&reset_lock);
> + if (last_man_standing) {
> + prlog(PR_DEBUG, "RESET: last man standing
> fixup...\n");
> + set_direct_ctl(last_man_standing,
> P8_DIRECT_CTL_PRENAP);
> + set_direct_ctl(last_man_standing,
> P8_DIRECT_CTL_SRESET);
> + last_man_standing = NULL;
> + }
> + unlock(&reset_lock);
>
> /* Are we the original boot CPU ? If not, we spin waiting
> * for a relase signal from CPU 1, then we clean ourselves
> @@ -277,6 +349,8 @@ void __noreturn fast_reboot(void)
> __secondary_cpu_entry();
> }
>
> + prlog(PR_INFO, "RESET: Boot CPU waiting for
> everybody...\n");
> +
> /* We are the original boot CPU, wait for secondaries to
> * be captured
> */
> @@ -292,7 +366,7 @@ void __noreturn fast_reboot(void)
> smt_medium();
> }
>
> - printf("INIT: Releasing secondaries...\n");
> + prlog(PR_INFO, "RESET: Releasing secondaries...\n");
>
> /* Release everybody */
> fast_boot_release = true;
> @@ -310,7 +384,14 @@ void __noreturn fast_reboot(void)
> }
> }
>
> - printf("INIT: All done, resetting everything else...\n");
> + prlog(PR_DEBUG, "RESET: Releasing special wakeups...\n");
> +
> + for_each_cpu(cpu) {
> + if (cpu->primary == cpu)
> + clr_special_wakeup(cpu);
> + }
> +
> + prlog(PR_INFO, "RESET: All done, cleaning up...\n");
>
> /* Clear release flag for next time */
> fast_boot_release = false;
> @@ -322,6 +403,8 @@ void __noreturn fast_reboot(void)
> /* Set our state to active */
> this_cpu()->state = cpu_state_active;
>
> + start_preload_kernel();
> +
> /* Poke the consoles (see comments in the code there) */
> fsp_console_reset();
>
> @@ -331,15 +414,6 @@ void __noreturn fast_reboot(void)
> /* Remove all PCI devices */
> pci_reset();
>
> - /* Reset IO Hubs */
> - cec_reset();
> -
> - /* Re-Initialize all discovered PCI slots */
> - pci_init_slots();
> -
> - /* Clear memory */
> -#ifdef FAST_REBOOT_CLEARS_MEMORY
> - memory_reset();
> -#endif
> + /* Load and boot payload */
> load_and_boot_kernel(true);
> }
> diff --git a/core/init.c b/core/init.c
> index ca3ad55..1a3d741 100644
> --- a/core/init.c
> +++ b/core/init.c
> @@ -287,7 +287,7 @@ extern uint64_t boot_offset;
> static size_t kernel_size;
> static size_t initramfs_size;
>
> -static bool start_preload_kernel(void)
> +bool start_preload_kernel(void)
> {
> int loaded;
>
> @@ -384,6 +384,9 @@ static void load_initramfs(void)
> {
> int loaded;
>
> + dt_check_del_prop(dt_chosen, "linux,initrd-start");
> + dt_check_del_prop(dt_chosen, "linux,initrd-end");
> +
> loaded = wait_for_resource_loaded(RESOURCE_ID_INITRAMFS,
> RESOURCE_SUBID_NONE);
>
> @@ -447,6 +450,7 @@ void __noreturn load_and_boot_kernel(bool
> is_reboot)
>
> /* Set kernel command line argument if specified */
> #ifdef KERNEL_COMMAND_LINE
> + dt_check_del_prop(dt_chosen, "bootargs");
> dt_add_property_string(dt_chosen, "bootargs",
> KERNEL_COMMAND_LINE);
> #endif
>
> diff --git a/core/lock.c b/core/lock.c
> index 53cc337..e82048b 100644
> --- a/core/lock.c
> +++ b/core/lock.c
> @@ -110,6 +110,9 @@ void unlock(struct lock *l)
> this_cpu()->lock_depth--;
> l->lock_val = 0;
>
> + /* WARNING: On fast reboot, we can be reset right at that
> + * point, so the reset_lock in there cannot be in the con
> path
> + */
> if (l->in_con_path) {
> cpu->con_suspend--;
> if (cpu->con_suspend == 0 && cpu->con_need_flush)
> diff --git a/core/pci.c b/core/pci.c
> index cbaea35..bbf4583 100644
> --- a/core/pci.c
> +++ b/core/pci.c
> @@ -1456,6 +1456,7 @@ static void __pci_reset(struct list_head *list)
>
> while ((pd = list_pop(list, struct pci_device, link)) !=
> NULL) {
> __pci_reset(&pd->children);
> + dt_free(pd->dn);
> free(pd);
> }
> }
> @@ -1466,16 +1467,22 @@ void pci_reset(void)
>
> prlog(PR_NOTICE, "PCI: Clearing all devices...\n");
>
> - /* This is a remnant of fast-reboot, not currently used */
>
> /* XXX Do those in parallel (at least the power up
> * state machine could be done in parallel)
> */
> for (i = 0; i < ARRAY_SIZE(phbs); i++) {
> - if (!phbs[i])
> + struct phb *phb = phbs[i];
> + if (!phb)
> continue;
> - __pci_reset(&phbs[i]->devices);
> + __pci_reset(&phb->devices);
> + if (phb->ops->ioda_reset)
> + phb->ops->ioda_reset(phb, true);
> }
> +
> + /* Re-Initialize all discovered PCI slots */
> + pci_init_slots();
> +
> }
>
> static void pci_do_jobs(void (*fn)(void *))
> diff --git a/core/platform.c b/core/platform.c
> index de6e406..7915857 100644
> --- a/core/platform.c
> +++ b/core/platform.c
> @@ -52,9 +52,9 @@ static int64_t opal_cec_reboot(void)
>
> console_complete_flush();
>
> -#ifdef ENABLE_FAST_RESET
> +#ifdef ENABLE_FAST_REBOOT
> /* Try a fast reset first */
> - fast_reset();
> + fast_reboot();
> #endif
> if (platform.cec_reboot)
> return platform.cec_reboot();
> diff --git a/hw/fsp/fsp-console.c b/hw/fsp/fsp-console.c
> index 87e509d..5e27197 100644
> --- a/hw/fsp/fsp-console.c
> +++ b/hw/fsp/fsp-console.c
> @@ -884,6 +884,9 @@ static void reopen_all_hvsi(void)
>
> void fsp_console_reset(void)
> {
> + if (!fsp_present())
> + return;
> +
> prlog(PR_NOTICE, "FSP: Console reset !\n");
>
> /* This is called on a fast-reset. To work around issues
> with HVSI
> @@ -985,6 +988,8 @@ void fsp_console_select_stdout(void)
> */
> }
> }
> + dt_check_del_prop(dt_chosen, "linux,stdout-path");
> +
> if (fsp_serials[1].open && use_serial) {
> dt_add_property_string(dt_chosen, "linux,stdout-
> path",
> "/ibm,opal/consoles/serial at 1"
> );
> diff --git a/hw/occ.c b/hw/occ.c
> index b606a67..3d86f7a 100644
> --- a/hw/occ.c
> +++ b/hw/occ.c
> @@ -517,10 +517,14 @@ void occ_pstates_init(void)
> struct proc_chip *chip;
> struct cpu_thread *c;
> s8 pstate_nom;
> + static bool occ_pstates_initialized;
>
> /* OCC is P8 only */
> if (proc_gen != proc_gen_p8)
> return;
> + /* Handle fast reboots */
> + if (occ_pstates_initialized)
> + return;
>
> chip = next_chip(NULL);
> if (!chip->homer_base) {
> @@ -558,6 +562,7 @@ void occ_pstates_init(void)
> for_each_chip(chip)
> chip->throttle = 0;
> opal_add_poller(occ_throttle_poll, NULL);
> + occ_pstates_initialized = true;
> }
>
> struct occ_load_req {
> diff --git a/hw/psi.c b/hw/psi.c
> index 3efc177..03527f6 100644
> --- a/hw/psi.c
> +++ b/hw/psi.c
> @@ -432,34 +432,25 @@ static int64_t psi_p7_get_xive(struct
> irq_source *is, uint32_t isn __unused,
> return OPAL_SUCCESS;
> }
>
> +static const uint32_t psi_p8_irq_to_xivr[P8_IRQ_PSI_ALL_COUNT] = {
> + [P8_IRQ_PSI_FSP] = PSIHB_XIVR_FSP,
> + [P8_IRQ_PSI_OCC] = PSIHB_XIVR_OCC,
> + [P8_IRQ_PSI_FSI] = PSIHB_XIVR_FSI,
> + [P8_IRQ_PSI_LPC] = PSIHB_XIVR_LPC,
> + [P8_IRQ_PSI_LOCAL_ERR] = PSIHB_XIVR_LOCAL_ERR,
> + [P8_IRQ_PSI_HOST_ERR] = PSIHB_XIVR_HOST_ERR,
> +};
> +
> static int64_t psi_p8_set_xive(struct irq_source *is, uint32_t isn,
> uint16_t server, uint8_t priority)
> {
> struct psi *psi = is->data;
> uint64_t xivr_p, xivr;
> + uint32_t irq_idx = isn & 7;
>
> - switch(isn & 7) {
> - case P8_IRQ_PSI_FSP:
> - xivr_p = PSIHB_XIVR_FSP;
> - break;
> - case P8_IRQ_PSI_OCC:
> - xivr_p = PSIHB_XIVR_OCC;
> - break;
> - case P8_IRQ_PSI_FSI:
> - xivr_p = PSIHB_XIVR_FSI;
> - break;
> - case P8_IRQ_PSI_LPC:
> - xivr_p = PSIHB_XIVR_LPC;
> - break;
> - case P8_IRQ_PSI_LOCAL_ERR:
> - xivr_p = PSIHB_XIVR_LOCAL_ERR;
> - break;
> - case P8_IRQ_PSI_HOST_ERR:
> - xivr_p = PSIHB_XIVR_HOST_ERR;
> - break;
> - default:
> + if (irq_idx >= P8_IRQ_PSI_ALL_COUNT)
> return OPAL_PARAMETER;
> - }
> + xivr_p = psi_p8_irq_to_xivr[irq_idx];
>
> /* Populate the XIVR */
> xivr = (uint64_t)server << 40;
> @@ -476,29 +467,11 @@ static int64_t psi_p8_get_xive(struct
> irq_source *is, uint32_t isn __unused,
> {
> struct psi *psi = is->data;
> uint64_t xivr_p, xivr;
> + uint32_t irq_idx = isn & 7;
>
> - switch(isn & 7) {
> - case P8_IRQ_PSI_FSP:
> - xivr_p = PSIHB_XIVR_FSP;
> - break;
> - case P8_IRQ_PSI_OCC:
> - xivr_p = PSIHB_XIVR_OCC;
> - break;
> - case P8_IRQ_PSI_FSI:
> - xivr_p = PSIHB_XIVR_FSI;
> - break;
> - case P8_IRQ_PSI_LPC:
> - xivr_p = PSIHB_XIVR_LPC;
> - break;
> - case P8_IRQ_PSI_LOCAL_ERR:
> - xivr_p = PSIHB_XIVR_LOCAL_ERR;
> - break;
> - case P8_IRQ_PSI_HOST_ERR:
> - xivr_p = PSIHB_XIVR_HOST_ERR;
> - break;
> - default:
> + if (irq_idx >= P8_IRQ_PSI_ALL_COUNT)
> return OPAL_PARAMETER;
> - }
> + xivr_p = psi_p8_irq_to_xivr[irq_idx];
>
> /* Read & decode the XIVR */
> xivr = in_be64(psi->regs + xivr_p);
> @@ -509,33 +482,41 @@ static int64_t psi_p8_get_xive(struct
> irq_source *is, uint32_t isn __unused,
> return OPAL_SUCCESS;
> }
>
> +static void psi_cleanup_irq(struct psi *psi)
> +{
> + uint32_t irq;
> + uint64_t xivr, xivr_p;
> +
> + for (irq = 0; irq < P8_IRQ_PSI_ALL_COUNT; irq++) {
> + printf("PSI[0x%03x]: Cleaning up IRQ %d\n",
> + psi->chip_id, irq);
> +
> + xivr_p = psi_p8_irq_to_xivr[irq];
> + xivr = in_be64(psi->regs + xivr_p);
> + xivr |= (0xffull << 32);
> + out_be64(psi->regs + xivr_p, xivr);
> + time_wait_ms_nopoll(10);
> + xivr = in_be64(psi->regs + xivr_p);
> + if (xivr & PPC_BIT(39)) {
> + printf(" Need EOI !\n");
> + icp_send_eoi(psi->interrupt + irq);
> + }
> + }
> +}
> +
> /* Called on a fast reset, make sure we aren't stuck with
> * an accepted and never EOId PSI interrupt
> */
> void psi_irq_reset(void)
> {
> struct psi *psi;
> - uint64_t xivr;
>
> printf("PSI: Hot reset!\n");
>
> - assert(proc_gen == proc_gen_p7);
> + assert(proc_gen == proc_gen_p8);
>
> list_for_each(&psis, psi, list) {
> - /* Mask the interrupt & clean the XIVR */
> - xivr = 0x000000ff00000000UL;
> - xivr |= P7_IRQ_BUID(psi->interrupt) << 16;
> - out_be64(psi->regs + PSIHB_XIVR, xivr);
> -
> -#if 0 /* Seems to checkstop ... */
> - /*
> - * Maybe not anymore; we were just blindly sending
> - * this on all iopaths, not just the active one;
> - * We don't even know if those psis are even
> correct.
> - */
> - /* Send a dummy EOI to make sure the ICP is clear */
> - icp_send_eoi(psi->interrupt);
> -#endif
> + psi_cleanup_irq(psi);
> }
> }
>
> diff --git a/include/config.h b/include/config.h
> index 2524570..3163c65 100644
> --- a/include/config.h
> +++ b/include/config.h
> @@ -72,8 +72,8 @@
> */
> //#define FORCE_DUMMY_CONSOLE 1
>
> -/* Enable this to do fast resets. Currently unreliable... */
> -//#define ENABLE_FAST_RESET 1
> +/* Enable this to do fast reboots. Currently unreliable... */
> +#define ENABLE_FAST_REBOOT 1
>
> /* Enable this to make fast reboot clear memory */
> //#define FAST_REBOOT_CLEARS_MEMORY 1
> diff --git a/include/device.h b/include/device.h
> index ed4fc46..4198a41 100644
> --- a/include/device.h
> +++ b/include/device.h
> @@ -119,6 +119,8 @@ static inline struct dt_property
> *dt_add_property_u64(struct dt_node *node,
>
> void dt_del_property(struct dt_node *node, struct dt_property
> *prop);
>
> +void dt_check_del_prop(struct dt_node *node, const char *name);
> +
> /* Warning: moves *prop! */
> void dt_resize_property(struct dt_property **prop, size_t len);
>
> diff --git a/include/skiboot.h b/include/skiboot.h
> index 72cda14..d073cf5 100644
> --- a/include/skiboot.h
> +++ b/include/skiboot.h
> @@ -190,12 +190,13 @@ extern unsigned long get_symbol(unsigned long
> addr,
> char **sym, char **sym_end);
>
> /* Fast reboot support */
> -extern void fast_reset(void);
> +extern void fast_reboot(void);
> extern void __noreturn __secondary_cpu_entry(void);
> extern void __noreturn load_and_boot_kernel(bool is_reboot);
> extern void cleanup_tlb(void);
> extern void init_shared_sprs(void);
> extern void init_replicated_sprs(void);
> +extern bool start_preload_kernel(void);
>
> /* Various probe routines, to replace with an initcall system */
> extern void probe_p7ioc(void);
More information about the Skiboot
mailing list