[PATCH v2 3/5] powerpc/powernv: Add winkle infrastructure

Benjamin Herrenschmidt benh at kernel.crashing.org
Tue Oct 7 16:33:53 EST 2014


On Wed, 2014-10-01 at 13:16 +0530, Shreyas B. Prabhu wrote:
> Winkle causes power to be gated off to the entire chiplet. Hence the
> hypervisor/firmware state in the entire chiplet is lost.
> 
> This patch adds necessary infrastructure to support waking up from
> hypervisor state loss. Specifically does following:
> - Before entering winkle, save state of registers that need to be
>   restored on wake up (SDR1, HFSCR)

 Add ... to your list, it's not exhaustive, is it ?

> - SRR1 bits 46:47 which is used to identify which power saving mode cpu
>   woke up from is '11' for both winkle and sleep. Hence introduce a flag
>   in PACA to distinguish b/w winkle and sleep.
> 
> - Upon waking up, restore all saved registers, recover slb
> 
> Cc: Benjamin Herrenschmidt <benh at kernel.crashing.org>
> Cc: Paul Mackerras <paulus at samba.org>
> Cc: Michael Ellerman <mpe at ellerman.id.au>
> Cc: linuxppc-dev at lists.ozlabs.org
> Suggested-by: Vaidyanathan Srinivasan <svaidy at linux.vnet.ibm.com>
> Signed-off-by: Shreyas B. Prabhu <shreyas at linux.vnet.ibm.com>
> ---
>  arch/powerpc/include/asm/machdep.h     |  1 +
>  arch/powerpc/include/asm/paca.h        |  3 ++
>  arch/powerpc/include/asm/ppc-opcode.h  |  2 +
>  arch/powerpc/include/asm/processor.h   |  2 +
>  arch/powerpc/kernel/asm-offsets.c      |  1 +
>  arch/powerpc/kernel/exceptions-64s.S   |  8 ++--
>  arch/powerpc/kernel/idle.c             | 11 +++++
>  arch/powerpc/kernel/idle_power7.S      | 81 +++++++++++++++++++++++++++++++++-
>  arch/powerpc/platforms/powernv/setup.c | 24 ++++++++++
>  9 files changed, 127 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
> index f37014f..0a3ced9 100644
> --- a/arch/powerpc/include/asm/machdep.h
> +++ b/arch/powerpc/include/asm/machdep.h
> @@ -301,6 +301,7 @@ struct machdep_calls {
>  	/* Idle handlers */
>  	void		(*setup_idle)(void);
>  	unsigned long	(*power7_sleep)(void);
> +	unsigned long	(*power7_winkle)(void);
>  };

Why does it need to be ppc_md ? Same comments as for sleep
 
>  extern void e500_idle(void);
> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
> index a5139ea..3358f09 100644
> --- a/arch/powerpc/include/asm/paca.h
> +++ b/arch/powerpc/include/asm/paca.h
> @@ -158,6 +158,9 @@ struct paca_struct {
>  	 * early exception handler for use by high level C handler
>  	 */
>  	struct opal_machine_check_event *opal_mc_evt;
> +
> +	/* Flag to distinguish b/w sleep and winkle */
> +	u8 offline_state;

Not fan of the name. I'd rather you call it "wakeup_state_loss" or
something a bit more explicit about what that actually means if it's
going to be a boolean value. Otherwise make it an enumeration of
constants.

>  #endif
>  #ifdef CONFIG_PPC_BOOK3S_64
>  	/* Exclusive emergency stack pointer for machine check exception. */
> diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h
> index 6f85362..5155be7 100644
> --- a/arch/powerpc/include/asm/ppc-opcode.h
> +++ b/arch/powerpc/include/asm/ppc-opcode.h
> @@ -194,6 +194,7 @@
>  
>  #define PPC_INST_NAP			0x4c000364
>  #define PPC_INST_SLEEP			0x4c0003a4
> +#define PPC_INST_WINKLE			0x4c0003e4
>  
>  /* A2 specific instructions */
>  #define PPC_INST_ERATWE			0x7c0001a6
> @@ -374,6 +375,7 @@
>  
>  #define PPC_NAP			stringify_in_c(.long PPC_INST_NAP)
>  #define PPC_SLEEP		stringify_in_c(.long PPC_INST_SLEEP)
> +#define PPC_WINKLE		stringify_in_c(.long PPC_INST_WINKLE)
>  
>  /* BHRB instructions */
>  #define PPC_CLRBHRB		stringify_in_c(.long PPC_INST_CLRBHRB)
> diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
> index 41953cd..00e3df9 100644
> --- a/arch/powerpc/include/asm/processor.h
> +++ b/arch/powerpc/include/asm/processor.h
> @@ -455,6 +455,8 @@ extern void arch_setup_idle(void);
>  extern void power7_nap(int check_irq);
>  extern unsigned long power7_sleep(void);
>  extern unsigned long __power7_sleep(void);
> +extern unsigned long power7_winkle(void);
> +extern unsigned long __power7_winkle(void);
>  extern void flush_instruction_cache(void);
>  extern void hard_reset_now(void);
>  extern void poweroff_now(void);
> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
> index 9d7dede..ea98817 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -731,6 +731,7 @@ int main(void)
>  	DEFINE(OPAL_MC_SRR0, offsetof(struct opal_machine_check_event, srr0));
>  	DEFINE(OPAL_MC_SRR1, offsetof(struct opal_machine_check_event, srr1));
>  	DEFINE(PACA_OPAL_MC_EVT, offsetof(struct paca_struct, opal_mc_evt));
> +	DEFINE(PACAOFFLINESTATE, offsetof(struct paca_struct, offline_state));
>  #endif
>  
>  	return 0;
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index c64f3cc0..261f348 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -115,9 +115,7 @@ BEGIN_FTR_SECTION
>  #endif
>  
>  	/* Running native on arch 2.06 or later, check if we are
> -	 * waking up from nap. We only handle no state loss and
> -	 * supervisor state loss. We do -not- handle hypervisor
> -	 * state loss at this time.
> +	 * waking up from power saving mode.
>  	 */
>  	mfspr	r13,SPRN_SRR1
>  	rlwinm.	r13,r13,47-31,30,31
> @@ -133,8 +131,8 @@ BEGIN_FTR_SECTION
>  	b	power7_wakeup_noloss
>  2:	b	power7_wakeup_loss
>  
> -	/* Fast Sleep wakeup on PowerNV */
> -8:	b 	power7_wakeup_tb_loss
> +	/* Fast Sleep / Winkle wakeup on PowerNV */
> +8:	b 	power7_wakeup_hv_state_loss
>  
>  9:
>  END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
> diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c
> index 1f268e0..ed46217 100644
> --- a/arch/powerpc/kernel/idle.c
> +++ b/arch/powerpc/kernel/idle.c
> @@ -98,6 +98,17 @@ unsigned long power7_sleep(void)
>  	return ret;
>  }
>  
> +unsigned long power7_winkle(void)
> +{
> +	unsigned long ret;
> +
> +	if (ppc_md.power7_winkle)
> +		ret = ppc_md.power7_winkle();
> +	else
> +		ret = __power7_winkle();
> +	return ret;
> +}
> +
>  int powersave_nap;
>  
>  #ifdef CONFIG_SYSCTL
> diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
> index c3481c9..87b2556 100644
> --- a/arch/powerpc/kernel/idle_power7.S
> +++ b/arch/powerpc/kernel/idle_power7.S
> @@ -18,6 +18,13 @@
>  #include <asm/hw_irq.h>
>  #include <asm/kvm_book3s_asm.h>
>  #include <asm/opal.h>
> +#include <asm/mmu-hash64.h>
> +
> +/*
> + * Use volatile GPRs' space to save essential SPRs before entering winkle
> + */
> +#define _SDR1	GPR3
> +#define _TSCR	GPR4
>  
>  #undef DEBUG
>  
> @@ -39,6 +46,7 @@
>   * Pass requested state in r3:
>   * 	0 - nap
>   * 	1 - sleep
> + *	2 - winkle
>   *
>   * To check IRQ_HAPPENED in r4
>   * 	0 - don't check
> @@ -109,9 +117,27 @@ _GLOBAL(power7_enter_nap_mode)
>  #endif
>  	cmpwi	cr0,r3,1
>  	beq	2f
> +	cmpwi	cr0,r3,2
> +	beq	3f
>  	IDLE_STATE_ENTER_SEQ(PPC_NAP)
>  	/* No return */
> -2:	IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
> +2:
> +	li	r4,1
> +	stb	r4,PACAOFFLINESTATE(r13)
> +	IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
> +	/* No return */
> +
> +3:
> +	mfspr   r4,SPRN_SDR1
> +	std     r4,_SDR1(r1)
> +
> +	mfspr   r4,SPRN_TSCR
> +	std     r4,_TSCR(r1)
> +
> +	/* Enter winkle */
> +        li      r4,0
> +        stb     r4,PACAOFFLINESTATE(r13)
> +	IDLE_STATE_ENTER_SEQ(PPC_WINKLE)
>  	/* No return */
>  
>  _GLOBAL(power7_idle)
> @@ -187,6 +213,59 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66);		\
>  20:	nop;
>  
> 
> +_GLOBAL(__power7_winkle)
> +	li	r3,2
> +	li	r4,1
> +	b	power7_powersave_common
> +	/* No return */
> +
> +_GLOBAL(power7_wakeup_hv_state_loss)
> +	/* Check paca flag to diffentiate b/w fast sleep and winkle */
> +	lbz	r4,PACAOFFLINESTATE(13)
> +	cmpwi	cr0,r4,0
> +	bne	power7_wakeup_tb_loss
> +
> +	ld      r2,PACATOC(r13);
> +	ld      r1,PACAR1(r13)
> +
> +	bl 	__restore_cpu_power8

So if I understand correctly, you use a per-cpu flag not a per-core flag
which means we will assume a pessimistic case of having to restore stuff
even if the core didn't actually enter winkle (because the last thread
to go down went to sleep). This is sub-optimal. Also see below:

> +	/* Time base re-sync */
> +	li	r3,OPAL_RESYNC_TIMEBASE
> +	bl	opal_call_realmode;

You will also resync the timebase (and restore all the core shared SPRs)
for each thread. This is problematic, especially with KVM as you could
have a situation where:

 - The first thread comes out and starts diving into KVM

 - The other threads start coming out while the first one is doing the
above.

Thus the first thread might already be manipulating some core registers
(SDR1 etc...) while the secondaries come back and ... whack it. Worse,
the primary might have applied the TB offset using TBU40 while the
secondaries resync the timebase back to the host value, incurring a
loss of TB for the guest.

> +	/* Restore SLB  from PACA */
> +	ld	r8,PACA_SLBSHADOWPTR(r13)
> +
> +	.rept	SLB_NUM_BOLTED
> +	li	r3, SLBSHADOW_SAVEAREA
> +	LDX_BE	r5, r8, r3
> +	addi	r3, r3, 8
> +	LDX_BE	r6, r8, r3
> +	andis.	r7,r5,SLB_ESID_V at h
> +	beq	1f
> +	slbmte	r6,r5
> +1:	addi	r8,r8,16
> +	.endr
> +
> +	ld      r4,_SDR1(r1)
> +	mtspr   SPRN_SDR1,r4
> +
> +	ld      r4,_TSCR(r1)
> +	mtspr   SPRN_TSCR,r4
> +
> +	REST_NVGPRS(r1)
> +	REST_GPR(2, r1)
> +	ld      r3,_CCR(r1)
> +	ld      r4,_MSR(r1)
> +	ld      r5,_NIP(r1)
> +	addi    r1,r1,INT_FRAME_SIZE
> +	mtcr    r3
> +	mfspr   r3,SPRN_SRR1            /* Return SRR1 */
> +	mtspr   SPRN_SRR1,r4
> +	mtspr   SPRN_SRR0,r5
> +	rfid
> +
>  _GLOBAL(power7_wakeup_tb_loss)
>  	ld	r2,PACATOC(r13);
>  	ld	r1,PACAR1(r13)
> diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
> index 9d9a898..f45b52d 100644
> --- a/arch/powerpc/platforms/powernv/setup.c
> +++ b/arch/powerpc/platforms/powernv/setup.c
> @@ -370,6 +370,29 @@ static unsigned long pnv_power7_sleep(void)
>  	return srr1;
>  }
>  
> +/*
> + * We need to keep track of offline cpus also for calling
> + * fastsleep workaround appropriately
> + */
> +static unsigned long pnv_power7_winkle(void)
> +{
> +	int cpu, primary_thread;
> +	unsigned long srr1;
> +
> +	cpu = smp_processor_id();
> +	primary_thread = cpu_first_thread_sibling(cpu);
> +
> +	if (need_fastsleep_workaround) {
> +		pnv_apply_fastsleep_workaround(1, primary_thread);
> +		srr1 = __power7_winkle();
> +		pnv_apply_fastsleep_workaround(0, primary_thread);
> +	} else {
> +			srr1 = __power7_winkle();
> +	}
> +	return srr1;
> +}
> +
> +
>  static void __init pnv_setup_machdep_opal(void)
>  {
>  	ppc_md.get_boot_time = opal_get_boot_time;
> @@ -384,6 +407,7 @@ static void __init pnv_setup_machdep_opal(void)
>  	ppc_md.handle_hmi_exception = opal_handle_hmi_exception;
>  	ppc_md.setup_idle = pnv_setup_idle;
>  	ppc_md.power7_sleep = pnv_power7_sleep;
> +	ppc_md.power7_winkle = pnv_power7_winkle;
>  }
>  
>  #ifdef CONFIG_PPC_POWERNV_RTAS




More information about the Linuxppc-dev mailing list