[PATCH v3 3/4] powernv: cpuidle: Redesign idle states management

Shreyas B Prabhu shreyas at linux.vnet.ibm.com
Mon Dec 8 16:26:46 AEDT 2014


Hi Paul,

On Monday 08 December 2014 10:31 AM, Paul Mackerras wrote:
> On Thu, Dec 04, 2014 at 12:58:22PM +0530, Shreyas B. Prabhu wrote:
>> Deep idle states like sleep and winkle are per core idle states. A core
>> enters these states only when all the threads enter either the
>> particular idle state or a deeper one. There are tasks like fastsleep
>> hardware bug workaround and hypervisor core state save which have to be
>> done only by the last thread of the core entering deep idle state and
>> similarly tasks like timebase resync, hypervisor core register restore
>> that have to be done only by the first thread waking up from these
>> state.
>>
>> The current idle state management does not have a way to distinguish the
>> first/last thread of the core waking/entering idle states. Tasks like
>> timebase resync are done for all the threads. This not only is
>> suboptimal, but can cause functionality issues when subcores and kvm
>> are involved.
>>
>> This patch adds the necessary infrastructure to track idle states of
>> threads in a per-core structure. It uses this info to perform tasks like
>> fastsleep workaround and timebase resync only once per core.
> 
> Comments below...
> 
>> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
>> index a5139ea..e4578c3 100644
>> --- a/arch/powerpc/include/asm/paca.h
>> +++ b/arch/powerpc/include/asm/paca.h
>> @@ -158,6 +158,12 @@ struct paca_struct {
>>  	 * early exception handler for use by high level C handler
>>  	 */
>>  	struct opal_machine_check_event *opal_mc_evt;
>> +
>> +	/* Per-core mask tracking idle threads and a lock bit-[L][TTTTTTTT] */
>> +	u32 *core_idle_state_ptr;
>> +	u8 thread_idle_state;		/* ~Idle[0]/Nap[1]/Sleep[2]/Winkle[3] */
> 
> Might be clearer in the comment to say "/* PNV_THREAD_xxx */" so it's
> clear the value should be one of PNV_THREAD_NAP, PNV_THREAD_SLEEP,
> etc.

Okay. 
> 
>> diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
>> index 283c603..8c3a1f4 100644
>> --- a/arch/powerpc/kernel/idle_power7.S
>> +++ b/arch/powerpc/kernel/idle_power7.S
>> @@ -18,6 +18,7 @@
>>  #include <asm/hw_irq.h>
>>  #include <asm/kvm_book3s_asm.h>
>>  #include <asm/opal.h>
>> +#include <asm/cpuidle.h>
>>  
>>  #undef DEBUG
>>  
>> @@ -37,8 +38,7 @@
>>  
>>  /*
>>   * Pass requested state in r3:
>> - * 	0 - nap
>> - * 	1 - sleep
>> + *	r3 - PNV_THREAD_NAP/SLEEP/WINKLE
>>   *
>>   * To check IRQ_HAPPENED in r4
>>   * 	0 - don't check
>> @@ -123,12 +123,58 @@ power7_enter_nap_mode:
>>  	li	r4,KVM_HWTHREAD_IN_NAP
>>  	stb	r4,HSTATE_HWTHREAD_STATE(r13)
>>  #endif
>> -	cmpwi	cr0,r3,1
>> -	beq	2f
>> +	stb	r3,PACA_THREAD_IDLE_STATE(r13)
>> +	cmpwi	cr1,r3,PNV_THREAD_SLEEP
>> +	bge	cr1,2f
>>  	IDLE_STATE_ENTER_SEQ(PPC_NAP)
>>  	/* No return */
>> -2:	IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
>> -	/* No return */
>> +2:
>> +	/* Sleep or winkle */
>> +	lbz	r7,PACA_THREAD_MASK(r13)
>> +	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
>> +lwarx_loop1:
>> +	lwarx	r15,0,r14
>> +	andc	r15,r15,r7			/* Clear thread bit */
>> +
>> +	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
>> +
>> +/*
>> + * If cr0 = 0, then current thread is the last thread of the core entering
>> + * sleep. Last thread needs to execute the hardware bug workaround code if
>> + * required by the platform.
>> + * Make the workaround call unconditionally here. The below branch call is
>> + * patched out when the idle states are discovered if the platform does not
>> + * require it.
>> + */
>> +.global pnv_fastsleep_workaround_at_entry
>> +pnv_fastsleep_workaround_at_entry:
>> +	beq	fastsleep_workaround_at_entry
> 
> Did you investigate using the feature bit mechanism to do this
> patching for you?  You would need to allocate a CPU feature bit and
> parse the device tree early on and set or clear the feature bit,
> before the feature fixups are done.  The code here would then end up
> looking like:
> 
> BEGIN_FTR_SECTION
> 	beq	fastsleep_workaround_at_entry
> END_FTR_SECTION_IFSET(CPU_FTR_FASTSLEEP_WORKAROUND)
> 

I agree using feature fixup is a much cleaner implementation. The difficulty is,
information on whether fastsleep workaround is needed is passed in the device
tree. do_feature_fixups is currently called before we unflatten the device tree.
Any suggestions for this?

>> +	stwcx.	r15,0,r14
>> +	isync
>> +	bne-	lwarx_loop1
> 
> The isync has to come after the bne.  Please fix this here and in the
> other places where you added the isync.
> 
Okay. 

>> +common_enter: /* common code for all the threads entering sleep */
>> +	IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
>> +
>> +fastsleep_workaround_at_entry:
>> +	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
>> +	stwcx.	r15,0,r14
>> +	isync
>> +	bne-	lwarx_loop1
>> +
>> +	/* Fast sleep workaround */
>> +	li	r3,1
>> +	li	r4,1
>> +	li	r0,OPAL_CONFIG_CPU_IDLE_STATE
>> +	bl	opal_call_realmode
>> +
>> +	/* Clear Lock bit */
>> +	li	r0,0
>> +	lwsync
>> +	stw	r0,0(r14)
>> +	b	common_enter
>> +
>>  
>>  _GLOBAL(power7_idle)
>>  	/* Now check if user or arch enabled NAP mode */
>> @@ -141,49 +187,16 @@ _GLOBAL(power7_idle)
>>  
>>  _GLOBAL(power7_nap)
>>  	mr	r4,r3
>> -	li	r3,0
>> +	li	r3,PNV_THREAD_NAP
>>  	b	power7_powersave_common
>>  	/* No return */
>>  
>>  _GLOBAL(power7_sleep)
>> -	li	r3,1
>> +	li	r3,PNV_THREAD_SLEEP
>>  	li	r4,1
>>  	b	power7_powersave_common
>>  	/* No return */
>>  
>> -/*
>> - * Make opal call in realmode. This is a generic function to be called
>> - * from realmode from reset vector. It handles endianess.
>> - *
>> - * r13 - paca pointer
>> - * r1  - stack pointer
>> - * r3  - opal token
>> - */
>> -opal_call_realmode:
>> -	mflr	r12
>> -	std	r12,_LINK(r1)
>> -	ld	r2,PACATOC(r13)
>> -	/* Set opal return address */
>> -	LOAD_REG_ADDR(r0,return_from_opal_call)
>> -	mtlr	r0
>> -	/* Handle endian-ness */
>> -	li	r0,MSR_LE
>> -	mfmsr	r12
>> -	andc	r12,r12,r0
>> -	mtspr	SPRN_HSRR1,r12
>> -	mr	r0,r3			/* Move opal token to r0 */
>> -	LOAD_REG_ADDR(r11,opal)
>> -	ld	r12,8(r11)
>> -	ld	r2,0(r11)
>> -	mtspr	SPRN_HSRR0,r12
>> -	hrfid
>> -
>> -return_from_opal_call:
>> -	FIXUP_ENDIAN
>> -	ld	r0,_LINK(r1)
>> -	mtlr	r0
>> -	blr
>> -
>>  #define CHECK_HMI_INTERRUPT						\
>>  	mfspr	r0,SPRN_SRR1;						\
>>  BEGIN_FTR_SECTION_NESTED(66);						\
>> @@ -196,10 +209,8 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66);		\
>>  	/* Invoke opal call to handle hmi */				\
>>  	ld	r2,PACATOC(r13);					\
>>  	ld	r1,PACAR1(r13);						\
>> -	std	r3,ORIG_GPR3(r1);	/* Save original r3 */		\
>> -	li	r3,OPAL_HANDLE_HMI;	/* Pass opal token argument*/	\
>> +	li	r0,OPAL_HANDLE_HMI;	/* Pass opal token argument*/	\
>>  	bl	opal_call_realmode;					\
>> -	ld	r3,ORIG_GPR3(r1);	/* Restore original r3 */	\
>>  20:	nop;
> 
> I recently sent a patch "powerpc: powernv: Return to cpu offline loop
> when finished in KVM guest" which passes a value in r3 through
> power7_wakeup_loss and power7_wakeup_noloss back to the caller of
> power7_nap().  So please don't take out the save/restore of r3 here.
> 

Okay. I'll base these patches on top of your patch and resend.

>> @@ -210,12 +221,90 @@ _GLOBAL(power7_wakeup_tb_loss)
>>  BEGIN_FTR_SECTION
>>  	CHECK_HMI_INTERRUPT
>>  END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
>> +
>> +	lbz	r7,PACA_THREAD_MASK(r13)
>> +	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
>> +lwarx_loop2:
>> +	lwarx	r15,0,r14
>> +	andi.	r9,r15,PNV_CORE_IDLE_LOCK_BIT
>> +	/*
>> +	 * Lock bit is set in one of the 2 cases-
>> +	 * a. In the sleep/winkle enter path, the last thread is executing
>> +	 * fastsleep workaround code.
>> +	 * b. In the wake up path, another thread is executing fastsleep
>> +	 * workaround undo code or resyncing timebase or restoring context
>> +	 * In either case loop until the lock bit is cleared.
>> +	 */
>> +	bne	core_idle_lock_held
>> +
>> +	cmpwi	cr2,r15,0
>> +	or	r15,r15,r7		/* Set thread bit */
>> +
>> +	beq	cr2,first_thread
>> +
>> +	/* Not first thread in core to wake up */
>> +	stwcx.	r15,0,r14
>> +	isync
>> +	bne-	lwarx_loop2
>> +	b	common_exit
>> +
>> +core_idle_lock_held:
>> +	HMT_LOW
>> +core_idle_lock_loop:
>> +	lwz	r15,0(r14)
>> +	andi.   r9,r15,PNV_CORE_IDLE_LOCK_BIT
>> +	bne	core_idle_lock_loop
>> +	HMT_MEDIUM
>> +	b	lwarx_loop2
>> +
>> +first_thread:
>> +	/* First thread in core to wakeup */
>> +	ori	r15,r15,PNV_CORE_IDLE_LOCK_BIT
>> +	stwcx.	r15,0,r14
>> +	isync
>> +	bne-	lwarx_loop2
>> +
>> +	/*
>> +	 * First thread in the core waking up from fastsleep. It needs to
>> +	 * call the fastsleep workaround code if the platform requires it.
>> +	 * Call it unconditionally here. The below branch instruction will
>> +	 * be patched out when the idle states are discovered if platform
>> +	 * does not require workaround.
>> +	 */
>> +.global pnv_fastsleep_workaround_at_exit
>> +pnv_fastsleep_workaround_at_exit:
>> +	b	fastsleep_workaround_at_exit
>> +
>> +timebase_resync:
>> +	/* Do timebase resync if we are waking up from sleep. Use cr3 value
>> +	 * set in exceptions-64s.S */
>> +	ble	cr3,clear_lock
>>  	/* Time base re-sync */
>> -	li	r3,OPAL_RESYNC_TIMEBASE
>> +	li	r0,OPAL_RESYNC_TIMEBASE
>>  	bl	opal_call_realmode;
>> -
>>  	/* TODO: Check r3 for failure */
>>  
>> +clear_lock:
>> +	andi.	r15,r15,PNV_CORE_IDLE_THREAD_BITS
>> +	lwsync
>> +	stw	r15,0(r14)
>> +
>> +common_exit:
>> +	li	r5,PNV_THREAD_RUNNING
>> +	stb     r5,PACA_THREAD_IDLE_STATE(r13)
>> +
>> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>> +	li      r0,KVM_HWTHREAD_IN_KERNEL
>> +	stb     r0,HSTATE_HWTHREAD_STATE(r13)
>> +	/* Order setting hwthread_state vs. testing hwthread_req */
>> +	sync
>> +	lbz     r0,HSTATE_HWTHREAD_REQ(r13)
>> +	cmpwi   r0,0
>> +	beq     6f
>> +	b       kvm_start_guest
> 
> There is a bit of a problem here: the FIXUP_ENDIAN in
> opal_call_realmode will trash SRR1 (if the kernel is little-endian),
> but the code at kvm_start_guest needs SRR1 from the system reset
> exception so that it can know what the wakeup reason was.
> 
Hmm, I'll save/restore SRR1 before calling opal_call_realmode. Thanks for catching this.

>> diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
>> index 34c6665..97e0279 100644
>> --- a/arch/powerpc/platforms/powernv/setup.c
>> +++ b/arch/powerpc/platforms/powernv/setup.c
>> @@ -36,6 +36,9 @@
>>  #include <asm/opal.h>
>>  #include <asm/kexec.h>
>>  #include <asm/smp.h>
>> +#include <asm/cputhreads.h>
>> +#include <asm/cpuidle.h>
>> +#include <asm/code-patching.h>
>>  
>>  #include "powernv.h"
>>  
>> @@ -292,10 +295,43 @@ static void __init pnv_setup_machdep_rtas(void)
>>  
>>  static u32 supported_cpuidle_states;
>>  
>> +static void pnv_alloc_idle_core_states(void)
>> +{
>> +	int i, j;
>> +	int nr_cores = cpu_nr_cores();
>> +	u32 *core_idle_state;
>> +
>> +	/*
>> +	 * core_idle_state - First 8 bits track the idle state of each thread
>> +	 * of the core. The 8th bit is the lock bit. Initially all thread bits
>> +	 * are set. They are cleared when the thread enters deep idle state
>> +	 * like sleep and winkle. Initially the lock bit is cleared.
>> +	 * The lock bit has 2 purposes
>> +	 * a. While the first thread is restoring core state, it prevents
>> +	 * from other threads in the core from switching to prcoess context.
> 
>            ^^^^ remove "from"                               ^^^^^^^ process
> 
>> +	 * b. While the last thread in the core is saving the core state, it
>> +	 * prevent a different thread from waking up.
> 
>            ^^^^^^^ prevents
> 
Oops. Will fix it.
>> +	 */
>> +	for (i = 0; i < nr_cores; i++) {
>> +		int first_cpu = i * threads_per_core;
>> +		int node = cpu_to_node(first_cpu);
>> +
>> +		core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node);
>> +		for (j = 0; j < threads_per_core; j++) {
>> +			int cpu = first_cpu + j;
>> +
>> +			paca[cpu].core_idle_state_ptr = core_idle_state;
>> +			paca[cpu].thread_idle_state = PNV_THREAD_RUNNING;
>> +			paca[cpu].thread_mask = 1 << (cpu % threads_per_core);
> 
> This would be simpler and quicker:
> 
> 			paca[cpu].thread_mask = 1 << j;
> 
Will make the change.


Thanks,
Shreyas 




More information about the Linuxppc-dev mailing list