[PATCH v8 1/2] powerpc/64s: reimplement book3s idle code in C

Satheesh Rajendran sathnaga at linux.vnet.ibm.com
Mon Apr 8 17:32:51 AEST 2019


Hi,

Hit with below kernel crash during Power8 Host boot with this patch series on top
of powerpc merge branch commit https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git/commit/?h=merge&id=6a821ffee18a6e6c0027c523fa8c958df98ca361

built with ppc64le_defconfig

Host Console log:
[    0.454666] EEH: PCI Enhanced I/O Error Handling Enabled
[    0.456524] create_dump_obj: New platform dump. ID = 0x4 Size 7457968
[    0.457627] opal-power: OPAL EPOW, DPO support detected.
[    0.457722] BUG: Unable to handle kernel data access at 0xffffffffff76184a
[    0.457733] Faulting instruction address: 0xc00000000001a94c
[    0.457740] Oops: Kernel access of bad area, sig: 11 [#1]
[    0.457745] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
[    0.457750] Modules linked in:
[    0.457756] CPU: 58 PID: 0 Comm: swapper/58 Not tainted 5.1.0-rc2-gd0ae6c548 #1
[    0.457762] NIP:  c00000000001a94c LR: c0000000000a6e9c CTR: c000000000008000
[    0.457768] REGS: c000000f272b7b50 TRAP: 0380   Not tainted  (5.1.0-rc2-gd0ae6c548)
[    0.457773] MSR:  9000000000001033 <SF,HV,ME,IR,DR,RI,LE>  CR: 24004222  XER: 00000000
[    0.457781] CFAR: c0000000000a6e98 IRQMASK: 1 
[    0.457781] GPR00: c0000000000a6e9c c000000f272b7de0 0000000000000004 0000000000000006 
[    0.457781] GPR04: c0000000000a5dd4 0000000024004222 c000000f272b7d48 0000000000000001 
[    0.457781] GPR08: 0000000000000002 ffffffffff761844 c000000f27250c00 0000c3feb1676be1 
[    0.457781] GPR12: 0000000000004400 c000000ffff9d380 c000000ffe60ff90 0000000000000000 
[    0.457781] GPR16: 0000000000000000 0000000000000000 c00000000004b4d0 c00000000004b4a0 
[    0.457781] GPR20: c000000001526214 0000000000000800 0000000000000001 c000000001521b78 
[    0.457781] GPR24: 000000000000003a 0000000000000000 0000000000080000 0000000000000000 
[    0.457781] GPR28: c000000001526140 0000000000000001 0400000000000000 c000000001525ce0 
[    0.457829] NIP [c00000000001a94c] irq_set_pending_from_srr1+0x1c/0x50
[    0.457835] LR [c0000000000a6e9c] power7_idle+0x3c/0x50
[    0.457839] Call Trace:
[    0.457843] [c000000f272b7de0] [c0000000000a6e98] power7_idle+0x38/0x50 (unreliable)
[    0.457849] [c000000f272b7e00] [c0000000000210f4] arch_cpu_idle+0x54/0x160
[    0.457856] [c000000f272b7e30] [c000000000c47bc4] default_idle_call+0x74/0x88
[    0.457862] [c000000f272b7e50] [c000000000158f54] do_idle+0x2f4/0x3d0
[    0.457868] [c000000f272b7ec0] [c000000000159288] cpu_startup_entry+0x38/0x40
[    0.457874] [c000000f272b7ef0] [c00000000004dae4] start_secondary+0x654/0x680
[    0.457881] [c000000f272b7f90] [c00000000000b25c] start_secondary_prolog+0x10/0x14
[    0.457886] Instruction dump:
[    0.457890] 992d098b 7c630034 5463d97e 4e800020 60000000 3c4c014d 38424dd0 7c0802a6 
[    0.457898] 60000000 3d22ff76 78637722 39291840 
[    0.457900] BUG: Unable to handle kernel data access at 0xffffffffff76184a
[    0.457901] <7d4918ae> 2b8a00ff 419e001c 892d098b 
[    0.457907] Faulting instruction address: 0xc00000000001a94c
[    0.457910] BUG: Unable to handle kernel data access at 0xffffffffff76184a
[    0.457915] ---[ end trace fa7343cfd21c8798 ]---
[    0.457919] Faulting instruction address: 0xc00000000001a94c
[    0.458961] BUG: Unable to handle kernel data access at 0xffffffffff76184a
[    0.458963] BUG: Unable to handle kernel data access at 0xffffffffff76184a
[    0.458964] BUG: Unable to handle kernel data access at 0xffffffffff76184a
[    0.458966] BUG: Unable to handle kernel data access at 0xffffffffff76184a
[    0.458968] BUG: Unable to handle kernel data access at 0xffffffffff76184a
[    0.458970] BUG: Unable to handle kernel data access at 0xffffffffff76184a
[    0.458972] Faulting instruction address: 0xc00000000001a94c
[    0.458973] Faulting instruction address: 0xc00000000001a94c
[    0.458974] Faulting instruction address: 0xc00000000001a94c
[    0.458975] Faulting instruction address: 0xc00000000001a94c
[    0.458976] Faulting instruction address: 0xc00000000001a94c
[    0.458978] initcall __machine_initcall_powernv_pnv_init_idle_states+0x0/0xb30 returned 0 after 0 usecs
[    0.458981] calling  __machine_initcall_powernv_opal_time_init+0x0/0x150 @ 1
[    0.458982] Faulting instruction address: 0xc00000000001a94c
[    0.459022] BUG: Unable to handle kernel data access at 0xffffffffff76184a
[    0.459040] Faulting instruction address: 0xc00000000001a94c
[    0.459043] initcall __machine_initcall_powernv_opal_time_init+0x0/0x150 returned 0 after 0 usecs
[    0.459044] BUG: Unable to handle kernel data access at 0xffffffffff76184c
[    0.459045] Faulting instruction address: 0xc00000000001a94c
[    0.459060] calling  __machine_initcall_powernv_rng_init+0x0/0x334 @ 1
[    0.459084] powernv-rng: Registering arch random hook.
[    0.459141] BUG: Unable to handle kernel data access at 0xffffffffff76184a
[    0.459147] Faulting instruction address: 0xc00000000001a94c
[    0.459191] BUG: Unable to handle kernel data access at 0xffffffffff76184a
[    0.459199] Faulting instruction address: 0xc00000000001a94c
[    0.459216] BUG: Unable to handle kernel data access at 0xffffffffff76184a
[    0.459224] Faulting instruction address: 0xc00000000001a94c
[    0.459228] BUG: Unable to handle kernel data access at 0xffffffffff76184a
[    0.459234] Faulting instruction address: 0xc00000000001a94c
[    0.459268] BUG: Unable to handle kernel data access at 0xffffffffff76184a
[    0.459275] Faulting instruction address: 0xc00000000001a94c
[    0.459375] 
[    0.459380] Oops: Kernel access of bad area, sig: 11 [#2]
[    0.459385] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA PowerNV
[    0.459390] Modules linked in:
[    0.459395] CPU: 63 PID: 0 Comm: swapper/63 Tainted: G      D           5.1.0-rc2-gd0ae6c548 #1
[    0.459401] NIP:  c00000000001a94c LR: c0000000000a6e9c CTR: c000000000008000
[    0.459407] REGS: c000000f272a3b50 TRAP: 0380   Tainted: G      D            (5.1.0-rc2-gd0ae6c548)
[    0.459414] MSR:  9000000000001033 <SF,HV,ME,IR,DR,RI,LE>  CR: 24004222  XER: 00000000
[    0.459419] BUG: Unable to handle kernel data access at 0xffffffffff76184c
[    0.459422] CFAR: c0000000000a6e98 IRQMASK: 1 
[    0.459422] GPR00: c0000000000a6e9c c000000f272a3de0 0000000000000004 0000000000000006 
[    0.459422] GPR04: c0000000000a5dd4 0000000024004222 c000000f272a3d48 0000000000000001 
[    0.459422] GPR08: 0000000000000007 ffffffffff761844 c000000f27244e00 0000c3feb18a5128 
[    0.459422] GPR12: 0000000000004400 c000000ffff99080 c000000ffe623f90 0000000000000000 
[    0.459422] GPR16: 0000000000000000 0000000000000000 c00000000004b4d0 c00000000004b4a0 
[    0.459422] GPR20: c000000001526214 0000000000000800 0000000000000001 c000000001521b78 
[    0.459422] GPR24: 000000000000003f 0000000000000000 0000000000080000 0000000000000000 
[    0.459422] GPR28: c000000001526140 0000000000000001 8000000000000000 c000000001525ce0 
[    0.459443] NIP [c00000000001a94c] irq_set_pending_from_srr1+0x1c/0x50
[    0.459449] Faulting instruction address: 0xc00000000001a94c
[    0.459483] LR [c0000000000a6e9c] power7_idle+0x3c/0x50
[    0.459485] Call Trace:
[    0.459490] initcall __machine_initcall_powernv_rng_init+0x0/0x334 returned 0 after 0 usecs
[    0.459493] calling  __machine_initcall_pseries_init_ras_IRQ+0x0/0xf4 @ 1
[    0.459497] [c000000f272a3de0] [c0000000000a6e98] power7_idle+0x38/0x50 (unreliable)
[    0.459500] [c000000f272a3e00] [c0000000000210f4] arch_cpu_idle+0x54/0x160
[    0.459503] [c000000f272a3e30] [c000000000c47bc4] default_idle_call+0x74/0x88
[    0.459507] initcall __machine_initcall_pseries_init_ras_IRQ+0x0/0xf4 returned 0 after 0 usecs
[    0.459510] calling  __machine_initcall_pseries_rng_init+0x0/0xa4 @ 1
[    0.459514] [c000000f272a3e50] [c000000000158f54] do_idle+0x2f4/0x3d0
[    0.459518] [c000000f272a3ec0] [c000000000159288] cpu_startup_entry+0x38/0x40
[    0.459523] initcall __machine_initcall_pseries_rng_init+0x0/0xa4 returned 0 after 0 usecs
[    0.459527] [c000000f272a3ef0] [c00000000004dae4] start_secondary+0x654/0x680
[    0.459531] [c000000f272a3f90] [c00000000000b25c] start_secondary_prolog+0x10/0x14
[    0.459535] calling  __machine_initcall_pseries_ioei_init+0x0/0xd8 @ 1
[    0.459539] Instruction dump:
[    0.459542] 992d098b 7c630034 5463d97e 4e800020 60000000 3c4c014d 38424dd0 7c0802a6 
[    0.459549] initcall __machine_initcall_pseries_ioei_init+0x0/0xd8 returned 0 after 0 usecs
[    0.459553] 60000000 3d22ff76 78637722 39291840 <7d4918ae> 2b8a00ff 419e001c 892d098b 
[    0.459559] calling  uid_cache_init+0x0/0x108 @ 1
[    0.459564] ---[ end trace fa7343cfd21c8799 ]---
[    0.459574] initcall uid_cache_init+0x0/0x108 returned 0 after 0 usecs
[    0.459576] calling  param_sysfs_init+0x0/0x248 @ 1


Regards,
-Satheesh.
 
On Mon, Apr 08, 2019 at 04:34:30PM +1000, Nicholas Piggin wrote:
> Reimplement Book3S idle code in C, moving POWER7/8/9 implementation
> specific HV idle code to the powernv platform code.
> 
> Book3S assembly stubs are kept in common code and used only to save
> the stack frame and non-volatile GPRs before executing architected
> idle instructions, and restoring the stack and reloading GPRs then
> returning to C after waking from idle.
> 
> The complex logic dealing with threads and subcores, locking, SPRs,
> HMIs, timebase resync, etc., is all done in C which makes it more
> maintainable.
> 
> This is not a strict translation to C code, there are some
> significant differences:
> 
> - Idle wakeup no longer uses the ->cpu_restore call to reinit SPRs,
>   but saves and restores them itself.
> 
> - The optimisation where EC=ESL=0 idle modes did not have to save GPRs
>   or change MSR is restored, because it's now simple to do. ESL=1
>   sleeps that do not lose GPRs can use this optimization too.
> 
> - KVM secondary entry and cede is now more of a call/return style
>   rather than branchy. nap_state_lost is not required because KVM
>   always returns via NVGPR restoring path.
> 
> - KVM secondary wakeup from offline sequence is moved entirely into
>   the offline wakeup, which avoids a hwsync in the normal idle wakeup
>   path.
> 
> Reviewed-by: Gautham R. Shenoy <ego at linux.vnet.ibm.com>
> Signed-off-by: Nicholas Piggin <npiggin at gmail.com>
> 
> Notes:
> - The KVM code has been significantly changed and now actually boots a
>   HPT on radix guest with dependent threads mode and >0 secondaries.
>   With previous iterations my test wasn't actually catching this case
>   and there were some obvious bugs.
> 
>   I've broken the KVM code into the second patch just for review. The
>   first patch makes KVM kind-of work following its existing design.
>   The main thing that's missing from it is deep idle states that lose
>   SPRs on the secondaries don't restore them if it's a KVM request
>   wakeup. But you can run guests with deep idle states disabled.
>   Rather than a significant rework of the code to make that work with
>   the new idle code that would need testing, which then gets undone,
>   I have just broken it up like this for hopefully easier review of
>   the KVM parts. Patches can be squashed together before upstream merge.
> 
> - There's so many combinations of KVM modes and options I could use more
>   help with review and testing.
> 
> - This is not ported up to powerpc next yet.
> 
> - P9 restores some of the PMU SPRs, but not others, and P8 only zeroes
>   them. There are improvements to be made to SPR save restore policies and
>   documentation, but this first pass tries to keep things as they were.
> 
> Left to do:
> - Test actual POWER7 hardware.
> 
> - More KVM testing and review.
> 
> - Port to powerpc next.
> 
> Since RFC v1:
> - Now tested and working with POWER9 hash and radix.
> - KVM support added. This took a bit of work to untangle and might
>   still have some issues, but POWER9 seems to work including hash on
>   radix with dependent threads mode.
> - This snowballed a bit because of KVM and other details making it
>   not feasible to leave POWER7/8 code alone. That's only half done
>   at the moment.
> - So far this trades about 800 lines of asm for 500 of C. With POWER7/8
>   support done it might be another hundred or so lines of C.
> 
> Since RFC v2:
> - Fixed deep state SLB reloading
> - Now tested and working with POWER8.
> - Accounted for most feedback.
> 
> Since RFC v3:
> - Rebased to powerpc merge + idle state bugfix
> - Split SLB flush/restore code out and shared with MCE code (pseries
>   MCE patches can also use).
> - More testing on POWER8 including KVM with secondaries.
> - Performance testing looks good. EC=ESL=0 is about 5% faster, other
>   stop states look a little faster too.
> - Adjusted SPR saving to handle POWER7, haven't tested it.
> 
> Since v1:
> - More review comments from Gautham.
> - Rename isa3_ to isa300_ prefix.
> - Tinkered with some comments, copyright notice, changelog.
> - Cede and regular idle do not go via KVM secondary wakeup code path,
>   so hwthread_state stores and barriers can be simplified, and some
>   KVM code paths simplified a little.
> 
> Since v2:
> - Rebase, SLB reload patch has been merged.
> - More testing. Tested machine check idle wakeup path with mambo stepping
>   through instructions.
> 
> Since v3:
> - Build fixes caught by CI
> 
> Since v4:
> - PSSCR test PLS rather than RL (Akshay)
> 
> Since v5:
> - Fix TB loss test to use PLS instead of RL as well
> - Rename hv_loss variable to spr_loss to better describe its usage
> - Clamp the SPR loss level to shallower of SPR loss or TB loss in case
>   future CPU has that behaviour (P8 type behaviour).
> - Added a few more comments.
> 
> Since v6:
> - Comment improvements
> - Remove the restore_cpu() simplification. Now that restore_cpu is not
>   called from idle, it can be simplified, however it's not required so
>   leave that to a future patch, to avoid risking change to boot/kexec
>   paths.
> - Actually use the stack red zone rather than pt_regs beyond it to save
>   GPRs. A MCE or SRESET while saving regs (that runs with MSR[RI]=1 on
>   P9) would have trashed our saved regs.
> 
> Since v7:
> - Hopefully fix KVM dependent threads mode.
> - Split KVM patch out.
> ---
>  arch/powerpc/include/asm/cpuidle.h       |   19 +-
>  arch/powerpc/include/asm/paca.h          |   41 +-
>  arch/powerpc/include/asm/processor.h     |    9 +-
>  arch/powerpc/include/asm/reg.h           |    8 +-
>  arch/powerpc/kernel/asm-offsets.c        |   17 -
>  arch/powerpc/kernel/exceptions-64s.S     |   21 +-
>  arch/powerpc/kernel/idle_book3s.S        | 1053 +++-------------------
>  arch/powerpc/kernel/setup-common.c       |    4 +-
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S  |    6 +-
>  arch/powerpc/platforms/powernv/idle.c    |  843 +++++++++++++----
>  arch/powerpc/platforms/powernv/subcore.c |    2 +-
>  arch/powerpc/xmon/xmon.c                 |   25 +-
>  12 files changed, 902 insertions(+), 1146 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h
> index 43e5f31fe64d..9844b3ded187 100644
> --- a/arch/powerpc/include/asm/cpuidle.h
> +++ b/arch/powerpc/include/asm/cpuidle.h
> @@ -27,10 +27,11 @@
>   * the THREAD_WINKLE_BITS are set, which indicate which threads have not
>   * yet woken from the winkle state.
>   */
> -#define PNV_CORE_IDLE_LOCK_BIT			0x10000000
> +#define NR_PNV_CORE_IDLE_LOCK_BIT		28
> +#define PNV_CORE_IDLE_LOCK_BIT			(1ULL << NR_PNV_CORE_IDLE_LOCK_BIT)
> 
> +#define PNV_CORE_IDLE_WINKLE_COUNT_SHIFT	16
>  #define PNV_CORE_IDLE_WINKLE_COUNT		0x00010000
> -#define PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT	0x00080000
>  #define PNV_CORE_IDLE_WINKLE_COUNT_BITS		0x000F0000
>  #define PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT	8
>  #define PNV_CORE_IDLE_THREAD_WINKLE_BITS	0x0000FF00
> @@ -68,16 +69,6 @@
>  #define ERR_DEEP_STATE_ESL_MISMATCH	-2
> 
>  #ifndef __ASSEMBLY__
> -/* Additional SPRs that need to be saved/restored during stop */
> -struct stop_sprs {
> -	u64 pid;
> -	u64 ldbar;
> -	u64 fscr;
> -	u64 hfscr;
> -	u64 mmcr1;
> -	u64 mmcr2;
> -	u64 mmcra;
> -};
> 
>  #define PNV_IDLE_NAME_LEN    16
>  struct pnv_idle_states_t {
> @@ -92,10 +83,6 @@ struct pnv_idle_states_t {
> 
>  extern struct pnv_idle_states_t *pnv_idle_states;
>  extern int nr_pnv_idle_states;
> -extern u32 pnv_fastsleep_workaround_at_entry[];
> -extern u32 pnv_fastsleep_workaround_at_exit[];
> -
> -extern u64 pnv_first_deep_stop_state;
> 
>  unsigned long pnv_cpu_offline(unsigned int cpu);
>  int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags);
> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
> index e843bc5d1a0f..e55dedd7ee3e 100644
> --- a/arch/powerpc/include/asm/paca.h
> +++ b/arch/powerpc/include/asm/paca.h
> @@ -173,8 +173,8 @@ struct paca_struct {
>  	u8 irq_happened;		/* irq happened while soft-disabled */
>  	u8 io_sync;			/* writel() needs spin_unlock sync */
>  	u8 irq_work_pending;		/* IRQ_WORK interrupt while soft-disable */
> -	u8 nap_state_lost;		/* NV GPR values lost in power7_idle */
>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +	u8 nap_state_lost;		/* NV GPR values lost in power7_idle */
>  	u8 pmcregs_in_use;		/* pseries puts this in lppaca */
>  #endif
>  	u64 sprg_vdso;			/* Saved user-visible sprg */
> @@ -183,23 +183,28 @@ struct paca_struct {
>  #endif
> 
>  #ifdef CONFIG_PPC_POWERNV
> -	/* Per-core mask tracking idle threads and a lock bit-[L][TTTTTTTT] */
> -	u32 *core_idle_state_ptr;
> -	u8 thread_idle_state;		/* PNV_THREAD_RUNNING/NAP/SLEEP	*/
> -	/* Mask to indicate thread id in core */
> -	u8 thread_mask;
> -	/* Mask to denote subcore sibling threads */
> -	u8 subcore_sibling_mask;
> -	/* Flag to request this thread not to stop */
> -	atomic_t dont_stop;
> -	/* The PSSCR value that the kernel requested before going to stop */
> -	u64 requested_psscr;
> -
> -	/*
> -	 * Save area for additional SPRs that need to be
> -	 * saved/restored during cpuidle stop.
> -	 */
> -	struct stop_sprs stop_sprs;
> +	/* PowerNV idle fields */
> +	/* PNV_CORE_IDLE_* bits, all siblings work on thread 0 paca */
> +	unsigned long idle_state;
> +	union {
> +		/* P7/P8 specific fields */
> +		struct {
> +			/* PNV_THREAD_RUNNING/NAP/SLEEP	*/
> +			u8 thread_idle_state;
> +			/* Mask to denote subcore sibling threads */
> +			u8 subcore_sibling_mask;
> +		};
> +
> +		/* P9 specific fields */
> +		struct {
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +			/* The PSSCR value that the kernel requested before going to stop */
> +			u64 requested_psscr;
> +			/* Flag to request this thread not to stop */
> +			atomic_t dont_stop;
> +#endif
> +		};
> +	};
>  #endif
> 
>  #ifdef CONFIG_PPC_BOOK3S_64
> diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
> index 3351bcf42f2d..3120cca72e1f 100644
> --- a/arch/powerpc/include/asm/processor.h
> +++ b/arch/powerpc/include/asm/processor.h
> @@ -411,14 +411,17 @@ static inline unsigned long get_clean_sp(unsigned long sp, int is_32)
>  }
>  #endif
> 
> +/* asm stubs */
> +extern unsigned long isa300_idle_stop_noloss(unsigned long psscr_val);
> +extern unsigned long isa300_idle_stop_mayloss(unsigned long psscr_val);
> +extern unsigned long isa206_idle_insn_mayloss(unsigned long type);
> +
>  extern unsigned long cpuidle_disable;
>  enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
> 
>  extern int powersave_nap;	/* set if nap mode can be used in idle loop */
> -extern unsigned long power7_idle_insn(unsigned long type); /* PNV_THREAD_NAP/etc*/
> +
>  extern void power7_idle_type(unsigned long type);
> -extern unsigned long power9_idle_stop(unsigned long psscr_val);
> -extern unsigned long power9_offline_stop(unsigned long psscr_val);
>  extern void power9_idle_type(unsigned long stop_psscr_val,
>  			      unsigned long stop_psscr_mask);
> 
> diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
> index c5b2aff0ce8e..10caa145f98b 100644
> --- a/arch/powerpc/include/asm/reg.h
> +++ b/arch/powerpc/include/asm/reg.h
> @@ -168,6 +168,7 @@
>  #define PSSCR_ESL		0x00200000 /* Enable State Loss */
>  #define PSSCR_SD		0x00400000 /* Status Disable */
>  #define PSSCR_PLS	0xf000000000000000 /* Power-saving Level Status */
> +#define PSSCR_PLS_SHIFT	60
>  #define PSSCR_GUEST_VIS	0xf0000000000003ffUL /* Guest-visible PSSCR fields */
>  #define PSSCR_FAKE_SUSPEND	0x00000400 /* Fake-suspend bit (P9 DD2.2) */
>  #define PSSCR_FAKE_SUSPEND_LG	10	   /* Fake-suspend bit position */
> @@ -758,10 +759,9 @@
>  #define	  SRR1_WAKERESET	0x00100000 /* System reset */
>  #define   SRR1_WAKEHDBELL	0x000c0000 /* Hypervisor doorbell on P8 */
>  #define	  SRR1_WAKESTATE	0x00030000 /* Powersave exit mask [46:47] */
> -#define	  SRR1_WS_DEEPEST	0x00030000 /* Some resources not maintained,
> -					  * may not be recoverable */
> -#define	  SRR1_WS_DEEPER	0x00020000 /* Some resources not maintained */
> -#define	  SRR1_WS_DEEP		0x00010000 /* All resources maintained */
> +#define	  SRR1_WS_HVLOSS	0x00030000 /* HV resources not maintained */
> +#define	  SRR1_WS_GPRLOSS	0x00020000 /* GPRs not maintained */
> +#define	  SRR1_WS_NOLOSS	0x00010000 /* All resources maintained */
>  #define   SRR1_PROGTM		0x00200000 /* TM Bad Thing */
>  #define   SRR1_PROGFPE		0x00100000 /* Floating Point Enabled */
>  #define   SRR1_PROGILL		0x00080000 /* Illegal instruction */
> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
> index 86a61e5f8285..167a59fda12e 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -766,23 +766,6 @@ int main(void)
>  	OFFSET(VCPU_TIMING_LAST_ENTER_TBL, kvm_vcpu, arch.timing_last_enter.tv32.tbl);
>  #endif
> 
> -#ifdef CONFIG_PPC_POWERNV
> -	OFFSET(PACA_CORE_IDLE_STATE_PTR, paca_struct, core_idle_state_ptr);
> -	OFFSET(PACA_THREAD_IDLE_STATE, paca_struct, thread_idle_state);
> -	OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask);
> -	OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask);
> -	OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr);
> -	OFFSET(PACA_DONT_STOP, paca_struct, dont_stop);
> -#define STOP_SPR(x, f)	OFFSET(x, paca_struct, stop_sprs.f)
> -	STOP_SPR(STOP_PID, pid);
> -	STOP_SPR(STOP_LDBAR, ldbar);
> -	STOP_SPR(STOP_FSCR, fscr);
> -	STOP_SPR(STOP_HFSCR, hfscr);
> -	STOP_SPR(STOP_MMCR1, mmcr1);
> -	STOP_SPR(STOP_MMCR2, mmcr2);
> -	STOP_SPR(STOP_MMCRA, mmcra);
> -#endif
> -
>  	DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER);
>  	DEFINE(PPC_DBELL_MSGTYPE, PPC_DBELL_MSGTYPE);
> 
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index a5b8fbae56a0..7d54cfa5ca84 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -144,8 +144,11 @@ TRAMP_KVM(PACA_EXNMI, 0x100)
> 
>  #ifdef CONFIG_PPC_P7_NAP
>  EXC_COMMON_BEGIN(system_reset_idle_common)
> -	mfspr	r12,SPRN_SRR1
> -	b	pnv_powersave_wakeup
> +	mfspr	r3,SPRN_SRR1
> +#ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +	bltlr	cr3	/* no state loss, return to idle caller */
> +#endif
> +	b	idle_return_gpr_loss
>  #endif
> 
>  /*
> @@ -427,17 +430,19 @@ EXC_COMMON_BEGIN(machine_check_idle_common)
>  	 * Then decrement MCE nesting after finishing with the stack.
>  	 */
>  	ld	r3,_MSR(r1)
> +	ld	r4,_LINK(r1)
> 
>  	lhz	r11,PACA_IN_MCE(r13)
>  	subi	r11,r11,1
>  	sth	r11,PACA_IN_MCE(r13)
> 
> -	/* Turn off the RI bit because SRR1 is used by idle wakeup code. */
> -	/* Recoverability could be improved by reducing the use of SRR1. */
> -	li	r11,0
> -	mtmsrd	r11,1
> -
> -	b	pnv_powersave_wakeup_mce
> +	mtlr	r4
> +	rlwinm	r10,r3,47-31,30,31
> +	cmpwi	cr3,r10,2
> +#ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +	bltlr	cr3	/* no state loss, return to idle caller */
> +#endif
> +	b	idle_return_gpr_loss
>  #endif
>  	/*
>  	 * Handle machine check early in real mode. We come here with
> diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
> index 7f5ac2e8581b..af002b82145d 100644
> --- a/arch/powerpc/kernel/idle_book3s.S
> +++ b/arch/powerpc/kernel/idle_book3s.S
> @@ -1,519 +1,98 @@
>  /*
> - *  This file contains idle entry/exit functions for POWER7,
> - *  POWER8 and POWER9 CPUs.
> + *  Copyright 2018, IBM Corporation.
>   *
>   *  This program is free software; you can redistribute it and/or
>   *  modify it under the terms of the GNU General Public License
>   *  as published by the Free Software Foundation; either version
>   *  2 of the License, or (at your option) any later version.
> + *
> + *  This file contains general idle entry/exit functions to save
> + *  and restore stack and NVGPRs which allows C code to call idle
> + *  states that lose GPRs, and it will return transparently with
> + *  SRR1 wakeup reason return value.
> + *
> + *  The platform / CPU caller must ensure SPRs and any other non-GPR
> + *  state is saved and restored correctly, handle KVM, interrupts, etc.
>   */
> 
> -#include <linux/threads.h>
> -#include <asm/processor.h>
> -#include <asm/page.h>
> -#include <asm/cputable.h>
> -#include <asm/thread_info.h>
>  #include <asm/ppc_asm.h>
>  #include <asm/asm-offsets.h>
>  #include <asm/ppc-opcode.h>
> -#include <asm/hw_irq.h>
> -#include <asm/kvm_book3s_asm.h>
> -#include <asm/opal.h>
>  #include <asm/cpuidle.h>
> -#include <asm/exception-64s.h>
> -#include <asm/book3s/64/mmu-hash.h>
> -#include <asm/mmu.h>
> -#include <asm/asm-compat.h>
> -#include <asm/feature-fixups.h>
> -
> -#undef DEBUG
> -
> -/*
> - * Use unused space in the interrupt stack to save and restore
> - * registers for winkle support.
> - */
> -#define _MMCR0	GPR0
> -#define _SDR1	GPR3
> -#define _PTCR	GPR3
> -#define _RPR	GPR4
> -#define _SPURR	GPR5
> -#define _PURR	GPR6
> -#define _TSCR	GPR7
> -#define _DSCR	GPR8
> -#define _AMOR	GPR9
> -#define _WORT	GPR10
> -#define _WORC	GPR11
> -#define _LPCR	GPR12
> -
> -#define PSSCR_EC_ESL_MASK_SHIFTED          (PSSCR_EC | PSSCR_ESL) >> 16
> -
> -	.text
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +#include <asm/kvm_book3s_asm.h>
> +#endif
> 
>  /*
> - * Used by threads before entering deep idle states. Saves SPRs
> - * in interrupt stack frame
> - */
> -save_sprs_to_stack:
> -	/*
> -	 * Note all register i.e per-core, per-subcore or per-thread is saved
> -	 * here since any thread in the core might wake up first
> -	 */
> -BEGIN_FTR_SECTION
> -	/*
> -	 * Note - SDR1 is dropped in Power ISA v3. Hence not restoring
> -	 * SDR1 here
> -	 */
> -	mfspr	r3,SPRN_PTCR
> -	std	r3,_PTCR(r1)
> -	mfspr	r3,SPRN_LPCR
> -	std	r3,_LPCR(r1)
> -FTR_SECTION_ELSE
> -	mfspr	r3,SPRN_SDR1
> -	std	r3,_SDR1(r1)
> -ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
> -	mfspr	r3,SPRN_RPR
> -	std	r3,_RPR(r1)
> -	mfspr	r3,SPRN_SPURR
> -	std	r3,_SPURR(r1)
> -	mfspr	r3,SPRN_PURR
> -	std	r3,_PURR(r1)
> -	mfspr	r3,SPRN_TSCR
> -	std	r3,_TSCR(r1)
> -	mfspr	r3,SPRN_DSCR
> -	std	r3,_DSCR(r1)
> -	mfspr	r3,SPRN_AMOR
> -	std	r3,_AMOR(r1)
> -	mfspr	r3,SPRN_WORT
> -	std	r3,_WORT(r1)
> -	mfspr	r3,SPRN_WORC
> -	std	r3,_WORC(r1)
> -/*
> - * On POWER9, there are idle states such as stop4, invoked via cpuidle,
> - * that lose hypervisor resources. In such cases, we need to save
> - * additional SPRs before entering those idle states so that they can
> - * be restored to their older values on wakeup from the idle state.
> + * Desired PSSCR in r3
>   *
> - * On POWER8, the only such deep idle state is winkle which is used
> - * only in the context of CPU-Hotplug, where these additional SPRs are
> - * reinitiazed to a sane value. Hence there is no need to save/restore
> - * these SPRs.
> - */
> -BEGIN_FTR_SECTION
> -	blr
> -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
> -
> -power9_save_additional_sprs:
> -	mfspr	r3, SPRN_PID
> -	mfspr	r4, SPRN_LDBAR
> -	std	r3, STOP_PID(r13)
> -	std	r4, STOP_LDBAR(r13)
> -
> -	mfspr	r3, SPRN_FSCR
> -	mfspr	r4, SPRN_HFSCR
> -	std	r3, STOP_FSCR(r13)
> -	std	r4, STOP_HFSCR(r13)
> -
> -	mfspr	r3, SPRN_MMCRA
> -	mfspr	r4, SPRN_MMCR0
> -	std	r3, STOP_MMCRA(r13)
> -	std	r4, _MMCR0(r1)
> -
> -	mfspr	r3, SPRN_MMCR1
> -	mfspr	r4, SPRN_MMCR2
> -	std	r3, STOP_MMCR1(r13)
> -	std	r4, STOP_MMCR2(r13)
> -	blr
> -
> -power9_restore_additional_sprs:
> -	ld	r3,_LPCR(r1)
> -	ld	r4, STOP_PID(r13)
> -	mtspr	SPRN_LPCR,r3
> -	mtspr	SPRN_PID, r4
> -
> -	ld	r3, STOP_LDBAR(r13)
> -	ld	r4, STOP_FSCR(r13)
> -	mtspr	SPRN_LDBAR, r3
> -	mtspr	SPRN_FSCR, r4
> -
> -	ld	r3, STOP_HFSCR(r13)
> -	ld	r4, STOP_MMCRA(r13)
> -	mtspr	SPRN_HFSCR, r3
> -	mtspr	SPRN_MMCRA, r4
> -
> -	ld	r3, _MMCR0(r1)
> -	ld	r4, STOP_MMCR1(r13)
> -	mtspr	SPRN_MMCR0, r3
> -	mtspr	SPRN_MMCR1, r4
> -
> -	ld	r3, STOP_MMCR2(r13)
> -	ld	r4, PACA_SPRG_VDSO(r13)
> -	mtspr	SPRN_MMCR2, r3
> -	mtspr	SPRN_SPRG3, r4
> -	blr
> -
> -/*
> - * Used by threads when the lock bit of core_idle_state is set.
> - * Threads will spin in HMT_LOW until the lock bit is cleared.
> - * r14 - pointer to core_idle_state
> - * r15 - used to load contents of core_idle_state
> - * r9  - used as a temporary variable
> + * No state will be lost regardless of wakeup mechanism (interrupt or NIA).
> + *
> + * An EC=0 type wakeup will return with a value of 0. SRESET wakeup (which can
> + * happen with xscom SRESET and possibly MCE) may clobber volatiles except LR,
> + * and must blr, to return to caller with r3 set according to caller's expected
> + * return code (for Book3S/64 that is SRR1).
>   */
> -
> -core_idle_lock_held:
> -	HMT_LOW
> -3:	lwz	r15,0(r14)
> -	andis.	r15,r15,PNV_CORE_IDLE_LOCK_BIT at h
> -	bne	3b
> -	HMT_MEDIUM
> -	lwarx	r15,0,r14
> -	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT at h
> -	bne-	core_idle_lock_held
> +_GLOBAL(isa300_idle_stop_noloss)
> +	mtspr 	SPRN_PSSCR,r3
> +	PPC_STOP
> +	li	r3,0
>  	blr
> 
>  /*
> - * Pass requested state in r3:
> - *	r3 - PNV_THREAD_NAP/SLEEP/WINKLE in POWER8
> - *	   - Requested PSSCR value in POWER9
> + * Desired PSSCR in r3
> + *
> + * GPRs may be lost, so they are saved here. Wakeup is by interrupt only.
> + * The SRESET wakeup returns to this function's caller by calling
> + * idle_return_gpr_loss with r3 set to desired return value.
> + *
> + * A wakeup without GPR loss may alteratively be handled as in
> + * isa300_idle_stop_noloss and blr directly, as an optimisation.
>   *
> - * Address of idle handler to branch to in realmode in r4
> + * The caller is responsible for saving/restoring SPRs, MSR, timebase,
> + * etc.
>   */
> -pnv_powersave_common:
> -	/* Use r3 to pass state nap/sleep/winkle */
> -	/* NAP is a state loss, we create a regs frame on the
> -	 * stack, fill it up with the state we care about and
> -	 * stick a pointer to it in PACAR1. We really only
> -	 * need to save PC, some CR bits and the NV GPRs,
> -	 * but for now an interrupt frame will do.
> -	 */
> -	mtctr	r4
> -
> -	mflr	r0
> -	std	r0,16(r1)
> -	stdu	r1,-INT_FRAME_SIZE(r1)
> -	std	r0,_LINK(r1)
> -	std	r0,_NIP(r1)
> -
> -	/* We haven't lost state ... yet */
> -	li	r0,0
> -	stb	r0,PACA_NAPSTATELOST(r13)
> -
> -	/* Continue saving state */
> -	SAVE_GPR(2, r1)
> -	SAVE_NVGPRS(r1)
> -	mfcr	r5
> -	std	r5,_CCR(r1)
> +_GLOBAL(isa300_idle_stop_mayloss)
> +	mtspr 	SPRN_PSSCR,r3
>  	std	r1,PACAR1(r13)
> -
> -BEGIN_FTR_SECTION
> -	/*
> -	 * POWER9 does not require real mode to stop, and presently does not
> -	 * set hwthread_state for KVM (threads don't share MMU context), so
> -	 * we can remain in virtual mode for this.
> -	 */
> -	bctr
> -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
> -	/*
> -	 * POWER8
> -	 * Go to real mode to do the nap, as required by the architecture.
> -	 * Also, we need to be in real mode before setting hwthread_state,
> -	 * because as soon as we do that, another thread can switch
> -	 * the MMU context to the guest.
> -	 */
> -	LOAD_REG_IMMEDIATE(r7, MSR_IDLE)
> -	mtmsrd	r7,0
> -	bctr
> -
> -/*
> - * This is the sequence required to execute idle instructions, as
> - * specified in ISA v2.07 (and earlier). MSR[IR] and MSR[DR] must be 0.
> - */
> -#define IDLE_STATE_ENTER_SEQ_NORET(IDLE_INST)			\
> -	/* Magic NAP/SLEEP/WINKLE mode enter sequence */	\
> -	std	r0,0(r1);					\
> -	ptesync;						\
> -	ld	r0,0(r1);					\
> -236:	cmpd	cr0,r0,r0;					\
> -	bne	236b;						\
> -	IDLE_INST;
> -
> -
> -	.globl pnv_enter_arch207_idle_mode
> -pnv_enter_arch207_idle_mode:
> -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> -	/* Tell KVM we're entering idle */
> -	li	r4,KVM_HWTHREAD_IN_IDLE
> -	/******************************************************/
> -	/*  N O T E   W E L L    ! ! !    N O T E   W E L L   */
> -	/* The following store to HSTATE_HWTHREAD_STATE(r13)  */
> -	/* MUST occur in real mode, i.e. with the MMU off,    */
> -	/* and the MMU must stay off until we clear this flag */
> -	/* and test HSTATE_HWTHREAD_REQ(r13) in               */
> -	/* pnv_powersave_wakeup in this file.                 */
> -	/* The reason is that another thread can switch the   */
> -	/* MMU to a guest context whenever this flag is set   */
> -	/* to KVM_HWTHREAD_IN_IDLE, and if the MMU was on,    */
> -	/* that would potentially cause this thread to start  */
> -	/* executing instructions from guest memory in        */
> -	/* hypervisor mode, leading to a host crash or data   */
> -	/* corruption, or worse.                              */
> -	/******************************************************/
> -	stb	r4,HSTATE_HWTHREAD_STATE(r13)
> -#endif
> -	stb	r3,PACA_THREAD_IDLE_STATE(r13)
> -	cmpwi	cr3,r3,PNV_THREAD_SLEEP
> -	bge	cr3,2f
> -	IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP)
> -	/* No return */
> -2:
> -	/* Sleep or winkle */
> -	lbz	r7,PACA_THREAD_MASK(r13)
> -	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
> -	li	r5,0
> -	beq	cr3,3f
> -	lis	r5,PNV_CORE_IDLE_WINKLE_COUNT at h
> -3:
> -lwarx_loop1:
> -	lwarx	r15,0,r14
> -
> -	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT at h
> -	bnel-	core_idle_lock_held
> -
> -	add	r15,r15,r5			/* Add if winkle */
> -	andc	r15,r15,r7			/* Clear thread bit */
> -
> -	andi.	r9,r15,PNV_CORE_IDLE_THREAD_BITS
> -
> -/*
> - * If cr0 = 0, then current thread is the last thread of the core entering
> - * sleep. Last thread needs to execute the hardware bug workaround code if
> - * required by the platform.
> - * Make the workaround call unconditionally here. The below branch call is
> - * patched out when the idle states are discovered if the platform does not
> - * require it.
> - */
> -.global pnv_fastsleep_workaround_at_entry
> -pnv_fastsleep_workaround_at_entry:
> -	beq	fastsleep_workaround_at_entry
> -
> -	stwcx.	r15,0,r14
> -	bne-	lwarx_loop1
> -	isync
> -
> -common_enter: /* common code for all the threads entering sleep or winkle */
> -	bgt	cr3,enter_winkle
> -	IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP)
> -
> -fastsleep_workaround_at_entry:
> -	oris	r15,r15,PNV_CORE_IDLE_LOCK_BIT at h
> -	stwcx.	r15,0,r14
> -	bne-	lwarx_loop1
> -	isync
> -
> -	/* Fast sleep workaround */
> -	li	r3,1
> -	li	r4,1
> -	bl	opal_config_cpu_idle_state
> -
> -	/* Unlock */
> -	xoris	r15,r15,PNV_CORE_IDLE_LOCK_BIT at h
> -	lwsync
> -	stw	r15,0(r14)
> -	b	common_enter
> -
> -enter_winkle:
> -	bl	save_sprs_to_stack
> -
> -	IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE)
> -
> -/*
> - * r3 - PSSCR value corresponding to the requested stop state.
> - */
> -power_enter_stop:
> -/*
> - * Check if we are executing the lite variant with ESL=EC=0
> - */
> -	andis.   r4,r3,PSSCR_EC_ESL_MASK_SHIFTED
> -	clrldi   r3,r3,60 /* r3 = Bits[60:63] = Requested Level (RL) */
> -	bne	 .Lhandle_esl_ec_set
> +	mflr	r4
> +	mfcr	r5
> +	/* use stack red zone rather than a new frame for saving regs */
> +	std	r2,-8*0(r1)
> +	std	r14,-8*1(r1)
> +	std	r15,-8*2(r1)
> +	std	r16,-8*3(r1)
> +	std	r17,-8*4(r1)
> +	std	r18,-8*5(r1)
> +	std	r19,-8*6(r1)
> +	std	r20,-8*7(r1)
> +	std	r21,-8*8(r1)
> +	std	r22,-8*9(r1)
> +	std	r23,-8*10(r1)
> +	std	r24,-8*11(r1)
> +	std	r25,-8*12(r1)
> +	std	r26,-8*13(r1)
> +	std	r27,-8*14(r1)
> +	std	r28,-8*15(r1)
> +	std	r29,-8*16(r1)
> +	std	r30,-8*17(r1)
> +	std	r31,-8*18(r1)
> +	std	r4,-8*19(r1)
> +	std	r5,-8*20(r1)
> +	/* 168 bytes */
>  	PPC_STOP
> -	li	r3,0  /* Since we didn't lose state, return 0 */
> -	std	r3, PACA_REQ_PSSCR(r13)
> -
> -	/*
> -	 * pnv_wakeup_noloss() expects r12 to contain the SRR1 value so
> -	 * it can determine if the wakeup reason is an HMI in
> -	 * CHECK_HMI_INTERRUPT.
> -	 *
> -	 * However, when we wakeup with ESL=0, SRR1 will not contain the wakeup
> -	 * reason, so there is no point setting r12 to SRR1.
> -	 *
> -	 * Further, we clear r12 here, so that we don't accidentally enter the
> -	 * HMI in pnv_wakeup_noloss() if the value of r12[42:45] == WAKE_HMI.
> -	 */
> -	li	r12, 0
> -	b 	pnv_wakeup_noloss
> -
> -.Lhandle_esl_ec_set:
> -BEGIN_FTR_SECTION
> -	/*
> -	 * POWER9 DD2.0 or earlier can incorrectly set PMAO when waking up after
> -	 * a state-loss idle. Saving and restoring MMCR0 over idle is a
> -	 * workaround.
> -	 */
> -	mfspr	r4,SPRN_MMCR0
> -	std	r4,_MMCR0(r1)
> -END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1)
> -
> -/*
> - * Check if the requested state is a deep idle state.
> - */
> -	LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
> -	ld	r4,ADDROFF(pnv_first_deep_stop_state)(r5)
> -	cmpd	r3,r4
> -	bge	.Lhandle_deep_stop
> -	PPC_STOP	/* Does not return (system reset interrupt) */
> -
> -.Lhandle_deep_stop:
> -/*
> - * Entering deep idle state.
> - * Clear thread bit in PACA_CORE_IDLE_STATE, save SPRs to
> - * stack and enter stop
> - */
> -	lbz     r7,PACA_THREAD_MASK(r13)
> -	ld      r14,PACA_CORE_IDLE_STATE_PTR(r13)
> -
> -lwarx_loop_stop:
> -	lwarx   r15,0,r14
> -	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT at h
> -	bnel-	core_idle_lock_held
> -	andc    r15,r15,r7                      /* Clear thread bit */
> -
> -	stwcx.  r15,0,r14
> -	bne-    lwarx_loop_stop
> -	isync
> -
> -	bl	save_sprs_to_stack
> -
> -	PPC_STOP	/* Does not return (system reset interrupt) */
> -
> -/*
> - * Entered with MSR[EE]=0 and no soft-masked interrupts pending.
> - * r3 contains desired idle state (PNV_THREAD_NAP/SLEEP/WINKLE).
> - */
> -_GLOBAL(power7_idle_insn)
> -	/* Now check if user or arch enabled NAP mode */
> -	LOAD_REG_ADDR(r4, pnv_enter_arch207_idle_mode)
> -	b	pnv_powersave_common
> -
> -#define CHECK_HMI_INTERRUPT						\
> -BEGIN_FTR_SECTION_NESTED(66);						\
> -	rlwinm	r0,r12,45-31,0xf;  /* extract wake reason field (P8) */	\
> -FTR_SECTION_ELSE_NESTED(66);						\
> -	rlwinm	r0,r12,45-31,0xe;  /* P7 wake reason field is 3 bits */	\
> -ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66);		\
> -	cmpwi	r0,0xa;			/* Hypervisor maintenance ? */	\
> -	bne+	20f;							\
> -	/* Invoke opal call to handle hmi */				\
> -	ld	r2,PACATOC(r13);					\
> -	ld	r1,PACAR1(r13);						\
> -	std	r3,ORIG_GPR3(r1);	/* Save original r3 */		\
> -	li	r3,0;			/* NULL argument */		\
> -	bl	hmi_exception_realmode;					\
> -	nop;								\
> -	ld	r3,ORIG_GPR3(r1);	/* Restore original r3 */	\
> -20:	nop;
> +	b	.	/* catch bugs */
> 
>  /*
> - * Entered with MSR[EE]=0 and no soft-masked interrupts pending.
> - * r3 contains desired PSSCR register value.
> + * Desired return value in r3
>   *
> - * Offline (CPU unplug) case also must notify KVM that the CPU is
> - * idle.
> - */
> -_GLOBAL(power9_offline_stop)
> -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> -	/*
> -	 * Tell KVM we're entering idle.
> -	 * This does not have to be done in real mode because the P9 MMU
> -	 * is independent per-thread. Some steppings share radix/hash mode
> -	 * between threads, but in that case KVM has a barrier sync in real
> -	 * mode before and after switching between radix and hash.
> -	 */
> -	li	r4,KVM_HWTHREAD_IN_IDLE
> -	stb	r4,HSTATE_HWTHREAD_STATE(r13)
> -#endif
> -	/* fall through */
> -
> -_GLOBAL(power9_idle_stop)
> -	std	r3, PACA_REQ_PSSCR(r13)
> -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> -BEGIN_FTR_SECTION
> -	sync
> -	lwz	r5, PACA_DONT_STOP(r13)
> -	cmpwi	r5, 0
> -	bne	1f
> -END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
> -#endif
> -	mtspr 	SPRN_PSSCR,r3
> -	LOAD_REG_ADDR(r4,power_enter_stop)
> -	b	pnv_powersave_common
> -	/* No return */
> -#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> -1:
> -	/*
> -	 * We get here when TM / thread reconfiguration bug workaround
> -	 * code wants to get the CPU into SMT4 mode, and therefore
> -	 * we are being asked not to stop.
> -	 */
> -	li	r3, 0
> -	std	r3, PACA_REQ_PSSCR(r13)
> -	blr		/* return 0 for wakeup cause / SRR1 value */
> -#endif
> -
> -/*
> - * Called from machine check handler for powersave wakeups.
> - * Low level machine check processing has already been done. Now just
> - * go through the wake up path to get everything in order.
> + * The idle wakeup SRESET interrupt can call this after calling
> + * to return to the idle sleep function caller with r3 as the return code.
>   *
> - * r3 - The original SRR1 value.
> - * Original SRR[01] have been clobbered.
> - * MSR_RI is clear.
> + * This must not be used if idle was entered via a _noloss function (use
> + * a simple blr instead).
>   */
> -.global pnv_powersave_wakeup_mce
> -pnv_powersave_wakeup_mce:
> -	/* Set cr3 for pnv_powersave_wakeup */
> -	rlwinm	r11,r3,47-31,30,31
> -	cmpwi	cr3,r11,2
> -
> -	/*
> -	 * Now put the original SRR1 with SRR1_WAKEMCE_RESVD as the wake
> -	 * reason into r12, which allows reuse of the system reset wakeup
> -	 * code without being mistaken for another type of wakeup.
> -	 */
> -	oris	r12,r3,SRR1_WAKEMCE_RESVD at h
> -
> -	b	pnv_powersave_wakeup
> -
> -/*
> - * Called from reset vector for powersave wakeups.
> - * cr3 - set to gt if waking up with partial/complete hypervisor state loss
> - * r12 - SRR1
> - */
> -.global pnv_powersave_wakeup
> -pnv_powersave_wakeup:
> -	ld	r2, PACATOC(r13)
> -
> -BEGIN_FTR_SECTION
> -	bl	pnv_restore_hyp_resource_arch300
> -FTR_SECTION_ELSE
> -	bl	pnv_restore_hyp_resource_arch207
> -ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
> -
> -	li	r0,PNV_THREAD_RUNNING
> -	stb	r0,PACA_THREAD_IDLE_STATE(r13)	/* Clear thread state */
> -
> -	mr	r3,r12
> -
> +_GLOBAL(idle_return_gpr_loss)
>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>  	lbz	r0,HSTATE_HWTHREAD_STATE(r13)
>  	cmpwi	r0,KVM_HWTHREAD_IN_KERNEL
> @@ -527,430 +106,98 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
>  	beq	1f
>  	b	kvm_start_guest
>  1:
> +	lbz	r0,PACA_NAPSTATELOST(r13)
> +	cmpwi	r0,0
> +	bne	2f
> +	bltlr	cr3
> +2:
>  #endif
> -
> -	/* Return SRR1 from power7_nap() */
> -	blt	cr3,pnv_wakeup_noloss
> -	b	pnv_wakeup_loss
> +	ld	r1,PACAR1(r13)
> +	ld	r4,-8*19(r1)
> +	ld	r5,-8*20(r1)
> +	mtlr	r4
> +	mtcr	r5
> +	ld	r2,-8*0(r1)
> +	ld	r14,-8*1(r1)
> +	ld	r15,-8*2(r1)
> +	ld	r16,-8*3(r1)
> +	ld	r17,-8*4(r1)
> +	ld	r18,-8*5(r1)
> +	ld	r19,-8*6(r1)
> +	ld	r20,-8*7(r1)
> +	ld	r21,-8*8(r1)
> +	ld	r22,-8*9(r1)
> +	ld	r23,-8*10(r1)
> +	ld	r24,-8*11(r1)
> +	ld	r25,-8*12(r1)
> +	ld	r26,-8*13(r1)
> +	ld	r27,-8*14(r1)
> +	ld	r28,-8*15(r1)
> +	ld	r29,-8*16(r1)
> +	ld	r30,-8*17(r1)
> +	ld	r31,-8*18(r1)
> +	blr
> 
>  /*
> - * Check whether we have woken up with hypervisor state loss.
> - * If yes, restore hypervisor state and return back to link.
> - *
> - * cr3 - set to gt if waking up with partial/complete hypervisor state loss
> + * This is the sequence required to execute idle instructions, as
> + * specified in ISA v2.07 (and earlier). MSR[IR] and MSR[DR] must be 0.
>   */
> -pnv_restore_hyp_resource_arch300:
> -	/*
> -	 * Workaround for POWER9, if we lost resources, the ERAT
> -	 * might have been mixed up and needs flushing. We also need
> -	 * to reload MMCR0 (see comment above). We also need to set
> -	 * then clear bit 60 in MMCRA to ensure the PMU starts running.
> -	 */
> -	blt	cr3,1f
> -BEGIN_FTR_SECTION
> -	PPC_INVALIDATE_ERAT
> -	ld	r1,PACAR1(r13)
> -	ld	r4,_MMCR0(r1)
> -	mtspr	SPRN_MMCR0,r4
> -END_FTR_SECTION_IFCLR(CPU_FTR_POWER9_DD2_1)
> -	mfspr	r4,SPRN_MMCRA
> -	ori	r4,r4,(1 << (63-60))
> -	mtspr	SPRN_MMCRA,r4
> -	xori	r4,r4,(1 << (63-60))
> -	mtspr	SPRN_MMCRA,r4
> -1:
> -	/*
> -	 * POWER ISA 3. Use PSSCR to determine if we
> -	 * are waking up from deep idle state
> -	 */
> -	LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state)
> -	ld	r4,ADDROFF(pnv_first_deep_stop_state)(r5)
> -
> -	/*
> -	 * 0-3 bits correspond to Power-Saving Level Status
> -	 * which indicates the idle state we are waking up from
> -	 */
> -	mfspr	r5, SPRN_PSSCR
> -	rldicl  r5,r5,4,60
> -	li	r0, 0		/* clear requested_psscr to say we're awake */
> -	std	r0, PACA_REQ_PSSCR(r13)
> -	cmpd	cr4,r5,r4
> -	bge	cr4,pnv_wakeup_tb_loss /* returns to caller */
> -
> -	blr	/* Waking up without hypervisor state loss. */
> -
> -/* Same calling convention as arch300 */
> -pnv_restore_hyp_resource_arch207:
> -	/*
> -	 * POWER ISA 2.07 or less.
> -	 * Check if we slept with sleep or winkle.
> -	 */
> -	lbz	r4,PACA_THREAD_IDLE_STATE(r13)
> -	cmpwi	cr2,r4,PNV_THREAD_NAP
> -	bgt	cr2,pnv_wakeup_tb_loss	/* Either sleep or Winkle */
> -
> -	/*
> -	 * We fall through here if PACA_THREAD_IDLE_STATE shows we are waking
> -	 * up from nap. At this stage CR3 shouldn't contains 'gt' since that
> -	 * indicates we are waking with hypervisor state loss from nap.
> -	 */
> -	bgt	cr3,.
> -
> -	blr	/* Waking up without hypervisor state loss */
> +#define IDLE_STATE_ENTER_SEQ_NORET(IDLE_INST)			\
> +	/* Magic NAP/SLEEP/WINKLE mode enter sequence */	\
> +	std	r0,0(r1);					\
> +	ptesync;						\
> +	ld	r0,0(r1);					\
> +236:	cmpd	cr0,r0,r0;					\
> +	bne	236b;						\
> +	IDLE_INST;						\
> +	b	.	/* catch bugs */
> 
>  /*
> - * Called if waking up from idle state which can cause either partial or
> - * complete hyp state loss.
> - * In POWER8, called if waking up from fastsleep or winkle
> - * In POWER9, called if waking up from stop state >= pnv_first_deep_stop_state
> + * Desired instruction type in r3
>   *
> - * r13 - PACA
> - * cr3 - gt if waking up with partial/complete hypervisor state loss
> + * GPRs may be lost, so they are saved here. Wakeup is by interrupt only.
> + * The SRESET wakeup returns to this function's caller by calling
> + * idle_return_gpr_loss with r3 set to desired return value.
>   *
> - * If ISA300:
> - * cr4 - gt or eq if waking up from complete hypervisor state loss.
> + * A wakeup without GPR loss may alternatively be handled as in
> + * isa300_idle_stop_noloss and blr directly, as an optimisation.
>   *
> - * If ISA207:
> - * r4 - PACA_THREAD_IDLE_STATE
> - */
> -pnv_wakeup_tb_loss:
> -	ld	r1,PACAR1(r13)
> -	/*
> -	 * Before entering any idle state, the NVGPRs are saved in the stack.
> -	 * If there was a state loss, or PACA_NAPSTATELOST was set, then the
> -	 * NVGPRs are restored. If we are here, it is likely that state is lost,
> -	 * but not guaranteed -- neither ISA207 nor ISA300 tests to reach
> -	 * here are the same as the test to restore NVGPRS:
> -	 * PACA_THREAD_IDLE_STATE test for ISA207, PSSCR test for ISA300,
> -	 * and SRR1 test for restoring NVGPRs.
> -	 *
> -	 * We are about to clobber NVGPRs now, so set NAPSTATELOST to
> -	 * guarantee they will always be restored. This might be tightened
> -	 * with careful reading of specs (particularly for ISA300) but this
> -	 * is already a slow wakeup path and it's simpler to be safe.
> -	 */
> -	li	r0,1
> -	stb	r0,PACA_NAPSTATELOST(r13)
> -
> -	/*
> -	 *
> -	 * Save SRR1 and LR in NVGPRs as they might be clobbered in
> -	 * opal_call() (called in CHECK_HMI_INTERRUPT). SRR1 is required
> -	 * to determine the wakeup reason if we branch to kvm_start_guest. LR
> -	 * is required to return back to reset vector after hypervisor state
> -	 * restore is complete.
> -	 */
> -	mr	r19,r12
> -	mr	r18,r4
> -	mflr	r17
> -BEGIN_FTR_SECTION
> -	CHECK_HMI_INTERRUPT
> -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
> -
> -	ld	r14,PACA_CORE_IDLE_STATE_PTR(r13)
> -	lbz	r7,PACA_THREAD_MASK(r13)
> -
> -	/*
> -	 * Take the core lock to synchronize against other threads.
> -	 *
> -	 * Lock bit is set in one of the 2 cases-
> -	 * a. In the sleep/winkle enter path, the last thread is executing
> -	 * fastsleep workaround code.
> -	 * b. In the wake up path, another thread is executing fastsleep
> -	 * workaround undo code or resyncing timebase or restoring context
> -	 * In either case loop until the lock bit is cleared.
> -	 */
> -1:
> -	lwarx	r15,0,r14
> -	andis.	r9,r15,PNV_CORE_IDLE_LOCK_BIT at h
> -	bnel-	core_idle_lock_held
> -	oris	r15,r15,PNV_CORE_IDLE_LOCK_BIT at h
> -	stwcx.	r15,0,r14
> -	bne-	1b
> -	isync
> -
> -	andi.	r9,r15,PNV_CORE_IDLE_THREAD_BITS
> -	cmpwi	cr2,r9,0
> -
> -	/*
> -	 * At this stage
> -	 * cr2 - eq if first thread to wakeup in core
> -	 * cr3-  gt if waking up with partial/complete hypervisor state loss
> -	 * ISA300:
> -	 * cr4 - gt or eq if waking up from complete hypervisor state loss.
> -	 */
> -
> -BEGIN_FTR_SECTION
> -	/*
> -	 * Were we in winkle?
> -	 * If yes, check if all threads were in winkle, decrement our
> -	 * winkle count, set all thread winkle bits if all were in winkle.
> -	 * Check if our thread has a winkle bit set, and set cr4 accordingly
> -	 * (to match ISA300, above). Pseudo-code for core idle state
> -	 * transitions for ISA207 is as follows (everything happens atomically
> -	 * due to store conditional and/or lock bit):
> -	 *
> -	 * nap_idle() { }
> -	 * nap_wake() { }
> -	 *
> -	 * sleep_idle()
> -	 * {
> -	 *	core_idle_state &= ~thread_in_core
> -	 * }
> -	 *
> -	 * sleep_wake()
> -	 * {
> -	 *     bool first_in_core, first_in_subcore;
> -	 *
> -	 *     first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0;
> -	 *     first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0;
> -	 *
> -	 *     core_idle_state |= thread_in_core;
> -	 * }
> -	 *
> -	 * winkle_idle()
> -	 * {
> -	 *	core_idle_state &= ~thread_in_core;
> -	 *	core_idle_state += 1 << WINKLE_COUNT_SHIFT;
> -	 * }
> -	 *
> -	 * winkle_wake()
> -	 * {
> -	 *     bool first_in_core, first_in_subcore, winkle_state_lost;
> -	 *
> -	 *     first_in_core = (core_idle_state & IDLE_THREAD_BITS) == 0;
> -	 *     first_in_subcore = (core_idle_state & SUBCORE_SIBLING_MASK) == 0;
> -	 *
> -	 *     core_idle_state |= thread_in_core;
> -	 *
> -	 *     if ((core_idle_state & WINKLE_MASK) == (8 << WINKLE_COUNT_SIHFT))
> -	 *         core_idle_state |= THREAD_WINKLE_BITS;
> -	 *     core_idle_state -= 1 << WINKLE_COUNT_SHIFT;
> -	 *
> -	 *     winkle_state_lost = core_idle_state &
> -	 *				(thread_in_core << WINKLE_THREAD_SHIFT);
> -	 *     core_idle_state &= ~(thread_in_core << WINKLE_THREAD_SHIFT);
> -	 * }
> -	 *
> -	 */
> -	cmpwi	r18,PNV_THREAD_WINKLE
> -	bne	2f
> -	andis.	r9,r15,PNV_CORE_IDLE_WINKLE_COUNT_ALL_BIT at h
> -	subis	r15,r15,PNV_CORE_IDLE_WINKLE_COUNT at h
> -	beq	2f
> -	ori	r15,r15,PNV_CORE_IDLE_THREAD_WINKLE_BITS /* all were winkle */
> -2:
> -	/* Shift thread bit to winkle mask, then test if this thread is set,
> -	 * and remove it from the winkle bits */
> -	slwi	r8,r7,8
> -	and	r8,r8,r15
> -	andc	r15,r15,r8
> -	cmpwi	cr4,r8,1 /* cr4 will be gt if our bit is set, lt if not */
> -
> -	lbz	r4,PACA_SUBCORE_SIBLING_MASK(r13)
> -	and	r4,r4,r15
> -	cmpwi	r4,0	/* Check if first in subcore */
> -
> -	or	r15,r15,r7		/* Set thread bit */
> -	beq	first_thread_in_subcore
> -END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
> -
> -	or	r15,r15,r7		/* Set thread bit */
> -	beq	cr2,first_thread_in_core
> -
> -	/* Not first thread in core or subcore to wake up */
> -	b	clear_lock
> -
> -first_thread_in_subcore:
> -	/*
> -	 * If waking up from sleep, subcore state is not lost. Hence
> -	 * skip subcore state restore
> -	 */
> -	blt	cr4,subcore_state_restored
> -
> -	/* Restore per-subcore state */
> -	ld      r4,_SDR1(r1)
> -	mtspr   SPRN_SDR1,r4
> -
> -	ld      r4,_RPR(r1)
> -	mtspr   SPRN_RPR,r4
> -	ld	r4,_AMOR(r1)
> -	mtspr	SPRN_AMOR,r4
> -
> -subcore_state_restored:
> -	/*
> -	 * Check if the thread is also the first thread in the core. If not,
> -	 * skip to clear_lock.
> -	 */
> -	bne	cr2,clear_lock
> -
> -first_thread_in_core:
> -
> -	/*
> -	 * First thread in the core waking up from any state which can cause
> -	 * partial or complete hypervisor state loss. It needs to
> -	 * call the fastsleep workaround code if the platform requires it.
> -	 * Call it unconditionally here. The below branch instruction will
> -	 * be patched out if the platform does not have fastsleep or does not
> -	 * require the workaround. Patching will be performed during the
> -	 * discovery of idle-states.
> -	 */
> -.global pnv_fastsleep_workaround_at_exit
> -pnv_fastsleep_workaround_at_exit:
> -	b	fastsleep_workaround_at_exit
> -
> -timebase_resync:
> -	/*
> -	 * Use cr3 which indicates that we are waking up with atleast partial
> -	 * hypervisor state loss to determine if TIMEBASE RESYNC is needed.
> -	 */
> -	ble	cr3,.Ltb_resynced
> -	/* Time base re-sync */
> -	bl	opal_resync_timebase;
> -	/*
> -	 * If waking up from sleep (POWER8), per core state
> -	 * is not lost, skip to clear_lock.
> -	 */
> -.Ltb_resynced:
> -	blt	cr4,clear_lock
> -
> -	/*
> -	 * First thread in the core to wake up and its waking up with
> -	 * complete hypervisor state loss. Restore per core hypervisor
> -	 * state.
> -	 */
> -BEGIN_FTR_SECTION
> -	ld	r4,_PTCR(r1)
> -	mtspr	SPRN_PTCR,r4
> -	ld	r4,_RPR(r1)
> -	mtspr	SPRN_RPR,r4
> -	ld	r4,_AMOR(r1)
> -	mtspr	SPRN_AMOR,r4
> -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
> -
> -	ld	r4,_TSCR(r1)
> -	mtspr	SPRN_TSCR,r4
> -	ld	r4,_WORC(r1)
> -	mtspr	SPRN_WORC,r4
> -
> -clear_lock:
> -	xoris	r15,r15,PNV_CORE_IDLE_LOCK_BIT at h
> -	lwsync
> -	stw	r15,0(r14)
> -
> -common_exit:
> -	/*
> -	 * Common to all threads.
> -	 *
> -	 * If waking up from sleep, hypervisor state is not lost. Hence
> -	 * skip hypervisor state restore.
> -	 */
> -	blt	cr4,hypervisor_state_restored
> -
> -	/* Waking up from winkle */
> -
> -BEGIN_MMU_FTR_SECTION
> -	b	no_segments
> -END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
> -	/* Restore SLB  from PACA */
> -	ld	r8,PACA_SLBSHADOWPTR(r13)
> -
> -	.rept	SLB_NUM_BOLTED
> -	li	r3, SLBSHADOW_SAVEAREA
> -	LDX_BE	r5, r8, r3
> -	addi	r3, r3, 8
> -	LDX_BE	r6, r8, r3
> -	andis.	r7,r5,SLB_ESID_V at h
> -	beq	1f
> -	slbmte	r6,r5
> -1:	addi	r8,r8,16
> -	.endr
> -no_segments:
> -
> -	/* Restore per thread state */
> -
> -	ld	r4,_SPURR(r1)
> -	mtspr	SPRN_SPURR,r4
> -	ld	r4,_PURR(r1)
> -	mtspr	SPRN_PURR,r4
> -	ld	r4,_DSCR(r1)
> -	mtspr	SPRN_DSCR,r4
> -	ld	r4,_WORT(r1)
> -	mtspr	SPRN_WORT,r4
> -
> -	/* Call cur_cpu_spec->cpu_restore() */
> -	LOAD_REG_ADDR(r4, cur_cpu_spec)
> -	ld	r4,0(r4)
> -	ld	r12,CPU_SPEC_RESTORE(r4)
> -#ifdef PPC64_ELF_ABI_v1
> -	ld	r12,0(r12)
> -#endif
> -	mtctr	r12
> -	bctrl
> -
> -/*
> - * On POWER9, we can come here on wakeup from a cpuidle stop state.
> - * Hence restore the additional SPRs to the saved value.
> + * The caller is responsible for saving/restoring SPRs, MSR, timebase,
> + * etc.
>   *
> - * On POWER8, we come here only on winkle. Since winkle is used
> - * only in the case of CPU-Hotplug, we don't need to restore
> - * the additional SPRs.
> - */
> -BEGIN_FTR_SECTION
> -	bl 	power9_restore_additional_sprs
> -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
> -hypervisor_state_restored:
> -
> -	mr	r12,r19
> -	mtlr	r17
> -	blr		/* return to pnv_powersave_wakeup */
> -
> -fastsleep_workaround_at_exit:
> -	li	r3,1
> -	li	r4,0
> -	bl	opal_config_cpu_idle_state
> -	b	timebase_resync
> -
> -/*
> - * R3 here contains the value that will be returned to the caller
> - * of power7_nap.
> - * R12 contains SRR1 for CHECK_HMI_INTERRUPT.
> + * This must be called in real-mode (MSR_IDLE).
>   */
> -.global pnv_wakeup_loss
> -pnv_wakeup_loss:
> -	ld	r1,PACAR1(r13)
> -BEGIN_FTR_SECTION
> -	CHECK_HMI_INTERRUPT
> -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
> -	REST_NVGPRS(r1)
> -	REST_GPR(2, r1)
> -	ld	r4,PACAKMSR(r13)
> -	ld	r5,_LINK(r1)
> -	ld	r6,_CCR(r1)
> -	addi	r1,r1,INT_FRAME_SIZE
> -	mtlr	r5
> -	mtcr	r6
> -	mtmsrd	r4
> -	blr
> +_GLOBAL(isa206_idle_insn_mayloss)
> +	std	r1,PACAR1(r13)
> +	mflr	r4
> +	mfcr	r5
> +	/* use stack red zone rather than a new frame for saving regs */
> +	std	r2,-8*0(r1)
> +	std	r14,-8*1(r1)
> +	std	r15,-8*2(r1)
> +	std	r16,-8*3(r1)
> +	std	r17,-8*4(r1)
> +	std	r18,-8*5(r1)
> +	std	r19,-8*6(r1)
> +	std	r20,-8*7(r1)
> +	std	r21,-8*8(r1)
> +	std	r22,-8*9(r1)
> +	std	r23,-8*10(r1)
> +	std	r24,-8*11(r1)
> +	std	r25,-8*12(r1)
> +	std	r26,-8*13(r1)
> +	std	r27,-8*14(r1)
> +	std	r28,-8*15(r1)
> +	std	r29,-8*16(r1)
> +	std	r30,-8*17(r1)
> +	std	r31,-8*18(r1)
> +	std	r4,-8*19(r1)
> +	std	r5,-8*20(r1)
> +	cmpwi	r3,PNV_THREAD_NAP
> +	bne	1f
> +	IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP)
> +1:	cmpwi	r3,PNV_THREAD_SLEEP
> +	bne	2f
> +	IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP)
> +2:	IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE)
> 
> -/*
> - * R3 here contains the value that will be returned to the caller
> - * of power7_nap.
> - * R12 contains SRR1 for CHECK_HMI_INTERRUPT.
> - */
> -pnv_wakeup_noloss:
> -	lbz	r0,PACA_NAPSTATELOST(r13)
> -	cmpwi	r0,0
> -	bne	pnv_wakeup_loss
> -	ld	r1,PACAR1(r13)
> -BEGIN_FTR_SECTION
> -	CHECK_HMI_INTERRUPT
> -END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
> -	ld	r4,PACAKMSR(r13)
> -	ld	r5,_NIP(r1)
> -	ld	r6,_CCR(r1)
> -	addi	r1,r1,INT_FRAME_SIZE
> -	mtlr	r5
> -	mtcr	r6
> -	mtmsrd	r4
> -	blr
> diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c
> index 2e5dfb6e0823..8b4858f82229 100644
> --- a/arch/powerpc/kernel/setup-common.c
> +++ b/arch/powerpc/kernel/setup-common.c
> @@ -401,8 +401,8 @@ void __init check_for_initrd(void)
> 
>  #ifdef CONFIG_SMP
> 
> -int threads_per_core, threads_per_subcore, threads_shift;
> -cpumask_t threads_core_mask;
> +int threads_per_core, threads_per_subcore, threads_shift __read_mostly;
> +cpumask_t threads_core_mask __read_mostly;
>  EXPORT_SYMBOL_GPL(threads_per_core);
>  EXPORT_SYMBOL_GPL(threads_per_subcore);
>  EXPORT_SYMBOL_GPL(threads_shift);
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index 3a5e719ef032..58d0f1ba845d 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -448,8 +448,10 @@ kvm_no_guest:
>  	rlwimi	r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
>  	mtspr	SPRN_LPCR, r4
>  	li	r3, 0
> -	mfspr	r12,SPRN_SRR1
> -	b	pnv_wakeup_loss
> +	/* set up cr3 and r3 for return */
> +	cmpdi	cr3, r3, 0
> +	mfspr	r3,SPRN_SRR1
> +	b	idle_return_gpr_loss
> 
>  53:	HMT_LOW
>  	ld	r5, HSTATE_KVM_VCORE(r13)
> diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
> index e52f9b06dd9c..6ea1543c2d6d 100644
> --- a/arch/powerpc/platforms/powernv/idle.c
> +++ b/arch/powerpc/platforms/powernv/idle.c
> @@ -16,6 +16,7 @@
>  #include <linux/device.h>
>  #include <linux/cpu.h>
> 
> +#include <asm/asm-prototypes.h>
>  #include <asm/firmware.h>
>  #include <asm/machdep.h>
>  #include <asm/opal.h>
> @@ -48,10 +49,10 @@ static u64 pnv_default_stop_mask;
>  static bool default_stop_found;
> 
>  /*
> - * First deep stop state. Used to figure out when to save/restore
> - * hypervisor context.
> + * First stop state levels when SPR and TB loss can occur.
>   */
> -u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
> +static u64 pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
> +static u64 pnv_first_spr_loss_level = MAX_STOP_STATE + 1;
> 
>  /*
>   * psscr value and mask of the deepest stop idle state.
> @@ -62,6 +63,8 @@ static u64 pnv_deepest_stop_psscr_mask;
>  static u64 pnv_deepest_stop_flag;
>  static bool deepest_stop_found;
> 
> +static unsigned long power7_offline_type;
> +
>  static int pnv_save_sprs_for_deep_states(void)
>  {
>  	int cpu;
> @@ -72,12 +75,12 @@ static int pnv_save_sprs_for_deep_states(void)
>  	 * all cpus at boot. Get these reg values of current cpu and use the
>  	 * same across all cpus.
>  	 */
> -	uint64_t lpcr_val = mfspr(SPRN_LPCR);
> -	uint64_t hid0_val = mfspr(SPRN_HID0);
> -	uint64_t hid1_val = mfspr(SPRN_HID1);
> -	uint64_t hid4_val = mfspr(SPRN_HID4);
> -	uint64_t hid5_val = mfspr(SPRN_HID5);
> -	uint64_t hmeer_val = mfspr(SPRN_HMEER);
> +	uint64_t lpcr_val	= mfspr(SPRN_LPCR);
> +	uint64_t hid0_val	= mfspr(SPRN_HID0);
> +	uint64_t hid1_val	= mfspr(SPRN_HID1);
> +	uint64_t hid4_val	= mfspr(SPRN_HID4);
> +	uint64_t hid5_val	= mfspr(SPRN_HID5);
> +	uint64_t hmeer_val	= mfspr(SPRN_HMEER);
>  	uint64_t msr_val = MSR_IDLE;
>  	uint64_t psscr_val = pnv_deepest_stop_psscr_val;
> 
> @@ -137,89 +140,6 @@ static int pnv_save_sprs_for_deep_states(void)
>  	return 0;
>  }
> 
> -static void pnv_alloc_idle_core_states(void)
> -{
> -	int i, j;
> -	int nr_cores = cpu_nr_cores();
> -	u32 *core_idle_state;
> -
> -	/*
> -	 * core_idle_state - The lower 8 bits track the idle state of
> -	 * each thread of the core.
> -	 *
> -	 * The most significant bit is the lock bit.
> -	 *
> -	 * Initially all the bits corresponding to threads_per_core
> -	 * are set. They are cleared when the thread enters deep idle
> -	 * state like sleep and winkle/stop.
> -	 *
> -	 * Initially the lock bit is cleared.  The lock bit has 2
> -	 * purposes:
> -	 * 	a. While the first thread in the core waking up from
> -	 * 	   idle is restoring core state, it prevents other
> -	 * 	   threads in the core from switching to process
> -	 * 	   context.
> -	 * 	b. While the last thread in the core is saving the
> -	 *	   core state, it prevents a different thread from
> -	 *	   waking up.
> -	 */
> -	for (i = 0; i < nr_cores; i++) {
> -		int first_cpu = i * threads_per_core;
> -		int node = cpu_to_node(first_cpu);
> -		size_t paca_ptr_array_size;
> -
> -		core_idle_state = kmalloc_node(sizeof(u32), GFP_KERNEL, node);
> -		*core_idle_state = (1 << threads_per_core) - 1;
> -		paca_ptr_array_size = (threads_per_core *
> -				       sizeof(struct paca_struct *));
> -
> -		for (j = 0; j < threads_per_core; j++) {
> -			int cpu = first_cpu + j;
> -
> -			paca_ptrs[cpu]->core_idle_state_ptr = core_idle_state;
> -			paca_ptrs[cpu]->thread_idle_state = PNV_THREAD_RUNNING;
> -			paca_ptrs[cpu]->thread_mask = 1 << j;
> -		}
> -	}
> -
> -	update_subcore_sibling_mask();
> -
> -	if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) {
> -		int rc = pnv_save_sprs_for_deep_states();
> -
> -		if (likely(!rc))
> -			return;
> -
> -		/*
> -		 * The stop-api is unable to restore hypervisor
> -		 * resources on wakeup from platform idle states which
> -		 * lose full context. So disable such states.
> -		 */
> -		supported_cpuidle_states &= ~OPAL_PM_LOSE_FULL_CONTEXT;
> -		pr_warn("cpuidle-powernv: Disabling idle states that lose full context\n");
> -		pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug affected\n");
> -
> -		if (cpu_has_feature(CPU_FTR_ARCH_300) &&
> -		    (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) {
> -			/*
> -			 * Use the default stop state for CPU-Hotplug
> -			 * if available.
> -			 */
> -			if (default_stop_found) {
> -				pnv_deepest_stop_psscr_val =
> -					pnv_default_stop_val;
> -				pnv_deepest_stop_psscr_mask =
> -					pnv_default_stop_mask;
> -				pr_warn("cpuidle-powernv: Offlined CPUs will stop with psscr = 0x%016llx\n",
> -					pnv_deepest_stop_psscr_val);
> -			} else { /* Fallback to snooze loop for CPU-Hotplug */
> -				deepest_stop_found = false;
> -				pr_warn("cpuidle-powernv: Offlined CPUs will busy wait\n");
> -			}
> -		}
> -	}
> -}
> -
>  u32 pnv_get_supported_cpuidle_states(void)
>  {
>  	return supported_cpuidle_states;
> @@ -238,6 +158,9 @@ static void pnv_fastsleep_workaround_apply(void *info)
>  		*err = 1;
>  }
> 
> +static bool power7_fastsleep_workaround_entry = true;
> +static bool power7_fastsleep_workaround_exit = true;
> +
>  /*
>   * Used to store fastsleep workaround state
>   * 0 - Workaround applied/undone at fastsleep entry/exit path (Default)
> @@ -269,21 +192,15 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev,
>  	 * fastsleep_workaround_applyonce = 1 implies
>  	 * fastsleep workaround needs to be left in 'applied' state on all
>  	 * the cores. Do this by-
> -	 * 1. Patching out the call to 'undo' workaround in fastsleep exit path
> -	 * 2. Sending ipi to all the cores which have at least one online thread
> -	 * 3. Patching out the call to 'apply' workaround in fastsleep entry
> -	 * path
> +	 * 1. Disable the 'undo' workaround in fastsleep exit path
> +	 * 2. Send IPIs to all the cores which have at least one online thread
> +	 * 3. Disable the 'apply' workaround in fastsleep entry path
> +	 *
>  	 * There is no need to send ipi to cores which have all threads
>  	 * offlined, as last thread of the core entering fastsleep or deeper
>  	 * state would have applied workaround.
>  	 */
> -	err = patch_instruction(
> -		(unsigned int *)pnv_fastsleep_workaround_at_exit,
> -		PPC_INST_NOP);
> -	if (err) {
> -		pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_exit");
> -		goto fail;
> -	}
> +	power7_fastsleep_workaround_exit = false;
> 
>  	get_online_cpus();
>  	primary_thread_mask = cpu_online_cores_map();
> @@ -296,13 +213,7 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev,
>  		goto fail;
>  	}
> 
> -	err = patch_instruction(
> -		(unsigned int *)pnv_fastsleep_workaround_at_entry,
> -		PPC_INST_NOP);
> -	if (err) {
> -		pr_err("fastsleep_workaround_applyonce change failed while patching pnv_fastsleep_workaround_at_entry");
> -		goto fail;
> -	}
> +	power7_fastsleep_workaround_entry = false;
> 
>  	fastsleep_workaround_applyonce = 1;
> 
> @@ -315,6 +226,301 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600,
>  			show_fastsleep_workaround_applyonce,
>  			store_fastsleep_workaround_applyonce);
> 
> +static inline void atomic_start_thread_idle(void)
> +{
> +	int cpu = raw_smp_processor_id();
> +	int first = cpu_first_thread_sibling(cpu);
> +	int thread_nr = cpu_thread_in_core(cpu);
> +	unsigned long *state = &paca_ptrs[first]->idle_state;
> +
> +	clear_bit(thread_nr, state);
> +}
> +
> +static inline void atomic_stop_thread_idle(void)
> +{
> +	int cpu = raw_smp_processor_id();
> +	int first = cpu_first_thread_sibling(cpu);
> +	int thread_nr = cpu_thread_in_core(cpu);
> +	unsigned long *state = &paca_ptrs[first]->idle_state;
> +
> +	set_bit(thread_nr, state);
> +}
> +
> +static inline void atomic_lock_thread_idle(void)
> +{
> +	int cpu = raw_smp_processor_id();
> +	int first = cpu_first_thread_sibling(cpu);
> +	unsigned long *state = &paca_ptrs[first]->idle_state;
> +
> +	while (unlikely(test_and_set_bit_lock(NR_PNV_CORE_IDLE_LOCK_BIT, state)))
> +		barrier();
> +}
> +
> +static inline void atomic_unlock_and_stop_thread_idle(void)
> +{
> +	int cpu = raw_smp_processor_id();
> +	int first = cpu_first_thread_sibling(cpu);
> +	unsigned long thread = 1UL << cpu_thread_in_core(cpu);
> +	unsigned long *state = &paca_ptrs[first]->idle_state;
> +	u64 s = READ_ONCE(*state);
> +	u64 new, tmp;
> +
> +	BUG_ON(!(s & PNV_CORE_IDLE_LOCK_BIT));
> +	BUG_ON(s & thread);
> +
> +again:
> +	new = (s | thread) & ~PNV_CORE_IDLE_LOCK_BIT;
> +	tmp = cmpxchg(state, s, new);
> +	if (unlikely(tmp != s)) {
> +		s = tmp;
> +		goto again;
> +	}
> +}
> +
> +static inline void atomic_unlock_thread_idle(void)
> +{
> +	int cpu = raw_smp_processor_id();
> +	int first = cpu_first_thread_sibling(cpu);
> +	unsigned long *state = &paca_ptrs[first]->idle_state;
> +
> +	BUG_ON(!test_bit(NR_PNV_CORE_IDLE_LOCK_BIT, state));
> +	clear_bit_unlock(NR_PNV_CORE_IDLE_LOCK_BIT, state);
> +}
> +
> +/* P7 and P8 */
> +struct p7_sprs {
> +	/* per core */
> +	u64 tscr;
> +	u64 worc;
> +
> +	/* per subcore */
> +	u64 sdr1;
> +	u64 rpr;
> +	u64 amor;
> +
> +	/* per thread */
> +	u64 lpcr;
> +	u64 hfscr;
> +	u64 fscr;
> +	u64 purr;
> +	u64 spurr;
> +	u64 dscr;
> +	u64 wort;
> +};
> +
> +static unsigned long power7_idle_insn(unsigned long type)
> +{
> +	int cpu = raw_smp_processor_id();
> +	int first = cpu_first_thread_sibling(cpu);
> +	unsigned long *state = &paca_ptrs[first]->idle_state;
> +	unsigned long thread = 1UL << cpu_thread_in_core(cpu);
> +	unsigned long core_thread_mask = (1UL << threads_per_core) - 1;
> +	unsigned long srr1;
> +	bool full_winkle;
> +	struct p7_sprs sprs;
> +	bool sprs_saved = false;
> +	int rc;
> +
> +	memset(&sprs, 0, sizeof(sprs));
> +
> +	if (unlikely(type != PNV_THREAD_NAP)) {
> +		atomic_lock_thread_idle();
> +
> +		BUG_ON(!(*state & thread));
> +		*state &= ~thread;
> +
> +		if (power7_fastsleep_workaround_entry) {
> +			if ((*state & core_thread_mask) == 0) {
> +				rc = opal_config_cpu_idle_state(
> +						OPAL_CONFIG_IDLE_FASTSLEEP,
> +						OPAL_CONFIG_IDLE_APPLY);
> +				BUG_ON(rc);
> +			}
> +		}
> +
> +		if (type == PNV_THREAD_WINKLE) {
> +			sprs.tscr	= mfspr(SPRN_TSCR);
> +			sprs.worc	= mfspr(SPRN_WORC);
> +
> +			sprs.sdr1	= mfspr(SPRN_SDR1);
> +			sprs.rpr	= mfspr(SPRN_RPR);
> +			sprs.amor	= mfspr(SPRN_AMOR);
> +
> +			sprs.lpcr	= mfspr(SPRN_LPCR);
> +			if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
> +				sprs.hfscr	= mfspr(SPRN_HFSCR);
> +				sprs.fscr	= mfspr(SPRN_FSCR);
> +			}
> +			sprs.purr	= mfspr(SPRN_PURR);
> +			sprs.spurr	= mfspr(SPRN_SPURR);
> +			sprs.dscr	= mfspr(SPRN_DSCR);
> +			sprs.wort	= mfspr(SPRN_WORT);
> +
> +			sprs_saved = true;
> +
> +			/*
> +			 * Increment winkle counter and set all winkle bits if
> +			 * all threads are winkling. This allows wakeup side to
> +			 * distinguish between fast sleep and winkle state
> +			 * loss. Fast sleep still has to resync the timebase so
> +			 * this may not be a really big win.
> +			 */
> +			*state += 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT;
> +			if ((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS)
> +					>> PNV_CORE_IDLE_WINKLE_COUNT_SHIFT
> +					== threads_per_core)
> +				*state |= PNV_CORE_IDLE_THREAD_WINKLE_BITS;
> +			WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0);
> +		}
> +
> +		atomic_unlock_thread_idle();
> +	}
> +
> +	local_paca->thread_idle_state = type;
> +	srr1 = isa206_idle_insn_mayloss(type);		/* go idle */
> +	local_paca->thread_idle_state = PNV_THREAD_RUNNING;
> +
> +	WARN_ON_ONCE(!srr1);
> +	WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR));
> +
> +	if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI))
> +		hmi_exception_realmode(NULL);
> +
> +	if (likely((srr1 & SRR1_WAKESTATE) != SRR1_WS_HVLOSS)) {
> +		if (unlikely(type != PNV_THREAD_NAP)) {
> +			atomic_lock_thread_idle();
> +			if (type == PNV_THREAD_WINKLE) {
> +				WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0);
> +				*state -= 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT;
> +				*state &= ~(thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT);
> +			}
> +			atomic_unlock_and_stop_thread_idle();
> +		}
> +		return srr1;
> +	}
> +
> +	/* HV state loss */
> +	BUG_ON(type == PNV_THREAD_NAP);
> +
> +	atomic_lock_thread_idle();
> +
> +	full_winkle = false;
> +	if (type == PNV_THREAD_WINKLE) {
> +		WARN_ON((*state & PNV_CORE_IDLE_WINKLE_COUNT_BITS) == 0);
> +		*state -= 1 << PNV_CORE_IDLE_WINKLE_COUNT_SHIFT;
> +		if (*state & (thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT)) {
> +			*state &= ~(thread << PNV_CORE_IDLE_THREAD_WINKLE_BITS_SHIFT);
> +			full_winkle = true;
> +			BUG_ON(!sprs_saved);
> +		}
> +	}
> +
> +	WARN_ON(*state & thread);
> +
> +	if ((*state & core_thread_mask) != 0)
> +		goto core_woken;
> +
> +	/* Per-core SPRs */
> +	if (full_winkle) {
> +		mtspr(SPRN_TSCR,	sprs.tscr);
> +		mtspr(SPRN_WORC,	sprs.worc);
> +	}
> +
> +	if (power7_fastsleep_workaround_exit) {
> +		rc = opal_config_cpu_idle_state(OPAL_CONFIG_IDLE_FASTSLEEP,
> +						OPAL_CONFIG_IDLE_UNDO);
> +		BUG_ON(rc);
> +	}
> +
> +	/* TB */
> +	if (opal_resync_timebase() != OPAL_SUCCESS)
> +		BUG();
> +
> +core_woken:
> +	if (!full_winkle)
> +		goto subcore_woken;
> +
> +	if ((*state & local_paca->subcore_sibling_mask) != 0)
> +		goto subcore_woken;
> +
> +	/* Per-subcore SPRs */
> +	mtspr(SPRN_SDR1,	sprs.sdr1);
> +	mtspr(SPRN_RPR,		sprs.rpr);
> +	mtspr(SPRN_AMOR,	sprs.amor);
> +
> +subcore_woken:
> +	/*
> +	 * isync after restoring shared SPRs and before unlocking. Unlock
> +	 * only contains hwsync which does not necessarily do the right
> +	 * thing for SPRs.
> +	 */
> +	isync();
> +	atomic_unlock_and_stop_thread_idle();
> +
> +	/* Fast sleep does not lose SPRs */
> +	if (!full_winkle)
> +		return srr1;
> +
> +	/* Per-thread SPRs */
> +	mtspr(SPRN_LPCR,	sprs.lpcr);
> +	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
> +		mtspr(SPRN_HFSCR,	sprs.hfscr);
> +		mtspr(SPRN_FSCR,	sprs.fscr);
> +	}
> +	mtspr(SPRN_PURR,	sprs.purr);
> +	mtspr(SPRN_SPURR,	sprs.spurr);
> +	mtspr(SPRN_DSCR,	sprs.dscr);
> +	mtspr(SPRN_WORT,	sprs.wort);
> +
> +	mtspr(SPRN_SPRG3,	local_paca->sprg_vdso);
> +
> +	/*
> +	 * The SLB has to be restored here, but it sometimes still
> +	 * contains entries, so the __ variant must be used to prevent
> +	 * multi hits.
> +	 */
> +	__slb_restore_bolted_realmode();
> +
> +	return srr1;
> +}
> +
> +#ifdef CONFIG_HOTPLUG_CPU
> +static unsigned long power7_offline(void)
> +{
> +	unsigned long srr1;
> +
> +	mtmsr(MSR_IDLE);
> +
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +	/* Tell KVM we're entering idle. */
> +	/******************************************************/
> +	/*  N O T E   W E L L    ! ! !    N O T E   W E L L   */
> +	/* The following store to HSTATE_HWTHREAD_STATE(r13)  */
> +	/* MUST occur in real mode, i.e. with the MMU off,    */
> +	/* and the MMU must stay off until we clear this flag */
> +	/* and test HSTATE_HWTHREAD_REQ(r13) in               */
> +	/* pnv_powersave_wakeup in this file.                 */
> +	/* The reason is that another thread can switch the   */
> +	/* MMU to a guest context whenever this flag is set   */
> +	/* to KVM_HWTHREAD_IN_IDLE, and if the MMU was on,    */
> +	/* that would potentially cause this thread to start  */
> +	/* executing instructions from guest memory in        */
> +	/* hypervisor mode, leading to a host crash or data   */
> +	/* corruption, or worse.                              */
> +	/******************************************************/
> +	local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE;
> +#endif
> +
> +	__ppc64_runlatch_off();
> +	srr1 = power7_idle_insn(power7_offline_type);
> +	__ppc64_runlatch_on();
> +
> +	mtmsr(MSR_KERNEL);
> +
> +	return srr1;
> +}
> +#endif
> +
>  static unsigned long __power7_idle_type(unsigned long type)
>  {
>  	unsigned long srr1;
> @@ -322,9 +528,11 @@ static unsigned long __power7_idle_type(unsigned long type)
>  	if (!prep_irq_for_idle_irqsoff())
>  		return 0;
> 
> +	mtmsr(MSR_IDLE);
>  	__ppc64_runlatch_off();
>  	srr1 = power7_idle_insn(type);
>  	__ppc64_runlatch_on();
> +	mtmsr(MSR_KERNEL);
> 
>  	fini_irq_for_idle_irqsoff();
> 
> @@ -347,6 +555,256 @@ void power7_idle(void)
>  	power7_idle_type(PNV_THREAD_NAP);
>  }
> 
> +struct p9_sprs {
> +	/* per core */
> +	u64 ptcr;
> +	u64 rpr;
> +	u64 tscr;
> +	u64 ldbar;
> +	u64 amor;
> +
> +	/* per thread */
> +	u64 lpcr;
> +	u64 hfscr;
> +	u64 fscr;
> +	u64 pid;
> +	u64 purr;
> +	u64 spurr;
> +	u64 dscr;
> +	u64 wort;
> +
> +	u64 mmcra;
> +	u32 mmcr0;
> +	u32 mmcr1;
> +	u64 mmcr2;
> +};
> +
> +static unsigned long power9_idle_stop(unsigned long psscr, bool mmu_on)
> +{
> +	int cpu = raw_smp_processor_id();
> +	int first = cpu_first_thread_sibling(cpu);
> +	unsigned long *state = &paca_ptrs[first]->idle_state;
> +	unsigned long core_thread_mask = (1UL << threads_per_core) - 1;
> +	unsigned long srr1;
> +	unsigned long pls;
> +	unsigned long mmcr0 = 0;
> +	struct p9_sprs sprs;
> +	bool sprs_saved = false;
> +
> +	/* This should not be required but GCC warns about used uninitialized */
> +	memset(&sprs, 0, sizeof(sprs));
> +
> +	if (!(psscr & (PSSCR_EC|PSSCR_ESL))) {
> +		/* EC=ESL=0 case */
> +
> +		BUG_ON(!mmu_on);
> +
> +		/*
> +		 * Wake synchronously. SRESET via xscom may still cause
> +		 * a 0x100 powersave wakeup with SRR1 reason!
> +		 */
> +		srr1 = isa300_idle_stop_noloss(psscr);		/* go idle */
> +		if (likely(!srr1))
> +			return 0;
> +
> +		/*
> +		 * Registers not saved, can't recover!
> +		 * This would be a hardware bug
> +		 */
> +		BUG_ON((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS);
> +
> +		goto out;
> +	}
> +
> +	/* EC=ESL=1 case */
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +	if (cpu_has_feature(CPU_FTR_P9_TM_XER_SO_BUG)) {
> +		local_paca->requested_psscr = psscr;
> +		/* order setting requested_psscr vs testing dont_stop */
> +		smp_mb();
> +		if (atomic_read(&local_paca->dont_stop)) {
> +			local_paca->requested_psscr = 0;
> +			return 0;
> +		}
> +	}
> +#endif
> +
> +	if (!cpu_has_feature(CPU_FTR_POWER9_DD2_1)) {
> +		 /*
> +		  * POWER9 DD2 can incorrectly set PMAO when waking up
> +		  * after a state-loss idle. Saving and restoring MMCR0
> +		  * over idle is a workaround.
> +		  */
> +		mmcr0		= mfspr(SPRN_MMCR0);
> +	}
> +	if ((psscr & PSSCR_RL_MASK) >= pnv_first_spr_loss_level) {
> +		sprs.lpcr	= mfspr(SPRN_LPCR);
> +		sprs.hfscr	= mfspr(SPRN_HFSCR);
> +		sprs.fscr	= mfspr(SPRN_FSCR);
> +		sprs.pid	= mfspr(SPRN_PID);
> +		sprs.purr	= mfspr(SPRN_PURR);
> +		sprs.spurr	= mfspr(SPRN_SPURR);
> +		sprs.dscr	= mfspr(SPRN_DSCR);
> +		sprs.wort	= mfspr(SPRN_WORT);
> +
> +		sprs.mmcra	= mfspr(SPRN_MMCRA);
> +		sprs.mmcr0	= mfspr(SPRN_MMCR0);
> +		sprs.mmcr1	= mfspr(SPRN_MMCR1);
> +		sprs.mmcr2	= mfspr(SPRN_MMCR2);
> +
> +		sprs.ptcr	= mfspr(SPRN_PTCR);
> +		sprs.rpr	= mfspr(SPRN_RPR);
> +		sprs.tscr	= mfspr(SPRN_TSCR);
> +		sprs.ldbar	= mfspr(SPRN_LDBAR);
> +		sprs.amor	= mfspr(SPRN_AMOR);
> +
> +		sprs_saved = true;
> +
> +		atomic_start_thread_idle();
> +	}
> +
> +	srr1 = isa300_idle_stop_mayloss(psscr);		/* go idle */
> +
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +	local_paca->requested_psscr = 0;
> +#endif
> +
> +	psscr = mfspr(SPRN_PSSCR);
> +
> +	WARN_ON_ONCE(!srr1);
> +	WARN_ON_ONCE(mfmsr() & (MSR_IR|MSR_DR));
> +
> +	/* Workarounds for SMT thread switch problems */
> +	if ((srr1 & SRR1_WAKESTATE) != SRR1_WS_NOLOSS) {
> +		unsigned long mmcra;
> +
> +		/*
> +		 * Workaround for POWER9 DD2.0, if we lost resources, the ERAT
> +		 * might have been corrupted and needs flushing. We also need
> +		 * to reload MMCR0 (see mmcr0 comment above).
> +		 */
> +		if (!cpu_has_feature(CPU_FTR_POWER9_DD2_1)) {
> +			asm volatile(PPC_INVALIDATE_ERAT);
> +			mtspr(SPRN_MMCR0, mmcr0);
> +		}
> +
> +		/*
> +		 * DD2.2 and earlier need to set then clear bit 60 in MMCRA
> +		 * to ensure the PMU starts running.
> +		 */
> +		mmcra = mfspr(SPRN_MMCRA);
> +		mmcra |= PPC_BIT(60);
> +		mtspr(SPRN_MMCRA, mmcra);
> +		mmcra &= ~PPC_BIT(60);
> +		mtspr(SPRN_MMCRA, mmcra);
> +	}
> +
> +	if (unlikely((srr1 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI))
> +		hmi_exception_realmode(NULL);
> +
> +	/*
> +	 * On POWER9, SRR1 bits do not match exactly as expected.
> +	 * SRR1_WS_GPRLOSS (10b) can also result in SPR loss, so
> +	 * just always test PSSCR for SPR/TB state loss.
> +	 */
> +	pls = (psscr & PSSCR_PLS) >> PSSCR_PLS_SHIFT;
> +	if (likely(pls < pnv_first_spr_loss_level)) {
> +		if (sprs_saved)
> +			atomic_stop_thread_idle();
> +		goto out;
> +	}
> +
> +	/* HV state loss */
> +	BUG_ON(!sprs_saved);
> +
> +	atomic_lock_thread_idle();
> +
> +	if ((*state & core_thread_mask) != 0)
> +		goto core_woken;
> +
> +	/* Per-core SPRs */
> +	mtspr(SPRN_PTCR,	sprs.ptcr);
> +	mtspr(SPRN_RPR,		sprs.rpr);
> +	mtspr(SPRN_TSCR,	sprs.tscr);
> +	mtspr(SPRN_LDBAR,	sprs.ldbar);
> +	mtspr(SPRN_AMOR,	sprs.amor);
> +
> +	if (pls >= pnv_first_tb_loss_level) {
> +		/* TB loss */
> +		if (opal_resync_timebase() != OPAL_SUCCESS)
> +			BUG();
> +	}
> +
> +	/*
> +	 * isync after restoring shared SPRs and before unlocking. Unlock
> +	 * only contains hwsync which does not necessarily do the right
> +	 * thing for SPRs.
> +	 */
> +	isync();
> +
> +core_woken:
> +	atomic_unlock_and_stop_thread_idle();
> +
> +	/* Per-thread SPRs */
> +	mtspr(SPRN_LPCR,	sprs.lpcr);
> +	mtspr(SPRN_HFSCR,	sprs.hfscr);
> +	mtspr(SPRN_FSCR,	sprs.fscr);
> +	mtspr(SPRN_PID,		sprs.pid);
> +	mtspr(SPRN_PURR,	sprs.purr);
> +	mtspr(SPRN_SPURR,	sprs.spurr);
> +	mtspr(SPRN_DSCR,	sprs.dscr);
> +	mtspr(SPRN_WORT,	sprs.wort);
> +
> +	mtspr(SPRN_MMCRA,	sprs.mmcra);
> +	mtspr(SPRN_MMCR0,	sprs.mmcr0);
> +	mtspr(SPRN_MMCR1,	sprs.mmcr1);
> +	mtspr(SPRN_MMCR2,	sprs.mmcr2);
> +
> +	mtspr(SPRN_SPRG3,	local_paca->sprg_vdso);
> +
> +	if (!radix_enabled())
> +		__slb_restore_bolted_realmode();
> +
> +out:
> +	if (mmu_on)
> +		mtmsr(MSR_KERNEL);
> +
> +	return srr1;
> +}
> +
> +#ifdef CONFIG_HOTPLUG_CPU
> +static unsigned long power9_offline_stop(unsigned long psscr)
> +{
> +	unsigned long srr1;
> +
> +#ifndef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +	__ppc64_runlatch_off();
> +	srr1 = power9_idle_stop(psscr, true);
> +	__ppc64_runlatch_on();
> +#else
> +	/*
> +	 * Tell KVM we're entering idle.
> +	 * This does not have to be done in real mode because the P9 MMU
> +	 * is independent per-thread. Some steppings share radix/hash mode
> +	 * between threads, but in that case KVM has a barrier sync in real
> +	 * mode before and after switching between radix and hash.
> +	 *
> +	 * kvm_start_guest must still be called in real mode though, hence
> +	 * the false argument.
> +	 */
> +	local_paca->kvm_hstate.hwthread_state = KVM_HWTHREAD_IN_IDLE;
> +
> +	__ppc64_runlatch_off();
> +	srr1 = power9_idle_stop(psscr, false);
> +	__ppc64_runlatch_on();
> +
> +	mtmsr(MSR_KERNEL);
> +#endif
> +
> +	return srr1;
> +}
> +#endif
> +
>  static unsigned long __power9_idle_type(unsigned long stop_psscr_val,
>  				      unsigned long stop_psscr_mask)
>  {
> @@ -360,7 +818,7 @@ static unsigned long __power9_idle_type(unsigned long stop_psscr_val,
>  	psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val;
> 
>  	__ppc64_runlatch_off();
> -	srr1 = power9_idle_stop(psscr);
> +	srr1 = power9_idle_stop(psscr, true);
>  	__ppc64_runlatch_on();
> 
>  	fini_irq_for_idle_irqsoff();
> @@ -409,7 +867,7 @@ void pnv_power9_force_smt4_catch(void)
>  			atomic_inc(&paca_ptrs[cpu0+thr]->dont_stop);
>  	}
>  	/* order setting dont_stop vs testing requested_psscr */
> -	mb();
> +	smp_mb();
>  	for (thr = 0; thr < threads_per_core; ++thr) {
>  		if (!paca_ptrs[cpu0+thr]->requested_psscr)
>  			++awake_threads;
> @@ -481,7 +939,6 @@ void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val)
>  unsigned long pnv_cpu_offline(unsigned int cpu)
>  {
>  	unsigned long srr1;
> -	u32 idle_states = pnv_get_supported_cpuidle_states();
> 
>  	__ppc64_runlatch_off();
> 
> @@ -492,15 +949,8 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
>  		psscr = (psscr & ~pnv_deepest_stop_psscr_mask) |
>  						pnv_deepest_stop_psscr_val;
>  		srr1 = power9_offline_stop(psscr);
> -
> -	} else if ((idle_states & OPAL_PM_WINKLE_ENABLED) &&
> -		   (idle_states & OPAL_PM_LOSE_FULL_CONTEXT)) {
> -		srr1 = power7_idle_insn(PNV_THREAD_WINKLE);
> -	} else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
> -		   (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
> -		srr1 = power7_idle_insn(PNV_THREAD_SLEEP);
> -	} else if (idle_states & OPAL_PM_NAP_ENABLED) {
> -		srr1 = power7_idle_insn(PNV_THREAD_NAP);
> +	} else if (cpu_has_feature(CPU_FTR_ARCH_206) && power7_offline_type) {
> +		srr1 = power7_offline();
>  	} else {
>  		/* This is the fallback method. We emulate snooze */
>  		while (!generic_check_cpu_restart(cpu)) {
> @@ -596,33 +1046,44 @@ int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags)
>   * @dt_idle_states: Number of idle state entries
>   * Returns 0 on success
>   */
> -static int __init pnv_power9_idle_init(void)
> +static void __init pnv_power9_idle_init(void)
>  {
>  	u64 max_residency_ns = 0;
>  	int i;
> 
>  	/*
> -	 * Set pnv_first_deep_stop_state, pnv_deepest_stop_psscr_{val,mask},
> -	 * and the pnv_default_stop_{val,mask}.
> -	 *
> -	 * pnv_first_deep_stop_state should be set to the first stop
> -	 * level to cause hypervisor state loss.
> -	 *
>  	 * pnv_deepest_stop_{val,mask} should be set to values corresponding to
>  	 * the deepest stop state.
>  	 *
>  	 * pnv_default_stop_{val,mask} should be set to values corresponding to
> -	 * the shallowest (OPAL_PM_STOP_INST_FAST) loss-less stop state.
> +	 * the deepest loss-less (OPAL_PM_STOP_INST_FAST) stop state.
>  	 */
> -	pnv_first_deep_stop_state = MAX_STOP_STATE;
> +	pnv_first_tb_loss_level = MAX_STOP_STATE + 1;
> +	pnv_first_spr_loss_level = MAX_STOP_STATE + 1;
>  	for (i = 0; i < nr_pnv_idle_states; i++) {
>  		int err;
>  		struct pnv_idle_states_t *state = &pnv_idle_states[i];
>  		u64 psscr_rl = state->psscr_val & PSSCR_RL_MASK;
> 
> +		if ((state->flags & OPAL_PM_TIMEBASE_STOP) &&
> +		     (pnv_first_tb_loss_level > psscr_rl))
> +			pnv_first_tb_loss_level = psscr_rl;
> +
>  		if ((state->flags & OPAL_PM_LOSE_FULL_CONTEXT) &&
> -		    pnv_first_deep_stop_state > psscr_rl)
> -			pnv_first_deep_stop_state = psscr_rl;
> +		     (pnv_first_spr_loss_level > psscr_rl))
> +			pnv_first_spr_loss_level = psscr_rl;
> +
> +		/*
> +		 * The idle code does not deal with TB loss occurring
> +		 * in a shallower state than SPR loss, so force it to
> +		 * behave like SPRs are lost if TB is lost. POWER9 would
> +		 * never encounter this, but a POWER8 core would if it
> +		 * implemented the stop instruction. So this is for forward
> +		 * compatibility.
> +		 */
> +		if ((state->flags & OPAL_PM_TIMEBASE_STOP) &&
> +		     (pnv_first_spr_loss_level > psscr_rl))
> +			pnv_first_spr_loss_level = psscr_rl;
> 
>  		err = validate_psscr_val_mask(&state->psscr_val,
>  					      &state->psscr_mask,
> @@ -647,6 +1108,7 @@ static int __init pnv_power9_idle_init(void)
>  			pnv_default_stop_val = state->psscr_val;
>  			pnv_default_stop_mask = state->psscr_mask;
>  			default_stop_found = true;
> +			WARN_ON(state->flags & OPAL_PM_LOSE_FULL_CONTEXT);
>  		}
>  	}
> 
> @@ -666,10 +1128,40 @@ static int __init pnv_power9_idle_init(void)
>  			pnv_deepest_stop_psscr_mask);
>  	}
> 
> -	pr_info("cpuidle-powernv: Requested Level (RL) value of first deep stop = 0x%llx\n",
> -		pnv_first_deep_stop_state);
> +	pr_info("cpuidle-powernv: First stop level that may lose SPRs = 0x%llx\n",
> +		pnv_first_spr_loss_level);
> 
> -	return 0;
> +	pr_info("cpuidle-powernv: First stop level that may lose timebase = 0x%llx\n",
> +		pnv_first_tb_loss_level);
> +}
> +
> +static void __init pnv_disable_deep_states(void)
> +{
> +	/*
> +	 * The stop-api is unable to restore hypervisor
> +	 * resources on wakeup from platform idle states which
> +	 * lose full context. So disable such states.
> +	 */
> +	supported_cpuidle_states &= ~OPAL_PM_LOSE_FULL_CONTEXT;
> +	pr_warn("cpuidle-powernv: Disabling idle states that lose full context\n");
> +	pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug affected\n");
> +
> +	if (cpu_has_feature(CPU_FTR_ARCH_300) &&
> +	    (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) {
> +		/*
> +		 * Use the default stop state for CPU-Hotplug
> +		 * if available.
> +		 */
> +		if (default_stop_found) {
> +			pnv_deepest_stop_psscr_val = pnv_default_stop_val;
> +			pnv_deepest_stop_psscr_mask = pnv_default_stop_mask;
> +			pr_warn("cpuidle-powernv: Offlined CPUs will stop with psscr = 0x%016llx\n",
> +				pnv_deepest_stop_psscr_val);
> +		} else { /* Fallback to snooze loop for CPU-Hotplug */
> +			deepest_stop_found = false;
> +			pr_warn("cpuidle-powernv: Offlined CPUs will busy wait\n");
> +		}
> +	}
>  }
> 
>  /*
> @@ -684,10 +1176,8 @@ static void __init pnv_probe_idle_states(void)
>  		return;
>  	}
> 
> -	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
> -		if (pnv_power9_idle_init())
> -			return;
> -	}
> +	if (cpu_has_feature(CPU_FTR_ARCH_300))
> +		pnv_power9_idle_init();
> 
>  	for (i = 0; i < nr_pnv_idle_states; i++)
>  		supported_cpuidle_states |= pnv_idle_states[i].flags;
> @@ -807,11 +1297,33 @@ static int pnv_parse_cpuidle_dt(void)
> 
>  static int __init pnv_init_idle_states(void)
>  {
> +	int cpu;
>  	int rc = 0;
> -	supported_cpuidle_states = 0;
> +
> +	/* Set up PACA fields */
> +	for_each_present_cpu(cpu) {
> +		struct paca_struct *p = paca_ptrs[cpu];
> +
> +		p->idle_state = 0;
> +		if (cpu == cpu_first_thread_sibling(cpu))
> +			p->idle_state = (1 << threads_per_core) - 1;
> +
> +		if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
> +			/* P7/P8 nap */
> +			p->thread_idle_state = PNV_THREAD_RUNNING;
> +		} else {
> +			/* P9 stop */
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +			p->requested_psscr = 0;
> +			atomic_set(&p->dont_stop, 0);
> +#endif
> +		}
> +	}
> 
>  	/* In case we error out nr_pnv_idle_states will be zero */
>  	nr_pnv_idle_states = 0;
> +	supported_cpuidle_states = 0;
> +
>  	if (cpuidle_disable != IDLE_NO_OVERRIDE)
>  		goto out;
>  	rc = pnv_parse_cpuidle_dt();
> @@ -819,27 +1331,40 @@ static int __init pnv_init_idle_states(void)
>  		return rc;
>  	pnv_probe_idle_states();
> 
> -	if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
> -		patch_instruction(
> -			(unsigned int *)pnv_fastsleep_workaround_at_entry,
> -			PPC_INST_NOP);
> -		patch_instruction(
> -			(unsigned int *)pnv_fastsleep_workaround_at_exit,
> -			PPC_INST_NOP);
> -	} else {
> -		/*
> -		 * OPAL_PM_SLEEP_ENABLED_ER1 is set. It indicates that
> -		 * workaround is needed to use fastsleep. Provide sysfs
> -		 * control to choose how this workaround has to be applied.
> -		 */
> -		device_create_file(cpu_subsys.dev_root,
> +	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
> +		if (!(supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
> +			power7_fastsleep_workaround_entry = false;
> +			power7_fastsleep_workaround_exit = false;
> +		} else {
> +			/*
> +			 * OPAL_PM_SLEEP_ENABLED_ER1 is set. It indicates that
> +			 * workaround is needed to use fastsleep. Provide sysfs
> +			 * control to choose how this workaround has to be
> +			 * applied.
> +			 */
> +			device_create_file(cpu_subsys.dev_root,
>  				&dev_attr_fastsleep_workaround_applyonce);
> -	}
> +		}
> +
> +		update_subcore_sibling_mask();
> 
> -	pnv_alloc_idle_core_states();
> +		if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED) {
> +			ppc_md.power_save = power7_idle;
> +			power7_offline_type = PNV_THREAD_NAP;
> +		}
> 
> -	if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED)
> -		ppc_md.power_save = power7_idle;
> +		if ((supported_cpuidle_states & OPAL_PM_WINKLE_ENABLED) &&
> +			   (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT))
> +			power7_offline_type = PNV_THREAD_WINKLE;
> +		else if ((supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED) ||
> +			   (supported_cpuidle_states & OPAL_PM_SLEEP_ENABLED_ER1))
> +			power7_offline_type = PNV_THREAD_SLEEP;
> +	}
> +
> +	if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) {
> +		if (pnv_save_sprs_for_deep_states())
> +			pnv_disable_deep_states();
> +	}
> 
>  out:
>  	return 0;
> diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c
> index 45563004feda..1d7a9fd30dd1 100644
> --- a/arch/powerpc/platforms/powernv/subcore.c
> +++ b/arch/powerpc/platforms/powernv/subcore.c
> @@ -183,7 +183,7 @@ static void unsplit_core(void)
>  	cpu = smp_processor_id();
>  	if (cpu_thread_in_core(cpu) != 0) {
>  		while (mfspr(SPRN_HID0) & mask)
> -			power7_idle_insn(PNV_THREAD_NAP);
> +			power7_idle_type(PNV_THREAD_NAP);
> 
>  		per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT;
>  		return;
> diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
> index a0f44f992360..77197110e900 100644
> --- a/arch/powerpc/xmon/xmon.c
> +++ b/arch/powerpc/xmon/xmon.c
> @@ -2431,7 +2431,9 @@ static void dump_one_paca(int cpu)
>  	DUMP(p, irq_happened, "%#-*x");
>  	DUMP(p, io_sync, "%#-*x");
>  	DUMP(p, irq_work_pending, "%#-*x");
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>  	DUMP(p, nap_state_lost, "%#-*x");
> +#endif
>  	DUMP(p, sprg_vdso, "%#-*llx");
> 
>  #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> @@ -2439,19 +2441,16 @@ static void dump_one_paca(int cpu)
>  #endif
> 
>  #ifdef CONFIG_PPC_POWERNV
> -	DUMP(p, core_idle_state_ptr, "%-*px");
> -	DUMP(p, thread_idle_state, "%#-*x");
> -	DUMP(p, thread_mask, "%#-*x");
> -	DUMP(p, subcore_sibling_mask, "%#-*x");
> -	DUMP(p, requested_psscr, "%#-*llx");
> -	DUMP(p, stop_sprs.pid, "%#-*llx");
> -	DUMP(p, stop_sprs.ldbar, "%#-*llx");
> -	DUMP(p, stop_sprs.fscr, "%#-*llx");
> -	DUMP(p, stop_sprs.hfscr, "%#-*llx");
> -	DUMP(p, stop_sprs.mmcr1, "%#-*llx");
> -	DUMP(p, stop_sprs.mmcr2, "%#-*llx");
> -	DUMP(p, stop_sprs.mmcra, "%#-*llx");
> -	DUMP(p, dont_stop.counter, "%#-*x");
> +	DUMP(p, idle_state, "%#-*lx");
> +	if (!early_cpu_has_feature(CPU_FTR_ARCH_300)) {
> +		DUMP(p, thread_idle_state, "%#-*x");
> +		DUMP(p, subcore_sibling_mask, "%#-*x");
> +	} else {
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +		DUMP(p, requested_psscr, "%#-*llx");
> +		DUMP(p, dont_stop.counter, "%#-*x");
> +#endif
> +	}
>  #endif
> 
>  	DUMP(p, accounting.utime, "%#-*lx");
> -- 
> 2.20.1
> 
> 



More information about the Linuxppc-dev mailing list