[PATCH v4 29/46] KVM: PPC: Book3S HV P9: Implement the rest of the P9 path in C
Alexey Kardashevskiy
aik at ozlabs.ru
Thu Apr 1 16:30:22 AEDT 2021
On 3/23/21 12:02 PM, Nicholas Piggin wrote:
> Almost all logic is moved to C, by introducing a new in_guest mode that
> selects and branches very early in the interrupt handler to the P9 exit
> code.
>
> The remaining assembly is only about 160 lines of low level stack setup,
> with VCPU vs host register save and restore, plus a small shim to the
> legacy paths in the interrupt handler.
>
> There are two motivations for this. The first is just to make the code
> more maintainable by being in C. The second is to reduce the amount of code
> running in a special KVM mode, "realmode". I put that in quotes because
> with radix it is no longer necessarily real-mode in the MMU, but it
> still has to be treated specially because it may be in real-mode, and
> has various important registers like PID, DEC, TB, etc set to guest.
> This is hostile to the rest of Linux and can't use arbitrary kernel
> functionality or be instrumented well.
>
> This initial patch is a reasonably faithful conversion of the asm code.
> It does lack any loop to return quickly back into the guest without
> switching out of realmode in the case of unimportant or easily handled
> interrupts. As explained in the previous change, handling HV interrupts
> in real mode is not so important for P9.
>
> Signed-off-by: Nicholas Piggin <npiggin at gmail.com>
> ---
> arch/powerpc/include/asm/asm-prototypes.h | 3 +-
> arch/powerpc/include/asm/kvm_asm.h | 3 +-
> arch/powerpc/include/asm/kvm_book3s_64.h | 8 +
> arch/powerpc/kernel/security.c | 5 +-
> arch/powerpc/kvm/Makefile | 3 +
> arch/powerpc/kvm/book3s_64_entry.S | 246 ++++++++++++++++++++++
> arch/powerpc/kvm/book3s_hv.c | 9 +-
> arch/powerpc/kvm/book3s_hv_interrupt.c | 223 ++++++++++++++++++++
> arch/powerpc/kvm/book3s_hv_rmhandlers.S | 123 +----------
> 9 files changed, 500 insertions(+), 123 deletions(-)
> create mode 100644 arch/powerpc/kvm/book3s_hv_interrupt.c
>
> diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h
> index 939f3c94c8f3..7c74c80ed994 100644
> --- a/arch/powerpc/include/asm/asm-prototypes.h
> +++ b/arch/powerpc/include/asm/asm-prototypes.h
> @@ -122,6 +122,7 @@ extern s32 patch__call_flush_branch_caches3;
> extern s32 patch__flush_count_cache_return;
> extern s32 patch__flush_link_stack_return;
> extern s32 patch__call_kvm_flush_link_stack;
> +extern s32 patch__call_kvm_flush_link_stack_p9;
> extern s32 patch__memset_nocache, patch__memcpy_nocache;
>
> extern long flush_branch_caches;
> @@ -142,7 +143,7 @@ void kvmhv_load_host_pmu(void);
> void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use);
> void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu);
>
> -int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu);
> +void kvmppc_p9_enter_guest(struct kvm_vcpu *vcpu);
>
> long kvmppc_h_set_dabr(struct kvm_vcpu *vcpu, unsigned long dabr);
> long kvmppc_h_set_xdabr(struct kvm_vcpu *vcpu, unsigned long dabr,
> diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h
> index a3633560493b..b4f9996bd331 100644
> --- a/arch/powerpc/include/asm/kvm_asm.h
> +++ b/arch/powerpc/include/asm/kvm_asm.h
> @@ -146,7 +146,8 @@
> #define KVM_GUEST_MODE_GUEST 1
> #define KVM_GUEST_MODE_SKIP 2
> #define KVM_GUEST_MODE_GUEST_HV 3
> -#define KVM_GUEST_MODE_HOST_HV 4
> +#define KVM_GUEST_MODE_GUEST_HV_FAST 4 /* ISA v3.0 with host radix mode */
> +#define KVM_GUEST_MODE_HOST_HV 5
>
> #define KVM_INST_FETCH_FAILED -1
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
> index 9bb9bb370b53..c214bcffb441 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
> @@ -153,9 +153,17 @@ static inline bool kvmhv_vcpu_is_radix(struct kvm_vcpu *vcpu)
> return radix;
> }
>
> +int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu);
> +
> #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */
> #endif
>
> +/*
> + * Invalid HDSISR value which is used to indicate when HW has not set the reg.
> + * Used to work around an erratum.
> + */
> +#define HDSISR_CANARY 0x7fff
> +
> /*
> * We use a lock bit in HPTE dword 0 to synchronize updates and
> * accesses to each HPTE, and another bit to indicate non-present
> diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c
> index e4e1a94ccf6a..3a607c11f20f 100644
> --- a/arch/powerpc/kernel/security.c
> +++ b/arch/powerpc/kernel/security.c
> @@ -430,16 +430,19 @@ device_initcall(stf_barrier_debugfs_init);
>
> static void update_branch_cache_flush(void)
> {
> - u32 *site;
> + u32 *site, __maybe_unused *site2;
>
> #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> site = &patch__call_kvm_flush_link_stack;
> + site2 = &patch__call_kvm_flush_link_stack_p9;
> // This controls the branch from guest_exit_cont to kvm_flush_link_stack
> if (link_stack_flush_type == BRANCH_CACHE_FLUSH_NONE) {
> patch_instruction_site(site, ppc_inst(PPC_INST_NOP));
> + patch_instruction_site(site2, ppc_inst(PPC_INST_NOP));
> } else {
> // Could use HW flush, but that could also flush count cache
> patch_branch_site(site, (u64)&kvm_flush_link_stack, BRANCH_SET_LINK);
> + patch_branch_site(site2, (u64)&kvm_flush_link_stack, BRANCH_SET_LINK);
> }
> #endif
>
> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> index cdd119028f64..40a55a5ba4ff 100644
> --- a/arch/powerpc/kvm/Makefile
> +++ b/arch/powerpc/kvm/Makefile
> @@ -43,6 +43,9 @@ kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs)
> kvm-book3s_64-builtin-objs-$(CONFIG_SPAPR_TCE_IOMMU) := \
> book3s_64_vio_hv.o
>
> +kvm-book3s_64-builtin-objs-y += \
> + book3s_hv_interrupt.o
> +
> kvm-pr-y := \
> fpu.o \
> emulate.o \
> diff --git a/arch/powerpc/kvm/book3s_64_entry.S b/arch/powerpc/kvm/book3s_64_entry.S
> index de81ab69555b..845df5fefdbd 100644
> --- a/arch/powerpc/kvm/book3s_64_entry.S
> +++ b/arch/powerpc/kvm/book3s_64_entry.S
> @@ -1,11 +1,16 @@
> /* SPDX-License-Identifier: GPL-2.0-only */
> #include <asm/asm-offsets.h>
> #include <asm/cache.h>
> +#include <asm/code-patching-asm.h>
> #include <asm/exception-64s.h>
> +#include <asm/export.h>
> #include <asm/kvm_asm.h>
> #include <asm/kvm_book3s_asm.h>
> +#include <asm/mmu.h>
> #include <asm/ppc_asm.h>
> +#include <asm/ptrace.h>
> #include <asm/reg.h>
> +#include <asm/ultravisor-api.h>
>
> /*
> * These are branched to from interrupt handlers in exception-64s.S which set
> @@ -22,10 +27,15 @@
> .global kvmppc_hcall
> .balign IFETCH_ALIGN_BYTES
> kvmppc_hcall:
> + lbz r10,HSTATE_IN_GUEST(r13)
> + cmpwi r10,KVM_GUEST_MODE_GUEST_HV_FAST
> + beq kvmppc_p9_exit_hcall
> ld r10,PACA_EXGEN+EX_R13(r13)
> SET_SCRATCH0(r10)
> li r10,0xc00
> /* Now we look like kvmppc_interrupt */
> + li r11,PACA_EXGEN
> + b 1f
>
> /*
> * KVM interrupt entry occurs after GEN_INT_ENTRY runs, and follows that
> @@ -46,6 +56,12 @@ kvmppc_hcall:
> .global kvmppc_interrupt
> .balign IFETCH_ALIGN_BYTES
> kvmppc_interrupt:
> + std r10,HSTATE_SCRATCH0(r13)
> + lbz r10,HSTATE_IN_GUEST(r13)
> + cmpwi r10,KVM_GUEST_MODE_GUEST_HV_FAST
> + beq kvmppc_p9_exit_interrupt
> + ld r10,HSTATE_SCRATCH0(r13)
> + lbz r11,HSTATE_IN_GUEST(r13)
> li r11,PACA_EXGEN
> cmpdi r10,0x200
> bgt+ 1f
> @@ -140,3 +156,233 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
> GET_SCRATCH0(r13)
> HRFI_TO_KERNEL
> #endif
> +
> +/* Stack frame offsets for kvmppc_hv_entry */
> +#define SFS (144 + STACK_FRAME_MIN_SIZE)
> +#define STACK_SLOT_NVGPRS (SFS - 144) /* 18 gprs */
> +
> +/*
> + * void kvmppc_p9_enter_guest(struct kvm_vcpu *vcpu);
> + *
> + * Enter the guest on an ISAv3.0 or later system where we have exactly
> + * one vcpu per vcore, and both the host and guest are radix, and threads
> + * are set to "independent mode".
> + */
> +.balign IFETCH_ALIGN_BYTES
> +_GLOBAL(kvmppc_p9_enter_guest)
> +EXPORT_SYMBOL_GPL(kvmppc_p9_enter_guest)
> + mflr r0
> + std r0,PPC_LR_STKOFF(r1)
> + stdu r1,-SFS(r1)
> +
> + std r1,HSTATE_HOST_R1(r13)
> +
> + mfcr r4
> + stw r4,SFS+8(r1)
> +
> + reg = 14
> + .rept 18
> + std reg,STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
> + reg = reg + 1
> + .endr
> +
> + ld r4,VCPU_LR(r3)
> + mtlr r4
> + ld r4,VCPU_CTR(r3)
> + mtctr r4
> + ld r4,VCPU_XER(r3)
> + mtspr SPRN_XER,r4
> +
> + ld r1,VCPU_CR(r3)
> +
> +BEGIN_FTR_SECTION
> + ld r4,VCPU_CFAR(r3)
> + mtspr SPRN_CFAR,r4
> +END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
> +BEGIN_FTR_SECTION
> + ld r4,VCPU_PPR(r3)
> + mtspr SPRN_PPR,r4
> +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
> +
> + reg = 4
> + .rept 28
> + ld reg,__VCPU_GPR(reg)(r3)
> + reg = reg + 1
> + .endr
> +
> + ld r4,VCPU_KVM(r3)
> + lbz r4,KVM_SECURE_GUEST(r4)
> + cmpdi r4,0
> + ld r4,VCPU_GPR(R4)(r3)
> + bne .Lret_to_ultra
> +
> + mtcr r1
> +
> + ld r0,VCPU_GPR(R0)(r3)
> + ld r1,VCPU_GPR(R1)(r3)
> + ld r2,VCPU_GPR(R2)(r3)
> + ld r3,VCPU_GPR(R3)(r3)
> +
> + HRFI_TO_GUEST
> + b .
> +
> + /*
> + * Use UV_RETURN ultracall to return control back to the Ultravisor
> + * after processing a hypercall or interrupt that was forwarded
> + * (a.k.a. reflected) to the Hypervisor.
> + *
> + * All registers have already been reloaded except the ucall requires:
> + * R0 = hcall result
> + * R2 = SRR1, so UV can detect a synthesized interrupt (if any)
> + * R3 = UV_RETURN
> + */
> +.Lret_to_ultra:
> + mtcr r1
> + ld r1,VCPU_GPR(R1)(r3)
> +
> + ld r0,VCPU_GPR(R3)(r3)
> + mfspr r2,SPRN_SRR1
> + LOAD_REG_IMMEDIATE(r3, UV_RETURN)
> + sc 2
> +
> +/*
> + * kvmppc_p9_exit_hcall and kvmppc_p9_exit_interrupt are branched to from
> + * above if the interrupt was taken for a guest that was entered via
> + * kvmppc_p9_enter_guest().
> + *
> + * This code recovers the host stack and vcpu pointer, saves all GPRs and
> + * CR, LR, CTR, XER as well as guest MSR and NIA into the VCPU, then re-
> + * establishes the host stack and registers to return from the
> + * kvmppc_p9_enter_guest() function.
What does "this code" refer to? If it is the asm below, then it does not
save CTR, it is in the c code. Otherwise it is confusing (to me) :)
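For reference, the only place I can see CTR being written back on this exit
path is in the new C handler further down (quoting that hunk, not new code):

	vcpu->arch.regs.ctr = exsave[EX_CTR/sizeof(u64)];

so perhaps this comment should only list CR, LR, XER and leave CTR to the C
side.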
> + */
> +.balign IFETCH_ALIGN_BYTES
> +kvmppc_p9_exit_hcall:
> + mfspr r11,SPRN_SRR0
> + mfspr r12,SPRN_SRR1
> + li r10,0xc00
> + std r10,HSTATE_SCRATCH0(r13)
> +
> +.balign IFETCH_ALIGN_BYTES
> +kvmppc_p9_exit_interrupt:
> + /*
> + * If set to KVM_GUEST_MODE_GUEST_HV_FAST but we're still in the
> + * hypervisor, that means we can't return from the entry stack.
> + */
> + rldicl. r10,r12,64-MSR_HV_LG,63
> + bne- kvmppc_p9_bad_interrupt
> +
> + std r1,HSTATE_SCRATCH1(r13)
> + std r3,HSTATE_SCRATCH2(r13)
> + ld r1,HSTATE_HOST_R1(r13)
> + ld r3,HSTATE_KVM_VCPU(r13)
> +
> + std r9,VCPU_CR(r3)
> +
> +1:
> + std r11,VCPU_PC(r3)
> + std r12,VCPU_MSR(r3)
> +
> + reg = 14
> + .rept 18
> + std reg,__VCPU_GPR(reg)(r3)
> + reg = reg + 1
> + .endr
> +
> + /* r1, r3, r9-r13 are saved to vcpu by C code */
> + std r0,VCPU_GPR(R0)(r3)
> + std r2,VCPU_GPR(R2)(r3)
> + reg = 4
> + .rept 5
> + std reg,__VCPU_GPR(reg)(r3)
> + reg = reg + 1
> + .endr
> +
> + ld r2,PACATOC(r13)
> +
> + mflr r4
> + std r4,VCPU_LR(r3)
> + mfspr r4,SPRN_XER
> + std r4,VCPU_XER(r3)
> +
> + reg = 14
> + .rept 18
> + ld reg,STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
> + reg = reg + 1
> + .endr
> +
> + lwz r4,SFS+8(r1)
> + mtcr r4
> +
> + /*
> + * Flush the link stack here, before executing the first blr on the
> + * way out of the guest.
> + *
> + * The link stack won't match coming out of the guest anyway so the
> + * only cost is the flush itself. The call clobbers r0.
> + */
> +1: nop
> + patch_site 1b patch__call_kvm_flush_link_stack_p9
> +
> + addi r1,r1,SFS
> + ld r0,PPC_LR_STKOFF(r1)
> + mtlr r0
> + blr
> +
> +/*
> + * Took an interrupt somewhere right before HRFID to guest, so registers are
> + * in a bad way. Restore things hopefully well enough to run host virtual mode
> + * code and the Linux interrupt handler (SRESET or MCE) to print something useful.
> + *
> + * We could be really clever and save all host registers in known locations
> + * before setting HSTATE_IN_GUEST, then restoring them all here, and setting
> + * return address to a fixup that sets them up again. But that's a lot of
> + * effort for a small bit of code. Lots of other things to do first.
> + */
> +kvmppc_p9_bad_interrupt:
> + /*
> + * Set GUEST_MODE_NONE so the handler won't branch to KVM, and clear
> + * MSR_RI in r12 ([H]SRR1) so the handler won't try to return.
> + */
> + li r10,KVM_GUEST_MODE_NONE
> + stb r10,HSTATE_IN_GUEST(r13)
> + li r10,MSR_RI
> + andc r12,r12,r10
> +
> + /*
> + * Clean up guest registers to give host a chance to run.
> + */
> + li r10,0
> + mtspr SPRN_AMR,r10
> + mtspr SPRN_IAMR,r10
> + mtspr SPRN_CIABR,r10
> + mtspr SPRN_DAWRX0,r10
> +BEGIN_FTR_SECTION
> + mtspr SPRN_DAWRX1,r10
> +END_FTR_SECTION_IFSET(CPU_FTR_DAWR1)
> + mtspr SPRN_PID,r10
> +
> + /*
> + * Switch to host MMU mode
> + */
> + ld r10, HSTATE_KVM_VCPU(r13)
> + ld r10, VCPU_KVM(r10)
> + lwz r10, KVM_HOST_LPID(r10)
> + mtspr SPRN_LPID,r10
> +
> + ld r10, HSTATE_KVM_VCPU(r13)
> + ld r10, VCPU_KVM(r10)
> + ld r10, KVM_HOST_LPCR(r10)
> + mtspr SPRN_LPCR,r10
> +
> + /*
> + * Go back to interrupt handler
> + */
> + ld r10,HSTATE_SCRATCH0(r13)
> + cmpwi r10,BOOK3S_INTERRUPT_MACHINE_CHECK
> + beq machine_check_common
> +
> + ld r10,HSTATE_SCRATCH0(r13)
> + cmpwi r10,BOOK3S_INTERRUPT_SYSTEM_RESET
> + beq system_reset_common
> +
> + b .
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 989a1ff5ad11..6dca639ed997 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -1431,6 +1431,8 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
> */
> case BOOK3S_INTERRUPT_H_DATA_STORAGE:
> r = RESUME_PAGE_FAULT;
> + if (vcpu->arch.fault_dsisr == HDSISR_CANARY)
> + r = RESUME_GUEST; /* Just retry if it's the canary */
> break;
> case BOOK3S_INTERRUPT_H_INST_STORAGE:
> vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
> @@ -3690,6 +3692,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
> u64 tb, next_timer;
> int trap, save_pmu;
>
> + WARN_ON_ONCE(vcpu->arch.ceded);
> +
> tb = mftb();
> next_timer = timer_get_next_tb();
> if (tb >= next_timer)
> @@ -3698,8 +3702,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
> if (next_timer < time_limit)
> time_limit = next_timer;
>
> - vcpu->arch.ceded = 0;
> -
> kvmhv_save_host_pmu(); /* saves it to PACA kvm_hstate */
>
> kvmppc_subcore_enter_guest();
> @@ -3826,9 +3828,10 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
> }
> }
> kvmppc_xive_pull_vcpu(vcpu);
> +
> + vcpu->arch.slb_max = 0;
> }
>
> - vcpu->arch.slb_max = 0;
> dec = mfspr(SPRN_DEC);
> if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
> dec = (s32) dec;
> diff --git a/arch/powerpc/kvm/book3s_hv_interrupt.c b/arch/powerpc/kvm/book3s_hv_interrupt.c
> new file mode 100644
> index 000000000000..3151b3d62c01
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_hv_interrupt.c
> @@ -0,0 +1,223 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +#include <linux/kernel.h>
> +#include <linux/kvm_host.h>
> +#include <asm/asm-prototypes.h>
> +#include <asm/dbell.h>
> +#include <asm/kvm_ppc.h>
> +
> +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
> +static void __start_timing(struct kvm_vcpu *vcpu, struct kvmhv_tb_accumulator *next)
> +{
> + struct kvmppc_vcore *vc = vcpu->arch.vcore;
> + u64 tb = mftb() - vc->tb_offset_applied;
> +
> + vcpu->arch.cur_activity = next;
> + vcpu->arch.cur_tb_start = tb;
> +}
> +
> +static void __accumulate_time(struct kvm_vcpu *vcpu, struct kvmhv_tb_accumulator *next)
> +{
> + struct kvmppc_vcore *vc = vcpu->arch.vcore;
> + struct kvmhv_tb_accumulator *curr;
> + u64 tb = mftb() - vc->tb_offset_applied;
> + u64 prev_tb;
> + u64 delta;
> + u64 seq;
> +
> + curr = vcpu->arch.cur_activity;
> + vcpu->arch.cur_activity = next;
> + prev_tb = vcpu->arch.cur_tb_start;
> + vcpu->arch.cur_tb_start = tb;
> +
> + if (!curr)
> + return;
> +
> + delta = tb - prev_tb;
> +
> + seq = curr->seqcount;
> + curr->seqcount = seq + 1;
> + smp_wmb();
> + curr->tb_total += delta;
> + if (seq == 0 || delta < curr->tb_min)
> + curr->tb_min = delta;
> + if (delta > curr->tb_max)
> + curr->tb_max = delta;
> + smp_wmb();
> + curr->seqcount = seq + 2;
> +}
> +
> +#define start_timing(vcpu, next) __start_timing(vcpu, next)
> +#define end_timing(vcpu) __start_timing(vcpu, NULL)
> +#define accumulate_time(vcpu, next) __accumulate_time(vcpu, next)
> +#else
> +#define start_timing(vcpu, next) do {} while (0)
> +#define end_timing(vcpu) do {} while (0)
> +#define accumulate_time(vcpu, next) do {} while (0)
> +#endif
> +
> +static inline void mfslb(unsigned int idx, u64 *slbee, u64 *slbev)
> +{
> + asm volatile("slbmfev %0,%1" : "=r" (*slbev) : "r" (idx));
> + asm volatile("slbmfee %0,%1" : "=r" (*slbee) : "r" (idx));
> +}
> +
> +static inline void mtslb(unsigned int idx, u64 slbee, u64 slbev)
> +{
> + BUG_ON((slbee & 0xfff) != idx);
> +
> + asm volatile("slbmte %0,%1" :: "r" (slbev), "r" (slbee));
> +}
> +
> +static inline void slb_invalidate(unsigned int ih)
> +{
> + asm volatile("slbia %0" :: "i"(ih));
> +}
This one is not used.
> +
> +/*
> + * Malicious or buggy radix guests may have inserted SLB entries
> + * (only 0..3 because radix always runs with UPRT=1), so these must
> + * be cleared here to avoid side-channels. slbmte is used rather
> + * than slbia, as it won't clear cached translations.
> + */
> +static void radix_clear_slb(void)
> +{
> + u64 slbee, slbev;
> + int i;
> +
> + for (i = 0; i < 4; i++) {
> + mfslb(i, &slbee, &slbev);
> + if (unlikely(slbee || slbev)) {
> + slbee = i;
> + slbev = 0;
> + mtslb(i, slbee, slbev);
> + }
> + }
> +}
> +
> +int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu)
> +{
> + u64 *exsave;
> + unsigned long msr = mfmsr();
> + int trap;
> +
> + start_timing(vcpu, &vcpu->arch.rm_entry);
> +
> + vcpu->arch.ceded = 0;
> +
> + WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_HV);
> + WARN_ON_ONCE(!(vcpu->arch.shregs.msr & MSR_ME));
> +
> + mtspr(SPRN_HSRR0, vcpu->arch.regs.nip);
> + mtspr(SPRN_HSRR1, (vcpu->arch.shregs.msr & ~MSR_HV) | MSR_ME);
> +
> + /*
> + * On POWER9 DD2.1 and below, sometimes on a Hypervisor Data Storage
> + * Interrupt (HDSI) the HDSISR is not updated at all.
> + *
> + * To work around this we put a canary value into the HDSISR before
> + * returning to a guest and then check for this canary when we take a
> + * HDSI. If we find the canary on a HDSI, we know the hardware didn't
> + * update the HDSISR. In this case we return to the guest to retake the
> + * HDSI which should correctly update the HDSISR the second time HDSI
> + * entry.
> + *
> + * Just do this on all p9 processors for now.
> + */
> + mtspr(SPRN_HDSISR, HDSISR_CANARY);
> +
> + accumulate_time(vcpu, &vcpu->arch.guest_time);
> +
> + local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST_HV_FAST;
> + kvmppc_p9_enter_guest(vcpu);
> + // Radix host and guest means host never runs with guest MMU state
> + local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_NONE;
> +
> + accumulate_time(vcpu, &vcpu->arch.rm_intr);
> +
> + /* Get these from r11/12 and paca exsave */
> + vcpu->arch.shregs.srr0 = mfspr(SPRN_SRR0);
> + vcpu->arch.shregs.srr1 = mfspr(SPRN_SRR1);
> + vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
> + vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
> +
> + /* 0x2 bit for HSRR is only used by PR and P7/8 HV paths, clear it */
> + trap = local_paca->kvm_hstate.scratch0 & ~0x2;
> + if (likely(trap > BOOK3S_INTERRUPT_MACHINE_CHECK)) {
> + exsave = local_paca->exgen;
> + } else if (trap == BOOK3S_INTERRUPT_SYSTEM_RESET) {
> + exsave = local_paca->exnmi;
> + } else { /* trap == 0x200 */
> + exsave = local_paca->exmc;
> + }
> +
> + vcpu->arch.regs.gpr[1] = local_paca->kvm_hstate.scratch1;
> + vcpu->arch.regs.gpr[3] = local_paca->kvm_hstate.scratch2;
> + vcpu->arch.regs.gpr[9] = exsave[EX_R9/sizeof(u64)];
> + vcpu->arch.regs.gpr[10] = exsave[EX_R10/sizeof(u64)];
> + vcpu->arch.regs.gpr[11] = exsave[EX_R11/sizeof(u64)];
> + vcpu->arch.regs.gpr[12] = exsave[EX_R12/sizeof(u64)];
> + vcpu->arch.regs.gpr[13] = exsave[EX_R13/sizeof(u64)];
> + vcpu->arch.ppr = exsave[EX_PPR/sizeof(u64)];
> + vcpu->arch.cfar = exsave[EX_CFAR/sizeof(u64)];
> + vcpu->arch.regs.ctr = exsave[EX_CTR/sizeof(u64)];
> +
> + vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
> +
> + if (unlikely(trap == BOOK3S_INTERRUPT_MACHINE_CHECK)) {
> + vcpu->arch.fault_dar = exsave[EX_DAR/sizeof(u64)];
> + vcpu->arch.fault_dsisr = exsave[EX_DSISR/sizeof(u64)];
> + kvmppc_realmode_machine_check(vcpu);
> +
> + } else if (unlikely(trap == BOOK3S_INTERRUPT_HMI)) {
> + kvmppc_realmode_hmi_handler();
> +
> + } else if (trap == BOOK3S_INTERRUPT_H_EMUL_ASSIST) {
> + vcpu->arch.emul_inst = mfspr(SPRN_HEIR);
> +
> + } else if (trap == BOOK3S_INTERRUPT_H_DATA_STORAGE) {
> + vcpu->arch.fault_dar = exsave[EX_DAR/sizeof(u64)];
> + vcpu->arch.fault_dsisr = exsave[EX_DSISR/sizeof(u64)];
> + vcpu->arch.fault_gpa = mfspr(SPRN_ASDR);
> +
> + } else if (trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
> + vcpu->arch.fault_gpa = mfspr(SPRN_ASDR);
> +
> + } else if (trap == BOOK3S_INTERRUPT_H_FAC_UNAVAIL) {
> + vcpu->arch.hfscr = mfspr(SPRN_HFSCR);
> +
> +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
> + /*
> + * Softpatch interrupt for transactional memory emulation cases
> + * on POWER9 DD2.2. This is early in the guest exit path - we
> + * haven't saved registers or done a treclaim yet.
> + */
> + } else if (trap == BOOK3S_INTERRUPT_HV_SOFTPATCH) {
> + vcpu->arch.emul_inst = mfspr(SPRN_HEIR);
> +
> + /*
> + * The cases we want to handle here are those where the guest
> + * is in real suspend mode and is trying to transition to
> + * transactional mode.
> + */
> + if (local_paca->kvm_hstate.fake_suspend &&
> + (vcpu->arch.shregs.msr & MSR_TS_S)) {
> + if (kvmhv_p9_tm_emulation_early(vcpu)) {
> + /* Prevent it being handled again. */
> + trap = 0;
> + }
> + }
> +#endif
> + }
> +
> + radix_clear_slb();
> +
> + __mtmsrd(msr, 0);
The asm code only sets RI but this potentially sets more bits, including
MSR_EE. Is MSR_EE expected to be 0 when __kvmhv_vcpu_entry_p9() is called?
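If only RI is meant here, something closer to what the asm did would be
(just a sketch, assuming EE really is supposed to stay off at this point):

	/* sketch only: mtmsrd with L=1 alters only the EE and RI bits */
	__mtmsrd(MSR_RI, 1);

and if EE is guaranteed to be clear in msr anyway, a comment saying so would
help.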
> + mtspr(SPRN_CTRLT, 1);
What is this for? The ISA does not shed much light:
===
63 RUN This bit controls an external I/O pin.
===
> +
> + accumulate_time(vcpu, &vcpu->arch.rm_exit);
This should not compile without CONFIG_KVM_BOOK3S_HV_EXIT_TIMING.
> +
> + end_timing(vcpu);
> +
> + return trap;
The asm does "For hash guest, read the guest SLB and save it away", this
code does not. Is this new fast-path-in-c only for radix-on-radix or
hash VMs are supported too?
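If hash guests are ever meant to take this path, I would expect something
along these lines next to the radix_clear_slb() call, reusing the mfslb()
helper added above (a sketch only, field names as in kvm_host.h, not tested):

	/* sketch: save the guest SLB away for a hash guest, as the asm did */
	if (!kvm_is_radix(vcpu->kvm)) {
		int i, nr = 0;

		for (i = 0; i < vcpu->arch.slb_nr; i++) {
			u64 slbee, slbev;

			mfslb(i, &slbee, &slbev);
			if (slbee & SLB_ESID_V) {
				vcpu->arch.slb[nr].orige = slbee | i;
				vcpu->arch.slb[nr].origv = slbev;
				nr++;
			}
		}
		vcpu->arch.slb_max = nr;
	}

or, if only radix-on-radix is supported, a comment (or WARN_ON) making that
explicit.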
> +}
> +EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9);
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index 2d0d14ed1d92..6118e8a97ddd 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -44,9 +44,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
> #define NAPPING_UNSPLIT 3
>
> /* Stack frame offsets for kvmppc_hv_entry */
> -#define SFS 208
> +#define SFS 160
> #define STACK_SLOT_TRAP (SFS-4)
> -#define STACK_SLOT_SHORT_PATH (SFS-8)
> #define STACK_SLOT_TID (SFS-16)
> #define STACK_SLOT_PSSCR (SFS-24)
> #define STACK_SLOT_PID (SFS-32)
> @@ -59,8 +58,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
> #define STACK_SLOT_UAMOR (SFS-88)
> #define STACK_SLOT_DAWR1 (SFS-96)
> #define STACK_SLOT_DAWRX1 (SFS-104)
> -/* the following is used by the P9 short path */
> -#define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */
>
> /*
> * Call kvmppc_hv_entry in real mode.
> @@ -1008,9 +1005,6 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
> no_xive:
> #endif /* CONFIG_KVM_XICS */
>
> - li r0, 0
> - stw r0, STACK_SLOT_SHORT_PATH(r1)
> -
> deliver_guest_interrupt: /* r4 = vcpu, r13 = paca */
> /* Check if we can deliver an external or decrementer interrupt now */
> ld r0, VCPU_PENDING_EXC(r4)
> @@ -1030,7 +1024,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
> mtspr SPRN_SRR0, r6
> mtspr SPRN_SRR1, r7
>
> -fast_guest_entry_c:
> ld r10, VCPU_PC(r4)
> ld r11, VCPU_MSR(r4)
> /* r11 = vcpu->arch.msr & ~MSR_HV */
> @@ -1135,97 +1128,6 @@ ret_to_ultra:
> ld r4, VCPU_GPR(R4)(r4)
> sc 2
>
> -/*
> - * Enter the guest on a P9 or later system where we have exactly
> - * one vcpu per vcore and we don't need to go to real mode
> - * (which implies that host and guest are both using radix MMU mode).
> - * r3 = vcpu pointer
> - * Most SPRs and all the VSRs have been loaded already.
> - */
> -_GLOBAL(__kvmhv_vcpu_entry_p9)
> -EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9)
> - mflr r0
> - std r0, PPC_LR_STKOFF(r1)
> - stdu r1, -SFS(r1)
> -
> - li r0, 1
> - stw r0, STACK_SLOT_SHORT_PATH(r1)
> -
> - std r3, HSTATE_KVM_VCPU(r13)
> - mfcr r4
> - stw r4, SFS+8(r1)
> -
> - std r1, HSTATE_HOST_R1(r13)
> -
> - reg = 14
> - .rept 18
> - std reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
> - reg = reg + 1
> - .endr
> -
> - reg = 14
> - .rept 18
> - ld reg, __VCPU_GPR(reg)(r3)
> - reg = reg + 1
> - .endr
> -
> - mfmsr r10
> - std r10, HSTATE_HOST_MSR(r13)
> -
> - mr r4, r3
> - b fast_guest_entry_c
> -guest_exit_short_path:
> - /*
> - * Malicious or buggy radix guests may have inserted SLB entries
> - * (only 0..3 because radix always runs with UPRT=1), so these must
> - * be cleared here to avoid side-channels. slbmte is used rather
> - * than slbia, as it won't clear cached translations.
> - */
> - li r0,0
> - slbmte r0,r0
> - li r4,1
> - slbmte r0,r4
> - li r4,2
> - slbmte r0,r4
> - li r4,3
> - slbmte r0,r4
> -
> - li r0, KVM_GUEST_MODE_NONE
> - stb r0, HSTATE_IN_GUEST(r13)
> -
> - reg = 14
> - .rept 18
> - std reg, __VCPU_GPR(reg)(r9)
> - reg = reg + 1
> - .endr
> -
> - reg = 14
> - .rept 18
> - ld reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
> - reg = reg + 1
> - .endr
> -
> - lwz r4, SFS+8(r1)
> - mtcr r4
> -
> - mr r3, r12 /* trap number */
> -
> - addi r1, r1, SFS
> - ld r0, PPC_LR_STKOFF(r1)
> - mtlr r0
> -
> - /* If we are in real mode, do a rfid to get back to the caller */
> - mfmsr r4
> - andi. r5, r4, MSR_IR
> - bnelr
> - rldicl r5, r4, 64 - MSR_TS_S_LG, 62 /* extract TS field */
> - mtspr SPRN_SRR0, r0
> - ld r10, HSTATE_HOST_MSR(r13)
> - rldimi r10, r5, MSR_TS_S_LG, 63 - MSR_TS_T_LG
> - mtspr SPRN_SRR1, r10
> - RFI_TO_KERNEL
> - b .
> -
> secondary_too_late:
> li r12, 0
> stw r12, STACK_SLOT_TRAP(r1)
> @@ -1397,14 +1299,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
> mr r4,r9
> bge fast_guest_return
> 2:
> - /* If we came in through the P9 short path, no real mode hcalls */
> - lwz r0, STACK_SLOT_SHORT_PATH(r1)
> - cmpwi r0, 0
> - bne no_try_real
> /* See if this is an hcall we can handle in real mode */
> cmpwi r12,BOOK3S_INTERRUPT_SYSCALL
> beq hcall_try_real_mode
> -no_try_real:
>
> /* Hypervisor doorbell - exit only if host IPI flag set */
> cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL
> @@ -1447,11 +1344,6 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
> bl kvmhv_accumulate_time
> #endif
> #ifdef CONFIG_KVM_XICS
> - /* If we came in through the P9 short path, xive pull is done in C */
> - lwz r0, STACK_SLOT_SHORT_PATH(r1)
> - cmpwi r0, 0
> - bne 1f
> -
> /* We are exiting, pull the VP from the XIVE */
> lbz r0, VCPU_XIVE_PUSHED(r9)
> cmpwi cr0, r0, 0
> @@ -1496,11 +1388,6 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
> 1: nop
> patch_site 1b patch__call_kvm_flush_link_stack
>
> - /* If we came in through the P9 short path, go back out to C now */
> - lwz r0, STACK_SLOT_SHORT_PATH(r1)
> - cmpwi r0, 0
> - bne guest_exit_short_path
> -
> /* For hash guest, read the guest SLB and save it away */
> ld r5, VCPU_KVM(r9)
> lbz r0, KVM_RADIX(r5)
> @@ -1548,8 +1435,10 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
> b guest_bypass
>
> 0: /*
> - * Sanitise radix guest SLB, see guest_exit_short_path comment.
> - * We clear vcpu->arch.slb_max to match earlier behaviour.
> + * Malicious or buggy radix guests may have inserted SLB entries
> + * (only 0..3 because radix always runs with UPRT=1), so these must
> + * be cleared here to avoid side-channels. slbmte is used rather
> + * than slbia, as it won't clear cached translations.
> */
> li r0,0
> stw r0,VCPU_SLB_MAX(r9)
> @@ -3362,7 +3251,7 @@ BEGIN_FTR_SECTION
> mtspr SPRN_DAWRX1, r0
> END_FTR_SECTION_IFSET(CPU_FTR_DAWR1)
>
> - /* Clear hash and radix guest SLB, see guest_exit_short_path comment. */
> + /* Clear hash and radix guest SLB. */
> slbmte r0, r0
> PPC_SLBIA(6)
>
>
--
Alexey