[RFC PATCH v2 5/6] KVM: PPC: Add support for nested PAPR guests
Jordan Niethe
jniethe5 at gmail.com
Sat Jun 10 12:16:25 AEST 2023
On Wed, Jun 7, 2023 at 7:09 PM Nicholas Piggin <npiggin at gmail.com> wrote:
[snip]
>
> You lost your comments.
Thanks
>
> > diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> > index 0ca2d8b37b42..c5c57552b447 100644
> > --- a/arch/powerpc/include/asm/kvm_book3s.h
> > +++ b/arch/powerpc/include/asm/kvm_book3s.h
> > @@ -12,6 +12,7 @@
> > #include <linux/types.h>
> > #include <linux/kvm_host.h>
> > #include <asm/kvm_book3s_asm.h>
> > +#include <asm/guest-state-buffer.h>
> >
> > struct kvmppc_bat {
> > u64 raw;
> > @@ -316,6 +317,57 @@ long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu);
> >
> > void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
> >
> > +
> > +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> > +
> > +extern bool __kvmhv_on_papr;
> > +
> > +static inline bool kvmhv_on_papr(void)
> > +{
> > + return __kvmhv_on_papr;
> > +}
>
> It's a nitpick, but kvmhv_on_pseries() is because we're running KVM-HV
> on a pseries guest kernel. Which is a papr guest kernel. So this kind of
> doesn't make sense if you read it the same way.
>
> kvmhv_nested_using_papr() or something like that might read a bit
> better.
Will we go with kvmhv_using_nested_v2()?
>
> This could be a static key too.
Will do.
>
> > @@ -575,6 +593,7 @@ struct kvm_vcpu_arch {
> > ulong dscr;
> > ulong amr;
> > ulong uamor;
> > + ulong amor;
> > ulong iamr;
> > u32 ctrl;
> > u32 dabrx;
>
> This belongs somewhere else.
It can be dropped.
>
> > @@ -829,6 +848,8 @@ struct kvm_vcpu_arch {
> > u64 nested_hfscr; /* HFSCR that the L1 requested for the nested guest */
> > u32 nested_vcpu_id;
> > gpa_t nested_io_gpr;
> > + /* For nested APIv2 guests*/
> > + struct kvmhv_papr_host papr_host;
> > #endif
>
> This is not exactly a papr host. Might have to come up with a better
> name especially if we implement a L0 things could get confusing.
Any name ideas? nestedv2_state?
>
> > @@ -342,6 +343,203 @@ static inline long plpar_get_cpu_characteristics(struct h_cpu_char_result *p)
> > return rc;
> > }
> >
> > +static inline long plpar_guest_create(unsigned long flags, unsigned long *guest_id)
> > +{
> > + unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
> > + unsigned long token;
> > + long rc;
> > +
> > + token = -1UL;
> > + while (true) {
> > + rc = plpar_hcall(H_GUEST_CREATE, retbuf, flags, token);
> > + if (rc == H_SUCCESS) {
> > + *guest_id = retbuf[0];
> > + break;
> > + }
> > +
> > + if (rc == H_BUSY) {
> > + token = retbuf[0];
> > + cpu_relax();
> > + continue;
> > + }
> > +
> > + if (H_IS_LONG_BUSY(rc)) {
> > + token = retbuf[0];
> > + mdelay(get_longbusy_msecs(rc));
>
> All of these things need a non-sleeping delay? Can we sleep instead?
> Or if not, might have to think about going back to the caller and it
> can retry.
>
> get/set state might be a bit inconvenient, although I don't expect
> that should potentially take so long as guest and vcpu create/delete,
> so at least those ones would be good if they're called while
> preemptable.
Yeah no reason not to sleep except for get/set, let me try it out.
>
> > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> > index 521d84621422..f22ee582e209 100644
> > --- a/arch/powerpc/kvm/book3s_hv.c
> > +++ b/arch/powerpc/kvm/book3s_hv.c
> > @@ -383,6 +383,11 @@ static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
> > spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
> > }
> >
> > +static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
> > +{
> > + vcpu->arch.pvr = pvr;
> > +}
>
> Didn't you lose this in a previous patch? I thought it must have moved
> to a header but it reappears.
Yes, that was meant to stay put.
>
> > +
> > /* Dummy value used in computing PCR value below */
> > #define PCR_ARCH_31 (PCR_ARCH_300 << 1)
> >
> > @@ -1262,13 +1267,14 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
> > return RESUME_HOST;
> > break;
> > #endif
> > - case H_RANDOM:
> > + case H_RANDOM: {
> > unsigned long rand;
> >
> > if (!arch_get_random_seed_longs(&rand, 1))
> > ret = H_HARDWARE;
> > kvmppc_set_gpr(vcpu, 4, rand);
> > break;
> > + }
> > case H_RPT_INVALIDATE:
> > ret = kvmppc_h_rpt_invalidate(vcpu, kvmppc_get_gpr(vcpu, 4),
> > kvmppc_get_gpr(vcpu, 5),
>
> Compile fix for a previous patch.
Thanks.
>
> > @@ -2921,14 +2927,21 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu *vcpu)
> > vcpu->arch.shared_big_endian = false;
> > #endif
> > #endif
> > - kvmppc_set_mmcr_hv(vcpu, 0, MMCR0_FC);
> >
> > + if (kvmhv_on_papr()) {
> > + err = kvmhv_papr_vcpu_create(vcpu, &vcpu->arch.papr_host);
> > + if (err < 0)
> > + return err;
> > + }
> > +
> > + kvmppc_set_mmcr_hv(vcpu, 0, MMCR0_FC);
> > if (cpu_has_feature(CPU_FTR_ARCH_31)) {
> > kvmppc_set_mmcr_hv(vcpu, 0, kvmppc_get_mmcr_hv(vcpu, 0) | MMCR0_PMCCEXT);
> > kvmppc_set_mmcra_hv(vcpu, MMCRA_BHRB_DISABLE);
> > }
> >
> > kvmppc_set_ctrl_hv(vcpu, CTRL_RUNLATCH);
> > + kvmppc_set_amor_hv(vcpu, ~0);
>
> This AMOR thing should go somewhere else. Not actually sure why it needs
> to be added to the vcpu since it's always ~0. Maybe just put that in a
> #define somewhere and use that.
Yes, you are right, we can just get rid of it from the vcpu entirely.
>
> > @@ -4042,6 +4059,50 @@ static void vcpu_vpa_increment_dispatch(struct kvm_vcpu *vcpu)
> > }
> > }
> >
> > +static int kvmhv_vcpu_entry_papr(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr, u64 *tb)
> > +{
> > + struct kvmhv_papr_host *ph;
> > + unsigned long msr, i;
> > + int trap;
> > + long rc;
> > +
> > + ph = &vcpu->arch.papr_host;
> > +
> > + msr = mfmsr();
> > + kvmppc_msr_hard_disable_set_facilities(vcpu, msr);
> > + if (lazy_irq_pending())
> > + return 0;
> > +
> > + kvmhv_papr_flush_vcpu(vcpu, time_limit);
> > +
> > + accumulate_time(vcpu, &vcpu->arch.in_guest);
> > + rc = plpar_guest_run_vcpu(0, vcpu->kvm->arch.lpid, vcpu->vcpu_id,
> > + &trap, &i);
> > +
> > + if (rc != H_SUCCESS) {
> > + pr_err("KVM Guest Run VCPU hcall failed\n");
> > + if (rc == H_INVALID_ELEMENT_ID)
> > + pr_err("KVM: Guest Run VCPU invalid element id at %ld\n", i);
> > + else if (rc == H_INVALID_ELEMENT_SIZE)
> > + pr_err("KVM: Guest Run VCPU invalid element size at %ld\n", i);
> > + else if (rc == H_INVALID_ELEMENT_VALUE)
> > + pr_err("KVM: Guest Run VCPU invalid element value at %ld\n", i);
> > + return 0;
> > + }
>
> This needs the proper error handling. Were you going to wait until I
> sent that up for existing code?
Overall the unhappy paths need to be tightened up in the next revision.
But yeah this hits the same thing as the v1 API.
>
> > @@ -5119,6 +5183,7 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
> > */
> > void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask)
> > {
> > + struct kvm_vcpu *vcpu;
> > long int i;
> > u32 cores_done = 0;
> >
> > @@ -5139,6 +5204,12 @@ void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask)
> > if (++cores_done >= kvm->arch.online_vcores)
> > break;
> > }
> > +
> > + if (kvmhv_on_papr()) {
> > + kvm_for_each_vcpu(i, vcpu, kvm) {
> > + kvmppc_set_lpcr_hv(vcpu, vcpu->arch.vcore->lpcr);
> > + }
> > + }
> > }
>
> vcpu define could go in that scope I guess.
True.
>
> > @@ -5405,15 +5476,43 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
> >
> > /* Allocate the guest's logical partition ID */
> >
> > - lpid = kvmppc_alloc_lpid();
> > - if ((long)lpid < 0)
> > - return -ENOMEM;
> > - kvm->arch.lpid = lpid;
> > + if (!kvmhv_on_papr()) {
> > + lpid = kvmppc_alloc_lpid();
> > + if ((long)lpid < 0)
> > + return -ENOMEM;
> > + kvm->arch.lpid = lpid;
> > + }
> >
> > kvmppc_alloc_host_rm_ops();
> >
> > kvmhv_vm_nested_init(kvm);
> >
> > + if (kvmhv_on_papr()) {
> > + long rc;
> > + unsigned long guest_id;
> > +
> > + rc = plpar_guest_create(0, &guest_id);
> > +
> > + if (rc != H_SUCCESS)
> > + pr_err("KVM: Create Guest hcall failed, rc=%ld\n", rc);
> > +
> > + switch (rc) {
> > + case H_PARAMETER:
> > + case H_FUNCTION:
> > + case H_STATE:
> > + return -EINVAL;
> > + case H_NOT_ENOUGH_RESOURCES:
> > + case H_ABORTED:
> > + return -ENOMEM;
> > + case H_AUTHORITY:
> > + return -EPERM;
> > + case H_NOT_AVAILABLE:
> > + return -EBUSY;
> > + }
> > + kvm->arch.lpid = guest_id;
> > + }
>
> I wouldn't mind putting lpid/guest_id in different variables. guest_id
> is 64-bit isn't it? LPIDR is 32. If nothing else that could cause
> issues if the hypervisor does something clever with the token.
I was trying to get rid of a difference between this API and the
others, but I'd forgotten about the 64bit / 32bit difference.
Will put it back in its own variable.
>
> > @@ -5573,10 +5675,14 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
> > kvm->arch.process_table = 0;
> > if (kvm->arch.secure_guest)
> > uv_svm_terminate(kvm->arch.lpid);
> > - kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
> > + if (!kvmhv_on_papr())
> > + kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
> > }
>
> Would be nice to have a +ve test for the "existing" API. All we have to
> do is think of a name for it.
Will we go with nestedv1?
Thanks,
Jordan
>
> Thanks,
> Nick
>
More information about the Linuxppc-dev
mailing list