[PATCH v2 18/33] KVM: PPC: Book3S HV: Framework and hcall stubs for nested virtualization

Tue Oct 2 16:01:52 AEST 2018

On Fri, Sep 28, 2018 at 07:45:48PM +1000, Paul Mackerras wrote:
> This starts the process of adding the code to support nested HV-style
> virtualization.  It defines a new H_SET_PARTITION_TABLE hypercall which
> a nested hypervisor can use to set the base address and size of a
> partition table in its memory (analogous to the PTCR register).
> On the host (level 0 hypervisor) side, the H_SET_PARTITION_TABLE
> hypercall from the guest is handled by code that saves the virtual
> PTCR value for the guest.
> 
> This also adds code for creating and destroying nested guests and for
> reading the partition table entry for a nested guest from L1 memory.
> Each nested guest has its own shadow LPID value, different in general
> from the LPID value used by the nested hypervisor to refer to it.  The
> shadow LPID value is allocated at nested guest creation time.
> 
> Nested hypervisor functionality is only available for a radix guest,
> which therefore means a radix host on a POWER9 (or later) processor.
> 
> Signed-off-by: Paul Mackerras <paulus at ozlabs.org>

Reviewed-by: David Gibson <david at gibson.dropbear.id.au>

I've made a number of comments below, but they're all pretty minor
things.  They might be worth including if we have to respin for
whatever reason, or as follow-up improvements, but I don't think we
need to hold this up for them.

[snip]
> @@ -287,6 +288,7 @@ struct kvm_arch {
>  	u8 radix;
>  	u8 fwnmi_enabled;
>  	bool threads_indep;
> +	bool nested_enable;
>  	pgd_t *pgtable;
>  	u64 process_table;
>  	struct dentry *debugfs_dir;
> @@ -312,6 +314,9 @@ struct kvm_arch {
>  #endif
>  	struct kvmppc_ops *kvm_ops;
>  #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> +	u64 l1_ptcr;
> +	int max_nested_lpid;
> +	struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS];

This array could be quite large.  As a followup would it be worth
dynamically allocating it, so it can be skipped for L1s with no
nesting allowed, and/or dynamically resized as the L1 adds/removes L2s.

>  	/* This array can grow quite large, keep it at the end */
>  	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
>  #endif

[snip]
> diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
> new file mode 100644
> index 0000000..5341052
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_hv_nested.c
> @@ -0,0 +1,283 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright IBM Corporation, 2018
> + * Authors Suraj Jitindar Singh <sjitindarsingh at gmail.com>
> + *	   Paul Mackerras <paulus at ozlabs.org>
> + *
> + * Description: KVM functions specific to running nested KVM-HV guests
> + * on Book3S processors (specifically POWER9 and later).
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/kvm_host.h>
> +
> +#include <asm/kvm_ppc.h>
> +#include <asm/mmu.h>
> +#include <asm/pgtable.h>
> +#include <asm/pgalloc.h>
> +
> +static struct patb_entry *pseries_partition_tb;
> +
> +static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
> +
> +/* Only called when we're not in hypervisor mode */

This comment isn't strictly accurate, the function is called, but
exits trivially.

> +bool kvmhv_nested_init(void)
> +{
> +	long int ptb_order;
> +	unsigned long ptcr;
> +	long rc;
> +
> +	if (!kvmhv_on_pseries())
> +		return true;
> +	if (!radix_enabled())
> +		return false;
> +
> +	/* find log base 2 of KVMPPC_NR_LPIDS, rounding up */
> +	ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1;
> +	if (ptb_order < 8)
> +		ptb_order = 8;
> +	pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
> +				       GFP_KERNEL);
> +	if (!pseries_partition_tb) {
> +		pr_err("kvm-hv: failed to allocated nested partition table\n");
> +		return false;

Since this can fail in several different ways, it seems like returning
an errno, rather than a bool would make sense.

> +	}
> +
> +	ptcr = __pa(pseries_partition_tb) | (ptb_order - 8);
> +	rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
> +	if (rc != H_SUCCESS) {
> +		pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n",
> +		       rc);
> +		kfree(pseries_partition_tb);
> +		pseries_partition_tb = NULL;
> +		return false;
> +	}
> +
> +	return true;
> +}
> +
> +void kvmhv_nested_exit(void)
> +{
> +	if (kvmhv_on_pseries() && pseries_partition_tb) {

First clause is redundant there, isn't it, since pseries_partition_tb
can only be set if we're on pseries?

> +		plpar_hcall_norets(H_SET_PARTITION_TABLE, 0);
> +		kfree(pseries_partition_tb);
> +		pseries_partition_tb = NULL;
> +	}
> +}
> +
> +void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1)
> +{
> +	if (cpu_has_feature(CPU_FTR_HVMODE)) {
> +		mmu_partition_table_set_entry(lpid, dw0, dw1);
> +	} else {
> +		pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0);
> +		pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1);
> +		/* this will be emulated, L0 will do the necessary barriers */
> +		asm volatile(PPC_TLBIE_5(%0, %1, 2, 0, 1) : :
> +			     "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid));

I think in this version you were using a paravirt TLB flush, instead
of emulation?

> +	}
> +}
> +
> +static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp)
> +{
> +	unsigned long dw0;
> +
> +	dw0 = PATB_HR | radix__get_tree_size() |
> +		__pa(gp->shadow_pgtable) | RADIX_PGD_INDEX_SIZE;
> +	kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table);
> +}
> +
> +void kvmhv_vm_nested_init(struct kvm *kvm)
> +{
> +	kvm->arch.max_nested_lpid = -1;
> +}
> +
> +/*
> + * Handle the H_SET_PARTITION_TABLE hcall.
> + * r4 = guest real address of partition table + log_2(size) - 12
> + * (formatted as for the PTCR).
> + */
> +long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
> +{
> +	struct kvm *kvm = vcpu->kvm;
> +	unsigned long ptcr = kvmppc_get_gpr(vcpu, 4);
> +
> +	kvm->arch.l1_ptcr = ptcr;

I don't think it's actually dangerous, since we validate the L1
addresses when we read from the table, but it would probably be better
for debugging a guest if this failed the hcall if the PTCR didn't make
sense (out of bounds order, or not within L1 memory size).

> +	return H_SUCCESS;
> +}

[snip]
> +/*
> + * Free up any resources allocated for a nested guest.
> + */
> +static void kvmhv_release_nested(struct kvm_nested_guest *gp)
> +{
> +	kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0);
> +	kvmppc_free_lpid(gp->shadow_lpid);
> +	if (gp->shadow_pgtable)
> +		pgd_free(gp->l1_host->mm, gp->shadow_pgtable);
> +	kfree(gp);
> +}
> +
> +static void kvmhv_remove_nested(struct kvm_nested_guest *gp)
> +{
> +	struct kvm *kvm = gp->l1_host;
> +	int lpid = gp->l1_lpid;
> +	long ref;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	if (gp == kvm->arch.nested_guests[lpid]) {

This is to protect against a race with another remove, yes?  Since kvm
and lpid are read before you take the lock.  Is that right?

> +		kvm->arch.nested_guests[lpid] = NULL;
> +		if (lpid == kvm->arch.max_nested_lpid) {
> +			while (--lpid >= 0 && !kvm->arch.nested_guests[lpid])
> +				;
> +			kvm->arch.max_nested_lpid = lpid;
> +		}
> +		--gp->refcnt;
> +	}
> +	ref = gp->refcnt;
> +	spin_unlock(&kvm->mmu_lock);
> +	if (ref == 0)
> +		kvmhv_release_nested(gp);
> +}

[snip]
> +struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
> +					  bool create)
> +{
> +	struct kvm_nested_guest *gp, *newgp;
> +
> +	if (l1_lpid >= KVM_MAX_NESTED_GUESTS ||
> +	    l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4)))
> +		return NULL;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	gp = kvm->arch.nested_guests[l1_lpid];
> +	if (gp)
> +		++gp->refcnt;
> +	spin_unlock(&kvm->mmu_lock);
> +
> +	if (gp || !create)
> +		return gp;
> +
> +	newgp = kvmhv_alloc_nested(kvm, l1_lpid);
> +	if (!newgp)
> +		return NULL;
> +	spin_lock(&kvm->mmu_lock);
> +	if (kvm->arch.nested_guests[l1_lpid]) {
> +		/* someone else beat us to it */

Should we print a message in this case.  It's no skin off the host's
nose, but wouldn't this mean the guest is concurrently trying to start
two guests with the same lpid, which seems like a dubious thing for it
to be doing.

> +		gp = kvm->arch.nested_guests[l1_lpid];
> +	} else {
> +		kvm->arch.nested_guests[l1_lpid] = newgp;
> +		++newgp->refcnt;
> +		gp = newgp;
> +		newgp = NULL;
> +		if (l1_lpid > kvm->arch.max_nested_lpid)
> +			kvm->arch.max_nested_lpid = l1_lpid;
> +	}
> +	++gp->refcnt;
> +	spin_unlock(&kvm->mmu_lock);
> +
> +	if (newgp)
> +		kvmhv_release_nested(newgp);
> +
> +	return gp;
> +}
> +
> +void kvmhv_put_nested(struct kvm_nested_guest *gp)
> +{
> +	struct kvm *kvm = gp->l1_host;
> +	long ref;
> +
> +	spin_lock(&kvm->mmu_lock);
> +	ref = --gp->refcnt;
> +	spin_unlock(&kvm->mmu_lock);
> +	if (ref == 0)
> +		kvmhv_release_nested(gp);
> +}

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 833 bytes
Desc: not available
URL: <http://lists.ozlabs.org/pipermail/linuxppc-dev/attachments/20181002/48e381b8/attachment-0001.sig>