[RFC PATCH] KVM: PPC: BOOK3S: HV: THP support for guest

Aneesh Kumar K.V aneesh.kumar at linux.vnet.ibm.com
Mon May 5 03:36:05 EST 2014


"Aneesh Kumar K.V" <aneesh.kumar at linux.vnet.ibm.com> writes:

> Signed-off-by: Aneesh Kumar K.V <aneesh.kumar at linux.vnet.ibm.com>
> ---
>  arch/powerpc/include/asm/kvm_book3s_64.h | 146 ++++++++++++++++++++++++++-----
>  arch/powerpc/kvm/book3s_hv.c             |   7 ++
>  2 files changed, 130 insertions(+), 23 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
> index 51388befeddb..f03ea8f90576 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_64.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_64.h
> @@ -77,34 +77,122 @@ static inline long try_lock_hpte(unsigned long *hpte, unsigned long bits)
>  	return old == 0;
>  }
>
> +static inline int __hpte_actual_psize(unsigned int lp, int psize)
> +{
> +	int i, shift;
> +	unsigned int mask;
> +
> +	/* start from 1 ignoring MMU_PAGE_4K */
> +	for (i = 1; i < MMU_PAGE_COUNT; i++) {
> +
> +		/* invalid penc */
> +		if (mmu_psize_defs[psize].penc[i] == -1)
> +			continue;
> +		/*
> +		 * encoding bits per actual page size
> +		 *        PTE LP     actual page size
> +		 *    rrrr rrrz		>=8KB
> +		 *    rrrr rrzz		>=16KB
> +		 *    rrrr rzzz		>=32KB
> +		 *    rrrr zzzz		>=64KB
> +		 * .......
> +		 */
> +		shift = mmu_psize_defs[i].shift - LP_SHIFT;
> +		if (shift > LP_BITS)
> +			shift = LP_BITS;
> +		mask = (1 << shift) - 1;
> +		if ((lp & mask) == mmu_psize_defs[psize].penc[i])
> +			return i;
> +	}
> +	return -1;
> +}
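
A rough user-space sketch of the matching above may help; the PG_* names,
the defs[] table and the penc/shift values below are made-up stand-ins (the
real values live in mmu_psize_defs and come from the device tree), so this is
only an illustration of the LP decode, not kernel code:

#include <stdio.h>

#define LP_SHIFT	12
#define LP_BITS		8

enum { PG_4K, PG_64K, PG_16M, PG_COUNT };

struct psize_def {
	int shift;
	int penc[PG_COUNT];	/* indexed by actual page size */
};

/* Stand-in values only; the real table is mmu_psize_defs. */
static const struct psize_def defs[PG_COUNT] = {
	[PG_4K]  = { .shift = 12, .penc = {  0, 0x7, 0x38 } },
	[PG_64K] = { .shift = 16, .penc = { -1, 0x1, 0x8  } },
	[PG_16M] = { .shift = 24, .penc = { -1,  -1, 0x0  } },
};

/* Same matching logic as __hpte_actual_psize() above. */
static int hpte_actual_psize(unsigned int lp, int base)
{
	for (int i = 1; i < PG_COUNT; i++) {	/* skip 4K, as in the patch */
		int shift, mask;

		if (defs[base].penc[i] == -1)
			continue;
		shift = defs[i].shift - LP_SHIFT;
		if (shift > LP_BITS)
			shift = LP_BITS;
		mask = (1 << shift) - 1;
		if ((lp & mask) == defs[base].penc[i])
			return i;
	}
	return -1;
}

int main(void)
{
	/* LP ending in 0b0001: a 64K actual page in a 64K base segment. */
	printf("%d\n", hpte_actual_psize(0x01, PG_64K));	/* PG_64K */
	/* LP of 0x08: a 16M (MPSS) actual page in a 64K base segment. */
	printf("%d\n", hpte_actual_psize(0x08, PG_64K));	/* PG_16M */
	return 0;
}
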
> +
>  static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
>  					     unsigned long pte_index)
>  {
> -	unsigned long rb, va_low;
> +	int b_size, a_size;
> +	unsigned int penc;
> +	unsigned long rb = 0, va_low, sllp;
> +	unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
> +
> +	if (!(v & HPTE_V_LARGE)) {
> +		/* both base and actual psize are 4k */
> +		b_size = MMU_PAGE_4K;
> +		a_size = MMU_PAGE_4K;
> +	} else {
> +		for (b_size = 0; b_size < MMU_PAGE_COUNT; b_size++) {
> +
> +			/* valid entries have a shift value */
> +			if (!mmu_psize_defs[b_size].shift)
> +				continue;
>
> +			a_size = __hpte_actual_psize(lp, b_size);
> +			if (a_size != -1)
> +				break;
> +		}
> +	}
> +	/*
> +	 * Ignore the top 14 bits of the va.
> +	 * v has the top two bits covering segment size, hence move
> +	 * by 16 bits. Also clear the lower HPTE_V_AVPN_SHIFT (7) bits.
> +	 * The AVA field in v also has the lower 23 bits ignored.
> +	 * For base page size 4K we need bits 14..65 (so we need to
> +	 * collect an extra 11 bits);
> +	 * for others we need bits 14..14+i.
> +	 */
> +	/* This covers bits 14..54 of the va */
>  	rb = (v & ~0x7fUL) << 16;		/* AVA field */
> +	/*
> +	 * The AVA in v has the lower 23 bits cleared; we need to derive
> +	 * those from the pteg index.
> +	 */
>  	va_low = pte_index >> 3;
>  	if (v & HPTE_V_SECONDARY)
>  		va_low = ~va_low;
> -	/* xor vsid from AVA */
> +	/*
> +	 * Get the vpn bits from va_low by reversing the hashing.
> +	 * In v we have the va with the lower 23 bits dropped and then
> +	 * left shifted by HPTE_V_AVPN_SHIFT (7) bits. To recover the
> +	 * vsid bits we need to right shift it by (SID_SHIFT - (23 - 7)).
> +	 */
>  	if (!(v & HPTE_V_1TB_SEG))
> -		va_low ^= v >> 12;
> +		va_low ^= v >> (SID_SHIFT - 16);
>  	else
> -		va_low ^= v >> 24;
> +		va_low ^= v >> (SID_SHIFT_1T - 16);
>  	va_low &= 0x7ff;
> -	if (v & HPTE_V_LARGE) {
> -		rb |= 1;			/* L field */
> -		if (cpu_has_feature(CPU_FTR_ARCH_206) &&
> -		    (r & 0xff000)) {
> -			/* non-16MB large page, must be 64k */
> -			/* (masks depend on page size) */
> -			rb |= 0x1000;		/* page encoding in LP field */
> -			rb |= (va_low & 0x7f) << 16; /* 7b of VA in AVA/LP field */
> -			rb |= ((va_low << 4) & 0xf0);	/* AVAL field (P7 doesn't seem to care) */
> -		}
> -	} else {
> -		/* 4kB page */
> -		rb |= (va_low & 0x7ff) << 12;	/* remaining 11b of VA */
> +
> +	switch (b_size) {
> +	case MMU_PAGE_4K:
> +		sllp = ((mmu_psize_defs[a_size].sllp & SLB_VSID_L) >> 6) |
> +			((mmu_psize_defs[a_size].sllp & SLB_VSID_LP) >> 4);
> +		rb |= sllp << 5;	/*  AP field */
> +		rb |= (va_low & 0x7ff) << 12;	/* remaining 11 bits of AVA */
> +		break;
> +	default:
> +	{
> +		int aval_shift;
> +		/*
> +		 * Remaining 7 bits of the AVA/LP field;
> +		 * this also contains the rr bits of LP.
> +		 */
> +		rb |= (va_low & 0x7f) << 16;
> +		/*
> +		 * Now clear the LP bits that are not needed for the actual psize.
> +		 */
> +		rb &= ~((1ul << mmu_psize_defs[a_size].shift) - 1);
> +		/*
> +		 * AVAL field: bits 58..(77 - base_page_shift) of the va.
> +		 * We only have space for bits 58..64, so the missing bits
> +		 * should be zero filled. The +1 takes care of the L bit
> +		 * shift.
> +		 */
> +		aval_shift = 64 - (77 - mmu_psize_defs[b_size].shift) + 1;
> +		rb |= ((va_low << aval_shift) & 0xfe);
> +
> +		rb |= 1;		/* L field */
> +		penc = mmu_psize_defs[b_size].penc[a_size];
> +		rb |= penc << 12;	/* LP field */
> +		break;
> +	}
>  	}
>  	rb |= (v >> 54) & 0x300;		/* B field */
>  	return rb;
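
Two quick checks on the arithmetic above, for the record: with SID_SHIFT = 28
and SID_SHIFT_1T = 40, the new (SID_SHIFT - 16) and (SID_SHIFT_1T - 16) shifts
come out to 12 and 24, i.e. the same constants the old code used. Similarly,
for a 64K base page size aval_shift = 64 - (77 - 16) + 1 = 4, so the AVAL term
is (va_low << 4) & 0xfe, which is equivalent to the old (va_low << 4) & 0xf0
(bits 1..3 of the shifted value are zero, and bit 0 is the L bit set
separately); for a 16MB base page aval_shift = 64 - (77 - 24) + 1 = 12 and the
masked term is zero, matching the old code, which added no AVAL bits for 16MB
pages.
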
> @@ -112,14 +200,26 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
>
>  static inline unsigned long hpte_page_size(unsigned long h, unsigned long l)
>  {
> +	int size, a_size;
> +	/* Look at the 8 bit LP value */
> +	unsigned int lp = (l >> LP_SHIFT) & ((1 << LP_BITS) - 1);
> +
>  	/* only handle 4k, 64k and 16M pages for now */
>  	if (!(h & HPTE_V_LARGE))
> -		return 1ul << 12;		/* 4k page */
> -	if ((l & 0xf000) == 0x1000 && cpu_has_feature(CPU_FTR_ARCH_206))
> -		return 1ul << 16;		/* 64k page */
> -	if ((l & 0xff000) == 0)
> -		return 1ul << 24;		/* 16M page */
> -	return 0;				/* error */
> +		return 1ul << 12;
> +	else {
> +		for (size = 0; size < MMU_PAGE_COUNT; size++) {
> +			/* valid entries have a shift value */
> +			if (!mmu_psize_defs[size].shift)
> +				continue;
> +
> +			a_size = __hpte_actual_psize(lp, size);
> +			if (a_size != -1)
> +				return 1ul << mmu_psize_defs[a_size].shift;
> +		}
> +
> +	}
> +	return 0;
>  }
>
>  static inline unsigned long hpte_rpn(unsigned long ptel, unsigned long psize)
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 8227dba5af0f..a38d3289320a 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -1949,6 +1949,13 @@ static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
>  	 * support pte_enc here
>  	 */
>  	(*sps)->enc[0].pte_enc = def->penc[linux_psize];
> +	/*
> +	 * Add 16MB MPSS support
> +	 */
> +	if (linux_psize != MMU_PAGE_16M) {
> +		(*sps)->enc[1].page_shift = 24;
> +		(*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
> +	}

We ideally want to do this only when the guest memory is backed by
hugetlbfs. I was thinking QEMU should ensure that, but I am not sure the
existing QEMU works that way. So we may want to look at how to enable
MPSS.

-aneesh


