[PATCH V4 3/5] kvm/ppc/book3s_hv: Implement halt polling in the kvm_hv kernel module

Tue Jul 19 18:38:34 AEST 2016

On 07/19/2016 10:12 AM, Suraj Jitindar Singh wrote:
> This patch introduces new halt polling functionality into the kvm_hv kernel
> module. When a vcore is idle it will poll for some period of time before
> scheduling itself out.

Some wording on why you cannot use the common halt polling code from the
generic kvm module might be useful.
> 
> When all of the runnable vcpus on a vcore have ceded (and thus the vcore is
> idle) we schedule ourselves out to allow something else to run. In the
> event that we need to wake up very quickly (for example an interrupt
> arrives), we are required to wait until we get scheduled again.
> 
> Implement halt polling so that when a vcore is idle, and before scheduling
> ourselves out, we poll for vcpus in the runnable_threads list which have
> pending exceptions or which leave the ceded state. If we poll successfully
> then we can get back into the guest very quickly without ever scheduling
> ourselves out; otherwise we schedule ourselves out as before.
> 
> Testing of this patch with a TCP round robin test between two guests with
> virtio network interfaces has found a decrease in round trip time of ~15us
> on average. A performance gain is only seen when going out of and
> back into the guest often and quickly, otherwise there is no net benefit
> from the polling. The polling interval is adjusted such that when we are
> often scheduled out for long periods of time it is reduced, and when we
> often poll successfully it is increased. The rate at which the polling
> interval increases or decreases, and the maximum polling interval, can
> be set through module parameters.
> 
> Based on the implementation in the generic kvm module by Wanpeng Li and
> Paolo Bonzini, and on direction from Paul Mackerras.
> 
> Signed-off-by: Suraj Jitindar Singh <sjitindarsingh at gmail.com>
> ---
>  arch/powerpc/include/asm/kvm_book3s.h |   1 +
>  arch/powerpc/include/asm/kvm_host.h   |   1 +
>  arch/powerpc/kvm/book3s_hv.c          | 116 ++++++++++++++++++++++++++++++----
>  arch/powerpc/kvm/trace_hv.h           |  22 +++++++
>  4 files changed, 126 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index 151f817..c261f52 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -102,6 +102,7 @@ struct kvmppc_vcore {
>  	ulong pcr;
>  	ulong dpdes;		/* doorbell state (POWER8) */
>  	ulong conferring_threads;
> +	unsigned int halt_poll_ns;
>  };
> 
>  struct kvmppc_vcpu_book3s {
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index 02d06e9..610f393 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -294,6 +294,7 @@ struct kvm_arch {
>  #define VCORE_SLEEPING	3
>  #define VCORE_RUNNING	4
>  #define VCORE_EXITING	5
> +#define VCORE_POLLING	6
> 
>  /*
>   * Struct used to manage memory for a virtual processor area
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 3bcf9e6..a9de1d4 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -94,6 +94,23 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
>  MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
>  #endif
> 
> +/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */
> +static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT;
> +module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR);
> +MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns");
> +
> +/* Factor by which the vcore halt poll interval is grown, default is to double
> + */
> +static unsigned int halt_poll_ns_grow = 2;
> +module_param(halt_poll_ns_grow, int, S_IRUGO);
> +MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by");
> +
> +/* Factor by which the vcore halt poll interval is shrunk, default is to reset
> + */
> +static unsigned int halt_poll_ns_shrink;
> +module_param(halt_poll_ns_shrink, int, S_IRUGO);
> +MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by");
> +
>  static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
>  static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
> 
> @@ -2620,32 +2637,82 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
>  	finish_wait(&vcpu->arch.cpu_run, &wait);
>  }
> 
> +static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
> +{
> +	/* 10us base */
> +	if (vc->halt_poll_ns == 0 && halt_poll_ns_grow)
> +		vc->halt_poll_ns = 10000;
> +	else
> +		vc->halt_poll_ns *= halt_poll_ns_grow;
> +
> +	if (vc->halt_poll_ns > halt_poll_max_ns)
> +		vc->halt_poll_ns = halt_poll_max_ns;
> +}
> +
> +static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
> +{
> +	if (halt_poll_ns_shrink == 0)
> +		vc->halt_poll_ns = 0;
> +	else
> +		vc->halt_poll_ns /= halt_poll_ns_shrink;
> +}
> +
> +/* Check to see if any of the runnable vcpus on the vcore have pending
> + * exceptions or are no longer ceded
> + */
> +static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
> +{
> +	struct kvm_vcpu *vcpu;
> +	int i;
> +
> +	for_each_runnable_thread(i, vcpu, vc) {
> +		if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded)
> +			return 1;
> +	}
> +
> +	return 0;
> +}
> +
>  /*
>   * All the vcpus in this vcore are idle, so wait for a decrementer
>   * or external interrupt to one of the vcpus.  vc->lock is held.
>   */
>  static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
>  {
> -	struct kvm_vcpu *vcpu;
> -	int do_sleep = 1, i;
> +	int do_sleep = 1;
> +	ktime_t cur, start;
> +	u64 block_ns;
>  	DECLARE_SWAITQUEUE(wait);
> 
> -	prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
> +	/* Poll for pending exceptions and ceded state */
> +	cur = start = ktime_get();
> +	if (vc->halt_poll_ns) {
> +		ktime_t stop = ktime_add_ns(start, vc->halt_poll_ns);
> 
> -	/*
> -	 * Check one last time for pending exceptions and ceded state after
> -	 * we put ourselves on the wait queue
> -	 */
> -	for_each_runnable_thread(i, vcpu, vc) {
> -		if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) {
> -			do_sleep = 0;
> -			break;
> -		}
> +		vc->vcore_state = VCORE_POLLING;
> +		spin_unlock(&vc->lock);
> +
> +		do {
> +			if (kvmppc_vcore_check_block(vc)) {
> +				do_sleep = 0;
> +				break;
> +			}
> +			cur = ktime_get();
> +		} while (ktime_before(cur, stop));
> +
> +		spin_lock(&vc->lock);
> +		vc->vcore_state = VCORE_INACTIVE;
> +
> +		if (!do_sleep)
> +			goto out;
>  	}
> 
> -	if (!do_sleep) {
> +	prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
> +
> +	if (kvmppc_vcore_check_block(vc)) {
>  		finish_swait(&vc->wq, &wait);
> -		return;
> +		do_sleep = 0;
> +		goto out;
>  	}
> 
>  	vc->vcore_state = VCORE_SLEEPING;
> @@ -2656,6 +2723,27 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
>  	spin_lock(&vc->lock);
>  	vc->vcore_state = VCORE_INACTIVE;
>  	trace_kvmppc_vcore_blocked(vc, 1);
> +
> +	cur = ktime_get();
> +
> +out:
> +	block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
> +
> +	/* Adjust poll time */
> +	if (halt_poll_max_ns) {
> +		if (block_ns <= vc->halt_poll_ns)
> +			;
> +		/* We slept and blocked for longer than the max halt time */
> +		else if (vc->halt_poll_ns && block_ns > halt_poll_max_ns)
> +			shrink_halt_poll_ns(vc);
> +		/* We slept and our poll time is too small */
> +		else if (vc->halt_poll_ns < halt_poll_max_ns &&
> +				block_ns < halt_poll_max_ns)
> +			grow_halt_poll_ns(vc);
> +	} else
> +		vc->halt_poll_ns = 0;
> +
> +	trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
>  }
> 
>  static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
> diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h
> index 33d9daf..fb21990 100644
> --- a/arch/powerpc/kvm/trace_hv.h
> +++ b/arch/powerpc/kvm/trace_hv.h
> @@ -432,6 +432,28 @@ TRACE_EVENT(kvmppc_vcore_blocked,
>  		   __entry->runner_vcpu, __entry->n_runnable, __entry->tgid)
>  );
> 
> +TRACE_EVENT(kvmppc_vcore_wakeup,
> +	TP_PROTO(int do_sleep, __u64 ns),
> +
> +	TP_ARGS(do_sleep, ns),
> +
> +	TP_STRUCT__entry(
> +		__field(__u64,  ns)
> +		__field(int,    waited)
> +		__field(pid_t,  tgid)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->ns     = ns;
> +		__entry->waited = do_sleep;
> +		__entry->tgid   = current->tgid;
> +	),
> +
> +	TP_printk("%s time %lld ns, tgid=%d",
> +		__entry->waited ? "wait" : "poll",
> +		__entry->ns, __entry->tgid)
> +);
> +
>  TRACE_EVENT(kvmppc_run_vcpu_enter,
>  	TP_PROTO(struct kvm_vcpu *vcpu),
>