[PATCH V4 3/5] kvm/ppc/book3s_hv: Implement halt polling in the kvm_hv kernel module
Suraj Jitindar Singh
sjitindarsingh at gmail.com
Thu Jul 21 19:24:59 AEST 2016
On 20/07/16 04:58, David Matlack wrote:
> On Tue, Jul 19, 2016 at 1:12 AM, Suraj Jitindar Singh
> <sjitindarsingh at gmail.com> wrote:
>> This patch introduces halt polling into the kvm_hv kernel module. When a
>> vcore is idle it will poll for a period of time before scheduling itself
>> out.
>>
>> When all of the runnable vcpus on a vcore have ceded (and thus the vcore is
>> idle) we schedule ourselves out to allow something else to run. If we then
>> need to wake up quickly (for example because an interrupt arrives), we have
>> to wait until we are scheduled in again.
>>
>> Implement halt polling so that when a vcore is idle, and before scheduling
>> ourselves, we poll for vcpus in the runnable_threads list which have
>> pending exceptions or which leave the ceded state. If we poll successfully
>> then we can get back into the guest very quickly without ever scheduling
>> ourselves, otherwise we schedule ourselves out as before.
>>
>> Testing of this patch with a TCP round-robin test between two guests with
>> virtio network interfaces found a decrease in round-trip time of ~15us on
>> average. A performance gain is only seen when we go out of and back into
>> the guest often and quickly; otherwise there is no net benefit from the
>> polling. The polling interval is adjusted such that when we are
>> often scheduled out for long periods of time it is reduced, and when we
>> often poll successfully it is increased. The rate at which the polling
>> interval increases or decreases, and the maximum polling interval, can
>> be set through module parameters.
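[ Just a usage note, not part of the patch: these are ordinary module
parameters, so (assuming kvm_hv is built as a module rather than
built-in) they can be set at load time, and halt_poll_max_ns is also
root-writable at runtime, e.g. with example values:

	modprobe kvm_hv halt_poll_max_ns=200000 halt_poll_ns_grow=2

	echo 200000 > /sys/module/kvm_hv/parameters/halt_poll_max_ns

halt_poll_ns_grow and halt_poll_ns_shrink are read-only once loaded. ]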
>>
>> Based on the implementation in the generic kvm module by Wanpeng Li and
>> Paolo Bonzini, and on direction from Paul Mackerras.
>>
>> Signed-off-by: Suraj Jitindar Singh <sjitindarsingh at gmail.com>
>> ---
>> arch/powerpc/include/asm/kvm_book3s.h | 1 +
>> arch/powerpc/include/asm/kvm_host.h | 1 +
>> arch/powerpc/kvm/book3s_hv.c | 116 ++++++++++++++++++++++++++++++----
>> arch/powerpc/kvm/trace_hv.h | 22 +++++++
>> 4 files changed, 126 insertions(+), 14 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
>> index 151f817..c261f52 100644
>> --- a/arch/powerpc/include/asm/kvm_book3s.h
>> +++ b/arch/powerpc/include/asm/kvm_book3s.h
>> @@ -102,6 +102,7 @@ struct kvmppc_vcore {
>> ulong pcr;
>> ulong dpdes; /* doorbell state (POWER8) */
>> ulong conferring_threads;
>> + unsigned int halt_poll_ns;
>> };
>>
>> struct kvmppc_vcpu_book3s {
>> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
>> index 02d06e9..610f393 100644
>> --- a/arch/powerpc/include/asm/kvm_host.h
>> +++ b/arch/powerpc/include/asm/kvm_host.h
>> @@ -294,6 +294,7 @@ struct kvm_arch {
>> #define VCORE_SLEEPING 3
>> #define VCORE_RUNNING 4
>> #define VCORE_EXITING 5
>> +#define VCORE_POLLING 6
>>
>> /*
>> * Struct used to manage memory for a virtual processor area
>> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
>> index 3bcf9e6..a9de1d4 100644
>> --- a/arch/powerpc/kvm/book3s_hv.c
>> +++ b/arch/powerpc/kvm/book3s_hv.c
>> @@ -94,6 +94,23 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
>> MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
>> #endif
>>
>> +/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */
>> +static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT;
>> +module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR);
>> +MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns");
>> +
>> +/* Factor by which the vcore halt poll interval is grown, default is to double
>> + */
>> +static unsigned int halt_poll_ns_grow = 2;
>> +module_param(halt_poll_ns_grow, uint, S_IRUGO);
>> +MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by");
>> +
>> +/* Factor by which the vcore halt poll interval is shrunk, default is to reset
>> + */
>> +static unsigned int halt_poll_ns_shrink;
>> +module_param(halt_poll_ns_shrink, uint, S_IRUGO);
>> +MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by");
>> +
>> static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
>> static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
>>
>> @@ -2620,32 +2637,82 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
>> finish_wait(&vcpu->arch.cpu_run, &wait);
>> }
>>
>> +static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
>> +{
>> + /* 10us base */
>> + if (vc->halt_poll_ns == 0 && halt_poll_ns_grow)
>> + vc->halt_poll_ns = 10000;
>> + else
>> + vc->halt_poll_ns *= halt_poll_ns_grow;
>> +
>> + if (vc->halt_poll_ns > halt_poll_max_ns)
>> + vc->halt_poll_ns = halt_poll_max_ns;
>> +}
>> +
>> +static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
>> +{
>> + if (halt_poll_ns_shrink == 0)
>> + vc->halt_poll_ns = 0;
>> + else
>> + vc->halt_poll_ns /= halt_poll_ns_shrink;
>> +}
>> +
>> +/* Check to see if any of the runnable vcpus on the vcore have pending
>> + * exceptions or are no longer ceded
>> + */
>> +static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
>> +{
>> + struct kvm_vcpu *vcpu;
>> + int i;
>> +
>> + for_each_runnable_thread(i, vcpu, vc) {
>> + if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded)
>> + return 1;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> /*
>> * All the vcpus in this vcore are idle, so wait for a decrementer
>> * or external interrupt to one of the vcpus. vc->lock is held.
>> */
>> static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
>> {
>> - struct kvm_vcpu *vcpu;
>> - int do_sleep = 1, i;
>> + int do_sleep = 1;
>> + ktime_t cur, start;
>> + u64 block_ns;
>> DECLARE_SWAITQUEUE(wait);
>>
>> - prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
>> + /* Poll for pending exceptions and ceded state */
>> + cur = start = ktime_get();
>> + if (vc->halt_poll_ns) {
>> + ktime_t stop = ktime_add_ns(start, vc->halt_poll_ns);
>>
>> - /*
>> - * Check one last time for pending exceptions and ceded state after
>> - * we put ourselves on the wait queue
>> - */
>> - for_each_runnable_thread(i, vcpu, vc) {
>> - if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) {
>> - do_sleep = 0;
>> - break;
>> - }
>> + vc->vcore_state = VCORE_POLLING;
>> + spin_unlock(&vc->lock);
>> +
>> + do {
>> + if (kvmppc_vcore_check_block(vc)) {
>> + do_sleep = 0;
>> + break;
>> + }
>> + cur = ktime_get();
>> + } while (ktime_before(cur, stop));
> The generic halt-polling implementation only polls if there are no
> other threads waiting to run (single_task_running() == 1). Should/Can
> we do the same here?
Thanks for catching that; yes, I think we should do the same here.
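Something like the following should do it, I think (just a sketch of the
polling loop, mirroring what the generic implementation in
virt/kvm/kvm_main.c does; single_task_running() is declared in
linux/sched.h):

	/* Poll, but only while there is nothing else to run on this cpu */
	do {
		if (kvmppc_vcore_check_block(vc)) {
			do_sleep = 0;
			break;
		}
		cur = ktime_get();
	} while (single_task_running() && ktime_before(cur, stop));

That way we stop polling and go to sleep as soon as another task becomes
runnable on this cpu instead of burning the rest of the poll interval.
Will fix in the next revision.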
>
>> +
>> + spin_lock(&vc->lock);
>> + vc->vcore_state = VCORE_INACTIVE;
>> +
>> + if (!do_sleep)
>> + goto out;
>> }
>>
>> - if (!do_sleep) {
>> + prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
>> +
>> + if (kvmppc_vcore_check_block(vc)) {
>> finish_swait(&vc->wq, &wait);
>> - return;
>> + do_sleep = 0;
>> + goto out;
>> }
>>
>> vc->vcore_state = VCORE_SLEEPING;
>> @@ -2656,6 +2723,27 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
>> spin_lock(&vc->lock);
>> vc->vcore_state = VCORE_INACTIVE;
>> trace_kvmppc_vcore_blocked(vc, 1);
>> +
>> + cur = ktime_get();
>> +
>> +out:
>> + block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
>> +
>> + /* Adjust poll time */
>> + if (halt_poll_max_ns) {
>> + if (block_ns <= vc->halt_poll_ns)
>> + ;
>> + /* We slept and blocked for longer than the max halt time */
>> + else if (vc->halt_poll_ns && block_ns > halt_poll_max_ns)
>> + shrink_halt_poll_ns(vc);
>> + /* We slept and our poll time is too small */
>> + else if (vc->halt_poll_ns < halt_poll_max_ns &&
>> + block_ns < halt_poll_max_ns)
>> + grow_halt_poll_ns(vc);
>> + } else
>> + vc->halt_poll_ns = 0;
>> +
>> + trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
>> }
>>
>> static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
>> diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h
>> index 33d9daf..fb21990 100644
>> --- a/arch/powerpc/kvm/trace_hv.h
>> +++ b/arch/powerpc/kvm/trace_hv.h
>> @@ -432,6 +432,28 @@ TRACE_EVENT(kvmppc_vcore_blocked,
>> __entry->runner_vcpu, __entry->n_runnable, __entry->tgid)
>> );
>>
>> +TRACE_EVENT(kvmppc_vcore_wakeup,
>> + TP_PROTO(int do_sleep, __u64 ns),
>> +
>> + TP_ARGS(do_sleep, ns),
>> +
>> + TP_STRUCT__entry(
>> + __field(__u64, ns)
>> + __field(int, waited)
>> + __field(pid_t, tgid)
>> + ),
>> +
>> + TP_fast_assign(
>> + __entry->ns = ns;
>> + __entry->waited = do_sleep;
>> + __entry->tgid = current->tgid;
>> + ),
>> +
>> + TP_printk("%s time %lld ns, tgid=%d",
>> + __entry->waited ? "wait" : "poll",
>> + __entry->ns, __entry->tgid)
>> +);
>> +
>> TRACE_EVENT(kvmppc_run_vcpu_enter,
>> TP_PROTO(struct kvm_vcpu *vcpu),
>>
>> --
>> 2.5.5
>>