[PATCH v4 06/17] KVM: PPC: Book3S HV: XIVE: add controls for the EQ configuration
Cédric Le Goater
clg at kaod.org
Thu Mar 21 19:48:39 AEDT 2019
On 3/21/19 12:09 AM, David Gibson wrote:
> On Wed, Mar 20, 2019 at 09:37:40AM +0100, Cédric Le Goater wrote:
>> These controls will be used by the H_INT_SET_QUEUE_CONFIG and
>> H_INT_GET_QUEUE_CONFIG hcalls from QEMU to configure the underlying
>> Event Queue in the XIVE IC. They will also be used to restore the
>> configuration of the XIVE EQs and to capture the internal run-time
>> state of the EQs. Both 'get' and 'set' rely on an OPAL call to access
>> the EQ toggle bit and EQ index which are updated by the XIVE IC when
>> event notifications are enqueued in the EQ.
>>
>> The value of the guest physical address of the event queue is saved in
>> the XIVE internal xive_q structure for later use. That is when
>> migration needs to mark the EQ pages dirty to capture a consistent
>> memory state of the VM.
>>
>> To be noted that H_INT_SET_QUEUE_CONFIG does not require the extra
>> OPAL call setting the EQ toggle bit and EQ index to configure the EQ,
>> but restoring the EQ state will.
>>
>> Signed-off-by: Cédric Le Goater <clg at kaod.org>
>> ---
>>
>> Changes since v3 :
>>
>>  - fix the test on the initial setting of the EQ toggle bit : 0 -> 1
>> - renamed qsize to qshift
>> - renamed qpage to qaddr
>> - checked host page size
>> - limited flags to KVM_XIVE_EQ_ALWAYS_NOTIFY to fit sPAPR specs
>>
>> Changes since v2 :
>>
>> - fixed comments on the KVM device attribute definitions
>> - fixed check on supported EQ size to restrict to 64K pages
>> - checked kvm_eq.flags that need to be zero
>> - removed the OPAL call when EQ qtoggle bit and index are zero.
>>
>> arch/powerpc/include/asm/xive.h | 2 +
>> arch/powerpc/include/uapi/asm/kvm.h | 19 ++
>> arch/powerpc/kvm/book3s_xive.h | 2 +
>> arch/powerpc/kvm/book3s_xive.c | 15 +-
>> arch/powerpc/kvm/book3s_xive_native.c | 242 +++++++++++++++++++++
>> Documentation/virtual/kvm/devices/xive.txt | 34 +++
>> 6 files changed, 308 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
>> index b579a943407b..c4e88abd3b67 100644
>> --- a/arch/powerpc/include/asm/xive.h
>> +++ b/arch/powerpc/include/asm/xive.h
>> @@ -73,6 +73,8 @@ struct xive_q {
>> u32 esc_irq;
>> atomic_t count;
>> atomic_t pending_count;
>> + u64 guest_qaddr;
>> + u32 guest_qshift;
>> };
>>
>> /* Global enable flags for the XIVE support */
>> diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
>> index e8161e21629b..85005400fd86 100644
>> --- a/arch/powerpc/include/uapi/asm/kvm.h
>> +++ b/arch/powerpc/include/uapi/asm/kvm.h
>> @@ -681,6 +681,7 @@ struct kvm_ppc_cpu_char {
>> #define KVM_DEV_XIVE_GRP_CTRL 1
>> #define KVM_DEV_XIVE_GRP_SOURCE 2 /* 64-bit source identifier */
>> #define KVM_DEV_XIVE_GRP_SOURCE_CONFIG 3 /* 64-bit source identifier */
>> +#define KVM_DEV_XIVE_GRP_EQ_CONFIG 4 /* 64-bit EQ identifier */
>>
>> /* Layout of 64-bit XIVE source attribute values */
>> #define KVM_XIVE_LEVEL_SENSITIVE (1ULL << 0)
>> @@ -696,4 +697,22 @@ struct kvm_ppc_cpu_char {
>> #define KVM_XIVE_SOURCE_EISN_SHIFT 33
>> #define KVM_XIVE_SOURCE_EISN_MASK 0xfffffffe00000000ULL
>>
>> +/* Layout of 64-bit EQ identifier */
>> +#define KVM_XIVE_EQ_PRIORITY_SHIFT 0
>> +#define KVM_XIVE_EQ_PRIORITY_MASK 0x7
>> +#define KVM_XIVE_EQ_SERVER_SHIFT 3
>> +#define KVM_XIVE_EQ_SERVER_MASK 0xfffffff8ULL
>> +
>> +/* Layout of EQ configuration values (64 bytes) */
>> +struct kvm_ppc_xive_eq {
>> + __u32 flags;
>> + __u32 qshift;
>> + __u64 qaddr;
>> + __u32 qtoggle;
>> + __u32 qindex;
>> + __u8 pad[40];
>> +};
>> +
>> +#define KVM_XIVE_EQ_ALWAYS_NOTIFY 0x00000001
>> +
>> #endif /* __LINUX_KVM_POWERPC_H */
>> diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
>> index ae26fe653d98..622f594d93e1 100644
>> --- a/arch/powerpc/kvm/book3s_xive.h
>> +++ b/arch/powerpc/kvm/book3s_xive.h
>> @@ -272,6 +272,8 @@ struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
>> struct kvmppc_xive *xive, int irq);
>> void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb);
>> int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio);
>> +int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
>> + bool single_escalation);
>>
>> #endif /* CONFIG_KVM_XICS */
>> #endif /* _KVM_PPC_BOOK3S_XICS_H */
>> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
>> index e09f3addffe5..c1b7aa7dbc28 100644
>> --- a/arch/powerpc/kvm/book3s_xive.c
>> +++ b/arch/powerpc/kvm/book3s_xive.c
>> @@ -166,7 +166,8 @@ static irqreturn_t xive_esc_irq(int irq, void *data)
>> return IRQ_HANDLED;
>> }
>>
>> -static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
>> +int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
>> + bool single_escalation)
>> {
>> struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
>> struct xive_q *q = &xc->queues[prio];
>> @@ -185,7 +186,7 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
>> return -EIO;
>> }
>>
>> - if (xc->xive->single_escalation)
>> + if (single_escalation)
>> name = kasprintf(GFP_KERNEL, "kvm-%d-%d",
>> vcpu->kvm->arch.lpid, xc->server_num);
>> else
>> @@ -217,7 +218,7 @@ static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
>> * interrupt, thus leaving it effectively masked after
>> * it fires once.
>> */
>> - if (xc->xive->single_escalation) {
>> + if (single_escalation) {
>> struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]);
>> struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
>>
>> @@ -291,7 +292,8 @@ static int xive_check_provisioning(struct kvm *kvm, u8 prio)
>> continue;
>> rc = xive_provision_queue(vcpu, prio);
>> if (rc == 0 && !xive->single_escalation)
>> - xive_attach_escalation(vcpu, prio);
>> + kvmppc_xive_attach_escalation(vcpu, prio,
>> + xive->single_escalation);
>> if (rc)
>> return rc;
>> }
>> @@ -1214,7 +1216,8 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
>> if (xive->qmap & (1 << i)) {
>> r = xive_provision_queue(vcpu, i);
>> if (r == 0 && !xive->single_escalation)
>> - xive_attach_escalation(vcpu, i);
>> + kvmppc_xive_attach_escalation(
>> + vcpu, i, xive->single_escalation);
>> if (r)
>> goto bail;
>> } else {
>> @@ -1229,7 +1232,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
>> }
>>
>> /* If not done above, attach priority 0 escalation */
>> - r = xive_attach_escalation(vcpu, 0);
>> + r = kvmppc_xive_attach_escalation(vcpu, 0, xive->single_escalation);
>> if (r)
>> goto bail;
>>
>> diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
>> index 492825a35958..2c335454da72 100644
>> --- a/arch/powerpc/kvm/book3s_xive_native.c
>> +++ b/arch/powerpc/kvm/book3s_xive_native.c
>> @@ -335,6 +335,236 @@ static int kvmppc_xive_native_set_source_config(struct kvmppc_xive *xive,
>> priority, masked, eisn);
>> }
>>
>> +static int xive_native_validate_queue_size(u32 qshift)
>> +{
>> + /*
>> + * We only support 64K pages for the moment. This is also
>> + * advertised in the DT property "ibm,xive-eq-sizes"
>> + */
>> + switch (qshift) {
>> + case 0: /* EQ reset */
>> + case 16:
>> + return 0;
>> + case 12:
>> + case 21:
>> + case 24:
>> + default:
>> + return -EINVAL;
>
> Now that you're doing a proper test against the guest page size, it
> would be very easy to allow this to support things other than a 64kiB
> queue. That can be a followup change, though.
Supporting larger page sizes could be interesting for AIX or Linux. Talking
of which, we should start with QEMU.
>
>> + }
>> +}
>> +
>> +static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
>> + long eq_idx, u64 addr)
>> +{
>> + struct kvm *kvm = xive->kvm;
>> + struct kvm_vcpu *vcpu;
>> + struct kvmppc_xive_vcpu *xc;
>> + void __user *ubufp = (void __user *) addr;
>> + u32 server;
>> + u8 priority;
>> + struct kvm_ppc_xive_eq kvm_eq;
>> + int rc;
>> + __be32 *qaddr = 0;
>> + struct page *page;
>> + struct xive_q *q;
>> + gfn_t gfn;
>> + unsigned long page_size;
>> +
>> + /*
>> + * Demangle priority/server tuple from the EQ identifier
>> + */
>> + priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
>> + KVM_XIVE_EQ_PRIORITY_SHIFT;
>> + server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
>> + KVM_XIVE_EQ_SERVER_SHIFT;
>> +
>> + if (copy_from_user(&kvm_eq, ubufp, sizeof(kvm_eq)))
>> + return -EFAULT;
>> +
>> + vcpu = kvmppc_xive_find_server(kvm, server);
>> + if (!vcpu) {
>> + pr_err("Can't find server %d\n", server);
>> + return -ENOENT;
>> + }
>> + xc = vcpu->arch.xive_vcpu;
>> +
>> + if (priority != xive_prio_from_guest(priority)) {
>> + pr_err("Trying to restore invalid queue %d for VCPU %d\n",
>> + priority, server);
>> + return -EINVAL;
>> + }
>> + q = &xc->queues[priority];
>> +
>> + pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
>> + __func__, server, priority, kvm_eq.flags,
>> + kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);
>> +
>> + /*
>> +	 * sPAPR specifies an "Unconditional Notify (n) flag" for the
>> + * H_INT_SET_QUEUE_CONFIG hcall which forces notification
>> + * without using the coalescing mechanisms provided by the
>> + * XIVE END ESBs.
>> + */
>> + if (kvm_eq.flags & ~KVM_XIVE_EQ_ALWAYS_NOTIFY) {
>> + pr_err("invalid flags %d\n", kvm_eq.flags);
>> + return -EINVAL;
>> + }
>
> So this test prevents setting any (as yet undefined) flags other than
> EQ_ALWAYS_NOTIFY, which is good. However, AFAICT you never actually
> branch based on whether the EQ_ALWAYS_NOTIFY flag is set - you just
> assume that it is.
Yes because it is enforced by xive_native_configure_queue().
> You should either actually pass this flag through
> to the OPAL call, or have another test here which rejects the ioctl()
> if the flag is *not* set.
Indeed. Linux/PowerNV and KVM require this flag to be set, I should error
out if this is not the case. It is safer.
>> +
>> + rc = xive_native_validate_queue_size(kvm_eq.qshift);
>> + if (rc) {
>> + pr_err("invalid queue size %d\n", kvm_eq.qshift);
>> + return rc;
>> + }
>> +
>> + /* reset queue and disable queueing */
>> + if (!kvm_eq.qshift) {
>> + q->guest_qaddr = 0;
>> + q->guest_qshift = 0;
>> +
>> + rc = xive_native_configure_queue(xc->vp_id, q, priority,
>> + NULL, 0, true);
>> + if (rc) {
>> + pr_err("Failed to reset queue %d for VCPU %d: %d\n",
>> + priority, xc->server_num, rc);
>> + return rc;
>> + }
>> +
>> + if (q->qpage) {
>> + put_page(virt_to_page(q->qpage));
>> + q->qpage = NULL;
>> + }
>> +
>> + return 0;
>> + }
>> +
>> + gfn = gpa_to_gfn(kvm_eq.qaddr);
>> + page = gfn_to_page(kvm, gfn);
>> + if (is_error_page(page)) {
>> + pr_warn("Couldn't get guest page for %llx!\n", kvm_eq.qaddr);
>> + return -EINVAL;
>> + }
>> +
>> + page_size = kvm_host_page_size(kvm, gfn);
>> + if (1ull << kvm_eq.qshift > page_size) {
>> + pr_warn("Incompatible host page size %lx!\n", page_size);
>> + return -EINVAL;
>> + }
>
> Sorry.. I should have thought of this earlier. If qaddr isn't aligned
> to a (page_size) boundary, this test isn't sufficient - you need to
> make sure the whole thing fits within the host page, even if it's
> offset.
>
> Alternatively, you could check that qaddr is naturally aligned for the
> requested qshift. Possibly that's necessary for the hardware anyway.
Indeed again, it should be "naturally aligned". I will add an extra test
for this. QEMU will need it also btw.
Thanks,
C.
>> +
>> + qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK);
>> +
>> + /*
>> +	 * Back up the queue page guest address; it is needed to mark
>> +	 * the EQ page dirty for migration.
>> + */
>> + q->guest_qaddr = kvm_eq.qaddr;
>> + q->guest_qshift = kvm_eq.qshift;
>> +
>> + /*
>> + * Unconditional Notification is forced by default at the
>> + * OPAL level because the use of END ESBs is not supported by
>> + * Linux.
>> + */
>> + rc = xive_native_configure_queue(xc->vp_id, q, priority,
>> + (__be32 *) qaddr, kvm_eq.qshift, true);
>> + if (rc) {
>> + pr_err("Failed to configure queue %d for VCPU %d: %d\n",
>> + priority, xc->server_num, rc);
>> + put_page(page);
>> + return rc;
>> + }
>> +
>> + /*
>> + * Only restore the queue state when needed. When doing the
>> + * H_INT_SET_SOURCE_CONFIG hcall, it should not.
>> + */
>> + if (kvm_eq.qtoggle != 1 || kvm_eq.qindex != 0) {
>> + rc = xive_native_set_queue_state(xc->vp_id, priority,
>> + kvm_eq.qtoggle,
>> + kvm_eq.qindex);
>> + if (rc)
>> + goto error;
>> + }
>> +
>> + rc = kvmppc_xive_attach_escalation(vcpu, priority,
>> + xive->single_escalation);
>> +error:
>> + if (rc)
>> + kvmppc_xive_native_cleanup_queue(vcpu, priority);
>> + return rc;
>> +}
>> +
>> +static int kvmppc_xive_native_get_queue_config(struct kvmppc_xive *xive,
>> + long eq_idx, u64 addr)
>> +{
>> + struct kvm *kvm = xive->kvm;
>> + struct kvm_vcpu *vcpu;
>> + struct kvmppc_xive_vcpu *xc;
>> + struct xive_q *q;
>> + void __user *ubufp = (u64 __user *) addr;
>> + u32 server;
>> + u8 priority;
>> + struct kvm_ppc_xive_eq kvm_eq;
>> + u64 qaddr;
>> + u64 qshift;
>> + u64 qeoi_page;
>> + u32 escalate_irq;
>> + u64 qflags;
>> + int rc;
>> +
>> + /*
>> + * Demangle priority/server tuple from the EQ identifier
>> + */
>> + priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
>> + KVM_XIVE_EQ_PRIORITY_SHIFT;
>> + server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
>> + KVM_XIVE_EQ_SERVER_SHIFT;
>> +
>> + vcpu = kvmppc_xive_find_server(kvm, server);
>> + if (!vcpu) {
>> + pr_err("Can't find server %d\n", server);
>> + return -ENOENT;
>> + }
>> + xc = vcpu->arch.xive_vcpu;
>> +
>> + if (priority != xive_prio_from_guest(priority)) {
>> + pr_err("invalid priority for queue %d for VCPU %d\n",
>> + priority, server);
>> + return -EINVAL;
>> + }
>> + q = &xc->queues[priority];
>> +
>> + memset(&kvm_eq, 0, sizeof(kvm_eq));
>> +
>> + if (!q->qpage)
>> + return 0;
>> +
>> + rc = xive_native_get_queue_info(xc->vp_id, priority, &qaddr, &qshift,
>> + &qeoi_page, &escalate_irq, &qflags);
>> + if (rc)
>> + return rc;
>> +
>> + kvm_eq.flags = 0;
>> + if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY)
>> + kvm_eq.flags |= KVM_XIVE_EQ_ALWAYS_NOTIFY;
>> +
>> + kvm_eq.qshift = q->guest_qshift;
>> + kvm_eq.qaddr = q->guest_qaddr;
>> +
>> + rc = xive_native_get_queue_state(xc->vp_id, priority, &kvm_eq.qtoggle,
>> + &kvm_eq.qindex);
>> + if (rc)
>> + return rc;
>> +
>> + pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
>> + __func__, server, priority, kvm_eq.flags,
>> + kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);
>> +
>> + if (copy_to_user(ubufp, &kvm_eq, sizeof(kvm_eq)))
>> + return -EFAULT;
>> +
>> + return 0;
>> +}
>> +
>> static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
>> struct kvm_device_attr *attr)
>> {
>> @@ -349,6 +579,9 @@ static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
>> case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
>> return kvmppc_xive_native_set_source_config(xive, attr->attr,
>> attr->addr);
>> + case KVM_DEV_XIVE_GRP_EQ_CONFIG:
>> + return kvmppc_xive_native_set_queue_config(xive, attr->attr,
>> + attr->addr);
>> }
>> return -ENXIO;
>> }
>> @@ -356,6 +589,13 @@ static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
>> static int kvmppc_xive_native_get_attr(struct kvm_device *dev,
>> struct kvm_device_attr *attr)
>> {
>> + struct kvmppc_xive *xive = dev->private;
>> +
>> + switch (attr->group) {
>> + case KVM_DEV_XIVE_GRP_EQ_CONFIG:
>> + return kvmppc_xive_native_get_queue_config(xive, attr->attr,
>> + attr->addr);
>> + }
>> return -ENXIO;
>> }
>>
>> @@ -371,6 +611,8 @@ static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
>> attr->attr < KVMPPC_XIVE_NR_IRQS)
>> return 0;
>> break;
>> + case KVM_DEV_XIVE_GRP_EQ_CONFIG:
>> + return 0;
>> }
>> return -ENXIO;
>> }
>> diff --git a/Documentation/virtual/kvm/devices/xive.txt b/Documentation/virtual/kvm/devices/xive.txt
>> index 33c64b2cdbe8..8bb5e6e6923a 100644
>> --- a/Documentation/virtual/kvm/devices/xive.txt
>> +++ b/Documentation/virtual/kvm/devices/xive.txt
>> @@ -53,3 +53,37 @@ the legacy interrupt mode, referred as XICS (POWER7/8).
>> -ENXIO: CPU event queues not configured or configuration of the
>> underlying HW interrupt failed
>> -EBUSY: No CPU available to serve interrupt
>> +
>> + 4. KVM_DEV_XIVE_GRP_EQ_CONFIG (read-write)
>> + Configures an event queue of a CPU
>> + Attributes:
>> + EQ descriptor identifier (64-bit)
>> + The EQ descriptor identifier is a tuple (server, priority) :
>> + bits: | 63 .... 32 | 31 .. 3 | 2 .. 0
>> + values: | unused | server | priority
>> + The kvm_device_attr.addr points to :
>> + struct kvm_ppc_xive_eq {
>> + __u32 flags;
>> + __u32 qshift;
>> + __u64 qaddr;
>> + __u32 qtoggle;
>> + __u32 qindex;
>> + __u8 pad[40];
>> + };
>> + - flags: queue flags
>> + KVM_XIVE_EQ_ALWAYS_NOTIFY
>> + forces notification without using the coalescing mechanism
>> + provided by the XIVE END ESBs.
>> + - qshift: queue size (power of 2)
>> + - qaddr: real address of queue
>> + - qtoggle: current queue toggle bit
>> + - qindex: current queue index
>> + - pad: reserved for future use
>> + Errors:
>> + -ENOENT: Invalid CPU number
>> + -EINVAL: Invalid priority
>> + -EINVAL: Invalid flags
>> + -EINVAL: Invalid queue size
>> + -EINVAL: Invalid queue address
>> + -EFAULT: Invalid user pointer for attr->addr.
>> + -EIO: Configuration of the underlying HW failed
>
More information about the Linuxppc-dev
mailing list