[PATCH 07/14] KVM: PPC: Book3S HV: Handle passthrough interrupts in guest

Suresh Warrier warrier at linux.vnet.ibm.com
Sat Feb 27 05:40:25 AEDT 2016


Currently, KVM switches back to the host to handle any external
interrupt that is received while running in the guest. This patch
updates real-mode KVM to check whether an interrupt was generated by
a passthrough adapter owned by this guest. If so, real-mode KVM
directly injects the corresponding virtual interrupt into the guest
VCPU's ICS and also EOIs the interrupt in hardware. In short, the
interrupt is handled entirely in real mode in the guest context,
without switching back to the host.
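
For reference, the core of the new real-mode check in
kvmppc_read_intr() reduces to the following sketch (condensed from
the diff below; helper names are the ones this series introduces):

	pimap = kvmppc_get_passthru_irqmap(vcpu);
	if (pimap) {
		irq_map = get_irqmap(pimap, xisr);
		if (irq_map)
			/* deliver to the vcpu's ICP and EOI in real mode */
			return kvmppc_deliver_irq_passthru(vcpu, xirr,
							   irq_map, pimap);
	}
	return 1;	/* not a passthrough IRQ: let the host handle it */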

In some rare cases the interrupt cannot be completely handled in
real mode, for instance when a sleeping VCPU needs to be woken up.
In such cases KVM simply switches back to the host with the trap
reason set to 0x500. This works, but it is clearly not very
efficient. A following patch will distinguish this case and handle
it correctly in the host. Note that we can use the existing
check_too_hard() routine, even though we are not in a hypercall, to
determine whether there is unfinished business that needs to be
completed in host virtual mode.
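
The tail of kvmppc_deliver_irq_passthru() in the diff below shows how
this is reused:

	if (check_too_hard(xics, icp) == H_TOO_HARD)
		return 1;	/* go back to the host, trap reason 0x500 */
	else
		return -2;	/* handled entirely in real mode */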

The patch assumes that the mapping between the hardware IRQ and the
virtual IRQ to be injected into the guest already exists for the PCI
passthrough interrupts that need to be handled in real mode. If the
mapping does not exist, KVM falls back to the existing default
behavior.
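
For orientation, the map entries consumed by the real-mode code look
roughly like this (the structure itself comes from an earlier patch
in this series; the layout shown here is only a sketch inferred from
the fields used in this patch):

	struct kvmppc_irq_map {
		u32		r_hwirq;	/* host HW IRQ, matched against the XISR */
		u32		v_hwirq;	/* virtual IRQ (guest GSI) to inject */
		struct irq_desc	*desc;		/* host irq_desc for this HW IRQ */
	};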

The KVM real mode code only reads mappings from the cached array in
the passthrough IRQ map. The caching code fails if there are no more
cache slots available, and the uncaching code is only called when a
mapping is removed. We also carefully order the loads and stores of
the fields in the kvmppc_irq_map data structure using memory
barriers to avoid an inconsistent mapping being seen by the reader.
Thus, although it is possible to miss a cache entry, it is not
possible to read a stale value.
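
The pairing is the usual publish/consume pattern: the writer fills in
the payload fields, issues smp_wmb(), and only then stores r_hwirq;
the real-mode reader matches on r_hwirq, issues smp_rmb(), and only
then trusts the payload. Condensed from the diff below:

	/* writer (virtual mode), caching a mapping */
	pimap->cached[cidx].v_hwirq = pimap->mapped[midx].v_hwirq;
	pimap->cached[cidx].desc    = pimap->mapped[midx].desc;
	smp_wmb();		/* publish the payload before r_hwirq */
	pimap->cached[cidx].r_hwirq = pimap->mapped[midx].r_hwirq;

	/* reader (real mode) */
	if (xisr == pimap->cached[i].r_hwirq) {
		smp_rmb();	/* pairs with the smp_wmb() above */
		/* v_hwirq and desc are now safe to use */
	}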

One additional complication involves hot plugging of SR-IOV
functions. If an SR-IOV function is removed and then re-added to the
same guest through hot plug, it is possible for the HW IRQ to be
assigned a new value for the guest GSI. To ensure that the KVM real
mode handlers do not read a stale value in this case, we call
kick_all_cpus_sync() after unmapping; it does not return until every
vcpu executing in the guest has come back to the host at least once.
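
Condensed from the uncache path in the diff below:

	pimap->cached[i].r_hwirq = 0;
	pimap->cached[i].v_hwirq = 0;
	/* shrink n_cached if this was the last entry, then: */
	kick_all_cpus_sync();	/* every vcpu exits the guest at least once */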

Signed-off-by: Suresh Warrier <warrier at linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/kvm_ppc.h        |  3 ++
 arch/powerpc/include/asm/pnv-pci.h        |  1 +
 arch/powerpc/kvm/book3s_hv.c              | 21 ++++++++++
 arch/powerpc/kvm/book3s_hv_builtin.c      | 64 +++++++++++++++++++++++++++++++
 arch/powerpc/kvm/book3s_hv_rm_xics.c      | 44 +++++++++++++++++++++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   | 16 ++++++++
 arch/powerpc/platforms/powernv/pci-ioda.c | 14 +++++--
 7 files changed, 160 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 4107f7f..c5c7386 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -226,6 +226,9 @@ extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server,
 				u32 *priority);
 extern int kvmppc_xics_int_on(struct kvm *kvm, u32 irq);
 extern int kvmppc_xics_int_off(struct kvm *kvm, u32 irq);
+extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, u32 xirr,
+				 struct kvmppc_irq_map *irq_map,
+				 struct kvmppc_passthru_irqmap *pimap);
 
 void kvmppc_core_dequeue_debug(struct kvm_vcpu *vcpu);
 void kvmppc_core_queue_debug(struct kvm_vcpu *vcpu);
diff --git a/arch/powerpc/include/asm/pnv-pci.h b/arch/powerpc/include/asm/pnv-pci.h
index 6f77f71..f0564ee 100644
--- a/arch/powerpc/include/asm/pnv-pci.h
+++ b/arch/powerpc/include/asm/pnv-pci.h
@@ -20,6 +20,7 @@ int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num);
 void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num);
 int pnv_cxl_get_irq_count(struct pci_dev *dev);
 struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev);
+int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq);
 
 #ifdef CONFIG_CXL_BASE
 int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs,
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 97150f0..8504a5d 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3343,6 +3343,12 @@ static int kvmppc_cache_passthru_irq_hv(struct kvm *kvm, int irq)
 
 	pimap->cached[cidx].v_hwirq = pimap->mapped[midx].v_hwirq;
 	pimap->cached[cidx].desc = pimap->mapped[midx].desc;
+
+	/*
+	 * Order the above two stores before the next to serialize with
+	 * the KVM real mode handler.
+	 */
+	smp_wmb();
 	pimap->cached[cidx].r_hwirq = pimap->mapped[midx].r_hwirq;
 
 	if (cidx >= pimap->n_cached)
@@ -3369,6 +3375,10 @@ static void _uncache_passthru_irq(struct kvmppc_passthru_irqmap *pimap, int irq)
 
 			/*
 			 * Zero out the IRQ being uncached.
+			 * No barriers needed since the IRQ must
+			 * be disabled/unmasked before it is uncached,
+			 * so real mode code cannot possibly be
+			 * searching for this IRQ in the map.
 			 */
 			pimap->cached[i].r_hwirq = 0;
 			pimap->cached[i].v_hwirq = 0;
@@ -3380,6 +3390,17 @@ static void _uncache_passthru_irq(struct kvmppc_passthru_irqmap *pimap, int irq)
 			 */
 			if (i + 1 == pimap->n_cached)
 				pimap->n_cached--;
+			/*
+			 * Ensure that all readers have exited any
+			 * critical sections in real mode KVM and
+			 * come back to the host at least once. This
+			 * guarantees that they cannot see any stale
+			 * values for the HW IRQ being uncached. This
+			 * is required to handle Hot plug re-adding
+			 * this function/device and passing through
+			 * the HW IRQ with a different guest GSI.
+			 */
+			kick_all_cpus_sync();
 			return;
 		}
 	}
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 5db386a..403fb50 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -288,12 +288,52 @@ void kvmhv_commence_exit(int trap)
 struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
 EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv);
 
+static struct kvmppc_irq_map *get_irqmap(struct kvmppc_passthru_irqmap *pimap,
+					 u32 xisr)
+{
+	int i;
+
+	/*
+	 * We can access the cached array unsafely because if there
+	 * is a pending IRQ, its mapping cannot be removed
+	 * and replaced with a new mapping (that corresponds to a
+	 * different device) while we are accessing it. After
+	 * uncaching, we do a kick_all_cpus_sync which guarantees
+	 * that we don't see a stale value in here.
+	 *
+	 * Since we don't take a lock, we might skip over or read
+	 * more than the available entries in here (if a different
+	 * entry here is being deleted), and we might thus miss
+	 * our hwirq, but we can never get a bad mapping. Missing
+	 * an entry is not fatal, in this case, we simply fall back
+	 * on the default interrupt handling mechanism - that is,
+	 * this interrupt goes through VFIO. Currently, we only
+	 * search in the cache for a mapping.
+	 *
+	 * We have also carefully ordered the stores in the writer
+	 * and the loads here in the reader, so that if we find a matching
+	 * hwirq here, the associated GSI and irq_desc fields are valid.
+	 */
+	for (i = 0; i < pimap->n_cached; i++)  {
+		if (xisr == pimap->cached[i].r_hwirq) {
+			/*
+			 * Order subsequent reads in the caller to serialize
+			 * with the writer.
+			 */
+			smp_rmb();
+			return &pimap->cached[i];
+		}
+	}
+	return NULL;
+}
+
 /*
  * Determine what sort of external interrupt is pending (if any).
  * Returns:
  *	0 if no interrupt is pending
  *	1 if an interrupt is pending that needs to be handled by the host
  *	-1 if there was a guest wakeup IPI (which has now been cleared)
+ *	-2 if there is PCI passthrough external interrupt that was handled
  */
 
 long kvmppc_read_intr(struct kvm_vcpu *vcpu, int path)
@@ -302,6 +342,9 @@ long kvmppc_read_intr(struct kvm_vcpu *vcpu, int path)
 	u32 h_xirr;
 	__be32 xirr;
 	u32 xisr;
+	struct kvmppc_passthru_irqmap *pimap;
+	struct kvmppc_irq_map *irq_map;
+	int r;
 	u8 host_ipi;
 
 	/* see if a host IPI is pending */
@@ -368,5 +411,26 @@ long kvmppc_read_intr(struct kvm_vcpu *vcpu, int path)
 		return -1;
 	}
 
+	/*
+	 * If it's not an IPI, check if we have a passthrough adapter and
+	 * if so, check if this external interrupt is for the adapter.
+	 * We will attempt to deliver the IRQ directly to the target VCPU's
+	 * ICP, the virtual ICP (based on affinity - the xive value in ICS).
+	 *
+	 * If the delivery fails or if this is not for a passthrough adapter,
+	 * return to the host to handle this interrupt. We earlier
+	 * saved a copy of the XIRR in the PACA, it will be picked up by
+	 * the host ICP driver
+	 */
+	pimap = kvmppc_get_passthru_irqmap(vcpu);
+	if (pimap) {
+		irq_map = get_irqmap(pimap, xisr);
+		if (irq_map) {
+			r = kvmppc_deliver_irq_passthru(vcpu, xirr,
+								irq_map, pimap);
+			return r;
+		}
+	}
+
 	return 1;
 }
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 980d8a6..1089bfa 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -19,6 +19,7 @@
 #include <asm/synch.h>
 #include <asm/cputhreads.h>
 #include <asm/ppc-opcode.h>
+#include <asm/pnv-pci.h>
 
 #include "book3s_xics.h"
 
@@ -712,6 +713,49 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 	return check_too_hard(xics, icp);
 }
 
+unsigned long eoi_rc;
+
+static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr)
+{
+	unsigned long xics_phys;
+	int64_t rc;
+
+	rc = pnv_opal_pci_msi_eoi(c, hwirq);
+
+	if (rc)
+		eoi_rc = rc;
+
+	iosync();
+
+	/* EOI it */
+	xics_phys = local_paca->kvm_hstate.xics_phys;
+	_stwcix(xics_phys + XICS_XIRR, xirr);
+}
+
+long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
+				 u32 xirr,
+				 struct kvmppc_irq_map *irq_map,
+				 struct kvmppc_passthru_irqmap *pimap)
+{
+	struct kvmppc_xics *xics;
+	struct kvmppc_icp *icp;
+	u32 irq;
+
+	irq = irq_map->v_hwirq;
+	xics = vcpu->kvm->arch.xics;
+	icp = vcpu->arch.icp;
+
+	icp_rm_deliver_irq(xics, icp, irq);
+
+	/* EOI the interrupt */
+	icp_eoi(pimap->irq_chip, irq_map->r_hwirq, xirr);
+
+	if (check_too_hard(xics, icp) == H_TOO_HARD)
+		return 1;
+	else
+		return -2;
+}
+
 /*  --- Non-real mode XICS-related built-in routines ---  */
 
 /**
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 29e6a8a..5e12cb4e 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1287,6 +1287,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	 * -1 A guest wakeup IPI (which has now been cleared)
 	 *    In either case, we return to guest to deliver any pending
 	 *    guest interrupts.
+	 *
+	 * -2 A PCI passthrough external interrupt was handled
+	 *    (interrupt was delivered directly to guest)
+	 *    Return to guest to deliver any pending guest interrupts.
+	 *    However, we also need to restore SRR0 and SRR1 since we have
+	 *    made an OPAL call to EOI the interrupt which scratches them.
 	 */
 
 	cmpdi	r3, 0
@@ -1300,10 +1306,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	bge	guest_exit_cont
 
 	/* Return code <= 0, return to guest */
+	cmpdi	r3, -2
 	ld	r6, VCPU_CTR(r4)
 	ld	r7, VCPU_XER(r4)
 	mtctr	r6
 	mtxer	r7
+	bne	deliver_guest_interrupt
+
+	/* Return code = -2 */
+	ld	r6, VCPU_SRR0(r4)
+	ld	r7, VCPU_SRR1(r4)
+	mtsrr0	r6
+	mtsrr1	r7
 	b	deliver_guest_interrupt
 
 guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
@@ -2500,6 +2514,8 @@ machine_check_realmode:
  *	0 if nothing needs to be done
  *	1 if something happened that needs to be handled by the host
  *	-1 if there was a guest wakeup (IPI or msgsnd)
+ *	-2 if we handled a PCI passthrough interrupt (returned by
+ *		kvmppc_read_intr only)
  *
  * Also sets r31 to the interrupt vector for any interrupt that needs
  * to be handled now by the host (0x500 for external interrupt), or zero.
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index f90dc04..2fd8b01 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2565,15 +2565,23 @@ static void pnv_ioda_setup_dma(struct pnv_phb *phb)
 }
 
 #ifdef CONFIG_PCI_MSI
-static void pnv_ioda2_msi_eoi(struct irq_data *d)
+int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
 {
-	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
-	struct irq_chip *chip = irq_data_get_irq_chip(d);
 	struct pnv_phb *phb = container_of(chip, struct pnv_phb,
 					   ioda.irq_chip);
 	int64_t rc;
 
 	rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
+	return rc;
+}
+
+static void pnv_ioda2_msi_eoi(struct irq_data *d)
+{
+	int64_t rc;
+	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+	struct irq_chip *chip = irq_data_get_irq_chip(d);
+
+	rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
 	WARN_ON_ONCE(rc);
 
 	icp_native_eoi(d);
-- 
1.8.3.4


