[PATCH 12/12] powerpc/kvm: Native usage of the XIVE interrupt controller
Paul Mackerras
paulus at ozlabs.org
Tue Mar 28 16:26:33 AEDT 2017
On Mon, Mar 20, 2017 at 05:49:14PM +1100, Benjamin Herrenschmidt wrote:
> This patch makes KVM capable of using the XIVE interrupt controller
> to provide the standard PAPR "XICS" style hypercalls. It is necessary
> for proper operations when the host uses XIVE natively.
>
> This has been lightly tested on an actual system, including PCI
> pass-through with a TG3 device.
>
> Signed-off-by: Benjamin Herrenschmidt <benh at kernel.crashing.org>
Looks good overall, some comments below...
> ---
> arch/powerpc/include/asm/kvm_book3s_asm.h | 2 +
> arch/powerpc/include/asm/kvm_host.h | 28 +-
> arch/powerpc/include/asm/kvm_ppc.h | 38 +
> arch/powerpc/include/asm/xive.h | 11 +-
> arch/powerpc/kernel/asm-offsets.c | 10 +
> arch/powerpc/kvm/Makefile | 4 +-
> arch/powerpc/kvm/book3s.c | 73 +-
> arch/powerpc/kvm/book3s_hv.c | 52 +-
> arch/powerpc/kvm/book3s_hv_builtin.c | 108 ++
> arch/powerpc/kvm/book3s_hv_rm_xics.c | 10 +-
> arch/powerpc/kvm/book3s_hv_rm_xive.c | 47 +
> arch/powerpc/kvm/book3s_hv_rmhandlers.S | 60 +-
> arch/powerpc/kvm/book3s_rtas.c | 21 +-
> arch/powerpc/kvm/book3s_xics.c | 35 +-
> arch/powerpc/kvm/book3s_xics.h | 5 +
> arch/powerpc/kvm/book3s_xive.c | 1898 +++++++++++++++++++++++++++++
> arch/powerpc/kvm/book3s_xive.h | 251 ++++
> arch/powerpc/kvm/book3s_xive_template.c | 490 ++++++++
> arch/powerpc/kvm/irq.h | 1 +
> arch/powerpc/kvm/powerpc.c | 17 +-
> arch/powerpc/platforms/powernv/opal.c | 1 +
> arch/powerpc/sysdev/xive/common.c | 131 +-
> arch/powerpc/sysdev/xive/native.c | 92 +-
> include/linux/kvm_host.h | 1 -
> virt/kvm/kvm_main.c | 4 -
> 25 files changed, 3305 insertions(+), 85 deletions(-)
> create mode 100644 arch/powerpc/kvm/book3s_hv_rm_xive.c
> create mode 100644 arch/powerpc/kvm/book3s_xive.c
> create mode 100644 arch/powerpc/kvm/book3s_xive.h
> create mode 100644 arch/powerpc/kvm/book3s_xive_template.c
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
> index 0593d94..e719002 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
> @@ -111,6 +111,8 @@ struct kvmppc_host_state {
> struct kvm_vcpu *kvm_vcpu;
> struct kvmppc_vcore *kvm_vcore;
> void __iomem *xics_phys;
> + void __iomem *xive_tm_area_phys;
> + void __iomem *xive_tm_area_virt;
Does this cause the paca to become a cacheline larger? (Not that
there is much alternative to having these fields.)
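(For what it's worth, running something like "pahole -C paca_struct vmlinux"
before and after the patch would show whether the two extra pointers push
kvm_hstate across another cacheline boundary in the paca.)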
> u32 saved_xirr;
> u64 dabr;
> u64 host_mmcr[7]; /* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER */
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index 7bba8f4..fc491ac 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -205,6 +205,12 @@ struct kvmppc_spapr_tce_table {
> /* XICS components, defined in book3s_xics.c */
> struct kvmppc_xics;
> struct kvmppc_icp;
> +extern struct kvm_device_ops kvm_xics_ops;
> +
> +/* XIVE components, defined in book3s_xive.c */
> +struct kvmppc_xive;
> +struct kvmppc_xive_vcpu;
> +extern struct kvm_device_ops kvm_xive_ops;
>
> struct kvmppc_passthru_irqmap;
>
> @@ -293,6 +299,7 @@ struct kvm_arch {
> #endif
> #ifdef CONFIG_KVM_XICS
> struct kvmppc_xics *xics;
> + struct kvmppc_xive *xive;
> struct kvmppc_passthru_irqmap *pimap;
> #endif
> struct kvmppc_ops *kvm_ops;
> @@ -421,7 +428,7 @@ struct kvmppc_passthru_irqmap {
>
> #define KVMPPC_IRQ_DEFAULT 0
> #define KVMPPC_IRQ_MPIC 1
> -#define KVMPPC_IRQ_XICS 2
> +#define KVMPPC_IRQ_XICS 2 /* Includes a XIVE option */
>
> #define MMIO_HPTE_CACHE_SIZE 4
>
> @@ -443,6 +450,21 @@ struct mmio_hpte_cache {
>
> struct openpic;
>
> +/* QW0 and QW1 of a context */
> +union xive_qw01 {
> + struct {
> + u8 nsr;
> + u8 cppr;
> + u8 ipb;
> + u8 lsmfb;
> + u8 ack;
> + u8 inc;
> + u8 age;
> + u8 pipr;
> + };
> + __be64 qw;
> +};
This is slightly confusing because a "QW" (quadword) would normally be
128 bits, but this union is 64 bits.
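Maybe rename it so it's clear this only covers the first two words (W0/W1)
of the OS context rather than a full quadword, e.g. something along these
lines (the name is only a suggestion):

	/* First two words (W0/W1) of the thread context, not a full QW */
	union xive_tma_w01 {
		struct {
			u8	nsr;
			u8	cppr;
			u8	ipb;
			u8	lsmfb;
			u8	ack;
			u8	inc;
			u8	age;
			u8	pipr;
		};
		__be64	w01;
	};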
> +
> struct kvm_vcpu_arch {
> ulong host_stack;
> u32 host_pid;
> @@ -688,6 +710,10 @@ struct kvm_vcpu_arch {
> struct openpic *mpic; /* KVM_IRQ_MPIC */
> #ifdef CONFIG_KVM_XICS
> struct kvmppc_icp *icp; /* XICS presentation controller */
> + struct kvmppc_xive_vcpu *xive_vcpu; /* XIVE virtual CPU data */
> + __be32 xive_cam_word; /* Cooked W2 in proper endian with valid bit */
> + u32 xive_pushed; /* Is the VP pushed on the physical CPU ? */
> + union xive_qw01 xive_saved_state; /* W0..1 of XIVE state */
> #endif
>
> #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index c387799..2fcf6cf 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -225,6 +225,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq);
> extern int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp);
> extern int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu);
> extern void kvmppc_rtas_tokens_free(struct kvm *kvm);
> +
> extern int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server,
> u32 priority);
> extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server,
> @@ -232,6 +233,15 @@ extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server,
> extern int kvmppc_xics_int_on(struct kvm *kvm, u32 irq);
> extern int kvmppc_xics_int_off(struct kvm *kvm, u32 irq);
>
> +extern int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
> + u32 priority);
> +extern int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server,
> + u32 *priority);
Might be worth a comment here to explain that the first xive is
eXternal Interrupt Virtualization Engine and the second xive is
eXternal Interrupt Vector Entry.
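Something along these lines, for instance:

	/*
	 * Note on naming: in kvmppc_xive_* the "xive" is the interrupt
	 * controller (eXternal Interrupt Virtualization Engine), while
	 * the set_xive/get_xive part of the name keeps the legacy
	 * RTAS/XICS meaning of "xive" (eXternal Interrupt Vector Entry,
	 * i.e. the server/priority pair of an interrupt source).
	 */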
> +extern int kvmppc_xive_int_on(struct kvm *kvm, u32 irq);
> +extern int kvmppc_xive_int_off(struct kvm *kvm, u32 irq);
> +extern void kvmppc_xive_init_module(void);
> +extern void kvmppc_xive_exit_module(void);
> +
> void kvmppc_core_dequeue_debug(struct kvm_vcpu *vcpu);
> void kvmppc_core_queue_debug(struct kvm_vcpu *vcpu);
>
> @@ -412,6 +422,14 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
> paca[cpu].kvm_hstate.xics_phys = (void __iomem *)addr;
> }
>
> +static inline void kvmppc_set_xive_tm_area(int cpu,
> + unsigned long phys_addr,
> + void __iomem *virt_addr)
> +{
> + paca[cpu].kvm_hstate.xive_tm_area_phys = (void __iomem *)phys_addr;
> + paca[cpu].kvm_hstate.xive_tm_area_virt = virt_addr;
> +}
> +
> static inline u32 kvmppc_get_xics_latch(void)
> {
> u32 xirr;
> @@ -442,6 +460,9 @@ static inline void __init kvm_cma_reserve(void)
> static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
> {}
>
> +static inline void kvmppc_set_xive_tm_area_phys(int cpu, unsigned long addr)
> +{}
Shouldn't this be kvmppc_set_xive_tm_area to match the other definition?
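i.e. presumably something like this, taking the same arguments as the
non-stub version above:

	static inline void kvmppc_set_xive_tm_area(int cpu,
						   unsigned long phys_addr,
						   void __iomem *virt_addr)
	{}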
> +
> static inline u32 kvmppc_get_xics_latch(void)
> {
> return 0;
> @@ -492,6 +513,21 @@ extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, __be32 xirr,
> struct kvmppc_irq_map *irq_map,
> struct kvmppc_passthru_irqmap *pimap,
> bool *again);
> +extern int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
> + struct kvm_vcpu *vcpu, u32 cpu);
> +extern void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu);
> +extern int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
> + struct irq_desc *host_desc);
> +extern int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
> + struct irq_desc *host_desc);
> +extern u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu);
> +extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
> +
> +extern int kvmppc_xics_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
> + int level, bool line_status);
> +extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
> + int level, bool line_status);
> +
> extern int h_ipi_redirect;
> #else
> static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
> @@ -546,6 +582,8 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
> long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
> unsigned long slb_v, unsigned int status, bool data);
> unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu);
> +unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu);
> +unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server);
> int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
> unsigned long mfrr);
> int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
> diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
> index b1604b73..94b5cca 100644
> --- a/arch/powerpc/include/asm/xive.h
> +++ b/arch/powerpc/include/asm/xive.h
> @@ -55,7 +55,8 @@ struct xive_q {
> #define XIVE_ESB_SET_PQ_01 0xd00
> #define XIVE_ESB_SET_PQ_10 0xe00
> #define XIVE_ESB_SET_PQ_11 0xf00
> -#define XIVE_ESB_MASK XIVE_ESB_SET_PQ_01
> +#define XIVE_ESB_SOFT_MASK XIVE_ESB_SET_PQ_10
> +#define XIVE_ESB_HARD_MASK XIVE_ESB_SET_PQ_01
What's the difference between a "soft" mask and a "hard" mask?
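(My reading of the mask/unmask logic later in the patch is that the "soft"
mask (PQ=10) keeps latching new triggers in the Q bit so they can be
replayed on unmask, while the "hard" mask (PQ=01) turns the source off and
discards them - if that's right, a comment here to that effect would help.)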
>
> extern bool __xive_enabled;
>
> @@ -88,11 +89,11 @@ extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
> __be32 *qpage, u32 order, bool can_escalate);
> extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio);
>
> -extern bool __xive_irq_trigger(struct xive_irq_data *xd);
> -extern bool __xive_irq_retrigger(struct xive_irq_data *xd);
> -extern void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd);
> -
> +extern void xive_native_sync_source(u32 hw_irq);
> extern bool is_xive_irq(struct irq_chip *chip);
> +extern int xive_native_enable_vp(u32 vp_id);
> +extern int xive_native_disable_vp(u32 vp_id);
> +extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id);
>
> #else
>
> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
> index 4367e7d..59fa705 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -630,6 +630,8 @@ int main(void)
> HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
> HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore);
> HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
> + HSTATE_FIELD(HSTATE_XIVE_TM_AREA_PHYS, xive_tm_area_phys);
> + HSTATE_FIELD(HSTATE_XIVE_TM_AREA_VIRT, xive_tm_area_virt);
> HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
> HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
> HSTATE_FIELD(HSTATE_PTID, ptid);
> @@ -715,6 +717,14 @@ int main(void)
> OFFSET(VCPU_HOST_MAS6, kvm_vcpu, arch.host_mas6);
> #endif
>
> +#ifdef CONFIG_KVM_XICS
> + DEFINE(VCPU_XIVE_SAVED_STATE, offsetof(struct kvm_vcpu,
> + arch.xive_saved_state));
> + DEFINE(VCPU_XIVE_CAM_WORD, offsetof(struct kvm_vcpu,
> + arch.xive_cam_word));
> + DEFINE(VCPU_XIVE_PUSHED, offsetof(struct kvm_vcpu, arch.xive_pushed));
> +#endif
> +
> #ifdef CONFIG_KVM_EXIT_TIMING
> OFFSET(VCPU_TIMING_EXIT_TBU, kvm_vcpu, arch.timing_exit.tv32.tbu);
> OFFSET(VCPU_TIMING_EXIT_TBL, kvm_vcpu, arch.timing_exit.tv32.tbl);
> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> index b87ccde..ef89c8c 100644
> --- a/arch/powerpc/kvm/Makefile
> +++ b/arch/powerpc/kvm/Makefile
> @@ -74,7 +74,7 @@ kvm-hv-y += \
> book3s_64_mmu_radix.o
>
> kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
> - book3s_hv_rm_xics.o
> + book3s_hv_rm_xics.o book3s_hv_rm_xive.o
>
> ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
> @@ -87,7 +87,7 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
> endif
>
> kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
> - book3s_xics.o
> + book3s_xics.o book3s_xive.o
>
> kvm-book3s_64-module-objs := \
> $(common-objs-y) \
> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
> index aedacef..e459ec4 100644
> --- a/arch/powerpc/kvm/book3s.c
> +++ b/arch/powerpc/kvm/book3s.c
> @@ -35,6 +35,7 @@
> #include <asm/kvm_book3s.h>
> #include <asm/mmu_context.h>
> #include <asm/page.h>
> +#include <asm/xive.h>
>
> #include "book3s.h"
> #include "trace.h"
> @@ -578,11 +579,14 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
> break;
> #ifdef CONFIG_KVM_XICS
> case KVM_REG_PPC_ICP_STATE:
> - if (!vcpu->arch.icp) {
> + if (!vcpu->arch.icp && !vcpu->arch.xive_vcpu) {
> r = -ENXIO;
> break;
> }
> - *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu));
> + if (xive_enabled())
> + *val = get_reg_val(id, kvmppc_xive_get_icp(vcpu));
> + else
> + *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu));
> break;
> #endif /* CONFIG_KVM_XICS */
> case KVM_REG_PPC_FSCR:
> @@ -648,12 +652,14 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
> #endif /* CONFIG_VSX */
> #ifdef CONFIG_KVM_XICS
> case KVM_REG_PPC_ICP_STATE:
> - if (!vcpu->arch.icp) {
> + if (!vcpu->arch.icp && !vcpu->arch.xive_vcpu) {
> r = -ENXIO;
> break;
> }
> - r = kvmppc_xics_set_icp(vcpu,
> - set_reg_val(id, *val));
> + if (xive_enabled())
> + r = kvmppc_xive_set_icp(vcpu, set_reg_val(id, *val));
> + else
> + r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val));
> break;
> #endif /* CONFIG_KVM_XICS */
> case KVM_REG_PPC_FSCR:
> @@ -924,6 +930,50 @@ int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hcall)
> return kvm->arch.kvm_ops->hcall_implemented(hcall);
> }
>
> +#ifdef CONFIG_KVM_XICS
> +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
> + bool line_status)
> +{
> + if (xive_enabled())
> + return kvmppc_xive_set_irq(kvm, irq_source_id, irq, level,
> + line_status);
> + else
> + return kvmppc_xics_set_irq(kvm, irq_source_id, irq, level,
> + line_status);
> +}
> +
> +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *irq_entry,
> + struct kvm *kvm, int irq_source_id,
> + int level, bool line_status)
> +{
> + return kvm_set_irq(kvm, irq_source_id, irq_entry->gsi,
> + level, line_status);
> +}
> +static int kvmppc_book3s_set_irq(struct kvm_kernel_irq_routing_entry *e,
> + struct kvm *kvm, int irq_source_id, int level,
> + bool line_status)
> +{
> + return kvm_set_irq(kvm, irq_source_id, e->gsi, level, line_status);
> +}
> +
> +int kvm_irq_map_gsi(struct kvm *kvm,
> + struct kvm_kernel_irq_routing_entry *entries, int gsi)
> +{
> + entries->gsi = gsi;
> + entries->type = KVM_IRQ_ROUTING_IRQCHIP;
> + entries->set = kvmppc_book3s_set_irq;
> + entries->irqchip.irqchip = 0;
> + entries->irqchip.pin = gsi;
> + return 1;
> +}
> +
> +int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
> +{
> + return pin;
> +}
> +
> +#endif /* CONFIG_KVM_XICS */
> +
> static int kvmppc_book3s_init(void)
> {
> int r;
> @@ -934,12 +984,23 @@ static int kvmppc_book3s_init(void)
> #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
> r = kvmppc_book3s_init_pr();
> #endif
> - return r;
>
> +#ifdef CONFIG_KVM_XICS
> + if (xive_enabled()) {
> + kvmppc_xive_init_module();
> + kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS);
> + } else
> + kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS);
> +#endif
> + return r;
> }
>
> static void kvmppc_book3s_exit(void)
> {
> +#ifdef CONFIG_KVM_XICS
> + if (xive_enabled())
> + kvmppc_xive_exit_module();
> +#endif
> #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
> kvmppc_book3s_exit_pr();
> #endif
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index fadb75a..5c340c2 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -67,6 +67,7 @@
> #include <asm/mmu.h>
> #include <asm/opal.h>
> #include <asm/xics.h>
> +#include <asm/xive.h>
>
> #include "book3s.h"
>
> @@ -837,6 +838,10 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
> case H_IPOLL:
> case H_XIRR_X:
> if (kvmppc_xics_enabled(vcpu)) {
> + if (xive_enabled()) {
> + ret = H_NOT_AVAILABLE;
> + return RESUME_GUEST;
> + }
> ret = kvmppc_xics_hcall(vcpu, req);
> break;
> }
> @@ -2947,8 +2952,12 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
> r = kvmppc_book3s_hv_page_fault(run, vcpu,
> vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
> srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
> - } else if (r == RESUME_PASSTHROUGH)
> - r = kvmppc_xics_rm_complete(vcpu, 0);
> + } else if (r == RESUME_PASSTHROUGH) {
> + if (WARN_ON(xive_enabled()))
> + r = H_SUCCESS;
> + else
> + r = kvmppc_xics_rm_complete(vcpu, 0);
> + }
> } while (is_kvmppc_resume_guest(r));
>
> out:
> @@ -3400,10 +3409,19 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
> /*
> * On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed)
> * Set HVICE bit to enable hypervisor virtualization interrupts.
> + * Set HEIC to prevent OS interrupts to go to hypervisor (should
> + * be unnecessary but better safe than sorry in case we re-enable
> + * EE in HV mode with this LPCR still set)
> */
> if (cpu_has_feature(CPU_FTR_ARCH_300)) {
> lpcr &= ~LPCR_VPM0;
> - lpcr |= LPCR_HVICE;
> + lpcr |= LPCR_HVICE | LPCR_HEIC;
> +
> + /* If xive is enabled, we route 0x500 interrupts directly
> + * to the guest
> + */
> + if (xive_enabled())
> + lpcr |= LPCR_LPES;
> }
>
> /*
> @@ -3533,7 +3551,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
> struct kvmppc_irq_map *irq_map;
> struct kvmppc_passthru_irqmap *pimap;
> struct irq_chip *chip;
> - int i;
> + int i, rc = 0;
>
> if (!kvm_irq_bypass)
> return 1;
> @@ -3558,10 +3576,10 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
> /*
> * For now, we only support interrupts for which the EOI operation
> * is an OPAL call followed by a write to XIRR, since that's
> - * what our real-mode EOI code does.
> + * what our real-mode EOI code does, or a XIVE interrupt
> */
> chip = irq_data_get_irq_chip(&desc->irq_data);
> - if (!chip || !is_pnv_opal_msi(chip)) {
> + if (!chip || !(is_pnv_opal_msi(chip) || is_xive_irq(chip))) {
> pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
> host_irq, guest_gsi);
> mutex_unlock(&kvm->lock);
> @@ -3603,7 +3621,14 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
> if (i == pimap->n_mapped)
> pimap->n_mapped++;
>
> - kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
> + if (xive_enabled())
> + rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc);
> + else
> + kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
> + printk("set mapped for IRQ %d -> %d returned %d\n",
> + host_irq, guest_gsi, rc);
This seems like a debugging thing that should be removed or turned
into a DBG().
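If it's worth keeping at all, something like

	pr_devel("%s: mapped host IRQ %d to guest GSI %d (rc=%d)\n",
		 __func__, host_irq, guest_gsi, rc);

(format string just a suggestion) would be a lot less noisy.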
> + if (rc)
> + irq_map->r_hwirq = 0;
>
> mutex_unlock(&kvm->lock);
>
> @@ -3614,7 +3639,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
> {
> struct irq_desc *desc;
> struct kvmppc_passthru_irqmap *pimap;
> - int i;
> + int i, rc = 0;
>
> if (!kvm_irq_bypass)
> return 0;
> @@ -3641,9 +3666,12 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
> return -ENODEV;
> }
>
> - kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
> + if (xive_enabled())
> + rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc);
> + else
> + kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
>
> - /* invalidate the entry */
> + /* invalidate the entry (what do do on error from the above ?) */
> pimap->mapped[i].r_hwirq = 0;
>
> /*
> @@ -3652,7 +3680,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
> */
>
> mutex_unlock(&kvm->lock);
> - return 0;
> + return rc;
> }
>
> static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons,
> @@ -3930,7 +3958,7 @@ static int kvmppc_book3s_init_hv(void)
> * indirectly, via OPAL.
> */
> #ifdef CONFIG_SMP
> - if (!get_paca()->kvm_hstate.xics_phys) {
> + if (!xive_enabled() && !get_paca()->kvm_hstate.xics_phys) {
> struct device_node *np;
>
> np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
> diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
> index d48f9b6..8de7ed4 100644
> --- a/arch/powerpc/kvm/book3s_hv_builtin.c
> +++ b/arch/powerpc/kvm/book3s_hv_builtin.c
> @@ -23,6 +23,7 @@
> #include <asm/kvm_book3s.h>
> #include <asm/archrandom.h>
> #include <asm/xics.h>
> +#include <asm/xive.h>
> #include <asm/dbell.h>
> #include <asm/cputhreads.h>
> #include <asm/io.h>
> @@ -31,6 +32,24 @@
>
> #define KVM_CMA_CHUNK_ORDER 18
>
> +#include "book3s_xics.h"
> +#include "book3s_xive.h"
> +
> +/*
> + * The XIVE module will populate these when it loads
> + */
> +unsigned long (*__xive_vm_h_xirr)(struct kvm_vcpu *vcpu);
> +unsigned long (*__xive_vm_h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server);
> +int (*__xive_vm_h_ipi)(struct kvm_vcpu *vcpu, unsigned long server,
> + unsigned long mfrr);
> +int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr);
> +int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr);
> +EXPORT_SYMBOL_GPL(__xive_vm_h_xirr);
> +EXPORT_SYMBOL_GPL(__xive_vm_h_ipoll);
> +EXPORT_SYMBOL_GPL(__xive_vm_h_ipi);
> +EXPORT_SYMBOL_GPL(__xive_vm_h_cppr);
> +EXPORT_SYMBOL_GPL(__xive_vm_h_eoi);
> +
> /*
> * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206)
> * should be power of 2.
> @@ -209,6 +228,7 @@ void kvmhv_rm_send_ipi(int cpu)
> __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
> return;
> }
> +
> /* On POWER8 for IPIs to threads in the same core, use msgsnd. */
> if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
> cpu_first_thread_sibling(cpu) ==
> @@ -218,6 +238,10 @@ void kvmhv_rm_send_ipi(int cpu)
> return;
> }
>
> + /* We should never reach this */
> + if (WARN_ON_ONCE(xive_enabled()))
> + return;
> +
> /* Else poke the target with an IPI */
> xics_phys = paca[cpu].kvm_hstate.xics_phys;
> if (xics_phys)
> @@ -398,6 +422,9 @@ static long kvmppc_read_one_intr(bool *again)
> u8 host_ipi;
> int64_t rc;
>
> + if (xive_enabled())
> + return 1;
Why not do this in kvmppc_read_intr() rather than here?
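i.e. something like this (a sketch from memory of what kvmppc_read_intr()
currently looks like):

	long kvmppc_read_intr(void)
	{
		long ret = 0;
		long rc;
		bool again;

		/* With XIVE, always let the host handle the interrupt */
		if (xive_enabled())
			return 1;

		do {
			again = false;
			rc = kvmppc_read_one_intr(&again);
			if (rc && (ret == 0 || rc > ret))
				ret = rc;
		} while (again);

		return ret;
	}

so the check isn't redone on every iteration of the loop.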
> +
> /* see if a host IPI is pending */
> host_ipi = local_paca->kvm_hstate.host_ipi;
> if (host_ipi)
> @@ -482,3 +509,84 @@ static long kvmppc_read_one_intr(bool *again)
>
> return kvmppc_check_passthru(xisr, xirr, again);
> }
> +
> +static inline bool is_rm(void)
> +{
> + return !(mfmsr() & MSR_DR);
> +}
> +
> +/* XXX FIXME: The xive_vm_* calls are in a module... */
> +
> +unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
> +{
> + if (xive_enabled()) {
> + if (is_rm())
> + return xive_rm_h_xirr(vcpu);
> + if (unlikely(!__xive_vm_h_xirr))
> + return H_NOT_AVAILABLE;
> + return __xive_vm_h_xirr(vcpu);
> + } else
> + return xics_rm_h_xirr(vcpu);
> +}
> +
> +unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu)
> +{
> + vcpu->arch.gpr[5] = get_tb();
> + if (xive_enabled()) {
> + if (is_rm())
> + return xive_rm_h_xirr(vcpu);
> + if (unlikely(!__xive_vm_h_xirr))
> + return H_NOT_AVAILABLE;
> + return __xive_vm_h_xirr(vcpu);
> + } else
> + return xics_rm_h_xirr(vcpu);
> +}
> +
> +unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
> +{
> + if (xive_enabled()) {
> + if (is_rm())
> + return xive_rm_h_ipoll(vcpu, server);
> + if (unlikely(!__xive_vm_h_ipoll))
> + return H_NOT_AVAILABLE;
> + return __xive_vm_h_ipoll(vcpu, server);
> + } else
> + return H_TOO_HARD;
> +}
> +
> +int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
> + unsigned long mfrr)
> +{
> + if (xive_enabled()) {
> + if (is_rm())
> + return xive_rm_h_ipi(vcpu, server, mfrr);
> + if (unlikely(!__xive_vm_h_ipi))
> + return H_NOT_AVAILABLE;
> + return __xive_vm_h_ipi(vcpu, server, mfrr);
> + } else
> + return xics_rm_h_ipi(vcpu, server, mfrr);
> +}
> +
> +int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
> +{
> + if (xive_enabled()) {
> + if (is_rm())
> + return xive_rm_h_cppr(vcpu, cppr);
> + if (unlikely(!__xive_vm_h_cppr))
> + return H_NOT_AVAILABLE;
> + return __xive_vm_h_cppr(vcpu, cppr);
> + } else
> + return xics_rm_h_cppr(vcpu, cppr);
> +}
> +
> +int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
> +{
> + if (xive_enabled()) {
> + if (is_rm())
> + return xive_rm_h_eoi(vcpu, xirr);
> + if (unlikely(!__xive_vm_h_eoi))
> + return H_NOT_AVAILABLE;
> + return __xive_vm_h_eoi(vcpu, xirr);
> + } else
> + return xics_rm_h_eoi(vcpu, xirr);
> +}
> diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
> index 3a1a463..f806880 100644
> --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
> +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
> @@ -485,7 +485,7 @@ static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
> }
>
>
> -unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
> +unsigned long xics_rm_h_xirr(struct kvm_vcpu *vcpu)
> {
> union kvmppc_icp_state old_state, new_state;
> struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
> @@ -523,8 +523,8 @@ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
> return check_too_hard(xics, icp);
> }
>
> -int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
> - unsigned long mfrr)
> +int xics_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
> + unsigned long mfrr)
> {
> union kvmppc_icp_state old_state, new_state;
> struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
> @@ -610,7 +610,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
> return check_too_hard(xics, this_icp);
> }
>
> -int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
> +int xics_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
> {
> union kvmppc_icp_state old_state, new_state;
> struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
> @@ -730,7 +730,7 @@ static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq)
> return check_too_hard(xics, icp);
> }
>
> -int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
> +int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
> {
> struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
> struct kvmppc_icp *icp = vcpu->arch.icp;
> diff --git a/arch/powerpc/kvm/book3s_hv_rm_xive.c b/arch/powerpc/kvm/book3s_hv_rm_xive.c
> new file mode 100644
> index 0000000..6390f71
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_hv_rm_xive.c
> @@ -0,0 +1,47 @@
> +#include <linux/kernel.h>
> +#include <linux/kvm_host.h>
> +#include <linux/err.h>
> +#include <linux/kernel_stat.h>
> +
> +#include <asm/kvm_book3s.h>
> +#include <asm/kvm_ppc.h>
> +#include <asm/hvcall.h>
> +#include <asm/xics.h>
> +#include <asm/debug.h>
> +#include <asm/synch.h>
> +#include <asm/cputhreads.h>
> +#include <asm/pgtable.h>
> +#include <asm/ppc-opcode.h>
> +#include <asm/pnv-pci.h>
> +#include <asm/opal.h>
> +#include <asm/smp.h>
> +#include <asm/asm-prototypes.h>
> +#include <asm/xive.h>
> +
> +#include "book3s_xive.h"
> +#include "../sysdev/xive/xive-regs.h"
> +
> +/* XXX */
> +#include <asm/udbg.h>
> +//#define DBG(fmt...) udbg_printf(fmt)
> +#define DBG(fmt...) do { } while(0)
> +
> +static inline void __iomem *get_tm_area_phys(void)
> +{
> + return local_paca->kvm_hstate.xive_tm_area_phys;
> +}
> +
> +#undef XIVE_RUNTIME_CHECKS
> +#define X_PFX xive_rm_
> +#define X_STATIC
> +#define X_STAT_PFX stat_rm_
> +#define __x_tm_area get_tm_area_phys()
> +#define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_page))
> +#define __x_trig_page(xd) ((void __iomem *)((xd)->trig_page))
> +#define __x_readb __raw_rm_readb
> +#define __x_writeb __raw_rm_writeb
> +#define __x_readw __raw_rm_readw
> +#define __x_readq __raw_rm_readq
> +#define __x_writeq __raw_rm_writeq
> +
> +#include "book3s_xive_template.c"
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index 720b9c0..c06cccd 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -31,6 +31,8 @@
> #include <asm/tm.h>
> #include <asm/opal.h>
>
> +#include "../sysdev/xive/xive-regs.h"
> +
> #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
>
> /* Values in HSTATE_NAPPING(r13) */
> @@ -982,6 +984,23 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
> cmpwi r3, 512 /* 1 microsecond */
> blt hdec_soon
>
> +#ifdef CONFIG_KVM_XICS
> + /* We are entering the guest on that thread, push VCPU to XIVE */
> + ld r10, HSTATE_XIVE_TM_AREA_PHYS(r13)
> + cmpldi cr0, r10, r0
> + beq no_xive
> + ld r11, VCPU_XIVE_SAVED_STATE(r4)
> + li r9, TM_QW1_OS
> + stdcix r11,r9,r10
> + eieio
> + lwz r11, VCPU_XIVE_CAM_WORD(r4)
> + li r9, TM_QW1_OS + TM_WORD2
> + stwcix r11,r9,r10
> + li r9, 1
> + stw r9, VCPU_XIVE_PUSHED(r4)
> +no_xive:
> +#endif /* CONFIG_KVM_XICS */
> +
> deliver_guest_interrupt:
> ld r6, VCPU_CTR(r4)
> ld r7, VCPU_XER(r4)
> @@ -1319,6 +1338,38 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
> blt deliver_guest_interrupt
>
> guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
> +#ifdef CONFIG_KVM_XICS
> + /* We are exiting, pull the VP from the XIVE */
> + lwz r0, VCPU_XIVE_PUSHED(r9)
> + cmpwi cr0, r0, 0
> + beq 1f
> + li r7, TM_SPC_PULL_OS_CTX
> + li r6, TM_QW1_OS
> + mfmsr r0
> + andi. r0, r0, MSR_IR /* in real mode? */
> + beq 2f
> + ld r10, HSTATE_XIVE_TM_AREA_VIRT(r13)
> + cmpldi cr0, r10, 0
> + beq 1f
> + lwzx r11, r7, r10
> + eieio
> + ldx r11, r6, r10
I assume you meant to do these two loads into the same target
register, but I don't know why, so a comment would be useful.
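(My guess is that the first load, from TM_SPC_PULL_OS_CTX, is only there
for its side effect of pulling the OS context out of the TM area and its
value is discarded, while the second load re-reads QW1 to get the state
that is saved below, i.e. roughly:

	/* Load from the "pull" offset: side effect only, value ignored */
	lwzx	r11, r7, r10
	eieio
	/* Re-read QW1 words 0-1 to capture the state we save below */
	ldx	r11, r6, r10

but if so, please say that in a comment.)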
> + b 3f
> +2: ld r10, HSTATE_XIVE_TM_AREA_PHYS(r13)
> + cmpldi cr0, r10, 0
> + beq 1f
> + lwzcix r11, r7, r10
> + eieio
> + ldcix r11, r6, r10
> +3: std r11, VCPU_XIVE_SAVED_STATE(r9)
> + /* Fixup some of the state for the next load */
> + li r10, 0
> + li r0, 0xff
> + stw r10, VCPU_XIVE_PUSHED(r9)
> + stb r10, (VCPU_XIVE_SAVED_STATE+3)(r9)
> + stb r0, (VCPU_XIVE_SAVED_STATE+4)(r9)
> +1:
> +#endif /* CONFIG_KVM_XICS */
> /* Save more register state */
> mfdar r6
> mfdsisr r7
> @@ -2035,7 +2086,7 @@ hcall_real_table:
> .long DOTSYM(kvmppc_rm_h_eoi) - hcall_real_table
> .long DOTSYM(kvmppc_rm_h_cppr) - hcall_real_table
> .long DOTSYM(kvmppc_rm_h_ipi) - hcall_real_table
> - .long 0 /* 0x70 - H_IPOLL */
> + .long DOTSYM(kvmppc_rm_h_ipoll) - hcall_real_table
> .long DOTSYM(kvmppc_rm_h_xirr) - hcall_real_table
> #else
> .long 0 /* 0x64 - H_EOI */
> @@ -2205,7 +2256,11 @@ hcall_real_table:
> .long 0 /* 0x2f0 */
> .long 0 /* 0x2f4 */
> .long 0 /* 0x2f8 */
> - .long 0 /* 0x2fc */
> +#ifdef CONFIG_KVM_XICS
> + .long DOTSYM(kvmppc_rm_h_xirr_x) - hcall_real_table
> +#else
> + .long 0 /* 0x2fc - H_XIRR_X*/
> +#endif
> .long DOTSYM(kvmppc_h_random) - hcall_real_table
> .globl hcall_real_table_end
> hcall_real_table_end:
> @@ -2980,6 +3035,7 @@ kvmppc_fix_pmao:
> isync
> blr
>
> +
Gratuitous extra blank line.
> #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
> /*
> * Start timing an activity
> diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c
> index 20528701..2d3b2b1 100644
> --- a/arch/powerpc/kvm/book3s_rtas.c
> +++ b/arch/powerpc/kvm/book3s_rtas.c
> @@ -16,6 +16,7 @@
> #include <asm/kvm_ppc.h>
> #include <asm/hvcall.h>
> #include <asm/rtas.h>
> +#include <asm/xive.h>
>
> #ifdef CONFIG_KVM_XICS
> static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
> @@ -32,7 +33,10 @@ static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
> server = be32_to_cpu(args->args[1]);
> priority = be32_to_cpu(args->args[2]);
>
> - rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority);
> + if (xive_enabled())
> + rc = kvmppc_xive_set_xive(vcpu->kvm, irq, server, priority);
> + else
> + rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority);
> if (rc)
> rc = -3;
> out:
> @@ -52,7 +56,10 @@ static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
> irq = be32_to_cpu(args->args[0]);
>
> server = priority = 0;
> - rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority);
> + if (xive_enabled())
> + rc = kvmppc_xive_get_xive(vcpu->kvm, irq, &server, &priority);
> + else
> + rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority);
> if (rc) {
> rc = -3;
> goto out;
> @@ -76,7 +83,10 @@ static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args)
>
> irq = be32_to_cpu(args->args[0]);
>
> - rc = kvmppc_xics_int_off(vcpu->kvm, irq);
> + if (xive_enabled())
> + rc = kvmppc_xive_int_off(vcpu->kvm, irq);
> + else
> + rc = kvmppc_xics_int_off(vcpu->kvm, irq);
> if (rc)
> rc = -3;
> out:
> @@ -95,7 +105,10 @@ static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args)
>
> irq = be32_to_cpu(args->args[0]);
>
> - rc = kvmppc_xics_int_on(vcpu->kvm, irq);
> + if (xive_enabled())
> + rc = kvmppc_xive_int_on(vcpu->kvm, irq);
> + else
> + rc = kvmppc_xics_int_on(vcpu->kvm, irq);
> if (rc)
> rc = -3;
> out:
> diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
> index ef4fd52..e6829c4 100644
> --- a/arch/powerpc/kvm/book3s_xics.c
> +++ b/arch/powerpc/kvm/book3s_xics.c
> @@ -1307,8 +1307,8 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
> return 0;
> }
>
> -int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
> - bool line_status)
> +int kvmppc_xics_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
> + bool line_status)
> {
> struct kvmppc_xics *xics = kvm->arch.xics;
>
> @@ -1317,14 +1317,6 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
> return ics_deliver_irq(xics, irq, level);
> }
>
> -int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *irq_entry,
> - struct kvm *kvm, int irq_source_id,
> - int level, bool line_status)
> -{
> - return kvm_set_irq(kvm, irq_source_id, irq_entry->gsi,
> - level, line_status);
> -}
> -
> static int xics_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> {
> struct kvmppc_xics *xics = dev->private;
> @@ -1458,29 +1450,6 @@ void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu)
> vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
> }
>
> -static int xics_set_irq(struct kvm_kernel_irq_routing_entry *e,
> - struct kvm *kvm, int irq_source_id, int level,
> - bool line_status)
> -{
> - return kvm_set_irq(kvm, irq_source_id, e->gsi, level, line_status);
> -}
> -
> -int kvm_irq_map_gsi(struct kvm *kvm,
> - struct kvm_kernel_irq_routing_entry *entries, int gsi)
> -{
> - entries->gsi = gsi;
> - entries->type = KVM_IRQ_ROUTING_IRQCHIP;
> - entries->set = xics_set_irq;
> - entries->irqchip.irqchip = 0;
> - entries->irqchip.pin = gsi;
> - return 1;
> -}
> -
> -int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
> -{
> - return pin;
> -}
> -
> void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long irq,
> unsigned long host_irq)
> {
> diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
> index ec5474c..5016676 100644
> --- a/arch/powerpc/kvm/book3s_xics.h
> +++ b/arch/powerpc/kvm/book3s_xics.h
> @@ -144,5 +144,10 @@ static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics,
> return ics;
> }
>
> +extern unsigned long xics_rm_h_xirr(struct kvm_vcpu *vcpu);
> +extern int xics_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
> + unsigned long mfrr);
> +extern int xics_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
> +extern int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
>
> #endif /* _KVM_PPC_BOOK3S_XICS_H */
> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
> new file mode 100644
> index 0000000..acc882d
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_xive.c
> @@ -0,0 +1,1898 @@
> +/*
> + * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/kvm_host.h>
> +#include <linux/err.h>
> +#include <linux/gfp.h>
> +#include <linux/spinlock.h>
> +#include <linux/delay.h>
> +#include <linux/percpu.h>
> +#include <linux/cpumask.h>
> +#include <asm/uaccess.h>
> +#include <asm/kvm_book3s.h>
> +#include <asm/kvm_ppc.h>
> +#include <asm/hvcall.h>
> +#include <asm/xics.h>
> +#include <asm/xive.h>
> +#include <asm/debug.h>
> +#include <asm/time.h>
> +#include <asm/opal.h>
> +
> +#include <linux/debugfs.h>
> +#include <linux/seq_file.h>
> +
> +#include "book3s_xive.h"
> +#include "../sysdev/xive/xive-regs.h"
> +
> +//#define DBG(fmt...) printk("KVM/XIVE: " fmt)
> +#define DBG(fmt...) do { } while(0)
> +
> +#ifdef XIVE_RUNTIME_CHECKS
> +#define xive_assert(cond) WARN_ON(!(cond))
> +#else
> +#define xive_assert(cond) (false)
> +#endif
> +
> +/*
> + * Virtual mode variants of the hcalls for use on radix/radix
> + * with AIL. They require the VCPU's VP to be "pushed"
> + *
> + * We still instanciate them here because we use some of the
> + * generated utility functions as well in this file.
> + */
> +#define XIVE_RUNTIME_CHECKS
> +#define X_PFX xive_vm_
> +#define X_STATIC static
> +#define X_STAT_PFX stat_vm_
> +#define __x_tm_area xive_tm_area
> +#define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_mmio))
> +#define __x_trig_page(xd) ((void __iomem *)((xd)->trig_mmio))
> +#define __x_readb __raw_readb
> +#define __x_writeb __raw_writeb
> +#define __x_readw __raw_readw
> +#define __x_readq __raw_readq
> +#define __x_writeq __raw_writeq
> +
> +#include "book3s_xive_template.c"
> +
> +/* We leave a gap of a couple of interrupts in the queue to
> + * account for the IPI and additional safety guard
> + */
> +#define XIVE_Q_GAP 2
> +
> +/*
> + * This is a simple trigger for a generic XIVE IRQ. This must
> + * only be called for interrupts that support a trigger page
> + */
> +static bool xive_irq_trigger(struct xive_irq_data *xd)
> +{
> + /* This should be only for MSIs */
> + if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI))
> + return false;
> +
> + /* Those interrupts should always have a trigger page */
> + if (WARN_ON(!xd->trig_mmio))
> + return false;
> +
> + out_be64(xd->trig_mmio, 0);
> +
> + return true;
> +}
> +
> +static irqreturn_t xive_esc_irq(int irq, void *data)
> +{
> + struct kvm_vcpu *vcpu = data;
> +
> + /* We use the existing H_PROD mechanism to wake up the target */
> + vcpu->arch.prodded = 1;
> + smp_mb();
> + if (vcpu->arch.ceded)
> + kvmppc_fast_vcpu_kick(vcpu);
> +
> + return IRQ_HANDLED;
> +}
> +
> +static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + struct xive_q *q = &xc->queues[prio];
> + char *name = NULL;
> + int rc;
> +
> + /* Already there ? */
> + if (xc->esc_virq[prio])
> + return 0;
> +
> + /* Hook up the escalation interrupt */
> + xc->esc_virq[prio] = irq_create_mapping(NULL, q->esc_irq);
> + if (!xc->esc_virq[prio]) {
> + pr_err("XIVE-KVM: Failed to map escalation interrupt"
> + " for queue %d of VCPU %d\n",
> + prio, xc->server_num);
> + return -EIO;
> + }
> +
> + /*
> + * Future improvement: start with them disabled
> + * and handle DD2 and later scheme of merged escalation
> + * interrupts
> + */
> + name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d\n",
> + vcpu->kvm->arch.lpid, xc->server_num, prio);
> + if (!name) {
> + pr_err("XIVE-KVM: Failed to allocate escalation irq name"
> + " for queue %d of VCPU %d\n",
> + prio, xc->server_num);
> + rc = -ENOMEM;
> + goto error;
> + }
> + rc = request_irq(xc->esc_virq[prio], xive_esc_irq,
> + IRQF_NO_THREAD, name, vcpu);
> + if (rc) {
> + pr_err("XIVE-KVM: Failed to request escalation interrupt"
> + " for queue %d of VCPU %d\n",
> + prio, xc->server_num);
> + goto error;
> + }
> + xc->esc_virq_names[prio] = name;
> + return 0;
> + error:
> + irq_dispose_mapping(xc->esc_virq[prio]);
> + xc->esc_virq[prio] = 0;
> + kfree(name);
> + return rc;
> +}
> +
> +static int xive_provision_queue(struct kvm_vcpu *vcpu, u8 prio)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + struct kvmppc_xive *xive = xc->xive;
> + struct xive_q *q = &xc->queues[prio];
> + void *qpage;
> + int rc;
> +
> + if (WARN_ON(q->qpage))
> + return 0;
> +
> + /* Allocate the queue and retrieve infos on current node for now */
> + qpage = (__be32 *)__get_free_pages(GFP_KERNEL, xive->q_alloc_order);
Possibly q_page_order would be a better name than q_alloc_order.
> + if (!qpage) {
> + pr_err("XIVE-KVM: Failed to allocate queue %d for VCPU %d\n",
> + prio, xc->server_num);
> + return -ENOMEM;;
> + }
> + memset(qpage, 0, 1 << xive->q_order);
> +
> + /*
> + * Reconfigure the queue. This will set q->qpage only once the
> + * queue is fully configured. This is a requirement for prio 0
> + * as we will stop doing EOIs for every IPI as soon as we observe
> + * qpage being non-NULL, and instead will only EOI when we receive
> + * corresponding queue 0 entries
> + */
> + rc = xive_native_configure_queue(xc->vp_id, q, prio, qpage,
> + xive->q_order, true);
> + if (rc)
> + pr_err("XIVE-KVM: Failed to configure queue %d for VCPU %d\n",
> + prio, xc->server_num);
> + return rc;
> +}
> +
> +/* Called with kvm_lock held */
> +static int xive_check_provisioning(struct kvm *kvm, u8 prio)
> +{
> + struct kvmppc_xive *xive = kvm->arch.xive;
> + struct kvm_vcpu *vcpu;
> + int i, rc;
> +
> + lockdep_assert_held(&kvm->lock);
> +
> + /* Already provisioned ? */
> + if (xive->qmap & (1 << prio))
> + return 0;
> +
> + DBG("Provisioning prio... %d\n", prio);
> +
> + /* Provision each VCPU and enable escalations */
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + if (!vcpu->arch.xive_vcpu)
> + continue;
> + rc = xive_provision_queue(vcpu, prio);
> + if (rc == 0)
> + xive_attach_escalation(vcpu, prio);
> + if (rc)
> + return rc;
> + }
> +
> + /* Order previous stores and mark it as provisioned */
> + mb();
> + xive->qmap |= (1 << prio);
> + return 0;
> +}
> +
> +static void xive_inc_q_pending(struct kvm *kvm, u32 server, u8 prio)
> +{
> + struct kvm_vcpu *vcpu;
> + struct kvmppc_xive_vcpu *xc;
> + struct xive_q *q;
> +
> + /* Locate target server */
> + vcpu = kvmppc_xive_find_server(kvm, server);
> + if (!vcpu) {
> + pr_warn("%s: Can't find server %d\n", __func__, server);
> + return;
> + }
> + xc = vcpu->arch.xive_vcpu;
> + if (WARN_ON(!xc))
> + return;
> +
> + q = &xc->queues[prio];
> + atomic_inc(&q->pending_count);
> +}
> +
> +static int xive_try_pick_queue(struct kvm_vcpu *vcpu, u8 prio)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + struct xive_q *q;
> + u32 max;
> +
> + if (WARN_ON(!xc))
> + return -ENXIO;
> + if (!xc->valid)
> + return -ENXIO;
> +
> + q = &xc->queues[prio];
> + if (WARN_ON(!q->qpage))
> + return -ENXIO;
> +
> + /* Calculate max number of interrupts in that queue. */
> + max = (q->msk + 1) - XIVE_Q_GAP;
> + return atomic_add_unless(&q->count, 1, max) ? 0 : -EBUSY;
> +}
> +
> +static int xive_select_target(struct kvm *kvm, u32 *server, u8 prio)
> +{
> + struct kvm_vcpu *vcpu;
> + int i, rc;
> +
> + /* Locate target server */
> + vcpu = kvmppc_xive_find_server(kvm, *server);
> + if (!vcpu) {
> + DBG("Can't find server %d\n", *server);
> + return -EINVAL;
> + }
> +
> + DBG("Finding irq target on 0x%x/%d...\n", *server, prio);
> +
> + /* Try pick it */
> + rc = xive_try_pick_queue(vcpu, prio);
> + if (rc == 0)
> + return rc;
> +
> + DBG(" .. failed, looking up candidate...\n");
> +
> + /* Failed, pick another VCPU */
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + if (!vcpu->arch.xive_vcpu)
> + continue;
> + rc = xive_try_pick_queue(vcpu, prio);
> + if (rc == 0) {
> + *server = vcpu->arch.xive_vcpu->server_num;
> + DBG(" found on 0x%x/%d\n", *server, prio);
> + return rc;
> + }
> + }
> + DBG(" no available target !\n");
> +
> + /* No available target ! */
> + return -EBUSY;
> +}
> +
> +static u8 xive_lock_and_mask(struct kvmppc_xive *xive,
> + struct kvmppc_xive_src_block *sb,
> + struct kvmppc_xive_irq_state *state)
> +{
> + struct xive_irq_data *xd;
> + u32 hw_num;
> + u8 old_prio;
> + u64 val;
> +
> + /*
> + * Take the lock, set masked, try again if racing
> + * with H_EOI
> + */
> + for (;;) {
> + arch_spin_lock(&sb->lock);
> + old_prio = state->guest_priority;
> + state->guest_priority = MASKED;
> + mb();
> + if (!state->in_eoi)
> + break;
> + state->guest_priority = old_prio;
> + arch_spin_unlock(&sb->lock);
> + }
> +
> + /* No change ? Bail */
> + if (old_prio == MASKED)
> + return old_prio;
> +
> + /* Get the right irq */
> + kvmppc_xive_select_irq(state, &hw_num, &xd);
> +
> + /*
> + * If the interrupt is marked as needing masking via
> + * firmware, we do it here. Firmware masking however
> + * is "lossy", it won't return the old p and q bits
> + * and won't set the interrupt to a state where it will
> + * record queued ones. If this is an issue we should do
> + * lazy masking instead.
> + *
> + * For now, we work around this in unmask by forcing
> + * an interrupt whenever we unmask a non-LSI via FW
> + * (if ever).
> + */
> + if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) {
> + xive_native_configure_irq(hw_num,
> + xive->vp_base + state->act_server,
> + MASKED, state->number);
> + /* set old_p so we can track if an H_EOI was done */
> + state->old_p = true;
> + state->old_q = false;
> + } else {
> + /* Set PQ to 10, return old P and old Q and remember them */
> + val = xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_10);
> + state->old_p = !!(val & 2);
> + state->old_q = !!(val & 1);
> +
> + /*
> + * Synchronize hardware to sensure the queues are updated
> + * when masking
> + */
> + xive_native_sync_source(hw_num);
> + }
> +
> + return old_prio;
> +}
> +
> +static void xive_lock_for_unmask(struct kvmppc_xive_src_block *sb,
> + struct kvmppc_xive_irq_state *state)
> +{
> + /*
> + * Take the lock try again if racing with H_EOI
> + */
> + for (;;) {
> + arch_spin_lock(&sb->lock);
> + if (!state->in_eoi)
> + break;
> + arch_spin_unlock(&sb->lock);
> + }
> +}
> +
> +static void xive_finish_unmask(struct kvmppc_xive *xive,
> + struct kvmppc_xive_src_block *sb,
> + struct kvmppc_xive_irq_state *state,
> + u8 prio)
> +{
> + struct xive_irq_data *xd;
> + u32 hw_num;
> +
> + /* If we aren't changing a thing, move on */
> + if (state->guest_priority != MASKED)
> + goto bail;
> +
> + /* Get the right irq */
> + kvmppc_xive_select_irq(state, &hw_num, &xd);
> +
> + /*
> + * See command in xive_lock_and_mask() concerning masking
> + * via firmware.
> + */
> + if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) {
> + xive_native_configure_irq(hw_num,
> + xive->vp_base + state->act_server,
> + state->act_priority, state->number);
> + /* If an EOI is needed, do it here */
> + if (!state->old_p)
> + xive_vm_source_eoi(hw_num, xd);
> + /* If this is not an LSI, force a trigger */
> + if (!(xd->flags & OPAL_XIVE_IRQ_LSI))
> + xive_irq_trigger(xd);
> + goto bail;
> + }
> +
> + /* Old Q set, set PQ to 11 */
> + if (state->old_q)
> + xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_11);
> +
> + /*
> + * If not old P, then perform an "effective" EOI,
> + * on the source. This will handle the cases where
> + * FW EOI is needed.
> + */
> + if (!state->old_p)
> + xive_vm_source_eoi(hw_num, xd);
> +
> + /* Synchronize ordering and mark unmasked */
> + mb();
> + bail:
> + state->guest_priority = prio;
> +}
> +
> +/*
> + * Target an interrupt to a given server/prio, this will fallback
> + * to another server if necessary and perform the HW targetting
> + * updates as needed
> + *
> + * NOTE: Must be called with the state lock held
> + */
> +static int xive_target_interrupt(struct kvm *kvm,
> + struct kvmppc_xive_irq_state *state,
> + u32 server, u8 prio)
> +{
> + struct kvmppc_xive *xive = kvm->arch.xive;
> + u32 hw_num;
> + int rc;
> +
> + /*
> + * This will return a tentative server and actual
> + * priority. The count for that new target will have
> + * already been incremented.
> + */
> + rc = xive_select_target(kvm, &server, prio);
> +
> + /* We failed to find a target ? Not much we can do
> + * at least until we support the GIQ.
> + */
> + if (rc)
> + return rc;
> +
> + /*
> + * Increment the old queue pending count if there
> + * was one so that the old queue count gets adjusted later
> + * when observed to be empty.
> + */
> + if (state->act_priority != MASKED)
> + xive_inc_q_pending(kvm,
> + state->act_server,
> + state->act_priority);
> + /*
> + * Update state and HW
> + */
> + state->act_priority = prio;
> + state->act_server = server;
> +
> + /* Get the right irq */
> + kvmppc_xive_select_irq(state, &hw_num, NULL);
> +
> + return xive_native_configure_irq(hw_num,
> + xive->vp_base + server,
> + prio, state->number);
> +}
> +
> +/*
> + * Targetting rules: In order to avoid losing track of
> + * pending interrupts accross mask and unmask, which would
> + * allow queue overflows, we implement the following rules:
> + *
> + * - Unless it was never enabled (or we run out of capacity)
> + * an interrupt is always targetted at a valid server/queue
> + * pair even when "masked" by the guest. This pair tends to
> + * be the last one used but it can be changed under some
> + * circumstances. That allows us to separate targetting
> + * from masking, we only handle accounting during (re)targetting,
> + * this also allows us to let an interrupt drain into its target
> + * queue after masking, avoiding complex schemes to remove
> + * interrupts out of remote processor queues.
> + *
> + * - When masking, we set PQ to 10 and save the previous value
> + * of P and Q.
> + *
> + * - When unmasking, if saved Q was set, we set PQ to 11
> + * otherwise we leave PQ to the HW state which will be either
> + * 10 if nothing happened or 11 if the interrupt fired while
> + * masked. Effectively we are OR'ing the previous Q into the
> + * HW Q.
> + *
> + * Then if saved P is clear, we do an effective EOI (Q->P->Trigger)
> + * which will unmask the interrupt and shoot a new one if Q was
> + * set.
> + *
> + * Otherwise (saved P is set) we leave PQ unchanged (so 10 or 11,
> + * effectively meaning an H_EOI from the guest is still expected
> + * for that interrupt).
> + *
> + * - If H_EOI occurs while masked, we clear the saved P.
> + *
> + * - When changing target, we account on the new target and
> + * increment a separate "pending" counter on the old one.
> + * This pending counter will be used to decrement the old
> + * target's count when its queue has been observed empty.
> + */
> +
> +int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
> + u32 priority)
> +{
> + struct kvmppc_xive *xive = kvm->arch.xive;
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + u8 new_act_prio;
> + int rc = 0;
> + u16 idx;
> +
> + if (!xive)
> + return -ENODEV;
> +
> + DBG("set_xive ! irq 0x%x server 0x%x prio %d\n",
> + irq, server, priority);
> +
> + /* First, check provisioning of queues */
> + if (priority != MASKED)
> + rc = xive_check_provisioning(xive->kvm,
> + xive_prio_from_guest(priority));
> + if (rc) {
> + DBG(" provisioning failure %d !\n", rc);
> + return rc;
> + }
> +
> + sb = kvmppc_xive_find_source(xive, irq, &idx);
> + if (!sb)
> + return -EINVAL;
> + state = &sb->irq_state[idx];
> +
> + /*
> + * We first handle masking/unmasking since the locking
> + * might need to be retried due to EOIs, we'll handle
> + * targetting changes later. These functions will return
> + * with the SB lock held.
> + *
> + * xive_lock_and_mask() will also set state->guest_priority
> + * but won't otherwise change other fields of the state.
> + *
> + * xive_lock_for_unmask will not actually unmask, this will
> + * be done later by xive_finish_unmask() once the targetting
> + * has been done, so we don't try to unmask an interrupt
> + * that hasn't yet been targetted.
> + */
> + if (priority == MASKED)
> + xive_lock_and_mask(xive, sb, state);
> + else
> + xive_lock_for_unmask(sb, state);
> +
> +
> + /*
> + * Then we handle targetting.
> + *
> + * First calculate a new "actual priority"
> + */
> + new_act_prio = state->act_priority;
> + if (priority != MASKED)
> + new_act_prio = xive_prio_from_guest(priority);
> +
> + DBG(" new_act_prio=%x act_server=%x act_prio=%x\n",
> + new_act_prio, state->act_server, state->act_priority);
> +
> + /*
> + * Then check if we actually need to change anything,
> + *
> + * The condition for re-targetting the interrupt is that
> + * we have a valid new priority (new_act_prio is not 0xff)
> + * and either the server or the priority changed.
> + *
> + * Note: If act_priority was ff and the new priority is
> + * also ff, we don't do anything and leave the interrupt
> + * untargetted. An attempt of doing an int_on on an
> + * untargetted interrupt will fail. If that is a problem
> + * we could initialize interrupts with valid default
> + */
> +
> + if (new_act_prio != MASKED &&
> + (state->act_server != server ||
> + state->act_priority != new_act_prio))
> + rc = xive_target_interrupt(kvm, state, server, new_act_prio);
> +
> + /*
> + * Perform the final unmasking of the interrupt source
> + * if necessary
> + */
> + if (priority != MASKED)
> + xive_finish_unmask(xive, sb, state, priority);
> +
> + /*
> + * Finally Update saved_priority to match. Only int_on/off
> + * set this field to a different value.
> + */
> + state->saved_priority = priority;
> +
> + arch_spin_unlock(&sb->lock);
> + return rc;
> +}
> +
> +int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server,
> + u32 *priority)
> +{
> + struct kvmppc_xive *xive = kvm->arch.xive;
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + u16 idx;
> +
> + if (!xive)
> + return -ENODEV;
> +
> + sb = kvmppc_xive_find_source(xive, irq, &idx);
> + if (!sb)
> + return -EINVAL;
> + state = &sb->irq_state[idx];
> + arch_spin_lock(&sb->lock);
> + *server = state->guest_server;
> + *priority = state->guest_priority;
> + arch_spin_unlock(&sb->lock);
> +
> + return 0;
> +}
> +
> +int kvmppc_xive_int_on(struct kvm *kvm, u32 irq)
> +{
> + struct kvmppc_xive *xive = kvm->arch.xive;
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + u16 idx;
> +
> + if (!xive)
> + return -ENODEV;
> +
> + sb = kvmppc_xive_find_source(xive, irq, &idx);
> + if (!sb)
> + return -EINVAL;
> + state = &sb->irq_state[idx];
> +
> + DBG("int_on(irq=0x%x)\n", irq);
> +
> + /*
> + * Check if interrupt was not targetted
> + */
> + if (state->act_priority == MASKED) {
> + DBG("int_on on untargetted interrupt\n");
> + return -EINVAL;
> + }
> +
> + /* If saved_priority is 0xff, do nothing */
> + if (state->saved_priority == MASKED)
> + return 0;
> +
> + /*
> + * Lock and unmask it.
> + */
> + xive_lock_for_unmask(sb, state);
> + xive_finish_unmask(xive, sb, state, state->saved_priority);
> + arch_spin_unlock(&sb->lock);
> +
> + return 0;
> +}
> +
> +int kvmppc_xive_int_off(struct kvm *kvm, u32 irq)
> +{
> + struct kvmppc_xive *xive = kvm->arch.xive;
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + u16 idx;
> +
> + if (!xive)
> + return -ENODEV;
> +
> + sb = kvmppc_xive_find_source(xive, irq, &idx);
> + if (!sb)
> + return -EINVAL;
> + state = &sb->irq_state[idx];
> +
> + DBG("int_off(irq=0x%x)\n", irq);
> +
> + /*
> + * Lock and mask
> + */
> + state->saved_priority = xive_lock_and_mask(xive, sb, state);
> + arch_spin_unlock(&sb->lock);
> +
> + return 0;
> +}
> +
> +static bool xive_restore_pending_irq(struct kvmppc_xive *xive, u32 irq)
> +{
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + u16 idx;
> +
> + sb = kvmppc_xive_find_source(xive, irq, &idx);
> + if (!sb)
> + return false;
> + state = &sb->irq_state[idx];
> + if (!state->valid)
> + return false;
> +
> + /*
> + * Trigger the IPI. This assumes we never restore a pass-through
> + * interrupt which should be safe enough
> + */
> + xive_irq_trigger(&state->ipi_data);
> +
> + return true;
> +}
> +
> +u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> +
> + if (!xc)
> + return 0;
> +
> + /* Return the per-cpu state for state saving/migration */
> + return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT |
> + (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT;
> +}
> +
> +int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
> + u8 cppr, mfrr;
> + u32 xisr;
> +
> + if (!xc || !xive)
> + return -ENOENT;
> +
> + /* Grab individual state fields. We don't use pending_pri */
> + cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT;
> + xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) &
> + KVM_REG_PPC_ICP_XISR_MASK;
> + mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT;
> +
> + DBG("set_icp vcpu %d cppr=0x%x mfrr=0x%x xisr=0x%x\n",
> + xc->server_num, cppr, mfrr, xisr);
> +
> + /*
> + * We can't update the state of a "pushed" VCPU, but that
> + * shouldn't happen.
> + */
> + if (WARN_ON(vcpu->arch.xive_pushed))
> + return -EIO;
> +
> + /* Update VCPU HW saved state */
> + vcpu->arch.xive_saved_state.cppr = cppr;
> + xc->hw_cppr = xc->cppr = cppr;
> +
> + /*
> + * Update MFRR state. If it's not 0xff, we mark the VCPU as
> + * having a pending MFRR change, which will re-evaluate the
> + * target. The VCPU will thus potentially get a spurious
> + * interrupt but that's not a big deal.
> + */
> + xc->mfrr = mfrr;
> + if (mfrr < cppr)
> + xive_irq_trigger(&xc->vp_ipi_data);
> +
> + /*
> + * Now saved XIRR is "interesting". It means there's something in
> + * the legacy "1 element" queue... for an IPI we simply ignore it,
> + * as the MFRR restore will handle that. For anything else we need
> + * to force a resend of the source.
> + * However, the source may not have been set up yet. If that's the
> + * case, we keep that info and increment a counter in the xive to
> + * tell subsequent xive_set_source() to go look.
> + */
> + if (xisr > XICS_IPI && !xive_restore_pending_irq(xive, xisr)) {
> + xc->delayed_irq = xisr;
> + xive->delayed_irqs++;
> + DBG(" xisr restore delayed\n");
> + }
> +
> + return 0;
> +}
> +
> +int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
> + struct irq_desc *host_desc)
> +{
> + struct kvmppc_xive *xive = kvm->arch.xive;
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + struct irq_data *host_data = irq_desc_get_irq_data(host_desc);
> + unsigned int host_irq = irq_desc_get_irq(host_desc);
> + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data);
> + u16 idx;
> + u8 prio;
> + int rc;
> +
> + if (!xive)
> + return -ENODEV;
> +
> + DBG("set_mapped girq 0x%lx host HW irq 0x%x...\n", guest_irq, hw_irq);
> +
> + sb = kvmppc_xive_find_source(xive, guest_irq, &idx);
> + if (!sb)
> + return -EINVAL;
> + state = &sb->irq_state[idx];
> +
> + /*
> + * Mark the passed-through interrupt as going to a VCPU,
> + * this will prevent further EOIs and similar operations
> + * from the XIVE code. It will also mask the interrupt
> + * to either PQ=10 or 11 state, the latter if the interrupt
> + * is pending. This will allow us to unmask or retrigger it
> + * after routing it to the guest with a simple EOI.
> + *
> + * The "state" argument is a "token", all it needs is to be
> + * non-NULL to switch to passed-through or NULL for the
> + * other way around. We may not yet have an actual VCPU
> + * target here and we don't really care.
> + */
> + rc = irq_set_vcpu_affinity(host_irq, state);
> + if (rc) {
> + pr_err("Failed to set VCPU affinity for irq %d\n", host_irq);
> + return rc;
> + }
> +
> + /*
> + * Mask and read state of IPI. We need to know if its P bit
> + * is set as that means it's potentially already using a
> + * queue entry in the target
> + */
> + prio = xive_lock_and_mask(xive, sb, state);
> + DBG(" old IPI prio %02x P:%d Q:%d\n", prio, state->old_p, state->old_q);
> +
> + /* Turn the IPI hard off */
> + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
> +
> + /* Grab info about irq */
> + state->pt_number = hw_irq;
> + state->pt_data = irq_data_get_irq_handler_data(host_data);
> +
> + /*
> + * Configure the IRQ to match the existing configuration of
> + * the IPI if it was already targetted. Otherwise this will
> + * mask the interrupt in a lossy way (act_priority is 0xff)
> + * which is fine for a never started interrupt.
> + */
> + xive_native_configure_irq(hw_irq,
> + xive->vp_base + state->act_server,
> + state->act_priority, state->number);
> +
> + /*
> + * We do an EOI to enable the interrupt (and retrigger if needed)
> + * if the guest has the interrupt unmasked and the P bit was *not*
> + * set in the IPI. If it was set, we know a slot may still be in
> + * use in the target queue thus we have to wait for a guest
> + * originated EOI
> + */
> + if (prio != MASKED && !state->old_p)
> + xive_vm_source_eoi(hw_irq, state->pt_data);
> +
> + /* Clear old_p/old_q as they are no longer relevant */
> + state->old_p = state->old_q = false;
> +
> + /* Restore guest prio (unlocks EOI) */
> + mb();
> + state->guest_priority = prio;
> + arch_spin_unlock(&sb->lock);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_xive_set_mapped);
> +
> +int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
> + struct irq_desc *host_desc)
> +{
> + struct kvmppc_xive *xive = kvm->arch.xive;
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + unsigned int host_irq = irq_desc_get_irq(host_desc);
> + u16 idx;
> + u8 prio;
> + int rc;
> +
> + if (!xive)
> + return -ENODEV;
> +
> + DBG("clr_mapped girq 0x%lx...\n", guest_irq);
> +
> + sb = kvmppc_xive_find_source(xive, guest_irq, &idx);
> + if (!sb)
> + return -EINVAL;
> + state = &sb->irq_state[idx];
> +
> + /*
> + * Mask and read state of IRQ. We need to know if its P bit
> + * is set as that means it's potentially already using a
> + * queue entry in the target
> + */
> + prio = xive_lock_and_mask(xive, sb, state);
> + DBG(" old IRQ prio %02x P:%d Q:%d\n", prio, state->old_p, state->old_q);
> +
> + /*
> + * If old_p is set, the interrupt is pending, we switch it to
> + * PQ=11. This will force a resend in the host so the interrupt
> + * isn't lost to whatever host driver may pick it up
> + */
> + if (state->old_p)
> + xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_11);
> +
> + /* Relase the passed-through interrupt to the host */
^^^^^^ Release
> + rc = irq_set_vcpu_affinity(host_irq, NULL);
> + if (rc) {
> + pr_err("Failed to clr VCPU affinity for irq %d\n", host_irq);
> + return rc;
> + }
> +
> + /* Forget about the IRQ */
> + state->pt_number = 0;
> + state->pt_data = NULL;
> +
> + /* Reconfigure the IPI */
> + xive_native_configure_irq(state->ipi_number,
> + xive->vp_base + state->act_server,
> + state->act_priority, state->number);
> +
> + /*
> + * If old_p is set (we have a queue entry potentially
> + * occupied) or the interrupt is masked, we set the IPI
> + * to PQ=10 state. Otherwise we just re-enable it (PQ=00).
> + */
> + if (prio == MASKED || state->old_p)
> + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_10);
> + else
> + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_00);
> +
> + /* Restore guest prio (unlocks EOI) */
> + mb();
> + state->guest_priority = prio;
> + arch_spin_unlock(&sb->lock);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_xive_clr_mapped);
> +
> +static void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + struct kvm *kvm = vcpu->kvm;
> + struct kvmppc_xive *xive = kvm->arch.xive;
> + int i, j;
> +
> + for (i = 0; i <= xive->max_sbid; i++) {
> + struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
> +
> + if (!sb)
> + continue;
> + for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
> + struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
> +
> + if (!state->valid)
> + continue;
> + if (state->act_priority == MASKED)
> + continue;
> + if (state->act_server != xc->server_num)
> + continue;
> +
> + /* Clean it up */
> + arch_spin_lock(&sb->lock);
> + state->act_priority = MASKED;
> + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
> + xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
> + if (state->pt_number) {
> + xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
> + xive_native_configure_irq(state->pt_number, 0, MASKED, 0);
> + }
> + arch_spin_unlock(&sb->lock);
> + }
> + }
> +}
> +
> +void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + struct kvmppc_xive *xive = xc->xive;
> + int i;
> +
> + DBG("cleanup_vcpu(cpu=%d)\n", xc->server_num);
> +
> + /* Ensure no interrupt is still routed to that VP */
> + xc->valid = false;
> + kvmppc_xive_disable_vcpu_interrupts(vcpu);
> +
> + /* Mask the VP IPI */
> + xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01);
> +
> + /* Disable the VP */
> + xive_native_disable_vp(xc->vp_id);
> +
> + /* Free the queues & associated interrupts */
> + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
> + struct xive_q *q = &xc->queues[i];
> +
> + /* Free the escalation irq */
> + if (xc->esc_virq[i]) {
> + free_irq(xc->esc_virq[i], vcpu);
> + irq_dispose_mapping(xc->esc_virq[i]);
> + kfree(xc->esc_virq_names[i]);
> + }
> + /* Free the queue */
> + xive_native_disable_queue(xc->vp_id, q, i);
> + if (q->qpage) {
> + free_pages((unsigned long)q->qpage,
> + xive->q_alloc_order);
> + q->qpage = NULL;
> + }
> + }
> +
> + /* Free the IPI */
> + if (xc->vp_ipi) {
> + xive_cleanup_irq_data(&xc->vp_ipi_data);
> + xive_native_free_irq(xc->vp_ipi);
> + }
> + /* Free the VP */
> + kfree(xc);
> +}
> +
> +int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
> + struct kvm_vcpu *vcpu, u32 cpu)
> +{
> + struct kvmppc_xive *xive = dev->private;
> + struct kvmppc_xive_vcpu *xc;
> + int i, r = -EBUSY;
> +
> + DBG("connect_vcpu(cpu=%d)\n", cpu);
> +
> + if (dev->ops != &kvm_xive_ops) {
> + DBG("Wrong ops !\n");
> + return -EPERM;
> + }
> + if (xive->kvm != vcpu->kvm)
> + return -EPERM;
> + if (vcpu->arch.irq_type)
> + return -EBUSY;
> + if (kvmppc_xive_find_server(vcpu->kvm, cpu)) {
> + DBG("Duplicate !\n");
> + return -EEXIST;
> + }
> + if (cpu >= KVM_MAX_VCPUS) {
> + DBG("Out of bounds !\n");
> + return -EINVAL;
> + }
> + xc = kzalloc(sizeof(*xc), GFP_KERNEL);
> + if (!xc)
> + return -ENOMEM;
> +
> + /* We need to synchronize with queue provisioning */
> + mutex_lock(&vcpu->kvm->lock);
> + vcpu->arch.xive_vcpu = xc;
> + xc->xive = xive;
> + xc->vcpu = vcpu;
> + xc->server_num = cpu;
> + xc->vp_id = xive->vp_base + cpu;
> + xc->mfrr = 0xff;
> + xc->valid = true;
> +
> + r = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
> + if (r)
> + goto bail;
> +
> + /* Configure VCPU fields for use by assembly push/pull */
> + vcpu->arch.xive_saved_state.qw = cpu_to_be64(0xff000000);
> + vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);
> +
> + /* Allocate IPI */
> + xc->vp_ipi = xive_native_alloc_irq();
> + if (!xc->vp_ipi) {
> + r = -EIO;
> + goto bail;
> + }
> + DBG(" IPI=0x%x\n", xc->vp_ipi);
> +
> + r = xive_native_populate_irq_data(xc->vp_ipi, &xc->vp_ipi_data);
> + if (r)
> + goto bail;
> +
> + /*
> + * Initialize queues. Initially we set them all for no queueing
> + * and we enable escalation for queue 0 only which we'll use for
> + * our mfrr change notifications. If the VCPU is hot-plugged, we
> + * do handle provisioning however.
> + */
> + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
> + struct xive_q *q = &xc->queues[i];
> +
> + /* Is queue already enabled ? Provision it */
> + if (xive->qmap & (1 << i)) {
> + r = xive_provision_queue(vcpu, i);
> + if (r == 0)
> + xive_attach_escalation(vcpu, i);
> + if (r)
> + goto bail;
> + } else {
> + r = xive_native_configure_queue(xc->vp_id,
> + q, i, NULL, 0, true);
> + if (r) {
> + pr_err("XIVE-KVM: Failed to configure queue %d"
> + " for VCPU %d\n",
> + i, cpu);
> + goto bail;
> + }
> + }
> + }
> +
> + /* If not done above, attach priority 0 escalation */
> + r = xive_attach_escalation(vcpu, 0);
> + if (r)
> + goto bail;
> +
> + /* Enable the VP */
> + r = xive_native_enable_vp(xc->vp_id);
> + if (r)
> + goto bail;
> +
> + /* Route the IPI */
> + r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI);
> + if (!r)
> + xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_00);
> +
> + bail:
> + mutex_unlock(&vcpu->kvm->lock);
> + if (r) {
> + kvmppc_xive_cleanup_vcpu(vcpu);
> + return r;
> + }
> +
> + vcpu->arch.irq_type = KVMPPC_IRQ_XICS;
> + return 0;
> +}
> +
> +/*
> + * Scanning of queues before/after migration save
> + */
> +static void xive_pre_save_set_queued(struct kvmppc_xive *xive, u32 irq)
> +{
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + u16 idx;
> +
> + sb = kvmppc_xive_find_source(xive, irq, &idx);
> + if (!sb)
> + return;
> +
> + state = &sb->irq_state[idx];
> +
> + /* Some sanity checking */
> + if (!state->valid) {
> + pr_err("XIVE/XIVE: invalid irq 0x%x in cpu queue!\n", irq);
> + return;
> + }
> +
> + /*
> + * If the interrupt is in a queue it should have P set.
> + * We warn so that it gets reported. A backtrace isn't useful
> + * so no need to use a WARN_ON.
> + */
> + if (!state->saved_p)
> + pr_err("KVM/XIVE: Interrupt 0x%x is marked in a queue"
> + " but P not set !\n", irq);
> +
> + /* Set flag */
> + state->in_queue = true;
> +}
> +
> +static void xive_pre_scan_mask_irq(struct kvmppc_xive *xive,
> + struct kvmppc_xive_src_block *sb,
> + u32 irq)
> +{
> + struct kvmppc_xive_irq_state *state = &sb->irq_state[irq];
> +
> + if (!state->valid)
> + return;
> +
> + /* Mask and save state, this will also sync HW queues */
> + state->saved_scan_prio = xive_lock_and_mask(xive, sb, state);
> +
> + /* Transfer P and Q */
> + state->saved_p = state->old_p;
> + state->saved_q = state->old_q;
> +
> + /* Unlock */
> + arch_spin_unlock(&sb->lock);
> +}
> +
> +static void xive_pre_scan_unmask_irq(struct kvmppc_xive *xive,
I think a better name would be "xive_pre_save_unmask", since this is
actually called after the scan.
> + struct kvmppc_xive_src_block *sb,
> + u32 irq)
> +{
> + struct kvmppc_xive_irq_state *state = &sb->irq_state[irq];
> +
> + if (!state->valid)
> + return;
> +
> + /*
> + * Lock / exclude EOI (not technically necessary if the
> + * guest isn't running concurrently). If this becomes a
> + * performance issue we can probably remove the lock.
> + */
> + xive_lock_for_unmask(sb, state);
> +
> + /* Restore mask/prio if it wasn't masked */
> + if (state->saved_scan_prio != MASKED)
> + xive_finish_unmask(xive, sb, state, state->saved_scan_prio);
> +
> + /* Unlock */
> + arch_spin_unlock(&sb->lock);
> +}
> +
> +static void xive_pre_save_queue(struct kvmppc_xive *xive, struct xive_q *q)
> +{
> + u32 idx = q->idx;
> + u32 toggle = q->toggle;
> + u32 irq;
> +
> + do {
> + irq = __xive_read_eq(q->qpage, q->msk, &idx, &toggle);
> + if (irq > XICS_IPI)
> + xive_pre_save_set_queued(xive, irq);
> + } while (irq);
> +}
> +
> +static void xive_pre_save_scan(struct kvmppc_xive *xive)
> +{
> + struct kvm_vcpu *vcpu = NULL;
> + int i, j;
> +
> + /*
> + * See comment in xive_get_source() about how this
> + * works. Collect a stable state for all interrupts
> + */
> + for (i = 0; i <= xive->max_sbid; i++) {
> + struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
> + if (!sb)
> + continue;
> + for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++)
> + xive_pre_scan_mask_irq(xive, sb, j);
> + }
> +
> + /* Then scan the queues and update the "in_queue" flag */
> + kvm_for_each_vcpu(i, vcpu, xive->kvm) {
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + if (!xc)
> + continue;
> + for (j = 0; j < KVMPPC_XIVE_Q_COUNT; j++) {
> + if (xc->queues[j].qpage)
> + xive_pre_save_queue(xive, &xc->queues[j]);
> + }
> + }
> +
> + /* Finally restore interrupt states */
> + for (i = 0; i <= xive->max_sbid; i++) {
> + struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
> + if (!sb)
> + continue;
> + for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++)
> + xive_pre_scan_unmask_irq(xive, sb, j);
> + }
> +}
> +
> +static void xive_post_save_scan(struct kvmppc_xive *xive)
> +{
> + u32 i, j;
> +
> + /* Clear all the in_queue flags */
> + for (i = 0; i <= xive->max_sbid; i++) {
> + struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
> + if (!sb)
> + continue;
> + for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++)
> + sb->irq_state[j].in_queue = false;
> + }
> +
> + /* Next get_source() will do a new scan */
> + xive->saved_src_count = 0;
> +}
> +
> +/*
> + * This returns the source configuration and state to user space.
> + */
> +static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr)
> +{
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + u64 __user *ubufp = (u64 __user *) addr;
> + u64 val, prio;
> + u16 idx;
> +
> + sb = kvmppc_xive_find_source(xive, irq, &idx);
> + if (!sb)
> + return -ENOENT;
> +
> + state = &sb->irq_state[idx];
> +
> + if (!state->valid)
> + return -ENOENT;
> +
> + DBG("get_source(%ld)...\n", irq);
> +
> + /*
> + * So to properly save the state into something that looks like a
> + * XICS migration stream we cannot treat interrupts individually.
> + *
> + * We need, instead, mask them all (& save their previous PQ state)
> + * to get a stable state in the HW, then sync them to ensure that
> + * any interrupt that had already fired hits its queue, and finally
> + * scan all the queues to collect which interrupts are still present
> + * in the queues, so we can set the "pending" flag on them and
> + * they can be resent on restore.
> + *
> + * So we do it all when the "first" interrupt gets saved, all the
> + * state is collected at that point, the rest of xive_get_source()
> + * will merely collect and convert that state to the expected
> + * userspace bit mask.
> + */
> + if (xive->saved_src_count == 0)
> + xive_pre_save_scan(xive);
> + xive->saved_src_count++;
> +
> + /* Convert saved state into something compatible with xics */
> + val = state->guest_server;
> + prio = state->saved_scan_prio;
> +
> + if (prio == MASKED) {
> + val |= KVM_XICS_MASKED;
> + prio = state->saved_priority;
> + }
> + val |= prio << KVM_XICS_PRIORITY_SHIFT;
> + if (state->lsi) {
> + val |= KVM_XICS_LEVEL_SENSITIVE;
> + if (state->saved_p)
> + val |= KVM_XICS_PENDING;
> + } else {
> + if (state->saved_p)
> + val |= KVM_XICS_PRESENTED;
> +
> + if (state->saved_q)
> + val |= KVM_XICS_QUEUED;
> +
> + /*
> + * We mark it pending (which will attempt a re-delivery)
> + * if we are in a queue *or* we were masked and had
> + * Q set which is equivalent to the XICS "masked pending"
> + * state
> + */
> + if (state->in_queue || (prio == MASKED && state->saved_q))
> + val |= KVM_XICS_PENDING;
> + }
> +
> + /*
> + * If that was the last interrupt saved, reset the
> + * in_queue flags
> + */
> + if (xive->saved_src_count == xive->src_count)
> + xive_post_save_scan(xive);
> +
> + /* Copy the result to userspace */
> + if (put_user(val, ubufp))
> + return -EFAULT;
> +
> + return 0;
> +}
> +
> +static struct kvmppc_xive_src_block *xive_create_src_block(struct kvmppc_xive *xive,
> + int irq)
> +{
> + struct kvm *kvm = xive->kvm;
> + struct kvmppc_xive_src_block *sb;
> + int i, bid;
> +
> + bid = irq >> KVMPPC_XICS_ICS_SHIFT;
> +
> + mutex_lock(&kvm->lock);
> +
> + /* block already exists - somebody else got here first */
> + if (xive->src_blocks[bid])
> + goto out;
> +
> + /* Create the ICS */
> + sb = kzalloc(sizeof(*sb), GFP_KERNEL);
> + if (!sb)
> + goto out;
> +
> + sb->id = bid;
> +
> + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
> + sb->irq_state[i].number = (bid << KVMPPC_XICS_ICS_SHIFT) | i;
> + sb->irq_state[i].guest_priority = MASKED;
> + sb->irq_state[i].saved_priority = MASKED;
> + sb->irq_state[i].act_priority = MASKED;
> + }
> + smp_wmb();
> + xive->src_blocks[bid] = sb;
> +
> + if (bid > xive->max_sbid)
> + xive->max_sbid = bid;
> +
> + out:
> + mutex_unlock(&kvm->lock);
> + return xive->src_blocks[bid];
> +}
> +
> +static bool xive_check_delayed_irq(struct kvmppc_xive *xive, u32 irq)
> +{
> + struct kvm *kvm = xive->kvm;
> + struct kvm_vcpu *vcpu = NULL;
> + int i;
> +
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> +
> + if (!xc)
> + continue;
> +
> + if (xc->delayed_irq == irq) {
> + xc->delayed_irq = 0;
> + xive->delayed_irqs--;
> + return true;
> + }
> + }
> + return false;
> +}
> +
> +static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
> +{
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + u64 __user *ubufp = (u64 __user *) addr;
> + u16 idx;
> + u64 val;
> + u8 act_prio, guest_prio;
> + u32 server;
> + int rc = 0;
> +
> + if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS)
> + return -ENOENT;
> +
> + DBG("set_source(irq=0x%lx)\n", irq);
> +
> + /* Find the source */
> + sb = kvmppc_xive_find_source(xive, irq, &idx);
> + if (!sb) {
> + DBG("No source, creating source block...\n");
> + sb = xive_create_src_block(xive, irq);
> + if (!sb) {
> + DBG("Failed to create block...\n");
> + return -ENOMEM;
> + }
> + }
> + state = &sb->irq_state[idx];
> +
> + /* Read user passed data */
> + if (get_user(val, ubufp)) {
> + DBG("fault getting user info !\n");
> + return -EFAULT;
> + }
> +
> + server = val & KVM_XICS_DESTINATION_MASK;
> + guest_prio = val >> KVM_XICS_PRIORITY_SHIFT;
> +
> + DBG(" val=0x016%llx (server=0x%x, guest_prio=%d)\n",
> + val, server, guest_prio);
> + /*
> + * If the source doesn't already have an IPI, allocate
> + * one and get the corresponding data
> + */
> + if (!state->ipi_number) {
> + state->ipi_number = xive_native_alloc_irq();
> + if (state->ipi_number == 0) {
> + DBG("Failed to allocate IPI !\n");
> + return -ENOMEM;
> + }
> + xive_native_populate_irq_data(state->ipi_number, &state->ipi_data);
> + DBG(" src_ipi=0x%x\n", state->ipi_number);
> + }
> +
> + /*
> + * We use lock_and_mask() to set us in the right masked
> + * state. We will override that state from the saved state
> + * further down, but this will handle the cases of interrupts
> + * that need FW masking. We set the initial guest_priority to
> + * 0 before calling it to ensure it actually performs the masking.
> + */
> + state->guest_priority = 0;
> + xive_lock_and_mask(xive, sb, state);
> +
> + /*
> + * Now, we select a target if we have one. If we don't we
> + * leave the interrupt untargetted. It means that an interrupt
> + * can become "untargetted" accross migration if it was masked
> + * by set_xive() but there is little we can do about it.
> + */
> +
> + /* First convert prio and mark interrupt as untargetted */
> + act_prio = xive_prio_from_guest(guest_prio);
> + state->act_priority = MASKED;
> + state->guest_server = server;
> +
> + /*
> + * We need to drop the lock due to the mutex below. Hopefully
> + * nothing is touching that interrupt yet since it hasn't been
> + * advertised to a running guest yet
> + */
> + arch_spin_unlock(&sb->lock);
> +
> + /* If we have a priority target the interrupt */
> + if (act_prio != MASKED) {
> + /* First, check provisioning of queues */
> + mutex_lock(&xive->kvm->lock);
> + rc = xive_check_provisioning(xive->kvm, act_prio);
> + mutex_unlock(&xive->kvm->lock);
> +
> + /* Target interrupt */
> + if (rc == 0)
> + rc = xive_target_interrupt(xive->kvm, state,
> + server, act_prio);
> + /*
> + * If provisioning or targetting failed, leave it
> + * alone and masked. It will remain disabled until
> + * the guest re-targets it.
> + */
> + }
> +
> + /*
> + * Find out if this was a delayed irq stashed in an ICP,
> + * in which case, treat it as pending
> + */
> + if (xive->delayed_irqs && xive_check_delayed_irq(xive, irq)) {
> + val |= KVM_XICS_PENDING;
> + DBG(" Found delayed ! forcing PENDING !\n");
> + }
> +
> + /* Cleanup the SW state */
> + state->old_p = false;
> + state->old_q = false;
> + state->lsi = false;
> + state->asserted = false;
> +
> + /* Restore LSI state */
> + if (val & KVM_XICS_LEVEL_SENSITIVE) {
> + state->lsi = true;
> + if (val & KVM_XICS_PENDING)
> + state->asserted = true;
> + DBG(" LSI ! Asserted=%d\n", state->asserted);
> + }
> +
> + /*
> + * Restore P and Q. If the interrupt was pending, we
> + * force both P and Q, which will trigger a resend.
> + *
> + * That means that a guest that had both an interrupt
> + * pending (queued) and Q set will restore with only
> + * one instance of that interrupt instead of 2, but that
> + * is perfectly fine as coalescing interrupts that haven't
> + * been presented yet is always allowed.
> + */
> + if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING)
> + state->old_p = true;
> + if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING)
> + state->old_q = true;
> +
> + DBG(" P=%d, Q=%d\n", state->old_p, state->old_q);
> +
> + /*
> + * If the interrupt was unmasked, update guest priority and
> + * perform the appropriate state transition and do a
> + * re-trigger if necessary.
> + */
> + if (val & KVM_XICS_MASKED) {
> + DBG(" masked, saving prio\n");
> + state->guest_priority = MASKED;
> + state->saved_priority = guest_prio;
> + } else {
> + DBG(" unmasked, restoring to prio %d\n", guest_prio);
> + xive_finish_unmask(xive, sb, state, guest_prio);
> + state->saved_priority = guest_prio;
> + }
> +
> + /* Increment the number of valid sources and mark this one valid */
> + if (!state->valid)
> + xive->src_count++;
> + state->valid = true;
> +
> + return 0;
> +}
> +
> +int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
> + bool line_status)
> +{
> + struct kvmppc_xive *xive = kvm->arch.xive;
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + u16 idx;
> +
> + if (!xive)
> + return -ENODEV;
> +
> + sb = kvmppc_xive_find_source(xive, irq, &idx);
> + if (!sb)
> + return -EINVAL;
> +
> + /* Perform locklessly .... (we need to do some RCUisms here...) */
> + state = &sb->irq_state[idx];
> + if (!state->valid)
> + return -EINVAL;
> +
> + /* We don't allow a trigger on a passed-through interrupt */
> + if (state->pt_number)
> + return -EINVAL;
> +
> + if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL)
> + state->asserted = 1;
> + else if (level == 0 || level == KVM_INTERRUPT_UNSET) {
> + state->asserted = 0;
> + return 0;
> + }
> +
> + /* Trigger the IPI */
> + xive_irq_trigger(&state->ipi_data);
> +
> + return 0;
> +}
> +
> +static int xive_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> +{
> + struct kvmppc_xive *xive = dev->private;
> +
> + /* We honor the existing XICS ioctl */
> + switch (attr->group) {
> + case KVM_DEV_XICS_GRP_SOURCES:
> + return xive_set_source(xive, attr->attr, attr->addr);
> + }
> + return -ENXIO;
> +}
> +
> +static int xive_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> +{
> + struct kvmppc_xive *xive = dev->private;
> +
> + /* We honor the existing XICS ioctl */
> + switch (attr->group) {
> + case KVM_DEV_XICS_GRP_SOURCES:
> + return xive_get_source(xive, attr->attr, attr->addr);
> + }
> + return -ENXIO;
> +}
> +
> +static int xive_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> +{
> + /* We honor the same limits as XICS, at least for now */
> + switch (attr->group) {
> + case KVM_DEV_XICS_GRP_SOURCES:
> + if (attr->attr >= KVMPPC_XICS_FIRST_IRQ &&
> + attr->attr < KVMPPC_XICS_NR_IRQS)
> + return 0;
> + break;
> + }
> + return -ENXIO;
> +}
> +
> +static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd)
> +{
> + xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01);
> + xive_native_configure_irq(hw_num, 0, MASKED, 0);
> + xive_cleanup_irq_data(xd);
> +}
> +
> +static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
> +{
> + int i;
> +
> + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
> + struct kvmppc_xive_irq_state *state = &sb->irq_state[i];
> +
> + if (!state->valid)
> + continue;
> +
> + kvmppc_xive_cleanup_irq(state->ipi_number, &state->ipi_data);
> + xive_native_free_irq(state->ipi_number);
> +
> + /* Pass-through, cleanup too */
> + if (state->pt_number)
> + kvmppc_xive_cleanup_irq(state->pt_number, state->pt_data);
> +
> + state->valid = false;
> + }
> +}
> +
> +static void kvmppc_xive_free(struct kvm_device *dev)
> +{
> + struct kvmppc_xive *xive = dev->private;
> + struct kvm *kvm = xive->kvm;
> + int i;
> +
> + debugfs_remove(xive->dentry);
> +
> + if (kvm)
> + kvm->arch.xive = NULL;
> +
> + /* Mask and free interrupts */
> + for (i = 0; i <= xive->max_sbid; i++) {
> + if (xive->src_blocks[i])
> + kvmppc_xive_free_sources(xive->src_blocks[i]);
> + kfree(xive->src_blocks[i]);
> + xive->src_blocks[i] = NULL;
> + }
> +
> + if (xive->vp_base != XIVE_INVALID_VP)
> + xive_native_free_vp_block(xive->vp_base);
> +
> +
> + kfree(xive);
> + kfree(dev);
> +}
> +
> +static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
> +{
> + struct kvmppc_xive *xive;
> + struct kvm *kvm = dev->kvm;
> + int ret = 0;
> +
> + DBG("Creating xive for partition\n");
> +
> + xive = kzalloc(sizeof(*xive), GFP_KERNEL);
> + if (!xive)
> + return -ENOMEM;
> +
> + dev->private = xive;
> + xive->dev = dev;
> + xive->kvm = kvm;
> +
> + /* Already there ? */
> + if (kvm->arch.xive)
> + ret = -EEXIST;
> + else
> + kvm->arch.xive = xive;
> +
> + /* We use the default queue size set by the host */
> + xive->q_order = xive_native_default_eq_shift();
> + if (xive->q_order < PAGE_SHIFT)
> + xive->q_alloc_order = 0;
> + else
> + xive->q_alloc_order = xive->q_order - PAGE_SHIFT;
> +
> + /* Allocate a bunch of VPs */
> + xive->vp_base = xive_native_alloc_vp_block(KVM_MAX_VCPUS);
> + DBG("VP_Base=%x\n", xive->vp_base);
> + if (xive->vp_base == XIVE_INVALID_VP)
> + ret = -ENOMEM;
> +
> + if (ret) {
> + kfree(xive);
> + return ret;
> + }
> +
> + return 0;
> +}
> +
> +
> +static int xive_debug_show(struct seq_file *m, void *private)
> +{
> + struct kvmppc_xive *xive = m->private;
> + struct kvm *kvm = xive->kvm;
> + struct kvm_vcpu *vcpu;
> + u64 t_rm_h_xirr = 0;
> + u64 t_rm_h_ipoll = 0;
> + u64 t_rm_h_cppr = 0;
> + u64 t_rm_h_eoi = 0;
> + u64 t_rm_h_ipi = 0;
> + u64 t_vm_h_xirr = 0;
> + u64 t_vm_h_ipoll = 0;
> + u64 t_vm_h_cppr = 0;
> + u64 t_vm_h_eoi = 0;
> + u64 t_vm_h_ipi = 0;
> + unsigned int i;
> +
> + if (!kvm)
> + return 0;
> +
> + seq_printf(m, "=========\nVCPU state\n=========\n");
> +
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> +
> + if (!xc)
> + continue;
> +
> + seq_printf(m, "cpu server %#x CPPR:%#x HWCPPR:%#x"
> + " MFRR:%#x PEND:%#x h_xirr: R=%lld V=%lld\n",
> + xc->server_num, xc->cppr, xc->hw_cppr,
> + xc->mfrr, xc->pending,
> + xc->stat_rm_h_xirr, xc->stat_vm_h_xirr);
> + t_rm_h_xirr += xc->stat_rm_h_xirr;
> + t_rm_h_ipoll += xc->stat_rm_h_ipoll;
> + t_rm_h_cppr += xc->stat_rm_h_cppr;
> + t_rm_h_eoi += xc->stat_rm_h_eoi;
> + t_rm_h_ipi += xc->stat_rm_h_ipi;
> + t_vm_h_xirr += xc->stat_vm_h_xirr;
> + t_vm_h_ipoll += xc->stat_vm_h_ipoll;
> + t_vm_h_cppr += xc->stat_vm_h_cppr;
> + t_vm_h_eoi += xc->stat_vm_h_eoi;
> + t_vm_h_ipi += xc->stat_vm_h_ipi;
> + }
> +
> + seq_printf(m, "Hcalls totals\n");
> + seq_printf(m, " H_XIRR R=%10lld V=%10lld\n", t_rm_h_xirr, t_vm_h_xirr);
> + seq_printf(m, " H_IPOLL R=%10lld V=%10lld\n", t_rm_h_ipoll, t_vm_h_ipoll);
> + seq_printf(m, " H_CPPR R=%10lld V=%10lld\n", t_rm_h_cppr, t_vm_h_cppr);
> + seq_printf(m, " H_EOI R=%10lld V=%10lld\n", t_rm_h_eoi, t_vm_h_eoi);
> + seq_printf(m, " H_IPI R=%10lld V=%10lld\n", t_rm_h_ipi, t_vm_h_ipi);
> +
> + return 0;
> +}
> +
> +static int xive_debug_open(struct inode *inode, struct file *file)
> +{
> + return single_open(file, xive_debug_show, inode->i_private);
> +}
> +
> +static const struct file_operations xive_debug_fops = {
> + .open = xive_debug_open,
> + .read = seq_read,
> + .llseek = seq_lseek,
> + .release = single_release,
> +};
> +
> +static void xive_debugfs_init(struct kvmppc_xive *xive)
> +{
> + char *name;
> +
> + name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive);
> + if (!name) {
> + pr_err("%s: no memory for name\n", __func__);
> + return;
> + }
> +
> + xive->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root,
> + xive, &xive_debug_fops);
> +
> + pr_debug("%s: created %s\n", __func__, name);
> + kfree(name);
> +}
> +
> +static void kvmppc_xive_init(struct kvm_device *dev)
> +{
> + struct kvmppc_xive *xive = (struct kvmppc_xive *)dev->private;
> +
> + /* Register some debug interfaces */
> + xive_debugfs_init(xive);
> +}
> +
> +struct kvm_device_ops kvm_xive_ops = {
> + .name = "kvm-xive",
> + .create = kvmppc_xive_create,
> + .init = kvmppc_xive_init,
> + .destroy = kvmppc_xive_free,
> + .set_attr = xive_set_attr,
> + .get_attr = xive_get_attr,
> + .has_attr = xive_has_attr,
> +};
> +
> +void kvmppc_xive_init_module(void)
> +{
> + __xive_vm_h_xirr = xive_vm_h_xirr;
> + __xive_vm_h_ipoll = xive_vm_h_ipoll;
> + __xive_vm_h_ipi = xive_vm_h_ipi;
> + __xive_vm_h_cppr = xive_vm_h_cppr;
> + __xive_vm_h_eoi = xive_vm_h_eoi;
> +}
> +
> +void kvmppc_xive_exit_module(void)
> +{
> + __xive_vm_h_xirr = NULL;
> + __xive_vm_h_ipoll = NULL;
> + __xive_vm_h_ipi = NULL;
> + __xive_vm_h_cppr = NULL;
> + __xive_vm_h_eoi = NULL;
> +}
> diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
> new file mode 100644
> index 0000000..2b7fdbd
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_xive.h
> @@ -0,0 +1,251 @@
> +/*
> + * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + */
> +
> +#ifndef _KVM_PPC_BOOK3S_XIVE_H
> +#define _KVM_PPC_BOOK3S_XIVE_H
> +
> +#include "book3s_xics.h"
> +
> +/* State for one guest irq source.
> + *
> + * For each guest source we allocate a HW interrupt in the XIVE
> + * which we use for all SW triggers. It will be unused for
> + * pass-through but it's easier to keep around as the same
> + * guest interrupt can alternatively be emulated or passed through
> + * if a physical device is hot unplugged and replaced with an
> + * emulated one.
> + *
> + * This state structure is very similar to the XICS one with
> + * additional XIVE specific tracking.
> + */
> +struct kvmppc_xive_irq_state {
> + bool valid; /* Interrupt entry is valid */
> +
> + u32 number; /* Guest IRQ number */
> + u32 ipi_number; /* XIVE IPI HW number */
> + struct xive_irq_data ipi_data; /* XIVE IPI associated data */
> + u32 pt_number; /* XIVE Pass-through number if any */
> + struct xive_irq_data *pt_data; /* XIVE Pass-through associated data */
> +
> + /* Targetting as set by guest */
> + u32 guest_server; /* Current guest selected target */
> + u8 guest_priority; /* Guest set priority */
> + u8 saved_priority; /* Saved priority when masking */
> +
> + /* Actual targetting */
> + u32 act_server; /* Actual server */
> + u8 act_priority; /* Actual priority */
> +
> + /* Various state bits */
> + bool in_eoi; /* Synchronize with H_EOI */
> + bool old_p; /* P bit state when masking */
> + bool old_q; /* Q bit state when masking */
> + bool lsi; /* level-sensitive interrupt */
> + bool asserted; /* Only for emulated LSI: current state */
> +
> + /* Saved for migration state */
> + bool in_queue;
> + bool saved_p;
> + bool saved_q;
> + u8 saved_scan_prio;
> +};
> +
> +/* Select the "right" interrupt (IPI vs. passthrough) */
> +static inline void kvmppc_xive_select_irq(struct kvmppc_xive_irq_state *state,
> + u32 *out_hw_irq,
> + struct xive_irq_data **out_xd)
> +{
> + if (state->pt_number) {
> + if (out_hw_irq)
> + *out_hw_irq = state->pt_number;
> + if (out_xd)
> + *out_xd = state->pt_data;
> + } else {
> + if (out_hw_irq)
> + *out_hw_irq = state->ipi_number;
> + if (out_xd)
> + *out_xd = &state->ipi_data;
> + }
> +}
> +
> +/* This corresponds to an "ICS" in XICS terminology, we use it
> + * as a means to break up source information into multiple structures
> + */
> +struct kvmppc_xive_src_block {
> + arch_spinlock_t lock;
> + u16 id;
> + struct kvmppc_xive_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
> +};
> +
> +
> +struct kvmppc_xive {
> + struct kvm *kvm;
> + struct kvm_device *dev;
> + struct dentry *dentry;
> +
> + /* VP block associated with the VM */
> + u32 vp_base;
> +
> + /* Blocks of sources */
> + struct kvmppc_xive_src_block *src_blocks[KVMPPC_XICS_MAX_ICS_ID + 1];
> + u32 max_sbid;
> +
> + /*
> + * For state save, we lazily scan the queues on the first interrupt
> + * being migrated. We don't have a clean way to reset those flags
> + * so we keep track of the number of valid sources and how many of
> + * them were migrated so we can reset when all of them have been
> + * processed.
> + */
> + u32 src_count;
> + u32 saved_src_count;
> +
> + /*
> + * Some irqs are delayed on restore until the source is created,
> + * keep track here of how many of them
> + */
> + u32 delayed_irqs;
> +
> + /* Which queues (priorities) are in use by the guest */
> + u8 qmap;
> +
> + /* Queue orders */
> + u32 q_order;
> + u32 q_alloc_order;
> +
> +};
> +
> +#define KVMPPC_XIVE_Q_COUNT 8
> +
> +struct kvmppc_xive_vcpu {
> + struct kvmppc_xive *xive;
> + struct kvm_vcpu *vcpu;
> + bool valid;
> +
> + /* Server number. This is the HW CPU ID from a guest perspective */
> + u32 server_num;
> +
> + /* HW VP corresponding to this VCPU. This is the base of the VP
> + * block plus the server number
> + */
> + u32 vp_id;
> + u32 vp_chip_id;
> + u32 vp_cam;
> +
> + /* IPI used for sending ... IPIs */
> + u32 vp_ipi;
> + struct xive_irq_data vp_ipi_data;
> +
> + /* Local emulation state */
> + uint8_t cppr; /* guest CPPR */
> + uint8_t hw_cppr;/* Hardware CPPR */
> + uint8_t mfrr;
> + uint8_t pending;
> +
> + /* Each VP has 8 queues though we only provision some */
> + struct xive_q queues[KVMPPC_XIVE_Q_COUNT];
> + u32 esc_virq[KVMPPC_XIVE_Q_COUNT];
> + char *esc_virq_names[KVMPPC_XIVE_Q_COUNT];
> +
> + /* Stash a delayed irq on restore from migration (see set_icp) */
> + u32 delayed_irq;
> +
> + /* Stats */
> + u64 stat_rm_h_xirr;
> + u64 stat_rm_h_ipoll;
> + u64 stat_rm_h_cppr;
> + u64 stat_rm_h_eoi;
> + u64 stat_rm_h_ipi;
> + u64 stat_vm_h_xirr;
> + u64 stat_vm_h_ipoll;
> + u64 stat_vm_h_cppr;
> + u64 stat_vm_h_eoi;
> + u64 stat_vm_h_ipi;
> +};
> +
> +static inline struct kvm_vcpu *kvmppc_xive_find_server(struct kvm *kvm, u32 nr)
> +{
> + struct kvm_vcpu *vcpu = NULL;
> + int i;
> +
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + if (vcpu->arch.xive_vcpu && nr == vcpu->arch.xive_vcpu->server_num)
> + return vcpu;
> + }
> + return NULL;
> +}
> +
> +static inline struct kvmppc_xive_src_block *kvmppc_xive_find_source(struct kvmppc_xive *xive,
> + u32 irq, u16 *source)
> +{
> + u32 bid = irq >> KVMPPC_XICS_ICS_SHIFT;
> + u16 src = irq & KVMPPC_XICS_SRC_MASK;
> +
> + if (source)
> + *source = src;
> + if (bid > KVMPPC_XICS_MAX_ICS_ID)
> + return NULL;
> + return xive->src_blocks[bid];
> +}
> +
> +/*
> + * Mapping between guest priorities and host priorities
> + * is as follows.
> + *
> + * Guest requests for priorities 0...6 are honored. A request for
> + * anything higher results in a priority of 7 being applied.
> + *
> + * However, when XIRR is returned via H_XIRR, 7 is translated to 0xb
> + * in order to match AIX expectations
> + *
> + * Similar mapping is done for CPPR values
> + */
> +static inline u8 xive_prio_from_guest(u8 prio)
> +{
> + if (prio == 0xff || prio < 8)
> + return prio;
> + return 7;
> +}
> +
> +static inline u8 xive_prio_to_guest(u8 prio)
> +{
> + if (prio == 0xff || prio < 7)
> + return prio;
> + return 0xb;
> +}
> +
> +static inline u32 __xive_read_eq(__be32 *qpage, u32 msk, u32 *idx, u32 *toggle)
> +{
> + u32 cur;
> +
> + if (!qpage)
> + return 0;
> + cur = be32_to_cpup(qpage + *idx);
> + if ((cur >> 31) == *toggle)
> + return 0;
> + *idx = (*idx + 1) & msk;
> + if (*idx == 0)
> + (*toggle) ^= 1;
> + return cur & 0x7fffffff;
> +}
> +
> +extern unsigned long xive_rm_h_xirr(struct kvm_vcpu *vcpu);
> +extern unsigned long xive_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server);
> +extern int xive_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
> + unsigned long mfrr);
> +extern int xive_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
> +extern int xive_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
> +
> +extern unsigned long (*__xive_vm_h_xirr)(struct kvm_vcpu *vcpu);
> +extern unsigned long (*__xive_vm_h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server);
> +extern int (*__xive_vm_h_ipi)(struct kvm_vcpu *vcpu, unsigned long server,
> + unsigned long mfrr);
> +extern int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr);
> +extern int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr);
> +
> +#endif /* _KVM_PPC_BOOK3S_XIVE_H */
> diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c
> new file mode 100644
> index 0000000..b28c264
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_xive_template.c
> @@ -0,0 +1,490 @@
> +/*
> + * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + */
> +
> +/* File to be included by other .c files */
> +
> +#define XGLUE(a,b) a##b
> +#define GLUE(a,b) XGLUE(a,b)
> +
> +static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc)
> +{
> + u8 cppr;
> + u16 ack;
> +
> + /* XXX DD1 bug workaround: Check PIPR vs. CPPR first ! */
> +
> + /* Perform the acknowledge OS to register cycle. */
> + ack = be16_to_cpu(__x_readw(__x_tm_area + TM_SPC_ACK_OS_REG));
> +
> + /* Synchronize subsequent queue accesses */
> + mb();
> +
> + /* XXX Check grouping level */
> +
> + /* Anything ? */
> + if (!((ack >> 8) & TM_QW1_NSR_EO))
> + return;
> +
> + /* Grab CPPR of the most favored pending interrupt */
> + cppr = ack & 0xff;
> + if (cppr < 8)
> + xc->pending |= 1 << cppr;
> +
> +#ifdef XIVE_RUNTIME_CHECKS
> + /* Check consistency */
> + if (cppr >= xc->hw_cppr)
> + pr_warn("KVM-XIVE: CPU %d odd ack CPPR, got %d at %d\n",
> + smp_processor_id(), cppr, xc->hw_cppr);
> +#endif
> +
> + /* Update our image of the HW CPPR. We don't yet modify
> + * xc->cppr, this will be done as we scan for interrupts
> + * in the queues.
> + */
> + xc->hw_cppr = cppr;
> +}
> +
> +static u8 GLUE(X_PFX,esb_load)(struct xive_irq_data *xd, u32 offset)
> +{
> + u64 val;
> +
> + if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
> + offset |= offset << 4;
> +
> + val = __x_readq(__x_eoi_page(xd) + offset);
> +#ifdef __LITTLE_ENDIAN__
> + val >>= 64-8;
> +#endif
> + return (u8)val;
> +}
> +
> +
> +static void GLUE(X_PFX,source_eoi)(u32 hw_irq, struct xive_irq_data *xd)
> +{
> + /* If the XIVE supports the new "store EOI facility, use it */
> + if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
> + __x_writeq(0, __x_eoi_page(xd));
> + else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) {
> + opal_int_eoi(hw_irq);
> + } else {
> + uint64_t eoi_val;
> +
> + /* Otherwise for EOI, we use the special MMIO that does
> + * a clear of both P and Q and returns the old Q.
> + *
> + * This allows us to then do a re-trigger if Q was set
> + * rather than synthesizing an interrupt in software
> + */
> + eoi_val = GLUE(X_PFX,esb_load)(xd, XIVE_ESB_SET_PQ_00);
> + if ((xd->flags & XIVE_IRQ_FLAG_LSI) || !(eoi_val & 1))
> + return;
> +
> + /* Re-trigger */
> + if (__x_trig_page(xd))
> + __x_writeq(0, __x_trig_page(xd));
> + }
> +
> +}
> +
> +enum {
> + scan_fetch,
> + scan_poll,
> + scan_eoi,
> +};
> +
> +static u32 GLUE(X_PFX,scan_interrupts)(struct kvmppc_xive_vcpu *xc,
> + u8 pending, int scan_type)
> +{
> + u32 hirq = 0;
> + u8 prio = 0xff;
> +
> + /* Find highest pending priority */
> + while ((xc->mfrr != 0xff || pending != 0) && hirq == 0) {
> + struct xive_q *q;
> + u32 idx, toggle;
> + __be32 *qpage;
> +
> + /*
> + * If pending is 0 this will return 0xff which is what
> + * we want
> + */
> + prio = ffs(pending) - 1;
> +
> + /*
> + * If the most favored prio we found pending is less
> + * favored than (or equal to) a pending IPI, we return
> + * the IPI instead.
> + *
> + * Note: If pending was 0 and mfrr is 0xff, we will
> + * not spuriously take an IPI because mfrr cannot
> + * then be smaller than cppr.
> + */
> + if (prio >= xc->mfrr && xc->mfrr < xc->cppr) {
> + prio = xc->mfrr;
> + hirq = XICS_IPI;
> + break;
> + }
> +
> + /* Don't scan past the guest cppr */
> + if (prio >= xc->cppr || prio > 7)
> + break;
> +
> + /* Grab queue and pointers */
> + q = &xc->queues[prio];
> + idx = q->idx;
> + toggle = q->toggle;
> +
> + /*
> + * Snapshot the queue page. The test further down for EOI
> + * must use the same "copy" that was used by __xive_read_eq
> + * since qpage can be set concurrently and we don't want
> + * to miss an EOI.
> + */
> + qpage = READ_ONCE(q->qpage);
> +
> + skip_ipi:
> + /* Try to fetch from the queue. Will return 0 for a
> + * non-queueing priority (ie, qpage = 0)
> + */
> + hirq = __xive_read_eq(qpage, q->msk, &idx, &toggle);
> +
> + /*
> + * If this was a signal for an MFRR change done by
> + * H_IPI we skip it. Additionally, if we were fetching
> + * we EOI it now, thus re-enabling reception of a new
> + * such signal.
> + *
> + * We also need to do that if prio is 0 and we had no
> + * page for the queue. In this case, we have non-queued
> + * IPI that needs to be EOId.
> + *
> + * This is safe because if we have another pending MFRR
> + * change that wasn't observed above, the Q bit will have
> + * been set and another occurrence of the IPI will trigger.
> + */
> + if (hirq == XICS_IPI || (prio == 0 && !qpage)) {
> + if (scan_type == scan_fetch)
> + GLUE(X_PFX,source_eoi)(xc->vp_ipi,
> + &xc->vp_ipi_data);
> + /* Loop back on same queue with updated idx/toggle */
> +#ifdef XIVE_RUNTIME_CHECKS
> + WARN_ON(hirq && hirq != XICS_IPI);
> +#endif
> + if (hirq)
> + goto skip_ipi;
> + }
> +
> + /* If fetching, update queue pointers */
> + if (scan_type == scan_fetch) {
> + q->idx = idx;
> + q->toggle = toggle;
> + }
> +
> + /* Something found, stop searching */
> + if (hirq)
> + break;
> +
> + /* Clear the pending bit on the now empty queue */
> + pending &= ~(1 << prio);
> +
> + /*
> + * Check if the queue count needs adjusting due to
> + * interrupts being moved away.
> + */
> + if (atomic_read(&q->pending_count)) {
> + int p = atomic_xchg(&q->pending_count, 0);
> + if (p) {
> +#ifdef XIVE_RUNTIME_CHECKS
> + WARN_ON(p > atomic_read(&q->count));
> +#endif
> + atomic_sub(p, &q->count);
> + }
> + }
> + }
> +
> + /* If we are just taking a "peek", do nothing else */
> + if (scan_type == scan_poll)
> + return hirq;
> +
> + /* Update the pending bits */
> + xc->pending = pending;
> +
> + /* If this is an EOI that's it, no CPPR adjustment done here,
> + * all we needed was to clean up the stale pending bits and check
> + * if there's anything left.
> + */
> + if (scan_type == scan_eoi)
> + return hirq;
> +
> + /* If we found an interrupt, adjust what the guest CPPR should
> + * be as if we had just fetched that interrupt from HW
> + */
> + if (hirq)
> + xc->cppr = prio;
> + /*
> + * If it was an IPI the HW CPPR might have been lowered too much
> + * as the HW interrupt we use for IPIs is routed to priority 0.
> + *
> + * We re-sync it here.
> + */
> + if (xc->cppr != xc->hw_cppr) {
> + xc->hw_cppr = xc->cppr;
> + __x_writeb(xc->cppr, __x_tm_area + TM_QW1_OS + TM_CPPR);
> + }
> +
> + return hirq;
> +}
> +
> +X_STATIC unsigned long GLUE(X_PFX,h_xirr)(struct kvm_vcpu *vcpu)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + u8 old_cppr;
> + u32 hirq;
> +
> + DBG("H_XIRR\n");
> +
> + xc->GLUE(X_STAT_PFX,h_xirr)++;
> +
> + /* First collect pending bits from HW */
> + GLUE(X_PFX,ack_pending)(xc);
> +
> + /* Cleanup the old-style bits if needed (they may have been
> + * set by a pull or an escalation interrupt)
> + */
> + if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions))
> + clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
> + &vcpu->arch.pending_exceptions);
> +
> + DBG(" new pending=0x%02x hw_cppr=%d cppr=%d\n",
> + xc->pending, xc->hw_cppr, xc->cppr);
> +
> + /* Grab previous CPPR and reverse map it */
> + old_cppr = xive_prio_to_guest(xc->cppr);
> +
> + /* Scan for actual interrupts */
> + hirq = GLUE(X_PFX,scan_interrupts)(xc, xc->pending, scan_fetch);
> +
> + DBG(" got hirq=0x%x hw_cppr=%d cppr=%d\n",
> + hirq, xc->hw_cppr, xc->cppr);
> +
> +#ifdef XIVE_RUNTIME_CHECKS
> + /* That should never hit */
> + if (hirq & 0xff000000)
> + pr_warn("XIVE: Weird guest interrupt number 0x%08x\n", hirq);
> +#endif
> +
> + /*
> + * XXX We could check if the interrupt is masked here and
> + * filter it. If we chose to do so, we would need to do:
> + *
> + * if (masked) {
> + * lock();
> + * if (masked) {
> + * old_Q = true;
> + * hirq = 0;
> + * }
> + * unlock();
> + * }
> + */
> +
> + /* Return interrupt and old CPPR in GPR4 */
> + vcpu->arch.gpr[4] = hirq | (old_cppr << 24);
> +
> + return H_SUCCESS;
> +}
> +
> +X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + u8 pending = xc->pending;
> + u32 hirq;
> + u8 pipr;
> +
> + DBG("H_IPOLL(server=%ld)\n", server);
> +
> + xc->GLUE(X_STAT_PFX,h_ipoll)++;
> +
> + /* Grab the target VCPU if not the current one */
> + if (xc->server_num != server) {
> + vcpu = kvmppc_xive_find_server(vcpu->kvm, server);
> + if (!vcpu)
> + return H_PARAMETER;
> + xc = vcpu->arch.xive_vcpu;
> +
> + /* Scan all priorities */
> + pending = 0xff;
> + } else {
> + /* Grab pending interrupt if any */
> + pipr = __x_readb(__x_tm_area + TM_QW1_OS + TM_PIPR);
> + if (pipr < 8)
> + pending |= 1 << pipr;
> + }
> +
> + hirq = GLUE(X_PFX,scan_interrupts)(xc, pending, scan_poll);
> +
> + /* Return interrupt and old CPPR in GPR4 */
> + vcpu->arch.gpr[4] = hirq | (xc->cppr << 24);
> +
> + return H_SUCCESS;
> +}
> +
> +static void GLUE(X_PFX,push_pending_to_hw)(struct kvmppc_xive_vcpu *xc)
> +{
> + u8 pending, prio;
> +
> + pending = xc->pending;
> + if (xc->mfrr != 0xff) {
> + if (xc->mfrr < 8)
> + pending |= 1 << xc->mfrr;
> + else
> + pending |= 0x80;
> + }
> + if (!pending)
> + return;
> + prio = ffs(pending) - 1;
> +
> + __x_writeb(prio, __x_tm_area + TM_SPC_SET_OS_PENDING);
> +}
> +
> +X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + u8 old_cppr;
> +
> + DBG("H_CPPR(cppr=%ld)\n", cppr);
> +
> + xc->GLUE(X_STAT_PFX,h_cppr)++;
> +
> + /* Map CPPR */
> + cppr = xive_prio_from_guest(cppr);
> +
> + /* Remember old and update SW state */
> + old_cppr = xc->cppr;
> + xc->cppr = cppr;
> +
> + /*
> + * We are masking less, we need to look for pending things
> + * to deliver and set VP pending bits accordingly to trigger
> + * a new interrupt; otherwise we might miss MFRR changes for
> + * which we have optimized out sending an IPI signal.
> + */
> + if (cppr > old_cppr)
> + GLUE(X_PFX,push_pending_to_hw)(xc);
> +
> + /* Apply new CPPR */
> + xc->hw_cppr = cppr;
> + __x_writeb(cppr, __x_tm_area + TM_QW1_OS + TM_CPPR);
> +
> + return H_SUCCESS;
> +}
> +
> +X_STATIC int GLUE(X_PFX,h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr)
> +{
> + struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + struct xive_irq_data *xd;
> + u8 new_cppr = xirr >> 24;
> + u32 irq = xirr & 0x00ffffff, hw_num;
> + u16 src;
> + int rc = 0;
> +
> + DBG("H_EOI(xirr=%08lx)\n", xirr);
> +
> + xc->GLUE(X_STAT_PFX,h_eoi)++;
> +
> + xc->cppr = xive_prio_from_guest(new_cppr);
> +
> + /*
> + * IPIs are synthesized from MFRR and thus don't need
> + * any special EOI handling. The underlying interrupt
> + * used to signal MFRR changes is EOId when fetched from
> + * the queue.
> + */
> + if (irq == XICS_IPI || irq == 0)
> + goto bail;
> +
> + /* Find interrupt source */
> + sb = kvmppc_xive_find_source(xive, irq, &src);
> + if (!sb) {
> + DBG(" source not found !\n");
> + rc = H_PARAMETER;
> + goto bail;
> + }
> + state = &sb->irq_state[src];
> + kvmppc_xive_select_irq(state, &hw_num, &xd);
> +
> + state->in_eoi = true;
> + mb();
> +
> + again:
> + if (state->guest_priority == MASKED) {
> + arch_spin_lock(&sb->lock);
> + if (state->guest_priority != MASKED) {
> + arch_spin_unlock(&sb->lock);
> + goto again;
> + }
> + DBG(" EOI on saved P...\n");
> +
> + /* Clear old_p, that will cause unmask to perform an EOI */
> + state->old_p = false;
> +
> + arch_spin_unlock(&sb->lock);
> + } else {
> + DBG(" EOI on source...\n");
> +
> + /* Perform EOI on the source */
> + GLUE(X_PFX,source_eoi)(hw_num, xd);
> +
> + /* If it's an emulated LSI, check level and resend */
> + if (state->lsi && state->asserted)
> + __x_writeq(0, __x_trig_page(xd));
> +
> + }
> +
> + mb();
> + state->in_eoi = false;
> + bail:
> +
> + /* Re-evaluate pending IRQs and update HW */
> + GLUE(X_PFX,scan_interrupts)(xc, xc->pending, scan_eoi);
> + GLUE(X_PFX,push_pending_to_hw)(xc);
> + DBG(" after scan pending=%02x\n", xc->pending);
> +
> + /* Apply new CPPR */
> + xc->hw_cppr = xc->cppr;
> + __x_writeb(xc->cppr, __x_tm_area + TM_QW1_OS + TM_CPPR);
> +
> + return rc;
> +}
> +
> +X_STATIC int GLUE(X_PFX,h_ipi)(struct kvm_vcpu *vcpu, unsigned long server,
> + unsigned long mfrr)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> +
> + DBG("H_IPI(server=%08lx,mfrr=%ld)\n", server, mfrr);
> +
> + xc->GLUE(X_STAT_PFX,h_ipi)++;
> +
> + /* Find target */
> + vcpu = kvmppc_xive_find_server(vcpu->kvm, server);
> + if (!vcpu)
> + return H_PARAMETER;
> + xc = vcpu->arch.xive_vcpu;
> +
> + /* Locklessly write over MFRR */
> + xc->mfrr = mfrr;
> +
> + /* Shoot the IPI if more favored than the target CPPR */
> + if (mfrr < xc->cppr)
> + __x_writeq(0, __x_trig_page(&xc->vp_ipi_data));
> +
> + return H_SUCCESS;
> +}
> diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h
> index 5a9a10b..3f1be85 100644
> --- a/arch/powerpc/kvm/irq.h
> +++ b/arch/powerpc/kvm/irq.h
> @@ -12,6 +12,7 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
> #endif
> #ifdef CONFIG_KVM_XICS
> ret = ret || (kvm->arch.xics != NULL);
> + ret = ret || (kvm->arch.xive != NULL);
> #endif
> smp_rmb();
> return ret;
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index 95c91a9..de79bd72 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -37,6 +37,8 @@
> #include <asm/cputhreads.h>
> #include <asm/irqflags.h>
> #include <asm/iommu.h>
> +#include <asm/xive.h>
> +
> #include "timing.h"
> #include "irq.h"
> #include "../mm/mmu_decl.h"
> @@ -699,7 +701,10 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
> kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu);
> break;
> case KVMPPC_IRQ_XICS:
> - kvmppc_xics_free_icp(vcpu);
> + if (xive_enabled())
> + kvmppc_xive_cleanup_vcpu(vcpu);
> + else
> + kvmppc_xics_free_icp(vcpu);
> break;
> }
>
> @@ -1219,8 +1224,12 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
>
> r = -EPERM;
> dev = kvm_device_from_filp(f.file);
> - if (dev)
> - r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]);
> + if (dev) {
> + if (xive_enabled())
> + r = kvmppc_xive_connect_vcpu(dev, vcpu, cap->args[1]);
> + else
> + r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]);
> + }
>
> fdput(f);
> break;
> @@ -1244,7 +1253,7 @@ bool kvm_arch_intc_initialized(struct kvm *kvm)
> return true;
> #endif
> #ifdef CONFIG_KVM_XICS
> - if (kvm->arch.xics)
> + if (kvm->arch.xics || kvm->arch.xive)
> return true;
> #endif
> return false;
> diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
> index e0f856b..d71cd77 100644
> --- a/arch/powerpc/platforms/powernv/opal.c
> +++ b/arch/powerpc/platforms/powernv/opal.c
> @@ -890,3 +890,4 @@ EXPORT_SYMBOL_GPL(opal_leds_set_ind);
> EXPORT_SYMBOL_GPL(opal_write_oppanel_async);
> /* Export this for KVM */
> EXPORT_SYMBOL_GPL(opal_int_set_mfrr);
> +EXPORT_SYMBOL_GPL(opal_int_eoi);
> diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
> index 96037e0..6429cd3 100644
> --- a/arch/powerpc/sysdev/xive/common.c
> +++ b/arch/powerpc/sysdev/xive/common.c
> @@ -45,12 +45,14 @@
> #endif
>
> bool __xive_enabled;
> +EXPORT_SYMBOL_GPL(__xive_enabled);
> bool xive_cmdline_disabled;
>
> /* We use only one priority for now */
> static u8 xive_irq_priority;
>
> void __iomem *xive_tm_area;
> +EXPORT_SYMBOL_GPL(xive_tm_area);
> u32 xive_tm_offset;
> static const struct xive_ops *xive_ops;
> static struct irq_domain *xive_irq_domain;
> @@ -304,7 +306,7 @@ static void xive_irq_eoi(struct irq_data *d)
> DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n",
> d->irq, irqd_to_hwirq(d), xc->pending_prio);
>
> - if (!irqd_irq_disabled(d))
> + if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d))
> xive_do_source_eoi(irqd_to_hwirq(d), xd);
>
> /*
> @@ -579,9 +581,10 @@ static int xive_irq_set_affinity(struct irq_data *d,
> * Only configure the irq if it's not currently passed-through to
> * a KVM guest
> */
> - rc = xive_ops->configure_irq(hw_irq,
> - get_hard_smp_processor_id(target),
> - xive_irq_priority, d->irq);
> + if (!irqd_is_forwarded_to_vcpu(d))
> + rc = xive_ops->configure_irq(hw_irq,
> + get_hard_smp_processor_id(target),
> + xive_irq_priority, d->irq);
> if (rc < 0) {
> pr_err("XIVE: Error %d reconfiguring irq %d\n", rc, d->irq);
> return rc;
> @@ -661,6 +664,123 @@ static int xive_irq_retrigger(struct irq_data *d)
> return 1;
> }
>
> +static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
> +{
> + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
> + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
> + int rc;
> + u8 pq;
> +
> + /*
> + * We only support this on interrupts that do not require
> + * firmware calls for masking and unmasking
> + */
> + if (xd->flags & XIVE_IRQ_FLAG_MASK_FW)
> + return -EIO;
> +
> + /*
> + * This is called by KVM with state non-NULL for enabling
> + * pass-through or NULL for disabling it
> + */
> + if (state) {
> + irqd_set_forwarded_to_vcpu(d);
> +
> + /* Set it to PQ=10 state to prevent further sends */
> + pq = xive_poke_esb(xd, 0xe00);
Use XIVE_ESB_SET_PQ_xx constants in these xive_poke_esb() calls (as
you have done elsewhere).
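Something like this (assuming XIVE_ESB_SET_PQ_10/11 are the names used
for the 0xe00/0xf00 offsets, adjust if they are called differently):

	/* Set it to PQ=10 state to prevent further sends */
	pq = xive_poke_esb(xd, XIVE_ESB_SET_PQ_10);
	...
	/* If P was set, adjust state to PQ=11 */
	pq = xive_poke_esb(xd, XIVE_ESB_SET_PQ_11);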
> +
> + /* No target ? nothing to do */
> + if (xd->target == XIVE_INVALID_TARGET) {
> + /*
> + * An untargeted interrupt should also have been
> + * masked at the source
> + */
> + WARN_ON(pq & 2);
> +
> + return 0;
> + }
> +
> + /*
> + * If P was set, adjust state to PQ=11 to indicate
> + * that a resend is needed for the interrupt to reach
> + * the guest. Also remember the value of P.
> + *
> + * This also tells us that it's in flight to a host queue
> + * or has already been fetched but hasn't been EOIed yet
> + * by the host. Thus it's potentially using up a host
> + * queue slot. This is important to know because as long
> + * as this is the case, we must not hard-unmask it when
> + * "returning" that interrupt to the host.
> + *
> + * This saved_p is cleared by the host EOI, when we know
> + * for sure the queue slot is no longer in use.
> + */
> + if (pq & 2) {
> + pq = xive_poke_esb(xd, 0xf00);
> + xd->saved_p = true;
> +
> + /*
> + * Sync the XIVE source HW to ensure the interrupt
> + * has gone through the EAS before we change its
> + * target to the guest. That should guarantee us
> + * that we *will* eventually get an EOI for it on
> + * the host. Otherwise there would be a small window
> + * where P would be seen here while the interrupt
> + * actually goes to the guest queue.
> + */
> + if (xive_ops->sync_source)
> + xive_ops->sync_source(hw_irq);
> + } else
> + xd->saved_p = false;
> + } else {
> + irqd_clr_forwarded_to_vcpu(d);
> +
> + /* No host target ? hard mask and return */
> + if (xd->target == XIVE_INVALID_TARGET) {
> + xive_do_source_set_mask(xd, true);
> + return 0;
> + }
> +
> + /*
> + * Sync the XIVE source HW to ensure the interrupt
> + * has gone through the EAS before we change its
> + * target to the host.
> + */
> + if (xive_ops->sync_source)
> + xive_ops->sync_source(hw_irq);
> +
> + /*
> + * By convention we are called with the interrupt in
> + * a PQ=10 or PQ=11 state, ie, it won't fire and will
> + * have latched in Q whether there's a pending HW
> + * interrupt or not.
> + *
> + * First reconfigure the target.
> + */
> + rc = xive_ops->configure_irq(hw_irq,
> + get_hard_smp_processor_id(xd->target),
> + xive_irq_priority, d->irq);
> + if (rc)
> + return rc;
> +
> + /*
> + * Then if saved_p is not set, effectively re-enable the
> + * interrupt with an EOI. If it is set, we know there is
> + * still a message in a host queue somewhere that will be
> + * EOId eventually.
> + *
> + * Note: We don't check irqd_irq_disabled(). Effectively,
> + * we *will* let the irq get through even if masked if the
> + * HW is still firing it in order to deal with the whole
> + * saved_p business properly. If the interrupt triggers
> + * while masked, the generic code will re-mask it anyway.
> + */
> + if (!xd->saved_p)
> + xive_do_source_eoi(hw_irq, xd);
> +
> + }
> + return 0;
> +}
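Just to check my reading of the call flow here: KVM ends up driving
this through the generic irq_set_vcpu_affinity() API when moving an
interrupt in and out of pass-through, i.e. roughly (a sketch only;
host_irq and state are whatever the KVM side passes in):

	/* hand the source over to the guest */
	rc = irq_set_vcpu_affinity(host_irq, state);
	...
	/* and later give it back to the host */
	rc = irq_set_vcpu_affinity(host_irq, NULL);

If so, fine.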
> +
> static struct irq_chip xive_irq_chip = {
> .name = "XIVE-IRQ",
> .irq_startup = xive_irq_startup,
> @@ -671,12 +791,14 @@ static struct irq_chip xive_irq_chip = {
> .irq_set_affinity = xive_irq_set_affinity,
> .irq_set_type = xive_irq_set_type,
> .irq_retrigger = xive_irq_retrigger,
> + .irq_set_vcpu_affinity = xive_irq_set_vcpu_affinity,
> };
>
> bool is_xive_irq(struct irq_chip *chip)
> {
> return chip == &xive_irq_chip;
> }
> +EXPORT_SYMBOL_GPL(is_xive_irq);
>
> void xive_cleanup_irq_data(struct xive_irq_data *xd)
> {
> @@ -691,6 +813,7 @@ void xive_cleanup_irq_data(struct xive_irq_data *xd)
> xd->trig_mmio = NULL;
> }
> }
> +EXPORT_SYMBOL_GPL(xive_cleanup_irq_data);
>
> static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw)
> {
> diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
> index 26cc6bf..0130af8 100644
> --- a/arch/powerpc/sysdev/xive/native.c
> +++ b/arch/powerpc/sysdev/xive/native.c
> @@ -27,6 +27,7 @@
> #include <asm/errno.h>
> #include <asm/xive.h>
> #include <asm/opal.h>
> +#include <asm/kvm_ppc.h>
>
> #include "xive-regs.h"
> #include "xive-internal.h"
> @@ -98,6 +99,7 @@ int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data)
> }
> return 0;
> }
> +EXPORT_SYMBOL_GPL(xive_native_populate_irq_data);
>
> int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq)
> {
> @@ -111,6 +113,8 @@ int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq)
> }
> return rc == 0 ? 0 : -ENXIO;
> }
> +EXPORT_SYMBOL_GPL(xive_native_configure_irq);
> +
>
> /* This can be called multiple time to change a queue configuration */
> int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
> @@ -187,6 +191,7 @@ int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
> fail:
> return rc;
> }
> +EXPORT_SYMBOL_GPL(xive_native_configure_queue);
>
> static void __xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio)
> {
> @@ -211,6 +216,7 @@ void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio)
> iounmap(q->eoi_mmio);
> q->eoi_mmio = NULL;
> }
> +EXPORT_SYMBOL_GPL(xive_native_disable_queue);
>
> static int xive_native_setup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio)
> {
> @@ -297,6 +303,7 @@ u32 xive_native_alloc_irq(void)
> return 0;
> return rc;
> }
> +EXPORT_SYMBOL_GPL(xive_native_alloc_irq);
>
> void xive_native_free_irq(u32 irq)
> {
> @@ -307,6 +314,7 @@ void xive_native_free_irq(u32 irq)
> msleep(1);
> }
> }
> +EXPORT_SYMBOL_GPL(xive_native_free_irq);
>
> static void xive_native_put_ipi(unsigned int cpu, struct xive_cpu *xc)
> {
> @@ -406,10 +414,11 @@ static void xive_native_teardown_cpu(unsigned int cpu, struct xive_cpu *xc)
> }
> }
>
> -static void xive_native_sync_source(u32 hw_irq)
> +void xive_native_sync_source(u32 hw_irq)
> {
> opal_xive_sync(XIVE_SYNC_EAS, hw_irq);
> }
> +EXPORT_SYMBOL_GPL(xive_native_sync_source);
>
> static const struct xive_ops xive_native_ops = {
> .populate_irq_data = xive_native_populate_irq_data,
> @@ -468,10 +477,38 @@ static bool xive_parse_provisioning(struct device_node *np)
> return true;
> }
>
> +static void xive_native_setup_pools(void)
> +{
> + u32 max_pir = 0;
> + unsigned int cpu;
> +
> + /*
> + * The HW won't let us enable OS VPs for KVM if we haven't
> + * enabled pool VPs yet, so let's do that. First we find
> + * out our highest HW CPU ID
> + */
> + for_each_possible_cpu(cpu) {
> + u32 hw_id = get_hard_smp_processor_id(cpu);
> + if (hw_id > max_pir)
> + max_pir = hw_id;
> + }
> +
> + /* Allocate a pool big enough */
> + pr_debug("XIVE: Allocating VP block for pool size %d\n",
> + max_pir + 1);
> + xive_pool_vps = xive_native_alloc_vp_block(max_pir + 1);
> + if (WARN_ON(xive_pool_vps == XIVE_INVALID_VP))
> + pr_err("XIVE: No pool VPsvp KVM might not function\n");
> +
> + pr_debug("XIVE: Pool VPs allocated at 0x%x for max_pir 0x%x\n",
> + xive_pool_vps, max_pir);
> +}
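Minor nit: when the allocation fails you WARN and pr_err but then fall
through to the pr_debug with xive_pool_vps still XIVE_INVALID_VP.  An
early return might read a bit better, e.g.:

	if (WARN_ON(xive_pool_vps == XIVE_INVALID_VP)) {
		pr_err("XIVE: No pool VPs, KVM might not function\n");
		return;
	}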
> +
> u32 xive_native_default_eq_shift(void)
> {
> return xive_queue_shift;
> }
> +EXPORT_SYMBOL_GPL(xive_native_default_eq_shift);
>
> bool xive_native_init(void)
> {
> @@ -481,7 +518,7 @@ bool xive_native_init(void)
> struct property *prop;
> u8 max_prio = 7;
> const __be32 *p;
> - u32 val;
> + u32 val, cpu;
> s64 rc;
>
> if (xive_cmdline_disabled)
> @@ -517,6 +554,10 @@ bool xive_native_init(void)
> break;
> }
>
> + /* Configure TM areas for KVM */
> + for_each_possible_cpu(cpu)
> + kvmppc_set_xive_tm_area(cpu, r.start, tm_area);
> +
> /* Grab size of provisionning pages */
> xive_parse_provisioning(np);
>
> @@ -528,6 +569,9 @@ bool xive_native_init(void)
> return false;
> }
>
> + /* Setup some dummy HV pool VPs */
> + xive_native_setup_pools();
> +
> /* Initialize XIVE core with our backend */
> if (!xive_core_init(&xive_native_ops, tm_area, TM_QW3_HV_PHYS,
> max_prio)) {
> @@ -602,3 +646,47 @@ void xive_native_free_vp_block(u32 vp_base)
> pr_warn("XIVE: OPAL error %lld freeing VP block\n", rc);
> }
> EXPORT_SYMBOL_GPL(xive_native_free_vp_block);
> +
> +int xive_native_enable_vp(u32 vp_id)
> +{
> + s64 rc;
> +
> + for (;;) {
> + rc = opal_xive_set_vp_info(vp_id, OPAL_XIVE_VP_ENABLED, 0);
> + if (rc != OPAL_BUSY)
> + break;
> + msleep(1);
> + }
> + return rc ? -EIO : 0;
> +}
> +EXPORT_SYMBOL_GPL(xive_native_enable_vp);
> +
> +int xive_native_disable_vp(u32 vp_id)
> +{
> + s64 rc;
> +
> + for (;;) {
> + rc = opal_xive_set_vp_info(vp_id, 0, 0);
> + if (rc != OPAL_BUSY)
> + break;
> + msleep(1);
> + }
> + return rc ? -EIO : 0;
> +}
> +EXPORT_SYMBOL_GPL(xive_native_disable_vp);
> +
> +int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id)
> +{
> + __be64 vp_cam_be;
> + __be32 vp_chip_id_be;
> + s64 rc;
> +
> + rc = opal_xive_get_vp_info(vp_id, NULL, &vp_cam_be, NULL, &vp_chip_id_be);
> + if (rc)
> + return -EIO;
> + *out_cam_id = be64_to_cpu(vp_cam_be) & 0xffffffffu;
> + *out_chip_id = be32_to_cpu(vp_chip_id_be);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(xive_native_get_vp_info);
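These three look fine.  For my own understanding, I take it the KVM
side uses them roughly like this when wiring up a vcpu (just a sketch,
the variable names below are made up):

	u32 vp_id = vp_base + server;	/* vp_base from xive_native_alloc_vp_block() */
	u32 cam, chip;
	int rc;

	rc = xive_native_enable_vp(vp_id);
	if (!rc)
		rc = xive_native_get_vp_info(vp_id, &cam, &chip);
	...
	/* and on vcpu teardown */
	xive_native_disable_vp(vp_id);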
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 2c14ad9..d1a6e55 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1165,7 +1165,6 @@ int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type);
> void kvm_unregister_device_ops(u32 type);
>
> extern struct kvm_device_ops kvm_mpic_ops;
> -extern struct kvm_device_ops kvm_xics_ops;
> extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
> extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index a17d787..1b0da57 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2839,10 +2839,6 @@ static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
> [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
> [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
> #endif
> -
> -#ifdef CONFIG_KVM_XICS
> - [KVM_DEV_TYPE_XICS] = &kvm_xics_ops,
> -#endif
> };
>
> int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
> --
> 2.9.3