[PATCH 12/12] powerpc/kvm: Native usage of the XIVE interrupt controller
Paul Mackerras
paulus at ozlabs.org
Tue Mar 28 16:26:33 AEDT 2017
On Mon, Mar 20, 2017 at 05:49:14PM +1100, Benjamin Herrenschmidt wrote:
> This patch makes KVM capable of using the XIVE interrupt controller
> to provide the standard PAPR "XICS" style hypercalls. It is necessary
> for proper operations when the host uses XIVE natively.
>
> This has been lightly tested on an actual system, including PCI
> pass-through with a TG3 device.
>
> Signed-off-by: Benjamin Herrenschmidt <benh at kernel.crashing.org>
Looks good overall, some comments below...
> ---
> arch/powerpc/include/asm/kvm_book3s_asm.h | 2 +
> arch/powerpc/include/asm/kvm_host.h | 28 +-
> arch/powerpc/include/asm/kvm_ppc.h | 38 +
> arch/powerpc/include/asm/xive.h | 11 +-
> arch/powerpc/kernel/asm-offsets.c | 10 +
> arch/powerpc/kvm/Makefile | 4 +-
> arch/powerpc/kvm/book3s.c | 73 +-
> arch/powerpc/kvm/book3s_hv.c | 52 +-
> arch/powerpc/kvm/book3s_hv_builtin.c | 108 ++
> arch/powerpc/kvm/book3s_hv_rm_xics.c | 10 +-
> arch/powerpc/kvm/book3s_hv_rm_xive.c | 47 +
> arch/powerpc/kvm/book3s_hv_rmhandlers.S | 60 +-
> arch/powerpc/kvm/book3s_rtas.c | 21 +-
> arch/powerpc/kvm/book3s_xics.c | 35 +-
> arch/powerpc/kvm/book3s_xics.h | 5 +
> arch/powerpc/kvm/book3s_xive.c | 1898 +++++++++++++++++++++++++++++
> arch/powerpc/kvm/book3s_xive.h | 251 ++++
> arch/powerpc/kvm/book3s_xive_template.c | 490 ++++++++
> arch/powerpc/kvm/irq.h | 1 +
> arch/powerpc/kvm/powerpc.c | 17 +-
> arch/powerpc/platforms/powernv/opal.c | 1 +
> arch/powerpc/sysdev/xive/common.c | 131 +-
> arch/powerpc/sysdev/xive/native.c | 92 +-
> include/linux/kvm_host.h | 1 -
> virt/kvm/kvm_main.c | 4 -
> 25 files changed, 3305 insertions(+), 85 deletions(-)
> create mode 100644 arch/powerpc/kvm/book3s_hv_rm_xive.c
> create mode 100644 arch/powerpc/kvm/book3s_xive.c
> create mode 100644 arch/powerpc/kvm/book3s_xive.h
> create mode 100644 arch/powerpc/kvm/book3s_xive_template.c
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s_asm.h b/arch/powerpc/include/asm/kvm_book3s_asm.h
> index 0593d94..e719002 100644
> --- a/arch/powerpc/include/asm/kvm_book3s_asm.h
> +++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
> @@ -111,6 +111,8 @@ struct kvmppc_host_state {
> struct kvm_vcpu *kvm_vcpu;
> struct kvmppc_vcore *kvm_vcore;
> void __iomem *xics_phys;
> + void __iomem *xive_tm_area_phys;
> + void __iomem *xive_tm_area_virt;
Does this cause the paca to become a cacheline larger? (Not that
there is much alternative to having these fields.)
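(For what it's worth, running something like "pahole -C paca_struct vmlinux"
before and after the patch would show whether the two extra pointers push
kvm_hstate across another cacheline boundary in the paca.)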
> u32 saved_xirr;
> u64 dabr;
> u64 host_mmcr[7]; /* MMCR 0,1,A, SIAR, SDAR, MMCR2, SIER */
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index 7bba8f4..fc491ac 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -205,6 +205,12 @@ struct kvmppc_spapr_tce_table {
> /* XICS components, defined in book3s_xics.c */
> struct kvmppc_xics;
> struct kvmppc_icp;
> +extern struct kvm_device_ops kvm_xics_ops;
> +
> +/* XIVE components, defined in book3s_xive.c */
> +struct kvmppc_xive;
> +struct kvmppc_xive_vcpu;
> +extern struct kvm_device_ops kvm_xive_ops;
>
> struct kvmppc_passthru_irqmap;
>
> @@ -293,6 +299,7 @@ struct kvm_arch {
> #endif
> #ifdef CONFIG_KVM_XICS
> struct kvmppc_xics *xics;
> + struct kvmppc_xive *xive;
> struct kvmppc_passthru_irqmap *pimap;
> #endif
> struct kvmppc_ops *kvm_ops;
> @@ -421,7 +428,7 @@ struct kvmppc_passthru_irqmap {
>
> #define KVMPPC_IRQ_DEFAULT 0
> #define KVMPPC_IRQ_MPIC 1
> -#define KVMPPC_IRQ_XICS 2
> +#define KVMPPC_IRQ_XICS 2 /* Includes a XIVE option */
>
> #define MMIO_HPTE_CACHE_SIZE 4
>
> @@ -443,6 +450,21 @@ struct mmio_hpte_cache {
>
> struct openpic;
>
> +/* QW0 and QW1 of a context */
> +union xive_qw01 {
> + struct {
> + u8 nsr;
> + u8 cppr;
> + u8 ipb;
> + u8 lsmfb;
> + u8 ack;
> + u8 inc;
> + u8 age;
> + u8 pipr;
> + };
> + __be64 qw;
> +};
This is slightly confusing because a "QW" (quadword) would normally be
128 bits, but this union is 64 bits.
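Maybe rename it so it's clear this only covers the first two words (W0/W1)
of the OS context rather than a full quadword, e.g. something along these
lines (the name is only a suggestion):

	/* First two words (W0/W1) of the thread context, not a full QW */
	union xive_tma_w01 {
		struct {
			u8	nsr;
			u8	cppr;
			u8	ipb;
			u8	lsmfb;
			u8	ack;
			u8	inc;
			u8	age;
			u8	pipr;
		};
		__be64	w01;
	};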
> +
> struct kvm_vcpu_arch {
> ulong host_stack;
> u32 host_pid;
> @@ -688,6 +710,10 @@ struct kvm_vcpu_arch {
> struct openpic *mpic; /* KVM_IRQ_MPIC */
> #ifdef CONFIG_KVM_XICS
> struct kvmppc_icp *icp; /* XICS presentation controller */
> + struct kvmppc_xive_vcpu *xive_vcpu; /* XIVE virtual CPU data */
> + __be32 xive_cam_word; /* Cooked W2 in proper endian with valid bit */
> + u32 xive_pushed; /* Is the VP pushed on the physical CPU ? */
> + union xive_qw01 xive_saved_state; /* W0..1 of XIVE state */
> #endif
>
> #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index c387799..2fcf6cf 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -225,6 +225,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq);
> extern int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp);
> extern int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu);
> extern void kvmppc_rtas_tokens_free(struct kvm *kvm);
> +
> extern int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server,
> u32 priority);
> extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server,
> @@ -232,6 +233,15 @@ extern int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server,
> extern int kvmppc_xics_int_on(struct kvm *kvm, u32 irq);
> extern int kvmppc_xics_int_off(struct kvm *kvm, u32 irq);
>
> +extern int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
> + u32 priority);
> +extern int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server,
> + u32 *priority);
Might be worth a comment here to explain that the first xive is
eXternal Interrupt Virtualization Engine and the second xive is
eXternal Interrupt Vector Entry.
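Something along these lines, for instance:

	/*
	 * Note on naming: in kvmppc_xive_* the "xive" is the interrupt
	 * controller (eXternal Interrupt Virtualization Engine), while
	 * the set_xive/get_xive part of the name keeps the legacy
	 * RTAS/XICS meaning of "xive" (eXternal Interrupt Vector Entry,
	 * i.e. the server/priority pair of an interrupt source).
	 */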
> +extern int kvmppc_xive_int_on(struct kvm *kvm, u32 irq);
> +extern int kvmppc_xive_int_off(struct kvm *kvm, u32 irq);
> +extern void kvmppc_xive_init_module(void);
> +extern void kvmppc_xive_exit_module(void);
> +
> void kvmppc_core_dequeue_debug(struct kvm_vcpu *vcpu);
> void kvmppc_core_queue_debug(struct kvm_vcpu *vcpu);
>
> @@ -412,6 +422,14 @@ static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
> paca[cpu].kvm_hstate.xics_phys = (void __iomem *)addr;
> }
>
> +static inline void kvmppc_set_xive_tm_area(int cpu,
> + unsigned long phys_addr,
> + void __iomem *virt_addr)
> +{
> + paca[cpu].kvm_hstate.xive_tm_area_phys = (void __iomem *)phys_addr;
> + paca[cpu].kvm_hstate.xive_tm_area_virt = virt_addr;
> +}
> +
> static inline u32 kvmppc_get_xics_latch(void)
> {
> u32 xirr;
> @@ -442,6 +460,9 @@ static inline void __init kvm_cma_reserve(void)
> static inline void kvmppc_set_xics_phys(int cpu, unsigned long addr)
> {}
>
> +static inline void kvmppc_set_xive_tm_area_phys(int cpu, unsigned long addr)
> +{}
Shouldn't this be kvmppc_set_xive_tm_area to match the other definition?
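i.e. presumably something like this, taking the same arguments as the
non-stub version above:

	static inline void kvmppc_set_xive_tm_area(int cpu,
						   unsigned long phys_addr,
						   void __iomem *virt_addr)
	{}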
> +
> static inline u32 kvmppc_get_xics_latch(void)
> {
> return 0;
> @@ -492,6 +513,21 @@ extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, __be32 xirr,
> struct kvmppc_irq_map *irq_map,
> struct kvmppc_passthru_irqmap *pimap,
> bool *again);
> +extern int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
> + struct kvm_vcpu *vcpu, u32 cpu);
> +extern void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu);
> +extern int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
> + struct irq_desc *host_desc);
> +extern int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
> + struct irq_desc *host_desc);
> +extern u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu);
> +extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
> +
> +extern int kvmppc_xics_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
> + int level, bool line_status);
> +extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
> + int level, bool line_status);
> +
> extern int h_ipi_redirect;
> #else
> static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
> @@ -546,6 +582,8 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
> long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr,
> unsigned long slb_v, unsigned int status, bool data);
> unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu);
> +unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu);
> +unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server);
> int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
> unsigned long mfrr);
> int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
> diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
> index b1604b73..94b5cca 100644
> --- a/arch/powerpc/include/asm/xive.h
> +++ b/arch/powerpc/include/asm/xive.h
> @@ -55,7 +55,8 @@ struct xive_q {
> #define XIVE_ESB_SET_PQ_01 0xd00
> #define XIVE_ESB_SET_PQ_10 0xe00
> #define XIVE_ESB_SET_PQ_11 0xf00
> -#define XIVE_ESB_MASK XIVE_ESB_SET_PQ_01
> +#define XIVE_ESB_SOFT_MASK XIVE_ESB_SET_PQ_10
> +#define XIVE_ESB_HARD_MASK XIVE_ESB_SET_PQ_01
What's the difference between a "soft" mask and a "hard" mask?
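(My reading of the mask/unmask logic later in the patch is that the "soft"
mask (PQ=10) keeps latching new triggers in the Q bit so they can be
replayed on unmask, while the "hard" mask (PQ=01) turns the source off and
discards them - if that's right, a comment here to that effect would help.)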
>
> extern bool __xive_enabled;
>
> @@ -88,11 +89,11 @@ extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
> __be32 *qpage, u32 order, bool can_escalate);
> extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio);
>
> -extern bool __xive_irq_trigger(struct xive_irq_data *xd);
> -extern bool __xive_irq_retrigger(struct xive_irq_data *xd);
> -extern void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd);
> -
> +extern void xive_native_sync_source(u32 hw_irq);
> extern bool is_xive_irq(struct irq_chip *chip);
> +extern int xive_native_enable_vp(u32 vp_id);
> +extern int xive_native_disable_vp(u32 vp_id);
> +extern int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id);
>
> #else
>
> diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
> index 4367e7d..59fa705 100644
> --- a/arch/powerpc/kernel/asm-offsets.c
> +++ b/arch/powerpc/kernel/asm-offsets.c
> @@ -630,6 +630,8 @@ int main(void)
> HSTATE_FIELD(HSTATE_KVM_VCPU, kvm_vcpu);
> HSTATE_FIELD(HSTATE_KVM_VCORE, kvm_vcore);
> HSTATE_FIELD(HSTATE_XICS_PHYS, xics_phys);
> + HSTATE_FIELD(HSTATE_XIVE_TM_AREA_PHYS, xive_tm_area_phys);
> + HSTATE_FIELD(HSTATE_XIVE_TM_AREA_VIRT, xive_tm_area_virt);
> HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
> HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
> HSTATE_FIELD(HSTATE_PTID, ptid);
> @@ -715,6 +717,14 @@ int main(void)
> OFFSET(VCPU_HOST_MAS6, kvm_vcpu, arch.host_mas6);
> #endif
>
> +#ifdef CONFIG_KVM_XICS
> + DEFINE(VCPU_XIVE_SAVED_STATE, offsetof(struct kvm_vcpu,
> + arch.xive_saved_state));
> + DEFINE(VCPU_XIVE_CAM_WORD, offsetof(struct kvm_vcpu,
> + arch.xive_cam_word));
> + DEFINE(VCPU_XIVE_PUSHED, offsetof(struct kvm_vcpu, arch.xive_pushed));
> +#endif
> +
> #ifdef CONFIG_KVM_EXIT_TIMING
> OFFSET(VCPU_TIMING_EXIT_TBU, kvm_vcpu, arch.timing_exit.tv32.tbu);
> OFFSET(VCPU_TIMING_EXIT_TBL, kvm_vcpu, arch.timing_exit.tv32.tbl);
> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> index b87ccde..ef89c8c 100644
> --- a/arch/powerpc/kvm/Makefile
> +++ b/arch/powerpc/kvm/Makefile
> @@ -74,7 +74,7 @@ kvm-hv-y += \
> book3s_64_mmu_radix.o
>
> kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
> - book3s_hv_rm_xics.o
> + book3s_hv_rm_xics.o book3s_hv_rm_xive.o
>
> ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
> @@ -87,7 +87,7 @@ kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
> endif
>
> kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
> - book3s_xics.o
> + book3s_xics.o book3s_xive.o
>
> kvm-book3s_64-module-objs := \
> $(common-objs-y) \
> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
> index aedacef..e459ec4 100644
> --- a/arch/powerpc/kvm/book3s.c
> +++ b/arch/powerpc/kvm/book3s.c
> @@ -35,6 +35,7 @@
> #include <asm/kvm_book3s.h>
> #include <asm/mmu_context.h>
> #include <asm/page.h>
> +#include <asm/xive.h>
>
> #include "book3s.h"
> #include "trace.h"
> @@ -578,11 +579,14 @@ int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id,
> break;
> #ifdef CONFIG_KVM_XICS
> case KVM_REG_PPC_ICP_STATE:
> - if (!vcpu->arch.icp) {
> + if (!vcpu->arch.icp && !vcpu->arch.xive_vcpu) {
> r = -ENXIO;
> break;
> }
> - *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu));
> + if (xive_enabled())
> + *val = get_reg_val(id, kvmppc_xive_get_icp(vcpu));
> + else
> + *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu));
> break;
> #endif /* CONFIG_KVM_XICS */
> case KVM_REG_PPC_FSCR:
> @@ -648,12 +652,14 @@ int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id,
> #endif /* CONFIG_VSX */
> #ifdef CONFIG_KVM_XICS
> case KVM_REG_PPC_ICP_STATE:
> - if (!vcpu->arch.icp) {
> + if (!vcpu->arch.icp && !vcpu->arch.xive_vcpu) {
> r = -ENXIO;
> break;
> }
> - r = kvmppc_xics_set_icp(vcpu,
> - set_reg_val(id, *val));
> + if (xive_enabled())
> + r = kvmppc_xive_set_icp(vcpu, set_reg_val(id, *val));
> + else
> + r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val));
> break;
> #endif /* CONFIG_KVM_XICS */
> case KVM_REG_PPC_FSCR:
> @@ -924,6 +930,50 @@ int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hcall)
> return kvm->arch.kvm_ops->hcall_implemented(hcall);
> }
>
> +#ifdef CONFIG_KVM_XICS
> +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
> + bool line_status)
> +{
> + if (xive_enabled())
> + return kvmppc_xive_set_irq(kvm, irq_source_id, irq, level,
> + line_status);
> + else
> + return kvmppc_xics_set_irq(kvm, irq_source_id, irq, level,
> + line_status);
> +}
> +
> +int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *irq_entry,
> + struct kvm *kvm, int irq_source_id,
> + int level, bool line_status)
> +{
> + return kvm_set_irq(kvm, irq_source_id, irq_entry->gsi,
> + level, line_status);
> +}
> +static int kvmppc_book3s_set_irq(struct kvm_kernel_irq_routing_entry *e,
> + struct kvm *kvm, int irq_source_id, int level,
> + bool line_status)
> +{
> + return kvm_set_irq(kvm, irq_source_id, e->gsi, level, line_status);
> +}
> +
> +int kvm_irq_map_gsi(struct kvm *kvm,
> + struct kvm_kernel_irq_routing_entry *entries, int gsi)
> +{
> + entries->gsi = gsi;
> + entries->type = KVM_IRQ_ROUTING_IRQCHIP;
> + entries->set = kvmppc_book3s_set_irq;
> + entries->irqchip.irqchip = 0;
> + entries->irqchip.pin = gsi;
> + return 1;
> +}
> +
> +int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
> +{
> + return pin;
> +}
> +
> +#endif /* CONFIG_KVM_XICS */
> +
> static int kvmppc_book3s_init(void)
> {
> int r;
> @@ -934,12 +984,23 @@ static int kvmppc_book3s_init(void)
> #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
> r = kvmppc_book3s_init_pr();
> #endif
> - return r;
>
> +#ifdef CONFIG_KVM_XICS
> + if (xive_enabled()) {
> + kvmppc_xive_init_module();
> + kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS);
> + } else
> + kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS);
> +#endif
> + return r;
> }
>
> static void kvmppc_book3s_exit(void)
> {
> +#ifdef CONFIG_KVM_XICS
> + if (xive_enabled())
> + kvmppc_xive_exit_module();
> +#endif
> #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
> kvmppc_book3s_exit_pr();
> #endif
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index fadb75a..5c340c2 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -67,6 +67,7 @@
> #include <asm/mmu.h>
> #include <asm/opal.h>
> #include <asm/xics.h>
> +#include <asm/xive.h>
>
> #include "book3s.h"
>
> @@ -837,6 +838,10 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
> case H_IPOLL:
> case H_XIRR_X:
> if (kvmppc_xics_enabled(vcpu)) {
> + if (xive_enabled()) {
> + ret = H_NOT_AVAILABLE;
> + return RESUME_GUEST;
> + }
> ret = kvmppc_xics_hcall(vcpu, req);
> break;
> }
> @@ -2947,8 +2952,12 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
> r = kvmppc_book3s_hv_page_fault(run, vcpu,
> vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
> srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
> - } else if (r == RESUME_PASSTHROUGH)
> - r = kvmppc_xics_rm_complete(vcpu, 0);
> + } else if (r == RESUME_PASSTHROUGH) {
> + if (WARN_ON(xive_enabled()))
> + r = H_SUCCESS;
> + else
> + r = kvmppc_xics_rm_complete(vcpu, 0);
> + }
> } while (is_kvmppc_resume_guest(r));
>
> out:
> @@ -3400,10 +3409,19 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
> /*
> * On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed)
> * Set HVICE bit to enable hypervisor virtualization interrupts.
> + * Set HEIC to prevent OS interrupts to go to hypervisor (should
> + * be unnecessary but better safe than sorry in case we re-enable
> + * EE in HV mode with this LPCR still set)
> */
> if (cpu_has_feature(CPU_FTR_ARCH_300)) {
> lpcr &= ~LPCR_VPM0;
> - lpcr |= LPCR_HVICE;
> + lpcr |= LPCR_HVICE | LPCR_HEIC;
> +
> + /* If xive is enabled, we route 0x500 interrupts directly
> + * to the guest
> + */
> + if (xive_enabled())
> + lpcr |= LPCR_LPES;
> }
>
> /*
> @@ -3533,7 +3551,7 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
> struct kvmppc_irq_map *irq_map;
> struct kvmppc_passthru_irqmap *pimap;
> struct irq_chip *chip;
> - int i;
> + int i, rc = 0;
>
> if (!kvm_irq_bypass)
> return 1;
> @@ -3558,10 +3576,10 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
> /*
> * For now, we only support interrupts for which the EOI operation
> * is an OPAL call followed by a write to XIRR, since that's
> - * what our real-mode EOI code does.
> + * what our real-mode EOI code does, or a XIVE interrupt
> */
> chip = irq_data_get_irq_chip(&desc->irq_data);
> - if (!chip || !is_pnv_opal_msi(chip)) {
> + if (!chip || !(is_pnv_opal_msi(chip) || is_xive_irq(chip))) {
> pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n",
> host_irq, guest_gsi);
> mutex_unlock(&kvm->lock);
> @@ -3603,7 +3621,14 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
> if (i == pimap->n_mapped)
> pimap->n_mapped++;
>
> - kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
> + if (xive_enabled())
> + rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc);
> + else
> + kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
> + printk("set mapped for IRQ %d -> %d returned %d\n",
> + host_irq, guest_gsi, rc);
This seems like a debugging thing that should be removed or turned
into a DBG().
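If it's worth keeping at all, something like

	pr_devel("%s: mapped host IRQ %d to guest GSI %d (rc=%d)\n",
		 __func__, host_irq, guest_gsi, rc);

(format string just a suggestion) would be a lot less noisy.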
> + if (rc)
> + irq_map->r_hwirq = 0;
>
> mutex_unlock(&kvm->lock);
>
> @@ -3614,7 +3639,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
> {
> struct irq_desc *desc;
> struct kvmppc_passthru_irqmap *pimap;
> - int i;
> + int i, rc = 0;
>
> if (!kvm_irq_bypass)
> return 0;
> @@ -3641,9 +3666,12 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
> return -ENODEV;
> }
>
> - kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
> + if (xive_enabled())
> + rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc);
> + else
> + kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
>
> - /* invalidate the entry */
> + /* invalidate the entry (what do do on error from the above ?) */
> pimap->mapped[i].r_hwirq = 0;
>
> /*
> @@ -3652,7 +3680,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi)
> */
>
> mutex_unlock(&kvm->lock);
> - return 0;
> + return rc;
> }
>
> static int kvmppc_irq_bypass_add_producer_hv(struct irq_bypass_consumer *cons,
> @@ -3930,7 +3958,7 @@ static int kvmppc_book3s_init_hv(void)
> * indirectly, via OPAL.
> */
> #ifdef CONFIG_SMP
> - if (!get_paca()->kvm_hstate.xics_phys) {
> + if (!xive_enabled() && !get_paca()->kvm_hstate.xics_phys) {
> struct device_node *np;
>
> np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
> diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
> index d48f9b6..8de7ed4 100644
> --- a/arch/powerpc/kvm/book3s_hv_builtin.c
> +++ b/arch/powerpc/kvm/book3s_hv_builtin.c
> @@ -23,6 +23,7 @@
> #include <asm/kvm_book3s.h>
> #include <asm/archrandom.h>
> #include <asm/xics.h>
> +#include <asm/xive.h>
> #include <asm/dbell.h>
> #include <asm/cputhreads.h>
> #include <asm/io.h>
> @@ -31,6 +32,24 @@
>
> #define KVM_CMA_CHUNK_ORDER 18
>
> +#include "book3s_xics.h"
> +#include "book3s_xive.h"
> +
> +/*
> + * The XIVE module will populate these when it loads
> + */
> +unsigned long (*__xive_vm_h_xirr)(struct kvm_vcpu *vcpu);
> +unsigned long (*__xive_vm_h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server);
> +int (*__xive_vm_h_ipi)(struct kvm_vcpu *vcpu, unsigned long server,
> + unsigned long mfrr);
> +int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr);
> +int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr);
> +EXPORT_SYMBOL_GPL(__xive_vm_h_xirr);
> +EXPORT_SYMBOL_GPL(__xive_vm_h_ipoll);
> +EXPORT_SYMBOL_GPL(__xive_vm_h_ipi);
> +EXPORT_SYMBOL_GPL(__xive_vm_h_cppr);
> +EXPORT_SYMBOL_GPL(__xive_vm_h_eoi);
> +
> /*
> * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206)
> * should be power of 2.
> @@ -209,6 +228,7 @@ void kvmhv_rm_send_ipi(int cpu)
> __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
> return;
> }
> +
> /* On POWER8 for IPIs to threads in the same core, use msgsnd. */
> if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
> cpu_first_thread_sibling(cpu) ==
> @@ -218,6 +238,10 @@ void kvmhv_rm_send_ipi(int cpu)
> return;
> }
>
> + /* We should never reach this */
> + if (WARN_ON_ONCE(xive_enabled()))
> + return;
> +
> /* Else poke the target with an IPI */
> xics_phys = paca[cpu].kvm_hstate.xics_phys;
> if (xics_phys)
> @@ -398,6 +422,9 @@ static long kvmppc_read_one_intr(bool *again)
> u8 host_ipi;
> int64_t rc;
>
> + if (xive_enabled())
> + return 1;
Why not do this in kvmppc_read_intr() rather than here?
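i.e. something like this (a sketch from memory of what kvmppc_read_intr()
currently looks like):

	long kvmppc_read_intr(void)
	{
		long ret = 0;
		long rc;
		bool again;

		/* With XIVE, always let the host handle the interrupt */
		if (xive_enabled())
			return 1;

		do {
			again = false;
			rc = kvmppc_read_one_intr(&again);
			if (rc && (ret == 0 || rc > ret))
				ret = rc;
		} while (again);

		return ret;
	}

so the check isn't redone on every iteration of the loop.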
> +
> /* see if a host IPI is pending */
> host_ipi = local_paca->kvm_hstate.host_ipi;
> if (host_ipi)
> @@ -482,3 +509,84 @@ static long kvmppc_read_one_intr(bool *again)
>
> return kvmppc_check_passthru(xisr, xirr, again);
> }
> +
> +static inline bool is_rm(void)
> +{
> + return !(mfmsr() & MSR_DR);
> +}
> +
> +/* XXX FIXME: The xive_vm_* calls are in a module... */
> +
> +unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
> +{
> + if (xive_enabled()) {
> + if (is_rm())
> + return xive_rm_h_xirr(vcpu);
> + if (unlikely(!__xive_vm_h_xirr))
> + return H_NOT_AVAILABLE;
> + return __xive_vm_h_xirr(vcpu);
> + } else
> + return xics_rm_h_xirr(vcpu);
> +}
> +
> +unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu)
> +{
> + vcpu->arch.gpr[5] = get_tb();
> + if (xive_enabled()) {
> + if (is_rm())
> + return xive_rm_h_xirr(vcpu);
> + if (unlikely(!__xive_vm_h_xirr))
> + return H_NOT_AVAILABLE;
> + return __xive_vm_h_xirr(vcpu);
> + } else
> + return xics_rm_h_xirr(vcpu);
> +}
> +
> +unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
> +{
> + if (xive_enabled()) {
> + if (is_rm())
> + return xive_rm_h_ipoll(vcpu, server);
> + if (unlikely(!__xive_vm_h_ipoll))
> + return H_NOT_AVAILABLE;
> + return __xive_vm_h_ipoll(vcpu, server);
> + } else
> + return H_TOO_HARD;
> +}
> +
> +int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
> + unsigned long mfrr)
> +{
> + if (xive_enabled()) {
> + if (is_rm())
> + return xive_rm_h_ipi(vcpu, server, mfrr);
> + if (unlikely(!__xive_vm_h_ipi))
> + return H_NOT_AVAILABLE;
> + return __xive_vm_h_ipi(vcpu, server, mfrr);
> + } else
> + return xics_rm_h_ipi(vcpu, server, mfrr);
> +}
> +
> +int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
> +{
> + if (xive_enabled()) {
> + if (is_rm())
> + return xive_rm_h_cppr(vcpu, cppr);
> + if (unlikely(!__xive_vm_h_cppr))
> + return H_NOT_AVAILABLE;
> + return __xive_vm_h_cppr(vcpu, cppr);
> + } else
> + return xics_rm_h_cppr(vcpu, cppr);
> +}
> +
> +int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
> +{
> + if (xive_enabled()) {
> + if (is_rm())
> + return xive_rm_h_eoi(vcpu, xirr);
> + if (unlikely(!__xive_vm_h_eoi))
> + return H_NOT_AVAILABLE;
> + return __xive_vm_h_eoi(vcpu, xirr);
> + } else
> + return xics_rm_h_eoi(vcpu, xirr);
> +}
> diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
> index 3a1a463..f806880 100644
> --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
> +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
> @@ -485,7 +485,7 @@ static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
> }
>
>
> -unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
> +unsigned long xics_rm_h_xirr(struct kvm_vcpu *vcpu)
> {
> union kvmppc_icp_state old_state, new_state;
> struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
> @@ -523,8 +523,8 @@ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
> return check_too_hard(xics, icp);
> }
>
> -int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
> - unsigned long mfrr)
> +int xics_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
> + unsigned long mfrr)
> {
> union kvmppc_icp_state old_state, new_state;
> struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
> @@ -610,7 +610,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
> return check_too_hard(xics, this_icp);
> }
>
> -int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
> +int xics_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
> {
> union kvmppc_icp_state old_state, new_state;
> struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
> @@ -730,7 +730,7 @@ static int ics_rm_eoi(struct kvm_vcpu *vcpu, u32 irq)
> return check_too_hard(xics, icp);
> }
>
> -int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
> +int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
> {
> struct kvmppc_xics *xics = vcpu->kvm->arch.xics;
> struct kvmppc_icp *icp = vcpu->arch.icp;
> diff --git a/arch/powerpc/kvm/book3s_hv_rm_xive.c b/arch/powerpc/kvm/book3s_hv_rm_xive.c
> new file mode 100644
> index 0000000..6390f71
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_hv_rm_xive.c
> @@ -0,0 +1,47 @@
> +#include <linux/kernel.h>
> +#include <linux/kvm_host.h>
> +#include <linux/err.h>
> +#include <linux/kernel_stat.h>
> +
> +#include <asm/kvm_book3s.h>
> +#include <asm/kvm_ppc.h>
> +#include <asm/hvcall.h>
> +#include <asm/xics.h>
> +#include <asm/debug.h>
> +#include <asm/synch.h>
> +#include <asm/cputhreads.h>
> +#include <asm/pgtable.h>
> +#include <asm/ppc-opcode.h>
> +#include <asm/pnv-pci.h>
> +#include <asm/opal.h>
> +#include <asm/smp.h>
> +#include <asm/asm-prototypes.h>
> +#include <asm/xive.h>
> +
> +#include "book3s_xive.h"
> +#include "../sysdev/xive/xive-regs.h"
> +
> +/* XXX */
> +#include <asm/udbg.h>
> +//#define DBG(fmt...) udbg_printf(fmt)
> +#define DBG(fmt...) do { } while(0)
> +
> +static inline void __iomem *get_tm_area_phys(void)
> +{
> + return local_paca->kvm_hstate.xive_tm_area_phys;
> +}
> +
> +#undef XIVE_RUNTIME_CHECKS
> +#define X_PFX xive_rm_
> +#define X_STATIC
> +#define X_STAT_PFX stat_rm_
> +#define __x_tm_area get_tm_area_phys()
> +#define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_page))
> +#define __x_trig_page(xd) ((void __iomem *)((xd)->trig_page))
> +#define __x_readb __raw_rm_readb
> +#define __x_writeb __raw_rm_writeb
> +#define __x_readw __raw_rm_readw
> +#define __x_readq __raw_rm_readq
> +#define __x_writeq __raw_rm_writeq
> +
> +#include "book3s_xive_template.c"
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index 720b9c0..c06cccd 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -31,6 +31,8 @@
> #include <asm/tm.h>
> #include <asm/opal.h>
>
> +#include "../sysdev/xive/xive-regs.h"
> +
> #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
>
> /* Values in HSTATE_NAPPING(r13) */
> @@ -982,6 +984,23 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
> cmpwi r3, 512 /* 1 microsecond */
> blt hdec_soon
>
> +#ifdef CONFIG_KVM_XICS
> + /* We are entering the guest on that thread, push VCPU to XIVE */
> + ld r10, HSTATE_XIVE_TM_AREA_PHYS(r13)
> + cmpldi cr0, r10, r0
> + beq no_xive
> + ld r11, VCPU_XIVE_SAVED_STATE(r4)
> + li r9, TM_QW1_OS
> + stdcix r11,r9,r10
> + eieio
> + lwz r11, VCPU_XIVE_CAM_WORD(r4)
> + li r9, TM_QW1_OS + TM_WORD2
> + stwcix r11,r9,r10
> + li r9, 1
> + stw r9, VCPU_XIVE_PUSHED(r4)
> +no_xive:
> +#endif /* CONFIG_KVM_XICS */
> +
> deliver_guest_interrupt:
> ld r6, VCPU_CTR(r4)
> ld r7, VCPU_XER(r4)
> @@ -1319,6 +1338,38 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
> blt deliver_guest_interrupt
>
> guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
> +#ifdef CONFIG_KVM_XICS
> + /* We are exiting, pull the VP from the XIVE */
> + lwz r0, VCPU_XIVE_PUSHED(r9)
> + cmpwi cr0, r0, 0
> + beq 1f
> + li r7, TM_SPC_PULL_OS_CTX
> + li r6, TM_QW1_OS
> + mfmsr r0
> + andi. r0, r0, MSR_IR /* in real mode? */
> + beq 2f
> + ld r10, HSTATE_XIVE_TM_AREA_VIRT(r13)
> + cmpldi cr0, r10, 0
> + beq 1f
> + lwzx r11, r7, r10
> + eieio
> + ldx r11, r6, r10
I assume you meant to do these two loads into the same target
register, but I don't know why, so a comment would be useful.
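(My guess is that the first load, from TM_SPC_PULL_OS_CTX, is only there
for its side effect of pulling the OS context out of the TM area and its
value is discarded, while the second load re-reads QW1 to get the state
that is saved below, i.e. roughly:

	/* Load from the "pull" offset: side effect only, value ignored */
	lwzx	r11, r7, r10
	eieio
	/* Re-read QW1 words 0-1 to capture the state we save below */
	ldx	r11, r6, r10

but if so, please say that in a comment.)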
> + b 3f
> +2: ld r10, HSTATE_XIVE_TM_AREA_PHYS(r13)
> + cmpldi cr0, r10, 0
> + beq 1f
> + lwzcix r11, r7, r10
> + eieio
> + ldcix r11, r6, r10
> +3: std r11, VCPU_XIVE_SAVED_STATE(r9)
> + /* Fixup some of the state for the next load */
> + li r10, 0
> + li r0, 0xff
> + stw r10, VCPU_XIVE_PUSHED(r9)
> + stb r10, (VCPU_XIVE_SAVED_STATE+3)(r9)
> + stb r0, (VCPU_XIVE_SAVED_STATE+4)(r9)
> +1:
> +#endif /* CONFIG_KVM_XICS */
> /* Save more register state */
> mfdar r6
> mfdsisr r7
> @@ -2035,7 +2086,7 @@ hcall_real_table:
> .long DOTSYM(kvmppc_rm_h_eoi) - hcall_real_table
> .long DOTSYM(kvmppc_rm_h_cppr) - hcall_real_table
> .long DOTSYM(kvmppc_rm_h_ipi) - hcall_real_table
> - .long 0 /* 0x70 - H_IPOLL */
> + .long DOTSYM(kvmppc_rm_h_ipoll) - hcall_real_table
> .long DOTSYM(kvmppc_rm_h_xirr) - hcall_real_table
> #else
> .long 0 /* 0x64 - H_EOI */
> @@ -2205,7 +2256,11 @@ hcall_real_table:
> .long 0 /* 0x2f0 */
> .long 0 /* 0x2f4 */
> .long 0 /* 0x2f8 */
> - .long 0 /* 0x2fc */
> +#ifdef CONFIG_KVM_XICS
> + .long DOTSYM(kvmppc_rm_h_xirr_x) - hcall_real_table
> +#else
> + .long 0 /* 0x2fc - H_XIRR_X*/
> +#endif
> .long DOTSYM(kvmppc_h_random) - hcall_real_table
> .globl hcall_real_table_end
> hcall_real_table_end:
> @@ -2980,6 +3035,7 @@ kvmppc_fix_pmao:
> isync
> blr
>
> +
Gratuitous extra blank line.
> #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
> /*
> * Start timing an activity
> diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c
> index 20528701..2d3b2b1 100644
> --- a/arch/powerpc/kvm/book3s_rtas.c
> +++ b/arch/powerpc/kvm/book3s_rtas.c
> @@ -16,6 +16,7 @@
> #include <asm/kvm_ppc.h>
> #include <asm/hvcall.h>
> #include <asm/rtas.h>
> +#include <asm/xive.h>
>
> #ifdef CONFIG_KVM_XICS
> static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
> @@ -32,7 +33,10 @@ static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
> server = be32_to_cpu(args->args[1]);
> priority = be32_to_cpu(args->args[2]);
>
> - rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority);
> + if (xive_enabled())
> + rc = kvmppc_xive_set_xive(vcpu->kvm, irq, server, priority);
> + else
> + rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority);
> if (rc)
> rc = -3;
> out:
> @@ -52,7 +56,10 @@ static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args)
> irq = be32_to_cpu(args->args[0]);
>
> server = priority = 0;
> - rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority);
> + if (xive_enabled())
> + rc = kvmppc_xive_get_xive(vcpu->kvm, irq, &server, &priority);
> + else
> + rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority);
> if (rc) {
> rc = -3;
> goto out;
> @@ -76,7 +83,10 @@ static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args)
>
> irq = be32_to_cpu(args->args[0]);
>
> - rc = kvmppc_xics_int_off(vcpu->kvm, irq);
> + if (xive_enabled())
> + rc = kvmppc_xive_int_off(vcpu->kvm, irq);
> + else
> + rc = kvmppc_xics_int_off(vcpu->kvm, irq);
> if (rc)
> rc = -3;
> out:
> @@ -95,7 +105,10 @@ static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args)
>
> irq = be32_to_cpu(args->args[0]);
>
> - rc = kvmppc_xics_int_on(vcpu->kvm, irq);
> + if (xive_enabled())
> + rc = kvmppc_xive_int_on(vcpu->kvm, irq);
> + else
> + rc = kvmppc_xics_int_on(vcpu->kvm, irq);
> if (rc)
> rc = -3;
> out:
> diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
> index ef4fd52..e6829c4 100644
> --- a/arch/powerpc/kvm/book3s_xics.c
> +++ b/arch/powerpc/kvm/book3s_xics.c
> @@ -1307,8 +1307,8 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
> return 0;
> }
>
> -int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
> - bool line_status)
> +int kvmppc_xics_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
> + bool line_status)
> {
> struct kvmppc_xics *xics = kvm->arch.xics;
>
> @@ -1317,14 +1317,6 @@ int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
> return ics_deliver_irq(xics, irq, level);
> }
>
> -int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *irq_entry,
> - struct kvm *kvm, int irq_source_id,
> - int level, bool line_status)
> -{
> - return kvm_set_irq(kvm, irq_source_id, irq_entry->gsi,
> - level, line_status);
> -}
> -
> static int xics_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> {
> struct kvmppc_xics *xics = dev->private;
> @@ -1458,29 +1450,6 @@ void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu)
> vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
> }
>
> -static int xics_set_irq(struct kvm_kernel_irq_routing_entry *e,
> - struct kvm *kvm, int irq_source_id, int level,
> - bool line_status)
> -{
> - return kvm_set_irq(kvm, irq_source_id, e->gsi, level, line_status);
> -}
> -
> -int kvm_irq_map_gsi(struct kvm *kvm,
> - struct kvm_kernel_irq_routing_entry *entries, int gsi)
> -{
> - entries->gsi = gsi;
> - entries->type = KVM_IRQ_ROUTING_IRQCHIP;
> - entries->set = xics_set_irq;
> - entries->irqchip.irqchip = 0;
> - entries->irqchip.pin = gsi;
> - return 1;
> -}
> -
> -int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
> -{
> - return pin;
> -}
> -
> void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long irq,
> unsigned long host_irq)
> {
> diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
> index ec5474c..5016676 100644
> --- a/arch/powerpc/kvm/book3s_xics.h
> +++ b/arch/powerpc/kvm/book3s_xics.h
> @@ -144,5 +144,10 @@ static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics,
> return ics;
> }
>
> +extern unsigned long xics_rm_h_xirr(struct kvm_vcpu *vcpu);
> +extern int xics_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
> + unsigned long mfrr);
> +extern int xics_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
> +extern int xics_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
>
> #endif /* _KVM_PPC_BOOK3S_XICS_H */
> diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
> new file mode 100644
> index 0000000..acc882d
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_xive.c
> @@ -0,0 +1,1898 @@
> +/*
> + * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/kernel.h>
> +#include <linux/kvm_host.h>
> +#include <linux/err.h>
> +#include <linux/gfp.h>
> +#include <linux/spinlock.h>
> +#include <linux/delay.h>
> +#include <linux/percpu.h>
> +#include <linux/cpumask.h>
> +#include <asm/uaccess.h>
> +#include <asm/kvm_book3s.h>
> +#include <asm/kvm_ppc.h>
> +#include <asm/hvcall.h>
> +#include <asm/xics.h>
> +#include <asm/xive.h>
> +#include <asm/debug.h>
> +#include <asm/time.h>
> +#include <asm/opal.h>
> +
> +#include <linux/debugfs.h>
> +#include <linux/seq_file.h>
> +
> +#include "book3s_xive.h"
> +#include "../sysdev/xive/xive-regs.h"
> +
> +//#define DBG(fmt...) printk("KVM/XIVE: " fmt)
> +#define DBG(fmt...) do { } while(0)
> +
> +#ifdef XIVE_RUNTIME_CHECKS
> +#define xive_assert(cond) WARN_ON(!(cond))
> +#else
> +#define xive_assert(cond) (false)
> +#endif
> +
> +/*
> + * Virtual mode variants of the hcalls for use on radix/radix
> + * with AIL. They require the VCPU's VP to be "pushed"
> + *
> + * We still instanciate them here because we use some of the
> + * generated utility functions as well in this file.
> + */
> +#define XIVE_RUNTIME_CHECKS
> +#define X_PFX xive_vm_
> +#define X_STATIC static
> +#define X_STAT_PFX stat_vm_
> +#define __x_tm_area xive_tm_area
> +#define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_mmio))
> +#define __x_trig_page(xd) ((void __iomem *)((xd)->trig_mmio))
> +#define __x_readb __raw_readb
> +#define __x_writeb __raw_writeb
> +#define __x_readw __raw_readw
> +#define __x_readq __raw_readq
> +#define __x_writeq __raw_writeq
> +
> +#include "book3s_xive_template.c"
> +
> +/* We leave a gap of a couple of interrupts in the queue to
> + * account for the IPI and additional safety guard
> + */
> +#define XIVE_Q_GAP 2
> +
> +/*
> + * This is a simple trigger for a generic XIVE IRQ. This must
> + * only be called for interrupts that support a trigger page
> + */
> +static bool xive_irq_trigger(struct xive_irq_data *xd)
> +{
> + /* This should be only for MSIs */
> + if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI))
> + return false;
> +
> + /* Those interrupts should always have a trigger page */
> + if (WARN_ON(!xd->trig_mmio))
> + return false;
> +
> + out_be64(xd->trig_mmio, 0);
> +
> + return true;
> +}
> +
> +static irqreturn_t xive_esc_irq(int irq, void *data)
> +{
> + struct kvm_vcpu *vcpu = data;
> +
> + /* We use the existing H_PROD mechanism to wake up the target */
> + vcpu->arch.prodded = 1;
> + smp_mb();
> + if (vcpu->arch.ceded)
> + kvmppc_fast_vcpu_kick(vcpu);
> +
> + return IRQ_HANDLED;
> +}
> +
> +static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + struct xive_q *q = &xc->queues[prio];
> + char *name = NULL;
> + int rc;
> +
> + /* Already there ? */
> + if (xc->esc_virq[prio])
> + return 0;
> +
> + /* Hook up the escalation interrupt */
> + xc->esc_virq[prio] = irq_create_mapping(NULL, q->esc_irq);
> + if (!xc->esc_virq[prio]) {
> + pr_err("XIVE-KVM: Failed to map escalation interrupt"
> + " for queue %d of VCPU %d\n",
> + prio, xc->server_num);
> + return -EIO;
> + }
> +
> + /*
> + * Future improvement: start with them disabled
> + * and handle DD2 and later scheme of merged escalation
> + * interrupts
> + */
> + name = kasprintf(GFP_KERNEL, "kvm-%d-%d-%d\n",
> + vcpu->kvm->arch.lpid, xc->server_num, prio);
> + if (!name) {
> + pr_err("XIVE-KVM: Failed to allocate escalation irq name"
> + " for queue %d of VCPU %d\n",
> + prio, xc->server_num);
> + rc = -ENOMEM;
> + goto error;
> + }
> + rc = request_irq(xc->esc_virq[prio], xive_esc_irq,
> + IRQF_NO_THREAD, name, vcpu);
> + if (rc) {
> + pr_err("XIVE-KVM: Failed to request escalation interrupt"
> + " for queue %d of VCPU %d\n",
> + prio, xc->server_num);
> + goto error;
> + }
> + xc->esc_virq_names[prio] = name;
> + return 0;
> + error:
> + irq_dispose_mapping(xc->esc_virq[prio]);
> + xc->esc_virq[prio] = 0;
> + kfree(name);
> + return rc;
> +}
> +
> +static int xive_provision_queue(struct kvm_vcpu *vcpu, u8 prio)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + struct kvmppc_xive *xive = xc->xive;
> + struct xive_q *q = &xc->queues[prio];
> + void *qpage;
> + int rc;
> +
> + if (WARN_ON(q->qpage))
> + return 0;
> +
> + /* Allocate the queue and retrieve infos on current node for now */
> + qpage = (__be32 *)__get_free_pages(GFP_KERNEL, xive->q_alloc_order);
Possibly q_page_order would be a better name than q_alloc_order.
> + if (!qpage) {
> + pr_err("XIVE-KVM: Failed to allocate queue %d for VCPU %d\n",
> + prio, xc->server_num);
> + return -ENOMEM;;
> + }
> + memset(qpage, 0, 1 << xive->q_order);
> +
> + /*
> + * Reconfigure the queue. This will set q->qpage only once the
> + * queue is fully configured. This is a requirement for prio 0
> + * as we will stop doing EOIs for every IPI as soon as we observe
> + * qpage being non-NULL, and instead will only EOI when we receive
> + * corresponding queue 0 entries
> + */
> + rc = xive_native_configure_queue(xc->vp_id, q, prio, qpage,
> + xive->q_order, true);
> + if (rc)
> + pr_err("XIVE-KVM: Failed to configure queue %d for VCPU %d\n",
> + prio, xc->server_num);
> + return rc;
> +}
> +
> +/* Called with kvm_lock held */
> +static int xive_check_provisioning(struct kvm *kvm, u8 prio)
> +{
> + struct kvmppc_xive *xive = kvm->arch.xive;
> + struct kvm_vcpu *vcpu;
> + int i, rc;
> +
> + lockdep_assert_held(&kvm->lock);
> +
> + /* Already provisioned ? */
> + if (xive->qmap & (1 << prio))
> + return 0;
> +
> + DBG("Provisioning prio... %d\n", prio);
> +
> + /* Provision each VCPU and enable escalations */
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + if (!vcpu->arch.xive_vcpu)
> + continue;
> + rc = xive_provision_queue(vcpu, prio);
> + if (rc == 0)
> + xive_attach_escalation(vcpu, prio);
> + if (rc)
> + return rc;
> + }
> +
> + /* Order previous stores and mark it as provisioned */
> + mb();
> + xive->qmap |= (1 << prio);
> + return 0;
> +}
> +
> +static void xive_inc_q_pending(struct kvm *kvm, u32 server, u8 prio)
> +{
> + struct kvm_vcpu *vcpu;
> + struct kvmppc_xive_vcpu *xc;
> + struct xive_q *q;
> +
> + /* Locate target server */
> + vcpu = kvmppc_xive_find_server(kvm, server);
> + if (!vcpu) {
> + pr_warn("%s: Can't find server %d\n", __func__, server);
> + return;
> + }
> + xc = vcpu->arch.xive_vcpu;
> + if (WARN_ON(!xc))
> + return;
> +
> + q = &xc->queues[prio];
> + atomic_inc(&q->pending_count);
> +}
> +
> +static int xive_try_pick_queue(struct kvm_vcpu *vcpu, u8 prio)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + struct xive_q *q;
> + u32 max;
> +
> + if (WARN_ON(!xc))
> + return -ENXIO;
> + if (!xc->valid)
> + return -ENXIO;
> +
> + q = &xc->queues[prio];
> + if (WARN_ON(!q->qpage))
> + return -ENXIO;
> +
> + /* Calculate max number of interrupts in that queue. */
> + max = (q->msk + 1) - XIVE_Q_GAP;
> + return atomic_add_unless(&q->count, 1, max) ? 0 : -EBUSY;
> +}
> +
> +static int xive_select_target(struct kvm *kvm, u32 *server, u8 prio)
> +{
> + struct kvm_vcpu *vcpu;
> + int i, rc;
> +
> + /* Locate target server */
> + vcpu = kvmppc_xive_find_server(kvm, *server);
> + if (!vcpu) {
> + DBG("Can't find server %d\n", *server);
> + return -EINVAL;
> + }
> +
> + DBG("Finding irq target on 0x%x/%d...\n", *server, prio);
> +
> + /* Try pick it */
> + rc = xive_try_pick_queue(vcpu, prio);
> + if (rc == 0)
> + return rc;
> +
> + DBG(" .. failed, looking up candidate...\n");
> +
> + /* Failed, pick another VCPU */
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + if (!vcpu->arch.xive_vcpu)
> + continue;
> + rc = xive_try_pick_queue(vcpu, prio);
> + if (rc == 0) {
> + *server = vcpu->arch.xive_vcpu->server_num;
> + DBG(" found on 0x%x/%d\n", *server, prio);
> + return rc;
> + }
> + }
> + DBG(" no available target !\n");
> +
> + /* No available target ! */
> + return -EBUSY;
> +}
> +
> +static u8 xive_lock_and_mask(struct kvmppc_xive *xive,
> + struct kvmppc_xive_src_block *sb,
> + struct kvmppc_xive_irq_state *state)
> +{
> + struct xive_irq_data *xd;
> + u32 hw_num;
> + u8 old_prio;
> + u64 val;
> +
> + /*
> + * Take the lock, set masked, try again if racing
> + * with H_EOI
> + */
> + for (;;) {
> + arch_spin_lock(&sb->lock);
> + old_prio = state->guest_priority;
> + state->guest_priority = MASKED;
> + mb();
> + if (!state->in_eoi)
> + break;
> + state->guest_priority = old_prio;
> + arch_spin_unlock(&sb->lock);
> + }
> +
> + /* No change ? Bail */
> + if (old_prio == MASKED)
> + return old_prio;
> +
> + /* Get the right irq */
> + kvmppc_xive_select_irq(state, &hw_num, &xd);
> +
> + /*
> + * If the interrupt is marked as needing masking via
> + * firmware, we do it here. Firmware masking however
> + * is "lossy", it won't return the old p and q bits
> + * and won't set the interrupt to a state where it will
> + * record queued ones. If this is an issue we should do
> + * lazy masking instead.
> + *
> + * For now, we work around this in unmask by forcing
> + * an interrupt whenever we unmask a non-LSI via FW
> + * (if ever).
> + */
> + if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) {
> + xive_native_configure_irq(hw_num,
> + xive->vp_base + state->act_server,
> + MASKED, state->number);
> + /* set old_p so we can track if an H_EOI was done */
> + state->old_p = true;
> + state->old_q = false;
> + } else {
> + /* Set PQ to 10, return old P and old Q and remember them */
> + val = xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_10);
> + state->old_p = !!(val & 2);
> + state->old_q = !!(val & 1);
> +
> + /*
> + * Synchronize hardware to sensure the queues are updated
> + * when masking
> + */
> + xive_native_sync_source(hw_num);
> + }
> +
> + return old_prio;
> +}
> +
> +static void xive_lock_for_unmask(struct kvmppc_xive_src_block *sb,
> + struct kvmppc_xive_irq_state *state)
> +{
> + /*
> + * Take the lock try again if racing with H_EOI
> + */
> + for (;;) {
> + arch_spin_lock(&sb->lock);
> + if (!state->in_eoi)
> + break;
> + arch_spin_unlock(&sb->lock);
> + }
> +}
> +
> +static void xive_finish_unmask(struct kvmppc_xive *xive,
> + struct kvmppc_xive_src_block *sb,
> + struct kvmppc_xive_irq_state *state,
> + u8 prio)
> +{
> + struct xive_irq_data *xd;
> + u32 hw_num;
> +
> + /* If we aren't changing a thing, move on */
> + if (state->guest_priority != MASKED)
> + goto bail;
> +
> + /* Get the right irq */
> + kvmppc_xive_select_irq(state, &hw_num, &xd);
> +
> + /*
> + * See command in xive_lock_and_mask() concerning masking
> + * via firmware.
> + */
> + if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) {
> + xive_native_configure_irq(hw_num,
> + xive->vp_base + state->act_server,
> + state->act_priority, state->number);
> + /* If an EOI is needed, do it here */
> + if (!state->old_p)
> + xive_vm_source_eoi(hw_num, xd);
> + /* If this is not an LSI, force a trigger */
> + if (!(xd->flags & OPAL_XIVE_IRQ_LSI))
> + xive_irq_trigger(xd);
> + goto bail;
> + }
> +
> + /* Old Q set, set PQ to 11 */
> + if (state->old_q)
> + xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_11);
> +
> + /*
> + * If not old P, then perform an "effective" EOI,
> + * on the source. This will handle the cases where
> + * FW EOI is needed.
> + */
> + if (!state->old_p)
> + xive_vm_source_eoi(hw_num, xd);
> +
> + /* Synchronize ordering and mark unmasked */
> + mb();
> + bail:
> + state->guest_priority = prio;
> +}
> +
> +/*
> + * Target an interrupt to a given server/prio, this will fallback
> + * to another server if necessary and perform the HW targetting
> + * updates as needed
> + *
> + * NOTE: Must be called with the state lock held
> + */
> +static int xive_target_interrupt(struct kvm *kvm,
> + struct kvmppc_xive_irq_state *state,
> + u32 server, u8 prio)
> +{
> + struct kvmppc_xive *xive = kvm->arch.xive;
> + u32 hw_num;
> + int rc;
> +
> + /*
> + * This will return a tentative server and actual
> + * priority. The count for that new target will have
> + * already been incremented.
> + */
> + rc = xive_select_target(kvm, &server, prio);
> +
> + /* We failed to find a target ? Not much we can do
> + * at least until we support the GIQ.
> + */
> + if (rc)
> + return rc;
> +
> + /*
> + * Increment the old queue pending count if there
> + * was one so that the old queue count gets adjusted later
> + * when observed to be empty.
> + */
> + if (state->act_priority != MASKED)
> + xive_inc_q_pending(kvm,
> + state->act_server,
> + state->act_priority);
> + /*
> + * Update state and HW
> + */
> + state->act_priority = prio;
> + state->act_server = server;
> +
> + /* Get the right irq */
> + kvmppc_xive_select_irq(state, &hw_num, NULL);
> +
> + return xive_native_configure_irq(hw_num,
> + xive->vp_base + server,
> + prio, state->number);
> +}
> +
> +/*
> + * Targetting rules: In order to avoid losing track of
> + * pending interrupts accross mask and unmask, which would
> + * allow queue overflows, we implement the following rules:
> + *
> + * - Unless it was never enabled (or we run out of capacity)
> + * an interrupt is always targetted at a valid server/queue
> + * pair even when "masked" by the guest. This pair tends to
> + * be the last one used but it can be changed under some
> + * circumstances. That allows us to separate targetting
> + * from masking, we only handle accounting during (re)targetting,
> + * this also allows us to let an interrupt drain into its target
> + * queue after masking, avoiding complex schemes to remove
> + * interrupts out of remote processor queues.
> + *
> + * - When masking, we set PQ to 10 and save the previous value
> + * of P and Q.
> + *
> + * - When unmasking, if saved Q was set, we set PQ to 11
> + * otherwise we leave PQ to the HW state which will be either
> + * 10 if nothing happened or 11 if the interrupt fired while
> + * masked. Effectively we are OR'ing the previous Q into the
> + * HW Q.
> + *
> + * Then if saved P is clear, we do an effective EOI (Q->P->Trigger)
> + * which will unmask the interrupt and shoot a new one if Q was
> + * set.
> + *
> + * Otherwise (saved P is set) we leave PQ unchanged (so 10 or 11,
> + * effectively meaning an H_EOI from the guest is still expected
> + * for that interrupt).
> + *
> + * - If H_EOI occurs while masked, we clear the saved P.
> + *
> + * - When changing target, we account on the new target and
> + * increment a separate "pending" counter on the old one.
> + * This pending counter will be used to decrement the old
> + * target's count when its queue has been observed empty.
> + */
> +
> +int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
> + u32 priority)
> +{
> + struct kvmppc_xive *xive = kvm->arch.xive;
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + u8 new_act_prio;
> + int rc = 0;
> + u16 idx;
> +
> + if (!xive)
> + return -ENODEV;
> +
> + DBG("set_xive ! irq 0x%x server 0x%x prio %d\n",
> + irq, server, priority);
> +
> + /* First, check provisioning of queues */
> + if (priority != MASKED)
> + rc = xive_check_provisioning(xive->kvm,
> + xive_prio_from_guest(priority));
> + if (rc) {
> + DBG(" provisioning failure %d !\n", rc);
> + return rc;
> + }
> +
> + sb = kvmppc_xive_find_source(xive, irq, &idx);
> + if (!sb)
> + return -EINVAL;
> + state = &sb->irq_state[idx];
> +
> + /*
> + * We first handle masking/unmasking since the locking
> + * might need to be retried due to EOIs, we'll handle
> + * targetting changes later. These functions will return
> + * with the SB lock held.
> + *
> + * xive_lock_and_mask() will also set state->guest_priority
> + * but won't otherwise change other fields of the state.
> + *
> + * xive_lock_for_unmask will not actually unmask, this will
> + * be done later by xive_finish_unmask() once the targetting
> + * has been done, so we don't try to unmask an interrupt
> + * that hasn't yet been targetted.
> + */
> + if (priority == MASKED)
> + xive_lock_and_mask(xive, sb, state);
> + else
> + xive_lock_for_unmask(sb, state);
> +
> +
> + /*
> + * Then we handle targetting.
> + *
> + * First calculate a new "actual priority"
> + */
> + new_act_prio = state->act_priority;
> + if (priority != MASKED)
> + new_act_prio = xive_prio_from_guest(priority);
> +
> + DBG(" new_act_prio=%x act_server=%x act_prio=%x\n",
> + new_act_prio, state->act_server, state->act_priority);
> +
> + /*
> + * Then check if we actually need to change anything,
> + *
> + * The condition for re-targetting the interrupt is that
> + * we have a valid new priority (new_act_prio is not 0xff)
> + * and either the server or the priority changed.
> + *
> + * Note: If act_priority was ff and the new priority is
> + * also ff, we don't do anything and leave the interrupt
> + * untargetted. An attempt of doing an int_on on an
> + * untargetted interrupt will fail. If that is a problem
> + * we could initialize interrupts with valid default
> + */
> +
> + if (new_act_prio != MASKED &&
> + (state->act_server != server ||
> + state->act_priority != new_act_prio))
> + rc = xive_target_interrupt(kvm, state, server, new_act_prio);
> +
> + /*
> + * Perform the final unmasking of the interrupt source
> + * if necessary
> + */
> + if (priority != MASKED)
> + xive_finish_unmask(xive, sb, state, priority);
> +
> + /*
> + * Finally Update saved_priority to match. Only int_on/off
> + * set this field to a different value.
> + */
> + state->saved_priority = priority;
> +
> + arch_spin_unlock(&sb->lock);
> + return rc;
> +}
> +
> +int kvmppc_xive_get_xive(struct kvm *kvm, u32 irq, u32 *server,
> + u32 *priority)
> +{
> + struct kvmppc_xive *xive = kvm->arch.xive;
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + u16 idx;
> +
> + if (!xive)
> + return -ENODEV;
> +
> + sb = kvmppc_xive_find_source(xive, irq, &idx);
> + if (!sb)
> + return -EINVAL;
> + state = &sb->irq_state[idx];
> + arch_spin_lock(&sb->lock);
> + *server = state->guest_server;
> + *priority = state->guest_priority;
> + arch_spin_unlock(&sb->lock);
> +
> + return 0;
> +}
> +
> +int kvmppc_xive_int_on(struct kvm *kvm, u32 irq)
> +{
> + struct kvmppc_xive *xive = kvm->arch.xive;
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + u16 idx;
> +
> + if (!xive)
> + return -ENODEV;
> +
> + sb = kvmppc_xive_find_source(xive, irq, &idx);
> + if (!sb)
> + return -EINVAL;
> + state = &sb->irq_state[idx];
> +
> + DBG("int_on(irq=0x%x)\n", irq);
> +
> + /*
> + * Check if interrupt was not targetted
> + */
> + if (state->act_priority == MASKED) {
> + DBG("int_on on untargetted interrupt\n");
> + return -EINVAL;
> + }
> +
> + /* If saved_priority is 0xff, do nothing */
> + if (state->saved_priority == MASKED)
> + return 0;
> +
> + /*
> + * Lock and unmask it.
> + */
> + xive_lock_for_unmask(sb, state);
> + xive_finish_unmask(xive, sb, state, state->saved_priority);
> + arch_spin_unlock(&sb->lock);
> +
> + return 0;
> +}
> +
> +int kvmppc_xive_int_off(struct kvm *kvm, u32 irq)
> +{
> + struct kvmppc_xive *xive = kvm->arch.xive;
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + u16 idx;
> +
> + if (!xive)
> + return -ENODEV;
> +
> + sb = kvmppc_xive_find_source(xive, irq, &idx);
> + if (!sb)
> + return -EINVAL;
> + state = &sb->irq_state[idx];
> +
> + DBG("int_off(irq=0x%x)\n", irq);
> +
> + /*
> + * Lock and mask
> + */
> + state->saved_priority = xive_lock_and_mask(xive, sb, state);
> + arch_spin_unlock(&sb->lock);
> +
> + return 0;
> +}
> +
> +static bool xive_restore_pending_irq(struct kvmppc_xive *xive, u32 irq)
> +{
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + u16 idx;
> +
> + sb = kvmppc_xive_find_source(xive, irq, &idx);
> + if (!sb)
> + return false;
> + state = &sb->irq_state[idx];
> + if (!state->valid)
> + return false;
> +
> + /*
> + * Trigger the IPI. This assumes we never restore a pass-through
> + * interrupt which should be safe enough
> + */
> + xive_irq_trigger(&state->ipi_data);
> +
> + return true;
> +}
> +
> +u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> +
> + if (!xc)
> + return 0;
> +
> + /* Return the per-cpu state for state saving/migration */
> + return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT |
> + (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT;
> +}
> +
> +int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
> + u8 cppr, mfrr;
> + u32 xisr;
> +
> + if (!xc || !xive)
> + return -ENOENT;
> +
> + /* Grab individual state fields. We don't use pending_pri */
> + cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT;
> + xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) &
> + KVM_REG_PPC_ICP_XISR_MASK;
> + mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT;
> +
> + DBG("set_icp vcpu %d cppr=0x%x mfrr=0x%x xisr=0x%x\n",
> + xc->server_num, cppr, mfrr, xisr);
> +
> + /*
> + * We can't update the state of a "pushed" VCPU, but that
> + * shouldn't happen.
> + */
> + if (WARN_ON(vcpu->arch.xive_pushed))
> + return -EIO;
> +
> + /* Update VCPU HW saved state */
> + vcpu->arch.xive_saved_state.cppr = cppr;
> + xc->hw_cppr = xc->cppr = cppr;
> +
> + /*
> + * Update MFRR state. If it's not 0xff, we mark the VCPU as
> + * having a pending MFRR change, which will re-evaluate the
> + * target. The VCPU will thus potentially get a spurious
> + * interrupt but that's not a big deal.
> + */
> + xc->mfrr = mfrr;
> + if (mfrr < cppr)
> + xive_irq_trigger(&xc->vp_ipi_data);
> +
> + /*
> + * Now saved XIRR is "interesting". It means there's something in
> + * the legacy "1 element" queue... for an IPI we simply ignore it,
> + * as the MFRR restore will handle that. For anything else we need
> + * to force a resend of the source.
> + * However, the source may not have been set up yet. If that's the
> + * case, we keep that info and increment a counter in the xive to
> + * tell subsequent xive_set_source() to go look.
> + */
> + if (xisr > XICS_IPI && !xive_restore_pending_irq(xive, xisr)) {
> + xc->delayed_irq = xisr;
> + xive->delayed_irqs++;
> + DBG(" xisr restore delayed\n");
> + }
> +
> + return 0;
> +}
> +
> +int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
> + struct irq_desc *host_desc)
> +{
> + struct kvmppc_xive *xive = kvm->arch.xive;
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + struct irq_data *host_data = irq_desc_get_irq_data(host_desc);
> + unsigned int host_irq = irq_desc_get_irq(host_desc);
> + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(host_data);
> + u16 idx;
> + u8 prio;
> + int rc;
> +
> + if (!xive)
> + return -ENODEV;
> +
> + DBG("set_mapped girq 0x%lx host HW irq 0x%x...\n", guest_irq, hw_irq);
> +
> + sb = kvmppc_xive_find_source(xive, guest_irq, &idx);
> + if (!sb)
> + return -EINVAL;
> + state = &sb->irq_state[idx];
> +
> + /*
> + * Mark the passed-through interrupt as going to a VCPU,
> + * this will prevent further EOIs and similar operations
> + * from the XIVE code. It will also mask the interrupt
> + * to either PQ=10 or 11 state, the latter if the interrupt
> + * is pending. This will allow us to unmask or retrigger it
> + * after routing it to the guest with a simple EOI.
> + *
> + * The "state" argument is a "token", all it needs is to be
> + * non-NULL to switch to passed-through or NULL for the
> + * other way around. We may not yet have an actual VCPU
> + * target here and we don't really care.
> + */
> + rc = irq_set_vcpu_affinity(host_irq, state);
> + if (rc) {
> + pr_err("Failed to set VCPU affinity for irq %d\n", host_irq);
> + return rc;
> + }
> +
> + /*
> + * Mask and read state of IPI. We need to know if its P bit
> + * is set as that means it's potentially already using a
> + * queue entry in the target
> + */
> + prio = xive_lock_and_mask(xive, sb, state);
> + DBG(" old IPI prio %02x P:%d Q:%d\n", prio, state->old_p, state->old_q);
> +
> + /* Turn the IPI hard off */
> + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
> +
> + /* Grab info about irq */
> + state->pt_number = hw_irq;
> + state->pt_data = irq_data_get_irq_handler_data(host_data);
> +
> + /*
> + * Configure the IRQ to match the existing configuration of
> + * the IPI if it was already targetted. Otherwise this will
> + * mask the interrupt in a lossy way (act_priority is 0xff)
> + * which is fine for a never started interrupt.
> + */
> + xive_native_configure_irq(hw_irq,
> + xive->vp_base + state->act_server,
> + state->act_priority, state->number);
> +
> + /*
> + * We do an EOI to enable the interrupt (and retrigger if needed)
> + * if the guest has the interrupt unmasked and the P bit was *not*
> + * set in the IPI. If it was set, we know a slot may still be in
> + * use in the target queue thus we have to wait for a guest
> + * originated EOI
> + */
> + if (prio != MASKED && !state->old_p)
> + xive_vm_source_eoi(hw_irq, state->pt_data);
> +
> + /* Clear old_p/old_q as they are no longer relevant */
> + state->old_p = state->old_q = false;
> +
> + /* Restore guest prio (unlocks EOI) */
> + mb();
> + state->guest_priority = prio;
> + arch_spin_unlock(&sb->lock);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_xive_set_mapped);
> +
> +int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
> + struct irq_desc *host_desc)
> +{
> + struct kvmppc_xive *xive = kvm->arch.xive;
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + unsigned int host_irq = irq_desc_get_irq(host_desc);
> + u16 idx;
> + u8 prio;
> + int rc;
> +
> + if (!xive)
> + return -ENODEV;
> +
> + DBG("clr_mapped girq 0x%lx...\n", guest_irq);
> +
> + sb = kvmppc_xive_find_source(xive, guest_irq, &idx);
> + if (!sb)
> + return -EINVAL;
> + state = &sb->irq_state[idx];
> +
> + /*
> + * Mask and read state of IRQ. We need to know if its P bit
> + * is set as that means it's potentially already using a
> + * queue entry in the target
> + */
> + prio = xive_lock_and_mask(xive, sb, state);
> + DBG(" old IRQ prio %02x P:%d Q:%d\n", prio, state->old_p, state->old_q);
> +
> + /*
> + * If old_p is set, the interrupt is pending, we switch it to
> + * PQ=11. This will force a resend in the host so the interrupt
> + * isn't lost to whatever host driver may pick it up
> + */
> + if (state->old_p)
> + xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_11);
> +
> + /* Relase the passed-through interrupt to the host */
^^^^^^ Release
> + rc = irq_set_vcpu_affinity(host_irq, NULL);
> + if (rc) {
> + pr_err("Failed to clr VCPU affinity for irq %d\n", host_irq);
> + return rc;
> + }
> +
> + /* Forget about the IRQ */
> + state->pt_number = 0;
> + state->pt_data = NULL;
> +
> + /* Reconfigure the IPI */
> + xive_native_configure_irq(state->ipi_number,
> + xive->vp_base + state->act_server,
> + state->act_priority, state->number);
> +
> + /*
> + * If old_p is set (we have a queue entry potentially
> + * occupied) or the interrupt is masked, we set the IPI
> + * to PQ=10 state. Otherwise we just re-enable it (PQ=00).
> + */
> + if (prio == MASKED || state->old_p)
> + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_10);
> + else
> + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_00);
> +
> + /* Restore guest prio (unlocks EOI) */
> + mb();
> + state->guest_priority = prio;
> + arch_spin_unlock(&sb->lock);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_xive_clr_mapped);
> +
> +static void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + struct kvm *kvm = vcpu->kvm;
> + struct kvmppc_xive *xive = kvm->arch.xive;
> + int i, j;
> +
> + for (i = 0; i <= xive->max_sbid; i++) {
> + struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
> +
> + if (!sb)
> + continue;
> + for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
> + struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
> +
> + if (!state->valid)
> + continue;
> + if (state->act_priority == MASKED)
> + continue;
> + if (state->act_server != xc->server_num)
> + continue;
> +
> + /* Clean it up */
> + arch_spin_lock(&sb->lock);
> + state->act_priority = MASKED;
> + xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
> + xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
> + if (state->pt_number) {
> + xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
> + xive_native_configure_irq(state->pt_number, 0, MASKED, 0);
> + }
> + arch_spin_unlock(&sb->lock);
> + }
> + }
> +}
> +
> +void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + struct kvmppc_xive *xive = xc->xive;
> + int i;
> +
> + DBG("cleanup_vcpu(cpu=%d)\n", xc->server_num);
> +
> + /* Ensure no interrupt is still routed to that VP */
> + xc->valid = false;
> + kvmppc_xive_disable_vcpu_interrupts(vcpu);
> +
> + /* Mask the VP IPI */
> + xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01);
> +
> + /* Disable the VP */
> + xive_native_disable_vp(xc->vp_id);
> +
> + /* Free the queues & associated interrupts */
> + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
> + struct xive_q *q = &xc->queues[i];
> +
> + /* Free the escalation irq */
> + if (xc->esc_virq[i]) {
> + free_irq(xc->esc_virq[i], vcpu);
> + irq_dispose_mapping(xc->esc_virq[i]);
> + kfree(xc->esc_virq_names[i]);
> + }
> + /* Free the queue */
> + xive_native_disable_queue(xc->vp_id, q, i);
> + if (q->qpage) {
> + free_pages((unsigned long)q->qpage,
> + xive->q_alloc_order);
> + q->qpage = NULL;
> + }
> + }
> +
> + /* Free the IPI */
> + if (xc->vp_ipi) {
> + xive_cleanup_irq_data(&xc->vp_ipi_data);
> + xive_native_free_irq(xc->vp_ipi);
> + }
> + /* Free the VP */
> + kfree(xc);
> +}
> +
> +int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
> + struct kvm_vcpu *vcpu, u32 cpu)
> +{
> + struct kvmppc_xive *xive = dev->private;
> + struct kvmppc_xive_vcpu *xc;
> + int i, r = -EBUSY;
> +
> + DBG("connect_vcpu(cpu=%d)\n", cpu);
> +
> + if (dev->ops != &kvm_xive_ops) {
> + DBG("Wrong ops !\n");
> + return -EPERM;
> + }
> + if (xive->kvm != vcpu->kvm)
> + return -EPERM;
> + if (vcpu->arch.irq_type)
> + return -EBUSY;
> + if (kvmppc_xive_find_server(vcpu->kvm, cpu)) {
> + DBG("Duplicate !\n");
> + return -EEXIST;
> + }
> + if (cpu >= KVM_MAX_VCPUS) {
> + DBG("Out of bounds !\n");
> + return -EINVAL;
> + }
> + xc = kzalloc(sizeof(*xc), GFP_KERNEL);
> + if (!xc)
> + return -ENOMEM;
> +
> + /* We need to synchronize with queue provisioning */
> + mutex_lock(&vcpu->kvm->lock);
> + vcpu->arch.xive_vcpu = xc;
> + xc->xive = xive;
> + xc->vcpu = vcpu;
> + xc->server_num = cpu;
> + xc->vp_id = xive->vp_base + cpu;
> + xc->mfrr = 0xff;
> + xc->valid = true;
> +
> + r = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
> + if (r)
> + goto bail;
> +
> + /* Configure VCPU fields for use by assembly push/pull */
> + vcpu->arch.xive_saved_state.qw = cpu_to_be64(0xff000000);
> + vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);
> +
> + /* Allocate IPI */
> + xc->vp_ipi = xive_native_alloc_irq();
> + if (!xc->vp_ipi) {
> + r = -EIO;
> + goto bail;
> + }
> + DBG(" IPI=0x%x\n", xc->vp_ipi);
> +
> + r = xive_native_populate_irq_data(xc->vp_ipi, &xc->vp_ipi_data);
> + if (r)
> + goto bail;
> +
> + /*
> + * Initialize queues. Initially we set them all for no queueing
> + * and we enable escalation for queue 0 only which we'll use for
> + * our mfrr change notifications. If the VCPU is hot-plugged, we
> + * do handle provisioning however.
> + */
> + for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
> + struct xive_q *q = &xc->queues[i];
> +
> + /* Is queue already enabled ? Provision it */
> + if (xive->qmap & (1 << i)) {
> + r = xive_provision_queue(vcpu, i);
> + if (r == 0)
> + xive_attach_escalation(vcpu, i);
> + if (r)
> + goto bail;
> + } else {
> + r = xive_native_configure_queue(xc->vp_id,
> + q, i, NULL, 0, true);
> + if (r) {
> + pr_err("XIVE-KVM: Failed to configure queue %d"
> + " for VCPU %d\n",
> + i, cpu);
> + goto bail;
> + }
> + }
> + }
> +
> + /* If not done above, attach priority 0 escalation */
> + r = xive_attach_escalation(vcpu, 0);
> + if (r)
> + goto bail;
> +
> + /* Enable the VP */
> + r = xive_native_enable_vp(xc->vp_id);
> + if (r)
> + goto bail;
> +
> + /* Route the IPI */
> + r = xive_native_configure_irq(xc->vp_ipi, xc->vp_id, 0, XICS_IPI);
> + if (!r)
> + xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_00);
> +
> + bail:
> + mutex_unlock(&vcpu->kvm->lock);
> + if (r) {
> + kvmppc_xive_cleanup_vcpu(vcpu);
> + return r;
> + }
> +
> + vcpu->arch.irq_type = KVMPPC_IRQ_XICS;
> + return 0;
> +}
> +
> +/*
> + * Scanning of queues before/after migration save
> + */
> +static void xive_pre_save_set_queued(struct kvmppc_xive *xive, u32 irq)
> +{
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + u16 idx;
> +
> + sb = kvmppc_xive_find_source(xive, irq, &idx);
> + if (!sb)
> + return;
> +
> + state = &sb->irq_state[idx];
> +
> + /* Some sanity checking */
> + if (!state->valid) {
> + pr_err("XIVE/XIVE: invalid irq 0x%x in cpu queue!\n", irq);
> + return;
> + }
> +
> + /*
> + * If the interrupt is in a queue it should have P set.
> + * We warn so that it gets reported. A backtrace isn't useful
> + * so no need to use a WARN_ON.
> + */
> + if (!state->saved_p)
> + pr_err("KVM/XIVE: Interrupt 0x%x is marked in a queue"
> + " but P not set !\n", irq);
> +
> + /* Set flag */
> + state->in_queue = true;
> +}
> +
> +static void xive_pre_scan_mask_irq(struct kvmppc_xive *xive,
> + struct kvmppc_xive_src_block *sb,
> + u32 irq)
> +{
> + struct kvmppc_xive_irq_state *state = &sb->irq_state[irq];
> +
> + if (!state->valid)
> + return;
> +
> + /* Mask and save state, this will also sync HW queues */
> + state->saved_scan_prio = xive_lock_and_mask(xive, sb, state);
> +
> + /* Transfer P and Q */
> + state->saved_p = state->old_p;
> + state->saved_q = state->old_q;
> +
> + /* Unlock */
> + arch_spin_unlock(&sb->lock);
> +}
> +
> +static void xive_pre_scan_unmask_irq(struct kvmppc_xive *xive,
I think a better name would be "xive_pre_save_unmask", since this is
actually called after the scan.
> + struct kvmppc_xive_src_block *sb,
> + u32 irq)
> +{
> + struct kvmppc_xive_irq_state *state = &sb->irq_state[irq];
> +
> + if (!state->valid)
> + return;
> +
> + /*
> + * Lock / exclude EOI (not technically necessary if the
> + * guest isn't running concurrently). If this becomes a
> + * performance issue we can probably remove the lock.
> + */
> + xive_lock_for_unmask(sb, state);
> +
> + /* Restore mask/prio if it wasn't masked */
> + if (state->saved_scan_prio != MASKED)
> + xive_finish_unmask(xive, sb, state, state->saved_scan_prio);
> +
> + /* Unlock */
> + arch_spin_unlock(&sb->lock);
> +}
> +
> +static void xive_pre_save_queue(struct kvmppc_xive *xive, struct xive_q *q)
> +{
> + u32 idx = q->idx;
> + u32 toggle = q->toggle;
> + u32 irq;
> +
> + do {
> + irq = __xive_read_eq(q->qpage, q->msk, &idx, &toggle);
> + if (irq > XICS_IPI)
> + xive_pre_save_set_queued(xive, irq);
> + } while (irq);
> +}
> +
> +static void xive_pre_save_scan(struct kvmppc_xive *xive)
> +{
> + struct kvm_vcpu *vcpu = NULL;
> + int i, j;
> +
> + /*
> + * See comment in xive_get_source() about how this
> + * works. Collect a stable state for all interrupts
> + */
> + for (i = 0; i <= xive->max_sbid; i++) {
> + struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
> + if (!sb)
> + continue;
> + for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++)
> + xive_pre_scan_mask_irq(xive, sb, j);
> + }
> +
> + /* Then scan the queues and update the "in_queue" flag */
> + kvm_for_each_vcpu(i, vcpu, xive->kvm) {
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + if (!xc)
> + continue;
> + for (j = 0; j < KVMPPC_XIVE_Q_COUNT; j++) {
> + if (xc->queues[j].qpage)
> + xive_pre_save_queue(xive, &xc->queues[j]);
> + }
> + }
> +
> + /* Finally restore interrupt states */
> + for (i = 0; i <= xive->max_sbid; i++) {
> + struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
> + if (!sb)
> + continue;
> + for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++)
> + xive_pre_scan_unmask_irq(xive, sb, j);
> + }
> +}
> +
> +static void xive_post_save_scan(struct kvmppc_xive *xive)
> +{
> + u32 i, j;
> +
> + /* Clear all the in_queue flags */
> + for (i = 0; i <= xive->max_sbid; i++) {
> + struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
> + if (!sb)
> + continue;
> + for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++)
> + sb->irq_state[j].in_queue = false;
> + }
> +
> + /* Next get_source() will do a new scan */
> + xive->saved_src_count = 0;
> +}
> +
> +/*
> + * This returns the source configuration and state to user space.
> + */
> +static int xive_get_source(struct kvmppc_xive *xive, long irq, u64 addr)
> +{
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + u64 __user *ubufp = (u64 __user *) addr;
> + u64 val, prio;
> + u16 idx;
> +
> + sb = kvmppc_xive_find_source(xive, irq, &idx);
> + if (!sb)
> + return -ENOENT;
> +
> + state = &sb->irq_state[idx];
> +
> + if (!state->valid)
> + return -ENOENT;
> +
> + DBG("get_source(%ld)...\n", irq);
> +
> + /*
> + * So to properly save the state into something that looks like a
> + * XICS migration stream we cannot treat interrupts individually.
> + *
> + * We need, instead, mask them all (& save their previous PQ state)
> + * to get a stable state in the HW, then sync them to ensure that
> + * any interrupt that had already fired hits its queue, and finally
> + * scan all the queues to collect which interrupts are still present
> + * in the queues, so we can set the "pending" flag on them and
> + * they can be resent on restore.
> + *
> + * So we do it all when the "first" interrupt gets saved, all the
> + * state is collected at that point, the rest of xive_get_source()
> + * will merely collect and convert that state to the expected
> + * userspace bit mask.
> + */
> + if (xive->saved_src_count == 0)
> + xive_pre_save_scan(xive);
> + xive->saved_src_count++;
> +
> + /* Convert saved state into something compatible with xics */
> + val = state->guest_server;
> + prio = state->saved_scan_prio;
> +
> + if (prio == MASKED) {
> + val |= KVM_XICS_MASKED;
> + prio = state->saved_priority;
> + }
> + val |= prio << KVM_XICS_PRIORITY_SHIFT;
> + if (state->lsi) {
> + val |= KVM_XICS_LEVEL_SENSITIVE;
> + if (state->saved_p)
> + val |= KVM_XICS_PENDING;
> + } else {
> + if (state->saved_p)
> + val |= KVM_XICS_PRESENTED;
> +
> + if (state->saved_q)
> + val |= KVM_XICS_QUEUED;
> +
> + /*
> + * We mark it pending (which will attempt a re-delivery)
> + * if we are in a queue *or* we were masked and had
> + * Q set which is equivalent to the XICS "masked pending"
> + * state
> + */
> + if (state->in_queue || (prio == MASKED && state->saved_q))
> + val |= KVM_XICS_PENDING;
> + }
> +
> + /*
> + * If that was the last interrupt saved, reset the
> + * in_queue flags
> + */
> + if (xive->saved_src_count == xive->src_count)
> + xive_post_save_scan(xive);
> +
> + /* Copy the result to userspace */
> + if (put_user(val, ubufp))
> + return -EFAULT;
> +
> + return 0;
> +}
> +
> +static struct kvmppc_xive_src_block *xive_create_src_block(struct kvmppc_xive *xive,
> + int irq)
> +{
> + struct kvm *kvm = xive->kvm;
> + struct kvmppc_xive_src_block *sb;
> + int i, bid;
> +
> + bid = irq >> KVMPPC_XICS_ICS_SHIFT;
> +
> + mutex_lock(&kvm->lock);
> +
> + /* block already exists - somebody else got here first */
> + if (xive->src_blocks[bid])
> + goto out;
> +
> + /* Create the ICS */
> + sb = kzalloc(sizeof(*sb), GFP_KERNEL);
> + if (!sb)
> + goto out;
> +
> + sb->id = bid;
> +
> + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
> + sb->irq_state[i].number = (bid << KVMPPC_XICS_ICS_SHIFT) | i;
> + sb->irq_state[i].guest_priority = MASKED;
> + sb->irq_state[i].saved_priority = MASKED;
> + sb->irq_state[i].act_priority = MASKED;
> + }
> + smp_wmb();
> + xive->src_blocks[bid] = sb;
> +
> + if (bid > xive->max_sbid)
> + xive->max_sbid = bid;
> +
> + out:
> + mutex_unlock(&kvm->lock);
> + return xive->src_blocks[bid];
> +}
> +
> +static bool xive_check_delayed_irq(struct kvmppc_xive *xive, u32 irq)
> +{
> + struct kvm *kvm = xive->kvm;
> + struct kvm_vcpu *vcpu = NULL;
> + int i;
> +
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> +
> + if (!xc)
> + continue;
> +
> + if (xc->delayed_irq == irq) {
> + xc->delayed_irq = 0;
> + xive->delayed_irqs--;
> + return true;
> + }
> + }
> + return false;
> +}
> +
> +static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
> +{
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + u64 __user *ubufp = (u64 __user *) addr;
> + u16 idx;
> + u64 val;
> + u8 act_prio, guest_prio;
> + u32 server;
> + int rc = 0;
> +
> + if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS)
> + return -ENOENT;
> +
> + DBG("set_source(irq=0x%lx)\n", irq);
> +
> + /* Find the source */
> + sb = kvmppc_xive_find_source(xive, irq, &idx);
> + if (!sb) {
> + DBG("No source, creating source block...\n");
> + sb = xive_create_src_block(xive, irq);
> + if (!sb) {
> + DBG("Failed to create block...\n");
> + return -ENOMEM;
> + }
> + }
> + state = &sb->irq_state[idx];
> +
> + /* Read user passed data */
> + if (get_user(val, ubufp)) {
> + DBG("fault getting user info !\n");
> + return -EFAULT;
> + }
> +
> + server = val & KVM_XICS_DESTINATION_MASK;
> + guest_prio = val >> KVM_XICS_PRIORITY_SHIFT;
> +
> + DBG(" val=0x016%llx (server=0x%x, guest_prio=%d)\n",
> + val, server, guest_prio);
> + /*
> + * If the source doesn't already have an IPI, allocate
> + * one and get the corresponding data
> + */
> + if (!state->ipi_number) {
> + state->ipi_number = xive_native_alloc_irq();
> + if (state->ipi_number == 0) {
> + DBG("Failed to allocate IPI !\n");
> + return -ENOMEM;
> + }
> + xive_native_populate_irq_data(state->ipi_number, &state->ipi_data);
> + DBG(" src_ipi=0x%x\n", state->ipi_number);
> + }
> +
> + /*
> + * We use lock_and_mask() to set us in the right masked
> + * state. We will override that state from the saved state
> + * further down, but this will handle the cases of interrupts
> + * that need FW masking. We set the initial guest_priority to
> + * 0 before calling it to ensure it actually performs the masking.
> + */
> + state->guest_priority = 0;
> + xive_lock_and_mask(xive, sb, state);
> +
> + /*
> + * Now, we select a target if we have one. If we don't we
> + * leave the interrupt untargetted. It means that an interrupt
> + * can become "untargetted" accross migration if it was masked
> + * by set_xive() but there is little we can do about it.
> + */
> +
> + /* First convert prio and mark interrupt as untargetted */
> + act_prio = xive_prio_from_guest(guest_prio);
> + state->act_priority = MASKED;
> + state->guest_server = server;
> +
> + /*
> + * We need to drop the lock due to the mutex below. Hopefully
> + * nothing is touching that interrupt yet since it hasn't been
> + * advertised to a running guest yet
> + */
> + arch_spin_unlock(&sb->lock);
> +
> + /* If we have a priority target the interrupt */
> + if (act_prio != MASKED) {
> + /* First, check provisioning of queues */
> + mutex_lock(&xive->kvm->lock);
> + rc = xive_check_provisioning(xive->kvm, act_prio);
> + mutex_unlock(&xive->kvm->lock);
> +
> + /* Target interrupt */
> + if (rc == 0)
> + rc = xive_target_interrupt(xive->kvm, state,
> + server, act_prio);
> + /*
> + * If provisioning or targetting failed, leave it
> + * alone and masked. It will remain disabled until
> + * the guest re-targets it.
> + */
> + }
> +
> + /*
> + * Find out if this was a delayed irq stashed in an ICP,
> + * in which case, treat it as pending
> + */
> + if (xive->delayed_irqs && xive_check_delayed_irq(xive, irq)) {
> + val |= KVM_XICS_PENDING;
> + DBG(" Found delayed ! forcing PENDING !\n");
> + }
> +
> + /* Cleanup the SW state */
> + state->old_p = false;
> + state->old_q = false;
> + state->lsi = false;
> + state->asserted = false;
> +
> + /* Restore LSI state */
> + if (val & KVM_XICS_LEVEL_SENSITIVE) {
> + state->lsi = true;
> + if (val & KVM_XICS_PENDING)
> + state->asserted = true;
> + DBG(" LSI ! Asserted=%d\n", state->asserted);
> + }
> +
> + /*
> + * Restore P and Q. If the interrupt was pending, we
> + * force both P and Q, which will trigger a resend.
> + *
> + * That means that a guest that had both an interrupt
> + * pending (queued) and Q set will restore with only
> + * one instance of that interrupt instead of 2, but that
> + * is perfectly fine as coalescing interrupts that haven't
> + * been presented yet is always allowed.
> + */
> + if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING)
> + state->old_p = true;
> + if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING)
> + state->old_q = true;
> +
> + DBG(" P=%d, Q=%d\n", state->old_p, state->old_q);
> +
> + /*
> + * If the interrupt was unmasked, update guest priority and
> + * perform the appropriate state transition and do a
> + * re-trigger if necessary.
> + */
> + if (val & KVM_XICS_MASKED) {
> + DBG(" masked, saving prio\n");
> + state->guest_priority = MASKED;
> + state->saved_priority = guest_prio;
> + } else {
> + DBG(" unmasked, restoring to prio %d\n", guest_prio);
> + xive_finish_unmask(xive, sb, state, guest_prio);
> + state->saved_priority = guest_prio;
> + }
> +
> + /* Increment the number of valid sources and mark this one valid */
> + if (!state->valid)
> + xive->src_count++;
> + state->valid = true;
> +
> + return 0;
> +}
> +
> +int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
> + bool line_status)
> +{
> + struct kvmppc_xive *xive = kvm->arch.xive;
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + u16 idx;
> +
> + if (!xive)
> + return -ENODEV;
> +
> + sb = kvmppc_xive_find_source(xive, irq, &idx);
> + if (!sb)
> + return -EINVAL;
> +
> + /* Perform locklessly .... (we need to do some RCUisms here...) */
> + state = &sb->irq_state[idx];
> + if (!state->valid)
> + return -EINVAL;
> +
> + /* We don't allow a trigger on a passed-through interrupt */
> + if (state->pt_number)
> + return -EINVAL;
> +
> + if ((level == 1 && state->lsi) || level == KVM_INTERRUPT_SET_LEVEL)
> + state->asserted = 1;
> + else if (level == 0 || level == KVM_INTERRUPT_UNSET) {
> + state->asserted = 0;
> + return 0;
> + }
> +
> + /* Trigger the IPI */
> + xive_irq_trigger(&state->ipi_data);
> +
> + return 0;
> +}
> +
> +static int xive_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> +{
> + struct kvmppc_xive *xive = dev->private;
> +
> + /* We honor the existing XICS ioctl */
> + switch (attr->group) {
> + case KVM_DEV_XICS_GRP_SOURCES:
> + return xive_set_source(xive, attr->attr, attr->addr);
> + }
> + return -ENXIO;
> +}
> +
> +static int xive_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> +{
> + struct kvmppc_xive *xive = dev->private;
> +
> + /* We honor the existing XICS ioctl */
> + switch (attr->group) {
> + case KVM_DEV_XICS_GRP_SOURCES:
> + return xive_get_source(xive, attr->attr, attr->addr);
> + }
> + return -ENXIO;
> +}
> +
> +static int xive_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
> +{
> + /* We honor the same limits as XICS, at least for now */
> + switch (attr->group) {
> + case KVM_DEV_XICS_GRP_SOURCES:
> + if (attr->attr >= KVMPPC_XICS_FIRST_IRQ &&
> + attr->attr < KVMPPC_XICS_NR_IRQS)
> + return 0;
> + break;
> + }
> + return -ENXIO;
> +}
> +
> +static void kvmppc_xive_cleanup_irq(u32 hw_num, struct xive_irq_data *xd)
> +{
> + xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01);
> + xive_native_configure_irq(hw_num, 0, MASKED, 0);
> + xive_cleanup_irq_data(xd);
> +}
> +
> +static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
> +{
> + int i;
> +
> + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
> + struct kvmppc_xive_irq_state *state = &sb->irq_state[i];
> +
> + if (!state->valid)
> + continue;
> +
> + kvmppc_xive_cleanup_irq(state->ipi_number, &state->ipi_data);
> + xive_native_free_irq(state->ipi_number);
> +
> + /* Pass-through, cleanup too */
> + if (state->pt_number)
> + kvmppc_xive_cleanup_irq(state->pt_number, state->pt_data);
> +
> + state->valid = false;
> + }
> +}
> +
> +static void kvmppc_xive_free(struct kvm_device *dev)
> +{
> + struct kvmppc_xive *xive = dev->private;
> + struct kvm *kvm = xive->kvm;
> + int i;
> +
> + debugfs_remove(xive->dentry);
> +
> + if (kvm)
> + kvm->arch.xive = NULL;
> +
> + /* Mask and free interrupts */
> + for (i = 0; i <= xive->max_sbid; i++) {
> + if (xive->src_blocks[i])
> + kvmppc_xive_free_sources(xive->src_blocks[i]);
> + kfree(xive->src_blocks[i]);
> + xive->src_blocks[i] = NULL;
> + }
> +
> + if (xive->vp_base != XIVE_INVALID_VP)
> + xive_native_free_vp_block(xive->vp_base);
> +
> +
> + kfree(xive);
> + kfree(dev);
> +}
> +
> +static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
> +{
> + struct kvmppc_xive *xive;
> + struct kvm *kvm = dev->kvm;
> + int ret = 0;
> +
> + DBG("Creating xive for partition\n");
> +
> + xive = kzalloc(sizeof(*xive), GFP_KERNEL);
> + if (!xive)
> + return -ENOMEM;
> +
> + dev->private = xive;
> + xive->dev = dev;
> + xive->kvm = kvm;
> +
> + /* Already there ? */
> + if (kvm->arch.xive)
> + ret = -EEXIST;
> + else
> + kvm->arch.xive = xive;
> +
> + /* We use the default queue size set by the host */
> + xive->q_order = xive_native_default_eq_shift();
> + if (xive->q_order < PAGE_SHIFT)
> + xive->q_alloc_order = 0;
> + else
> + xive->q_alloc_order = xive->q_order - PAGE_SHIFT;
> +
> + /* Allocate a bunch of VPs */
> + xive->vp_base = xive_native_alloc_vp_block(KVM_MAX_VCPUS);
> + DBG("VP_Base=%x\n", xive->vp_base);
> + if (xive->vp_base == XIVE_INVALID_VP)
> + ret = -ENOMEM;
> +
> + if (ret) {
> + kfree(xive);
> + return ret;
> + }
> +
> + return 0;
> +}
> +
> +
> +static int xive_debug_show(struct seq_file *m, void *private)
> +{
> + struct kvmppc_xive *xive = m->private;
> + struct kvm *kvm = xive->kvm;
> + struct kvm_vcpu *vcpu;
> + u64 t_rm_h_xirr = 0;
> + u64 t_rm_h_ipoll = 0;
> + u64 t_rm_h_cppr = 0;
> + u64 t_rm_h_eoi = 0;
> + u64 t_rm_h_ipi = 0;
> + u64 t_vm_h_xirr = 0;
> + u64 t_vm_h_ipoll = 0;
> + u64 t_vm_h_cppr = 0;
> + u64 t_vm_h_eoi = 0;
> + u64 t_vm_h_ipi = 0;
> + unsigned int i;
> +
> + if (!kvm)
> + return 0;
> +
> + seq_printf(m, "=========\nVCPU state\n=========\n");
> +
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> +
> + if (!xc)
> + continue;
> +
> + seq_printf(m, "cpu server %#x CPPR:%#x HWCPPR:%#x"
> + " MFRR:%#x PEND:%#x h_xirr: R=%lld V=%lld\n",
> + xc->server_num, xc->cppr, xc->hw_cppr,
> + xc->mfrr, xc->pending,
> + xc->stat_rm_h_xirr, xc->stat_vm_h_xirr);
> + t_rm_h_xirr += xc->stat_rm_h_xirr;
> + t_rm_h_ipoll += xc->stat_rm_h_ipoll;
> + t_rm_h_cppr += xc->stat_rm_h_cppr;
> + t_rm_h_eoi += xc->stat_rm_h_eoi;
> + t_rm_h_ipi += xc->stat_rm_h_ipi;
> + t_vm_h_xirr += xc->stat_vm_h_xirr;
> + t_vm_h_ipoll += xc->stat_vm_h_ipoll;
> + t_vm_h_cppr += xc->stat_vm_h_cppr;
> + t_vm_h_eoi += xc->stat_vm_h_eoi;
> + t_vm_h_ipi += xc->stat_vm_h_ipi;
> + }
> +
> + seq_printf(m, "Hcalls totals\n");
> + seq_printf(m, " H_XIRR R=%10lld V=%10lld\n", t_rm_h_xirr, t_vm_h_xirr);
> + seq_printf(m, " H_IPOLL R=%10lld V=%10lld\n", t_rm_h_ipoll, t_vm_h_ipoll);
> + seq_printf(m, " H_CPPR R=%10lld V=%10lld\n", t_rm_h_cppr, t_vm_h_cppr);
> + seq_printf(m, " H_EOI R=%10lld V=%10lld\n", t_rm_h_eoi, t_vm_h_eoi);
> + seq_printf(m, " H_IPI R=%10lld V=%10lld\n", t_rm_h_ipi, t_vm_h_ipi);
> +
> + return 0;
> +}
> +
> +static int xive_debug_open(struct inode *inode, struct file *file)
> +{
> + return single_open(file, xive_debug_show, inode->i_private);
> +}
> +
> +static const struct file_operations xive_debug_fops = {
> + .open = xive_debug_open,
> + .read = seq_read,
> + .llseek = seq_lseek,
> + .release = single_release,
> +};
> +
> +static void xive_debugfs_init(struct kvmppc_xive *xive)
> +{
> + char *name;
> +
> + name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive);
> + if (!name) {
> + pr_err("%s: no memory for name\n", __func__);
> + return;
> + }
> +
> + xive->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root,
> + xive, &xive_debug_fops);
> +
> + pr_debug("%s: created %s\n", __func__, name);
> + kfree(name);
> +}
> +
> +static void kvmppc_xive_init(struct kvm_device *dev)
> +{
> + struct kvmppc_xive *xive = (struct kvmppc_xive *)dev->private;
> +
> + /* Register some debug interfaces */
> + xive_debugfs_init(xive);
> +}
> +
> +struct kvm_device_ops kvm_xive_ops = {
> + .name = "kvm-xive",
> + .create = kvmppc_xive_create,
> + .init = kvmppc_xive_init,
> + .destroy = kvmppc_xive_free,
> + .set_attr = xive_set_attr,
> + .get_attr = xive_get_attr,
> + .has_attr = xive_has_attr,
> +};
> +
> +void kvmppc_xive_init_module(void)
> +{
> + __xive_vm_h_xirr = xive_vm_h_xirr;
> + __xive_vm_h_ipoll = xive_vm_h_ipoll;
> + __xive_vm_h_ipi = xive_vm_h_ipi;
> + __xive_vm_h_cppr = xive_vm_h_cppr;
> + __xive_vm_h_eoi = xive_vm_h_eoi;
> +}
> +
> +void kvmppc_xive_exit_module(void)
> +{
> + __xive_vm_h_xirr = NULL;
> + __xive_vm_h_ipoll = NULL;
> + __xive_vm_h_ipi = NULL;
> + __xive_vm_h_cppr = NULL;
> + __xive_vm_h_eoi = NULL;
> +}
> diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
> new file mode 100644
> index 0000000..2b7fdbd
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_xive.h
> @@ -0,0 +1,251 @@
> +/*
> + * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + */
> +
> +#ifndef _KVM_PPC_BOOK3S_XIVE_H
> +#define _KVM_PPC_BOOK3S_XIVE_H
> +
> +#include "book3s_xics.h"
> +
> +/* State for one guest irq source.
> + *
> + * For each guest source we allocate a HW interrupt in the XIVE
> + * which we use for all SW triggers. It will be unused for
> + * pass-through but it's easier to keep around as the same
> + * guest interrupt can alternatively be emulated or passed through
> + * if a physical device is hot unplugged and replaced with an
> + * emulated one.
> + *
> + * This state structure is very similar to the XICS one with
> + * additional XIVE specific tracking.
> + */
> +struct kvmppc_xive_irq_state {
> + bool valid; /* Interrupt entry is valid */
> +
> + u32 number; /* Guest IRQ number */
> + u32 ipi_number; /* XIVE IPI HW number */
> + struct xive_irq_data ipi_data; /* XIVE IPI associated data */
> + u32 pt_number; /* XIVE Pass-through number if any */
> + struct xive_irq_data *pt_data; /* XIVE Pass-through associated data */
> +
> + /* Targetting as set by guest */
> + u32 guest_server; /* Current guest selected target */
> + u8 guest_priority; /* Guest set priority */
> + u8 saved_priority; /* Saved priority when masking */
> +
> + /* Actual targetting */
> + u32 act_server; /* Actual server */
> + u8 act_priority; /* Actual priority */
> +
> + /* Various state bits */
> + bool in_eoi; /* Synchronize with H_EOI */
> + bool old_p; /* P bit state when masking */
> + bool old_q; /* Q bit state when masking */
> + bool lsi; /* level-sensitive interrupt */
> + bool asserted; /* Only for emulated LSI: current state */
> +
> + /* Saved for migration state */
> + bool in_queue;
> + bool saved_p;
> + bool saved_q;
> + u8 saved_scan_prio;
> +};
> +
> +/* Select the "right" interrupt (IPI vs. passthrough) */
> +static inline void kvmppc_xive_select_irq(struct kvmppc_xive_irq_state *state,
> + u32 *out_hw_irq,
> + struct xive_irq_data **out_xd)
> +{
> + if (state->pt_number) {
> + if (out_hw_irq)
> + *out_hw_irq = state->pt_number;
> + if (out_xd)
> + *out_xd = state->pt_data;
> + } else {
> + if (out_hw_irq)
> + *out_hw_irq = state->ipi_number;
> + if (out_xd)
> + *out_xd = &state->ipi_data;
> + }
> +}
> +
> +/* This corresponds to an "ICS" in XICS terminology, we use it
> + * as a means to break up source information into multiple structures
> + */
> +struct kvmppc_xive_src_block {
> + arch_spinlock_t lock;
> + u16 id;
> + struct kvmppc_xive_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
> +};
> +
> +
> +struct kvmppc_xive {
> + struct kvm *kvm;
> + struct kvm_device *dev;
> + struct dentry *dentry;
> +
> + /* VP block associated with the VM */
> + u32 vp_base;
> +
> + /* Blocks of sources */
> + struct kvmppc_xive_src_block *src_blocks[KVMPPC_XICS_MAX_ICS_ID + 1];
> + u32 max_sbid;
> +
> + /*
> + * For state save, we lazily scan the queues on the first interrupt
> + * being migrated. We don't have a clean way to reset those flags
> + * so we keep track of the number of valid sources and how many of
> + * them were migrated so we can reset when all of them have been
> + * processed.
> + */
> + u32 src_count;
> + u32 saved_src_count;
> +
> + /*
> + * Some irqs are delayed on restore until the source is created,
> + * keep track here of how many of them
> + */
> + u32 delayed_irqs;
> +
> + /* Which queues (priorities) are in use by the guest */
> + u8 qmap;
> +
> + /* Queue orders */
> + u32 q_order;
> + u32 q_alloc_order;
> +
> +};
> +
> +#define KVMPPC_XIVE_Q_COUNT 8
> +
> +struct kvmppc_xive_vcpu {
> + struct kvmppc_xive *xive;
> + struct kvm_vcpu *vcpu;
> + bool valid;
> +
> + /* Server number. This is the HW CPU ID from a guest perspective */
> + u32 server_num;
> +
> + /* HW VP corresponding to this VCPU. This is the base of the VP
> + * block plus the server number
> + */
> + u32 vp_id;
> + u32 vp_chip_id;
> + u32 vp_cam;
> +
> + /* IPI used for sending ... IPIs */
> + u32 vp_ipi;
> + struct xive_irq_data vp_ipi_data;
> +
> + /* Local emulation state */
> + uint8_t cppr; /* guest CPPR */
> + uint8_t hw_cppr;/* Hardware CPPR */
> + uint8_t mfrr;
> + uint8_t pending;
> +
> + /* Each VP has 8 queues though we only provision some */
> + struct xive_q queues[KVMPPC_XIVE_Q_COUNT];
> + u32 esc_virq[KVMPPC_XIVE_Q_COUNT];
> + char *esc_virq_names[KVMPPC_XIVE_Q_COUNT];
> +
> + /* Stash a delayed irq on restore from migration (see set_icp) */
> + u32 delayed_irq;
> +
> + /* Stats */
> + u64 stat_rm_h_xirr;
> + u64 stat_rm_h_ipoll;
> + u64 stat_rm_h_cppr;
> + u64 stat_rm_h_eoi;
> + u64 stat_rm_h_ipi;
> + u64 stat_vm_h_xirr;
> + u64 stat_vm_h_ipoll;
> + u64 stat_vm_h_cppr;
> + u64 stat_vm_h_eoi;
> + u64 stat_vm_h_ipi;
> +};
> +
> +static inline struct kvm_vcpu *kvmppc_xive_find_server(struct kvm *kvm, u32 nr)
> +{
> + struct kvm_vcpu *vcpu = NULL;
> + int i;
> +
> + kvm_for_each_vcpu(i, vcpu, kvm) {
> + if (vcpu->arch.xive_vcpu && nr == vcpu->arch.xive_vcpu->server_num)
> + return vcpu;
> + }
> + return NULL;
> +}
> +
> +static inline struct kvmppc_xive_src_block *kvmppc_xive_find_source(struct kvmppc_xive *xive,
> + u32 irq, u16 *source)
> +{
> + u32 bid = irq >> KVMPPC_XICS_ICS_SHIFT;
> + u16 src = irq & KVMPPC_XICS_SRC_MASK;
> +
> + if (source)
> + *source = src;
> + if (bid > KVMPPC_XICS_MAX_ICS_ID)
> + return NULL;
> + return xive->src_blocks[bid];
> +}
> +
> +/*
> + * Mapping between guest priorities and host priorities
> + * is as follows.
> + *
> + * Guest requests for priorities 0...6 are honored. A request for
> + * anything higher results in a priority of 7 being applied.
> + *
> + * However, when XIRR is returned via H_XIRR, 7 is translated to 0xb
> + * in order to match AIX expectations
> + *
> + * Similar mapping is done for CPPR values
> + */
> +static inline u8 xive_prio_from_guest(u8 prio)
> +{
> + if (prio == 0xff || prio < 8)
> + return prio;
> + return 7;
> +}
> +
> +static inline u8 xive_prio_to_guest(u8 prio)
> +{
> + if (prio == 0xff || prio < 7)
> + return prio;
> + return 0xb;
> +}
> +
> +static inline u32 __xive_read_eq(__be32 *qpage, u32 msk, u32 *idx, u32 *toggle)
> +{
> + u32 cur;
> +
> + if (!qpage)
> + return 0;
> + cur = be32_to_cpup(qpage + *idx);
> + if ((cur >> 31) == *toggle)
> + return 0;
> + *idx = (*idx + 1) & msk;
> + if (*idx == 0)
> + (*toggle) ^= 1;
> + return cur & 0x7fffffff;
> +}
> +
> +extern unsigned long xive_rm_h_xirr(struct kvm_vcpu *vcpu);
> +extern unsigned long xive_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server);
> +extern int xive_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
> + unsigned long mfrr);
> +extern int xive_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
> +extern int xive_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
> +
> +extern unsigned long (*__xive_vm_h_xirr)(struct kvm_vcpu *vcpu);
> +extern unsigned long (*__xive_vm_h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server);
> +extern int (*__xive_vm_h_ipi)(struct kvm_vcpu *vcpu, unsigned long server,
> + unsigned long mfrr);
> +extern int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr);
> +extern int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr);
> +
> +#endif /* _KVM_PPC_BOOK3S_XIVE_H */
> diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c
> new file mode 100644
> index 0000000..b28c264
> --- /dev/null
> +++ b/arch/powerpc/kvm/book3s_xive_template.c
> @@ -0,0 +1,490 @@
> +/*
> + * Copyright 2017 Benjamin Herrenschmidt, IBM Corporation
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License, version 2, as
> + * published by the Free Software Foundation.
> + */
> +
> +/* File to be included by other .c files */
> +
> +#define XGLUE(a,b) a##b
> +#define GLUE(a,b) XGLUE(a,b)
> +
> +static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc)
> +{
> + u8 cppr;
> + u16 ack;
> +
> + /* XXX DD1 bug workaround: Check PIPR vs. CPPR first ! */
> +
> + /* Perform the acknowledge OS to register cycle. */
> + ack = be16_to_cpu(__x_readw(__x_tm_area + TM_SPC_ACK_OS_REG));
> +
> + /* Synchronize subsequent queue accesses */
> + mb();
> +
> + /* XXX Check grouping level */
> +
> + /* Anything ? */
> + if (!((ack >> 8) & TM_QW1_NSR_EO))
> + return;
> +
> + /* Grab CPPR of the most favored pending interrupt */
> + cppr = ack & 0xff;
> + if (cppr < 8)
> + xc->pending |= 1 << cppr;
> +
> +#ifdef XIVE_RUNTIME_CHECKS
> + /* Check consistency */
> + if (cppr >= xc->hw_cppr)
> + pr_warn("KVM-XIVE: CPU %d odd ack CPPR, got %d at %d\n",
> + smp_processor_id(), cppr, xc->hw_cppr);
> +#endif
> +
> + /* Update our image of the HW CPPR. We don't yet modify
> + * xc->cppr, this will be done as we scan for interrupts
> + * in the queues.
> + */
> + xc->hw_cppr = cppr;
> +}
> +
> +static u8 GLUE(X_PFX,esb_load)(struct xive_irq_data *xd, u32 offset)
> +{
> + u64 val;
> +
> + if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
> + offset |= offset << 4;
> +
> + val = __x_readq(__x_eoi_page(xd) + offset);
> +#ifdef __LITTLE_ENDIAN__
> + val >>= 64-8;
> +#endif
> + return (u8)val;
> +}
> +
> +
> +static void GLUE(X_PFX,source_eoi)(u32 hw_irq, struct xive_irq_data *xd)
> +{
> + /* If the XIVE supports the new "store EOI facility, use it */
> + if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
> + __x_writeq(0, __x_eoi_page(xd));
> + else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) {
> + opal_int_eoi(hw_irq);
> + } else {
> + uint64_t eoi_val;
> +
> + /* Otherwise for EOI, we use the special MMIO that does
> + * a clear of both P and Q and returns the old Q.
> + *
> + * This allows us to then do a re-trigger if Q was set
> + * rather than synthesizing an interrupt in software
> + */
> + eoi_val = GLUE(X_PFX,esb_load)(xd, XIVE_ESB_SET_PQ_00);
> + if ((xd->flags & XIVE_IRQ_FLAG_LSI) || !(eoi_val & 1))
> + return;
> +
> + /* Re-trigger */
> + if (__x_trig_page(xd))
> + __x_writeq(0, __x_trig_page(xd));
> + }
> +
> +}
> +
> +enum {
> + scan_fetch,
> + scan_poll,
> + scan_eoi,
> +};
> +
> +static u32 GLUE(X_PFX,scan_interrupts)(struct kvmppc_xive_vcpu *xc,
> + u8 pending, int scan_type)
> +{
> + u32 hirq = 0;
> + u8 prio = 0xff;
> +
> + /* Find highest pending priority */
> + while ((xc->mfrr != 0xff || pending != 0) && hirq == 0) {
> + struct xive_q *q;
> + u32 idx, toggle;
> + __be32 *qpage;
> +
> + /*
> + * If pending is 0 this will return 0xff which is what
> + * we want
> + */
> + prio = ffs(pending) - 1;
> +
> + /*
> + * If the most favored prio we found pending is less
> + * favored than (or equal to) a pending IPI, we return
> + * the IPI instead.
> + *
> + * Note: If pending was 0 and mfrr is 0xff, we will
> + * not spuriously take an IPI because mfrr cannot
> + * then be smaller than cppr.
> + */
> + if (prio >= xc->mfrr && xc->mfrr < xc->cppr) {
> + prio = xc->mfrr;
> + hirq = XICS_IPI;
> + break;
> + }
> +
> + /* Don't scan past the guest cppr */
> + if (prio >= xc->cppr || prio > 7)
> + break;
> +
> + /* Grab queue and pointers */
> + q = &xc->queues[prio];
> + idx = q->idx;
> + toggle = q->toggle;
> +
> + /*
> + * Snapshot the queue page. The test further down for EOI
> + * must use the same "copy" that was used by __xive_read_eq
> + * since qpage can be set concurrently and we don't want
> + * to miss an EOI.
> + */
> + qpage = READ_ONCE(q->qpage);
> +
> + skip_ipi:
> + /* Try to fetch from the queue. Will return 0 for a
> + * non-queueing priority (ie, qpage = 0)
> + */
> + hirq = __xive_read_eq(qpage, q->msk, &idx, &toggle);
> +
> + /*
> + * If this was a signal for an MFRR change done by
> + * H_IPI we skip it. Additionally, if we were fetching
> + * we EOI it now, thus re-enabling reception of a new
> + * such signal.
> + *
> + * We also need to do that if prio is 0 and we had no
> + * page for the queue. In this case, we have non-queued
> + * IPI that needs to be EOId.
> + *
> + * This is safe because if we have another pending MFRR
> + * change that wasn't observed above, the Q bit will have
> + * been set and another occurrence of the IPI will trigger.
> + */
> + if (hirq == XICS_IPI || (prio == 0 && !qpage)) {
> + if (scan_type == scan_fetch)
> + GLUE(X_PFX,source_eoi)(xc->vp_ipi,
> + &xc->vp_ipi_data);
> + /* Loop back on same queue with updated idx/toggle */
> +#ifdef XIVE_RUNTIME_CHECKS
> + WARN_ON(hirq && hirq != XICS_IPI);
> +#endif
> + if (hirq)
> + goto skip_ipi;
> + }
> +
> + /* If fetching, update queue pointers */
> + if (scan_type == scan_fetch) {
> + q->idx = idx;
> + q->toggle = toggle;
> + }
> +
> + /* Something found, stop searching */
> + if (hirq)
> + break;
> +
> + /* Clear the pending bit on the now empty queue */
> + pending &= ~(1 << prio);
> +
> + /*
> + * Check if the queue count needs adjusting due to
> + * interrupts being moved away.
> + */
> + if (atomic_read(&q->pending_count)) {
> + int p = atomic_xchg(&q->pending_count, 0);
> + if (p) {
> +#ifdef XIVE_RUNTIME_CHECKS
> + WARN_ON(p > atomic_read(&q->count));
> +#endif
> + atomic_sub(p, &q->count);
> + }
> + }
> + }
> +
> + /* If we are just taking a "peek", do nothing else */
> + if (scan_type == scan_poll)
> + return hirq;
> +
> + /* Update the pending bits */
> + xc->pending = pending;
> +
> + /* If this is an EOI that's it, no CPPR adjustment done here,
> + * all we needed was to clean up the stale pending bits and check
> + * if there's anything left.
> + */
> + if (scan_type == scan_eoi)
> + return hirq;
> +
> + /* If we found an interrupt, adjust what the guest CPPR should
> + * be as if we had just fetched that interrupt from HW
> + */
> + if (hirq)
> + xc->cppr = prio;
> + /*
> + * If it was an IPI the HW CPPR might have been lowered too much
> + * as the HW interrupt we use for IPIs is routed to priority 0.
> + *
> + * We re-sync it here.
> + */
> + if (xc->cppr != xc->hw_cppr) {
> + xc->hw_cppr = xc->cppr;
> + __x_writeb(xc->cppr, __x_tm_area + TM_QW1_OS + TM_CPPR);
> + }
> +
> + return hirq;
> +}
> +
> +X_STATIC unsigned long GLUE(X_PFX,h_xirr)(struct kvm_vcpu *vcpu)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + u8 old_cppr;
> + u32 hirq;
> +
> + DBG("H_XIRR\n");
> +
> + xc->GLUE(X_STAT_PFX,h_xirr)++;
> +
> + /* First collect pending bits from HW */
> + GLUE(X_PFX,ack_pending)(xc);
> +
> + /* Cleanup the old-style bits if needed (they may have been
> + * set by a pull or an escalation interrupt)
> + */
> + if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions))
> + clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
> + &vcpu->arch.pending_exceptions);
> +
> + DBG(" new pending=0x%02x hw_cppr=%d cppr=%d\n",
> + xc->pending, xc->hw_cppr, xc->cppr);
> +
> + /* Grab previous CPPR and reverse map it */
> + old_cppr = xive_prio_to_guest(xc->cppr);
> +
> + /* Scan for actual interrupts */
> + hirq = GLUE(X_PFX,scan_interrupts)(xc, xc->pending, scan_fetch);
> +
> + DBG(" got hirq=0x%x hw_cppr=%d cppr=%d\n",
> + hirq, xc->hw_cppr, xc->cppr);
> +
> +#ifdef XIVE_RUNTIME_CHECKS
> + /* That should never hit */
> + if (hirq & 0xff000000)
> + pr_warn("XIVE: Weird guest interrupt number 0x%08x\n", hirq);
> +#endif
> +
> + /*
> + * XXX We could check if the interrupt is masked here and
> + * filter it. If we chose to do so, we would need to do:
> + *
> + * if (masked) {
> + * lock();
> + * if (masked) {
> + * old_Q = true;
> + * hirq = 0;
> + * }
> + * unlock();
> + * }
> + */
> +
> + /* Return interrupt and old CPPR in GPR4 */
> + vcpu->arch.gpr[4] = hirq | (old_cppr << 24);
> +
> + return H_SUCCESS;
> +}
> +
> +X_STATIC unsigned long GLUE(X_PFX,h_ipoll)(struct kvm_vcpu *vcpu, unsigned long server)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + u8 pending = xc->pending;
> + u32 hirq;
> + u8 pipr;
> +
> + DBG("H_IPOLL(server=%ld)\n", server);
> +
> + xc->GLUE(X_STAT_PFX,h_ipoll)++;
> +
> + /* Grab the target VCPU if not the current one */
> + if (xc->server_num != server) {
> + vcpu = kvmppc_xive_find_server(vcpu->kvm, server);
> + if (!vcpu)
> + return H_PARAMETER;
> + xc = vcpu->arch.xive_vcpu;
> +
> + /* Scan all priorities */
> + pending = 0xff;
> + } else {
> + /* Grab pending interrupt if any */
> + pipr = __x_readb(__x_tm_area + TM_QW1_OS + TM_PIPR);
> + if (pipr < 8)
> + pending |= 1 << pipr;
> + }
> +
> + hirq = GLUE(X_PFX,scan_interrupts)(xc, pending, scan_poll);
> +
> + /* Return interrupt and old CPPR in GPR4 */
> + vcpu->arch.gpr[4] = hirq | (xc->cppr << 24);
> +
> + return H_SUCCESS;
> +}
> +
> +static void GLUE(X_PFX,push_pending_to_hw)(struct kvmppc_xive_vcpu *xc)
> +{
> + u8 pending, prio;
> +
> + pending = xc->pending;
> + if (xc->mfrr != 0xff) {
> + if (xc->mfrr < 8)
> + pending |= 1 << xc->mfrr;
> + else
> + pending |= 0x80;
> + }
> + if (!pending)
> + return;
> + prio = ffs(pending) - 1;
> +
> + __x_writeb(prio, __x_tm_area + TM_SPC_SET_OS_PENDING);
> +}
> +
> +X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + u8 old_cppr;
> +
> + DBG("H_CPPR(cppr=%ld)\n", cppr);
> +
> + xc->GLUE(X_STAT_PFX,h_cppr)++;
> +
> + /* Map CPPR */
> + cppr = xive_prio_from_guest(cppr);
> +
> + /* Remember old and update SW state */
> + old_cppr = xc->cppr;
> + xc->cppr = cppr;
> +
> + /*
> + * We are masking less, we need to look for pending things
> + * to deliver and set VP pending bits accordingly to trigger
> + * a new interrupt; otherwise we might miss MFRR changes for
> + * which we have optimized out sending an IPI signal.
> + */
> + if (cppr > old_cppr)
> + GLUE(X_PFX,push_pending_to_hw)(xc);
> +
> + /* Apply new CPPR */
> + xc->hw_cppr = cppr;
> + __x_writeb(cppr, __x_tm_area + TM_QW1_OS + TM_CPPR);
> +
> + return H_SUCCESS;
> +}
> +
> +X_STATIC int GLUE(X_PFX,h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr)
> +{
> + struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
> + struct kvmppc_xive_src_block *sb;
> + struct kvmppc_xive_irq_state *state;
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> + struct xive_irq_data *xd;
> + u8 new_cppr = xirr >> 24;
> + u32 irq = xirr & 0x00ffffff, hw_num;
> + u16 src;
> + int rc = 0;
> +
> + DBG("H_EOI(xirr=%08lx)\n", xirr);
> +
> + xc->GLUE(X_STAT_PFX,h_eoi)++;
> +
> + xc->cppr = xive_prio_from_guest(new_cppr);
> +
> + /*
> + * IPIs are synthesized from MFRR and thus don't need
> + * any special EOI handling. The underlying interrupt
> + * used to signal MFRR changes is EOId when fetched from
> + * the queue.
> + */
> + if (irq == XICS_IPI || irq == 0)
> + goto bail;
> +
> + /* Find interrupt source */
> + sb = kvmppc_xive_find_source(xive, irq, &src);
> + if (!sb) {
> + DBG(" source not found !\n");
> + rc = H_PARAMETER;
> + goto bail;
> + }
> + state = &sb->irq_state[src];
> + kvmppc_xive_select_irq(state, &hw_num, &xd);
> +
> + state->in_eoi = true;
> + mb();
> +
> + again:
> + if (state->guest_priority == MASKED) {
> + arch_spin_lock(&sb->lock);
> + if (state->guest_priority != MASKED) {
> + arch_spin_unlock(&sb->lock);
> + goto again;
> + }
> + DBG(" EOI on saved P...\n");
> +
> + /* Clear old_p, that will cause unmask to perform an EOI */
> + state->old_p = false;
> +
> + arch_spin_unlock(&sb->lock);
> + } else {
> + DBG(" EOI on source...\n");
> +
> + /* Perform EOI on the source */
> + GLUE(X_PFX,source_eoi)(hw_num, xd);
> +
> + /* If it's an emulated LSI, check level and resend */
> + if (state->lsi && state->asserted)
> + __x_writeq(0, __x_trig_page(xd));
> +
> + }
> +
> + mb();
> + state->in_eoi = false;
> + bail:
> +
> + /* Re-evaluate pending IRQs and update HW */
> + GLUE(X_PFX,scan_interrupts)(xc, xc->pending, scan_eoi);
> + GLUE(X_PFX,push_pending_to_hw)(xc);
> + DBG(" after scan pending=%02x\n", xc->pending);
> +
> + /* Apply new CPPR */
> + xc->hw_cppr = xc->cppr;
> + __x_writeb(xc->cppr, __x_tm_area + TM_QW1_OS + TM_CPPR);
> +
> + return rc;
> +}
> +
> +X_STATIC int GLUE(X_PFX,h_ipi)(struct kvm_vcpu *vcpu, unsigned long server,
> + unsigned long mfrr)
> +{
> + struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
> +
> + DBG("H_IPI(server=%08lx,mfrr=%ld)\n", server, mfrr);
> +
> + xc->GLUE(X_STAT_PFX,h_ipi)++;
> +
> + /* Find target */
> + vcpu = kvmppc_xive_find_server(vcpu->kvm, server);
> + if (!vcpu)
> + return H_PARAMETER;
> + xc = vcpu->arch.xive_vcpu;
> +
> + /* Locklessly write over MFRR */
> + xc->mfrr = mfrr;
> +
> + /* Shoot the IPI if more favored than the target CPPR */
> + if (mfrr < xc->cppr)
> + __x_writeq(0, __x_trig_page(&xc->vp_ipi_data));
> +
> + return H_SUCCESS;
> +}
> diff --git a/arch/powerpc/kvm/irq.h b/arch/powerpc/kvm/irq.h
> index 5a9a10b..3f1be85 100644
> --- a/arch/powerpc/kvm/irq.h
> +++ b/arch/powerpc/kvm/irq.h
> @@ -12,6 +12,7 @@ static inline int irqchip_in_kernel(struct kvm *kvm)
> #endif
> #ifdef CONFIG_KVM_XICS
> ret = ret || (kvm->arch.xics != NULL);
> + ret = ret || (kvm->arch.xive != NULL);
> #endif
> smp_rmb();
> return ret;
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index 95c91a9..de79bd72 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -37,6 +37,8 @@
> #include <asm/cputhreads.h>
> #include <asm/irqflags.h>
> #include <asm/iommu.h>
> +#include <asm/xive.h>
> +
> #include "timing.h"
> #include "irq.h"
> #include "../mm/mmu_decl.h"
> @@ -699,7 +701,10 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
> kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu);
> break;
> case KVMPPC_IRQ_XICS:
> - kvmppc_xics_free_icp(vcpu);
> + if (xive_enabled())
> + kvmppc_xive_cleanup_vcpu(vcpu);
> + else
> + kvmppc_xics_free_icp(vcpu);
> break;
> }
>
> @@ -1219,8 +1224,12 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
>
> r = -EPERM;
> dev = kvm_device_from_filp(f.file);
> - if (dev)
> - r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]);
> + if (dev) {
> + if (xive_enabled())
> + r = kvmppc_xive_connect_vcpu(dev, vcpu, cap->args[1]);
> + else
> + r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]);
> + }
>
> fdput(f);
> break;
> @@ -1244,7 +1253,7 @@ bool kvm_arch_intc_initialized(struct kvm *kvm)
> return true;
> #endif
> #ifdef CONFIG_KVM_XICS
> - if (kvm->arch.xics)
> + if (kvm->arch.xics || kvm->arch.xive)
> return true;
> #endif
> return false;
> diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
> index e0f856b..d71cd77 100644
> --- a/arch/powerpc/platforms/powernv/opal.c
> +++ b/arch/powerpc/platforms/powernv/opal.c
> @@ -890,3 +890,4 @@ EXPORT_SYMBOL_GPL(opal_leds_set_ind);
> EXPORT_SYMBOL_GPL(opal_write_oppanel_async);
> /* Export this for KVM */
> EXPORT_SYMBOL_GPL(opal_int_set_mfrr);
> +EXPORT_SYMBOL_GPL(opal_int_eoi);
> diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
> index 96037e0..6429cd3 100644
> --- a/arch/powerpc/sysdev/xive/common.c
> +++ b/arch/powerpc/sysdev/xive/common.c
> @@ -45,12 +45,14 @@
> #endif
>
> bool __xive_enabled;
> +EXPORT_SYMBOL_GPL(__xive_enabled);
> bool xive_cmdline_disabled;
>
> /* We use only one priority for now */
> static u8 xive_irq_priority;
>
> void __iomem *xive_tm_area;
> +EXPORT_SYMBOL_GPL(xive_tm_area);
> u32 xive_tm_offset;
> static const struct xive_ops *xive_ops;
> static struct irq_domain *xive_irq_domain;
> @@ -304,7 +306,7 @@ static void xive_irq_eoi(struct irq_data *d)
> DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n",
> d->irq, irqd_to_hwirq(d), xc->pending_prio);
>
> - if (!irqd_irq_disabled(d))
> + if (!irqd_irq_disabled(d) && !irqd_is_forwarded_to_vcpu(d))
> xive_do_source_eoi(irqd_to_hwirq(d), xd);
>
> /*
> @@ -579,9 +581,10 @@ static int xive_irq_set_affinity(struct irq_data *d,
> * Only configure the irq if it's not currently passed-through to
> * a KVM guest
> */
> - rc = xive_ops->configure_irq(hw_irq,
> - get_hard_smp_processor_id(target),
> - xive_irq_priority, d->irq);
> + if (!irqd_is_forwarded_to_vcpu(d))
> + rc = xive_ops->configure_irq(hw_irq,
> + get_hard_smp_processor_id(target),
> + xive_irq_priority, d->irq);
> if (rc < 0) {
> pr_err("XIVE: Error %d reconfiguring irq %d\n", rc, d->irq);
> return rc;
> @@ -661,6 +664,123 @@ static int xive_irq_retrigger(struct irq_data *d)
> return 1;
> }
>
> +static int xive_irq_set_vcpu_affinity(struct irq_data *d, void *state)
> +{
> + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
> + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
> + int rc;
> + u8 pq;
> +
> + /*
> + * We only support this on interrupts that do not require
> + * firmware calls for masking and unmasking
> + */
> + if (xd->flags & XIVE_IRQ_FLAG_MASK_FW)
> + return -EIO;
> +
> + /*
> + * This is called by KVM with state non-NULL for enabling
> + * pass-through or NULL for disabling it
> + */
> + if (state) {
> + irqd_set_forwarded_to_vcpu(d);
> +
> + /* Set it to PQ=10 state to prevent further sends */
> + pq = xive_poke_esb(xd, 0xe00);
Use XIVE_ESB_SET_PQ_xx constants in these xive_poke_esb() calls (as
you have done elsewhere).
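Something like this (assuming XIVE_ESB_SET_PQ_10/11 are the names used
for the 0xe00/0xf00 offsets, adjust if they are called differently):

	/* Set it to PQ=10 state to prevent further sends */
	pq = xive_poke_esb(xd, XIVE_ESB_SET_PQ_10);
	...
	/* If P was set, adjust state to PQ=11 */
	pq = xive_poke_esb(xd, XIVE_ESB_SET_PQ_11);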
> +
> + /* No target ? nothing to do */
> + if (xd->target == XIVE_INVALID_TARGET) {
> + /*
> + * An untargeted interrupt should also have been
> + * masked at the source
> + */
> + WARN_ON(pq & 2);
> +
> + return 0;
> + }
> +
> + /*
> + * If P was set, adjust state to PQ=11 to indicate
> + * that a resend is needed for the interrupt to reach
> + * the guest. Also remember the value of P.
> + *
> + * This also tells us that it's in flight to a host queue
> + * or has already been fetched but hasn't been EOIed yet
> + * by the host. Thus it's potentially using up a host
> + * queue slot. This is important to know because as long
> + * as this is the case, we must not hard-unmask it when
> + * "returning" that interrupt to the host.
> + *
> + * This saved_p is cleared by the host EOI, when we know
> + * for sure the queue slot is no longer in use.
> + */
> + if (pq & 2) {
> + pq = xive_poke_esb(xd, 0xf00);
> + xd->saved_p = true;
> +
> + /*
> + * Sync the XIVE source HW to ensure the interrupt
> + * has gone through the EAS before we change its
> + * target to the guest. That should guarantee us
> + * that we *will* eventually get an EOI for it on
> + * the host. Otherwise there would be a small window
> + * where P would be seen here while the interrupt
> + * actually goes to the guest queue.
> + */
> + if (xive_ops->sync_source)
> + xive_ops->sync_source(hw_irq);
> + } else
> + xd->saved_p = false;
> + } else {
> + irqd_clr_forwarded_to_vcpu(d);
> +
> + /* No host target ? hard mask and return */
> + if (xd->target == XIVE_INVALID_TARGET) {
> + xive_do_source_set_mask(xd, true);
> + return 0;
> + }
> +
> + /*
> + * Sync the XIVE source HW to ensure the interrupt
> + * has gone through the EAS before we change its
> + * target to the host.
> + */
> + if (xive_ops->sync_source)
> + xive_ops->sync_source(hw_irq);
> +
> + /*
> + * By convention we are called with the interrupt in
> + * a PQ=10 or PQ=11 state, ie, it won't fire and will
> + * have latched in Q whether there's a pending HW
> + * interrupt or not.
> + *
> + * First reconfigure the target.
> + */
> + rc = xive_ops->configure_irq(hw_irq,
> + get_hard_smp_processor_id(xd->target),
> + xive_irq_priority, d->irq);
> + if (rc)
> + return rc;
> +
> + /*
> + * Then if saved_p is not set, effectively re-enable the
> + * interrupt with an EOI. If it is set, we know there is
> + * still a message in a host queue somewhere that will be
> + * EOId eventually.
> + *
> + * Note: We don't check irqd_irq_disabled(). Effectively,
> + * we *will* let the irq get through even if masked if the
> + * HW is still firing it in order to deal with the whole
> + * saved_p business properly. If the interrupt triggers
> + * while masked, the generic code will re-mask it anyway.
> + */
> + if (!xd->saved_p)
> + xive_do_source_eoi(hw_irq, xd);
> +
> + }
> + return 0;
> +}
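Just to check my reading of the call flow here: KVM ends up driving
this through the generic irq_set_vcpu_affinity() API when moving an
interrupt in and out of pass-through, i.e. roughly (a sketch only;
host_irq and state are whatever the KVM side passes in):

	/* hand the source over to the guest */
	rc = irq_set_vcpu_affinity(host_irq, state);
	...
	/* and later give it back to the host */
	rc = irq_set_vcpu_affinity(host_irq, NULL);

If so, fine.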
> +
> static struct irq_chip xive_irq_chip = {
> .name = "XIVE-IRQ",
> .irq_startup = xive_irq_startup,
> @@ -671,12 +791,14 @@ static struct irq_chip xive_irq_chip = {
> .irq_set_affinity = xive_irq_set_affinity,
> .irq_set_type = xive_irq_set_type,
> .irq_retrigger = xive_irq_retrigger,
> + .irq_set_vcpu_affinity = xive_irq_set_vcpu_affinity,
> };
>
> bool is_xive_irq(struct irq_chip *chip)
> {
> return chip == &xive_irq_chip;
> }
> +EXPORT_SYMBOL_GPL(is_xive_irq);
>
> void xive_cleanup_irq_data(struct xive_irq_data *xd)
> {
> @@ -691,6 +813,7 @@ void xive_cleanup_irq_data(struct xive_irq_data *xd)
> xd->trig_mmio = NULL;
> }
> }
> +EXPORT_SYMBOL_GPL(xive_cleanup_irq_data);
>
> static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw)
> {
> diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
> index 26cc6bf..0130af8 100644
> --- a/arch/powerpc/sysdev/xive/native.c
> +++ b/arch/powerpc/sysdev/xive/native.c
> @@ -27,6 +27,7 @@
> #include <asm/errno.h>
> #include <asm/xive.h>
> #include <asm/opal.h>
> +#include <asm/kvm_ppc.h>
>
> #include "xive-regs.h"
> #include "xive-internal.h"
> @@ -98,6 +99,7 @@ int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data)
> }
> return 0;
> }
> +EXPORT_SYMBOL_GPL(xive_native_populate_irq_data);
>
> int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq)
> {
> @@ -111,6 +113,8 @@ int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq)
> }
> return rc == 0 ? 0 : -ENXIO;
> }
> +EXPORT_SYMBOL_GPL(xive_native_configure_irq);
> +
>
> /* This can be called multiple time to change a queue configuration */
> int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
> @@ -187,6 +191,7 @@ int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
> fail:
> return rc;
> }
> +EXPORT_SYMBOL_GPL(xive_native_configure_queue);
>
> static void __xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio)
> {
> @@ -211,6 +216,7 @@ void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio)
> iounmap(q->eoi_mmio);
> q->eoi_mmio = NULL;
> }
> +EXPORT_SYMBOL_GPL(xive_native_disable_queue);
>
> static int xive_native_setup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio)
> {
> @@ -297,6 +303,7 @@ u32 xive_native_alloc_irq(void)
> return 0;
> return rc;
> }
> +EXPORT_SYMBOL_GPL(xive_native_alloc_irq);
>
> void xive_native_free_irq(u32 irq)
> {
> @@ -307,6 +314,7 @@ void xive_native_free_irq(u32 irq)
> msleep(1);
> }
> }
> +EXPORT_SYMBOL_GPL(xive_native_free_irq);
>
> static void xive_native_put_ipi(unsigned int cpu, struct xive_cpu *xc)
> {
> @@ -406,10 +414,11 @@ static void xive_native_teardown_cpu(unsigned int cpu, struct xive_cpu *xc)
> }
> }
>
> -static void xive_native_sync_source(u32 hw_irq)
> +void xive_native_sync_source(u32 hw_irq)
> {
> opal_xive_sync(XIVE_SYNC_EAS, hw_irq);
> }
> +EXPORT_SYMBOL_GPL(xive_native_sync_source);
>
> static const struct xive_ops xive_native_ops = {
> .populate_irq_data = xive_native_populate_irq_data,
> @@ -468,10 +477,38 @@ static bool xive_parse_provisioning(struct device_node *np)
> return true;
> }
>
> +static void xive_native_setup_pools(void)
> +{
> + u32 max_pir = 0;
> + unsigned int cpu;
> +
> + /*
> + * The HW won't let us enable OS VPs for KVM if we haven't
> + * enabled pool VPs yet, so let's do that. First we find
> + * out our highest HW CPU ID
> + */
> + for_each_possible_cpu(cpu) {
> + u32 hw_id = get_hard_smp_processor_id(cpu);
> + if (hw_id > max_pir)
> + max_pir = hw_id;
> + }
> +
> + /* Allocate a pool big enough */
> + pr_debug("XIVE: Allocating VP block for pool size %d\n",
> + max_pir + 1);
> + xive_pool_vps = xive_native_alloc_vp_block(max_pir + 1);
> + if (WARN_ON(xive_pool_vps == XIVE_INVALID_VP))
> + pr_err("XIVE: No pool VPsvp KVM might not function\n");
> +
> + pr_debug("XIVE: Pool VPs allocated at 0x%x for max_pir 0x%x\n",
> + xive_pool_vps, max_pir);
> +}
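Minor nit: when the allocation fails you WARN and pr_err but then fall
through to the pr_debug with xive_pool_vps still XIVE_INVALID_VP.  An
early return might read a bit better, e.g.:

	if (WARN_ON(xive_pool_vps == XIVE_INVALID_VP)) {
		pr_err("XIVE: No pool VPs, KVM might not function\n");
		return;
	}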
> +
> u32 xive_native_default_eq_shift(void)
> {
> return xive_queue_shift;
> }
> +EXPORT_SYMBOL_GPL(xive_native_default_eq_shift);
>
> bool xive_native_init(void)
> {
> @@ -481,7 +518,7 @@ bool xive_native_init(void)
> struct property *prop;
> u8 max_prio = 7;
> const __be32 *p;
> - u32 val;
> + u32 val, cpu;
> s64 rc;
>
> if (xive_cmdline_disabled)
> @@ -517,6 +554,10 @@ bool xive_native_init(void)
> break;
> }
>
> + /* Configure TM areas for KVM */
> + for_each_possible_cpu(cpu)
> + kvmppc_set_xive_tm_area(cpu, r.start, tm_area);
> +
> /* Grab size of provisionning pages */
> xive_parse_provisioning(np);
>
> @@ -528,6 +569,9 @@ bool xive_native_init(void)
> return false;
> }
>
> + /* Setup some dummy HV pool VPs */
> + xive_native_setup_pools();
> +
> /* Initialize XIVE core with our backend */
> if (!xive_core_init(&xive_native_ops, tm_area, TM_QW3_HV_PHYS,
> max_prio)) {
> @@ -602,3 +646,47 @@ void xive_native_free_vp_block(u32 vp_base)
> pr_warn("XIVE: OPAL error %lld freeing VP block\n", rc);
> }
> EXPORT_SYMBOL_GPL(xive_native_free_vp_block);
> +
> +int xive_native_enable_vp(u32 vp_id)
> +{
> + s64 rc;
> +
> + for (;;) {
> + rc = opal_xive_set_vp_info(vp_id, OPAL_XIVE_VP_ENABLED, 0);
> + if (rc != OPAL_BUSY)
> + break;
> + msleep(1);
> + }
> + return rc ? -EIO : 0;
> +}
> +EXPORT_SYMBOL_GPL(xive_native_enable_vp);
> +
> +int xive_native_disable_vp(u32 vp_id)
> +{
> + s64 rc;
> +
> + for (;;) {
> + rc = opal_xive_set_vp_info(vp_id, 0, 0);
> + if (rc != OPAL_BUSY)
> + break;
> + msleep(1);
> + }
> + return rc ? -EIO : 0;
> +}
> +EXPORT_SYMBOL_GPL(xive_native_disable_vp);
> +
> +int xive_native_get_vp_info(u32 vp_id, u32 *out_cam_id, u32 *out_chip_id)
> +{
> + __be64 vp_cam_be;
> + __be32 vp_chip_id_be;
> + s64 rc;
> +
> + rc = opal_xive_get_vp_info(vp_id, NULL, &vp_cam_be, NULL, &vp_chip_id_be);
> + if (rc)
> + return -EIO;
> + *out_cam_id = be64_to_cpu(vp_cam_be) & 0xffffffffu;
> + *out_chip_id = be32_to_cpu(vp_chip_id_be);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(xive_native_get_vp_info);
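These three look fine.  For my own understanding, I take it the KVM
side uses them roughly like this when wiring up a vcpu (just a sketch,
the variable names below are made up):

	u32 vp_id = vp_base + server;	/* vp_base from xive_native_alloc_vp_block() */
	u32 cam, chip;
	int rc;

	rc = xive_native_enable_vp(vp_id);
	if (!rc)
		rc = xive_native_get_vp_info(vp_id, &cam, &chip);
	...
	/* and on vcpu teardown */
	xive_native_disable_vp(vp_id);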
> diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
> index 2c14ad9..d1a6e55 100644
> --- a/include/linux/kvm_host.h
> +++ b/include/linux/kvm_host.h
> @@ -1165,7 +1165,6 @@ int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type);
> void kvm_unregister_device_ops(u32 type);
>
> extern struct kvm_device_ops kvm_mpic_ops;
> -extern struct kvm_device_ops kvm_xics_ops;
> extern struct kvm_device_ops kvm_arm_vgic_v2_ops;
> extern struct kvm_device_ops kvm_arm_vgic_v3_ops;
>
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index a17d787..1b0da57 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -2839,10 +2839,6 @@ static struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
> [KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
> [KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
> #endif
> -
> -#ifdef CONFIG_KVM_XICS
> - [KVM_DEV_TYPE_XICS] = &kvm_xics_ops,
> -#endif
> };
>
> int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type)
> --
> 2.9.3