[PATCH 06/12] powerpc/xive: Native exploitation of the XIVE interrupt controller

Michael Ellerman mpe at ellerman.id.au
Tue Apr 4 23:03:53 AEST 2017


Benjamin Herrenschmidt <benh at kernel.crashing.org> writes:

> The XIVE interrupt controller is the new interrupt controller
> found in POWER9. It supports advanced virtualization capabilities
> among other things.
>
> Currently we use a set of firmware calls that simulate the old
> "XICS" interrupt controller but this is fairly inefficient.
>
> This adds the framework for using XIVE along with a native
> backend which OPAL for configuration. Later, a backend allowing
               ^
               calls?

> the use in a KVM or PowerVM guest will also be provided.
>
> This disables some fast path for interrupts in KVM when XIVE is
> enabled as these rely on the firmware emulation code which is no
> longer available when the XIVE is used natively by Linux.
>
> A latter patch will make KVM also directly exploit the XIVE, thus
> recovering the lost performance (and more).
>
> Signed-off-by: Benjamin Herrenschmidt <benh at kernel.crashing.org>
> ---
>  arch/powerpc/include/asm/xive.h          |  116 +++
>  arch/powerpc/include/asm/xmon.h          |    2 +
>  arch/powerpc/platforms/powernv/Kconfig   |    2 +
>  arch/powerpc/platforms/powernv/setup.c   |   15 +-
>  arch/powerpc/platforms/powernv/smp.c     |   39 +-
>  arch/powerpc/sysdev/Kconfig              |    1 +
>  arch/powerpc/sysdev/Makefile             |    1 +
>  arch/powerpc/sysdev/xive/Kconfig         |    7 +
>  arch/powerpc/sysdev/xive/Makefile        |    4 +
>  arch/powerpc/sysdev/xive/common.c        | 1175 ++++++++++++++++++++++++++++++
>  arch/powerpc/sysdev/xive/native.c        |  604 +++++++++++++++
>  arch/powerpc/sysdev/xive/xive-internal.h |   51 ++
>  arch/powerpc/sysdev/xive/xive-regs.h     |   88 +++
>  arch/powerpc/xmon/xmon.c                 |   93 ++-
>  14 files changed, 2186 insertions(+), 12 deletions(-)

I'm not going to review this in one go, given it's 10:30pm already.

So just a few things that hit me straight away.

> diff --git a/arch/powerpc/include/asm/xive.h b/arch/powerpc/include/asm/xive.h
> new file mode 100644
> index 0000000..b1604b73
> --- /dev/null
> +++ b/arch/powerpc/include/asm/xive.h
> @@ -0,0 +1,116 @@

Copyright missing.

> +#ifndef _ASM_POWERPC_XIVE_H
> +#define _ASM_POWERPC_XIVE_H
> +
> +#define XIVE_INVALID_VP	0xffffffff
> +
> +#ifdef CONFIG_PPC_XIVE
> +
> +extern void __iomem *xive_tm_area;

I think Paul already commented on "tm" being an overly used acronym.

> +extern u32 xive_tm_offset;
> +
> +/*
> + * Per-irq data (irq_get_handler_data for normal IRQs), IPIs
> + * have it stored in the xive_cpu structure. We also cache
> + * for normal interrupts the current target CPU.
> + */
> +struct xive_irq_data {
> +	/* Setup by backend */
> +	u64 flags;
> +#define XIVE_IRQ_FLAG_STORE_EOI	0x01
> +#define XIVE_IRQ_FLAG_LSI	0x02
> +#define XIVE_IRQ_FLAG_SHIFT_BUG	0x04
> +#define XIVE_IRQ_FLAG_MASK_FW	0x08
> +#define XIVE_IRQ_FLAG_EOI_FW	0x10

I don't love that style; I'd prefer them just prior to the struct.

> +	u64 eoi_page;
> +	void __iomem *eoi_mmio;
> +	u64 trig_page;
> +	void __iomem *trig_mmio;
> +	u32 esb_shift;
> +	int src_chip;

Why not space out the members like you do in xive_q below? I think that
looks better given you have the long __iomem lines.

> +
> +	/* Setup/used by frontend */
> +	int target;
> +	bool saved_p;
> +};
> +#define XIVE_INVALID_CHIP_ID	-1
> +
> +/* A queue tracking structure in a CPU */
> +struct xive_q {
> +	__be32 			*qpage;
> +	u32			msk;
> +	u32			idx;
> +	u32			toggle;
> +	u64			eoi_phys;
> +	void __iomem		*eoi_mmio;
> +	u32			esc_irq;
> +	atomic_t		count;
> +	atomic_t		pending_count;
> +};
> +
> +/*
> + * "magic" ESB MMIO offsets

What's an ESB?

> + */
> +#define XIVE_ESB_GET		0x800
> +#define XIVE_ESB_SET_PQ_00	0xc00
> +#define XIVE_ESB_SET_PQ_01	0xd00
> +#define XIVE_ESB_SET_PQ_10	0xe00
> +#define XIVE_ESB_SET_PQ_11	0xf00
> +#define XIVE_ESB_MASK		XIVE_ESB_SET_PQ_01
> +
> +extern bool __xive_enabled;
> +
> +static inline bool xive_enabled(void) { return __xive_enabled; }
> +
> +extern bool xive_native_init(void);
> +extern void xive_smp_probe(void);
> +extern int  xive_smp_prepare_cpu(unsigned int cpu);
> +extern void xive_smp_setup_cpu(void);
> +extern void xive_smp_disable_cpu(void);
> +extern void xive_kexec_teardown_cpu(int secondary);
> +extern void xive_shutdown(void);
> +extern void xive_flush_interrupt(void);
> +
> +/* xmon hook */
> +extern void xmon_xive_do_dump(int cpu);
> +
> +/* APIs used by KVM */
> +extern u32 xive_native_default_eq_shift(void);
> +extern u32 xive_native_alloc_vp_block(u32 max_vcpus);
> +extern void xive_native_free_vp_block(u32 vp_base);
> +extern int xive_native_populate_irq_data(u32 hw_irq,
> +					 struct xive_irq_data *data);
> +extern void xive_cleanup_irq_data(struct xive_irq_data *xd);
> +extern u32 xive_native_alloc_irq(void);
> +extern void xive_native_free_irq(u32 irq);
> +extern int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq);
> +
> +extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
> +				       __be32 *qpage, u32 order, bool can_escalate);
> +extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio);
> +
> +extern bool __xive_irq_trigger(struct xive_irq_data *xd);
> +extern bool __xive_irq_retrigger(struct xive_irq_data *xd);
> +extern void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd);
> +
> +extern bool is_xive_irq(struct irq_chip *chip);
> +
> +#else
> +
> +static inline bool xive_enabled(void) { return false; }
> +
> +static inline bool xive_native_init(void) { return false; }
> +static inline void xive_smp_probe(void) { }
> +extern inline int  xive_smp_prepare_cpu(unsigned int cpu) { return -EINVAL; }
> +static inline void xive_smp_setup_cpu(void) { }
> +static inline void xive_smp_disable_cpu(void) { }
> +static inline void xive_kexec_teardown_cpu(int secondary) { }
> +static inline void xive_shutdown(void) { }
> +static inline void xive_flush_interrupt(void) { }
> +
> +static inline u32 xive_native_alloc_vp_block(u32 max_vcpus)
> +    { return XIVE_INVALID_VP; }
> +static inline void xive_native_free_vp_block(u32 vp_base) { }
> +
> +#endif
> +
> +#endif /* _ASM_POWERPC_XIVE_H */
> diff --git a/arch/powerpc/include/asm/xmon.h b/arch/powerpc/include/asm/xmon.h
> index 5eb8e59..eb42a0c 100644
> --- a/arch/powerpc/include/asm/xmon.h
> +++ b/arch/powerpc/include/asm/xmon.h
> @@ -29,5 +29,7 @@ static inline void xmon_register_spus(struct list_head *list) { };
>  extern int cpus_are_in_xmon(void);
>  #endif
>  
> +extern void xmon_printf(const char *format, ...);
> +
>  #endif /* __KERNEL __ */
>  #endif /* __ASM_POWERPC_XMON_H */
> diff --git a/arch/powerpc/platforms/powernv/Kconfig b/arch/powerpc/platforms/powernv/Kconfig
> index 3a07e4d..81ee2ed 100644
> --- a/arch/powerpc/platforms/powernv/Kconfig
> +++ b/arch/powerpc/platforms/powernv/Kconfig
> @@ -4,6 +4,8 @@ config PPC_POWERNV
>  	select PPC_NATIVE
>  	select PPC_XICS
>  	select PPC_ICP_NATIVE
> +	select PPC_XIVE
> +	select PPC_XIVE_NATIVE
>  	select PPC_P7_NAP
>  	select PCI
>  	select PCI_MSI
> diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c
> index d50c7d9..adceac9 100644
> --- a/arch/powerpc/platforms/powernv/setup.c
> +++ b/arch/powerpc/platforms/powernv/setup.c
> @@ -32,6 +32,7 @@
>  #include <asm/machdep.h>
>  #include <asm/firmware.h>
>  #include <asm/xics.h>
> +#include <asm/xive.h>
>  #include <asm/opal.h>
>  #include <asm/kexec.h>
>  #include <asm/smp.h>
> @@ -76,7 +77,9 @@ static void __init pnv_init(void)
>  
>  static void __init pnv_init_IRQ(void)
>  {
> -	xics_init();
> +	/* Try using a XIVE if available, otherwise use a XICS */
> +	if (!xive_native_init())
> +		xics_init();
>  
>  	WARN_ON(!ppc_md.get_irq);
>  }
> @@ -218,10 +221,12 @@ static void pnv_kexec_wait_secondaries_down(void)
>  
>  static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
>  {
> -	xics_kexec_teardown_cpu(secondary);
> +	if (xive_enabled())
> +		xive_kexec_teardown_cpu(secondary);
> +	else
> +		xics_kexec_teardown_cpu(secondary);
>  
>  	/* On OPAL, we return all CPUs to firmware */
> -
>  	if (!firmware_has_feature(FW_FEATURE_OPAL))
>  		return;
>  
> @@ -237,6 +242,10 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary)
>  		/* Primary waits for the secondaries to have reached OPAL */
>  		pnv_kexec_wait_secondaries_down();
>  
> +		/* Switch XIVE back to emulation mode */
> +		if (xive_enabled())
> +			xive_shutdown();
> +
>  		/*
>  		 * We might be running as little-endian - now that interrupts
>  		 * are disabled, reset the HILE bit to big-endian so we don't
> diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
> index 8b67e1e..f571955 100644
> --- a/arch/powerpc/platforms/powernv/smp.c
> +++ b/arch/powerpc/platforms/powernv/smp.c
> @@ -29,6 +29,7 @@
>  #include <asm/vdso_datapage.h>
>  #include <asm/cputhreads.h>
>  #include <asm/xics.h>
> +#include <asm/xive.h>
>  #include <asm/opal.h>
>  #include <asm/runlatch.h>
>  #include <asm/code-patching.h>
> @@ -47,7 +48,9 @@
>  
>  static void pnv_smp_setup_cpu(int cpu)
>  {
> -	if (cpu != boot_cpuid)
> +	if (xive_enabled())
> +		xive_smp_setup_cpu();
> +	else if (cpu != boot_cpuid)
>  		xics_setup_cpu();
>  
>  #ifdef CONFIG_PPC_DOORBELL
> @@ -132,7 +135,10 @@ static int pnv_smp_cpu_disable(void)
>  	vdso_data->processorCount--;
>  	if (cpu == boot_cpuid)
>  		boot_cpuid = cpumask_any(cpu_online_mask);
> -	xics_migrate_irqs_away();
> +	if (xive_enabled())
> +		xive_smp_disable_cpu();
> +	else
> +		xics_migrate_irqs_away();
>  	return 0;
>  }
>  
> @@ -213,9 +219,12 @@ static void pnv_smp_cpu_kill_self(void)
>  		if (((srr1 & wmask) == SRR1_WAKEEE) ||
>  		    ((srr1 & wmask) == SRR1_WAKEHVI) ||
>  		    (local_paca->irq_happened & PACA_IRQ_EE)) {
> -			if (cpu_has_feature(CPU_FTR_ARCH_300))
> -				icp_opal_flush_interrupt();
> -			else
> +			if (cpu_has_feature(CPU_FTR_ARCH_300)) {
> +				if (xive_enabled())
> +					xive_flush_interrupt();
> +				else
> +					icp_opal_flush_interrupt();
> +			} else
>  				icp_native_flush_interrupt();
>  		} else if ((srr1 & wmask) == SRR1_WAKEHDBELL) {
>  			unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
> @@ -252,10 +261,26 @@ static int pnv_cpu_bootable(unsigned int nr)
>  	return smp_generic_cpu_bootable(nr);
>  }
>  
> +static int pnv_smp_prepare_cpu(int cpu)
> +{
> +	if (xive_enabled())
> +		return xive_smp_prepare_cpu(cpu);
> +	return 0;
> +}
> +
> +static void __init pnv_smp_probe(void)
> +{
> +	if (xive_enabled())
> +		xive_smp_probe();
> +	else
> +		xics_smp_probe();
> +}
> +
>  static struct smp_ops_t pnv_smp_ops = {
>  	.message_pass	= smp_muxed_ipi_message_pass,
> -	.cause_ipi	= NULL,	/* Filled at runtime by xics_smp_probe() */
> -	.probe		= xics_smp_probe,
> +	.cause_ipi	= NULL, /* Filled at runtime by xi{cs,ve}_smp_probe() */
> +	.probe		= pnv_smp_probe,
> +	.prepare_cpu	= pnv_smp_prepare_cpu,
>  	.kick_cpu	= pnv_smp_kick_cpu,
>  	.setup_cpu	= pnv_smp_setup_cpu,
>  	.cpu_bootable	= pnv_cpu_bootable,
> diff --git a/arch/powerpc/sysdev/Kconfig b/arch/powerpc/sysdev/Kconfig
> index 52dc165..caf882e 100644
> --- a/arch/powerpc/sysdev/Kconfig
> +++ b/arch/powerpc/sysdev/Kconfig
> @@ -28,6 +28,7 @@ config PPC_MSI_BITMAP
>  	default y if PPC_POWERNV
>  
>  source "arch/powerpc/sysdev/xics/Kconfig"
> +source "arch/powerpc/sysdev/xive/Kconfig"
>  
>  config PPC_SCOM
>  	bool
> diff --git a/arch/powerpc/sysdev/Makefile b/arch/powerpc/sysdev/Makefile
> index a254824..c0ae11d 100644
> --- a/arch/powerpc/sysdev/Makefile
> +++ b/arch/powerpc/sysdev/Makefile
> @@ -71,5 +71,6 @@ obj-$(CONFIG_PPC_EARLY_DEBUG_MEMCONS)	+= udbg_memcons.o
>  subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
>  
>  obj-$(CONFIG_PPC_XICS)		+= xics/
> +obj-$(CONFIG_PPC_XIVE)		+= xive/
>  
>  obj-$(CONFIG_GE_FPGA)		+= ge/
> diff --git a/arch/powerpc/sysdev/xive/Kconfig b/arch/powerpc/sysdev/xive/Kconfig
> new file mode 100644
> index 0000000..c8816c8
> --- /dev/null
> +++ b/arch/powerpc/sysdev/xive/Kconfig
> @@ -0,0 +1,7 @@
> +config PPC_XIVE
> +       def_bool n
> +       select PPC_SMP_MUXED_IPI
> +       select HARDIRQS_SW_RESEND
> +
> +config PPC_XIVE_NATIVE
> +       def_bool n
> diff --git a/arch/powerpc/sysdev/xive/Makefile b/arch/powerpc/sysdev/xive/Makefile
> new file mode 100644
> index 0000000..3fab303
> --- /dev/null
> +++ b/arch/powerpc/sysdev/xive/Makefile
> @@ -0,0 +1,4 @@
> +subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
> +
> +obj-y				+= common.o
> +obj-$(CONFIG_PPC_XIVE_NATIVE)	+= native.o
> diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
> new file mode 100644
> index 0000000..96037e0
> --- /dev/null
> +++ b/arch/powerpc/sysdev/xive/common.c
> @@ -0,0 +1,1175 @@
> +/*
> + * Copyright 2016,2017 IBM Corporation.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version
> + * 2 of the License, or (at your option) any later version.
> + */

If you put this here:

#define pr_fmt(fmt) "xive: " fmt

Then you can drop the prefix from every pr_xxx() in the whole file.

> +#include <linux/types.h>
> +#include <linux/threads.h>
> +#include <linux/kernel.h>
> +#include <linux/irq.h>
> +#include <linux/debugfs.h>

Unused?

> +#include <linux/smp.h>
> +#include <linux/interrupt.h>
> +#include <linux/seq_file.h>

Unused?

> +#include <linux/init.h>
> +#include <linux/cpu.h>
> +#include <linux/of.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/msi.h>
> +
> +#include <asm/prom.h>
> +#include <asm/io.h>
> +#include <asm/smp.h>
> +#include <asm/machdep.h>
> +#include <asm/irq.h>
> +#include <asm/errno.h>
> +#include <asm/xive.h>
> +#include <asm/xmon.h>
> +
> +#include "xive-regs.h"
> +#include "xive-internal.h"
> +
> +#undef DEBUG_FLUSH
> +#undef DEBUG_ALL
> +
> +#define DBG(fmt...)		pr_devel("XIVE: " fmt)
> +
> +#ifdef DEBUG_ALL
> +#define DBG_VERBOSE(fmt...)	pr_devel("XIVE: " fmt)
> +#else
> +#define DBG_VERBOSE(fmt...)	do { } while(0)
> +#endif
> +
> +bool __xive_enabled;
> +bool xive_cmdline_disabled;
> +
> +/* We use only one priority for now */
> +static u8 xive_irq_priority;
> +
> +void __iomem *xive_tm_area;
> +u32 xive_tm_offset;
> +static const struct xive_ops *xive_ops;
> +static struct irq_domain *xive_irq_domain;
> +
> +/* The IPIs all use the same logical irq number */
> +static u32 xive_ipi_irq;
> +
> +/* Xive state for each CPU */
> +static DEFINE_PER_CPU(struct xive_cpu *, xive_cpu);
> +
> +/*
> + * A "disabled" interrupt should never fire, to catch problems
> + * we set its logical number to this
> + */
> +#define XIVE_BAD_IRQ		0x7fffffff

Can it be anything? How about 0x7fbadbad?

> +#define XIVE_MAX_IRQ		(XIVE_BAD_IRQ - 1)
> +
> +/* An invalid CPU target */
> +#define XIVE_INVALID_TARGET	(-1)
> +
> +static u32 xive_read_eq(struct xive_q *q, u8 prio, bool just_peek)

Can it have a doc comment? And tell me what an EQ is?

> +{
> +	u32 cur;
> +
> +	if (!q->qpage)
> +		return 0;

A newline or ..

> +	cur = be32_to_cpup(q->qpage + q->idx);
> +	if ((cur >> 31) == q->toggle)
> +		return 0;

.. two wouldn't hurt here.

> +	if (!just_peek) {
> +		q->idx = (q->idx + 1) & q->msk;
> +		if (q->idx == 0)
> +			q->toggle ^= 1;
> +	}
> +	return cur & 0x7fffffff;

Is that XIVE_BAD_IRQ?

> +}
> +
> +static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek)
> +{
> +	u32 hirq = 0;

Is that a hwirq or something different?

> +	u8 prio;
> +
> +	/* Find highest pending priority */
> +	while (xc->pending_prio != 0) {
> +		struct xive_q *q;
> +
> +		prio = ffs(xc->pending_prio) - 1;
> +		DBG_VERBOSE("scan_irq: trying prio %d\n", prio);
> +
> +		/* Try to fetch */
> +		hirq = xive_read_eq(&xc->queue[prio], prio, just_peek);
> +
> +		/* Found something ? That's it */
> +		if (hirq)
> +			break;
> +
> +		/* Clear pending bits */
> +		xc->pending_prio &= ~(1 << prio);
> +
> +		/*
> +		 * Check if the queue count needs adjusting due to
> +		 * interrupts being moved away.
> +		 */
> +		q = &xc->queue[prio];
> +		if (atomic_read(&q->pending_count)) {
> +			int p = atomic_xchg(&q->pending_count, 0);
> +			if (p) {
> +				WARN_ON(p > atomic_read(&q->count));
> +				atomic_sub(p, &q->count);

I am not sure what's going on there.

> +			}
> +		}
> +	}
> +
> +	/* If nothing was found, set CPPR to 0xff */

Would be nice to spell out CPPR somewhere.

> +	if (hirq == 0)
> +		prio = 0xff;
> +
> +	/* Update HW CPPR to match if necessary */
> +	if (prio != xc->cppr) {
> +		DBG_VERBOSE("scan_irq: adjusting CPPR to %d\n", prio);
> +		xc->cppr = prio;
> +		out_8(xive_tm_area + xive_tm_offset + TM_CPPR, prio);

What's the out_8() doing? I was expecting it to use xc, or something per-cpu.

> +	}
> +
> +	return hirq;
> +}
> +
> +#ifdef CONFIG_XMON
> +static void xive_dump_eq(const char *name, struct xive_q *q)
> +{
> +	u32 i0, i1, idx;
> +
> +	if (!q->qpage)
> +		return;
> +	idx = q->idx;
> +	i0 = be32_to_cpup(q->qpage + idx);
> +	idx = (idx + 1) & q->msk;
> +	i1 = be32_to_cpup(q->qpage + idx);
> +	xmon_printf("  %s Q T=%d %08x %08x ...\n", name,
> +		    q->toggle, i0, i1);
> +}
> +
> +void xmon_xive_do_dump(int cpu)
> +{
> +	struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
> +	struct xive_irq_data *xd;
> +	uint64_t val, offset;

u64 ?

> +
> +	xmon_printf("XIVE state for CPU %d:\n", cpu);
> +	xmon_printf("  pp=%02x cppr=%02x\n", xc->pending_prio, xc->cppr);
> +	xive_dump_eq("IRQ", &xc->queue[xive_irq_priority]);
> +	xd = &xc->ipi_data;
> +	offset = 0x800;
> +	if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
> +		offset |= offset << 4;
> +	val = in_be64(xd->eoi_mmio + offset);
> +	xmon_printf("  IPI state: %x:%c%c\n", xc->hw_ipi,
> +		    val & 2 ? 'P' : 'p',
> +		    val & 1 ? 'Q' : 'q');
> +}
> +#endif /* CONFIG_XMON */
> +
> +static void xive_update_pending_irqs(struct xive_cpu *xc)
> +{
> +	u8 he, cppr;
> +	u16 ack;
> +
> +	/* Perform the acknowledge hypervisor to register cycle */
> +	ack = be16_to_cpu(__raw_readw(xive_tm_area + TM_SPC_ACK_HV_REG));
> +
> +	/* Synchronize subsequent queue accesses */
> +	mb();
> +
> +	DBG_VERBOSE("CPU %d get_irq, ack=%04x\n", smp_processor_id(), ack);
> +
> +	/* Check the HE field */
> +	cppr = ack & 0xff;
> +	he = GETFIELD(TM_QW3_NSR_HE, (ack >> 8));
> +	switch(he) {
> +	case TM_QW3_NSR_HE_NONE:
> +		break;
> +	case TM_QW3_NSR_HE_PHYS:
> +		if (cppr == 0xff)
> +			return;
> +		xc->pending_prio |= 1 << cppr;
> +		if (cppr >= xc->cppr)
> +			pr_err("XIVE: CPU %d odd ack CPPR, got %d at %d\n",
> +			       smp_processor_id(), cppr, xc->cppr);
> +		xc->cppr = cppr;
> +		break;
> +	case TM_QW3_NSR_HE_POOL:
> +	case TM_QW3_NSR_HE_LSI:
> +		pr_err("XIVE: CPU %d got unexpected interrupt type HE=%d\n",
> +		       smp_processor_id(), he);
> +		return;
> +	}
> +}
> +
> +static unsigned int xive_get_irq(void)
> +{
> +	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
> +	u32 hirq;
> +
> +	/*
> +	 * This can be called either as a result of a HW interrupt or
> +	 * as a "replay" because EOI decided there was still something
> +	 * in one of the queues.
> +	 *
> +	 * First we perform an ACK cycle in order to update our mask
> +	 * of pending priorities. This will also have the effect of
> +	 * updating the CPPR to the most favored pending interrupts.
> +	 *
> +	 * In the future, if we have a way to differenciate a first
> +	 * entry (on HW interrupt) from a replay triggered by EOI,
> +	 * we could skip this on replays unless we soft-mask tells us
> +	 * that a new HW interrupt occurred.
> +	 */
> +	xive_update_pending_irqs(xc);
> +
> +	DBG_VERBOSE("get_irq: pending=%02x\n", xc->pending_prio);
> +
> +	hirq = xive_scan_interrupts(xc, false);
> +
> +	DBG_VERBOSE("get_irq: got irq 0x%x, new pending=0x%02x\n",
> +	    hirq, xc->pending_prio);
> +
> +	/* Return pending interrupt if any */
> +	if (hirq == XIVE_BAD_IRQ)
> +		return 0;
> +	return hirq;
> +}
> +
> +
> +static void xive_do_queue_eoi(struct xive_cpu *xc)
> +{
> +	if (xive_scan_interrupts(xc, true) != 0) {
> +		DBG_VERBOSE("eoi: pending=0x%02x\n", xc->pending_prio);
> +		force_external_irq_replay();
> +	}
> +}
> +
> +static u8 xive_poke_esb(struct xive_irq_data *xd, u32 offset)
> +{
> +	u64 val;
> +
> +	if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
> +		offset |= offset << 4;
> +
> +	val = in_be64(xd->eoi_mmio + offset);
> +
> +	return (u8)val;
> +}
> +
> +static void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd)
> +{
> +	/* If the XIVE supports the new "store EOI facility, use it */
> +	if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI)
> +		out_be64(xd->eoi_mmio, 0);
> +	else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) {
> +		if (WARN_ON_ONCE(!xive_ops->eoi))
> +			return;
> +		xive_ops->eoi(hw_irq);
> +	} else {
> +		uint8_t eoi_val;

u8?

> +
> +		/*
> +		 * Otherwise for EOI, we use the special MMIO that does
> +		 * a clear of both P and Q and returns the old Q.
> +		 *
> +		 * This allows us to then do a re-trigger if Q was set
> +		 * rather than synthetizing an interrupt in software
> +		 */
> +		eoi_val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_00);
> +		DBG_VERBOSE("eoi_val=%x\n", offset, eoi_val);
> +
> +		if ((xd->flags & XIVE_IRQ_FLAG_LSI) || !(eoi_val & 1))
> +			return;
> +
> +		/* Re-trigger */
> +		if (xd->trig_mmio)
> +			out_be64(xd->trig_mmio, 0);
> +	}
> +
> +}
> +
> +static void xive_irq_eoi(struct irq_data *d)
> +{
> +	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
> +	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
> +
> +	DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n",
> +		    d->irq, irqd_to_hwirq(d), xc->pending_prio);
> +
> +	if (!irqd_irq_disabled(d))
> +		xive_do_source_eoi(irqd_to_hwirq(d), xd);
> +
> +	/*
> +	 * Clear saved_p to indicate that it's no longer occupying
> +	 * a queue slot on the target queue
> +	 */
> +	xd->saved_p = false;
> +
> +	xive_do_queue_eoi(xc);
> +}
> +
> +static void xive_do_source_set_mask(struct xive_irq_data *xd,
> +				    bool masked)
> +{
> +	if (masked)
> +		xive_poke_esb(xd, XIVE_ESB_SET_PQ_01);
> +	else
> +		xive_poke_esb(xd, XIVE_ESB_SET_PQ_00);
> +}
> +
> +static bool xive_try_pick_target(int cpu)
> +{
> +	struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
> +	struct xive_q *q = &xc->queue[xive_irq_priority];
> +	int max;
> +
> +	/* Calculate max number of interrupts in that queue.
> +	 *
> +	 * We leave a gap of 1 just in case...
> +	 */
> +	max = (q->msk + 1) - 1;
> +	return !!atomic_add_unless(&q->count, 1, max);
> +}
> +
> +static void xive_dec_target_count(int cpu)
> +{
> +	struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
> +	struct xive_q *q = &xc->queue[xive_irq_priority];
> +
> +	if (WARN_ON(cpu < 0))
> +		return;
> +
> +	/*
> +	 * We increment the "pending count" which will be used
> +	 * to decrement the target queue count whenever it's next
> +	 * processed and found empty. This ensure that we don't
> +	 * decrement while we still have the interrupt there
> +	 * occupying a slot.
> +	 */
> +	atomic_inc(&q->pending_count);
> +}
> +
> +static int xive_find_target_in_mask(const struct cpumask *mask,
> +				    unsigned int fuzz)
> +{
> +	int cpu, first, num, i;
> +
> +	/* Pick up a starting point CPU in the mask based on  fuzz */
> +	num = cpumask_weight(mask);
> +	first = (fuzz++) % num;
> +
> +	/* Locate it */
> +	cpu = cpumask_first(mask);
> +	for (i = 0; i < first; i++)
> +		cpu = cpumask_next(cpu, mask);
> +	first = cpu;
> +
> +	/*
> +	 * Now go through the entire mask until we find a valid
> +	 * target.
> +	 */
> +	for (;;) {
> +		/*
> +		 * We re-check online as the fallback case passes us
> +		 * an untested affinity mask
> +		 */
> +		if (cpu_online(cpu) && xive_try_pick_target(cpu))
> +			return cpu;
> +		cpu = cpumask_next(cpu, mask);
> +		if (cpu == first)
> +			break;
> +	}
> +	return -1;
> +}
> +
> +static int xive_pick_irq_target(struct irq_data *d,
> +				const struct cpumask *affinity)
> +{
> +	static unsigned int fuzz;
> +	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
> +	cpumask_var_t mask;
> +	int cpu = -1;
> +
> +	/*
> +	 * Pick a target CPU for an interrupt. This is done at
> +	 * startup or if the affinity is changed in a way that
> +	 * invalidates the current target.
> +	 */
> +
> +	/* If we have chip IDs, first we try to build a mask of
> +	 * CPUs matching ther CPU and find a target in there
> +	 */
> +	if (xd->src_chip != XIVE_INVALID_CHIP_ID &&
> +		zalloc_cpumask_var(&mask, GFP_ATOMIC)) {
> +		/* Build a mask of matching chip IDs */
> +		for_each_cpu_and(cpu, affinity, cpu_online_mask) {
> +			struct xive_cpu *xc = per_cpu(xive_cpu, cpu);
> +			if (xc->chip_id == xd->src_chip)
> +				cpumask_set_cpu(cpu, mask);
> +		}
> +		/* Try to find a target */
> +		if (!cpumask_empty(mask))
> +			cpu = xive_find_target_in_mask(mask, fuzz++);
> +		free_cpumask_var(mask);
> +		if (cpu >= 0)
> +			return cpu;
> +		fuzz--;
> +	}
> +
> +	/* No chip IDs, fallback to using the affinity mask */
> +	return xive_find_target_in_mask(affinity, fuzz++);
> +}
> +
> +static unsigned int xive_irq_startup(struct irq_data *d)
> +{
> +	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
> +	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
> +	int target, rc;
> +
> +	DBG("xive_irq_startup: irq %d [0x%x] data @%p\n",
> +	    d->irq, hw_irq, d);
> +
> +#ifdef CONFIG_PCI_MSI
> +	/*
> +	 * The generic MSI code returns with the interrupt disabled on the
> +	 * card, using the MSI mask bits. Firmware doesn't appear to unmask
> +	 * at that level, so we do it here by hand.
> +	 */
> +	if (irq_data_get_msi_desc(d))
> +		pci_msi_unmask_irq(d);
> +#endif
> +
> +	/* Pick a target */
> +	target = xive_pick_irq_target(d, irq_data_get_affinity_mask(d));
> +	if (target == XIVE_INVALID_TARGET) {
> +		/* Try again breaking affinity */
> +		target = xive_pick_irq_target(d, cpu_online_mask);
> +		if (target == XIVE_INVALID_TARGET)
> +			return -ENXIO;
> +		pr_warn("XIVE: irq %d started with broken affinity\n",
> +			d->irq);
> +	}
> +	xd->target = target;
> +
> +	/*
> +	 * Configure the logical number to be the Linux IRQ number
> +	 * and set the target queue
> +	 */
> +	rc = xive_ops->configure_irq(hw_irq,
> +				     get_hard_smp_processor_id(target),
> +				     xive_irq_priority, d->irq);
> +	if (rc)
> +		return rc;
> +
> +	/* Unmask the ESB */
> +	xive_do_source_set_mask(xd, false);
> +
> +	return 0;
> +}
> +
> +static void xive_irq_shutdown(struct irq_data *d)
> +{
> +	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
> +	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
> +
> +	DBG("xive_irq_shutdown: irq %d [0x%x] data @%p\n",
> +	    d->irq, hw_irq, d);
> +
> +	if (WARN_ON(xd->target == XIVE_INVALID_TARGET))
> +		return;
> +
> +	/* Mask the interrupt at the source */
> +	xive_do_source_set_mask(xd, true);
> +
> +	/* Mask the interrupt in HW in the IVT/EAS */
> +	xive_ops->configure_irq(hw_irq,
> +				get_hard_smp_processor_id(xd->target),
> +				0xff, hw_irq);
> +
> +	xive_dec_target_count(xd->target);
> +	xd->target = XIVE_INVALID_TARGET;
> +}
> +
> +static void xive_irq_unmask(struct irq_data *d)
> +{
> +	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
> +
> +	DBG("xive_irq_unmask: irq %d data @%p\n", d->irq, xd);
> +
> +	/*
> +	 * This is a workaround for PCI LSI problems on P9, for
> +	 * these, we call FW to set the mask. The problems might
> +	 * be fixed by P9 DD2.0, if that is the case, we will make
> +	 * this a DD1 workaround only
> +	 */
> +	if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) {
> +		unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
> +		xive_ops->configure_irq(hw_irq,
> +					get_hard_smp_processor_id(xd->target),
> +					xive_irq_priority, d->irq);
> +		return;
> +	}
> +
> +	xive_do_source_set_mask(xd, false);
> +}
> +
> +static void xive_irq_mask(struct irq_data *d)
> +{
> +	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
> +
> +	DBG("xive_irq_mask: irq %d data @%p\n", d->irq, xd);
> +
> +	/*
> +	 * This is a workaround for PCI LSI problems on P9, for
> +	 * these, we call OPAL to set the mask. The problems might
> +	 * be fixed by P9 DD2.0, if that is the case, we will make
> +	 * this a DD1 workaround only
> +	 */
> +	if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) {
> +		unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
> +		xive_ops->configure_irq(hw_irq,
> +					get_hard_smp_processor_id(xd->target),
> +					0xff, d->irq);
> +		return;
> +	}
> +
> +	xive_do_source_set_mask(xd, true);
> +}
> +
> +static int xive_irq_set_affinity(struct irq_data *d,
> +				 const struct cpumask *cpumask,
> +				 bool force)
> +{
> +	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
> +	unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
> +	u32 target, old_target;
> +	int rc = 0;
> +
> +	DBG("xive_irq_set_affinity: irq %d\n", d->irq);
> +
> +	/* Is this valid ? */
> +	if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids)
> +		return -EINVAL;
> +
> +	/* If existing target is already in the new mask, and is
> +	 * online then do nothing.
> +	 */
> +	if (cpu_online(xd->target) &&
> +	    cpumask_test_cpu(xd->target, cpumask))
> +		return IRQ_SET_MASK_OK;
> +
> +	/* Pick a new target */
> +	target = xive_pick_irq_target(d, cpumask);
> +
> +	/* No target found */
> +	if (target == XIVE_INVALID_TARGET)
> +		return -ENXIO;
> +
> +	old_target = xd->target;
> +
> +	/*
> +	 * Only configure the irq if it's not currently passed-through to
> +	 * a KVM guest
> +	 */
> +	rc = xive_ops->configure_irq(hw_irq,
> +				     get_hard_smp_processor_id(target),
> +				     xive_irq_priority, d->irq);
> +	if (rc < 0) {
> +		pr_err("XIVE: Error %d reconfiguring irq %d\n", rc, d->irq);
> +		return rc;
> +	}
> +
> +	DBG("  target: 0x%x\n", target);
> +	xd->target = target;
> +
> +	/* Give up previous target */
> +	if (old_target != XIVE_INVALID_TARGET)
> +	    xive_dec_target_count(old_target);
> +
> +	return IRQ_SET_MASK_OK;
> +}
> +
> +static int xive_irq_set_type(struct irq_data *d, unsigned int flow_type)
> +{
> +	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
> +
> +	/*
> +	 * We only support these. This has really no effect other than setting
> +	 * the corresponding descriptor bits mind you but those will in turn
> +	 * affect the resend function when re-enabling an edge interrupt.
> +	 *
> +	 * Set set the default to edge as explained in map().
> +	 */
> +	if (flow_type == IRQ_TYPE_DEFAULT || flow_type == IRQ_TYPE_NONE)
> +		flow_type = IRQ_TYPE_EDGE_RISING;
> +
> +	if (flow_type != IRQ_TYPE_EDGE_RISING &&
> +	    flow_type != IRQ_TYPE_LEVEL_LOW)
> +		return -EINVAL;
> +
> +	irqd_set_trigger_type(d, flow_type);
> +
> +	/*
> +	 * Double check it matches what the FW thinks
> +	 *
> +	 * NOTE: We don't know yet if the PAPR interface will provide
> +	 * the LSI vs MSI information appart from the device-tree so
> +	 * this check might have to move into an optional backend call
> +	 * that is specific to the native backend
> +	 */
> +	if ((flow_type == IRQ_TYPE_LEVEL_LOW) !=
> +	    !!(xd->flags & XIVE_IRQ_FLAG_LSI))
> +		pr_warn("XIVE: Interrupt %d (HW 0x%x) type mismatch,"
> +			" Linux says %s, FW says %s\n",
> +			d->irq, (u32)irqd_to_hwirq(d),
> +			(flow_type == IRQ_TYPE_LEVEL_LOW) ? "Level" : "Edge",
> +			(xd->flags & XIVE_IRQ_FLAG_LSI) ? "Level" : "Edge");
> +
> +	return IRQ_SET_MASK_OK_NOCOPY;
> +}
> +
> +static int xive_irq_retrigger(struct irq_data *d)
> +{
> +	struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
> +
> +	/* This should be only for MSIs */
> +	if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI))
> +		return 0;
> +
> +	/*
> +	 * To perform a retrigger, we first set the PQ bits to
> +	 * 11, then perform an EOI.
> +	 */
> +	xive_poke_esb(xd, XIVE_ESB_SET_PQ_11);
> +
> +	/*
> +	 * Note: We pass "0" to the hw_irq argument in order to
> +	 * avoid calling into the backend EOI code which we don't
> +	 * want to do in the case of a re-trigger. Backends typically
> +	 * only do EOI for LSIs anyway.
> +	 */
> +	xive_do_source_eoi(0, xd);
> +
> +	return 1;
> +}
> +
> +static struct irq_chip xive_irq_chip = {
> +	.name = "XIVE-IRQ",
> +	.irq_startup = xive_irq_startup,
> +	.irq_shutdown = xive_irq_shutdown,
> +	.irq_eoi = xive_irq_eoi,
> +	.irq_mask = xive_irq_mask,
> +	.irq_unmask = xive_irq_unmask,
> +	.irq_set_affinity = xive_irq_set_affinity,
> +	.irq_set_type = xive_irq_set_type,
> +	.irq_retrigger = xive_irq_retrigger,
> +};
> +
> +bool is_xive_irq(struct irq_chip *chip)
> +{
> +	return chip == &xive_irq_chip;
> +}
> +
> +void xive_cleanup_irq_data(struct xive_irq_data *xd)
> +{
> +	if (xd->eoi_mmio) {
> +		iounmap(xd->eoi_mmio);
> +		if (xd->eoi_mmio == xd->trig_mmio)
> +			xd->trig_mmio = NULL;
> +		xd->eoi_mmio = NULL;
> +	}
> +	if (xd->trig_mmio) {
> +		iounmap(xd->trig_mmio);
> +		xd->trig_mmio = NULL;
> +	}
> +}
> +
> +static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw)
> +{
> +	struct xive_irq_data *xd;
> +	int rc;
> +
> +	xd = kzalloc(sizeof(struct xive_irq_data), GFP_KERNEL);
> +	if (!xd)
> +		return -ENOMEM;
> +	rc = xive_ops->populate_irq_data(hw, xd);
> +	if (rc) {
> +		kfree(xd);
> +		return rc;
> +	}
> +	xd->target = XIVE_INVALID_TARGET;
> +	irq_set_handler_data(virq, xd);
> +
> +	return 0;
> +}
> +
> +static void xive_irq_free_data(unsigned int virq)
> +{
> +	struct xive_irq_data *xd = irq_get_handler_data(virq);
> +
> +	if (!xd)
> +		return;
> +	irq_set_handler_data(virq, NULL);
> +	xive_cleanup_irq_data(xd);
> +	kfree(xd);
> +}
> +
> +#ifdef CONFIG_SMP
> +
> +static void xive_cause_ipi(int cpu, unsigned long msg)
> +{
> +	struct xive_cpu *xc;
> +	struct xive_irq_data *xd;
> +
> +	xc = per_cpu(xive_cpu, cpu);
> +
> +	DBG_VERBOSE("IPI msg#%ld CPU %d -> %d (HW IRQ 0x%x)\n",
> +		    msg, smp_processor_id(), cpu, xc->hw_ipi);
> +
> +	xd = &xc->ipi_data;
> +	if (WARN_ON(!xd->trig_mmio))
> +		return;
> +	out_be64(xd->trig_mmio, 0);
> +}
> +
> +static irqreturn_t xive_muxed_ipi_action(int irq, void *dev_id)
> +{
> +	return smp_ipi_demux();
> +}
> +
> +static void xive_ipi_eoi(struct irq_data *d)
> +{
> +	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
> +
> +	/* Handle possible race with unplug and drop stale IPIs */
> +	if (!xc)
> +		return;
> +	xive_do_source_eoi(xc->hw_ipi, &xc->ipi_data);
> +	xive_do_queue_eoi(xc);
> +}
> +
> +static void xive_ipi_unmask(struct irq_data *d)
> +{
> +	/* Nothing to do, we never mask IPIs, but the callback
> +	 * must exist
> +	 */
> +}
> +
> +static void xive_ipi_mask(struct irq_data *d)
> +{
> +	/* Nothing to do, we never mask IPIs, but the callback
> +	 * must exist
> +	 */
> +}
> +
> +static struct irq_chip xive_ipi_chip = {
> +	.name = "XIVE-IPI",
> +	.irq_eoi = xive_ipi_eoi,
> +	.irq_mask = xive_ipi_mask,
> +	.irq_unmask = xive_ipi_unmask,
> +};
> +
> +static void __init xive_request_ipi(void)
> +{
> +	unsigned int virq;
> +
> +	/* Initialize it */
> +	virq = irq_create_mapping(xive_irq_domain, 0);
> +	xive_ipi_irq = virq;
> +
> +	BUG_ON(request_irq(virq, xive_muxed_ipi_action,
> +			   IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL));
> +}
> +
> +static int xive_setup_cpu_ipi(unsigned int cpu)
> +{
> +	struct xive_cpu *xc;
> +	int rc;
> +
> +	pr_debug("XIVE: Setting up IPI for CPU %d\n", cpu);
> +
> +	xc = per_cpu(xive_cpu, cpu);
> +
> +	/* Check if we are already setup */
> +	if (xc->hw_ipi != 0)
> +		return 0;
> +
> +	/* Grab an IPI from the backend, this will populate xc->hw_ipi */
> +	if (xive_ops->get_ipi(cpu, xc))
> +		return -EIO;
> +
> +	/* Populate the IRQ data in the xive_cpu structure and
> +	 * configure the HW / enable the IPIs
> +	 */
> +	rc = xive_ops->populate_irq_data(xc->hw_ipi, &xc->ipi_data);
> +	if (rc) {
> +		pr_err("XIVE: Failed to populate IPI data on CPU %d\n", cpu);
> +		return -EIO;
> +	}
> +	rc = xive_ops->configure_irq(xc->hw_ipi,
> +				     get_hard_smp_processor_id(cpu),
> +				     xive_irq_priority, xive_ipi_irq);
> +	if (rc) {
> +		pr_err("XIVE: Failed to map IPI CPU %d\n", cpu);
> +		return -EIO;
> +	}
> +	DBG("XIVE: CPU %d HW IPI %x, virq %d, trig_mmio=%p\n", cpu,
> +	    xc->hw_ipi, xive_ipi_irq, xc->ipi_data.trig_mmio);
> +
> +	/* Unmask it */
> +	xive_do_source_set_mask(&xc->ipi_data, false);
> +
> +	return 0;
> +}
> +
> +static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc)
> +{
> +	/* Disable the IPI and free the IRQ data */
> +
> +	/* Already cleaned up ? */
> +	if (xc->hw_ipi == 0)
> +		return;
> +
> +	/* Mask the IPI */
> +	xive_do_source_set_mask(&xc->ipi_data, true);
> +
> +	/*
> +	 * Note: We don't call xive_cleanup_irq_data() to free
> +	 * the mappings as this is called from an IPI on kexec
> +	 * which is not a safe environment to call iounmap()
> +	 */
> +
> +	/* Deconfigure/mask in the backend */
> +	xive_ops->configure_irq(xc->hw_ipi, hard_smp_processor_id(),
> +				0xff, xive_ipi_irq);
> +
> +	/* Free the IPIs in the backend */
> +	xive_ops->put_ipi(cpu, xc);
> +}
> +
> +void __init xive_smp_probe(void)
> +{
> +	smp_ops->cause_ipi = xive_cause_ipi;
> +
> +	/* Register the IPI */
> +	xive_request_ipi();
> +
> +	/* Allocate and setup IPI for the boot CPU */
> +	xive_setup_cpu_ipi(smp_processor_id());
> +}
> +
> +#endif /* CONFIG_SMP */
> +
> +static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq,
> +			       irq_hw_number_t hw)
> +{
> +	int rc;
> +
> +	/*
> +	 * Mark interrupts as edge sensitive by default so that resend
> +	 * actually works. Will fix that up below if needed.
> +	 */
> +	irq_clear_status_flags(virq, IRQ_LEVEL);
> +
> +	/* IPIs are special and come up with HW number 0 */
> +	if (hw == 0) {
> +		/*
> +		 * IPIs are marked per-cpu. We use separate HW interrupts under
> +		 * the hood but associated with the same "linux" interrupt
> +		 */
> +		irq_set_chip_and_handler(virq, &xive_ipi_chip,
> +					 handle_percpu_irq);
> +		return 0;
> +	}
> +
> +	rc = xive_irq_alloc_data(virq, hw);
> +	if (rc)
> +		return rc;
> +
> +	irq_set_chip_and_handler(virq, &xive_irq_chip, handle_fasteoi_irq);
> +
> +	return 0;
> +}
> +
> +static void xive_irq_domain_unmap(struct irq_domain *d, unsigned int virq)
> +{
> +	struct irq_data *data = irq_get_irq_data(virq);
> +	unsigned int hw_irq;
> +
> +	if (!data)
> +		return;
> +	hw_irq = (unsigned int)irqd_to_hwirq(data);
> +	if (hw_irq)
> +		xive_irq_free_data(virq);
> +}
> +
> +static int xive_irq_domain_xlate(struct irq_domain *h, struct device_node *ct,
> +				 const u32 *intspec, unsigned int intsize,
> +				 irq_hw_number_t *out_hwirq, unsigned int *out_flags)
> +
> +{
> +	*out_hwirq = intspec[0];
> +
> +	/*
> +	 * If intsize is at least 2, we look for the type in the second cell,
> +	 * we assume the LSB indicates a level interrupt.
> +	 */
> +	if (intsize > 1) {
> +		if (intspec[1] & 1)
> +			*out_flags = IRQ_TYPE_LEVEL_LOW;
> +		else
> +			*out_flags = IRQ_TYPE_EDGE_RISING;
> +	} else
> +		*out_flags = IRQ_TYPE_LEVEL_LOW;
> +
> +	return 0;
> +}
> +
> +static int xive_irq_domain_match(struct irq_domain *h, struct device_node *node,
> +				 enum irq_domain_bus_token bus_token)
> +{
> +	return xive_ops->match(node);
> +}
> +
> +static const struct irq_domain_ops xive_irq_domain_ops = {
> +	.match = xive_irq_domain_match,
> +	.map = xive_irq_domain_map,
> +	.unmap = xive_irq_domain_unmap,
> +	.xlate = xive_irq_domain_xlate,
> +};
> +
> +static void __init xive_init_host(void)
> +{
> +	xive_irq_domain = irq_domain_add_nomap(NULL, XIVE_MAX_IRQ,
> +					       &xive_irq_domain_ops, NULL);
> +	BUG_ON(xive_irq_domain == NULL);
> +	irq_set_default_host(xive_irq_domain);
> +}
> +
> +static void xive_cleanup_cpu_queues(unsigned int cpu, struct xive_cpu *xc)
> +{
> +	if (xc->queue[xive_irq_priority].qpage)
> +		xive_ops->cleanup_queue(cpu, xc, xive_irq_priority);
> +}
> +
> +static int xive_setup_cpu_queues(unsigned int cpu, struct xive_cpu *xc)
> +{
> +	int rc = 0;
> +
> +	/* We set up one queue for now, with a 64k page */
> +	if (!xc->queue[xive_irq_priority].qpage)
> +		rc = xive_ops->setup_queue(cpu, xc, xive_irq_priority);
> +
> +	return rc;
> +}
> +
> +static int xive_prepare_cpu(unsigned int cpu)
> +{
> +	struct xive_cpu *xc;
> +
> +	xc = per_cpu(xive_cpu, cpu);
> +	if (!xc) {
> +		struct device_node *np;
> +
> +		xc = kzalloc_node(sizeof(struct xive_cpu),
> +				  GFP_KERNEL, cpu_to_node(cpu));
> +		if (!xc)
> +			return -ENOMEM;
> +		np = of_get_cpu_node(cpu, NULL);
> +		if (np)
> +			xc->chip_id = of_get_ibm_chip_id(np);
> +		of_node_put(np);
> +
> +		per_cpu(xive_cpu, cpu) = xc;
> +	}
> +
> +	/* Setup EQs if not already */
> +	return xive_setup_cpu_queues(cpu, xc);
> +}
> +
> +static void xive_setup_cpu(void)
> +{
> +	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
> +
> +	/* Debug: Dump the TM state */
> +	DBG("CPU %d [HW 0x%02x] VT=%02x\n",
> +	    smp_processor_id(), hard_smp_processor_id(),
> +	    in_8(xive_tm_area + xive_tm_offset + TM_WORD2));
> +
> +	/* The backend might have additional things to do */
> +	if (xive_ops->setup_cpu)
> +		xive_ops->setup_cpu(smp_processor_id(), xc);
> +
> +	/* Set CPPR to 0xff to enable flow of interrupts */
> +	xc->cppr = 0xff;
> +	out_8(xive_tm_area + xive_tm_offset + TM_CPPR, 0xff);
> +}
> +
> +#ifdef CONFIG_SMP
> +void xive_smp_setup_cpu(void)
> +{
> +	DBG("XIVE: SMP setup CPU %d\n", smp_processor_id());
> +
> +	/* This will have already been done on the boot CPU */
> +	if (smp_processor_id() != boot_cpuid)
> +		xive_setup_cpu();
> +
> +}
> +
> +int xive_smp_prepare_cpu(unsigned int cpu)
> +{
> +	int rc;
> +
> +	/* Allocate per-CPU data and queues */
> +	rc = xive_prepare_cpu(cpu);
> +	if (rc)
> +		return rc;
> +
> +	/* Allocate and setup IPI for the new CPU */
> +	return xive_setup_cpu_ipi(cpu);
> +}
> +
> +#ifdef CONFIG_HOTPLUG_CPU
> +static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc)
> +{
> +	u32 irq;
> +
> +	/* We assume local irqs are disabled */
> +	WARN_ON(!irqs_disabled());
> +
> +	/* Check what's already in the CPU queue */
> +	while ((irq = xive_scan_interrupts(xc, false)) != 0) {
> +		/*
> +		 * We need to re-route that interrupt to its new destination.
> +		 * First get and lock the descriptor
> +		 */
> +		struct irq_desc *desc = irq_to_desc(irq);
> +		struct irq_data *d = irq_desc_get_irq_data(desc);
> +		struct xive_irq_data *xd;
> +		unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
> +
> +		/*
> +		 * Ignore anything that isn't a XIVE irq and ignore
> +		 * IPIs, which can just be dropped.
> +		 */
> +		if (d->domain != xive_irq_domain || hw_irq == 0)
> +			continue;
> +#ifdef DEBUG_FLUSH
> +		pr_info("CPU %d: Got irq %d while offline, re-routing...\n",
> +			cpu, irq);
> +#endif
> +		raw_spin_lock(&desc->lock);
> +		xd = irq_desc_get_handler_data(desc);
> +
> +		/* For LSIs, we EOI, this will cause a resend if it's
> +		 * still asserted. Otherwise do an MSI retrigger
> +		 */
> +		if (xd->flags & XIVE_IRQ_FLAG_LSI)
> +			xive_do_source_eoi(irqd_to_hwirq(d), xd);
> +		else
> +			xive_irq_retrigger(d);
> +		raw_spin_unlock(&desc->lock);
> +	}
> +}
> +
> +void xive_smp_disable_cpu(void)
> +{
> +	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
> +	unsigned int cpu = smp_processor_id();
> +
> +	/* Migrate interrupts away from the CPU */
> +	irq_migrate_all_off_this_cpu();
> +
> +	/* Set CPPR to 0 to disable flow of interrupts */
> +	xc->cppr = 0;
> +	out_8(xive_tm_area + xive_tm_offset + TM_CPPR, 0);
> +
> +	/* Flush everything still in the queue */
> +	xive_flush_cpu_queue(cpu, xc);
> +
> +	/* Re-enable CPPR  */
> +	xc->cppr = 0xff;
> +	out_8(xive_tm_area + xive_tm_offset + TM_CPPR, 0xff);
> +}
> +
> +void xive_flush_interrupt(void)
> +{
> +	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
> +	unsigned int cpu = smp_processor_id();
> +
> +	/* Called if an interrupt occurs while the CPU is hot unplugged */
> +	xive_flush_cpu_queue(cpu, xc);
> +}
> +
> +#endif /* CONFIG_HOTPLUG_CPU */
> +
> +#endif /* CONFIG_SMP */
> +
> +void xive_kexec_teardown_cpu(int secondary)
> +{
> +	struct xive_cpu *xc = __this_cpu_read(xive_cpu);
> +	unsigned int cpu = smp_processor_id();
> +
> +	/* Set CPPR to 0 to disable flow of interrupts */
> +	xc->cppr = 0;
> +	out_8(xive_tm_area + xive_tm_offset + TM_CPPR, 0);
> +
> +	/* Backend cleanup if any */
> +	if (xive_ops->teardown_cpu)
> +		xive_ops->teardown_cpu(cpu, xc);
> +
> +	/* Get rid of IPI */
> +	xive_cleanup_cpu_ipi(cpu, xc);
> +
> +	/* Disable and free the queues */
> +	xive_cleanup_cpu_queues(cpu, xc);
> +}
> +
> +void xive_shutdown(void)
> +{
> +	xive_ops->shutdown();
> +}
> +
> +bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset,
> +		    u8 max_prio)
> +{
> +	xive_tm_area = area;
> +	xive_tm_offset = offset;
> +	xive_ops = ops;
> +	xive_irq_priority = max_prio;
> +
> +	ppc_md.get_irq = xive_get_irq;
> +	__xive_enabled = true;
> +
> +	DBG("Initializing host..\n");
> +	xive_init_host();
> +
> +	DBG("Initializing boot CPU..\n");
> +
> +	/* Allocate per-CPU data and queues */
> +	xive_prepare_cpu(smp_processor_id());
> +
> +	/* Get ready for interrupts */
> +	xive_setup_cpu();
> +
> +	pr_info("XIVE: Interrupt handling initialized with %s backend\n",
> +		xive_ops->name);
> +	pr_info("XIVE: Using priority %d for all interrupts\n", max_prio);
> +
> +	return true;
> +}
> +
> +static int __init xive_off(char *arg)
> +{
> +	xive_cmdline_disabled = true;
> +	return 0;
> +}
> +__setup("xive=off", xive_off);
> diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c
> new file mode 100644
> index 0000000..26cc6bf
> --- /dev/null
> +++ b/arch/powerpc/sysdev/xive/native.c
> @@ -0,0 +1,604 @@
> +/*
> + * Copyright 2016,2017 IBM Corporation.
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version
> + * 2 of the License, or (at your option) any later version.
> + */
> +#include <linux/types.h>
> +#include <linux/irq.h>
> +#include <linux/debugfs.h>

Unused? If nothing here uses debugfs, please drop the <linux/debugfs.h> include.

> +#include <linux/smp.h>
> +#include <linux/interrupt.h>
> +#include <linux/seq_file.h>

Unused? If nothing here uses seq_file, please drop the <linux/seq_file.h> include.

> +#include <linux/init.h>
> +#include <linux/of.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/delay.h>
> +#include <linux/cpumask.h>
> +#include <linux/mm.h>
> +
> +#include <asm/prom.h>
> +#include <asm/io.h>
> +#include <asm/smp.h>
> +#include <asm/irq.h>
> +#include <asm/errno.h>
> +#include <asm/xive.h>
> +#include <asm/opal.h>
> +
> +#include "xive-regs.h"
> +#include "xive-internal.h"
> +
> +#define DBG(fmt...)	pr_devel("XIVE: " fmt)
> +
> +/* Enable this for using queue MMIO page for EOI. We don't currently
> + * use it as we always notify
> + */
> +#undef USE_QUEUE_MMIO

Dead code? Or do we want to keep it? If it's intended for later use, a comment saying so would help.


> +static u32 xive_provision_size;
> +static u32 *xive_provision_chips;
> +static u32 xive_provision_chip_count;
> +static u32 xive_queue_shift;
> +static u32 xive_pool_vps = XIVE_INVALID_VP;
> +static struct kmem_cache *xive_provision_cache;
> +
> +int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data)
> +{
> +	__be64 flags, eoi_page, trig_page;
> +	__be32 esb_shift, src_chip;
> +	u64 opal_flags;
> +	s64 rc;
> +
> +	memset(data, 0, sizeof(*data));
> +
> +	rc = opal_xive_get_irq_info(hw_irq, &flags, &eoi_page, &trig_page,
> +				    &esb_shift, &src_chip);
> +	if (rc) {
> +		pr_err("XIVE: opal_xive_get_irq_info(0x%x) returned %lld\n",
> +		       hw_irq, rc);
> +		return -EINVAL;
> +	}
> +
> +	opal_flags = be64_to_cpu(flags);
> +	if (opal_flags & OPAL_XIVE_IRQ_STORE_EOI)
> +		data->flags |= XIVE_IRQ_FLAG_STORE_EOI;
> +	if (opal_flags & OPAL_XIVE_IRQ_LSI)
> +		data->flags |= XIVE_IRQ_FLAG_LSI;
> +	if (opal_flags & OPAL_XIVE_IRQ_SHIFT_BUG)
> +		data->flags |= XIVE_IRQ_FLAG_SHIFT_BUG;
> +	if (opal_flags & OPAL_XIVE_IRQ_MASK_VIA_FW)
> +		data->flags |= XIVE_IRQ_FLAG_MASK_FW;
> +	if (opal_flags & OPAL_XIVE_IRQ_EOI_VIA_FW)
> +		data->flags |= XIVE_IRQ_FLAG_EOI_FW;
> +	data->eoi_page = be64_to_cpu(eoi_page);
> +	data->trig_page = be64_to_cpu(trig_page);
> +	data->esb_shift = be32_to_cpu(esb_shift);
> +	data->src_chip = be32_to_cpu(src_chip);
> +
> +	data->eoi_mmio = ioremap(data->eoi_page, 1u << data->esb_shift);
> +	if (!data->eoi_mmio) {
> +		pr_err("XIVE: Failed to map EOI page for irq 0x%x\n", hw_irq);
> +		return -ENOMEM;
> +	}
> +
> +	if (!data->trig_page)
> +		return 0;
> +	if (data->trig_page == data->eoi_page) {
> +		data->trig_mmio = data->eoi_mmio;
> +		return 0;
> +	}
> +
> +	data->trig_mmio = ioremap(data->trig_page, 1u << data->esb_shift);
> +	if (!data->trig_mmio) {
> +		pr_err("XIVE: Failed to map trigger page for irq 0x%x\n", hw_irq);
> +		return -ENOMEM;
> +	}
> +	return 0;
> +}
> +
> +int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq)
> +{
> +	s64 rc;
> +
> +	for (;;) {
> +		rc = opal_xive_set_irq_config(hw_irq, target, prio, sw_irq);
> +		if (rc != OPAL_BUSY)
> +			break;
> +		msleep(1);
> +	}
> +	return rc == 0 ? 0 : -ENXIO;
> +}
> +
> +/* This can be called multiple time to change a queue configuration */
> +int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio,
> +				__be32 *qpage, u32 order, bool can_escalate)
> +{
> +	s64 rc = 0;
> +	__be64 qeoi_page_be;
> +	__be32 esc_irq_be;
> +	u64 flags, qpage_phys;
> +
> +	/* If there's an actual queue page, clean it */
> +	if (order) {
> +		BUG_ON(!qpage);

Can't we just return an error?

> +		qpage_phys = __pa(qpage);
> +	} else
> +		qpage_phys = 0;
> +
> +	/* Initialize the rest of the fields */
> +	q->msk = order ? ((1u << (order - 2)) - 1) : 0;
> +	q->idx = 0;
> +	q->toggle = 0;
> +
> +	rc = opal_xive_get_queue_info(vp_id, prio, NULL, NULL,
> +				      &qeoi_page_be,
> +				      &esc_irq_be,
> +				      NULL);
> +	if (rc) {
> +		pr_err("XIVE: Error %lld getting queue info prio %d\n",
> +		       rc, prio);
> +		rc = -EIO;
> +		goto fail;
> +	}
> +	q->eoi_phys = be64_to_cpu(qeoi_page_be);
> +
> +#ifdef USE_QUEUE_MMIO
> +	if (!q->eoi_mmio)
> +		q->eoi_mmio = ioremap(q->eoi_phys, PAGE_SIZE);
> +	if (!q->eoi_mmio) {
> +		pr_err("XIVE: Failed to map queue MMIO prio %d CPU %d\n",
> +		       rc, prio, cpu);
> +		rc = -ENOMEM;
> +		goto fail;
> +	}

Also, this pr_err() passes three arguments (rc, prio, cpu) but the
format string only has two %d conversions — looks like the stray "rc,"
should be dropped (or the format string extended).

> +#endif /* USE_QUEUE_MMIO */
> +
> +
...
> +static bool xive_parse_provisioning(struct device_node *np)
> +{
> +	int rc;
> +
> +	if (of_property_read_u32(np, "ibm,xive-provision-page-size",
> +				 &xive_provision_size) < 0)
> +		return true;
> +	rc = of_property_count_elems_of_size(np, "ibm,xive-provision-chips", 4);
> +	if (rc < 0) {
> +		pr_err("XIVE: Error %d getting provision chips array\n", rc);
> +		return false;
> +	}
> +	xive_provision_chip_count = rc;
> +	if (rc == 0)
> +		return true;
> +
> +	xive_provision_chips = kzalloc(4 * xive_provision_chip_count,
> +				       GFP_KERNEL);
> +	BUG_ON(!xive_provision_chips);

return false?

> +
> +	rc = of_property_read_u32_array(np, "ibm,xive-provision-chips",
> +					xive_provision_chips,
> +					xive_provision_chip_count);
...
> diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h
> new file mode 100644
> index 0000000..e736fc5
> --- /dev/null
> +++ b/arch/powerpc/sysdev/xive/xive-internal.h
> @@ -0,0 +1,51 @@

Copyright missing.

> +#ifndef __XIVE_INTERNAL_H
> +#define __XIVE_INTERNAL_H
...
> diff --git a/arch/powerpc/sysdev/xive/xive-regs.h b/arch/powerpc/sysdev/xive/xive-regs.h
> new file mode 100644
> index 0000000..f1edb23
> --- /dev/null
> +++ b/arch/powerpc/sysdev/xive/xive-regs.h
> @@ -0,0 +1,88 @@

Copyright missing.

> +#ifndef __XIVE_REGS_H__
> +#define __XIVE_REGS_H__
...
> diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
> index 16321ad..c71e919 100644
> --- a/arch/powerpc/xmon/xmon.c
> +++ b/arch/powerpc/xmon/xmon.c
...
> +
> +static void dump_one_xive_irq(uint32_t num)

u32?

> +{
> +	int64_t rc;
> +	__be64 vp;
> +	uint8_t prio;

u8?


zzzzz ...

cheers


More information about the Linuxppc-dev mailing list