[Skiboot] [PATCH 6/7] hw: Introduce npu3

Alexey Kardashevskiy aik at ozlabs.ru
Wed Jun 26 16:28:58 AEST 2019



On 13/06/2019 07:08, Reza Arbab wrote:
> POWER9P systems have been upgraded with NVLink 3.0 interconnects. The
> underlying hardware is fundamentally different--each POWER9 chip has
> 
>         (1 NPU) * (3 stacks) * (2 bricks) = (6 links)
> 
> Whereas in each POWER9P chip, there are
> 
>         (3 NPUs) * (4 bricks) = (12 links)
> 
> This flatter hierarchy simplifies the firmware implementation a bit, but
> also prevents sharing much common code with npu2.


Is anything really shared? It looks like all data structures and npu2*.*
files were copied and massively refactored, so now it is impossible to
tell what the actual difference between NPU2 and NPU3 is (besides
different grouping). 95+% of the data structures and registers seem to be
just the same. Not sure sharing the code is worth it, but it definitely
makes it harder to follow what is done here...



> As in previous versions, initialize the hardware and expose each link to
> the OS as a virtual PCIe device. This initial support covers NVLink
> devices only, with OpenCAPI to follow.
> 
> Signed-off-by: Reza Arbab <arbab at linux.ibm.com>
> ---
>  core/init.c             |    1 +
>  hw/Makefile.inc         |    3 +-
>  hw/npu-opal.c           |   38 +-
>  hw/npu3-hw-procedures.c |  801 +++++++++++++++++++++
>  hw/npu3-nvlink.c        | 1841 +++++++++++++++++++++++++++++++++++++++++++++++
>  hw/npu3.c               |  554 ++++++++++++++
>  include/npu3-regs.h     |  247 +++++++
>  include/npu3.h          |  180 +++++
>  include/pci.h           |    1 +
>  include/platform.h      |    4 +-
>  include/skiboot.h       |    1 +
>  include/xscom-p9-regs.h |   19 +
>  12 files changed, 3680 insertions(+), 10 deletions(-)
>  create mode 100644 hw/npu3-hw-procedures.c
>  create mode 100644 hw/npu3-nvlink.c
>  create mode 100644 hw/npu3.c
>  create mode 100644 include/npu3-regs.h
>  create mode 100644 include/npu3.h
> 
> diff --git a/core/init.c b/core/init.c
> index 7e8ba7854dcc..89cef87a44f6 100644
> --- a/core/init.c
> +++ b/core/init.c
> @@ -1247,6 +1247,7 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
>  	/* Probe NPUs */
>  	probe_npu();
>  	probe_npu2();
> +	probe_npu3();
>  
>  	/* Initialize PCI */
>  	pci_init_slots();
> diff --git a/hw/Makefile.inc b/hw/Makefile.inc
> index 2f4f4dabef59..d346c594917c 100644
> --- a/hw/Makefile.inc
> +++ b/hw/Makefile.inc
> @@ -8,7 +8,8 @@ HW_OBJS += dts.o lpc-rtc.o npu.o npu-hw-procedures.o xive.o phb4.o
>  HW_OBJS += fake-nvram.o lpc-mbox.o npu2.o npu2-hw-procedures.o
>  HW_OBJS += npu2-common.o phys-map.o sbe-p9.o capp.o occ-sensor.o vas.o
>  HW_OBJS += npu2-opencapi.o phys-map.o sbe-p9.o capp.o occ-sensor.o
> -HW_OBJS += vas.o sbe-p8.o dio-p9.o cache-p9.o npu-opal.o
> +HW_OBJS += npu-opal.o npu3.o npu3-nvlink.o npu3-hw-procedures.o
> +HW_OBJS += vas.o sbe-p8.o dio-p9.o cache-p9.o
>  HW_OBJS += lpc-port80h.o
>  HW=hw/built-in.a
>  
> diff --git a/hw/npu-opal.c b/hw/npu-opal.c
> index 4195ffa2fc60..b4aebc15c65b 100644
> --- a/hw/npu-opal.c
> +++ b/hw/npu-opal.c
> @@ -18,16 +18,23 @@
>  #include <pci.h>
>  #include <phb4.h>
>  #include <npu2.h>
> +#include <npu3.h>
>  
>  static int64_t opal_npu_init_context(uint64_t phb_id, int pid __unused,
>  				     uint64_t msr, uint64_t bdf)
>  {
>  	struct phb *phb = pci_get_phb(phb_id);
>  
> -	if (!phb || phb->phb_type != phb_type_npu_v2)
> +	if (!phb)
>  		return OPAL_PARAMETER;
>  
> -	return npu2_init_context(phb, msr, bdf);
> +	if (phb->phb_type == phb_type_npu_v2)
> +		return npu2_init_context(phb, msr, bdf);
> +
> +	if (phb->phb_type == phb_type_npu_v3)
> +		return npu3_init_context(phb, msr, bdf);
> +
> +	return OPAL_PARAMETER;
>  }
>  opal_call(OPAL_NPU_INIT_CONTEXT, opal_npu_init_context, 4);
>  
> @@ -36,10 +43,16 @@ static int64_t opal_npu_destroy_context(uint64_t phb_id, uint64_t pid __unused,
>  {
>  	struct phb *phb = pci_get_phb(phb_id);
>  
> -	if (!phb || phb->phb_type != phb_type_npu_v2)
> +	if (!phb)
>  		return OPAL_PARAMETER;
>  
> -	return npu2_destroy_context(phb, bdf);
> +	if (phb->phb_type == phb_type_npu_v2)
> +		return npu2_destroy_context(phb, bdf);
> +
> +	if (phb->phb_type == phb_type_npu_v3)
> +		return npu3_destroy_context(phb, bdf);
> +
> +	return OPAL_PARAMETER;
>  }
>  opal_call(OPAL_NPU_DESTROY_CONTEXT, opal_npu_destroy_context, 3);
>  
> @@ -48,10 +61,16 @@ static int64_t opal_npu_map_lpar(uint64_t phb_id, uint64_t bdf, uint64_t lparid,
>  {
>  	struct phb *phb = pci_get_phb(phb_id);
>  
> -	if (!phb || phb->phb_type != phb_type_npu_v2)
> +	if (!phb)
>  		return OPAL_PARAMETER;
>  
> -	return npu2_map_lpar(phb, bdf, lparid, lpcr);
> +	if (phb->phb_type == phb_type_npu_v2)
> +		return npu2_map_lpar(phb, bdf, lparid, lpcr);
> +
> +	if (phb->phb_type == phb_type_npu_v3)
> +		return npu3_map_lpar(phb, bdf, lparid, lpcr);
> +
> +	return OPAL_PARAMETER;
>  }
>  opal_call(OPAL_NPU_MAP_LPAR, opal_npu_map_lpar, 4);
>  
> @@ -81,10 +100,13 @@ static int64_t npu_set_relaxed_order(uint32_t gcid, int pec, bool enable)
>  	uint64_t rc;
>  
>  	for_each_phb(phb) {
> -		if (phb->phb_type != phb_type_npu_v2)
> +		if (phb->phb_type == phb_type_npu_v2)
> +			rc = npu2_set_relaxed_order(phb, gcid, pec, enable);
> +		else if (phb->phb_type == phb_type_npu_v3)
> +			rc = npu3_set_relaxed_order(phb, gcid, pec, enable);
> +		else
>  			continue;
>  
> -		rc = npu2_set_relaxed_order(phb, gcid, pec, enable);
>  		if (rc)
>  			return rc;
>  	}
> diff --git a/hw/npu3-hw-procedures.c b/hw/npu3-hw-procedures.c
> new file mode 100644
> index 000000000000..42b658d1aab2
> --- /dev/null
> +++ b/hw/npu3-hw-procedures.c
> @@ -0,0 +1,801 @@
> +/* Copyright 2019 IBM Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at
> + *
> + *	http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> + * implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <skiboot.h>
> +#include <npu3.h>
> +#include <npu3-regs.h>
> +#include <timebase.h>
> +#include <xscom.h>
> +#include <xscom-p9-regs.h>
> +
> +#define NPU3DEVLOG(l, dev, fmt, a...)		\
> +	prlog(l, "NPU[%d:%d:%d]: " fmt,		\
> +	      (dev)->npu->chip_id,		\
> +	      (dev)->npu->index,		\
> +	      (dev)->index, ##a)
> +#define NPU3DEVDBG(dev, fmt, a...) NPU3DEVLOG(PR_DEBUG, dev, fmt, ##a)
> +#define NPU3DEVINF(dev, fmt, a...) NPU3DEVLOG(PR_INFO, dev, fmt, ##a)
> +#define NPU3DEVERR(dev, fmt, a...) NPU3DEVLOG(PR_ERR, dev, fmt, ##a)
> +
> +/*
> + * The documentation for the PHY training is written in terms of bits within an
> + * actual register so we use that representation here.
> + */
> +struct npu3_phy_reg {
> +	uint64_t offset;
> +	uint64_t mask;
> +};
> +
> +static struct npu3_phy_reg
> +NPU3_PHY_RX_RUN_LANE			= { 0x0c8, PPC_BIT(48) },
> +NPU3_PHY_RX_IORESET			= { 0x096, PPC_BIT(63) },
> +NPU3_PHY_TX_IORESET			= { 0x113, PPC_BIT(48) },
> +NPU3_PHY_RX_PR_RESET			= { 0x096, PPC_BIT(62) },
> +NPU3_PHY_RX_LANE_ANA_PDWN		= { 0x002, PPC_BIT(54) },
> +NPU3_PHY_RX_LANE_DIG_PDWN		= { 0x088, PPC_BIT(48) },
> +NPU3_PHY_RX_PR_IQ_RES_SEL		= { 0x004, PPC_BITMASK(59, 61) },
> +NPU3_PHY_RX_PR_PHASE_STEP		= { 0x08a, PPC_BITMASK(60, 63) },
> +NPU3_PHY_TX_LANE_PDWN			= { 0x101, PPC_BIT(48) },
> +NPU3_PHY_RX_RUN_DCCAL			= { 0x0c8, PPC_BIT(49) },
> +NPU3_PHY_RX_DCCAL_DONE			= { 0x0ca, PPC_BIT(49) },
> +NPU3_PHY_RX_LANE_BUSY			= { 0x0ca, PPC_BIT(50) },
> +NPU3_PHY_RX_B_BANK_CONTROLS		= { 0x002, PPC_BITMASK(58, 63) },
> +NPU3_PHY_TX_UNLOAD_CLK_DISABLE		= { 0x103, PPC_BIT(56) },
> +NPU3_PHY_TX_FIFO_INIT			= { 0x105, PPC_BIT(53) },
> +NPU3_PHY_TX_RXCAL			= { 0x103, PPC_BIT(57) },
> +NPU3_PHY_RX_INIT_DONE			= { 0x0ca, PPC_BIT(48) },
> +NPU3_PHY_RX_PR_EDGE_TRACK_CNTL		= { 0x092, PPC_BITMASK(48, 49) },
> +NPU3_PHY_RX_PR_FW_OFF			= { 0x08a, PPC_BIT(56) },
> +NPU3_PHY_RX_PR_FW_INERTIA_AMT		= { 0x08a, PPC_BITMASK(57, 59) },
> +NPU3_PHY_RX_CFG_LTE_MC			= { 0x000, PPC_BITMASK(60, 63) },
> +NPU3_PHY_RX_A_INTEG_COARSE_GAIN		= { 0x00a, PPC_BITMASK(48, 51) },
> +NPU3_PHY_RX_B_INTEG_COARSE_GAIN		= { 0x026, PPC_BITMASK(48, 51) },
> +NPU3_PHY_RX_E_INTEG_COARSE_GAIN		= { 0x030, PPC_BITMASK(48, 51) },
> +
> +/* These registers are per-PHY, not per lane */
> +NPU3_PHY_TX_ZCAL_SWO_EN			= { 0x3c9, PPC_BIT(48) },
> +NPU3_PHY_TX_ZCAL_REQ			= { 0x3c1, PPC_BIT(49) },
> +NPU3_PHY_TX_ZCAL_DONE			= { 0x3c1, PPC_BIT(50) },
> +NPU3_PHY_TX_ZCAL_ERROR			= { 0x3c1, PPC_BIT(51) },
> +NPU3_PHY_TX_ZCAL_N			= { 0x3c3, PPC_BITMASK(48, 56) },
> +NPU3_PHY_TX_ZCAL_P			= { 0x3c5, PPC_BITMASK(48, 56) },
> +NPU3_PHY_TX_PSEG_PRE_EN			= { 0x34d, PPC_BITMASK(51, 55) },
> +NPU3_PHY_TX_PSEG_PRE_SELECT		= { 0x34d, PPC_BITMASK(56, 60) },
> +NPU3_PHY_TX_NSEG_PRE_EN			= { 0x34f, PPC_BITMASK(51, 55) },
> +NPU3_PHY_TX_NSEG_PRE_SELECT		= { 0x34f, PPC_BITMASK(56, 60) },
> +NPU3_PHY_TX_PSEG_POST_EN		= { 0x361, PPC_BITMASK(49, 55) },
> +NPU3_PHY_TX_PSEG_POST_SELECT		= { 0x361, PPC_BITMASK(56, 62) },
> +NPU3_PHY_TX_NSEG_POST_EN		= { 0x363, PPC_BITMASK(49, 55) },
> +NPU3_PHY_TX_NSEG_POST_SELECT		= { 0x363, PPC_BITMASK(56, 62) },
> +NPU3_PHY_TX_PSEG_MARGINPU_EN		= { 0x351, PPC_BITMASK(48, 55) },
> +NPU3_PHY_TX_NSEG_MARGINPU_EN		= { 0x353, PPC_BITMASK(48, 55) },
> +NPU3_PHY_TX_PSEG_MARGINPD_EN		= { 0x351, PPC_BITMASK(56, 63) },
> +NPU3_PHY_TX_NSEG_MARGINPD_EN		= { 0x353, PPC_BITMASK(56, 63) },
> +NPU3_PHY_TX_MARGINPU_SELECT		= { 0x355, PPC_BITMASK(48, 55) },
> +NPU3_PHY_TX_MARGINPD_SELECT		= { 0x355, PPC_BITMASK(56, 63) },
> +NPU3_PHY_TX_PSEG_MAIN_EN		= { 0x357, PPC_BITMASK(51, 57) },
> +NPU3_PHY_TX_NSEG_MAIN_EN		= { 0x359, PPC_BITMASK(51, 57) },
> +NPU3_PHY_RX_CLKDIST_PDWN		= { 0x204, PPC_BITMASK(48, 50) },
> +NPU3_PHY_RX_IREF_PDWN			= { 0x230, PPC_BIT(54) },
> +NPU3_PHY_TX_CLKDIST_PDWN		= { 0x305, PPC_BITMASK(48, 50) },
> +NPU3_PHY_RX_CTL_DATASM_CLKDIST_PDWN	= { 0x2e0, PPC_BIT(60) };
> +
> +static uint64_t npu3_phy_scom(struct npu3_dev *dev, struct npu3_phy_reg *reg,
> +			      int lane)
> +{
> +	uint64_t scom;
> +
> +	/* Don't specify a lane for a non-per-lane register */
> +	if (lane >= 0)
> +		assert(reg->offset < 0x200);
> +	else
> +		assert(reg->offset >= 0x200);
> +
> +	scom = OB_INDIRECT(dev->ob_chiplet);
> +	scom = SETFIELD(PPC_BITMASK(12, 21), scom, reg->offset);
> +
> +	if (lane > 0)
> +		scom = SETFIELD(PPC_BITMASK(27, 31), scom, lane);
> +
> +	return scom;
> +}
> +
> +static void npu3_phy_write_lane(struct npu3_dev *dev, struct npu3_phy_reg *reg,
> +				int lane, uint64_t val)
> +{
> +	struct npu3 *npu = dev->npu;
> +	uint64_t scom, scom_val;
> +
> +	scom = npu3_phy_scom(dev, reg, lane);
> +
> +	xscom_read(npu->chip_id, scom, &scom_val);
> +	scom_val = SETFIELD(reg->mask, scom_val, val);
> +	xscom_write(npu->chip_id, scom, scom_val);
> +}
> +
> +static uint64_t npu3_phy_read_lane(struct npu3_dev *dev,
> +				   struct npu3_phy_reg *reg,
> +				   int lane)
> +{
> +	struct npu3 *npu = dev->npu;
> +	uint64_t scom, scom_val;
> +
> +	scom = npu3_phy_scom(dev, reg, lane);
> +	xscom_read(npu->chip_id, scom, &scom_val);
> +
> +	return GETFIELD(reg->mask, scom_val);
> +}
> +
> +static inline void npu3_phy_write(struct npu3_dev *dev,
> +				  struct npu3_phy_reg *reg,
> +				  uint64_t val)
> +{
> +	npu3_phy_write_lane(dev, reg, -1, val);
> +}
> +
> +static inline uint64_t npu3_phy_read(struct npu3_dev *dev,
> +				     struct npu3_phy_reg *reg)
> +{
> +	return npu3_phy_read_lane(dev, reg, -1);
> +}
> +
> +struct procedure {
> +	const char *name;
> +	uint32_t (*steps[])(struct npu3_dev *);
> +};
> +
> +#define DEFINE_PROCEDURE(NAME, STEPS...)	\
> +static struct procedure procedure_##NAME = {	\
> +	.name = #NAME,				\
> +	.steps = { NAME, ##STEPS }		\
> +}
> +
> +static uint32_t stop(struct npu3_dev *npu_dev __unused)
> +{
> +	return NPU3_PROC_COMPLETE | NPU3_PROC_ABORTED;
> +}
> +
> +DEFINE_PROCEDURE(stop);
> +
> +static uint32_t nop(struct npu3_dev *npu_dev __unused)
> +{
> +	return NPU3_PROC_COMPLETE;
> +}
> +
> +DEFINE_PROCEDURE(nop);
> +
> +static void set_iovalid(struct npu3_dev *dev, bool raise)
> +{
> +	struct npu3 *npu = dev->npu;
> +	uint64_t reg, val;
> +
> +	reg = OB_CPLT_CONF1(dev->ob_chiplet);
> +
> +	xscom_read(npu->chip_id, reg, &val);
> +	val = SETFIELD(OB_CPLT_CONF1_NV_IOVALID(dev->index), val, raise);
> +	xscom_write(npu->chip_id, reg, val);
> +}
> +
> +#define NPU3_PHY_LANES 24
> +
> +#define npu3_for_each_lane(lane, dev)				\
> +	for (lane = 0; lane < NPU3_PHY_LANES; lane++)		\
> +		if (dev->phy_lane_mask & PPC_BIT32(lane))	\
> +
> +static uint32_t phy_reset(struct npu3_dev *dev)
> +{
> +	uint32_t lane;
> +
> +	set_iovalid(dev, false);
> +
> +	npu3_for_each_lane(lane, dev)
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_RUN_LANE, lane, 0);
> +
> +	return NPU3_PROC_NEXT;
> +}
> +
> +static uint32_t phy_reset_wait(struct npu3_dev *dev)
> +{
> +	int lane;
> +
> +	/* Wait for all lanes to become inactive */
> +	npu3_for_each_lane(lane, dev)
> +		if (npu3_phy_read_lane(dev, &NPU3_PHY_RX_LANE_BUSY, lane))
> +			return NPU3_PROC_INPROGRESS;
> +
> +	npu3_for_each_lane(lane, dev) {
> +		/* Set lane in reset */
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_IORESET, lane, 1);
> +		npu3_phy_write_lane(dev, &NPU3_PHY_TX_IORESET, lane, 1);
> +
> +		/* Release lane from reset */
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_IORESET, lane, 0);
> +		npu3_phy_write_lane(dev, &NPU3_PHY_TX_IORESET, lane, 0);
> +
> +		/* Reset the phase rotator */
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_RESET, lane, 1);
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_RESET, lane, 0);
> +	}
> +
> +	return NPU3_PROC_NEXT;
> +}
> +
> +/* Procedure 1.2.3 - Initialise I/O PHY Registers */
> +static uint32_t phy_reset_complete(struct npu3_dev *dev)
> +{
> +	int lane;
> +
> +	npu3_for_each_lane(lane, dev) {
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_LANE_ANA_PDWN, lane, 0);
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_LANE_DIG_PDWN, lane, 0);
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_IQ_RES_SEL, lane, 7);
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_PHASE_STEP, lane, 0xc);
> +		npu3_phy_write_lane(dev, &NPU3_PHY_TX_LANE_PDWN, lane, 0);
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_FW_INERTIA_AMT, lane, 4);
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_CFG_LTE_MC, lane, 3);
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_A_INTEG_COARSE_GAIN, lane, 11);
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_B_INTEG_COARSE_GAIN, lane, 11);
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_E_INTEG_COARSE_GAIN, lane, 11);
> +	}
> +
> +	set_iovalid(dev, true);
> +
> +	return NPU3_PROC_COMPLETE;
> +}
> +
> +DEFINE_PROCEDURE(phy_reset, phy_reset_wait, phy_reset_complete);
> +
> +/* Procedure 1.2.6 - I/O PHY Tx Impedance Calibration */
> +static uint32_t phy_tx_zcal(struct npu3_dev *dev)
> +{
> +	if (dev->npu->tx_zcal_complete)
> +		return NPU3_PROC_COMPLETE;
> +
> +	/* Turn off SW enable and enable zcal state machine */
> +	npu3_phy_write(dev, &NPU3_PHY_TX_ZCAL_SWO_EN, 0);
> +
> +	/* Start impedance calibration state machine */
> +	npu3_phy_write(dev, &NPU3_PHY_TX_ZCAL_REQ, 1);
> +
> +	return NPU3_PROC_NEXT;
> +}
> +
> +static uint32_t phy_tx_zcal_wait(struct npu3_dev *dev)
> +{
> +	if (npu3_phy_read(dev, &NPU3_PHY_TX_ZCAL_ERROR))
> +		return NPU3_PROC_COMPLETE | NPU3_PROC_FAILED;
> +
> +	if (!npu3_phy_read(dev, &NPU3_PHY_TX_ZCAL_DONE))
> +		return NPU3_PROC_INPROGRESS;
> +
> +	return NPU3_PROC_NEXT;
> +}
> +
> +#define MARGIN_RATIO		0
> +#define FFE_PRE_COEFF		0
> +#define FFE_POST_COEFF		0
> +
> +#define PRE_WIDTH		5
> +#define POST_WIDTH		7
> +#define MAIN_WIDTH		7
> +#define ZCAL_MIN		(16 * 2)
> +#define ZCAL_MAX		(33 * 2)
> +#define PRECURSOR_X2_MAX	(4 * 2 + 1)
> +#define POSTCURSOR_X2_MAX	(6 * 2 + 1)
> +#define MARGIN_X2_MAX		(8 * 2)
> +#define MAIN_X2_MAX		(6 * 2 + 1)
> +#define TOTAL_X2_MAX		(PRECURSOR_X2_MAX + POSTCURSOR_X2_MAX + \
> +				 2 * MARGIN_X2_MAX + MAIN_X2_MAX)
> +
> +static uint32_t therm(uint32_t dec)
> +{
> +	return (0x1 << dec) - 1;
> +}
> +
> +static uint32_t therm_with_half(uint32_t dec, uint8_t width)
> +{
> +	/* If the LSB of the 2r equivalent is on, then we need to set the 2r bit (MSB) */
> +	uint32_t half_on = (dec & 0x1) << (width - 1);
> +
> +	/* Shift the 2r equivalent to a 1r value and convert to a thermometer code. */
> +	uint32_t x1_equiv = ((1 << (dec >> 1)) - 1);
> +
> +	/* Combine 1r equivalent thermometer code + the 2r MSB value. */
> +	return half_on | x1_equiv;
> +}
> +
> +static uint32_t phy_tx_zcal_calculate(struct npu3_dev *dev)
> +{
> +	int p_value, n_value;
> +	uint32_t zcal_n;
> +	uint32_t zcal_p;
> +	uint32_t p_main_enable = MAIN_X2_MAX;
> +	uint32_t p_margin_pu_enable = MARGIN_X2_MAX;
> +	uint32_t p_margin_pd_enable = MARGIN_X2_MAX;
> +	uint32_t p_precursor_select;
> +	uint32_t p_postcursor_select;
> +	uint32_t margin_pu_select;
> +	uint32_t n_main_enable = MAIN_X2_MAX;
> +	uint32_t n_margin_pu_enable = MARGIN_X2_MAX;
> +	uint32_t n_margin_pd_enable = MARGIN_X2_MAX;
> +	uint32_t n_precursor_select;
> +	uint32_t n_postcursor_select;
> +	uint32_t margin_pd_select;
> +	uint32_t margin_select;
> +
> +	/* Convert the value from 8R to 2R by / 4 */
> +	zcal_n = npu3_phy_read(dev, &NPU3_PHY_TX_ZCAL_N) / 4;
> +	zcal_p = npu3_phy_read(dev, &NPU3_PHY_TX_ZCAL_P) / 4;
> +
> +	/* Again, if the hardware detects an unexpected condition it's
> +	 * better just to fail loudly. */
> +	if (zcal_n < ZCAL_MIN || zcal_n > ZCAL_MAX ||
> +	    zcal_p < ZCAL_MIN || zcal_p > ZCAL_MAX)
> +		return NPU3_PROC_COMPLETE | NPU3_PROC_FAILED;
> +
> +	p_value = zcal_p - TOTAL_X2_MAX;
> +	p_precursor_select = p_value * FFE_PRE_COEFF / 128;
> +	p_postcursor_select = p_value * FFE_POST_COEFF / 128;
> +	margin_pu_select = p_value * MARGIN_RATIO / 256;
> +
> +	if (p_value % 2) {
> +		p_main_enable--;
> +		p_value++;
> +	}
> +
> +	while (p_value < 0) {
> +		if (p_main_enable > 1) {
> +			p_main_enable -= 2;
> +		} else if (p_margin_pu_enable + p_margin_pd_enable > 0) {
> +			if (p_margin_pu_enable == p_margin_pd_enable)
> +				p_margin_pd_enable -= 2;
> +			else
> +				p_margin_pu_enable -= 2;
> +		}
> +		p_value += 2;
> +	}
> +
> +	n_value = zcal_n - TOTAL_X2_MAX;
> +	n_precursor_select = n_value * FFE_PRE_COEFF / 128;
> +	n_postcursor_select = n_value * FFE_POST_COEFF / 128;
> +	margin_pd_select = p_value * MARGIN_RATIO / 256;
> +
> +	if (n_value % 2) {
> +		n_main_enable--;
> +		n_value++;
> +	}
> +
> +	while (n_value < 0) {
> +		if (n_main_enable > 1) {
> +			n_main_enable -= 2;
> +		} else if (n_margin_pu_enable + n_margin_pd_enable > 0) {
> +			if (n_margin_pu_enable == n_margin_pd_enable)
> +				n_margin_pd_enable -= 2;
> +			else
> +				n_margin_pu_enable -= 2;
> +		}
> +		n_value += 2;
> +	}
> +
> +	margin_select = therm((margin_pu_select + 1) / 2) &
> +			therm((margin_pd_select + 1) / 2) &
> +			therm((p_margin_pu_enable + 1) / 2) &
> +			therm((p_margin_pd_enable + 1) / 2) &
> +			therm((n_margin_pu_enable + 1) / 2) &
> +			therm((n_margin_pd_enable + 1) / 2);
> +
> +	npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_PRE_EN,      therm_with_half(PRECURSOR_X2_MAX, PRE_WIDTH));
> +	npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_PRE_SELECT,  therm_with_half(p_precursor_select, PRE_WIDTH));
> +	npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_POST_EN,     therm_with_half(POSTCURSOR_X2_MAX, POST_WIDTH));
> +	npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_POST_SELECT, therm_with_half(p_postcursor_select, POST_WIDTH));
> +	npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_MARGINPU_EN, therm((p_margin_pu_enable + 1) / 2));
> +	npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_MARGINPD_EN, therm((p_margin_pd_enable + 1) / 2));
> +	npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_MAIN_EN,     therm_with_half(p_main_enable, MAIN_WIDTH));
> +
> +	npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_PRE_EN,      therm_with_half(PRECURSOR_X2_MAX, PRE_WIDTH));
> +	npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_PRE_SELECT,  therm_with_half(n_precursor_select, PRE_WIDTH));
> +	npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_POST_EN,     therm_with_half(POSTCURSOR_X2_MAX, POST_WIDTH));
> +	npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_POST_SELECT, therm_with_half(n_postcursor_select, POST_WIDTH));
> +	npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_MARGINPU_EN, therm((n_margin_pu_enable + 1) / 2));
> +	npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_MARGINPD_EN, therm((n_margin_pd_enable + 1) / 2));
> +	npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_MAIN_EN,     therm_with_half(n_main_enable, MAIN_WIDTH));
> +
> +	npu3_phy_write(dev, &NPU3_PHY_TX_MARGINPU_SELECT,  therm(margin_select + 1) / 2);
> +	npu3_phy_write(dev, &NPU3_PHY_TX_MARGINPD_SELECT,  therm(margin_select + 1) / 2);
> +
> +	dev->npu->tx_zcal_complete = true;
> +
> +	return NPU3_PROC_COMPLETE;
> +}
> +
> +DEFINE_PROCEDURE(phy_tx_zcal, phy_tx_zcal_wait, phy_tx_zcal_calculate);
> +
> +/* Procedure 1.2.4 - I/O PHY DC Calibration */
> +static uint32_t phy_rx_dccal(struct npu3_dev *dev)
> +{
> +	int lane;
> +
> +	set_iovalid(dev, false);
> +
> +	npu3_for_each_lane(lane, dev)
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_FW_OFF, lane, 1);
> +
> +	npu3_for_each_lane(lane, dev)
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_RUN_DCCAL, lane, 1);
> +
> +	return NPU3_PROC_NEXT;
> +}
> +
> +static uint32_t phy_rx_dccal_complete(struct npu3_dev *dev)
> +{
> +	int lane;
> +
> +	npu3_for_each_lane(lane, dev)
> +		if (!npu3_phy_read_lane(dev, &NPU3_PHY_RX_DCCAL_DONE, lane))
> +			return NPU3_PROC_INPROGRESS;
> +
> +	npu3_for_each_lane(lane, dev)
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_RUN_DCCAL, lane, 0);
> +
> +	npu3_for_each_lane(lane, dev) {
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_B_BANK_CONTROLS, lane, 0);
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_EDGE_TRACK_CNTL, lane, 0);
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_FW_OFF, lane, 0);
> +	}
> +
> +	return NPU3_PROC_NEXT;
> +}
> +
> +/* Procedure 1.2.5 - IO PHY Tx FIFO Init */
> +static uint32_t phy_tx_fifo_init(struct npu3_dev *dev)
> +{
> +	int lane;
> +
> +	npu3_for_each_lane(lane, dev) {
> +		npu3_phy_write_lane(dev, &NPU3_PHY_TX_UNLOAD_CLK_DISABLE, lane, 0);
> +		npu3_phy_write_lane(dev, &NPU3_PHY_TX_FIFO_INIT, lane, 1);
> +		npu3_phy_write_lane(dev, &NPU3_PHY_TX_UNLOAD_CLK_DISABLE, lane, 1);
> +	}
> +
> +	set_iovalid(dev, true);
> +
> +	return NPU3_PROC_COMPLETE;
> +}
> +
> +DEFINE_PROCEDURE(phy_rx_dccal, phy_rx_dccal_complete, phy_tx_fifo_init);
> +
> +/* Procedure 1.2.8 - Enable Downstream Link Training */
> +static uint32_t phy_enable_tx_rxcal(struct npu3_dev *dev)
> +{
> +	int lane;
> +
> +	npu3_for_each_lane(lane, dev)
> +		npu3_phy_write_lane(dev, &NPU3_PHY_TX_RXCAL, lane, 1);
> +
> +	return NPU3_PROC_COMPLETE;
> +}
> +DEFINE_PROCEDURE(phy_enable_tx_rxcal);
> +
> +/* Procedure 1.2.9 - Disable Downstream Link Training */
> +static uint32_t phy_disable_tx_rxcal(struct npu3_dev *dev)
> +{
> +	int lane;
> +
> +	npu3_for_each_lane(lane, dev)
> +		npu3_phy_write_lane(dev, &NPU3_PHY_TX_RXCAL, lane, 0);
> +
> +	return NPU3_PROC_COMPLETE;
> +}
> +DEFINE_PROCEDURE(phy_disable_tx_rxcal);
> +
> +/* Procedure 1.2.7 - I/O PHY Upstream Link Training */
> +static uint32_t phy_rx_training(struct npu3_dev *dev)
> +{
> +	int lane;
> +
> +	npu3_for_each_lane(lane, dev)
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_RUN_LANE, lane, 1);
> +
> +	return NPU3_PROC_NEXT;
> +}
> +
> +static uint32_t phy_rx_training_wait(struct npu3_dev *dev)
> +{
> +	int lane;
> +
> +	npu3_for_each_lane(lane, dev)
> +		if (!npu3_phy_read_lane(dev, &NPU3_PHY_RX_INIT_DONE, lane))
> +			return NPU3_PROC_INPROGRESS;
> +
> +	return NPU3_PROC_COMPLETE;
> +}
> +
> +DEFINE_PROCEDURE(phy_rx_training, phy_rx_training_wait);
> +
> +static void npu3_dev_fence_set(struct npu3_dev *dev, uint8_t state)
> +{
> +	struct npu3 *npu = dev->npu;
> +	uint64_t val;
> +
> +	val = npu3_read(npu, NPU3_NTL_MISC_CFG1(dev->index));
> +	val = SETFIELD(NPU3_NTL_MISC_CFG1_NTL_RESET, val, state);
> +	npu3_write(npu, NPU3_NTL_MISC_CFG1(dev->index), val);
> +}
> +
> +static uint8_t npu3_dev_fence_get(struct npu3_dev *dev)
> +{
> +	uint64_t val;
> +
> +	val = npu3_read(dev->npu, NPU3_NTL_CQ_FENCE_STATUS(dev->index));
> +	return GETFIELD(NPU3_NTL_CQ_FENCE_STATUS_FIELD, val);
> +}
> +
> +/* Procedure 1.2.1 - Reset NPU/NDL */
> +static uint32_t reset_ntl(struct npu3_dev *dev)
> +{
> +	struct npu3 *npu = dev->npu;
> +	uint64_t val;
> +	int lane;
> +
> +	set_iovalid(dev, true);
> +
> +	/* Power on clocks */
> +	npu3_phy_write(dev, &NPU3_PHY_RX_CLKDIST_PDWN, 0);
> +	npu3_phy_write(dev, &NPU3_PHY_RX_IREF_PDWN, 1);
> +	npu3_phy_write(dev, &NPU3_PHY_TX_CLKDIST_PDWN, 0);
> +	npu3_phy_write(dev, &NPU3_PHY_RX_CTL_DATASM_CLKDIST_PDWN, 0);
> +
> +	npu3_for_each_lane(lane, dev) {
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_LANE_ANA_PDWN, lane, 0);
> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_LANE_DIG_PDWN, lane, 0);
> +		npu3_phy_write_lane(dev, &NPU3_PHY_TX_LANE_PDWN, lane, 0);
> +	}
> +
> +	/* Write PRI */
> +	val = SETFIELD(NPU3_NTL_PRI_CFG_NDL, 0ull, dev->index);
> +	npu3_write(npu, NPU3_NTL_PRI_CFG(dev->index), val);
> +
> +	/* Disable RX parity checking */
> +	val = npu3_read(npu, NPU3_NTL_MISC_CFG2(dev->index));
> +	val &= ~NPU3_NTL_MISC_CFG2_NDL_RX_PARITY_ENA;
> +	npu3_write(npu, NPU3_NTL_MISC_CFG2(dev->index), val);
> +
> +	if (dev->type == NPU3_DEV_TYPE_NVLINK)
> +		npu3_pvd_flag_clear(dev, NPU3_DEV_DL_RESET);
> +
> +	npu3_dev_fence_set(dev, NPU3_NTL_CQ_FENCE_STATUS_FULL);
> +
> +	return NPU3_PROC_NEXT;
> +}
> +
> +static uint32_t reset_ndl(struct npu3_dev *dev)
> +{
> +	struct npu3 *npu = dev->npu;
> +	uint64_t reg;
> +	uint32_t val32;
> +
> +	if (npu3_dev_fence_get(dev) != NPU3_NTL_CQ_FENCE_STATUS_FULL)
> +		return NPU3_PROC_INPROGRESS;
> +
> +	reg = NPU3_DLPL_CTL(dev->index);
> +	val32 = npu3_read_4b(npu, reg);
> +	val32 |= NPU3_DLPL_CTL_RESET_RX | NPU3_DLPL_CTL_RESET_MISC;
> +	npu3_write_4b(npu, reg, val32);
> +
> +	val32 = npu3_read_4b(npu, reg);
> +	val32 &= ~(NPU3_DLPL_CTL_RESET_RX | NPU3_DLPL_CTL_RESET_MISC);
> +	npu3_write_4b(npu, reg, val32);
> +
> +	reg = NPU3_DLPL_CFG(dev->index);
> +	val32 = NPU3_DLPL_CFG_PRI_BYTESWAP;
> +	npu3_write_4b(npu, reg, val32);
> +
> +	/* Clear FIR bits */
> +	for (uint32_t i = 0; i < NPU3_FIR_MAX; i++)
> +		xscom_write(npu->chip_id, npu->xscom_base + NPU3_FIR(i), 0ull);
> +
> +	npu3_dev_fence_set(dev, NPU3_NTL_CQ_FENCE_STATUS_HALF);
> +
> +	return NPU3_PROC_NEXT;
> +}
> +
> +static uint32_t reset_ntl_release(struct npu3_dev *dev)
> +{
> +	struct npu3 *npu = dev->npu;
> +	uint32_t i = dev->index;
> +
> +	if (npu3_dev_fence_get(dev) != NPU3_NTL_CQ_FENCE_STATUS_HALF)
> +		return NPU3_PROC_INPROGRESS;
> +
> +	/* Credit setup */
> +	npu3_write(npu, NPU3_NTL_CREQ_HDR_CRED_SND(i), 0x0200000000000000);
> +	npu3_write(npu, NPU3_NTL_PRB_HDR_CRED_SND(i),  0x0200000000000000);
> +	npu3_write(npu, NPU3_NTL_ATR_HDR_CRED_SND(i),  0x0200000000000000);
> +	npu3_write(npu, NPU3_NTL_RSP_HDR_CRED_SND(i),  0x0200000000000000);
> +	npu3_write(npu, NPU3_NTL_CREQ_DAT_CRED_SND(i), 0x1000000000000000);
> +	npu3_write(npu, NPU3_NTL_RSP_DAT_CRED_SND(i),  0x1000000000000000);
> +
> +	npu3_write(npu, NPU3_NTL_CREQ_HDR_CRED_RCV(i), 0x0000be0000000000);
> +	npu3_write(npu, NPU3_NTL_DGD_HDR_CRED_RCV(i),  0x0000640000000000);
> +	npu3_write(npu, NPU3_NTL_ATSD_HDR_CRED_RCV(i), 0x0000200000000000);
> +	npu3_write(npu, NPU3_NTL_RSP_HDR_CRED_RCV(i),  0x0000be0000000000);
> +	npu3_write(npu, NPU3_NTL_CREQ_DAT_CRED_RCV(i), 0x0001000000000000);
> +	npu3_write(npu, NPU3_NTL_RSP_DAT_CRED_RCV(i),  0x0001000000000000);
> +
> +	npu3_dev_fence_set(dev, NPU3_NTL_CQ_FENCE_STATUS_NONE);
> +
> +	return NPU3_PROC_NEXT;
> +}
> +
> +static uint32_t reset_ntl_finish(struct npu3_dev *dev) {
> +	struct npu3 *npu = dev->npu;
> +	uint64_t val;
> +
> +	if (npu3_dev_fence_get(dev) != NPU3_NTL_CQ_FENCE_STATUS_NONE)
> +		return NPU3_PROC_INPROGRESS;
> +
> +	/* Enable RX parity checking */
> +	val = npu3_read(npu, NPU3_NTL_MISC_CFG2(dev->index));
> +	val |= NPU3_NTL_MISC_CFG2_NDL_RX_PARITY_ENA;
> +	npu3_write(npu, NPU3_NTL_MISC_CFG2(dev->index), val);
> +
> +	if (dev->type == NPU3_DEV_TYPE_NVLINK)
> +		npu3_pvd_flag_set(dev, NPU3_DEV_DL_RESET);
> +
> +	return NPU3_PROC_COMPLETE;
> +}
> +
> +DEFINE_PROCEDURE(reset_ntl, reset_ndl, reset_ntl_release, reset_ntl_finish);
> +
> +static int npu3_dev_regcmp(struct npu3_dev *dev, uint64_t reg,
> +			   const char *reg_name, uint64_t expected)
> +{
> +	uint64_t val;
> +
> +	val = npu3_read(dev->npu, reg);
> +	if (val == expected)
> +		return 0;
> +
> +	NPU3DEVERR(dev, "%s: expected 0x%llx, read 0x%llx\n",
> +		   reg_name, expected, val);
> +
> +	return 1;
> +}
> +
> +#define REGCMP(reg, expected) \
> +	npu3_dev_regcmp(dev, reg(dev->index), #reg, expected);
> +
> +static uint32_t check_credits(struct npu3_dev *dev)
> +{
> +	int rc;
> +
> +	rc  = REGCMP(NPU3_NTL_CREQ_HDR_CRED_RCV, 0x0be0be0000000000ull);
> +	rc |= REGCMP(NPU3_NTL_DGD_HDR_CRED_RCV,  0x0640640000000000ull);
> +	rc |= REGCMP(NPU3_NTL_ATSD_HDR_CRED_RCV, 0x0200200000000000ull);
> +	rc |= REGCMP(NPU3_NTL_RSP_HDR_CRED_RCV,  0x0be0be0000000000ull);
> +	rc |= REGCMP(NPU3_NTL_CREQ_DAT_CRED_RCV, 0x1001000000000000ull);
> +	rc |= REGCMP(NPU3_NTL_RSP_DAT_CRED_RCV,  0x1001000000000000ull);
> +	if (rc)
> +		return NPU3_PROC_COMPLETE | NPU3_PROC_FAILED;
> +
> +	return NPU3_PROC_COMPLETE;
> +}
> +
> +DEFINE_PROCEDURE(check_credits);
> +
> +static struct procedure *procedures[] = {
> +	 [0] = &procedure_stop,
> +	 [1] = &procedure_nop,
> +	 [4] = &procedure_phy_reset,
> +	 [5] = &procedure_phy_tx_zcal,
> +	 [6] = &procedure_phy_rx_dccal,
> +	 [7] = &procedure_phy_enable_tx_rxcal,
> +	 [8] = &procedure_phy_disable_tx_rxcal,
> +	 [9] = &procedure_phy_rx_training,
> +	[10] = &procedure_reset_ntl,
> +	[11] = &procedure_nop, /* Placeholder for pre-terminate */
> +	[12] = &procedure_nop, /* Placeholder for terminate */
> +	[13] = &procedure_check_credits,
> +};
> +
> +void npu3_dev_procedure_init(struct npu3_dev *dev, uint32_t pnum)
> +{
> +	struct npu3_procedure *proc = &dev->proc;
> +	const char *name;
> +
> +	if (pnum >= ARRAY_SIZE(procedures) || !procedures[pnum]) {
> +		NPU3DEVERR(dev, "Unsupported procedure number %d\n", pnum);
> +		proc->status = NPU3_PROC_COMPLETE | NPU3_PROC_UNSUPPORTED;
> +		return;
> +	}
> +
> +	name = procedures[pnum]->name;
> +
> +	if (proc->number == pnum && !(proc->status & NPU3_PROC_COMPLETE))
> +		NPU3DEVINF(dev, "Restarting procedure %s\n", name);
> +	else
> +		NPU3DEVINF(dev, "Starting procedure %s\n", name);
> +
> +	proc->status = NPU3_PROC_INPROGRESS;
> +	proc->number = pnum;
> +	proc->step = 0;
> +	proc->timeout = mftb() + msecs_to_tb(1000);
> +}
> +
> +static uint32_t npu3_dev_procedure_run_step(struct npu3_dev *dev)
> +{
> +	struct npu3_procedure *proc = &dev->proc;
> +	uint32_t result;
> +
> +	result = procedures[proc->number]->steps[proc->step](dev);
> +	if (result & NPU3_PROC_NEXT) {
> +		proc->step++;
> +
> +		NPU3DEVINF(dev, "Running procedure %s step %d\n",
> +			   procedures[proc->number]->name, proc->step);
> +	}
> +
> +	return result;
> +}
> +
> +static void npu3_dev_procedure_run(struct npu3_dev *dev)
> +{
> +	struct npu3_procedure *proc = &dev->proc;
> +	const char *name;
> +	uint32_t result;
> +
> +	do {
> +		result = npu3_dev_procedure_run_step(dev);
> +	} while (result & NPU3_PROC_NEXT);
> +
> +	name = procedures[proc->number]->name;
> +
> +	if (result & NPU3_PROC_COMPLETE) {
> +		NPU3DEVINF(dev, "Procedure %s complete\n", name);
> +	} else if (tb_compare(mftb(), proc->timeout) == TB_AAFTERB) {
> +		NPU3DEVINF(dev, "Procedure %s timed out\n", name);
> +		result = NPU3_PROC_COMPLETE | NPU3_PROC_FAILED;
> +	}
> +
> +	/* Mask off internal state bits */
> +	proc->status = result & NPU3_PROC_STATUS_MASK;
> +}
> +
> +uint32_t npu3_dev_procedure_status(struct npu3_dev *dev)
> +{
> +	/* Run the procedure if not already complete */
> +	if (!(dev->proc.status & NPU3_PROC_COMPLETE))
> +		npu3_dev_procedure_run(dev);
> +
> +	return dev->proc.status;
> +}
> +
> +int64_t npu3_dev_reset(struct npu3_dev *dev)
> +{
> +	unsigned long timeout;
> +
> +	reset_ntl(dev);
> +	timeout = mftb() + msecs_to_tb(1000);
> +
> +	while (npu3_dev_fence_get(dev) != NPU3_NTL_CQ_FENCE_STATUS_FULL) {
> +		if (tb_compare(mftb(), timeout) == TB_AAFTERB) {
> +			NPU3DEVINF(dev, "Device reset timed out\n");
> +			return OPAL_BUSY;
> +		}
> +	}
> +
> +	return OPAL_SUCCESS;
> +}
> diff --git a/hw/npu3-nvlink.c b/hw/npu3-nvlink.c
> new file mode 100644
> index 000000000000..95188f824e0e
> --- /dev/null
> +++ b/hw/npu3-nvlink.c
> @@ -0,0 +1,1841 @@
> +/* Copyright 2019 IBM Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at
> + *
> + *	http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> + * implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <skiboot.h>
> +#include <device.h>
> +#include <phys-map.h>
> +#include <npu3.h>
> +#include <npu3-regs.h>
> +#include <pci-virt.h>
> +#include <xscom.h>
> +#include <xscom-p9-regs.h>
> +#include <interrupts.h>
> +#include <pci-cfg.h>
> +#include <pci-slot.h>
> +#include <cache-p9.h>
> +
> +#define NPU3LOG(l, npu, fmt, a...)		\
> +	prlog(l, "NPU#%04x[%d:%d]: " fmt,	\
> +	      (npu)->nvlink.phb.opal_id,	\
> +	      (npu)->chip_id,			\
> +	      (npu)->index, ##a)
> +#define NPU3DBG(npu, fmt, a...) NPU3LOG(PR_DEBUG, npu, fmt, ##a)
> +#define NPU3INF(npu, fmt, a...) NPU3LOG(PR_INFO, npu, fmt, ##a)
> +#define NPU3ERR(npu, fmt, a...) NPU3LOG(PR_ERR, npu, fmt, ##a)
> +
> +#define NPU3DEVLOG(l, dev, fmt, a...)			\
> +	prlog(l, "NPU#%04x:%02x:%02x.%x " fmt,		\
> +	      (dev)->npu->nvlink.phb.opal_id,		\
> +	      (dev)->nvlink.pvd->bdfn >> 8 & 0xff,	\
> +	      (dev)->nvlink.pvd->bdfn >> 3 & 0x1f,	\
> +	      (dev)->nvlink.pvd->bdfn & 0x7, ##a)
> +#define NPU3DEVDBG(dev, fmt, a...) NPU3DEVLOG(PR_DEBUG, dev, fmt, ##a)
> +#define NPU3DEVINF(dev, fmt, a...) NPU3DEVLOG(PR_INFO, dev, fmt, ##a)
> +#define NPU3DEVERR(dev, fmt, a...) NPU3DEVLOG(PR_ERR, dev, fmt, ##a)
> +
> +#define NPU3_CFG_READ(size, type)					\
> +static int64_t npu3_cfg_read##size(struct phb *phb, uint32_t bdfn,	\
> +				   uint32_t offset, type *data)		\
> +{									\
> +	uint32_t val;							\
> +	int64_t ret;							\
> +									\
> +	ret = pci_virt_cfg_read(phb, bdfn, offset,			\
> +				sizeof(*data), &val);			\
> +	*data = (type)val;						\
> +	return ret;							\
> +}
> +
> +#define NPU3_CFG_WRITE(size, type)					\
> +static int64_t npu3_cfg_write##size(struct phb *phb, uint32_t bdfn,	\
> +				    uint32_t offset, type data)		\
> +{									\
> +	uint32_t val = data;						\
> +	int64_t ret;							\
> +									\
> +	ret = pci_virt_cfg_write(phb, bdfn, offset,			\
> +				 sizeof(data), val);			\
> +	return ret;							\
> +}
> +
> +NPU3_CFG_READ(8, u8);
> +NPU3_CFG_READ(16, u16);
> +NPU3_CFG_READ(32, u32);
> +NPU3_CFG_WRITE(8, u8);
> +NPU3_CFG_WRITE(16, u16);
> +NPU3_CFG_WRITE(32, u32);
> +
> +static int64_t npu3_eeh_freeze_status(struct phb *phb __unused,
> +				      uint64_t pe_num __unused,
> +				      uint8_t *freeze_state,
> +				      uint16_t *pci_error_type,
> +				      uint16_t *severity)
> +{
> +	/*
> +	 * FIXME: When it's called by skiboot PCI config accessor,
> +	 * the PE number is fixed to 0, which is incorrect. We need
> +	 * introduce another PHB callback to translate it. For now,
> +	 * it keeps the skiboot PCI enumeration going.
> +	 */
> +	*freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
> +	*pci_error_type = OPAL_EEH_NO_ERROR;
> +
> +	if (severity)
> +		*severity = OPAL_EEH_SEV_NO_ERROR;
> +
> +	return OPAL_SUCCESS;
> +}
> +
> +/* Number of PEs supported */
> +#define NPU3_MAX_PE_NUM		16
> +#define NPU3_RESERVED_PE_NUM	15
> +
> +static int64_t npu3_ioda_reset(struct phb *phb, bool purge __unused)
> +{
> +	struct npu3 *npu = npu3_phb_to_npu(phb);
> +	uint64_t val;
> +
> +	val = NPU3_ATS_IODA_ADDR_AUTO_INC;
> +	val = SETFIELD(NPU3_ATS_IODA_ADDR_TBL_SEL, val,
> +		       NPU3_ATS_IODA_ADDR_TBL_TVT);
> +	npu3_write(npu, NPU3_ATS_IODA_ADDR, val);
> +
> +	for (uint32_t i = 0; i < NPU3_MAX_PE_NUM; i++)
> +		npu3_write(npu, NPU3_ATS_IODA_DATA, 0ull);
> +
> +	return OPAL_SUCCESS;
> +}
> +
> +static inline void npu3_ioda_sel(struct npu3 *npu, uint32_t table,
> +				 uint32_t index)
> +{
> +	uint64_t val;
> +
> +	val = SETFIELD(NPU3_ATS_IODA_ADDR_TBL_SEL, 0ull, table);
> +	val = SETFIELD(NPU3_ATS_IODA_ADDR_TBL_ADDR, val, index);
> +	npu3_write(npu, NPU3_ATS_IODA_ADDR, val);
> +}
> +
> +static int64_t npu3_map_pe_dma_window(struct phb *phb,
> +				      uint64_t pe_num,
> +				      uint16_t window_id,
> +				      uint16_t tce_levels,
> +				      uint64_t tce_table_addr,
> +				      uint64_t tce_table_size,
> +				      uint64_t tce_page_size)
> +{
> +	struct npu3 *npu = npu3_phb_to_npu(phb);
> +	uint64_t tts_encoded, val;
> +	uint32_t page_size;
> +
> +	/* Each PE has one corresponding TVE */
> +	if (window_id != pe_num || pe_num >= NPU3_MAX_PE_NUM)
> +		return OPAL_PARAMETER;
> +
> +	npu3_ioda_sel(npu, NPU3_ATS_IODA_ADDR_TBL_TVT, pe_num);
> +
> +	/* TCE table size zero is used to disable the TVE */
> +	if (!tce_table_size) {
> +		npu3_write(npu, NPU3_ATS_IODA_DATA, 0ull);
> +		return OPAL_SUCCESS;
> +	}
> +
> +	/* TCE table size */
> +	if (!is_pow2(tce_table_size) || tce_table_size < 0x1000)
> +		return OPAL_PARAMETER;
> +
> +	tts_encoded = ilog2(tce_table_size) - 11;
> +	if (tts_encoded > 39)
> +		return OPAL_PARAMETER;
> +
> +	val = SETFIELD(NPU3_ATS_IODA_TVT_TABLE_SIZE, 0ull, tts_encoded);
> +
> +	/* Number of levels */
> +	if (tce_levels < 1 || tce_levels > 4)
> +		return OPAL_PARAMETER;
> +
> +	val = SETFIELD(NPU3_ATS_IODA_TVT_TABLE_LEVEL, val, tce_levels - 1);
> +
> +	/* TCE page size */
> +	switch (tce_page_size) {
> +	case 256 << 20:
> +		page_size = 17;
> +		break;
> +	case 16 << 20:
> +		page_size = 13;
> +		break;
> +	case 64 << 10:
> +		page_size = 5;
> +		break;
> +	default:
> +		page_size = 1;
> +	}
> +
> +	val = SETFIELD(NPU3_ATS_IODA_TVT_PAGE_SIZE, val, page_size);
> +	val = SETFIELD(NPU3_ATS_IODA_TVT_XLAT_ADDR, val, tce_table_addr >> 12);
> +	npu3_write(npu, NPU3_ATS_IODA_DATA, val);
> +
> +	return OPAL_SUCCESS;
> +}
> +
> +static int64_t npu3_map_pe_dma_window_real(struct phb *phb,
> +					   uint64_t pe_num,
> +					   uint16_t window_id,
> +					   uint64_t pci_start_addr __unused,
> +					   uint64_t pci_mem_size __unused)
> +{
> +	struct npu3 *npu = npu3_phb_to_npu(phb);
> +	uint64_t val;
> +
> +	/* Each PE has one corresponding TVE */
> +	if (window_id != pe_num || pe_num >= NPU3_MAX_PE_NUM)
> +		return OPAL_PARAMETER;
> +
> +	if (pci_mem_size) {
> +		/*
> +		 * GPUs need to be able to access the MMIO memory space as well.
> +		 * On POWER9 this is above the top of RAM, so disable the TVT
> +		 * range check, allowing access to all memory addresses.
> +		 */
> +		val = 0;
> +	} else {
> +		/* Disable */
> +		val = PPC_BIT(51);
> +	}
> +
> +	npu3_ioda_sel(npu, NPU3_ATS_IODA_ADDR_TBL_TVT, pe_num);
> +	npu3_write(npu, NPU3_ATS_IODA_DATA, val);
> +
> +	return OPAL_SUCCESS;
> +}
> +
> +static int64_t npu3_next_error(struct phb *phb,
> +			       uint64_t *first_frozen_pe,
> +			       uint16_t *pci_error_type,
> +			       uint16_t *severity)
> +{
> +	struct npu3 *npu = npu3_phb_to_npu(phb);
> +	uint32_t pe_num;
> +
> +	if (!first_frozen_pe || !pci_error_type || !severity)
> +		return OPAL_PARAMETER;
> +
> +	*first_frozen_pe = -1;
> +	*pci_error_type = OPAL_EEH_NO_ERROR;
> +	*severity = OPAL_EEH_SEV_NO_ERROR;
> +
> +	for (pe_num = 0; pe_num < NPU3_MAX_PE_NUM; pe_num++) {
> +		if (!npu3_read(npu, NPU3_MISC_PESTB(pe_num)))
> +			continue;
> +
> +		*first_frozen_pe = pe_num;
> +		*pci_error_type = OPAL_EEH_PE_ERROR;
> +		*severity = OPAL_EEH_SEV_PE_ER;
> +		break;
> +	}
> +
> +	return OPAL_SUCCESS;
> +}
> +
> +static struct npu3_dev *npu3_bdfn_to_dev(struct npu3 *npu, uint32_t bdfn)
> +{
> +	struct pci_virt_device *pvd;
> +
> +	/* All emulated devices are attached to root bus */
> +	if (bdfn & ~0xff)
> +		return NULL;
> +
> +	pvd = pci_virt_find_device(&npu->nvlink.phb, bdfn);
> +	if (pvd)
> +		return pvd->data;
> +
> +	return NULL;
> +}
> +
> +static int npu3_match_gpu(struct phb *phb __unused, struct pci_device *pd,
> +			  void *data)
> +{
> +	const char *slot = data;
> +	struct dt_node *dn;
> +	char *loc_code;
> +
> +	/* Ignore non-NVIDIA devices */
> +	if ((pd->vdid & 0xffff) != 0x10de)



Could this use the PCI_VENDOR_ID() accessor instead of open-coding the
0xffff mask and the 0x10de NVIDIA constant?


> +		return 0;
> +
> +	/* Find the PCI device's slot location */
> +	for (dn = pd->dn;
> +	     dn && !dt_find_property(dn, "ibm,loc-code");
> +	     dn = dn->parent);
> +
> +	if (!dn)
> +		return 0;
> +
> +	loc_code = (char *)dt_prop_get(dn, "ibm,loc-code");
> +	if (streq(loc_code, slot))
> +		return 1;
> +
> +	return 0;
> +}
> +
> +static void npu3_dev_find_gpu(struct npu3_dev *dev)
> +{
> +	const char *slot = dev->nvlink.loc_code;
> +	struct phb *phb;
> +	struct pci_device *gpu;
> +
> +	if (!slot)
> +		return;
> +
> +	for_each_phb(phb) {
> +		gpu = pci_walk_dev(phb, NULL, npu3_match_gpu, (void *)slot);
> +		if (!gpu)
> +			continue;
> +
> +		dev->nvlink.gpu = gpu;
> +		return;
> +	}
> +
> +	NPU3DEVINF(dev, "No PCI device found for slot '%s'\n", slot);
> +}
> +
> +#define VENDOR_CAP_START		0x80
> +#define VENDOR_CAP_LINK_FLAG_OFFSET	0x0d
> +
> +void npu3_pvd_flag_set(struct npu3_dev *dev, uint8_t flag)
> +{
> +	uint32_t offset = VENDOR_CAP_START + VENDOR_CAP_LINK_FLAG_OFFSET;
> +	uint32_t flags;
> +
> +	PCI_VIRT_CFG_RDONLY_RD(dev->nvlink.pvd, offset, 1, &flags);
> +	flags |= flag;
> +	PCI_VIRT_CFG_INIT_RO(dev->nvlink.pvd, offset, 1, flags);
> +}
> +
> +void npu3_pvd_flag_clear(struct npu3_dev *dev, uint8_t flag)
> +{
> +	uint32_t offset = VENDOR_CAP_START + VENDOR_CAP_LINK_FLAG_OFFSET;
> +	uint32_t flags;
> +
> +	PCI_VIRT_CFG_RDONLY_RD(dev->nvlink.pvd, offset, 1, &flags);
> +	flags &= ~flag;
> +	PCI_VIRT_CFG_INIT_RO(dev->nvlink.pvd, offset, 1, flags);
> +}
> +
> +static struct lock npu3_phandle_lock = LOCK_UNLOCKED;
> +
> +static void npu3_append_phandle(struct dt_node *dn, const char *name,
> +				uint32_t phandle)
> +{
> +	struct dt_property *prop;
> +	uint32_t *phandles;
> +	size_t len;
> +
> +	prop = __dt_find_property(dn, name);
> +	if (!prop) {
> +		dt_add_property_cells(dn, name, phandle);
> +		return;
> +	}
> +
> +	/*
> +	 * Make sure no one else has a reference to the property. Assume
> +	 * this is the only function that holds a reference to it.
> +	 */
> +	lock(&npu3_phandle_lock);
> +
> +	/* Need to append to the property */
> +	len = prop->len + sizeof(*phandles);
> +	dt_resize_property(&prop, len);
> +	prop->len = len;
> +
> +	phandles = (uint32_t *)prop->prop;
> +	phandles[len / sizeof(*phandles) - 1] = phandle;
> +
> +	unlock(&npu3_phandle_lock);
> +}
> +
> +static void npu3_dev_fixup_dt(struct npu3_dev *dev)
> +{
> +	struct pci_device *pd = dev->nvlink.pd;
> +	struct pci_device *gpu = dev->nvlink.gpu;
> +
> +	dt_add_property_cells(pd->dn, "ibm,nvlink", dev->dn->phandle);
> +	dt_add_property_string(pd->dn, "ibm,loc-code", dev->nvlink.loc_code);
> +	if (dev->link_speed != 0xff)
> +		dt_add_property_cells(pd->dn, "ibm,nvlink-speed",
> +				      lo32(dev->link_speed));
> +
> +	if (!gpu)
> +		return;
> +
> +	npu3_append_phandle(gpu->dn, "ibm,npu", pd->dn->phandle);
> +	dt_add_property_cells(pd->dn, "ibm,gpu", gpu->dn->phandle);
> +}
> +
> +static int64_t npu3_gpu_bridge_sec_bus_reset(void *pdev,
> +				struct pci_cfg_reg_filter *pcrf __unused,
> +				uint32_t offset, uint32_t len,
> +				uint32_t *data, bool write)
> +{
> +	struct pci_device *pd = pdev;
> +	struct pci_device *gpu;
> +	struct npu3 *npu;
> +	struct npu3_dev *dev;
> +	bool purge = false;
> +
> +	if (!write)
> +		return OPAL_PARAMETER;
> +
> +	if (len != 2 || offset & 1) {
> +		PCIERR(pd->phb, pd->bdfn,
> +		       "Unsupported write to bridge control register\n");
> +		return OPAL_PARAMETER;
> +	}
> +
> +	if (!(*data & PCI_CFG_BRCTL_SECONDARY_RESET))
> +		return OPAL_PARTIAL;
> +
> +	gpu = list_top(&pd->children, struct pci_device, link);
> +	if (!gpu)
> +		return OPAL_PARTIAL;
> +
> +	npu3_for_each_nvlink_npu(npu)
> +		npu3_for_each_nvlink_dev(dev, npu)
> +			if (dev->nvlink.gpu == gpu)
> +				if (!npu3_dev_reset(dev))
> +					purge = true;
> +
> +	if (purge)
> +		purge_l2_l3_caches();
> +
> +	return OPAL_PARTIAL;
> +}
> +
> +static int npu3_dev_bind(struct phb *phb, struct pci_device *pd,
> +			 void *data __unused)
> +{
> +	struct npu3 *npu = npu3_phb_to_npu(phb);
> +	struct npu3_dev *dev = npu3_bdfn_to_dev(npu, pd->bdfn);
> +	struct pci_device *gpu;
> +
> +	dev->nvlink.pd = pd;
> +
> +	/* The slot label indicates which GPU this link is connected to */
> +	dev->nvlink.loc_code = dt_prop_get_def(dev->dn, "ibm,slot-label", NULL);
> +	if (!dev->nvlink.loc_code) {
> +		/**
> +		 * @fwts-label NPUNoPHBSlotLabel
> +		 * @fwts-advice No GPU/NPU slot information was found.
> +		 * NVLink3 functionality will not work.
> +		 */
> +		NPU3DEVERR(dev, "Cannot find GPU slot information\n");
> +	}
> +
> +	npu3_dev_find_gpu(dev);
> +	npu3_dev_fixup_dt(dev);
> +
> +	gpu = dev->nvlink.gpu;
> +	if (!gpu)
> +		return 0;
> +
> +	/* When a GPU is reset, ensure all of its links are reset too */
> +	if (gpu->parent && gpu->parent->slot)
> +		pci_add_cfg_reg_filter(gpu->parent, PCI_CFG_BRCTL, 2,
> +				       PCI_REG_FLAG_WRITE,
> +				       npu3_gpu_bridge_sec_bus_reset);
> +
> +	npu3_pvd_flag_set(dev, NPU3_DEV_PCI_LINKED);
> +
> +	return 0;
> +}
> +
> +struct npu3 *npu3_next_nvlink_npu(struct npu3 *npu, uint32_t chip_id)
> +{
> +	uint64_t phb_id = 0;
> +	struct phb *phb;
> +
> +	if (npu)
> +		phb_id = npu->nvlink.phb.opal_id + 1;
> +
> +	for (; (phb = __pci_next_phb_idx(&phb_id));) {
> +		if (phb->phb_type != phb_type_npu_v3)
> +			continue;
> +
> +		npu = npu3_phb_to_npu(phb);
> +		if (npu->chip_id == chip_id || chip_id == NPU3_ANY_CHIP)
> +			return npu;
> +	}
> +
> +	return NULL;
> +}
> +
> +static struct npu3 *npu3_last_npu(void)
> +{
> +	static struct npu3 *last = NULL;
> +	struct npu3 *npu;
> +
> +	if (last)
> +		return last;
> +
> +	npu3_for_each_nvlink_npu(npu)
> +		last = npu;
> +
> +	return last;
> +}
> +
> +static uint32_t npu3_gpu_links(struct pci_device *gpu)
> +{
> +	const struct dt_property *prop;
> +
> +	if (!gpu)
> +		return 0;
> +
> +	/* The link count is the number of phandles in "ibm,npu" */
> +	prop = dt_find_property(gpu->dn, "ibm,npu");
> +	if (!prop)
> +		return 0;
> +
> +	return prop->len / sizeof(uint32_t);
> +}
> +
> +static uint32_t npu3_links_per_gpu(void)
> +{
> +	static uint32_t links = -1;
> +	struct npu3 *npu;
> +	struct npu3_dev *dev;
> +
> +	/* Static value, same for all GPUs; only do this once */
> +	if (links != -1)
> +		return links;
> +
> +	/* Use the first GPU we find to figure this out */
> +	npu3_for_each_nvlink_npu(npu) {
> +		npu3_for_each_nvlink_dev(dev, npu) {
> +			links = npu3_gpu_links(dev->nvlink.gpu);
> +			if (links)
> +				goto out;
> +		}
> +	}
> +
> +out:
> +	prlog(PR_DEBUG, "NPU: %s: %d\n", __func__, links);
> +
> +	return links;


Can this possibly return links==0? It relies on "ibm,npu" properties
which may or may not have been created by this point. If they are
guaranteed to exist, then you do not need "static" for @links, as this
is only called from npu3_chip_possible_gpus() which has a static
@possible of its own.

Statics are bad.



> +}
> +
> +uint32_t npu3_dev_gpu_index(struct npu3_dev *dev)
> +{
> +	const char *slot;
> +	char *p = NULL;
> +	int ret;
> +
> +	slot = dev->nvlink.loc_code;
> +	if (!slot)
> +		return -1;
> +
> +	if (memcmp(slot, "GPU", 3))
> +		return -1;
> +
> +	ret = strtol(slot + 3, &p, 10);
> +	if (*p || p == slot + 3)
> +		return -1;
> +
> +	return ret;
> +}
> +
> +static uint32_t npu3_chip_possible_gpu_links(void)
> +{
> +	struct proc_chip *chip;
> +	struct npu3 *npu;
> +	struct npu3_dev *dev;
> +	static uint32_t possible = -1;


You do not need this static as this is only called from
npu3_chip_possible_gpus() which has its own static @possible.



> +
> +	/* Static value, same for all chips; only do this once */
> +	if (possible != -1)
> +		return possible;
> +
> +	possible = 0;
> +
> +	for_each_chip(chip) {
> +		npu3_for_each_chip_nvlink_npu(npu, chip->id)
> +			npu3_for_each_nvlink_dev(dev, npu)
> +				if (npu3_dev_gpu_index(dev) != -1)
> +					possible++;
> +
> +		if (possible)
> +			break;
> +	}
> +
> +	prlog(PR_DEBUG, "NPU: %s: %d\n", __func__, possible);
> +
> +	return possible;
> +}
> +
> +uint32_t npu3_chip_possible_gpus(void)
> +{
> +	static uint32_t possible = -1;
> +	uint32_t links_per_gpu;
> +
> +	/* Static value, same for all chips; only do this once */
> +	if (possible != -1)
> +		return possible;
> +
> +	possible = 0;
> +
> +	links_per_gpu = npu3_links_per_gpu();
> +	if (links_per_gpu)
> +		possible = npu3_chip_possible_gpu_links() / links_per_gpu;
> +
> +	prlog(PR_DEBUG, "NPU: %s: %d\n", __func__, possible);
> +
> +	return possible;
> +}
> +
> +static void npu3_dev_assign_gmb(struct npu3_dev *dev, uint64_t addr,
> +				uint64_t size)
> +{
> +	uint32_t mode;
> +	uint64_t val;
> +
> +	switch (npu3_gpu_links(dev->nvlink.gpu)) {
> +	case 0:
> +		return;
> +	case 1:
> +		mode = 0;
> +		break;
> +	case 2:
> +		mode = 1;
> +		break;
> +	case 3:
> +		mode = 3;
> +		break;
> +	case 4:
> +		mode = 6;
> +		break;
> +	case 6:
> +		mode = 10;
> +		break;
> +	default:
> +		/* Hardware does not support this configuration */
> +		assert(0);
> +	}
> +
> +	mode += dev->nvlink.pvd->bdfn & 0x7;
> +
> +	val = NPU3_GPU_MEM_BAR_ENABLE |
> +	      NPU3_GPU_MEM_BAR_POISON;
> +	val = SETFIELD(NPU3_GPU_MEM_BAR_ADDR, val, addr >> 30);
> +	val = SETFIELD(NPU3_GPU_MEM_BAR_SIZE, val, size >> 30);
> +	val = SETFIELD(NPU3_GPU_MEM_BAR_MODE, val, mode);
> +
> +	npu3_write(dev->npu, NPU3_GPU_MEM_BAR(dev->index), val);
> +}
> +
> +static struct dt_node *npu3_create_memory_dn(struct npu3_dev *dev,
> +					     uint32_t gpu_index, uint64_t addr,
> +					     uint64_t size)
> +{
> +	uint32_t nid = 255 - gpu_index;
> +	struct dt_node *mem;
> +
> +	mem = dt_find_by_name_addr(dt_root, "memory", addr);
> +	if (mem)
> +		return mem;
> +
> +	mem = dt_new_addr(dt_root, "memory", addr);
> +	assert(mem);
> +
> +	dt_add_property_string(mem, "device_type", "memory");
> +	dt_add_property_string(mem, "compatible", "ibm,coherent-device-memory");
> +	dt_add_property_u64s(mem, "reg", addr, size);
> +	dt_add_property_u64s(mem, "linux,usable-memory", addr, 0);
> +	dt_add_property_cells(mem, "ibm,chip-id", nid);
> +	dt_add_property_cells(mem, "ibm,associativity", 4, nid, nid, nid, nid);
> +
> +	NPU3INF(dev->npu, "%s mem: 0x%016llx (nid %d)\n", dev->nvlink.loc_code,
> +		addr, nid);
> +
> +	return mem;
> +}
> +
> +static void npu3_dev_init_gpu_mem(struct npu3_dev *dev)
> +{
> +	struct pci_device *pd = dev->nvlink.pd;
> +	struct npu3 *npu = dev->npu;
> +	struct dt_node *mem;
> +	uint64_t addr, size, gta;
> +	uint32_t gpu_index;
> +
> +	if (!dev->nvlink.gpu)
> +		return;
> +
> +	gpu_index = npu3_dev_gpu_index(dev) % npu3_chip_possible_gpus();
> +	phys_map_get(npu->chip_id, GPU_MEM_4T_DOWN, gpu_index, &addr, &size);
> +
> +	npu3_dev_assign_gmb(dev, addr, size);
> +	mem = npu3_create_memory_dn(dev, gpu_index, addr, size);
> +
> +	/*
> +	 * Coral mode address compression. This is documented in Figure 3.5 of
> +	 * the NPU workbook; "P9->GPU RA Compression (Coral)".
> +	 */
> +	gta  = (addr >> 42 & 0x1) << 42;
> +	gta |= (addr >> 45 & 0x3) << 43;
> +	gta |= (addr >> 49 & 0x3) << 45;
> +	gta |= addr & ((1ul << 43) - 1);
> +
> +	dt_add_property_cells(pd->dn, "memory-region", mem->phandle);
> +	dt_add_property_u64s(pd->dn, "ibm,device-tgt-addr", gta);
> +}
> +
> +static void npu3_final_fixup(void)
> +{
> +	struct npu3 *npu;
> +	struct npu3_dev *dev;
> +
> +	npu3_for_each_nvlink_npu(npu)
> +		npu3_for_each_nvlink_dev(dev, npu)
> +			npu3_dev_init_gpu_mem(dev);
> +}
> +
> +static void npu3_phb_final_fixup(struct phb *phb)
> +{
> +	struct npu3 *npu = npu3_phb_to_npu(phb);
> +
> +	pci_walk_dev(phb, NULL, npu3_dev_bind, NULL);
> +
> +	/* After every npu's devices are bound, do gpu-related fixup */
> +	if (npu == npu3_last_npu())


Why delay this until the last NPU is finally fixed up? What guarantees
the ordering? Could swift_npu3_fixup() not do this instead? Looks fragile.



> +		npu3_final_fixup();
> +}
> +
> +static int64_t npu3_set_pe(struct phb *phb,
> +			   uint64_t pe_num,
> +			   uint64_t bdfn,
> +			   uint8_t bcompare,
> +			   uint8_t dcompare,
> +			   uint8_t fcompare,
> +			   uint8_t action)
> +{
> +	struct npu3 *npu = npu3_phb_to_npu(phb);
> +	struct npu3_dev *dev;
> +	uint64_t val;
> +
> +	dev = npu3_bdfn_to_dev(npu, bdfn);
> +	if (!dev)
> +		return OPAL_PARAMETER;
> +
> +	if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE)
> +		return OPAL_PARAMETER;
> +
> +	if (pe_num >= NPU3_MAX_PE_NUM)
> +		return OPAL_PARAMETER;
> +
> +	if (bcompare != OpalPciBusAll ||
> +	    dcompare != OPAL_COMPARE_RID_DEVICE_NUMBER ||
> +	    fcompare != OPAL_COMPARE_RID_FUNCTION_NUMBER)
> +		return OPAL_UNSUPPORTED;
> +
> +	if (!dev->nvlink.gpu)
> +		return OPAL_SUCCESS;
> +
> +	val = NPU3_CTL_BDF2PE_CFG_ENABLE;
> +	val = SETFIELD(NPU3_CTL_BDF2PE_CFG_PE, val, pe_num);
> +	val = SETFIELD(NPU3_CTL_BDF2PE_CFG_BDF, val, dev->nvlink.gpu->bdfn);
> +	npu3_write(npu, NPU3_CTL_BDF2PE_CFG(pe_num), val);
> +
> +	val = NPU3_MISC_BDF2PE_CFG_ENABLE;
> +	val = SETFIELD(NPU3_MISC_BDF2PE_CFG_PE, val, pe_num);
> +	val = SETFIELD(NPU3_MISC_BDF2PE_CFG_BDF, val, dev->nvlink.gpu->bdfn);
> +	npu3_write(npu, NPU3_MISC_BDF2PE_CFG(pe_num), val);
> +
> +	return OPAL_SUCCESS;
> +}
> +
> +static int64_t npu3_tce_kill_pages(struct npu3 *npu,
> +				   uint64_t pe_num,
> +				   uint32_t tce_size,
> +				   uint64_t dma_addr,
> +				   uint32_t npages)
> +{
> +	uint32_t check_tce_size;
> +	uint64_t val;
> +
> +	if (pe_num >= NPU3_MAX_PE_NUM)
> +		return OPAL_PARAMETER;
> +
> +	npu3_ioda_sel(npu, NPU3_ATS_IODA_ADDR_TBL_TVT, pe_num);
> +	val = npu3_read(npu, NPU3_ATS_IODA_DATA);
> +
> +	check_tce_size = 0x800 << GETFIELD(NPU3_ATS_IODA_TVT_PAGE_SIZE, val);
> +	if (check_tce_size != tce_size) {
> +		NPU3ERR(npu, "%s: Unexpected TCE size (got 0x%x, expected 0x%x)\n",
> +			__func__, tce_size, check_tce_size);
> +
> +		return OPAL_PARAMETER;
> +	}
> +
> +	val = NPU3_ATS_TCE_KILL_ONE;
> +	val = SETFIELD(NPU3_ATS_TCE_KILL_PE_NUMBER, val, pe_num);
> +
> +	while (npages--) {
> +		val = SETFIELD(NPU3_ATS_TCE_KILL_ADDRESS, val, dma_addr >> 12);
> +		npu3_write(npu, NPU3_ATS_TCE_KILL, val);
> +
> +		dma_addr += tce_size;
> +	}
> +
> +	return OPAL_SUCCESS;
> +}
> +
> +static int64_t npu3_tce_kill(struct phb *phb,
> +			     uint32_t kill_type,
> +			     uint64_t pe_num,
> +			     uint32_t tce_size,
> +			     uint64_t dma_addr,
> +			     uint32_t npages)
> +{
> +	struct npu3 *npu = npu3_phb_to_npu(phb);
> +
> +	sync();
> +
> +	switch(kill_type) {
> +	case OPAL_PCI_TCE_KILL_PAGES:
> +		return npu3_tce_kill_pages(npu, pe_num, tce_size,
> +					   dma_addr, npages);
> +	case OPAL_PCI_TCE_KILL_PE:
> +		/*
> +		 * NPU doesn't support killing a PE so fall through
> +		 * and do a kill all instead.
> +		 */
> +	case OPAL_PCI_TCE_KILL_ALL:
> +		npu3_write(npu, NPU3_ATS_TCE_KILL, NPU3_ATS_TCE_KILL_ALL);
> +		return OPAL_SUCCESS;
> +	}
> +
> +	return OPAL_PARAMETER;
> +}
> +
> +static const struct phb_ops npu_ops = {
> +	.cfg_read8		= npu3_cfg_read8,
> +	.cfg_read16		= npu3_cfg_read16,
> +	.cfg_read32		= npu3_cfg_read32,
> +	.cfg_write8		= npu3_cfg_write8,
> +	.cfg_write16		= npu3_cfg_write16,
> +	.cfg_write32		= npu3_cfg_write32,
> +	.eeh_freeze_status	= npu3_eeh_freeze_status,
> +	.ioda_reset		= npu3_ioda_reset,
> +	.map_pe_dma_window	= npu3_map_pe_dma_window,
> +	.map_pe_dma_window_real	= npu3_map_pe_dma_window_real,
> +	.next_error		= npu3_next_error,
> +	.phb_final_fixup	= npu3_phb_final_fixup,
> +	.set_pe			= npu3_set_pe,
> +	.tce_kill		= npu3_tce_kill,
> +};
> +
> +static int64_t npu3_reset(struct pci_slot *slot)
> +{
> +	struct npu3 *npu = npu3_phb_to_npu(slot->phb);
> +	struct npu3_dev *dev;
> +	int64_t rc = OPAL_SUCCESS;
> +	bool purge = false;
> +
> +	npu3_for_each_nvlink_dev(dev, npu) {
> +		rc = npu3_dev_reset(dev);
> +		if (rc)
> +			break;
> +
> +		purge = true;
> +	}
> +
> +	/* No devices reset; don't purge, just return */
> +	if (!purge)
> +		return rc;
> +
> +	/* All devices reset */
> +	if (!rc)
> +		return purge_l2_l3_caches();
> +
> +	/* Some devices successfully reset; purge, but still return error */
> +	purge_l2_l3_caches();
> +	return rc;
> +}
> +
> +static int64_t npu3_freset(struct pci_slot *slot __unused)
> +{
> +	return OPAL_SUCCESS;
> +}
> +
> +static int64_t npu3_get_link_state(struct pci_slot *slot __unused,
> +				   uint8_t *val)
> +{
> +	*val = OPAL_SHPC_LINK_UP_x1;
> +	return OPAL_SUCCESS;
> +}
> +
> +static int64_t npu3_get_power_state(struct pci_slot *slot __unused,
> +				    uint8_t *val)
> +{
> +	*val = PCI_SLOT_POWER_ON;
> +	return OPAL_SUCCESS;
> +}
> +
> +static void npu3_create_phb_slot(struct npu3 *npu)
> +{
> +	struct pci_slot *slot;
> +
> +	slot = pci_slot_alloc(&npu->nvlink.phb, NULL);
> +	if (!slot) {
> +		/**
> +		 * @fwts-label NPUCannotCreatePHBSlot
> +		 * @fwts-advice Firmware probably ran out of memory creating
> +		 * NPU3 slot. NVLink functionality could be broken.
> +		 */
> +		NPU3ERR(npu, "Cannot create PHB slot\n");


No need for this one - pci_slot_alloc() already prints an error itself.


> +		return;
> +	}
> +
> +	/* Elementary functions */
> +	slot->ops.creset		= npu3_reset;
> +	slot->ops.freset		= npu3_freset;
> +	slot->ops.hreset		= npu3_reset;
> +	slot->ops.get_link_state	= npu3_get_link_state;
> +	slot->ops.get_power_state	= npu3_get_power_state;
> +}
> +
> +static void npu3_create_phb(struct npu3 *npu)
> +{
> +	struct phb *phb = &npu->nvlink.phb;
> +
> +	phb->phb_type = phb_type_npu_v3;
> +	phb->ops = &npu_ops;
> +	phb->dt_node = dt_new_addr(dt_root, "pciex", npu->regs[0]);
> +	assert(phb->dt_node);
> +
> +	list_head_init(&phb->virt_devices);
> +	pci_register_phb(phb, OPAL_DYNAMIC_PHB_ID);
> +	npu3_create_phb_slot(npu);
> +	npu3_ioda_reset(phb, true);
> +}
> +
> +static void npu3_dev_init_hw(struct npu3_dev *dev)
> +{
> +	struct npu3 *npu = dev->npu;
> +	uint64_t reg, val;
> +
> +	reg = NPU3_RELAXED_CFG2(dev->index);
> +	val = npu3_read(npu, reg);
> +	val |= NPU3_RELAXED_CFG2_CMD_CL_DMA_W |
> +	       NPU3_RELAXED_CFG2_CMD_CL_DMA_W_HP |
> +	       NPU3_RELAXED_CFG2_CMD_CL_DMA_INJ |
> +	       NPU3_RELAXED_CFG2_CMD_PR_DMA_INJ |
> +	       NPU3_RELAXED_CFG2_CMD_DMA_PR_W |
> +	       NPU3_RELAXED_CFG2_CMD_CL_RD_NC_F0 |
> +	       NPU3_RELAXED_CFG2_SRC_RDENA(0);
> +	npu3_write(npu, reg, val);
> +
> +	reg = NPU3_NTL_MISC_CFG2(dev->index);
> +	val = npu3_read(npu, reg);
> +	val |= NPU3_NTL_MISC_CFG2_BRICK_ENABLE |
> +	       NPU3_NTL_MISC_CFG2_NDL_TX_PARITY_ENA |
> +	       NPU3_NTL_MISC_CFG2_NDL_PRI_PARITY_ENA |
> +	       NPU3_NTL_MISC_CFG2_RCV_CREDIT_OVERFLOW_ENA;
> +	npu3_write(npu, reg, val);
> +}
> +
> +static void npu3_init_hw(struct npu3 *npu)
> +{
> +	struct npu3_dev *dev;
> +	uint64_t reg, val;
> +
> +	reg = NPU3_XTS_CFG;
> +	val = npu3_read(npu, reg);
> +	val |= NPU3_XTS_CFG_MMIOSD | NPU3_XTS_CFG_TRY_ATR_RO;
> +	npu3_write(npu, reg, val);
> +
> +	reg = NPU3_XTS_CFG2;
> +	val = npu3_read(npu, reg);

Either the value npu3_read() stores into @val is never used here (a dead
read) or...


> +	val = NPU3_XTS_CFG2_NO_FLUSH_ENA;

... an OR with the value just read is missing here (the NPU2 equivalent
does "val |= NPU2_XTS_CFG2_NO_FLUSH_ENA").



> +	npu3_write(npu, reg, val);
> +
> +	reg = NPU3_RELAXED_SRC(0);
> +	val = NPU3_RELAXED_SRC_MASK_NPU;
> +	npu3_write(npu, reg, val);
> +
> +	npu3_for_each_nvlink_dev(dev, npu)
> +		npu3_dev_init_hw(dev);
> +}
> +
> +/* PCI command register (BAR enable/disable) */
> +static int64_t npu3_cfg_cmd(void *pvd,
> +			    struct pci_cfg_reg_filter *pcrf __unused,
> +			    uint32_t offset, uint32_t size,
> +			    uint32_t *data, bool write)
> +{
> +	struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data;
> +
> +	if (!write)
> +		return OPAL_PARTIAL;
> +
> +	if (offset != PCI_CFG_CMD)
> +		return OPAL_PARAMETER;
> +
> +	if (size != 1 && size != 2 && size != 4)
> +		return OPAL_PARAMETER;
> +
> +	npu3_dev_enable_bars(dev, !!(*data & PCI_CFG_CMD_MEM_EN));
> +
> +	return OPAL_PARTIAL;
> +}
> +
> +static int64_t npu3_cfg_bar_write(struct npu3_bar *bar, uint64_t mask,
> +				  uint32_t data)
> +{
> +	if (data != 0xffffffff)
> +		return OPAL_HARDWARE;
> +
> +	/* Return BAR size on next read */
> +	bar->trap |= mask;
> +
> +	return OPAL_SUCCESS;
> +}
> +
> +static int64_t npu3_cfg_bar_read(struct npu3_bar *bar, uint64_t mask,
> +				 uint32_t *data)
> +{
> +	if (!(bar->trap & mask))
> +		return OPAL_PARTIAL;
> +
> +	*data = GETFIELD(mask, bar->size);
> +	bar->trap &= ~mask;
> +
> +	return OPAL_SUCCESS;
> +}
> +
> +/* PCI BAR registers (NTL/GENID) */
> +static int64_t npu3_cfg_bar(void *pvd __unused,
> +			    struct pci_cfg_reg_filter *pcrf,
> +			    uint32_t offset, uint32_t size, uint32_t *data,
> +			    bool write)
> +{
> +	struct npu3_bar *bar = (struct npu3_bar *)pcrf->data;
> +	uint64_t mask;
> +
> +	if (size != 4)
> +		return OPAL_PARAMETER;
> +
> +	if (offset == pcrf->start)
> +		mask = 0xffffffff;
> +	else if (offset == pcrf->start + 4)
> +		mask = 0xffffffffull << 32;
> +	else
> +		return OPAL_PARAMETER;
> +
> +	if (write)
> +		return npu3_cfg_bar_write(bar, mask, *data);
> +
> +	return npu3_cfg_bar_read(bar, mask, data);
> +}
> +
> +/* PCI control register */
> +static int64_t npu3_cfg_devctl(void *pvd,
> +			       struct pci_cfg_reg_filter *pcrf __unused,
> +			       uint32_t offset, uint32_t size,
> +			       uint32_t *data, bool write)
> +{
> +	struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data;
> +
> +	if (!write)
> +		return OPAL_HARDWARE;
> +
> +	if (size != 2 || offset & 1) {
> +		NPU3DEVERR(dev, "Unsupported write to pcie control register\n");
> +		return OPAL_PARAMETER;
> +	}
> +
> +	if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET)
> +		if (!npu3_dev_reset(dev))
> +			purge_l2_l3_caches();
> +
> +	return OPAL_PARTIAL;
> +}
> +
> +static uint32_t npu3_cfg_populate_pcie_cap(struct npu3_dev *dev, uint32_t start,
> +					   uint32_t prev_cap)
> +{
> +	struct pci_virt_device *pvd = dev->nvlink.pvd;
> +	uint32_t val;
> +
> +	/* Add capability list */
> +	PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
> +	PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_EXP);
> +
> +	/* 0x00 - ID/PCIE capability */
> +	val = PCI_CFG_CAP_ID_EXP;
> +	val |= 0x2 << 16 | PCIE_TYPE_ENDPOINT << 20;
> +	PCI_VIRT_CFG_INIT_RO(pvd, start, 4, val);
> +
> +	/* 0x04 - Device capability */
> +	val = PCIE_MPSS_128 |
> +	      PCIE_PHANTOM_NONE << 3 |
> +	      PCIE_L0SL_MAX_NO_LIMIT << 6 |
> +	      PCIE_L1L_MAX_NO_LIMIT << 9 |
> +	      PCICAP_EXP_DEVCAP_FUNC_RESET;
> +	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_DEVCAP, 4, val);
> +
> +	pci_virt_add_filter(pvd, start + PCICAP_EXP_DEVCTL, 2,
> +			    PCI_REG_FLAG_WRITE,
> +			    npu3_cfg_devctl, NULL);
> +
> +	/* 0x08 - Device control and status */
> +	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DEVCTL, 4, 0x00002810,
> +			  0xffff0000, 0x000f0000);
> +
> +	/* 0x0c - Link capability */
> +	val = PCIE_LSPEED_VECBIT_2 | PCIE_LWIDTH_1X << 4;
> +	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP, 4, val);
> +
> +	/* 0x10 - Link control and status */
> +	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL, 4, 0x00130000,
> +			  0xfffff000, 0xc0000000);
> +
> +	/* 0x14 - Slot capability */
> +	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCAP, 4, 0x00000000);
> +
> +	/* 0x18 - Slot control and status */
> +	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCTL, 4, 0x00000000);
> +
> +	/* 0x1c - Root control and capability */
> +	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RC, 4, 0x00000000,
> +			  0xffffffe0, 0x00000000);
> +
> +	/* 0x20 - Root status */
> +	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RSTAT, 4, 0x00000000,
> +			  0xffffffff, 0x00010000);
> +
> +	/* 0x24 - Device capability 2 */
> +	PCI_VIRT_CFG_INIT_RO(pvd, start + PCIECAP_EXP_DCAP2, 4, 0x00000000);
> +
> +	/* 0x28 - Device Control and status 2 */
> +	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DCTL2, 4, 0x00070000,
> +			  0xffff0000, 0x00000000);
> +
> +	/* 0x2c - Link capability 2 */
> +	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP2, 4, 0x00000007);
> +
> +	/* 0x30 - Link control and status 2 */
> +	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL2, 4, 0x00000003,
> +			  0xffff0000, 0x00200000);
> +
> +	/* 0x34 - Slot capability 2 */
> +	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCAP2, 4, 0x00000000);
> +
> +	/* 0x38 - Slot control and status 2 */
> +	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCTL2, 4, 0x00000000);
> +
> +	return start + PCICAP_EXP_SCTL2 + 8;
> +}
> +
> +static int64_t npu3_dev_procedure_write(struct npu3_dev *dev, uint32_t offset,
> +					uint32_t data)
> +{
> +	switch (offset) {
> +	case 0:
> +		NPU3DEVINF(dev, "Ignoring write to status register\n");
> +		break;
> +	case 4:
> +		npu3_dev_procedure_init(dev, data);
> +		break;
> +	default:
> +		return OPAL_PARAMETER;
> +	}
> +
> +	return OPAL_SUCCESS;
> +}
> +
> +static int64_t npu3_dev_procedure_read(struct npu3_dev *dev, uint32_t offset,
> +				       uint32_t *data)
> +{
> +	switch (offset) {
> +	case 0:
> +		*data = npu3_dev_procedure_status(dev);
> +		break;
> +	case 4:
> +		*data = dev->proc.number;
> +		break;
> +	default:
> +		*data = 0;
> +		return OPAL_PARAMETER;
> +	}
> +
> +	return OPAL_SUCCESS;
> +}
> +
> +/* Hardware procedure control/status registers */
> +static int64_t npu3_dev_procedure(void *pvd, struct pci_cfg_reg_filter *pcrf,
> +				  uint32_t offset, uint32_t size,
> +				  uint32_t *data, bool write)
> +{
> +	struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data;
> +
> +	if (size != 4)
> +		return OPAL_PARAMETER;
> +
> +	offset -= pcrf->start;
> +
> +	if (write)
> +		return npu3_dev_procedure_write(dev, offset, *data);
> +
> +	return npu3_dev_procedure_read(dev, offset, data);
> +}
> +
> +/* PPE SRAM access is indirect via CSAR/CSDR */
> +static void npu3_dev_ppe_sram_sel(struct npu3_dev *dev, uint32_t reg)
> +{
> +	uint64_t val;
> +
> +	val = SETFIELD(OB_PPE_CSAR_SRAM_ADDR, 0ull, reg);
> +	xscom_write(dev->npu->chip_id, OB_PPE_CSAR(dev->ob_chiplet), val);
> +}
> +
> +static void npu3_dev_ppe_sram_write(struct npu3_dev *dev, uint32_t reg,
> +				    uint64_t val)
> +{
> +	npu3_dev_ppe_sram_sel(dev, reg);
> +	xscom_write(dev->npu->chip_id, OB_PPE_CSDR(dev->ob_chiplet), val);
> +}
> +
> +static uint64_t npu3_dev_ppe_sram_read(struct npu3_dev *dev, uint32_t reg)
> +{
> +	uint64_t val;
> +
> +	npu3_dev_ppe_sram_sel(dev, reg);
> +	xscom_read(dev->npu->chip_id, OB_PPE_CSDR(dev->ob_chiplet), &val);
> +
> +	return val;
> +}
> +
> +/* Software-initiated autonomous link training (SALT) */
> +static int64_t npu3_dev_salt(void *pvd, struct pci_cfg_reg_filter *pcrf,
> +			     uint32_t offset, uint32_t size, uint32_t *data,
> +			     bool write)
> +{
> +	struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data;
> +	unsigned long timeout;
> +	uint32_t cmd_reg;
> +	uint64_t val;
> +
> +	if (size != 4 || offset != pcrf->start)
> +		return OPAL_PARAMETER;
> +
> +	/* The config register before this one holds CMD_REG */
> +	pci_virt_cfg_read_raw(pvd, PCI_VIRT_CFG_NORMAL, pcrf->start - 4,
> +			      4, &cmd_reg);
> +
> +	/* Check for another command in progress */
> +	val = npu3_dev_ppe_sram_read(dev, OB_PPE_SALT_CMD);
> +	if (GETFIELD(OB_PPE_SALT_CMD_READY, val))
> +		return OPAL_BUSY;
> +
> +	val = OB_PPE_SALT_CMD_READY;
> +	val = SETFIELD(OB_PPE_SALT_CMD_RW, val, write);
> +	val = SETFIELD(OB_PPE_SALT_CMD_LINKNUM, val, npu3_chip_dev_index(dev));
> +	val = SETFIELD(OB_PPE_SALT_CMD_REG, val, cmd_reg);
> +	if (write)
> +		val = SETFIELD(OB_PPE_SALT_CMD_DATA, val, *data);
> +
> +	npu3_dev_ppe_sram_write(dev, OB_PPE_SALT_CMD, val);
> +
> +	/* Wait for the go bit to clear */
> +	timeout = mftb() + msecs_to_tb(1000);
> +
> +	while (GETFIELD(OB_PPE_SALT_CMD_READY, val)) {
> +		if (tb_compare(mftb(), timeout) == TB_AAFTERB) {
> +			NPU3DEVINF(dev, "SALT_CMD 0x%x: timeout\n", cmd_reg);
> +			return OPAL_BUSY;
> +		}
> +
> +		val = npu3_dev_ppe_sram_read(dev, OB_PPE_SALT_CMD);
> +	}
> +
> +	if (GETFIELD(OB_PPE_SALT_CMD_ERR, val))
> +		NPU3DEVINF(dev, "SALT_CMD 0x%x: error\n", cmd_reg);
> +
> +	if (!write)
> +		*data = GETFIELD(OB_PPE_SALT_CMD_DATA, val);
> +
> +	return OPAL_SUCCESS;
> +}
> +
> +#define VENDOR_CAP_LEN		0x1c
> +#define VENDOR_CAP_VERSION	0x02
> +
> +static uint32_t npu3_cfg_populate_vendor_cap(struct npu3_dev *dev,
> +					     uint32_t start, uint32_t prev_cap)
> +{
> +	struct pci_virt_device *pvd = dev->nvlink.pvd;
> +
> +	/* Capabilities list */
> +	PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
> +	PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_VENDOR);
> +
> +	/* Length and version */
> +	PCI_VIRT_CFG_INIT_RO(pvd, start + 2, 1, VENDOR_CAP_LEN);
> +	PCI_VIRT_CFG_INIT_RO(pvd, start + 3, 1, VENDOR_CAP_VERSION);
> +
> +	/*
> +	 * Defaults when the trap can't handle the read/write (eg. due to
> +	 * reading/writing less than 4 bytes).
> +	 */
> +	PCI_VIRT_CFG_INIT_RO(pvd, start + 4, 4, 0);
> +	PCI_VIRT_CFG_INIT_RO(pvd, start + 8, 4, 0);
> +
> +	/* PHY procedure trap */
> +	pci_virt_add_filter(pvd, start + 4, 8,
> +			    PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
> +			    npu3_dev_procedure, NULL);
> +
> +	/* Link index */
> +	PCI_VIRT_CFG_INIT_RO(pvd, start + 0xc, 1, npu3_chip_dev_index(dev));
> +
> +	/* SALT registers */
> +	PCI_VIRT_CFG_INIT_RO(pvd, start + 0x10, 4, 0);
> +	PCI_VIRT_CFG_INIT_RO(pvd, start + 0x14, 4, 0);
> +
> +	pci_virt_add_filter(pvd, start + 0x14, 4,
> +			    PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
> +			    npu3_dev_salt, NULL);
> +
> +	return start + VENDOR_CAP_LEN;
> +}
> +
> +static void npu3_cfg_populate(struct npu3_dev *dev)
> +{
> +	struct pci_virt_device *pvd = dev->nvlink.pvd;
> +	uint64_t addr;
> +	uint32_t pos;
> +
> +	/* 0x00 - Vendor/Device ID */
> +	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_VENDOR_ID, 4, 0x04ea1014);
> +
> +	/* 0x04 - Command/Status */
> +	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_CMD, 4, 0x00100000, 0xffb802b8,
> +			  0xf9000000);
> +
> +	pci_virt_add_filter(pvd, PCI_CFG_CMD, 1, PCI_REG_FLAG_WRITE,
> +			    npu3_cfg_cmd, NULL);
> +
> +	/* 0x08 - Rev/Class/Cache */
> +	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_REV_ID, 4, 0x06800102);
> +
> +	/* 0x0c - CLS/Latency Timer/Header/BIST */
> +	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CACHE_LINE_SIZE, 4, 0x00800000);
> +
> +	/* 0x10/14 - NTL BAR */
> +	addr = SETFIELD(0xf, dev->ntl_bar.addr,
> +			PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64);
> +	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR0, 4, lo32(addr), 0xf, 0);
> +	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR1, 4, hi32(addr), 0, 0);
> +
> +	pci_virt_add_filter(pvd, PCI_CFG_BAR0, 8,
> +			    PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
> +			    npu3_cfg_bar, &dev->ntl_bar);
> +
> +	/* 0x18/1c - GENID BAR */
> +	addr = SETFIELD(0xf, dev->genid_bar.addr,
> +			PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64);
> +	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR2, 4, lo32(addr), 0xf, 0);
> +	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR3, 4, hi32(addr), 0, 0);
> +
> +	pci_virt_add_filter(pvd, PCI_CFG_BAR2, 8,
> +			    PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
> +			    npu3_cfg_bar, &dev->genid_bar);
> +
> +	/* 0x20/0x24 - BARs, disabled */
> +	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR4, 4, 0x00000000);
> +	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR5, 4, 0x00000000);
> +
> +	/* 0x28 - Cardbus CIS pointer */
> +	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CARDBUS_CIS, 4, 0x00000000);
> +
> +	/* 0x2c - Subsystem ID */
> +	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000);
> +
> +	/* 0x30 - ROM BAR, zero sized */
> +	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_ROMBAR, 4, 0xffffffff);
> +
> +	/* 0x34 - PCI Capability */
> +	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CAP, 4, 0x00000000);
> +
> +	/* 0x38 - Reserved */
> +	PCI_VIRT_CFG_INIT_RO(pvd, 0x38, 4, 0x00000000);
> +
> +	/* 0x3c - INT line/pin/Minimal grant/Maximal latency */
> +	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000100); /* INT A */
> +
> +	/* PCIE and vendor specific capability */
> +	pos = npu3_cfg_populate_pcie_cap(dev, 0x40, PCI_CFG_CAP);
> +	pos = npu3_cfg_populate_vendor_cap(dev, pos, 0x41);
> +	PCI_VIRT_CFG_INIT_RO(pvd, pos + 1, 1, 0);
> +}
> +
> +static void npu3_dev_create_pvd(struct npu3_dev *dev)
> +{
> +	struct npu3 *npu = dev->npu;
> +	struct phb *phb = &npu->nvlink.phb;
> +
> +	dev->nvlink.pvd = pci_virt_add_device(phb, dev->index, 0x100, dev);
> +	if (!dev->nvlink.pvd)
> +		return;
> +
> +	phb->scan_map |= 0x1 << GETFIELD(0xf8, dev->nvlink.pvd->bdfn);
> +	npu3_cfg_populate(dev);
> +}
> +
> +static void npu3_dt_add_mmio_window(struct npu3 *npu)
> +{
> +	struct dt_node *dn = npu->nvlink.phb.dt_node;
> +	uint32_t ntl0_index = npu->index * NPU3_LINKS_PER_NPU;
> +	uint64_t addr, size, win[2];
> +
> +	/* Device MMIO window (NTL/GENID regs only) */
> +	phys_map_get(npu->chip_id, NPU_NTL, ntl0_index, &win[0], NULL);
> +	phys_map_get(npu->chip_id, NPU_GENID, npu->index, &addr, &size);
> +	win[1] = addr + size - win[0];
> +
> +	dt_add_property(dn, "ibm,mmio-window", win, sizeof(win));
> +	dt_add_property_cells(dn, "ranges", 0x02000000,
> +			      hi32(win[0]), lo32(win[0]),
> +			      hi32(win[0]), lo32(win[0]),
> +			      hi32(win[1]), lo32(win[1]));
> +}
> +
> +/* NDL No-Stall Event level */
> +static uint32_t npu3_dev_interrupt_level(struct npu3_dev *dev)
> +{
> +	const uint32_t level[12] = {  1,  3,  5,  7,  9, 11,
> +				     43, 45, 47, 49, 51, 53 };
> +
> +	return level[npu3_chip_dev_index(dev)];
> +}
> +
> +static void npu3_dt_add_interrupts(struct npu3 *npu)
> +{
> +	struct dt_node *dn = npu->nvlink.phb.dt_node;
> +	uint32_t *map, icsp, i = 0;
> +	struct npu3_dev *dev;
> +	size_t map_size = 0;
> +
> +	npu3_for_each_nvlink_dev(dev, npu)
> +		map_size += sizeof(*map) * 7;
> +
> +	if (!map_size)
> +		return;
> +
> +	icsp = get_ics_phandle();
> +	map = zalloc(map_size);
> +	assert(map);
> +
> +	npu3_for_each_nvlink_dev(dev, npu) {
> +		map[i] = dev->nvlink.pvd->bdfn << 8;
> +		map[i + 3] = 1;		/* INT A */
> +		map[i + 4] = icsp;	/* interrupt-parent */
> +		map[i + 5] = npu->irq_base + npu3_dev_interrupt_level(dev);
> +		map[i + 6] = 0;		/* 0 = EDGE, 1 = LEVEL */
> +		i += 7;
> +	}
> +
> +	dt_add_property_cells(dn, "interrupt-parent", icsp);
> +	dt_add_property(dn, "interrupt-map", map, map_size);
> +	dt_add_property_cells(dn, "interrupt-map-mask", 0xff00, 0x0, 0x0, 0x7);
> +
> +	free(map);
> +}
> +
> +/* Populate PCI root device node */
> +static void npu3_dt_add_props(struct npu3 *npu)
> +{
> +	struct dt_node *dn = npu->nvlink.phb.dt_node;
> +
> +	dt_add_property_cells(dn, "#address-cells", 3);
> +	dt_add_property_cells(dn, "#size-cells", 2);
> +	dt_add_property_cells(dn, "#interrupt-cells", 1);
> +	dt_add_property_cells(dn, "bus-range", 0, 0xff);
> +	dt_add_property_cells(dn, "clock-frequency", 0x200, 0);
> +
> +	dt_add_property_strings(dn, "device_type", "pciex");
> +	/* To the OS, npu2 and npu3 are both ibm,ioda2-npu2-phb */
> +	dt_add_property_strings(dn, "compatible",
> +				"ibm,power9-npu-pciex",
> +				"ibm,ioda2-npu2-phb");
> +
> +	dt_add_property_cells(dn, "ibm,phb-index",
> +			      dt_prop_get_u32(npu->dt_node, "ibm,phb-index"));
> +	dt_add_property_cells(dn, "ibm,phb-diag-data-size", 0);
> +	dt_add_property_cells(dn, "ibm,opal-num-pes", NPU3_MAX_PE_NUM);
> +	dt_add_property_cells(dn, "ibm,opal-reserved-pe", NPU3_RESERVED_PE_NUM);
> +	dt_add_property_cells(dn, "ibm,supported-tce-sizes",
> +			      12, /* 4K */
> +			      16, /* 64K */
> +			      24, /* 16M */
> +			      28); /* 256M */



Still only these 4 TCE page sizes (4K/64K/16M/256M) advertised in "ibm,supported-tce-sizes" — no 2MB/1GB entries?



> +
> +	dt_add_property_cells(dn, "ibm,chip-id", npu->chip_id);
> +	dt_add_property_cells(dn, "ibm,npu-index", npu->index);
> +	dt_add_property_cells(dn, "ibm,npcq", npu->dt_node->phandle);
> +	dt_add_property_cells(dn, "ibm,xscom-base", npu->xscom_base);
> +	dt_add_property_cells(dn, "ibm,links", NPU3_LINKS_PER_NPU);
> +
> +	dt_add_property(dn, "reg", npu->regs, sizeof(npu->regs));
> +	dt_add_property_u64s(dn, "ibm,mmio-atsd",
> +			     npu->regs[0] + NPU3_XTS_ATSD_LAUNCH(0),
> +			     npu->regs[0] + NPU3_XTS_ATSD_LAUNCH(1),
> +			     npu->regs[0] + NPU3_XTS_ATSD_LAUNCH(2),
> +			     npu->regs[0] + NPU3_XTS_ATSD_LAUNCH(3),
> +			     npu->regs[0] + NPU3_XTS_ATSD_LAUNCH(4),
> +			     npu->regs[0] + NPU3_XTS_ATSD_LAUNCH(5),
> +			     npu->regs[0] + NPU3_XTS_ATSD_LAUNCH(6),
> +			     npu->regs[0] + NPU3_XTS_ATSD_LAUNCH(7));
> +
> +	npu3_dt_add_mmio_window(npu);
> +	npu3_dt_add_interrupts(npu);
> +}
> +
> +void npu3_init_nvlink(struct npu3 *npu)
> +{
> +	struct npu3_dev *dev;
> +
> +	if (!npu3_next_dev(npu, NULL, NPU3_DEV_TYPE_NVLINK))
> +		return;
> +
> +	npu3_init_hw(npu);
> +	npu3_create_phb(npu);
> +
> +	npu3_for_each_nvlink_dev(dev, npu)
> +		npu3_dev_create_pvd(dev);
> +
> +	npu3_dt_add_props(npu);
> +
> +	/* TODO: Sort out if/why we still can't enable this */
> +	disable_fast_reboot("NVLink device enabled");
> +}
> +
> +static int64_t npu3_init_context_pid(struct npu3 *npu, uint32_t index,
> +				     uint64_t msr)
> +{
> +	uint64_t map, old_map;
> +
> +	/* Unfiltered XTS mode; index is lparshort */
> +	map = SETFIELD(NPU3_XTS_PID_MAP_LPARSHORT, 0ull, index);
> +
> +	/* Enable this mapping for both real and virtual addresses */
> +	map |= NPU3_XTS_PID_MAP_VALID_ATRGPA0 | NPU3_XTS_PID_MAP_VALID_ATRGPA1;
> +
> +	/* Enable TLBIE/MMIOSD forwarding for this entry */
> +	map |= NPU3_XTS_PID_MAP_VALID_ATSD;
> +
> +	/* Set the relevant MSR bits */
> +	if (msr & MSR_DR)
> +		map |= NPU3_XTS_PID_MAP_MSR_DR;
> +
> +	if (msr & MSR_HV)
> +		map |= NPU3_XTS_PID_MAP_MSR_HV;
> +
> +	if (msr & MSR_PR)
> +		map |= NPU3_XTS_PID_MAP_MSR_PR;
> +
> +	/* We don't support anything other than 64-bit so hardcode it here */
> +	map |= NPU3_XTS_PID_MAP_MSR_SF;
> +
> +	old_map = npu3_read(npu, NPU3_XTS_PID_MAP(index));
> +
> +	/* Error out if this entry is already set with different msr bits */
> +	if (old_map && GETFIELD(NPU3_XTS_PID_MAP_MSR, old_map) !=
> +		       GETFIELD(NPU3_XTS_PID_MAP_MSR, map)) {
> +		NPU3ERR(npu, "%s: Unexpected MSR value\n", __func__);
> +		return OPAL_PARAMETER;
> +	}
> +
> +	if (!old_map) {
> +		NPU3DBG(npu, "XTS_PID_MAP[%03d] = 0x%08llx\n", index, map);
> +		npu3_write(npu, NPU3_XTS_PID_MAP(index), map);
> +	}
> +
> +	npu->nvlink.context_refcount[index]++;
> +
> +	return OPAL_SUCCESS;
> +}
> +
> +#define NPU3_VALID_ATS_MSR_BITS (MSR_DR | MSR_HV | MSR_PR | MSR_SF)
> +
> +/*
> + * Allocate a context ID and initialize the tables with the relevant
> + * information. Returns the ID or error if one couldn't be allocated.
> + */
> +int64_t npu3_init_context(struct phb *phb, uint64_t msr, uint64_t bdf)
> +{
> +	struct npu3 *npu = npu3_phb_to_npu(phb);
> +	uint32_t lparshort, i;
> +	uint64_t map;
> +	int64_t rc;
> +
> +	/*
> +	 * MSR bits should be masked by the caller to allow for future
> +	 * expansion if required.
> +	 */
> +	if (msr & ~NPU3_VALID_ATS_MSR_BITS)
> +		return OPAL_UNSUPPORTED;
> +
> +	lock(&npu->lock);
> +
> +	for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++) {
> +		map = npu3_read(npu, NPU3_XTS_BDF_MAP(i));
> +
> +		if (map && GETFIELD(NPU3_XTS_BDF_MAP_BDF, map) == bdf)
> +			break;
> +	}
> +
> +	if (i == NPU3_XTS_BDF_MAP_MAX) {
> +		NPU3ERR(npu, "LPARID not associated with any GPU\n");
> +		rc = OPAL_PARAMETER;
> +		goto out;
> +	}
> +
> +	lparshort = GETFIELD(NPU3_XTS_BDF_MAP_LPARSHORT, map);
> +	NPU3DBG(npu, "Found LPARSHORT 0x%x for bdf %02llx:%02llx.%llx\n",
> +		lparshort, bdf >> 8 & 0xff, bdf >> 3 & 0x1f, bdf & 0x7);
> +
> +	rc = npu3_init_context_pid(npu, lparshort, msr);
> +	if (rc)
> +		goto out;
> +
> +	if (!(map & NPU3_XTS_BDF_MAP_VALID)) {
> +		map |= NPU3_XTS_BDF_MAP_VALID;
> +		npu3_write(npu, NPU3_XTS_BDF_MAP(i), map);
> +	}
> +
> +	rc = lparshort;
> +
> +out:
> +	unlock(&npu->lock);
> +	return rc;
> +}
> +
> +static int64_t npu3_destroy_context_pid(struct npu3 *npu, uint32_t index)
> +{
> +	if (!npu->nvlink.context_refcount[index])
> +		return OPAL_PARAMETER;
> +
> +	/* Only destroy when refcount hits 0 */
> +	if (--npu->nvlink.context_refcount[index])
> +		return OPAL_PARTIAL;
> +
> +	NPU3DBG(npu, "XTS_PID_MAP[%03d] = 0 (destroy)\n", index);
> +	npu3_write(npu, NPU3_XTS_PID_MAP(index), 0ull);
> +
> +	return OPAL_SUCCESS;
> +}
> +
> +int64_t npu3_destroy_context(struct phb *phb, uint64_t bdf)
> +{
> +	struct npu3 *npu = npu3_phb_to_npu(phb);
> +	uint32_t lparshort, i;
> +	int64_t map, rc;
> +
> +	lock(&npu->lock);
> +
> +	for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++) {
> +		map = npu3_read(npu, NPU3_XTS_BDF_MAP(i));
> +
> +		if (map && GETFIELD(NPU3_XTS_BDF_MAP_BDF, map) == bdf)
> +			break;
> +	}
> +
> +	if (i == NPU3_XTS_BDF_MAP_MAX) {
> +		NPU3ERR(npu, "LPARID not associated with any GPU\n");
> +		rc = OPAL_PARAMETER;
> +		goto out;
> +	}
> +
> +	lparshort = GETFIELD(NPU3_XTS_BDF_MAP_LPARSHORT, map);
> +	rc = npu3_destroy_context_pid(npu, lparshort);
> +
> +out:
> +	unlock(&npu->lock);
> +	return rc;
> +}
> +
> +/* Map the given virtual bdf to lparid with given lpcr */
> +int64_t npu3_map_lpar(struct phb *phb, uint64_t bdf, uint64_t lparid,
> +		      uint64_t lpcr)
> +{
> +	struct npu3 *npu = npu3_phb_to_npu(phb);
> +	struct npu3_dev *dev;
> +	int64_t rc = OPAL_SUCCESS;
> +	uint64_t map, val;
> +	uint32_t i;
> +
> +	/*
> +	 * The LPCR bits are only required for hash based ATS, which we don't
> +	 * currently support, but may need to in the future.
> +	 */
> +	if (lpcr)
> +		return OPAL_UNSUPPORTED;
> +
> +	lock(&npu->lock);
> +
> +	/* Update the entry if it already exists */
> +	for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++) {
> +		map = npu3_read(npu, NPU3_XTS_BDF_MAP(i));
> +
> +		if (map && GETFIELD(NPU3_XTS_BDF_MAP_BDF, map) == bdf)
> +			break;
> +	}
> +
> +	if (i == NPU3_XTS_BDF_MAP_MAX) {
> +		/* No existing mapping found, find space for a new one */
> +		for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++)
> +			if (!npu3_read(npu, NPU3_XTS_BDF_MAP(i)))
> +				break;
> +	}
> +
> +	if (i == NPU3_XTS_BDF_MAP_MAX) {
> +		NPU3ERR(npu, "No free XTS_BDF[] entry\n");
> +		rc = OPAL_RESOURCE;
> +		goto out;
> +	}
> +
> +	map = NPU3_XTS_BDF_MAP_UNFILT;
> +	map = SETFIELD(NPU3_XTS_BDF_MAP_BDF, map, bdf);
> +	map = SETFIELD(NPU3_XTS_BDF_MAP_LPARID, map, lparid);
> +	map = SETFIELD(NPU3_XTS_BDF_MAP_LPARSHORT, map, i);
> +
> +	/* We only support radix at the moment */
> +	map = SETFIELD(NPU3_XTS_BDF_MAP_XLAT, map, 0x3);
> +
> +	/* Find a link on which to send ATSDs for this device */
> +	npu3_for_each_nvlink_dev(dev, npu)
> +		if (dev->nvlink.gpu->bdfn == bdf)
> +			break;
> +
> +	if (!dev || dev->nvlink.gpu->bdfn != bdf) {
> +		NPU3ERR(npu, "Can't find a link for bdf %02llx:%02llx.%llx\n",
> +			bdf >> 8 & 0xff, bdf >> 3 & 0x1f, bdf & 0x7);
> +		rc = OPAL_PARAMETER;
> +		goto out;
> +	}
> +
> +	map = SETFIELD(NPU3_XTS_BDF_MAP_BRICK, map, dev->index);
> +
> +	NPU3DBG(npu, "XTS_BDF_MAP[%03d] = 0x%08llx\n", i, map);
> +	npu3_write(npu, NPU3_XTS_BDF_MAP(i), map);
> +
> +	/* We need to allocate an ATSD per link */
> +	val = SETFIELD(NPU3_XTS_ATSD_HYP_LPARID, 0ull, lparid);
> +	if (!lparid)
> +		val |= NPU3_XTS_ATSD_HYP_MSR_HV;
> +
> +	npu3_write(npu, NPU3_XTS_ATSD_HYP(dev->index), val);
> +
> +out:
> +	unlock(&npu->lock);
> +	return rc;
> +}
> +
> +static int64_t npu3_relaxed_order_enable(struct npu3 *npu, uint64_t src)
> +{
> +	struct npu3_dev *dev;
> +	uint32_t i;
> +
> +	for (i = 0; i < NPU3_RELAXED_SRC_MAX; i++)
> +		if (npu3_read(npu, NPU3_RELAXED_SRC(i)) == src)
> +			return OPAL_SUCCESS; /* Already enabled */
> +
> +	/* Find somewhere to write this source */
> +	for (i = 0; i < NPU3_RELAXED_SRC_MAX; i++)
> +		if (!npu3_read(npu, NPU3_RELAXED_SRC(i)))
> +			break;
> +
> +	if (i == NPU3_RELAXED_SRC_MAX) {
> +		NPU3ERR(npu, "Insufficient resources to activate relaxed ordering mode\n");
> +		return OPAL_RESOURCE;
> +	}
> +
> +	npu3_write(npu, NPU3_RELAXED_SRC(i), src);
> +
> +	npu3_for_each_nvlink_dev(dev, npu) {
> +		uint64_t val = npu3_read(npu, NPU3_RELAXED_CFG2(dev->index));
> +
> +		val |= NPU3_RELAXED_CFG2_SRC_WRENA(i) |
> +		       NPU3_RELAXED_CFG2_SRC_RDENA(i);
> +		npu3_write(npu, NPU3_RELAXED_CFG2(dev->index), val);
> +	}
> +
> +	return OPAL_SUCCESS;
> +}
> +
> +static void npu3_relaxed_order_disable(struct npu3 *npu, uint64_t src)
> +{
> +	struct npu3_dev *dev;
> +	uint32_t i;
> +
> +	for (i = 0; i < NPU3_RELAXED_SRC_MAX; i++)
> +		if (npu3_read(npu, NPU3_RELAXED_SRC(i)) == src)
> +			break;
> +
> +	if (i == NPU3_RELAXED_SRC_MAX)
> +		return; /* Already disabled */
> +
> +	npu3_for_each_nvlink_dev(dev, npu) {
> +		uint64_t val = npu3_read(npu, NPU3_RELAXED_CFG2(dev->index));
> +
> +		val &= ~NPU3_RELAXED_CFG2_SRC_WRENA(i);
> +		val &= ~NPU3_RELAXED_CFG2_SRC_RDENA(i);
> +		npu3_write(npu, NPU3_RELAXED_CFG2(dev->index), val);
> +	}
> +
> +	npu3_write(npu, NPU3_RELAXED_SRC(i), 0ull);
> +}
> +
> +/* Enable or disable relaxed ordering on all nvlinks for a given PEC. */
> +int64_t npu3_set_relaxed_order(struct phb *phb, uint32_t gcid, int pec,
> +			       bool enable)
> +{
> +	struct npu3 *npu = npu3_phb_to_npu(phb);
> +	int64_t rc = OPAL_SUCCESS;
> +	uint64_t src;
> +
> +	NPU3INF(npu, "%s relaxed ordering for PEC %d on chip %d\n",
> +		enable ? "Enabling" : "Disabling",
> +		pec, gcid);
> +
> +	lock(&npu->lock);
> +
> +	src = SETFIELD(NPU3_RELAXED_SRC_GRPCHP, 0ull, gcid);
> +	src = SETFIELD(NPU3_RELAXED_SRC_PEC, src, pec);
> +	src = SETFIELD(NPU3_RELAXED_SRC_RDSTART, src, 0);
> +	src = SETFIELD(NPU3_RELAXED_SRC_RDEND, src, 47);
> +	src = SETFIELD(NPU3_RELAXED_SRC_WRSTART, src, 0);
> +	src = SETFIELD(NPU3_RELAXED_SRC_WREND, src, 23);
> +
> +	if (enable)
> +		rc = npu3_relaxed_order_enable(npu, src);
> +	else
> +		npu3_relaxed_order_disable(npu, src);
> +
> +	unlock(&npu->lock);
> +	return rc;
> +}
> diff --git a/hw/npu3.c b/hw/npu3.c
> new file mode 100644
> index 000000000000..22ccef2e01aa
> --- /dev/null
> +++ b/hw/npu3.c
> @@ -0,0 +1,554 @@
> +/* Copyright 2019 IBM Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at
> + *
> + *	http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> + * implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <io.h>
> +#include <xscom.h>
> +#include <npu3.h>
> +#include <npu3-regs.h>
> +#include <nvram.h>
> +#include <interrupts.h>
> +#include <xive.h>
> +
> +#define NPU3LOG(l, npu, fmt, a...) \
> +	prlog(l, "NPU[%d:%d]: " fmt, (npu)->chip_id, (npu)->index, ##a)
> +#define NPU3DBG(npu, fmt, a...) NPU3LOG(PR_DEBUG, npu, fmt, ##a)
> +#define NPU3INF(npu, fmt, a...) NPU3LOG(PR_INFO, npu, fmt, ##a)
> +#define NPU3ERR(npu, fmt, a...) NPU3LOG(PR_ERR, npu, fmt, ##a)
> +
> +#define NPU3DEVLOG(l, dev, fmt, a...)		\
> +	prlog(l, "NPU[%d:%d:%d]: " fmt,		\
> +	      (dev)->npu->chip_id,		\
> +	      (dev)->npu->index,		\
> +	      (dev)->index, ##a)
> +#define NPU3DEVDBG(dev, fmt, a...) NPU3DEVLOG(PR_DEBUG, dev, fmt, ##a)
> +#define NPU3DEVINF(dev, fmt, a...) NPU3DEVLOG(PR_INFO, dev, fmt, ##a)
> +#define NPU3DEVERR(dev, fmt, a...) NPU3DEVLOG(PR_ERR, dev, fmt, ##a)
> +
> +static void npu3_dt_create_link(struct dt_node *npu, uint32_t npu_index,
> +				uint32_t dev_index)
> +{
> +	struct dt_node *link;
> +	uint32_t phy_lane_mask, ob_chiplet;
> +
> +	link = dt_new_addr(npu, "link", dev_index);
> +
> +	dt_add_property_string(link, "compatible", "ibm,npu-link");
> +	dt_add_property_cells(link, "reg", dev_index);
> +	dt_add_property_cells(link, "ibm,npu-link-index", dev_index);
> +
> +	switch (npu_index) {
> +	case 0:
> +		/* fall through */
> +	case 2:
> +		ob_chiplet = npu_index ? 3 : 0;
> +
> +		switch (dev_index) {
> +		case 0:
> +			phy_lane_mask = PPC_BITMASK32(0, 3);
> +			break;
> +		case 1:
> +			phy_lane_mask = PPC_BITMASK32(13, 16);
> +			break;
> +		case 2:
> +			phy_lane_mask = PPC_BITMASK32(7, 10);
> +			break;
> +		case 3:
> +			phy_lane_mask = PPC_BITMASK32(20, 23);
> +			break;
> +		}
> +
> +		break;
> +	case 1:
> +		switch (dev_index) {
> +		case 0:
> +			ob_chiplet = 1;
> +			phy_lane_mask = PPC_BITMASK32(0, 3);
> +			break;
> +		case 1:
> +			ob_chiplet = 2;
> +			phy_lane_mask = PPC_BITMASK32(0, 3);
> +			break;
> +		case 2:
> +			ob_chiplet = 1;
> +			phy_lane_mask = PPC_BITMASK32(7, 10);
> +			break;
> +		case 3:
> +			ob_chiplet = 2;
> +			phy_lane_mask = PPC_BITMASK32(7, 10);
> +			break;
> +		}
> +
> +		break;
> +	default:
> +		return;
> +	}
> +
> +	dt_add_property_cells(link, "ibm,npu-phy", ob_chiplet);
> +	dt_add_property_cells(link, "ibm,npu-lane-mask", phy_lane_mask);
> +}
> +
> +static void npu3_dt_create_npu(struct dt_node *xscom, uint32_t npu_index)
> +{
> +	const uint32_t npu_base[] = { 0x5011000, 0x5011400, 0x3011c00 };
> +	struct dt_node *npu;
> +
> +	npu = dt_new_addr(xscom, "npu", npu_base[npu_index]);
> +
> +	dt_add_property_cells(npu, "#size-cells", 0);
> +	dt_add_property_cells(npu, "#address-cells", 1);
> +	dt_add_property_cells(npu, "reg", npu_base[npu_index], 0x2c);
> +	dt_add_property_string(npu, "compatible", "ibm,power9-npu3");
> +	dt_add_property_cells(npu, "ibm,npu-index", npu_index);
> +	dt_add_property_cells(npu, "ibm,phb-index", 7 + npu_index);
> +
> +	for (uint32_t i = 0; i < NPU3_LINKS_PER_NPU; i++)
> +		npu3_dt_create_link(npu, npu_index, i);
> +}
> +
> +/* This can be removed when/if we decide to use HDAT instead */
> +static bool npu3_dt_create(void)
> +{
> +	struct proc_chip *chip = next_chip(NULL);
> +	struct dt_node *xscom;
> +
> +	/* npu3 chips only */
> +	if (proc_gen < proc_gen_p9 ||
> +	    chip->type == PROC_CHIP_P9_NIMBUS ||
> +	    chip->type == PROC_CHIP_P9_CUMULUS)
> +		return false;
> +
> +	dt_for_each_compatible(dt_root, xscom, "ibm,xscom")
> +		for (uint32_t i = 0; i < 3; i++)
> +			npu3_dt_create_npu(xscom, i);
> +
> +	return true;
> +}
> +
> +static struct npu3 *npu3_create(struct dt_node *dn)
> +{
> +	struct npu3 *npu;
> +	struct dt_node *link;
> +	struct npu3_dev *dev;
> +	char *path;
> +	uint32_t i;
> +
> +	npu = zalloc(sizeof(*npu));
> +	assert(npu);
> +
> +	init_lock(&npu->lock);
> +
> +	npu->dt_node = dn;
> +	npu->index = dt_prop_get_u32(dn, "ibm,npu-index");
> +	npu->xscom_base = dt_get_address(dn, 0, NULL);
> +
> +	npu->chip_id = dt_get_chip_id(dn);
> +	assert(get_chip(npu->chip_id));
> +
> +	dt_for_each_compatible(dn, link, "ibm,npu-link") {
> +		i = dt_prop_get_u32(link, "ibm,npu-link-index");
> +		assert(i < NPU3_LINKS_PER_NPU);
> +
> +		dev = &npu->devices[i];
> +		dev->index = i;
> +		dev->npu = npu;
> +		dev->dn = link;
> +		dev->ob_chiplet = dt_prop_get_u32(link, "ibm,npu-phy");
> +		dev->phy_lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask");
> +		dev->proc.status = NPU3_PROC_COMPLETE;
> +	};
> +
> +	path = dt_get_path(dn);
> +	NPU3INF(npu, "Found %s\n", path);
> +	NPU3INF(npu, "SCOM base: 0x%llx\n", npu->xscom_base);
> +	free(path);
> +
> +	return npu;
> +}
> +
> +struct npu3_dev *npu3_next_dev(struct npu3 *npu, struct npu3_dev *dev,
> +			       enum npu3_dev_type type)
> +{
> +	uint32_t i = 0;
> +
> +	if (dev)
> +		i = dev->index + 1;
> +
> +	for (; i < NPU3_LINKS_PER_NPU; i++) {
> +		dev = &npu->devices[i];
> +
> +		if (dev->type == type || type == NPU3_DEV_TYPE_ANY)
> +			return dev;
> +	}
> +
> +	return NULL;
> +}
> +
> +static void npu3_device_detect_fixup(struct npu3_dev *dev)
> +{
> +	struct dt_node *dn = dev->dn;
> +
> +	if (dev->type == NPU3_DEV_TYPE_NVLINK) {
> +		dt_add_property_strings(dn, "ibm,npu-link-type", "nvlink");
> +		dev->link_speed = dt_prop_get_u32_def(
> +					dn, "nvidia,link-speed", 0xff);
> +		return;
> +	}
> +
> +	NPU3DEVDBG(dev, "Link type unknown\n");
> +	dt_add_property_strings(dn, "ibm,npu-link-type", "unknown");
> +}
> +
> +/*
> + * We use the indirect method because it uses the same addresses as
> + * the MMIO offsets (NPU RING)
> + */
> +static void npu3_scom_sel(struct npu3 *npu, uint64_t reg, uint64_t size)
> +{
> +	uint64_t val;
> +
> +	val = SETFIELD(NPU3_MISC_DA_ADDR, 0ull, reg);
> +	val = SETFIELD(NPU3_MISC_DA_LEN, val, size);
> +	xscom_write(npu->chip_id,
> +		    npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_ADDR,
> +		    val);
> +}
> +
> +static void npu3_scom_write(struct npu3 *npu, uint64_t reg, uint64_t size,
> +			    uint64_t val)
> +{
> +	npu3_scom_sel(npu, reg, size);
> +	xscom_write(npu->chip_id,
> +		    npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_DATA,
> +		    val);
> +}
> +
> +static uint64_t npu3_scom_read(struct npu3 *npu, uint64_t reg, uint64_t size)
> +{
> +	uint64_t val;
> +
> +	npu3_scom_sel(npu, reg, size);
> +	xscom_read(npu->chip_id,
> +		   npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_DATA,
> +		   &val);
> +
> +	return val;
> +}
> +
> +void npu3_write(struct npu3 *npu, uint64_t reg, uint64_t val)
> +{
> +	void *mmio = (void *)npu->regs[0];
> +
> +	if (mmio)
> +		out_be64(mmio + reg, val);
> +	else
> +		npu3_scom_write(npu, reg, NPU3_MISC_DA_LEN_8B, val);
> +
> +	/* CQ_SM writes should be mirrored in all four blocks */
> +	if (NPU3_REG_BLOCK(reg) != NPU3_BLOCK_CQ_SM(0))
> +		return;
> +
> +	for (uint32_t i = 1; i < 4; i++)
> +		npu3_write(npu, NPU3_BLOCK_CQ_SM(i) + NPU3_REG_OFFSET(reg),
> +			   val);
> +}
> +
> +uint64_t npu3_read(struct npu3 *npu, uint64_t reg)
> +{
> +	void *mmio = (void *)npu->regs[0];
> +
> +	if (mmio)
> +		return in_be64(mmio + reg);
> +
> +	return npu3_scom_read(npu, reg, NPU3_MISC_DA_LEN_8B);
> +}
> +
> +void npu3_write_4b(struct npu3 *npu, uint64_t reg, uint32_t val)
> +{
> +	void *mmio = (void *)npu->regs[0];
> +
> +	if (mmio)
> +		out_be32(mmio + reg, val);
> +	else
> +		npu3_scom_write(npu, reg, NPU3_MISC_DA_LEN_4B,
> +				(uint64_t)val << 32);
> +
> +	if (NPU3_REG_BLOCK(reg) != NPU3_BLOCK_CQ_SM(0))
> +		return;
> +
> +	for (uint32_t i = 1; i < 4; i++)
> +		npu3_write_4b(npu, NPU3_BLOCK_CQ_SM(i) + NPU3_REG_OFFSET(reg),
> +			      val);
> +}
> +
> +uint32_t npu3_read_4b(struct npu3 *npu, uint64_t reg)
> +{
> +	void *mmio = (void *)npu->regs[0];
> +
> +	if (mmio)
> +		return in_be32(mmio + reg);
> +
> +	return npu3_scom_read(npu, reg, NPU3_MISC_DA_LEN_4B) >> 32;
> +}
> +
> +static void npu3_misc_config(struct npu3 *npu)
> +{
> +	struct npu3_dev *dev;
> +	uint32_t typemap = 0;
> +	uint64_t reg, val;
> +
> +	npu3_for_each_nvlink_dev(dev, npu)
> +		typemap |= 0x10 >> dev->index;
> +
> +	reg = NPU3_SM_MISC_CFG0;
> +	val = npu3_read(npu, reg);
> +	val |= NPU3_SM_MISC_CFG0_ENABLE_PBUS;
> +	val &= ~NPU3_SM_MISC_CFG0_ENABLE_SNARF_CPM;
> +	val = SETFIELD(NPU3_SM_MISC_CFG0_NVLINK_MODE, val, typemap);
> +	val = SETFIELD(NPU3_SM_MISC_CFG0_OCAPI_MODE, val, ~typemap);
> +	npu3_write(npu, reg, val);
> +
> +	reg = NPU3_CTL_MISC_CFG2;
> +	val = npu3_read(npu, reg);
> +	val = SETFIELD(NPU3_CTL_MISC_CFG2_NVLINK_MODE, val, typemap);
> +	val = SETFIELD(NPU3_CTL_MISC_CFG2_OCAPI_MODE, val, ~typemap);
> +	npu3_write(npu, reg, val);
> +
> +	reg = NPU3_DAT_MISC_CFG1;
> +	val = npu3_read(npu, reg);
> +	val = SETFIELD(NPU3_DAT_MISC_CFG1_NVLINK_MODE, val, typemap);
> +	val = SETFIELD(NPU3_DAT_MISC_CFG1_OCAPI_MODE, val, ~typemap);
> +	npu3_write(npu, reg, val);
> +}
> +
> +/*
> + * Assign MMIO BARs: one global register window per NPU, one NTL BAR
> + * per device, and a single generation-ID BAR that is logically sliced
> + * into 64K windows, one per device. Addresses come from the phys map.
> + */
> +static void npu3_assign_bars(struct npu3 *npu)
> +{
> +	struct npu3_dev *dev;
> +	uint64_t addr, size, val;
> +
> +	/* Global MMIO bar (per npu) */
> +	phys_map_get(npu->chip_id, NPU_REGS, npu->index, &addr, &size);
> +	val = SETFIELD(NPU3_MMIO_BAR_ADDR, 0ull, addr >> 24);
> +	val |= NPU3_MMIO_BAR_ENABLE;
> +	npu3_write(npu, NPU3_MMIO_BAR, val);
> +
> +	/* Cache the window so npu3_read/write can use MMIO over SCOM */
> +	NPU3INF(npu, "MMIO base: 0x%016llx (%lldMB)\n", addr, size >> 20);
> +	npu->regs[0] = addr;
> +	npu->regs[1] = size;
> +
> +	/* NTL bar (per device) */
> +	npu3_for_each_dev(dev, npu) {
> +		phys_map_get(npu->chip_id, NPU_NTL, npu3_chip_dev_index(dev),
> +			     &addr, &size);
> +		val = SETFIELD(NPU3_NTL_BAR_ADDR, 0ull, addr >> 16);
> +		val = SETFIELD(NPU3_NTL_BAR_SIZE, val, ilog2(size >> 16));
> +		npu3_write(npu, NPU3_NTL_BAR(dev->index), val);
> +
> +		dev->ntl_bar.addr = addr;
> +		dev->ntl_bar.size = size;
> +	}
> +
> +	/* GENID bar (logically divided per device) */
> +	phys_map_get(npu->chip_id, NPU_GENID, npu->index, &addr, NULL);
> +	val = SETFIELD(NPU3_GENID_BAR_ADDR, 0ull, addr >> 19);
> +	npu3_write(npu, NPU3_GENID_BAR, val);
> +
> +	/* Each device gets a 64K slice of the generation-ID space */
> +	npu3_for_each_dev(dev, npu) {
> +		dev->genid_bar.addr = addr + (dev->index << 16);
> +		dev->genid_bar.size = 64 << 10;
> +	}
> +}
> +
> +/*
> + * Enable or disable the NTL and GENID BARs for one device.
> + *
> + * The generation-ID space is a single hardware BAR that skiboot slices
> + * up per device, so it is only disabled in hardware once every device
> + * on this NPU has disabled its slice.
> + */
> +void npu3_dev_enable_bars(struct npu3_dev *dev, bool enable)
> +{
> +	struct npu3 *npu = dev->npu;
> +	struct npu3_dev *other;
> +	uint64_t reg, val;
> +
> +	if (dev->ntl_bar.enable == enable) /* No state change */
> +		return;
> +
> +	dev->ntl_bar.enable = enable;
> +	dev->genid_bar.enable = enable;
> +
> +	reg = NPU3_NTL_BAR(dev->index);
> +	val = npu3_read(npu, reg);
> +	val = SETFIELD(NPU3_NTL_BAR_ENABLE, val, enable);
> +	npu3_write(npu, reg, val);
> +
> +	/*
> +	 * Generation IDs are a single space in the hardware but we split them
> +	 * per device. Only disable in hardware if every device has disabled.
> +	 * Iterate with a separate pointer so the loop macro does not clobber
> +	 * the caller's 'dev' argument.
> +	 */
> +	if (!enable) {
> +		npu3_for_each_dev(other, npu) {
> +			if (other->genid_bar.enable)
> +				return;
> +		}
> +	}
> +
> +	reg = NPU3_GENID_BAR;
> +	val = npu3_read(npu, reg);
> +	val = SETFIELD(NPU3_GENID_BAR_ENABLE, val, enable);
> +	npu3_write(npu, reg, val);
> +}
> +
> +/*
> + * IRQ attributes for the NPU's IPI levels. Level 18 (the "TCE Event"
> + * in npu3_ipi_name()'s table) is routed to OPAL so a frozen PE can be
> + * detected; every other level is delivered directly to the OS.
> + */
> +static uint64_t npu3_ipi_attributes(struct irq_source *is, uint32_t isn)
> +{
> +	struct npu3 *npu = is->data;
> +	uint32_t level = isn - npu->irq_base;
> +
> +	/* TCE interrupt is used to detect a frozen PE */
> +	if (level == 18)
> +		return IRQ_ATTR_TARGET_OPAL |
> +		       IRQ_ATTR_TARGET_RARE |
> +		       IRQ_ATTR_TYPE_MSI;
> +
> +	return IRQ_ATTR_TARGET_LINUX;
> +}
> +
> +/*
> + * OPAL-side handler for the levels claimed in npu3_ipi_attributes().
> + * Only level 18 (the TCE event) is expected here; it is surfaced to
> + * the OS as a pending PCI error event.
> + */
> +static void npu3_ipi_interrupt(struct irq_source *is, uint32_t isn)
> +{
> +	struct npu3 *npu = is->data;
> +	uint32_t level = isn - npu->irq_base;
> +
> +	if (level != 18) {
> +		NPU3ERR(npu, "Received unknown interrupt %d\n", level);
> +		return;
> +	}
> +
> +	opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, OPAL_EVENT_PCI_ERROR);
> +}
> +
> +#define NPU3_IRQ_LEVELS 60
> +
> +/*
> + * Human-readable name for an IPI level, returned as a heap copy owned
> + * by the caller. Levels 22-40 are intentionally absent from the table
> + * and report as "Unknown".
> + *
> + * NOTE(review): strdup() can return NULL on allocation failure;
> + * confirm that callers of irq_source_ops->name() tolerate that.
> + */
> +static char *npu3_ipi_name(struct irq_source *is, uint32_t isn)
> +{
> +	struct npu3 *npu = is->data;
> +	uint32_t level = isn - npu->irq_base;
> +	static const char *names[NPU3_IRQ_LEVELS] = {
> +		[0] = "NDL 0 Stall Event (brick 0)",
> +		[1] = "NDL 0 No-Stall Event (brick 0)",
> +		[2] = "NDL 1 Stall Event (brick 1)",
> +		[3] = "NDL 1 No-Stall Event (brick 1)",
> +		[4] = "NDL 2 Stall Event (brick 2)",
> +		[5] = "NDL 2 No-Stall Event (brick 2)",
> +		[6] = "NDL 3 Stall Event (brick 3)",
> +		[7] = "NDL 3 No-Stall Event (brick 3)",
> +		[8] = "NDL 4 Stall Event (brick 4)",
> +		[9] = "NDL 4 No-Stall Event (brick 4)",
> +		[10] = "NDL 5 Stall Event (brick 5)",
> +		[11] = "NDL 5 No-Stall Event (brick 5)",
> +		[12] = "NTL 0 Event",
> +		[13] = "NTL 1 Event",
> +		[14] = "NTL 2 Event",
> +		[15] = "NTL 3 Event",
> +		[16] = "NTL 4 Event",
> +		[17] = "NTL 5 Event",
> +		[18] = "TCE Event",
> +		[19] = "ATS Event",
> +		[20] = "CQ Event",
> +		[21] = "MISC Event",
> +		[41] = "Memory Controller Event",
> +		[42] = "NDL 6 Stall Event (brick 6)",
> +		[43] = "NDL 6 No-Stall Event (brick 6)",
> +		[44] = "NDL 7 Stall Event (brick 7)",
> +		[45] = "NDL 7 No-Stall Event (brick 7)",
> +		[46] = "NDL 8 Stall Event (brick 8)",
> +		[47] = "NDL 8 No-Stall Event (brick 8)",
> +		[48] = "NDL 9 Stall Event (brick 9)",
> +		[49] = "NDL 9 No-Stall Event (brick 9)",
> +		[50] = "NDL 10 Stall Event (brick 10)",
> +		[51] = "NDL 10 No-Stall Event (brick 10)",
> +		[52] = "NDL 11 Stall Event (brick 11)",
> +		[53] = "NDL 11 No-Stall Event (brick 11)",
> +		[54] = "NTL 6 Event",
> +		[55] = "NTL 7 Event",
> +		[56] = "NTL 8 Event",
> +		[57] = "NTL 9 Event",
> +		[58] = "NTL 10 Event",
> +		[59] = "NTL 11 Event",
> +	};
> +
> +	if (level >= NPU3_IRQ_LEVELS || !names[level])
> +		return strdup("Unknown");
> +
> +	return strdup(names[level]);
> +}
> +
> +/* IPI source callbacks registered with XIVE in npu3_setup_irqs() */
> +static const struct irq_source_ops npu3_ipi_ops = {
> +	.attributes	= npu3_ipi_attributes,
> +	.interrupt	= npu3_ipi_interrupt,
> +	.name		= npu3_ipi_name,
> +};
> +
> +/*
> + * Allocate and configure this NPU's interrupt sources. On allocation
> + * failure the NPU is left without IRQs (npu->irq_base is never set and
> + * the hardware IPI configuration below is skipped) but init continues.
> + */
> +static void npu3_setup_irqs(struct npu3 *npu)
> +{
> +	uint64_t reg, val;
> +	uint32_t base;
> +
> +	/*
> +	 * 60 IPI levels; the 64 presumably requests a 64-interrupt
> +	 * alignment -- confirm against xive_alloc_ipi_irqs().
> +	 */
> +	base = xive_alloc_ipi_irqs(npu->chip_id, NPU3_IRQ_LEVELS, 64);
> +	if (base == XIVE_IRQ_ERROR) {
> +		NPU3ERR(npu, "Failed to allocate interrupt sources\n");
> +		return;
> +	}
> +
> +	xive_register_ipi_source(base, NPU3_IRQ_LEVELS, npu, &npu3_ipi_ops);
> +
> +	/* Set IPI configuration */
> +	reg = NPU3_MISC_CFG;
> +	val = npu3_read(npu, reg);
> +	val = SETFIELD(NPU3_MISC_CFG_IPI_PS, val, NPU3_MISC_CFG_IPI_PS_64K);
> +	val = SETFIELD(NPU3_MISC_CFG_IPI_OS, val, NPU3_MISC_CFG_IPI_OS_AIX);
> +	npu3_write(npu, reg, val);
> +
> +	/* Set IRQ base: the trigger page address, shifted to 4K units */
> +	reg = NPU3_MISC_INT_BAR;
> +	val = SETFIELD(NPU3_MISC_INT_BAR_ADDR, 0ull,
> +		       (uint64_t)xive_get_trigger_port(base) >> 12);
> +	npu3_write(npu, reg, val);
> +
> +	npu->irq_base = base;
> +}
> +
> +/*
> + * Bring up one NPU: let the platform identify what is connected to
> + * each brick, apply per-device fixups, then configure mode bits, BARs
> + * and IRQs before handing the NVLink devices to the PHB layer.
> + */
> +static void npu3_init(struct npu3 *npu)
> +{
> +	struct npu3_dev *dev;
> +
> +	/* The platform hook is guaranteed non-NULL by probe_npu3() */
> +	platform.npu3_device_detect(npu);
> +	npu3_for_each_dev(dev, npu)
> +		npu3_device_detect_fixup(dev);
> +
> +	npu3_misc_config(npu);
> +	npu3_assign_bars(npu);
> +	npu3_setup_irqs(npu);
> +	npu3_init_nvlink(npu);
> +}
> +
> +/*
> + * Entry point: create npu3 device tree nodes and initialize every NPU
> + * found. Bails out early if there are no nodes to create or if the
> + * platform provides no device-detection hook.
> + */
> +void probe_npu3(void)
> +{
> +	struct dt_node *dn;
> +	struct npu3 *npu;
> +
> +	if (!npu3_dt_create())
> +		return;
> +
> +	if (!platform.npu3_device_detect) {
> +		prlog(PR_INFO, "NPU: Platform does not support NPU\n");
> +		return;
> +	}
> +
> +	dt_for_each_compatible(dt_root, dn, "ibm,power9-npu3") {
> +		npu = npu3_create(dn);
> +		/* Defensive: skip the node if creation failed -- confirm
> +		 * whether npu3_create() can actually return NULL */
> +		if (!npu)
> +			continue;
> +		npu3_init(npu);
> +	}
> +}
> diff --git a/include/npu3-regs.h b/include/npu3-regs.h
> new file mode 100644
> index 000000000000..ce76bf3dc59a
> --- /dev/null
> +++ b/include/npu3-regs.h
> @@ -0,0 +1,247 @@
> +/* Copyright 2019 IBM Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at
> + *
> + *      http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> + * implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef __NPU3_REGS_H
> +#define __NPU3_REGS_H
> +
> +#define NPU3_FIR(n)				(0x2c00 + (n) * 0x40)
> +#define NPU3_FIR_MASK(n)			(0x2c03 + (n) * 0x40)
> +#define NPU3_FIR_ACTION0(n)			(0x2c06 + (n) * 0x40)
> +#define NPU3_FIR_ACTION1(n)			(0x2c07 + (n) * 0x40)
> +#define NPU3_FIR_MAX				3
> +
> +/* NPU RING: Indirect address/data port */
> +#define NPU3_MISC_SCOM_IND_SCOM_ADDR		0x33e
> +#define   NPU3_MISC_DA_ADDR			PPC_BITMASK(0, 23)
> +#define   NPU3_MISC_DA_LEN			PPC_BITMASK(24, 25)
> +#define     NPU3_MISC_DA_LEN_4B			2
> +#define     NPU3_MISC_DA_LEN_8B			3
> +#define NPU3_MISC_SCOM_IND_SCOM_DATA		0x33f
> +
> +/* NPU RING: Indirect register blocks */
> +#define NPU3_BLOCK(nib0, nib1)			((nib0) << 20 | (nib1) << 16)
> +#define NPU3_REG_BLOCK(reg)			((reg) & 0xff0000)
> +#define NPU3_REG_OFFSET(reg)			((reg) & 0xffff)
> +
> +#define NPU3_BLOCK_NDL_U(brk)			NPU3_BLOCK(0 + (brk) / 2,\
> +							   8 + (brk) % 2 * 2)
> +#define NPU3_BLOCK_NTL_U(brk)			NPU3_BLOCK(0 + (brk) / 2,\
> +							   9 + (brk) % 2 * 2)
> +#define NPU3_BLOCK_CQ_SM(n)			NPU3_BLOCK(4, (n))
> +#define NPU3_BLOCK_CQ_CTL			NPU3_BLOCK(4, 4)
> +#define NPU3_BLOCK_CQ_DAT			NPU3_BLOCK(4, 5)
> +#define NPU3_BLOCK_NDL(brk)			NPU3_BLOCK(4 + (brk) / 2,\
> +							   8 + (brk) % 2 * 2)
> +#define NPU3_BLOCK_NTL(brk)			NPU3_BLOCK(4 + (brk) / 2,\
> +							   9 + (brk) % 2 * 2)
> +#define NPU3_BLOCK_NPU_ATS			NPU3_BLOCK(7, 0)
> +#define NPU3_BLOCK_NPU_XTS			NPU3_BLOCK(7, 1)
> +#define NPU3_BLOCK_NPU_MISC			NPU3_BLOCK(7, 2)
> +#define NPU3_BLOCK_NPU_XTS_ATSD(n)		NPU3_BLOCK(8, (n))
> +
> +/* NDL_U block registers */
> +#define NPU3_DLPL_CTL(brk)			(NPU3_BLOCK_NDL_U(brk) + 0xfff4)
> +#define   NPU3_DLPL_CTL_RESET_RX		PPC_BIT32(0)
> +#define   NPU3_DLPL_CTL_RESET_MISC		PPC_BIT32(1)
> +#define NPU3_DLPL_CFG(brk)			(NPU3_BLOCK_NDL_U(brk) + 0xfff8)
> +#define   NPU3_DLPL_CFG_PRI_BYTESWAP		PPC_BIT32(0)
> +
> +/* NTL_U block registers */
> +#define NPU3_NTL_MISC_CFG1(brk)			(NPU3_BLOCK_NTL_U(brk) + 0x0c0)
> +#define   NPU3_NTL_MISC_CFG1_NTL_RESET		PPC_BITMASK(8, 9)
> +#define NPU3_NTL_CREQ_HDR_CRED_SND(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x400)
> +#define NPU3_NTL_PRB_HDR_CRED_SND(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x410)
> +#define NPU3_NTL_ATR_HDR_CRED_SND(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x418)
> +#define NPU3_NTL_RSP_HDR_CRED_SND(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x428)
> +#define NPU3_NTL_CREQ_DAT_CRED_SND(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x430)
> +#define NPU3_NTL_RSP_DAT_CRED_SND(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x438)
> +#define NPU3_NTL_CREQ_HDR_CRED_RCV(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x440)
> +#define NPU3_NTL_DGD_HDR_CRED_RCV(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x448)
> +#define NPU3_NTL_ATSD_HDR_CRED_RCV(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x460)
> +#define NPU3_NTL_RSP_HDR_CRED_RCV(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x468)
> +#define NPU3_NTL_CREQ_DAT_CRED_RCV(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x470)
> +#define NPU3_NTL_RSP_DAT_CRED_RCV(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x478)
> +#define NPU3_NTL_CQ_FENCE_STATUS(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x500)
> +#define   NPU3_NTL_CQ_FENCE_STATUS_FIELD	PPC_BITMASK(0, 1)
> +#define     NPU3_NTL_CQ_FENCE_STATUS_FULL	3
> +#define     NPU3_NTL_CQ_FENCE_STATUS_HALF	2
> +#define     NPU3_NTL_CQ_FENCE_STATUS_NONE	0
> +
> +/*
> + * CQ_SM block registers
> + *
> + * Definitions here use NPU3_BLOCK_CQ_SM(0), but when npu3_write() is given
> + * one of these, it will do corresponding writes to every CQ_SM block.
> + */
> +#define NPU3_SM_MISC_CFG0			(NPU3_BLOCK_CQ_SM(0) + 0x000)
> +#define   NPU3_SM_MISC_CFG0_ENABLE_PBUS		PPC_BIT(26)
> +#define   NPU3_SM_MISC_CFG0_ENABLE_SNARF_CPM	PPC_BIT(27)
> +#define   NPU3_SM_MISC_CFG0_OCAPI_MODE		PPC_BITMASK(44, 48)
> +#define   NPU3_SM_MISC_CFG0_NVLINK_MODE		PPC_BITMASK(49, 53)
> +#define NPU3_SM_MISC_CFG1			(NPU3_BLOCK_CQ_SM(0) + 0x008)
> +#define NPU3_SM_MISC_CFG2			(NPU3_BLOCK_CQ_SM(0) + 0x0f0)
> +#define NPU3_GPU_MEM_BAR(brk)			(NPU3_BLOCK_CQ_SM(0) + 0x190 + (brk) * 8)
> +#define   NPU3_GPU_MEM_BAR_ENABLE		PPC_BIT(0)
> +#define   NPU3_GPU_MEM_BAR_ADDR_MASK		PPC_BITMASK(1, 35)
> +#define     NPU3_GPU_MEM_BAR_ADDR		PPC_BITMASK(1, 21)
> +#define     NPU3_GPU_MEM_BAR_SIZE		PPC_BITMASK(22, 35)
> +#define   NPU3_GPU_MEM_BAR_SL_MODE		PPC_BIT(36)
> +#define   NPU3_GPU_MEM_BAR_4T_LIMIT		PPC_BIT(37)
> +#define   NPU3_GPU_MEM_BAR_4T_SELECT		PPC_BITMASK(38, 39)
> +#define   NPU3_GPU_MEM_BAR_MODE			PPC_BITMASK(40, 43)
> +#define   NPU3_GPU_MEM_BAR_POISON		PPC_BIT(45)
> +#define   NPU3_GPU_MEM_BAR_CHIP_EQ_GROUP	PPC_BIT(49)
> +#define NPU3_NTL_BAR(brk)			(NPU3_BLOCK_CQ_SM(0) + 0x1b8 + (brk) * 8)
> +#define   NPU3_NTL_BAR_ENABLE			PPC_BIT(0)
> +#define   NPU3_NTL_BAR_ADDR			PPC_BITMASK(3, 35)
> +#define   NPU3_NTL_BAR_SIZE			PPC_BITMASK(39, 43)
> +#define     NPU3_NTL_BAR_SIZE_128K		1
> +#define NPU3_MMIO_BAR				(NPU3_BLOCK_CQ_SM(0) + 0x1e0)
> +#define   NPU3_MMIO_BAR_ENABLE			PPC_BIT(0)
> +#define   NPU3_MMIO_BAR_ADDR			PPC_BITMASK(3, 27)
> +#define NPU3_GENID_BAR				(NPU3_BLOCK_CQ_SM(0) + 0x1e8)
> +#define   NPU3_GENID_BAR_ENABLE			PPC_BIT(0)
> +#define   NPU3_GENID_BAR_ADDR			PPC_BITMASK(3, 32)
> +#define NPU3_RELAXED_SRC(n)			(NPU3_BLOCK_CQ_SM(0) + 0x1f0 + (n) * 8)
> +#define   NPU3_RELAXED_SRC_MAX			4
> +#define   NPU3_RELAXED_SRC_TAG			PPC_BITMASK(0, 13)
> +#define     NPU3_RELAXED_SRC_GRPCHP		PPC_BITMASK(0, 6)
> +#define     NPU3_RELAXED_SRC_PEC		PPC_BITMASK(12, 13)
> +#define   NPU3_RELAXED_SRC_TAGMASK		PPC_BITMASK(14, 27)
> +#define   NPU3_RELAXED_SRC_MASK_NPU		PPC_BIT(28)
> +#define   NPU3_RELAXED_SRC_MASK_PCIE		PPC_BIT(29)
> +#define   NPU3_RELAXED_SRC_MASK_L2L3		PPC_BIT(30)
> +#define   NPU3_RELAXED_SRC_RDSTART		PPC_BITMASK(32, 39)
> +#define   NPU3_RELAXED_SRC_RDEND		PPC_BITMASK(40, 47)
> +#define   NPU3_RELAXED_SRC_WRSTART		PPC_BITMASK(48, 55)
> +#define   NPU3_RELAXED_SRC_WREND		PPC_BITMASK(56, 63)
> +#define NPU3_RELAXED_CFG2(brk)			(NPU3_BLOCK_CQ_SM(0) + 0x230 + (brk) * 8)
> +#define   NPU3_RELAXED_CFG2_CMD_CL_DMA_W	PPC_BIT(0)
> +#define   NPU3_RELAXED_CFG2_CMD_CL_DMA_W_HP	PPC_BIT(1)
> +#define   NPU3_RELAXED_CFG2_CMD_CL_DMA_INJ	PPC_BIT(2)
> +#define   NPU3_RELAXED_CFG2_CMD_PR_DMA_INJ	PPC_BIT(3)
> +#define   NPU3_RELAXED_CFG2_CMD_DMA_PR_W	PPC_BIT(4)
> +#define   NPU3_RELAXED_CFG2_CMD_CL_RD_NC_F0	PPC_BIT(5)
> +#define   NPU3_RELAXED_CFG2_SRC_WRENA(src)	PPC_BIT(32 + (src) * 4)
> +#define   NPU3_RELAXED_CFG2_SRC_RDENA(src)	PPC_BIT(33 + (src) * 4)
> +#define   NPU3_RELAXED_CFG2_SRC_AWENA(src)	PPC_BIT(34 + (src) * 4)
> +#define   NPU3_RELAXED_CFG2_SRC_ARENA(src)	PPC_BIT(35 + (src) * 4)
> +
> +/* CQ_CTL block registers */
> +#define NPU3_CTL_MISC_CFG0			(NPU3_BLOCK_CQ_CTL + 0x000)
> +#define NPU3_CTL_MISC_CFG1			(NPU3_BLOCK_CQ_CTL + 0x008)
> +#define NPU3_CTL_MISC_CFG2			(NPU3_BLOCK_CQ_CTL + 0x010)
> +#define   NPU3_CTL_MISC_CFG2_OCAPI_MODE		PPC_BITMASK(0, 4)
> +#define   NPU3_CTL_MISC_CFG2_NVLINK_MODE	PPC_BITMASK(5, 9)
> +#define NPU3_CTL_MISC_CFG3			(NPU3_BLOCK_CQ_CTL + 0x018)
> +#define NPU3_CTL_BDF2PE_CFG(n)			(NPU3_BLOCK_CQ_CTL + 0x180 + (n) * 8)
> +#define   NPU3_CTL_BDF2PE_CFG_ENABLE		PPC_BIT(0)
> +#define   NPU3_CTL_BDF2PE_CFG_PE		PPC_BITMASK(4, 7)
> +#define   NPU3_CTL_BDF2PE_CFG_BDF		PPC_BITMASK(8, 23)
> +
> +/* CQ_DAT block registers */
> +#define NPU3_DAT_MISC_CFG1			(NPU3_BLOCK_CQ_DAT + 0x008)
> +#define   NPU3_DAT_MISC_CFG1_OCAPI_MODE		PPC_BITMASK(40, 44)
> +#define   NPU3_DAT_MISC_CFG1_NVLINK_MODE	PPC_BITMASK(45, 49)
> +
> +/* NTL block registers */
> +#define NPU3_NTL_MISC_CFG2(brk)			(NPU3_BLOCK_NTL(brk) + 0x000)
> +#define   NPU3_NTL_MISC_CFG2_BRICK_ENABLE	PPC_BIT(0)
> +#define   NPU3_NTL_MISC_CFG2_NDL_RX_PARITY_ENA	PPC_BIT(16)
> +#define   NPU3_NTL_MISC_CFG2_NDL_TX_PARITY_ENA	PPC_BIT(17)
> +#define   NPU3_NTL_MISC_CFG2_NDL_PRI_PARITY_ENA	PPC_BIT(18)
> +#define   NPU3_NTL_MISC_CFG2_RCV_CREDIT_OVERFLOW_ENA PPC_BIT(19)
> +#define NPU3_NTL_PRI_CFG(brk)			(NPU3_BLOCK_NTL(brk) + 0x0b0)
> +#define   NPU3_NTL_PRI_CFG_NDL			PPC_BITMASK(1, 2)
> +
> +/* NPU_ATS block registers */
> +#define NPU3_ATS_IODA_ADDR			(NPU3_BLOCK_NPU_ATS + 0x108)
> +#define   NPU3_ATS_IODA_ADDR_AUTO_INC		PPC_BIT(0)
> +#define   NPU3_ATS_IODA_ADDR_TBL_SEL		PPC_BITMASK(11, 15)
> +#define     NPU3_ATS_IODA_ADDR_TBL_TVT		9
> +#define   NPU3_ATS_IODA_ADDR_TBL_ADDR		PPC_BITMASK(54, 63)
> +#define NPU3_ATS_IODA_DATA			(NPU3_BLOCK_NPU_ATS + 0x110)
> +#define   NPU3_ATS_IODA_TVT_XLAT_ADDR		PPC_BITMASK(0, 47)
> +#define   NPU3_ATS_IODA_TVT_TABLE_LEVEL		PPC_BITMASK(48, 50)
> +#define   NPU3_ATS_IODA_TVT_TABLE_SIZE		PPC_BITMASK(51, 55)
> +#define   NPU3_ATS_IODA_TVT_PAGE_SIZE		PPC_BITMASK(59, 63)
> +#define NPU3_ATS_TCE_KILL			(NPU3_BLOCK_NPU_ATS + 0x120)
> +#define   NPU3_ATS_TCE_KILL_ALL			PPC_BIT(0)
> +#define   NPU3_ATS_TCE_KILL_ONE			PPC_BIT(2)
> +#define   NPU3_ATS_TCE_KILL_PE_NUMBER		PPC_BITMASK(4, 7)
> +#define   NPU3_ATS_TCE_KILL_ADDRESS		PPC_BITMASK(15, 51)
> +
> +/* NPU_XTS block registers */
> +#define NPU3_XTS_CFG				(NPU3_BLOCK_NPU_XTS + 0x020)
> +#define   NPU3_XTS_CFG_MMIOSD			PPC_BIT(1)
> +#define   NPU3_XTS_CFG_TRY_ATR_RO		PPC_BIT(6)
> +#define   NPU3_XTS_CFG_OPENCAPI			PPC_BIT(15)
> +#define NPU3_XTS_CFG2				(NPU3_BLOCK_NPU_XTS + 0x028)
> +#define   NPU3_XTS_CFG2_NO_FLUSH_ENA		PPC_BIT(49)
> +#define   NPU3_XTS_CFG2_XSL2_ENA		PPC_BIT(55)
> +#define NPU3_XTS_CFG3				(NPU3_BLOCK_NPU_XTS + 0x068)
> +#define NPU3_XTS_ATSD_HYP(n)			(NPU3_BLOCK_NPU_XTS + 0x100 + (n) * 8)
> +#define   NPU3_XTS_ATSD_HYP_MSR_HV		PPC_BIT(51)
> +#define   NPU3_XTS_ATSD_HYP_LPARID		PPC_BITMASK(52, 63)
> +#define NPU3_XTS_BDF_MAP(n)			(NPU3_BLOCK_NPU_XTS + 0x4000 + (n) * 8)
> +#define   NPU3_XTS_BDF_MAP_MAX			16
> +#define   NPU3_XTS_BDF_MAP_VALID		PPC_BIT(0)
> +#define   NPU3_XTS_BDF_MAP_UNFILT		PPC_BIT(1)
> +#define   NPU3_XTS_BDF_MAP_STACK		PPC_BITMASK(4, 6)
> +#define   NPU3_XTS_BDF_MAP_BRICK		PPC_BITMASK(7, 9)
> +#define   NPU3_XTS_BDF_MAP_BDF			PPC_BITMASK(16, 31)
> +#define   NPU3_XTS_BDF_MAP_XLAT			PPC_BITMASK(39, 40)
> +#define   NPU3_XTS_BDF_MAP_LPCR_PS		PPC_BITMASK(41, 43)
> +#define   NPU3_XTS_BDF_MAP_LPCR_ISL		PPC_BIT(44)
> +#define   NPU3_XTS_BDF_MAP_LPCR_TC		PPC_BIT(45)
> +#define   NPU3_XTS_BDF_MAP_LPCR_SC		PPC_BIT(46)
> +#define   NPU3_XTS_BDF_MAP_LPCR_BOT		PPC_BIT(47)
> +#define   NPU3_XTS_BDF_MAP_LPARSHORT		PPC_BITMASK(48, 51)
> +#define   NPU3_XTS_BDF_MAP_LPARID		PPC_BITMASK(52, 63)
> +#define NPU3_XTS_PID_MAP(n)			(NPU3_BLOCK_NPU_XTS + 0x8000 + (n) * 32)
> +#define   NPU3_XTS_PID_MAP_VALID_ATRGPA0	PPC_BIT(0)
> +#define   NPU3_XTS_PID_MAP_VALID_ATRGPA1	PPC_BIT(1)
> +#define   NPU3_XTS_PID_MAP_VALID_ATSD		PPC_BIT(2)
> +#define   NPU3_XTS_PID_MAP_MSR			PPC_BITMASK(25, 31)
> +#define     NPU3_XTS_PID_MAP_MSR_DR		PPC_BIT(25)
> +#define     NPU3_XTS_PID_MAP_MSR_TA		PPC_BIT(26)
> +#define     NPU3_XTS_PID_MAP_MSR_HV		PPC_BIT(27)
> +#define     NPU3_XTS_PID_MAP_MSR_PR		PPC_BIT(28)
> +#define     NPU3_XTS_PID_MAP_MSR_US		PPC_BIT(29)
> +#define     NPU3_XTS_PID_MAP_MSR_SF		PPC_BIT(30)
> +#define     NPU3_XTS_PID_MAP_MSR_UV		PPC_BIT(31)
> +#define   NPU3_XTS_PID_MAP_LPARSHORT		PPC_BITMASK(40, 43)
> +#define   NPU3_XTS_PID_MAP_PID       		PPC_BITMASK(44, 63)
> +
> +/* NPU_MISC block registers */
> +#define NPU3_MISC_CFG				(NPU3_BLOCK_NPU_MISC + 0x030)
> +#define   NPU3_MISC_CFG_IPI_PS			PPC_BIT(11)
> +#define     NPU3_MISC_CFG_IPI_PS_64K		1
> +#define   NPU3_MISC_CFG_IPI_OS			PPC_BIT(12)
> +#define     NPU3_MISC_CFG_IPI_OS_AIX		0
> +#define     NPU3_MISC_CFG_IPI_OS_LINUX		1
> +#define NPU3_MISC_INT_BAR			(NPU3_BLOCK_NPU_MISC + 0x098)
> +#define   NPU3_MISC_INT_BAR_ADDR		PPC_BITMASK(0, 39)
> +#define NPU3_MISC_BDF2PE_CFG(n)			(NPU3_BLOCK_NPU_MISC + 0x100 + (n) * 8)
> +#define   NPU3_MISC_BDF2PE_CFG_ENABLE		PPC_BIT(0)
> +#define   NPU3_MISC_BDF2PE_CFG_PE		PPC_BITMASK(4, 7)
> +#define   NPU3_MISC_BDF2PE_CFG_BDF		PPC_BITMASK(8, 23)
> +#define NPU3_MISC_PESTB(pe)			(NPU3_BLOCK_NPU_MISC + 0x200 + (pe) * 8)
> +
> +/* NPU_XTS_ATSD block registers */
> +#define NPU3_XTS_ATSD_LAUNCH(n)			(NPU3_BLOCK_NPU_XTS_ATSD(n) + 0x000)
> +
> +#endif /* __NPU3_REGS_H */
> diff --git a/include/npu3.h b/include/npu3.h
> new file mode 100644
> index 000000000000..6a4ac6a2d442
> --- /dev/null
> +++ b/include/npu3.h
> @@ -0,0 +1,180 @@
> +/* Copyright 2019 IBM Corp.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at
> + *
> + *      http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> + * implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef __NPU3_H
> +#define __NPU3_H
> +
> +#include <phys-map.h>
> +#include <pci.h>
> +#include <npu3-regs.h>
> +
> +enum npu3_dev_type {
> +	NPU3_DEV_TYPE_UNKNOWN = 0,
> +	NPU3_DEV_TYPE_NVLINK,
> +	NPU3_DEV_TYPE_ANY = INT_MAX
> +};
> +
> +/* Information about a currently running hw procedure */
> +struct npu3_procedure {
> +	uint16_t		number;
> +	uint16_t		step;
> +	uint32_t		status;
> +	unsigned long		timeout;
> +};
> +
> +/* Used to expose a hardware BAR (or logical slice of it) outside skiboot */
> +struct npu3_bar {
> +	bool			enable;
> +	uint64_t		addr;
> +	uint64_t		size;
> +	uint64_t		trap;
> +};
> +
> +struct npu3_dev_nvlink {
> +	/*
> +	 * PCI virtual device. BDFN is allocated based on GPU association.
> +	 * Links connected to the same GPU will be exposed as different
> +	 * functions of the same bus/device.
> +	 */
> +	struct pci_virt_device	*pvd;
> +
> +	/* The PCI device created from pvd */
> +	const char		*loc_code;
> +	struct pci_device	*pd;
> +
> +	/* The associated GPU device */
> +	struct pci_device	*gpu;
> +};
> +
> +struct npu3_dev {
> +	enum npu3_dev_type	type;
> +	uint32_t		index;
> +	struct dt_node		*dn;
> +	struct npu3		*npu;
> +	struct npu3_procedure	proc;
> +	uint64_t		link_speed;
> +
> +	struct npu3_bar		ntl_bar;
> +	struct npu3_bar		genid_bar;
> +
> +	/* Associated PHY information */
> +	uint32_t		ob_chiplet;
> +	uint32_t		phy_lane_mask;
> +
> +	/* For NPU3_DEV_TYPE_NVLINK */
> +	struct npu3_dev_nvlink	nvlink;
> +};
> +
> +struct npu3_nvlink {
> +	struct phb		phb;
> +	uint32_t		context_refcount[NPU3_XTS_BDF_MAP_MAX];


Can we please use the same names for the same things? I do not care much
which one we pick, as long as it is consistent.


> +};
> +
> +#define NPU3_LINKS_PER_NPU 4
> +
> +struct npu3 {
> +	uint32_t		index;
> +	struct dt_node		*dt_node;
> +	uint32_t		chip_id;
> +	uint64_t		xscom_base;
> +
> +	/* Global MMIO window (all NPU regs) */
> +	uint64_t		regs[2];
> +
> +	uint32_t		irq_base;
> +	struct lock		lock;
> +	bool			tx_zcal_complete;
> +
> +	struct npu3_dev		devices[NPU3_LINKS_PER_NPU];
> +
> +	/* Shared by any NPU3_DEV_TYPE_NVLINK devices */
> +	struct npu3_nvlink	nvlink;
> +};
> +
> +/* Convert a PHB pointer back to its containing npu3; NVLink PHBs only */
> +static inline struct npu3 *npu3_phb_to_npu(struct phb *phb)
> +{
> +	assert(phb->phb_type == phb_type_npu_v3);
> +	return container_of(phb, struct npu3, nvlink.phb);
> +}
> +
> +/*
> + * Chip-scope index of the link: with NPU3_LINKS_PER_NPU == 4, links
> + * are numbered 0..(4 * number-of-NPUs - 1) across the whole chip.
> + */
> +static inline uint32_t npu3_chip_dev_index(struct npu3_dev *dev)
> +{
> +	return dev->npu->index * NPU3_LINKS_PER_NPU + dev->index;
> +}
> +
> +struct npu3_dev *npu3_next_dev(struct npu3 *npu, struct npu3_dev *dev,
> +			       enum npu3_dev_type type);
> +
> +#define npu3_for_each_dev_type(dev, npu, type) \
> +	for (dev = NULL; (dev = npu3_next_dev(npu, dev, type));)
> +
> +#define npu3_for_each_nvlink_dev(dev, npu) \
> +	npu3_for_each_dev_type(dev, npu, NPU3_DEV_TYPE_NVLINK)
> +
> +#define npu3_for_each_dev(dev, npu) \
> +	npu3_for_each_dev_type(dev, npu, NPU3_DEV_TYPE_ANY)
> +
> +struct npu3 *npu3_next_nvlink_npu(struct npu3 *npu, uint32_t chip_id);
> +
> +/* Iterate over the NVLink-capable NPUs of one chip (or of all chips) */
> +#define npu3_for_each_chip_nvlink_npu(npu, chip_id) \
> +	npu3_for_each_chip_nvlink_npu_body(npu, chip_id)
> +#undef npu3_for_each_chip_nvlink_npu
> +#define npu3_for_each_chip_nvlink_npu(npu, chip_id) \
> +	for (npu = NULL; (npu = npu3_next_nvlink_npu(npu, chip_id));)
> +
> +#define NPU3_ANY_CHIP INT_MAX
> +#define npu3_for_each_nvlink_npu(npu) \
> +	npu3_for_each_chip_nvlink_npu(npu, NPU3_ANY_CHIP)
> +
> +void npu3_init_nvlink(struct npu3 *npu);
> +void npu3_dev_enable_bars(struct npu3_dev *dev, bool enable);
> +int64_t npu3_dev_reset(struct npu3_dev *dev);
> +
> +uint32_t npu3_chip_possible_gpus(void);
> +uint32_t npu3_dev_gpu_index(struct npu3_dev *dev);
> +
> +/* NPU RING register access */
> +void npu3_write(struct npu3 *npu, uint64_t reg, uint64_t val);
> +uint64_t npu3_read(struct npu3 *npu, uint64_t reg);
> +void npu3_write_4b(struct npu3 *npu, uint64_t reg, uint32_t val);
> +uint32_t npu3_read_4b(struct npu3 *npu, uint64_t reg);
> +
> +/* Link flags */
> +#define NPU3_DEV_PCI_LINKED	0x1
> +#define NPU3_DEV_DL_RESET	0x2
> +
> +void npu3_pvd_flag_set(struct npu3_dev *dev, uint8_t flag);
> +void npu3_pvd_flag_clear(struct npu3_dev *dev, uint8_t flag);
> +
> +/* PHY procedures */
> +#define NPU3_PROC_STATUS_MASK	0xc000000f
> +/*
> + * Shift an unsigned 1: (1 << 31) left-shifts into the sign bit of a
> + * signed int, which is undefined behavior in C. Values are unchanged.
> + */
> +#define NPU3_PROC_INPROGRESS	(1u << 31)
> +#define NPU3_PROC_COMPLETE	(1u << 30)
> +#define NPU3_PROC_NEXT		(1u << 29)
> +#define NPU3_PROC_FAILED	2
> +#define NPU3_PROC_ABORTED	3
> +#define NPU3_PROC_UNSUPPORTED	4
> +
> +void npu3_dev_procedure_init(struct npu3_dev *dev, uint32_t pnum);
> +uint32_t npu3_dev_procedure_status(struct npu3_dev *dev);
> +
> +/* OPAL entry points */
> +int64_t npu3_init_context(struct phb *phb, uint64_t msr, uint64_t bdf);
> +int64_t npu3_destroy_context(struct phb *phb, uint64_t bdf);
> +int64_t npu3_map_lpar(struct phb *phb, uint64_t bdf, uint64_t lparid,
> +		      uint64_t lpcr);
> +int64_t npu3_set_relaxed_order(struct phb *phb, uint32_t gcid, int pec,
> +			       bool enable);
> +
> +#endif /* __NPU3_H */
> diff --git a/include/pci.h b/include/pci.h
> index 2b7a3c2893d5..ff29010cbb71 100644
> --- a/include/pci.h
> +++ b/include/pci.h
> @@ -366,6 +366,7 @@ enum phb_type {
>  	phb_type_pcie_v4,
>  	phb_type_npu_v2,
>  	phb_type_npu_v2_opencapi,
> +	phb_type_npu_v3,
>  };
>  
>  struct phb {
> diff --git a/include/platform.h b/include/platform.h
> index 4f8627a3a680..0b728a50075e 100644
> --- a/include/platform.h
> +++ b/include/platform.h
> @@ -23,6 +23,7 @@ struct pci_device;
>  struct pci_slot;
>  struct errorlog;
>  struct npu2;
> +struct npu3;
>  
>  enum resource_id {
>  	RESOURCE_ID_KERNEL,
> @@ -94,8 +95,9 @@ struct platform {
>  	/* OpenCAPI platform-specific I2C information */
>  	const struct platform_ocapi *ocapi;
>  
> -	/* NPU2 device detection */
> +	/* NPU device detection */
>  	void		(*npu2_device_detect)(struct npu2 *npu);
> +	void		(*npu3_device_detect)(struct npu3 *npu);
>  
>  	/*
>  	 * Probe platform, return true on a match, called before
> diff --git a/include/skiboot.h b/include/skiboot.h
> index 1b3bacbe73f6..2eafb1118dea 100644
> --- a/include/skiboot.h
> +++ b/include/skiboot.h
> @@ -208,6 +208,7 @@ extern int preload_capp_ucode(void);
>  extern void preload_io_vpd(void);
>  extern void probe_npu(void);
>  extern void probe_npu2(void);
> +extern void probe_npu3(void);
>  extern void uart_init(void);
>  extern void mbox_init(void);
>  extern void early_uart_init(void);
> diff --git a/include/xscom-p9-regs.h b/include/xscom-p9-regs.h
> index 5137d91838d6..856a92d9ab4f 100644
> --- a/include/xscom-p9-regs.h
> +++ b/include/xscom-p9-regs.h
> @@ -82,4 +82,23 @@
>  #define EC_PPM_SPECIAL_WKUP_OCC		0x010C
>  #define EC_PPM_SPECIAL_WKUP_HYP		0x010D
>  
> +#define OB_BASE(ob)				(((ob) + 9) << 24)
> +#define OB_CPLT_CONF1(ob)			(OB_BASE(ob) + 0x9)
> +#define   OB_CPLT_CONF1_NV_IOVALID(brk)		PPC_BIT(6 + (brk))
> +#define OB_INDIRECT(ob)				((OB_BASE(ob) + 0x10c3f) | PPC_BIT(0))
> +
> +/* PPE SRAM: Indirect address/data port */
> +#define OB_PPE_CSAR(ob)				(OB_BASE(ob) + 0x1104d)
> +#define   OB_PPE_CSAR_SRAM_ADDR			PPC_BITMASK(16, 28)
> +#define OB_PPE_CSDR(ob)				(OB_BASE(ob) + 0x1104e)
> +
> +/* PPE SRAM: Indirect registers */
> +#define OB_PPE_SALT_CMD				0x1fe6
> +#define   OB_PPE_SALT_CMD_READY			PPC_BIT(0)
> +#define   OB_PPE_SALT_CMD_RW			PPC_BIT(1)
> +#define   OB_PPE_SALT_CMD_ERR			PPC_BIT(2)
> +#define   OB_PPE_SALT_CMD_LINKNUM		PPC_BITMASK(15, 18)
> +#define   OB_PPE_SALT_CMD_REG			PPC_BITMASK(19, 31)
> +#define   OB_PPE_SALT_CMD_DATA			PPC_BITMASK(32, 63)
> +
>  #endif /* __XSCOM_P9_REGS_H__ */
> 

-- 
Alexey


More information about the Skiboot mailing list