[Skiboot] [PATCH 6/7] hw: Introduce npu3

Reza Arbab arbab at linux.ibm.com
Tue Jul 2 07:46:47 AEST 2019


Alexey,

Thanks for looking this over! Really appreciate it. Comments inline.

On Wed, Jun 26, 2019 at 04:28:58PM +1000, Alexey Kardashevskiy wrote:
>On 13/06/2019 07:08, Reza Arbab wrote:
>> POWER9P systems have been upgraded with NVLink 3.0 interconnects. The
>> underlying hardware is fundamentally different--each POWER9 chip has
>>
>>         (1 NPU) * (3 stacks) * (2 bricks) = (6 links)
>>
>> Whereas in each POWER9P chip, there are
>>
>>         (3 NPUs) * (4 bricks) = (12 links)
>>
>> This flatter hierarchy simplifies the firmware implementation a bit, but
>> also prevents sharing much common code with npu2.
>
>
>Is anything really shared? It looks like all data structures and npu2*.*
>files were copied and massively refactored so now it is impossible to
>tell what the actual difference between NPU2 and NPU3 is (besides
>different grouping). 95+% of data structures and registers seem to be
>just the same. Not sure sharing the code is worth it but it definitely
>makes it harder to follow what is done here...

So really, the extent of shared code is npu-opal.c and cache-p9.c.

I do understand what you mean. Starting out, I tried several 
approaches and found that sharing the code directly wasn't worth it, as 
you said. Even after factoring out some common codepaths, you end up 
with "if (npu3) { this } else { that }" spaghetti *somewhere*. Using 
parallel files is much cleaner.
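
For example, even a trivial helper like the NTL fence setter would need 
a type check in a shared version. A completely made-up sketch (struct 
npu_dev, is_npu3, and npu2_dev_fence_set() don't exist; they're only 
here to show the shape):

/* Hypothetical shared helper -- not in this patch */
static void npu_dev_fence_set(struct npu_dev *dev, uint8_t state)
{
	uint64_t val;

	if (dev->is_npu3) {
		val = npu3_read(dev->npu3, NPU3_NTL_MISC_CFG1(dev->index));
		val = SETFIELD(NPU3_NTL_MISC_CFG1_NTL_RESET, val, state);
		npu3_write(dev->npu3, NPU3_NTL_MISC_CFG1(dev->index), val);
	} else {
		/* ...the npu2 flavour of the same register poke */
		npu2_dev_fence_set(dev->npu2, state);
	}
}

Multiply that by every step of every procedure and it gets ugly fast.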

And then, refactoring aside, I think even the hw interface change alone 
keeps things from being diff-friendly. It's unfortunate, but to see how 
an npu3 routine compares to npu2 you probably do have to just put them 
side-by-side and eyeball it.

>> As in previous versions, initialize the hardware and expose each link to
>> the OS as a virtual PCIe device. This initial support covers NVLink
>> devices only, with OpenCAPI to follow.
>>
>> Signed-off-by: Reza Arbab <arbab at linux.ibm.com>
>> ---
>>  core/init.c             |    1 +
>>  hw/Makefile.inc         |    3 +-
>>  hw/npu-opal.c           |   38 +-
>>  hw/npu3-hw-procedures.c |  801 +++++++++++++++++++++
>>  hw/npu3-nvlink.c        | 1841 +++++++++++++++++++++++++++++++++++++++++++++++
>>  hw/npu3.c               |  554 ++++++++++++++
>>  include/npu3-regs.h     |  247 +++++++
>>  include/npu3.h          |  180 +++++
>>  include/pci.h           |    1 +
>>  include/platform.h      |    4 +-
>>  include/skiboot.h       |    1 +
>>  include/xscom-p9-regs.h |   19 +
>>  12 files changed, 3680 insertions(+), 10 deletions(-)
>>  create mode 100644 hw/npu3-hw-procedures.c
>>  create mode 100644 hw/npu3-nvlink.c
>>  create mode 100644 hw/npu3.c
>>  create mode 100644 include/npu3-regs.h
>>  create mode 100644 include/npu3.h
>>
>> diff --git a/core/init.c b/core/init.c
>> index 7e8ba7854dcc..89cef87a44f6 100644
>> --- a/core/init.c
>> +++ b/core/init.c
>> @@ -1247,6 +1247,7 @@ void __noreturn __nomcount main_cpu_entry(const void *fdt)
>>  	/* Probe NPUs */
>>  	probe_npu();
>>  	probe_npu2();
>> +	probe_npu3();
>>
>>  	/* Initialize PCI */
>>  	pci_init_slots();
>> diff --git a/hw/Makefile.inc b/hw/Makefile.inc
>> index 2f4f4dabef59..d346c594917c 100644
>> --- a/hw/Makefile.inc
>> +++ b/hw/Makefile.inc
>> @@ -8,7 +8,8 @@ HW_OBJS += dts.o lpc-rtc.o npu.o npu-hw-procedures.o xive.o phb4.o
>>  HW_OBJS += fake-nvram.o lpc-mbox.o npu2.o npu2-hw-procedures.o
>>  HW_OBJS += npu2-common.o phys-map.o sbe-p9.o capp.o occ-sensor.o vas.o
>>  HW_OBJS += npu2-opencapi.o phys-map.o sbe-p9.o capp.o occ-sensor.o
>> -HW_OBJS += vas.o sbe-p8.o dio-p9.o cache-p9.o npu-opal.o
>> +HW_OBJS += npu-opal.o npu3.o npu3-nvlink.o npu3-hw-procedures.o
>> +HW_OBJS += vas.o sbe-p8.o dio-p9.o cache-p9.o
>>  HW_OBJS += lpc-port80h.o
>>  HW=hw/built-in.a
>>
>> diff --git a/hw/npu-opal.c b/hw/npu-opal.c
>> index 4195ffa2fc60..b4aebc15c65b 100644
>> --- a/hw/npu-opal.c
>> +++ b/hw/npu-opal.c
>> @@ -18,16 +18,23 @@
>>  #include <pci.h>
>>  #include <phb4.h>
>>  #include <npu2.h>
>> +#include <npu3.h>
>>
>>  static int64_t opal_npu_init_context(uint64_t phb_id, int pid __unused,
>>  				     uint64_t msr, uint64_t bdf)
>>  {
>>  	struct phb *phb = pci_get_phb(phb_id);
>>
>> -	if (!phb || phb->phb_type != phb_type_npu_v2)
>> +	if (!phb)
>>  		return OPAL_PARAMETER;
>>
>> -	return npu2_init_context(phb, msr, bdf);
>> +	if (phb->phb_type == phb_type_npu_v2)
>> +		return npu2_init_context(phb, msr, bdf);
>> +
>> +	if (phb->phb_type == phb_type_npu_v3)
>> +		return npu3_init_context(phb, msr, bdf);
>> +
>> +	return OPAL_PARAMETER;
>>  }
>>  opal_call(OPAL_NPU_INIT_CONTEXT, opal_npu_init_context, 4);
>>
>> @@ -36,10 +43,16 @@ static int64_t opal_npu_destroy_context(uint64_t phb_id, uint64_t pid __unused,
>>  {
>>  	struct phb *phb = pci_get_phb(phb_id);
>>
>> -	if (!phb || phb->phb_type != phb_type_npu_v2)
>> +	if (!phb)
>>  		return OPAL_PARAMETER;
>>
>> -	return npu2_destroy_context(phb, bdf);
>> +	if (phb->phb_type == phb_type_npu_v2)
>> +		return npu2_destroy_context(phb, bdf);
>> +
>> +	if (phb->phb_type == phb_type_npu_v3)
>> +		return npu3_destroy_context(phb, bdf);
>> +
>> +	return OPAL_PARAMETER;
>>  }
>>  opal_call(OPAL_NPU_DESTROY_CONTEXT, opal_npu_destroy_context, 3);
>>
>> @@ -48,10 +61,16 @@ static int64_t opal_npu_map_lpar(uint64_t phb_id, uint64_t bdf, uint64_t lparid,
>>  {
>>  	struct phb *phb = pci_get_phb(phb_id);
>>
>> -	if (!phb || phb->phb_type != phb_type_npu_v2)
>> +	if (!phb)
>>  		return OPAL_PARAMETER;
>>
>> -	return npu2_map_lpar(phb, bdf, lparid, lpcr);
>> +	if (phb->phb_type == phb_type_npu_v2)
>> +		return npu2_map_lpar(phb, bdf, lparid, lpcr);
>> +
>> +	if (phb->phb_type == phb_type_npu_v3)
>> +		return npu3_map_lpar(phb, bdf, lparid, lpcr);
>> +
>> +	return OPAL_PARAMETER;
>>  }
>>  opal_call(OPAL_NPU_MAP_LPAR, opal_npu_map_lpar, 4);
>>
>> @@ -81,10 +100,13 @@ static int64_t npu_set_relaxed_order(uint32_t gcid, int pec, bool enable)
>>  	uint64_t rc;
>>
>>  	for_each_phb(phb) {
>> -		if (phb->phb_type != phb_type_npu_v2)
>> +		if (phb->phb_type == phb_type_npu_v2)
>> +			rc = npu2_set_relaxed_order(phb, gcid, pec, enable);
>> +		else if (phb->phb_type == phb_type_npu_v3)
>> +			rc = npu3_set_relaxed_order(phb, gcid, pec, enable);
>> +		else
>>  			continue;
>>
>> -		rc = npu2_set_relaxed_order(phb, gcid, pec, enable);
>>  		if (rc)
>>  			return rc;
>>  	}
>> diff --git a/hw/npu3-hw-procedures.c b/hw/npu3-hw-procedures.c
>> new file mode 100644
>> index 000000000000..42b658d1aab2
>> --- /dev/null
>> +++ b/hw/npu3-hw-procedures.c
>> @@ -0,0 +1,801 @@
>> +/* Copyright 2019 IBM Corp.
>> + *
>> + * Licensed under the Apache License, Version 2.0 (the "License");
>> + * you may not use this file except in compliance with the License.
>> + * You may obtain a copy of the License at
>> + *
>> + *	http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
>> + * implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +
>> +#include <skiboot.h>
>> +#include <npu3.h>
>> +#include <npu3-regs.h>
>> +#include <timebase.h>
>> +#include <xscom.h>
>> +#include <xscom-p9-regs.h>
>> +
>> +#define NPU3DEVLOG(l, dev, fmt, a...)		\
>> +	prlog(l, "NPU[%d:%d:%d]: " fmt,		\
>> +	      (dev)->npu->chip_id,		\
>> +	      (dev)->npu->index,		\
>> +	      (dev)->index, ##a)
>> +#define NPU3DEVDBG(dev, fmt, a...) NPU3DEVLOG(PR_DEBUG, dev, fmt, ##a)
>> +#define NPU3DEVINF(dev, fmt, a...) NPU3DEVLOG(PR_INFO, dev, fmt, ##a)
>> +#define NPU3DEVERR(dev, fmt, a...) NPU3DEVLOG(PR_ERR, dev, fmt, ##a)
>> +
>> +/*
>> + * The documentation for the PHY training is written in terms of bits within an
>> + * actual register so we use that representation here.
>> + */
>> +struct npu3_phy_reg {
>> +	uint64_t offset;
>> +	uint64_t mask;
>> +};
>> +
>> +static struct npu3_phy_reg
>> +NPU3_PHY_RX_RUN_LANE			= { 0x0c8, PPC_BIT(48) },
>> +NPU3_PHY_RX_IORESET			= { 0x096, PPC_BIT(63) },
>> +NPU3_PHY_TX_IORESET			= { 0x113, PPC_BIT(48) },
>> +NPU3_PHY_RX_PR_RESET			= { 0x096, PPC_BIT(62) },
>> +NPU3_PHY_RX_LANE_ANA_PDWN		= { 0x002, PPC_BIT(54) },
>> +NPU3_PHY_RX_LANE_DIG_PDWN		= { 0x088, PPC_BIT(48) },
>> +NPU3_PHY_RX_PR_IQ_RES_SEL		= { 0x004, PPC_BITMASK(59, 61) },
>> +NPU3_PHY_RX_PR_PHASE_STEP		= { 0x08a, PPC_BITMASK(60, 63) },
>> +NPU3_PHY_TX_LANE_PDWN			= { 0x101, PPC_BIT(48) },
>> +NPU3_PHY_RX_RUN_DCCAL			= { 0x0c8, PPC_BIT(49) },
>> +NPU3_PHY_RX_DCCAL_DONE			= { 0x0ca, PPC_BIT(49) },
>> +NPU3_PHY_RX_LANE_BUSY			= { 0x0ca, PPC_BIT(50) },
>> +NPU3_PHY_RX_B_BANK_CONTROLS		= { 0x002, PPC_BITMASK(58, 63) },
>> +NPU3_PHY_TX_UNLOAD_CLK_DISABLE		= { 0x103, PPC_BIT(56) },
>> +NPU3_PHY_TX_FIFO_INIT			= { 0x105, PPC_BIT(53) },
>> +NPU3_PHY_TX_RXCAL			= { 0x103, PPC_BIT(57) },
>> +NPU3_PHY_RX_INIT_DONE			= { 0x0ca, PPC_BIT(48) },
>> +NPU3_PHY_RX_PR_EDGE_TRACK_CNTL		= { 0x092, PPC_BITMASK(48, 49) },
>> +NPU3_PHY_RX_PR_FW_OFF			= { 0x08a, PPC_BIT(56) },
>> +NPU3_PHY_RX_PR_FW_INERTIA_AMT		= { 0x08a, PPC_BITMASK(57, 59) },
>> +NPU3_PHY_RX_CFG_LTE_MC			= { 0x000, PPC_BITMASK(60, 63) },
>> +NPU3_PHY_RX_A_INTEG_COARSE_GAIN		= { 0x00a, PPC_BITMASK(48, 51) },
>> +NPU3_PHY_RX_B_INTEG_COARSE_GAIN		= { 0x026, PPC_BITMASK(48, 51) },
>> +NPU3_PHY_RX_E_INTEG_COARSE_GAIN		= { 0x030, PPC_BITMASK(48, 51) },
>> +
>> +/* These registers are per-PHY, not per lane */
>> +NPU3_PHY_TX_ZCAL_SWO_EN			= { 0x3c9, PPC_BIT(48) },
>> +NPU3_PHY_TX_ZCAL_REQ			= { 0x3c1, PPC_BIT(49) },
>> +NPU3_PHY_TX_ZCAL_DONE			= { 0x3c1, PPC_BIT(50) },
>> +NPU3_PHY_TX_ZCAL_ERROR			= { 0x3c1, PPC_BIT(51) },
>> +NPU3_PHY_TX_ZCAL_N			= { 0x3c3, PPC_BITMASK(48, 56) },
>> +NPU3_PHY_TX_ZCAL_P			= { 0x3c5, PPC_BITMASK(48, 56) },
>> +NPU3_PHY_TX_PSEG_PRE_EN			= { 0x34d, PPC_BITMASK(51, 55) },
>> +NPU3_PHY_TX_PSEG_PRE_SELECT		= { 0x34d, PPC_BITMASK(56, 60) },
>> +NPU3_PHY_TX_NSEG_PRE_EN			= { 0x34f, PPC_BITMASK(51, 55) },
>> +NPU3_PHY_TX_NSEG_PRE_SELECT		= { 0x34f, PPC_BITMASK(56, 60) },
>> +NPU3_PHY_TX_PSEG_POST_EN		= { 0x361, PPC_BITMASK(49, 55) },
>> +NPU3_PHY_TX_PSEG_POST_SELECT		= { 0x361, PPC_BITMASK(56, 62) },
>> +NPU3_PHY_TX_NSEG_POST_EN		= { 0x363, PPC_BITMASK(49, 55) },
>> +NPU3_PHY_TX_NSEG_POST_SELECT		= { 0x363, PPC_BITMASK(56, 62) },
>> +NPU3_PHY_TX_PSEG_MARGINPU_EN		= { 0x351, PPC_BITMASK(48, 55) },
>> +NPU3_PHY_TX_NSEG_MARGINPU_EN		= { 0x353, PPC_BITMASK(48, 55) },
>> +NPU3_PHY_TX_PSEG_MARGINPD_EN		= { 0x351, PPC_BITMASK(56, 63) },
>> +NPU3_PHY_TX_NSEG_MARGINPD_EN		= { 0x353, PPC_BITMASK(56, 63) },
>> +NPU3_PHY_TX_MARGINPU_SELECT		= { 0x355, PPC_BITMASK(48, 55) },
>> +NPU3_PHY_TX_MARGINPD_SELECT		= { 0x355, PPC_BITMASK(56, 63) },
>> +NPU3_PHY_TX_PSEG_MAIN_EN		= { 0x357, PPC_BITMASK(51, 57) },
>> +NPU3_PHY_TX_NSEG_MAIN_EN		= { 0x359, PPC_BITMASK(51, 57) },
>> +NPU3_PHY_RX_CLKDIST_PDWN		= { 0x204, PPC_BITMASK(48, 50) },
>> +NPU3_PHY_RX_IREF_PDWN			= { 0x230, PPC_BIT(54) },
>> +NPU3_PHY_TX_CLKDIST_PDWN		= { 0x305, PPC_BITMASK(48, 50) },
>> +NPU3_PHY_RX_CTL_DATASM_CLKDIST_PDWN	= { 0x2e0, PPC_BIT(60) };
>> +
>> +static uint64_t npu3_phy_scom(struct npu3_dev *dev, struct npu3_phy_reg *reg,
>> +			      int lane)
>> +{
>> +	uint64_t scom;
>> +
>> +	/* Don't specify a lane for a non-per-lane register */
>> +	if (lane >= 0)
>> +		assert(reg->offset < 0x200);
>> +	else
>> +		assert(reg->offset >= 0x200);
>> +
>> +	scom = OB_INDIRECT(dev->ob_chiplet);
>> +	scom = SETFIELD(PPC_BITMASK(12, 21), scom, reg->offset);
>> +
>> +	if (lane > 0)
>> +		scom = SETFIELD(PPC_BITMASK(27, 31), scom, lane);
>> +
>> +	return scom;
>> +}
>> +
>> +static void npu3_phy_write_lane(struct npu3_dev *dev, struct npu3_phy_reg *reg,
>> +				int lane, uint64_t val)
>> +{
>> +	struct npu3 *npu = dev->npu;
>> +	uint64_t scom, scom_val;
>> +
>> +	scom = npu3_phy_scom(dev, reg, lane);
>> +
>> +	xscom_read(npu->chip_id, scom, &scom_val);
>> +	scom_val = SETFIELD(reg->mask, scom_val, val);
>> +	xscom_write(npu->chip_id, scom, scom_val);
>> +}
>> +
>> +static uint64_t npu3_phy_read_lane(struct npu3_dev *dev,
>> +				   struct npu3_phy_reg *reg,
>> +				   int lane)
>> +{
>> +	struct npu3 *npu = dev->npu;
>> +	uint64_t scom, scom_val;
>> +
>> +	scom = npu3_phy_scom(dev, reg, lane);
>> +	xscom_read(npu->chip_id, scom, &scom_val);
>> +
>> +	return GETFIELD(reg->mask, scom_val);
>> +}
>> +
>> +static inline void npu3_phy_write(struct npu3_dev *dev,
>> +				  struct npu3_phy_reg *reg,
>> +				  uint64_t val)
>> +{
>> +	npu3_phy_write_lane(dev, reg, -1, val);
>> +}
>> +
>> +static inline uint64_t npu3_phy_read(struct npu3_dev *dev,
>> +				     struct npu3_phy_reg *reg)
>> +{
>> +	return npu3_phy_read_lane(dev, reg, -1);
>> +}
>> +
>> +struct procedure {
>> +	const char *name;
>> +	uint32_t (*steps[])(struct npu3_dev *);
>> +};
>> +
>> +#define DEFINE_PROCEDURE(NAME, STEPS...)	\
>> +static struct procedure procedure_##NAME = {	\
>> +	.name = #NAME,				\
>> +	.steps = { NAME, ##STEPS }		\
>> +}
>> +
>> +static uint32_t stop(struct npu3_dev *npu_dev __unused)
>> +{
>> +	return NPU3_PROC_COMPLETE | NPU3_PROC_ABORTED;
>> +}
>> +
>> +DEFINE_PROCEDURE(stop);
>> +
>> +static uint32_t nop(struct npu3_dev *npu_dev __unused)
>> +{
>> +	return NPU3_PROC_COMPLETE;
>> +}
>> +
>> +DEFINE_PROCEDURE(nop);
>> +
>> +static void set_iovalid(struct npu3_dev *dev, bool raise)
>> +{
>> +	struct npu3 *npu = dev->npu;
>> +	uint64_t reg, val;
>> +
>> +	reg = OB_CPLT_CONF1(dev->ob_chiplet);
>> +
>> +	xscom_read(npu->chip_id, reg, &val);
>> +	val = SETFIELD(OB_CPLT_CONF1_NV_IOVALID(dev->index), val, raise);
>> +	xscom_write(npu->chip_id, reg, val);
>> +}
>> +
>> +#define NPU3_PHY_LANES 24
>> +
>> +#define npu3_for_each_lane(lane, dev)				\
>> +	for (lane = 0; lane < NPU3_PHY_LANES; lane++)		\
>> +		if (dev->phy_lane_mask & PPC_BIT32(lane))	\
>> +
>> +static uint32_t phy_reset(struct npu3_dev *dev)
>> +{
>> +	uint32_t lane;
>> +
>> +	set_iovalid(dev, false);
>> +
>> +	npu3_for_each_lane(lane, dev)
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_RUN_LANE, lane, 0);
>> +
>> +	return NPU3_PROC_NEXT;
>> +}
>> +
>> +static uint32_t phy_reset_wait(struct npu3_dev *dev)
>> +{
>> +	int lane;
>> +
>> +	/* Wait for all lanes to become inactive */
>> +	npu3_for_each_lane(lane, dev)
>> +		if (npu3_phy_read_lane(dev, &NPU3_PHY_RX_LANE_BUSY, lane))
>> +			return NPU3_PROC_INPROGRESS;
>> +
>> +	npu3_for_each_lane(lane, dev) {
>> +		/* Set lane in reset */
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_IORESET, lane, 1);
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_TX_IORESET, lane, 1);
>> +
>> +		/* Release lane from reset */
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_IORESET, lane, 0);
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_TX_IORESET, lane, 0);
>> +
>> +		/* Reset the phase rotator */
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_RESET, lane, 1);
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_RESET, lane, 0);
>> +	}
>> +
>> +	return NPU3_PROC_NEXT;
>> +}
>> +
>> +/* Procedure 1.2.3 - Initialise I/O PHY Registers */
>> +static uint32_t phy_reset_complete(struct npu3_dev *dev)
>> +{
>> +	int lane;
>> +
>> +	npu3_for_each_lane(lane, dev) {
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_LANE_ANA_PDWN, lane, 0);
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_LANE_DIG_PDWN, lane, 0);
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_IQ_RES_SEL, lane, 7);
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_PHASE_STEP, lane, 0xc);
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_TX_LANE_PDWN, lane, 0);
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_FW_INERTIA_AMT, lane, 4);
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_CFG_LTE_MC, lane, 3);
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_A_INTEG_COARSE_GAIN, lane, 11);
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_B_INTEG_COARSE_GAIN, lane, 11);
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_E_INTEG_COARSE_GAIN, lane, 11);
>> +	}
>> +
>> +	set_iovalid(dev, true);
>> +
>> +	return NPU3_PROC_COMPLETE;
>> +}
>> +
>> +DEFINE_PROCEDURE(phy_reset, phy_reset_wait, phy_reset_complete);
>> +
>> +/* Procedure 1.2.6 - I/O PHY Tx Impedance Calibration */
>> +static uint32_t phy_tx_zcal(struct npu3_dev *dev)
>> +{
>> +	if (dev->npu->tx_zcal_complete)
>> +		return NPU3_PROC_COMPLETE;
>> +
>> +	/* Turn off SW enable and enable zcal state machine */
>> +	npu3_phy_write(dev, &NPU3_PHY_TX_ZCAL_SWO_EN, 0);
>> +
>> +	/* Start impedance calibration state machine */
>> +	npu3_phy_write(dev, &NPU3_PHY_TX_ZCAL_REQ, 1);
>> +
>> +	return NPU3_PROC_NEXT;
>> +}
>> +
>> +static uint32_t phy_tx_zcal_wait(struct npu3_dev *dev)
>> +{
>> +	if (npu3_phy_read(dev, &NPU3_PHY_TX_ZCAL_ERROR))
>> +		return NPU3_PROC_COMPLETE | NPU3_PROC_FAILED;
>> +
>> +	if (!npu3_phy_read(dev, &NPU3_PHY_TX_ZCAL_DONE))
>> +		return NPU3_PROC_INPROGRESS;
>> +
>> +	return NPU3_PROC_NEXT;
>> +}
>> +
>> +#define MARGIN_RATIO		0
>> +#define FFE_PRE_COEFF		0
>> +#define FFE_POST_COEFF		0
>> +
>> +#define PRE_WIDTH		5
>> +#define POST_WIDTH		7
>> +#define MAIN_WIDTH		7
>> +#define ZCAL_MIN		(16 * 2)
>> +#define ZCAL_MAX		(33 * 2)
>> +#define PRECURSOR_X2_MAX	(4 * 2 + 1)
>> +#define POSTCURSOR_X2_MAX	(6 * 2 + 1)
>> +#define MARGIN_X2_MAX		(8 * 2)
>> +#define MAIN_X2_MAX		(6 * 2 + 1)
>> +#define TOTAL_X2_MAX		(PRECURSOR_X2_MAX + POSTCURSOR_X2_MAX + \
>> +				 2 * MARGIN_X2_MAX + MAIN_X2_MAX)
>> +
>> +static uint32_t therm(uint32_t dec)
>> +{
>> +	return (0x1 << dec) - 1;
>> +}
>> +
>> +static uint32_t therm_with_half(uint32_t dec, uint8_t width)
>> +{
>> +	/* If the LSB of the 2r equivalent is on, then we need to set the 2r bit (MSB) */
>> +	uint32_t half_on = (dec & 0x1) << (width - 1);
>> +
>> +	/* Shift the 2r equivalent to a 1r value and convert to a thermometer code. */
>> +	uint32_t x1_equiv = ((1 << (dec >> 1)) - 1);
>> +
>> +	/* Combine 1r equivalent thermometer code + the 2r MSB value. */
>> +	return half_on | x1_equiv;
>> +}
>> +
>> +static uint32_t phy_tx_zcal_calculate(struct npu3_dev *dev)
>> +{
>> +	int p_value, n_value;
>> +	uint32_t zcal_n;
>> +	uint32_t zcal_p;
>> +	uint32_t p_main_enable = MAIN_X2_MAX;
>> +	uint32_t p_margin_pu_enable = MARGIN_X2_MAX;
>> +	uint32_t p_margin_pd_enable = MARGIN_X2_MAX;
>> +	uint32_t p_precursor_select;
>> +	uint32_t p_postcursor_select;
>> +	uint32_t margin_pu_select;
>> +	uint32_t n_main_enable = MAIN_X2_MAX;
>> +	uint32_t n_margin_pu_enable = MARGIN_X2_MAX;
>> +	uint32_t n_margin_pd_enable = MARGIN_X2_MAX;
>> +	uint32_t n_precursor_select;
>> +	uint32_t n_postcursor_select;
>> +	uint32_t margin_pd_select;
>> +	uint32_t margin_select;
>> +
>> +	/* Convert the value from 8R to 2R by / 4 */
>> +	zcal_n = npu3_phy_read(dev, &NPU3_PHY_TX_ZCAL_N) / 4;
>> +	zcal_p = npu3_phy_read(dev, &NPU3_PHY_TX_ZCAL_P) / 4;
>> +
>> +	/* Again, if the hardware detects an unexpected condition it's
>> +	 * better just to fail loudly. */
>> +	if (zcal_n < ZCAL_MIN || zcal_n > ZCAL_MAX ||
>> +	    zcal_p < ZCAL_MIN || zcal_p > ZCAL_MAX)
>> +		return NPU3_PROC_COMPLETE | NPU3_PROC_FAILED;
>> +
>> +	p_value = zcal_p - TOTAL_X2_MAX;
>> +	p_precursor_select = p_value * FFE_PRE_COEFF / 128;
>> +	p_postcursor_select = p_value * FFE_POST_COEFF / 128;
>> +	margin_pu_select = p_value * MARGIN_RATIO / 256;
>> +
>> +	if (p_value % 2) {
>> +		p_main_enable--;
>> +		p_value++;
>> +	}
>> +
>> +	while (p_value < 0) {
>> +		if (p_main_enable > 1) {
>> +			p_main_enable -= 2;
>> +		} else if (p_margin_pu_enable + p_margin_pd_enable > 0) {
>> +			if (p_margin_pu_enable == p_margin_pd_enable)
>> +				p_margin_pd_enable -= 2;
>> +			else
>> +				p_margin_pu_enable -= 2;
>> +		}
>> +		p_value += 2;
>> +	}
>> +
>> +	n_value = zcal_n - TOTAL_X2_MAX;
>> +	n_precursor_select = n_value * FFE_PRE_COEFF / 128;
>> +	n_postcursor_select = n_value * FFE_POST_COEFF / 128;
>> +	margin_pd_select = p_value * MARGIN_RATIO / 256;
>> +
>> +	if (n_value % 2) {
>> +		n_main_enable--;
>> +		n_value++;
>> +	}
>> +
>> +	while (n_value < 0) {
>> +		if (n_main_enable > 1) {
>> +			n_main_enable -= 2;
>> +		} else if (n_margin_pu_enable + n_margin_pd_enable > 0) {
>> +			if (n_margin_pu_enable == n_margin_pd_enable)
>> +				n_margin_pd_enable -= 2;
>> +			else
>> +				n_margin_pu_enable -= 2;
>> +		}
>> +		n_value += 2;
>> +	}
>> +
>> +	margin_select = therm((margin_pu_select + 1) / 2) &
>> +			therm((margin_pd_select + 1) / 2) &
>> +			therm((p_margin_pu_enable + 1) / 2) &
>> +			therm((p_margin_pd_enable + 1) / 2) &
>> +			therm((n_margin_pu_enable + 1) / 2) &
>> +			therm((n_margin_pd_enable + 1) / 2);
>> +
>> +	npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_PRE_EN,      therm_with_half(PRECURSOR_X2_MAX, PRE_WIDTH));
>> +	npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_PRE_SELECT,  therm_with_half(p_precursor_select, PRE_WIDTH));
>> +	npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_POST_EN,     therm_with_half(POSTCURSOR_X2_MAX, POST_WIDTH));
>> +	npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_POST_SELECT, therm_with_half(p_postcursor_select, POST_WIDTH));
>> +	npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_MARGINPU_EN, therm((p_margin_pu_enable + 1) / 2));
>> +	npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_MARGINPD_EN, therm((p_margin_pd_enable + 1) / 2));
>> +	npu3_phy_write(dev, &NPU3_PHY_TX_PSEG_MAIN_EN,     therm_with_half(p_main_enable, MAIN_WIDTH));
>> +
>> +	npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_PRE_EN,      therm_with_half(PRECURSOR_X2_MAX, PRE_WIDTH));
>> +	npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_PRE_SELECT,  therm_with_half(n_precursor_select, PRE_WIDTH));
>> +	npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_POST_EN,     therm_with_half(POSTCURSOR_X2_MAX, POST_WIDTH));
>> +	npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_POST_SELECT, therm_with_half(n_postcursor_select, POST_WIDTH));
>> +	npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_MARGINPU_EN, therm((n_margin_pu_enable + 1) / 2));
>> +	npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_MARGINPD_EN, therm((n_margin_pd_enable + 1) / 2));
>> +	npu3_phy_write(dev, &NPU3_PHY_TX_NSEG_MAIN_EN,     therm_with_half(n_main_enable, MAIN_WIDTH));
>> +
>> +	npu3_phy_write(dev, &NPU3_PHY_TX_MARGINPU_SELECT,  therm(margin_select + 1) / 2);
>> +	npu3_phy_write(dev, &NPU3_PHY_TX_MARGINPD_SELECT,  therm(margin_select + 1) / 2);
>> +
>> +	dev->npu->tx_zcal_complete = true;
>> +
>> +	return NPU3_PROC_COMPLETE;
>> +}
>> +
>> +DEFINE_PROCEDURE(phy_tx_zcal, phy_tx_zcal_wait, phy_tx_zcal_calculate);
>> +
>> +/* Procedure 1.2.4 - I/O PHY DC Calibration */
>> +static uint32_t phy_rx_dccal(struct npu3_dev *dev)
>> +{
>> +	int lane;
>> +
>> +	set_iovalid(dev, false);
>> +
>> +	npu3_for_each_lane(lane, dev)
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_FW_OFF, lane, 1);
>> +
>> +	npu3_for_each_lane(lane, dev)
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_RUN_DCCAL, lane, 1);
>> +
>> +	return NPU3_PROC_NEXT;
>> +}
>> +
>> +static uint32_t phy_rx_dccal_complete(struct npu3_dev *dev)
>> +{
>> +	int lane;
>> +
>> +	npu3_for_each_lane(lane, dev)
>> +		if (!npu3_phy_read_lane(dev, &NPU3_PHY_RX_DCCAL_DONE, lane))
>> +			return NPU3_PROC_INPROGRESS;
>> +
>> +	npu3_for_each_lane(lane, dev)
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_RUN_DCCAL, lane, 0);
>> +
>> +	npu3_for_each_lane(lane, dev) {
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_B_BANK_CONTROLS, lane, 0);
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_EDGE_TRACK_CNTL, lane, 0);
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_PR_FW_OFF, lane, 0);
>> +	}
>> +
>> +	return NPU3_PROC_NEXT;
>> +}
>> +
>> +/* Procedure 1.2.5 - IO PHY Tx FIFO Init */
>> +static uint32_t phy_tx_fifo_init(struct npu3_dev *dev)
>> +{
>> +	int lane;
>> +
>> +	npu3_for_each_lane(lane, dev) {
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_TX_UNLOAD_CLK_DISABLE, lane, 0);
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_TX_FIFO_INIT, lane, 1);
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_TX_UNLOAD_CLK_DISABLE, lane, 1);
>> +	}
>> +
>> +	set_iovalid(dev, true);
>> +
>> +	return NPU3_PROC_COMPLETE;
>> +}
>> +
>> +DEFINE_PROCEDURE(phy_rx_dccal, phy_rx_dccal_complete, phy_tx_fifo_init);
>> +
>> +/* Procedure 1.2.8 - Enable Downstream Link Training */
>> +static uint32_t phy_enable_tx_rxcal(struct npu3_dev *dev)
>> +{
>> +	int lane;
>> +
>> +	npu3_for_each_lane(lane, dev)
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_TX_RXCAL, lane, 1);
>> +
>> +	return NPU3_PROC_COMPLETE;
>> +}
>> +DEFINE_PROCEDURE(phy_enable_tx_rxcal);
>> +
>> +/* Procedure 1.2.9 - Disable Downstream Link Training */
>> +static uint32_t phy_disable_tx_rxcal(struct npu3_dev *dev)
>> +{
>> +	int lane;
>> +
>> +	npu3_for_each_lane(lane, dev)
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_TX_RXCAL, lane, 0);
>> +
>> +	return NPU3_PROC_COMPLETE;
>> +}
>> +DEFINE_PROCEDURE(phy_disable_tx_rxcal);
>> +
>> +/* Procedure 1.2.7 - I/O PHY Upstream Link Training */
>> +static uint32_t phy_rx_training(struct npu3_dev *dev)
>> +{
>> +	int lane;
>> +
>> +	npu3_for_each_lane(lane, dev)
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_RUN_LANE, lane, 1);
>> +
>> +	return NPU3_PROC_NEXT;
>> +}
>> +
>> +static uint32_t phy_rx_training_wait(struct npu3_dev *dev)
>> +{
>> +	int lane;
>> +
>> +	npu3_for_each_lane(lane, dev)
>> +		if (!npu3_phy_read_lane(dev, &NPU3_PHY_RX_INIT_DONE, lane))
>> +			return NPU3_PROC_INPROGRESS;
>> +
>> +	return NPU3_PROC_COMPLETE;
>> +}
>> +
>> +DEFINE_PROCEDURE(phy_rx_training, phy_rx_training_wait);
>> +
>> +static void npu3_dev_fence_set(struct npu3_dev *dev, uint8_t state)
>> +{
>> +	struct npu3 *npu = dev->npu;
>> +	uint64_t val;
>> +
>> +	val = npu3_read(npu, NPU3_NTL_MISC_CFG1(dev->index));
>> +	val = SETFIELD(NPU3_NTL_MISC_CFG1_NTL_RESET, val, state);
>> +	npu3_write(npu, NPU3_NTL_MISC_CFG1(dev->index), val);
>> +}
>> +
>> +static uint8_t npu3_dev_fence_get(struct npu3_dev *dev)
>> +{
>> +	uint64_t val;
>> +
>> +	val = npu3_read(dev->npu, NPU3_NTL_CQ_FENCE_STATUS(dev->index));
>> +	return GETFIELD(NPU3_NTL_CQ_FENCE_STATUS_FIELD, val);
>> +}
>> +
>> +/* Procedure 1.2.1 - Reset NPU/NDL */
>> +static uint32_t reset_ntl(struct npu3_dev *dev)
>> +{
>> +	struct npu3 *npu = dev->npu;
>> +	uint64_t val;
>> +	int lane;
>> +
>> +	set_iovalid(dev, true);
>> +
>> +	/* Power on clocks */
>> +	npu3_phy_write(dev, &NPU3_PHY_RX_CLKDIST_PDWN, 0);
>> +	npu3_phy_write(dev, &NPU3_PHY_RX_IREF_PDWN, 1);
>> +	npu3_phy_write(dev, &NPU3_PHY_TX_CLKDIST_PDWN, 0);
>> +	npu3_phy_write(dev, &NPU3_PHY_RX_CTL_DATASM_CLKDIST_PDWN, 0);
>> +
>> +	npu3_for_each_lane(lane, dev) {
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_LANE_ANA_PDWN, lane, 0);
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_RX_LANE_DIG_PDWN, lane, 0);
>> +		npu3_phy_write_lane(dev, &NPU3_PHY_TX_LANE_PDWN, lane, 0);
>> +	}
>> +
>> +	/* Write PRI */
>> +	val = SETFIELD(NPU3_NTL_PRI_CFG_NDL, 0ull, dev->index);
>> +	npu3_write(npu, NPU3_NTL_PRI_CFG(dev->index), val);
>> +
>> +	/* Disable RX parity checking */
>> +	val = npu3_read(npu, NPU3_NTL_MISC_CFG2(dev->index));
>> +	val &= ~NPU3_NTL_MISC_CFG2_NDL_RX_PARITY_ENA;
>> +	npu3_write(npu, NPU3_NTL_MISC_CFG2(dev->index), val);
>> +
>> +	if (dev->type == NPU3_DEV_TYPE_NVLINK)
>> +		npu3_pvd_flag_clear(dev, NPU3_DEV_DL_RESET);
>> +
>> +	npu3_dev_fence_set(dev, NPU3_NTL_CQ_FENCE_STATUS_FULL);
>> +
>> +	return NPU3_PROC_NEXT;
>> +}
>> +
>> +static uint32_t reset_ndl(struct npu3_dev *dev)
>> +{
>> +	struct npu3 *npu = dev->npu;
>> +	uint64_t reg;
>> +	uint32_t val32;
>> +
>> +	if (npu3_dev_fence_get(dev) != NPU3_NTL_CQ_FENCE_STATUS_FULL)
>> +		return NPU3_PROC_INPROGRESS;
>> +
>> +	reg = NPU3_DLPL_CTL(dev->index);
>> +	val32 = npu3_read_4b(npu, reg);
>> +	val32 |= NPU3_DLPL_CTL_RESET_RX | NPU3_DLPL_CTL_RESET_MISC;
>> +	npu3_write_4b(npu, reg, val32);
>> +
>> +	val32 = npu3_read_4b(npu, reg);
>> +	val32 &= ~(NPU3_DLPL_CTL_RESET_RX | NPU3_DLPL_CTL_RESET_MISC);
>> +	npu3_write_4b(npu, reg, val32);
>> +
>> +	reg = NPU3_DLPL_CFG(dev->index);
>> +	val32 = NPU3_DLPL_CFG_PRI_BYTESWAP;
>> +	npu3_write_4b(npu, reg, val32);
>> +
>> +	/* Clear FIR bits */
>> +	for (uint32_t i = 0; i < NPU3_FIR_MAX; i++)
>> +		xscom_write(npu->chip_id, npu->xscom_base + NPU3_FIR(i), 0ull);
>> +
>> +	npu3_dev_fence_set(dev, NPU3_NTL_CQ_FENCE_STATUS_HALF);
>> +
>> +	return NPU3_PROC_NEXT;
>> +}
>> +
>> +static uint32_t reset_ntl_release(struct npu3_dev *dev)
>> +{
>> +	struct npu3 *npu = dev->npu;
>> +	uint32_t i = dev->index;
>> +
>> +	if (npu3_dev_fence_get(dev) != NPU3_NTL_CQ_FENCE_STATUS_HALF)
>> +		return NPU3_PROC_INPROGRESS;
>> +
>> +	/* Credit setup */
>> +	npu3_write(npu, NPU3_NTL_CREQ_HDR_CRED_SND(i), 0x0200000000000000);
>> +	npu3_write(npu, NPU3_NTL_PRB_HDR_CRED_SND(i),  0x0200000000000000);
>> +	npu3_write(npu, NPU3_NTL_ATR_HDR_CRED_SND(i),  0x0200000000000000);
>> +	npu3_write(npu, NPU3_NTL_RSP_HDR_CRED_SND(i),  0x0200000000000000);
>> +	npu3_write(npu, NPU3_NTL_CREQ_DAT_CRED_SND(i), 0x1000000000000000);
>> +	npu3_write(npu, NPU3_NTL_RSP_DAT_CRED_SND(i),  0x1000000000000000);
>> +
>> +	npu3_write(npu, NPU3_NTL_CREQ_HDR_CRED_RCV(i), 0x0000be0000000000);
>> +	npu3_write(npu, NPU3_NTL_DGD_HDR_CRED_RCV(i),  0x0000640000000000);
>> +	npu3_write(npu, NPU3_NTL_ATSD_HDR_CRED_RCV(i), 0x0000200000000000);
>> +	npu3_write(npu, NPU3_NTL_RSP_HDR_CRED_RCV(i),  0x0000be0000000000);
>> +	npu3_write(npu, NPU3_NTL_CREQ_DAT_CRED_RCV(i), 0x0001000000000000);
>> +	npu3_write(npu, NPU3_NTL_RSP_DAT_CRED_RCV(i),  0x0001000000000000);
>> +
>> +	npu3_dev_fence_set(dev, NPU3_NTL_CQ_FENCE_STATUS_NONE);
>> +
>> +	return NPU3_PROC_NEXT;
>> +}
>> +
>> +static uint32_t reset_ntl_finish(struct npu3_dev *dev) {
>> +	struct npu3 *npu = dev->npu;
>> +	uint64_t val;
>> +
>> +	if (npu3_dev_fence_get(dev) != NPU3_NTL_CQ_FENCE_STATUS_NONE)
>> +		return NPU3_PROC_INPROGRESS;
>> +
>> +	/* Enable RX parity checking */
>> +	val = npu3_read(npu, NPU3_NTL_MISC_CFG2(dev->index));
>> +	val |= NPU3_NTL_MISC_CFG2_NDL_RX_PARITY_ENA;
>> +	npu3_write(npu, NPU3_NTL_MISC_CFG2(dev->index), val);
>> +
>> +	if (dev->type == NPU3_DEV_TYPE_NVLINK)
>> +		npu3_pvd_flag_set(dev, NPU3_DEV_DL_RESET);
>> +
>> +	return NPU3_PROC_COMPLETE;
>> +}
>> +
>> +DEFINE_PROCEDURE(reset_ntl, reset_ndl, reset_ntl_release, reset_ntl_finish);
>> +
>> +static int npu3_dev_regcmp(struct npu3_dev *dev, uint64_t reg,
>> +			   const char *reg_name, uint64_t expected)
>> +{
>> +	uint64_t val;
>> +
>> +	val = npu3_read(dev->npu, reg);
>> +	if (val == expected)
>> +		return 0;
>> +
>> +	NPU3DEVERR(dev, "%s: expected 0x%llx, read 0x%llx\n",
>> +		   reg_name, expected, val);
>> +
>> +	return 1;
>> +}
>> +
>> +#define REGCMP(reg, expected) \
>> +	npu3_dev_regcmp(dev, reg(dev->index), #reg, expected);
>> +
>> +static uint32_t check_credits(struct npu3_dev *dev)
>> +{
>> +	int rc;
>> +
>> +	rc  = REGCMP(NPU3_NTL_CREQ_HDR_CRED_RCV, 0x0be0be0000000000ull);
>> +	rc |= REGCMP(NPU3_NTL_DGD_HDR_CRED_RCV,  0x0640640000000000ull);
>> +	rc |= REGCMP(NPU3_NTL_ATSD_HDR_CRED_RCV, 0x0200200000000000ull);
>> +	rc |= REGCMP(NPU3_NTL_RSP_HDR_CRED_RCV,  0x0be0be0000000000ull);
>> +	rc |= REGCMP(NPU3_NTL_CREQ_DAT_CRED_RCV, 0x1001000000000000ull);
>> +	rc |= REGCMP(NPU3_NTL_RSP_DAT_CRED_RCV,  0x1001000000000000ull);
>> +	if (rc)
>> +		return NPU3_PROC_COMPLETE | NPU3_PROC_FAILED;
>> +
>> +	return NPU3_PROC_COMPLETE;
>> +}
>> +
>> +DEFINE_PROCEDURE(check_credits);
>> +
>> +static struct procedure *procedures[] = {
>> +	 [0] = &procedure_stop,
>> +	 [1] = &procedure_nop,
>> +	 [4] = &procedure_phy_reset,
>> +	 [5] = &procedure_phy_tx_zcal,
>> +	 [6] = &procedure_phy_rx_dccal,
>> +	 [7] = &procedure_phy_enable_tx_rxcal,
>> +	 [8] = &procedure_phy_disable_tx_rxcal,
>> +	 [9] = &procedure_phy_rx_training,
>> +	[10] = &procedure_reset_ntl,
>> +	[11] = &procedure_nop, /* Placeholder for pre-terminate */
>> +	[12] = &procedure_nop, /* Placeholder for terminate */
>> +	[13] = &procedure_check_credits,
>> +};
>> +
>> +void npu3_dev_procedure_init(struct npu3_dev *dev, uint32_t pnum)
>> +{
>> +	struct npu3_procedure *proc = &dev->proc;
>> +	const char *name;
>> +
>> +	if (pnum >= ARRAY_SIZE(procedures) || !procedures[pnum]) {
>> +		NPU3DEVERR(dev, "Unsupported procedure number %d\n", pnum);
>> +		proc->status = NPU3_PROC_COMPLETE | NPU3_PROC_UNSUPPORTED;
>> +		return;
>> +	}
>> +
>> +	name = procedures[pnum]->name;
>> +
>> +	if (proc->number == pnum && !(proc->status & NPU3_PROC_COMPLETE))
>> +		NPU3DEVINF(dev, "Restarting procedure %s\n", name);
>> +	else
>> +		NPU3DEVINF(dev, "Starting procedure %s\n", name);
>> +
>> +	proc->status = NPU3_PROC_INPROGRESS;
>> +	proc->number = pnum;
>> +	proc->step = 0;
>> +	proc->timeout = mftb() + msecs_to_tb(1000);
>> +}
>> +
>> +static uint32_t npu3_dev_procedure_run_step(struct npu3_dev *dev)
>> +{
>> +	struct npu3_procedure *proc = &dev->proc;
>> +	uint32_t result;
>> +
>> +	result = procedures[proc->number]->steps[proc->step](dev);
>> +	if (result & NPU3_PROC_NEXT) {
>> +		proc->step++;
>> +
>> +		NPU3DEVINF(dev, "Running procedure %s step %d\n",
>> +			   procedures[proc->number]->name, proc->step);
>> +	}
>> +
>> +	return result;
>> +}
>> +
>> +static void npu3_dev_procedure_run(struct npu3_dev *dev)
>> +{
>> +	struct npu3_procedure *proc = &dev->proc;
>> +	const char *name;
>> +	uint32_t result;
>> +
>> +	do {
>> +		result = npu3_dev_procedure_run_step(dev);
>> +	} while (result & NPU3_PROC_NEXT);
>> +
>> +	name = procedures[proc->number]->name;
>> +
>> +	if (result & NPU3_PROC_COMPLETE) {
>> +		NPU3DEVINF(dev, "Procedure %s complete\n", name);
>> +	} else if (tb_compare(mftb(), proc->timeout) == TB_AAFTERB) {
>> +		NPU3DEVINF(dev, "Procedure %s timed out\n", name);
>> +		result = NPU3_PROC_COMPLETE | NPU3_PROC_FAILED;
>> +	}
>> +
>> +	/* Mask off internal state bits */
>> +	proc->status = result & NPU3_PROC_STATUS_MASK;
>> +}
>> +
>> +uint32_t npu3_dev_procedure_status(struct npu3_dev *dev)
>> +{
>> +	/* Run the procedure if not already complete */
>> +	if (!(dev->proc.status & NPU3_PROC_COMPLETE))
>> +		npu3_dev_procedure_run(dev);
>> +
>> +	return dev->proc.status;
>> +}
>> +
>> +int64_t npu3_dev_reset(struct npu3_dev *dev)
>> +{
>> +	unsigned long timeout;
>> +
>> +	reset_ntl(dev);
>> +	timeout = mftb() + msecs_to_tb(1000);
>> +
>> +	while (npu3_dev_fence_get(dev) != NPU3_NTL_CQ_FENCE_STATUS_FULL) {
>> +		if (tb_compare(mftb(), timeout) == TB_AAFTERB) {
>> +			NPU3DEVINF(dev, "Device reset timed out\n");
>> +			return OPAL_BUSY;
>> +		}
>> +	}
>> +
>> +	return OPAL_SUCCESS;
>> +}
>> diff --git a/hw/npu3-nvlink.c b/hw/npu3-nvlink.c
>> new file mode 100644
>> index 000000000000..95188f824e0e
>> --- /dev/null
>> +++ b/hw/npu3-nvlink.c
>> @@ -0,0 +1,1841 @@
>> +/* Copyright 2019 IBM Corp.
>> + *
>> + * Licensed under the Apache License, Version 2.0 (the "License");
>> + * you may not use this file except in compliance with the License.
>> + * You may obtain a copy of the License at
>> + *
>> + *	http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
>> + * implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +
>> +#include <skiboot.h>
>> +#include <device.h>
>> +#include <phys-map.h>
>> +#include <npu3.h>
>> +#include <npu3-regs.h>
>> +#include <pci-virt.h>
>> +#include <xscom.h>
>> +#include <xscom-p9-regs.h>
>> +#include <interrupts.h>
>> +#include <pci-cfg.h>
>> +#include <pci-slot.h>
>> +#include <cache-p9.h>
>> +
>> +#define NPU3LOG(l, npu, fmt, a...)		\
>> +	prlog(l, "NPU#%04x[%d:%d]: " fmt,	\
>> +	      (npu)->nvlink.phb.opal_id,	\
>> +	      (npu)->chip_id,			\
>> +	      (npu)->index, ##a)
>> +#define NPU3DBG(npu, fmt, a...) NPU3LOG(PR_DEBUG, npu, fmt, ##a)
>> +#define NPU3INF(npu, fmt, a...) NPU3LOG(PR_INFO, npu, fmt, ##a)
>> +#define NPU3ERR(npu, fmt, a...) NPU3LOG(PR_ERR, npu, fmt, ##a)
>> +
>> +#define NPU3DEVLOG(l, dev, fmt, a...)			\
>> +	prlog(l, "NPU#%04x:%02x:%02x.%x " fmt,		\
>> +	      (dev)->npu->nvlink.phb.opal_id,		\
>> +	      (dev)->nvlink.pvd->bdfn >> 8 & 0xff,	\
>> +	      (dev)->nvlink.pvd->bdfn >> 3 & 0x1f,	\
>> +	      (dev)->nvlink.pvd->bdfn & 0x7, ##a)
>> +#define NPU3DEVDBG(dev, fmt, a...) NPU3DEVLOG(PR_DEBUG, dev, fmt, ##a)
>> +#define NPU3DEVINF(dev, fmt, a...) NPU3DEVLOG(PR_INFO, dev, fmt, ##a)
>> +#define NPU3DEVERR(dev, fmt, a...) NPU3DEVLOG(PR_ERR, dev, fmt, ##a)
>> +
>> +#define NPU3_CFG_READ(size, type)					\
>> +static int64_t npu3_cfg_read##size(struct phb *phb, uint32_t bdfn,	\
>> +				   uint32_t offset, type *data)		\
>> +{									\
>> +	uint32_t val;							\
>> +	int64_t ret;							\
>> +									\
>> +	ret = pci_virt_cfg_read(phb, bdfn, offset,			\
>> +				sizeof(*data), &val);			\
>> +	*data = (type)val;						\
>> +	return ret;							\
>> +}
>> +
>> +#define NPU3_CFG_WRITE(size, type)					\
>> +static int64_t npu3_cfg_write##size(struct phb *phb, uint32_t bdfn,	\
>> +				    uint32_t offset, type data)		\
>> +{									\
>> +	uint32_t val = data;						\
>> +	int64_t ret;							\
>> +									\
>> +	ret = pci_virt_cfg_write(phb, bdfn, offset,			\
>> +				 sizeof(data), val);			\
>> +	return ret;							\
>> +}
>> +
>> +NPU3_CFG_READ(8, u8);
>> +NPU3_CFG_READ(16, u16);
>> +NPU3_CFG_READ(32, u32);
>> +NPU3_CFG_WRITE(8, u8);
>> +NPU3_CFG_WRITE(16, u16);
>> +NPU3_CFG_WRITE(32, u32);
>> +
>> +static int64_t npu3_eeh_freeze_status(struct phb *phb __unused,
>> +				      uint64_t pe_num __unused,
>> +				      uint8_t *freeze_state,
>> +				      uint16_t *pci_error_type,
>> +				      uint16_t *severity)
>> +{
>> +	/*
>> +	 * FIXME: When it's called by skiboot PCI config accessor,
>> +	 * the PE number is fixed to 0, which is incorrect. We need
>> +	 * introduce another PHB callback to translate it. For now,
>> +	 * it keeps the skiboot PCI enumeration going.
>> +	 */
>> +	*freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
>> +	*pci_error_type = OPAL_EEH_NO_ERROR;
>> +
>> +	if (severity)
>> +		*severity = OPAL_EEH_SEV_NO_ERROR;
>> +
>> +	return OPAL_SUCCESS;
>> +}
>> +
>> +/* Number of PEs supported */
>> +#define NPU3_MAX_PE_NUM		16
>> +#define NPU3_RESERVED_PE_NUM	15
>> +
>> +static int64_t npu3_ioda_reset(struct phb *phb, bool purge __unused)
>> +{
>> +	struct npu3 *npu = npu3_phb_to_npu(phb);
>> +	uint64_t val;
>> +
>> +	val = NPU3_ATS_IODA_ADDR_AUTO_INC;
>> +	val = SETFIELD(NPU3_ATS_IODA_ADDR_TBL_SEL, val,
>> +		       NPU3_ATS_IODA_ADDR_TBL_TVT);
>> +	npu3_write(npu, NPU3_ATS_IODA_ADDR, val);
>> +
>> +	for (uint32_t i = 0; i < NPU3_MAX_PE_NUM; i++)
>> +		npu3_write(npu, NPU3_ATS_IODA_DATA, 0ull);
>> +
>> +	return OPAL_SUCCESS;
>> +}
>> +
>> +static inline void npu3_ioda_sel(struct npu3 *npu, uint32_t table,
>> +				 uint32_t index)
>> +{
>> +	uint64_t val;
>> +
>> +	val = SETFIELD(NPU3_ATS_IODA_ADDR_TBL_SEL, 0ull, table);
>> +	val = SETFIELD(NPU3_ATS_IODA_ADDR_TBL_ADDR, val, index);
>> +	npu3_write(npu, NPU3_ATS_IODA_ADDR, val);
>> +}
>> +
>> +static int64_t npu3_map_pe_dma_window(struct phb *phb,
>> +				      uint64_t pe_num,
>> +				      uint16_t window_id,
>> +				      uint16_t tce_levels,
>> +				      uint64_t tce_table_addr,
>> +				      uint64_t tce_table_size,
>> +				      uint64_t tce_page_size)
>> +{
>> +	struct npu3 *npu = npu3_phb_to_npu(phb);
>> +	uint64_t tts_encoded, val;
>> +	uint32_t page_size;
>> +
>> +	/* Each PE has one corresponding TVE */
>> +	if (window_id != pe_num || pe_num >= NPU3_MAX_PE_NUM)
>> +		return OPAL_PARAMETER;
>> +
>> +	npu3_ioda_sel(npu, NPU3_ATS_IODA_ADDR_TBL_TVT, pe_num);
>> +
>> +	/* TCE table size zero is used to disable the TVE */
>> +	if (!tce_table_size) {
>> +		npu3_write(npu, NPU3_ATS_IODA_DATA, 0ull);
>> +		return OPAL_SUCCESS;
>> +	}
>> +
>> +	/* TCE table size */
>> +	if (!is_pow2(tce_table_size) || tce_table_size < 0x1000)
>> +		return OPAL_PARAMETER;
>> +
>> +	tts_encoded = ilog2(tce_table_size) - 11;
>> +	if (tts_encoded > 39)
>> +		return OPAL_PARAMETER;
>> +
>> +	val = SETFIELD(NPU3_ATS_IODA_TVT_TABLE_SIZE, 0ull, tts_encoded);
>> +
>> +	/* Number of levels */
>> +	if (tce_levels < 1 || tce_levels > 4)
>> +		return OPAL_PARAMETER;
>> +
>> +	val = SETFIELD(NPU3_ATS_IODA_TVT_TABLE_LEVEL, val, tce_levels - 1);
>> +
>> +	/* TCE page size */
>> +	switch (tce_page_size) {
>> +	case 256 << 20:
>> +		page_size = 17;
>> +		break;
>> +	case 16 << 20:
>> +		page_size = 13;
>> +		break;
>> +	case 64 << 10:
>> +		page_size = 5;
>> +		break;
>> +	default:
>> +		page_size = 1;
>> +	}
>> +
>> +	val = SETFIELD(NPU3_ATS_IODA_TVT_PAGE_SIZE, val, page_size);
>> +	val = SETFIELD(NPU3_ATS_IODA_TVT_XLAT_ADDR, val, tce_table_addr >> 12);
>> +	npu3_write(npu, NPU3_ATS_IODA_DATA, val);
>> +
>> +	return OPAL_SUCCESS;
>> +}
>> +
>> +static int64_t npu3_map_pe_dma_window_real(struct phb *phb,
>> +					   uint64_t pe_num,
>> +					   uint16_t window_id,
>> +					   uint64_t pci_start_addr __unused,
>> +					   uint64_t pci_mem_size __unused)
>> +{
>> +	struct npu3 *npu = npu3_phb_to_npu(phb);
>> +	uint64_t val;
>> +
>> +	/* Each PE has one corresponding TVE */
>> +	if (window_id != pe_num || pe_num >= NPU3_MAX_PE_NUM)
>> +		return OPAL_PARAMETER;
>> +
>> +	if (pci_mem_size) {
>> +		/*
>> +		 * GPUs need to be able to access the MMIO memory space as well.
>> +		 * On POWER9 this is above the top of RAM, so disable the TVT
>> +		 * range check, allowing access to all memory addresses.
>> +		 */
>> +		val = 0;
>> +	} else {
>> +		/* Disable */
>> +		val = PPC_BIT(51);
>> +	}
>> +
>> +	npu3_ioda_sel(npu, NPU3_ATS_IODA_ADDR_TBL_TVT, pe_num);
>> +	npu3_write(npu, NPU3_ATS_IODA_DATA, val);
>> +
>> +	return OPAL_SUCCESS;
>> +}
>> +
>> +static int64_t npu3_next_error(struct phb *phb,
>> +			       uint64_t *first_frozen_pe,
>> +			       uint16_t *pci_error_type,
>> +			       uint16_t *severity)
>> +{
>> +	struct npu3 *npu = npu3_phb_to_npu(phb);
>> +	uint32_t pe_num;
>> +
>> +	if (!first_frozen_pe || !pci_error_type || !severity)
>> +		return OPAL_PARAMETER;
>> +
>> +	*first_frozen_pe = -1;
>> +	*pci_error_type = OPAL_EEH_NO_ERROR;
>> +	*severity = OPAL_EEH_SEV_NO_ERROR;
>> +
>> +	for (pe_num = 0; pe_num < NPU3_MAX_PE_NUM; pe_num++) {
>> +		if (!npu3_read(npu, NPU3_MISC_PESTB(pe_num)))
>> +			continue;
>> +
>> +		*first_frozen_pe = pe_num;
>> +		*pci_error_type = OPAL_EEH_PE_ERROR;
>> +		*severity = OPAL_EEH_SEV_PE_ER;
>> +		break;
>> +	}
>> +
>> +	return OPAL_SUCCESS;
>> +}
>> +
>> +static struct npu3_dev *npu3_bdfn_to_dev(struct npu3 *npu, uint32_t bdfn)
>> +{
>> +	struct pci_virt_device *pvd;
>> +
>> +	/* All emulated devices are attached to root bus */
>> +	if (bdfn & ~0xff)
>> +		return NULL;
>> +
>> +	pvd = pci_virt_find_device(&npu->nvlink.phb, bdfn);
>> +	if (pvd)
>> +		return pvd->data;
>> +
>> +	return NULL;
>> +}
>> +
>> +static int npu3_match_gpu(struct phb *phb __unused, struct pci_device *pd,
>> +			  void *data)
>> +{
>> +	const char *slot = data;
>> +	struct dt_node *dn;
>> +	char *loc_code;
>> +
>> +	/* Ignore non-NVIDIA devices */
>> +	if ((pd->vdid & 0xffff) != 0x10de)
>
>PCI_VENDOR_ID()?

Ah, nice! Will use.
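
Assuming that macro is just the mask of the low 16 bits of vdid, the 
check simply becomes:

-	if ((pd->vdid & 0xffff) != 0x10de)
+	if (PCI_VENDOR_ID(pd->vdid) != 0x10de)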

>> +		return 0;
>> +
>> +	/* Find the PCI device's slot location */
>> +	for (dn = pd->dn;
>> +	     dn && !dt_find_property(dn, "ibm,loc-code");
>> +	     dn = dn->parent);
>> +
>> +	if (!dn)
>> +		return 0;
>> +
>> +	loc_code = (char *)dt_prop_get(dn, "ibm,loc-code");
>> +	if (streq(loc_code, slot))
>> +		return 1;
>> +
>> +	return 0;
>> +}
>> +
>> +static void npu3_dev_find_gpu(struct npu3_dev *dev)
>> +{
>> +	const char *slot = dev->nvlink.loc_code;
>> +	struct phb *phb;
>> +	struct pci_device *gpu;
>> +
>> +	if (!slot)
>> +		return;
>> +
>> +	for_each_phb(phb) {
>> +		gpu = pci_walk_dev(phb, NULL, npu3_match_gpu, (void *)slot);
>> +		if (!gpu)
>> +			continue;
>> +
>> +		dev->nvlink.gpu = gpu;
>> +		return;
>> +	}
>> +
>> +	NPU3DEVINF(dev, "No PCI device found for slot '%s'\n", slot);
>> +}
>> +
>> +#define VENDOR_CAP_START		0x80
>> +#define VENDOR_CAP_LINK_FLAG_OFFSET	0x0d
>> +
>> +void npu3_pvd_flag_set(struct npu3_dev *dev, uint8_t flag)
>> +{
>> +	uint32_t offset = VENDOR_CAP_START + VENDOR_CAP_LINK_FLAG_OFFSET;
>> +	uint32_t flags;
>> +
>> +	PCI_VIRT_CFG_RDONLY_RD(dev->nvlink.pvd, offset, 1, &flags);
>> +	flags |= flag;
>> +	PCI_VIRT_CFG_INIT_RO(dev->nvlink.pvd, offset, 1, flags);
>> +}
>> +
>> +void npu3_pvd_flag_clear(struct npu3_dev *dev, uint8_t flag)
>> +{
>> +	uint32_t offset = VENDOR_CAP_START + VENDOR_CAP_LINK_FLAG_OFFSET;
>> +	uint32_t flags;
>> +
>> +	PCI_VIRT_CFG_RDONLY_RD(dev->nvlink.pvd, offset, 1, &flags);
>> +	flags &= ~flag;
>> +	PCI_VIRT_CFG_INIT_RO(dev->nvlink.pvd, offset, 1, flags);
>> +}
>> +
>> +static struct lock npu3_phandle_lock = LOCK_UNLOCKED;
>> +
>> +static void npu3_append_phandle(struct dt_node *dn, const char *name,
>> +				uint32_t phandle)
>> +{
>> +	struct dt_property *prop;
>> +	uint32_t *phandles;
>> +	size_t len;
>> +
>> +	prop = __dt_find_property(dn, name);
>> +	if (!prop) {
>> +		dt_add_property_cells(dn, name, phandle);
>> +		return;
>> +	}
>> +
>> +	/*
>> +	 * Make sure no one else has a reference to the property. Assume
>> +	 * this is the only function that holds a reference to it.
>> +	 */
>> +	lock(&npu3_phandle_lock);
>> +
>> +	/* Need to append to the property */
>> +	len = prop->len + sizeof(*phandles);
>> +	dt_resize_property(&prop, len);
>> +	prop->len = len;
>> +
>> +	phandles = (uint32_t *)prop->prop;
>> +	phandles[len / sizeof(*phandles) - 1] = phandle;
>> +
>> +	unlock(&npu3_phandle_lock);
>> +}
>> +
>> +static void npu3_dev_fixup_dt(struct npu3_dev *dev)
>> +{
>> +	struct pci_device *pd = dev->nvlink.pd;
>> +	struct pci_device *gpu = dev->nvlink.gpu;
>> +
>> +	dt_add_property_cells(pd->dn, "ibm,nvlink", dev->dn->phandle);
>> +	dt_add_property_string(pd->dn, "ibm,loc-code", dev->nvlink.loc_code);
>> +	if (dev->link_speed != 0xff)
>> +		dt_add_property_cells(pd->dn, "ibm,nvlink-speed",
>> +				      lo32(dev->link_speed));
>> +
>> +	if (!gpu)
>> +		return;
>> +
>> +	npu3_append_phandle(gpu->dn, "ibm,npu", pd->dn->phandle);
>> +	dt_add_property_cells(pd->dn, "ibm,gpu", gpu->dn->phandle);
>> +}
>> +
>> +static int64_t npu3_gpu_bridge_sec_bus_reset(void *pdev,
>> +				struct pci_cfg_reg_filter *pcrf __unused,
>> +				uint32_t offset, uint32_t len,
>> +				uint32_t *data, bool write)
>> +{
>> +	struct pci_device *pd = pdev;
>> +	struct pci_device *gpu;
>> +	struct npu3 *npu;
>> +	struct npu3_dev *dev;
>> +	bool purge = false;
>> +
>> +	if (!write)
>> +		return OPAL_PARAMETER;
>> +
>> +	if (len != 2 || offset & 1) {
>> +		PCIERR(pd->phb, pd->bdfn,
>> +		       "Unsupported write to bridge control register\n");
>> +		return OPAL_PARAMETER;
>> +	}
>> +
>> +	if (!(*data & PCI_CFG_BRCTL_SECONDARY_RESET))
>> +		return OPAL_PARTIAL;
>> +
>> +	gpu = list_top(&pd->children, struct pci_device, link);
>> +	if (!gpu)
>> +		return OPAL_PARTIAL;
>> +
>> +	npu3_for_each_nvlink_npu(npu)
>> +		npu3_for_each_nvlink_dev(dev, npu)
>> +			if (dev->nvlink.gpu == gpu)
>> +				if (!npu3_dev_reset(dev))
>> +					purge = true;
>> +
>> +	if (purge)
>> +		purge_l2_l3_caches();
>> +
>> +	return OPAL_PARTIAL;
>> +}
>> +
>> +static int npu3_dev_bind(struct phb *phb, struct pci_device *pd,
>> +			 void *data __unused)
>> +{
>> +	struct npu3 *npu = npu3_phb_to_npu(phb);
>> +	struct npu3_dev *dev = npu3_bdfn_to_dev(npu, pd->bdfn);
>> +	struct pci_device *gpu;
>> +
>> +	dev->nvlink.pd = pd;
>> +
>> +	/* The slot label indicates which GPU this link is connected to */
>> +	dev->nvlink.loc_code = dt_prop_get_def(dev->dn, "ibm,slot-label", NULL);
>> +	if (!dev->nvlink.loc_code) {
>> +		/**
>> +		 * @fwts-label NPUNoPHBSlotLabel
>> +		 * @fwts-advice No GPU/NPU slot information was found.
>> +		 * NVLink3 functionality will not work.
>> +		 */
>> +		NPU3DEVERR(dev, "Cannot find GPU slot information\n");
>> +	}
>> +
>> +	npu3_dev_find_gpu(dev);
>> +	npu3_dev_fixup_dt(dev);
>> +
>> +	gpu = dev->nvlink.gpu;
>> +	if (!gpu)
>> +		return 0;
>> +
>> +	/* When a GPU is reset, ensure all of its links are reset too */
>> +	if (gpu->parent && gpu->parent->slot)
>> +		pci_add_cfg_reg_filter(gpu->parent, PCI_CFG_BRCTL, 2,
>> +				       PCI_REG_FLAG_WRITE,
>> +				       npu3_gpu_bridge_sec_bus_reset);
>> +
>> +	npu3_pvd_flag_set(dev, NPU3_DEV_PCI_LINKED);
>> +
>> +	return 0;
>> +}
>> +
>> +struct npu3 *npu3_next_nvlink_npu(struct npu3 *npu, uint32_t chip_id)
>> +{
>> +	uint64_t phb_id = 0;
>> +	struct phb *phb;
>> +
>> +	if (npu)
>> +		phb_id = npu->nvlink.phb.opal_id + 1;
>> +
>> +	for (; (phb = __pci_next_phb_idx(&phb_id));) {
>> +		if (phb->phb_type != phb_type_npu_v3)
>> +			continue;
>> +
>> +		npu = npu3_phb_to_npu(phb);
>> +		if (npu->chip_id == chip_id || chip_id == NPU3_ANY_CHIP)
>> +			return npu;
>> +	}
>> +
>> +	return NULL;
>> +}
>> +
>> +static struct npu3 *npu3_last_npu(void)
>> +{
>> +	static struct npu3 *last = NULL;
>> +	struct npu3 *npu;
>> +
>> +	if (last)
>> +		return last;
>> +
>> +	npu3_for_each_nvlink_npu(npu)
>> +		last = npu;
>> +
>> +	return last;
>> +}
>> +
>> +static uint32_t npu3_gpu_links(struct pci_device *gpu)
>> +{
>> +	const struct dt_property *prop;
>> +
>> +	if (!gpu)
>> +		return 0;
>> +
>> +	/* The link count is the number of phandles in "ibm,npu" */
>> +	prop = dt_find_property(gpu->dn, "ibm,npu");
>> +	if (!prop)
>> +		return 0;
>> +
>> +	return prop->len / sizeof(uint32_t);
>> +}
>> +
>> +static uint32_t npu3_links_per_gpu(void)
>> +{
>> +	static uint32_t links = -1;
>> +	struct npu3 *npu;
>> +	struct npu3_dev *dev;
>> +
>> +	/* Static value, same for all GPUs; only do this once */
>> +	if (links != -1)
>> +		return links;
>> +
>> +	/* Use the first GPU we find to figure this out */
>> +	npu3_for_each_nvlink_npu(npu) {
>> +		npu3_for_each_nvlink_dev(dev, npu) {
>> +			links = npu3_gpu_links(dev->nvlink.gpu);
>> +			if (links)
>> +				goto out;
>> +		}
>> +	}
>> +
>> +out:
>> +	prlog(PR_DEBUG, "NPU: %s: %d\n", __func__, links);
>> +
>> +	return links;
>
>
>Can this possibly return links==0? It relies on "ibm,npu" properties
>which are or are not created by now? If they are, then you do not need
>"static" for @links as this is only called from
>npu3_chip_possible_gpus() which has static @possible itself.

Yes, it will return 0 on a system without GPUs. 

>Statics are bad.

You're right, I can do with just the one in npu3_chip_possible_gpus().  
Will get rid of the others.
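
With the static gone, npu3_links_per_gpu() reduces to roughly this 
(same walk, minus the caching and the debug print):

static uint32_t npu3_links_per_gpu(void)
{
	struct npu3 *npu;
	struct npu3_dev *dev;
	uint32_t links;

	/* Use the first GPU we find to figure this out */
	npu3_for_each_nvlink_npu(npu) {
		npu3_for_each_nvlink_dev(dev, npu) {
			links = npu3_gpu_links(dev->nvlink.gpu);
			if (links)
				return links;
		}
	}

	/* No GPUs connected */
	return 0;
}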

>> +}
>> +
>> +uint32_t npu3_dev_gpu_index(struct npu3_dev *dev)
>> +{
>> +	const char *slot;
>> +	char *p = NULL;
>> +	int ret;
>> +
>> +	slot = dev->nvlink.loc_code;
>> +	if (!slot)
>> +		return -1;
>> +
>> +	if (memcmp(slot, "GPU", 3))
>> +		return -1;
>> +
>> +	ret = strtol(slot + 3, &p, 10);
>> +	if (*p || p == slot + 3)
>> +		return -1;
>> +
>> +	return ret;
>> +}
>> +
>> +static uint32_t npu3_chip_possible_gpu_links(void)
>> +{
>> +	struct proc_chip *chip;
>> +	struct npu3 *npu;
>> +	struct npu3_dev *dev;
>> +	static uint32_t possible = -1;
>
>
>You do not need this static as this is only called from
>npu3_chip_possible_gpus() which has its own static @possible.

Yep.

>> +
>> +	/* Static value, same for all chips; only do this once */
>> +	if (possible != -1)
>> +		return possible;
>> +
>> +	possible = 0;
>> +
>> +	for_each_chip(chip) {
>> +		npu3_for_each_chip_nvlink_npu(npu, chip->id)
>> +			npu3_for_each_nvlink_dev(dev, npu)
>> +				if (npu3_dev_gpu_index(dev) != -1)
>> +					possible++;
>> +
>> +		if (possible)
>> +			break;
>> +	}
>> +
>> +	prlog(PR_DEBUG, "NPU: %s: %d\n", __func__, possible);
>> +
>> +	return possible;
>> +}
>> +
>> +uint32_t npu3_chip_possible_gpus(void)
>> +{
>> +	static uint32_t possible = -1;
>> +	uint32_t links_per_gpu;
>> +
>> +	/* Static value, same for all chips; only do this once */
>> +	if (possible != -1)
>> +		return possible;
>> +
>> +	possible = 0;
>> +
>> +	links_per_gpu = npu3_links_per_gpu();
>> +	if (links_per_gpu)
>> +		possible = npu3_chip_possible_gpu_links() / links_per_gpu;
>> +
>> +	prlog(PR_DEBUG, "NPU: %s: %d\n", __func__, possible);
>> +
>> +	return possible;
>> +}
>> +
>> +static void npu3_dev_assign_gmb(struct npu3_dev *dev, uint64_t addr,
>> +				uint64_t size)
>> +{
>> +	uint32_t mode;
>> +	uint64_t val;
>> +
>> +	switch (npu3_gpu_links(dev->nvlink.gpu)) {
>> +	case 0:
>> +		return;
>> +	case 1:
>> +		mode = 0;
>> +		break;
>> +	case 2:
>> +		mode = 1;
>> +		break;
>> +	case 3:
>> +		mode = 3;
>> +		break;
>> +	case 4:
>> +		mode = 6;
>> +		break;
>> +	case 6:
>> +		mode = 10;
>> +		break;
>> +	default:
>> +		/* Hardware does not support this configuration */
>> +		assert(0);
>> +	}
>> +
>> +	mode += dev->nvlink.pvd->bdfn & 0x7;
>> +
>> +	val = NPU3_GPU_MEM_BAR_ENABLE |
>> +	      NPU3_GPU_MEM_BAR_POISON;
>> +	val = SETFIELD(NPU3_GPU_MEM_BAR_ADDR, val, addr >> 30);
>> +	val = SETFIELD(NPU3_GPU_MEM_BAR_SIZE, val, size >> 30);
>> +	val = SETFIELD(NPU3_GPU_MEM_BAR_MODE, val, mode);
>> +
>> +	npu3_write(dev->npu, NPU3_GPU_MEM_BAR(dev->index), val);
>> +}
>> +
>> +static struct dt_node *npu3_create_memory_dn(struct npu3_dev *dev,
>> +					     uint32_t gpu_index, uint64_t addr,
>> +					     uint64_t size)
>> +{
>> +	uint32_t nid = 255 - gpu_index;
>> +	struct dt_node *mem;
>> +
>> +	mem = dt_find_by_name_addr(dt_root, "memory", addr);
>> +	if (mem)
>> +		return mem;
>> +
>> +	mem = dt_new_addr(dt_root, "memory", addr);
>> +	assert(mem);
>> +
>> +	dt_add_property_string(mem, "device_type", "memory");
>> +	dt_add_property_string(mem, "compatible", "ibm,coherent-device-memory");
>> +	dt_add_property_u64s(mem, "reg", addr, size);
>> +	dt_add_property_u64s(mem, "linux,usable-memory", addr, 0);
>> +	dt_add_property_cells(mem, "ibm,chip-id", nid);
>> +	dt_add_property_cells(mem, "ibm,associativity", 4, nid, nid, nid, nid);
>> +
>> +	NPU3INF(dev->npu, "%s mem: 0x%016llx (nid %d)\n", dev->nvlink.loc_code,
>> +		addr, nid);
>> +
>> +	return mem;
>> +}
>> +
>> +static void npu3_dev_init_gpu_mem(struct npu3_dev *dev)
>> +{
>> +	struct pci_device *pd = dev->nvlink.pd;
>> +	struct npu3 *npu = dev->npu;
>> +	struct dt_node *mem;
>> +	uint64_t addr, size, gta;
>> +	uint32_t gpu_index;
>> +
>> +	if (!dev->nvlink.gpu)
>> +		return;
>> +
>> +	gpu_index = npu3_dev_gpu_index(dev) % npu3_chip_possible_gpus();
>> +	phys_map_get(npu->chip_id, GPU_MEM_4T_DOWN, gpu_index, &addr, &size);
>> +
>> +	npu3_dev_assign_gmb(dev, addr, size);
>> +	mem = npu3_create_memory_dn(dev, gpu_index, addr, size);
>> +
>> +	/*
>> +	 * Coral mode address compression. This is documented in Figure 3.5 of
>> +	 * the NPU workbook; "P9->GPU RA Compression (Coral)".
>> +	 */
>> +	gta  = (addr >> 42 & 0x1) << 42;
>> +	gta |= (addr >> 45 & 0x3) << 43;
>> +	gta |= (addr >> 49 & 0x3) << 45;
>> +	gta |= addr & ((1ul << 43) - 1);
>> +
>> +	dt_add_property_cells(pd->dn, "memory-region", mem->phandle);
>> +	dt_add_property_u64s(pd->dn, "ibm,device-tgt-addr", gta);
>> +}
>> +
>> +static void npu3_final_fixup(void)
>> +{
>> +	struct npu3 *npu;
>> +	struct npu3_dev *dev;
>> +
>> +	npu3_for_each_nvlink_npu(npu)
>> +		npu3_for_each_nvlink_dev(dev, npu)
>> +			npu3_dev_init_gpu_mem(dev);
>> +}
>> +
>> +static void npu3_phb_final_fixup(struct phb *phb)
>> +{
>> +	struct npu3 *npu = npu3_phb_to_npu(phb);
>> +
>> +	pci_walk_dev(phb, NULL, npu3_dev_bind, NULL);
>> +
>> +	/* After every npu's devices are bound, do gpu-related fixup */
>> +	if (npu == npu3_last_npu())
>
>
>Why delay this till the last NPU is finally fixed up?

We have to bind all links to their GPUs before we set up the GPU memory 
BARs. That is because setting NPU3_GPU_MEM_BAR_MODE requires knowing how 
many link peers go to each GPU (see the npu3_gpu_links() switch in 
npu3_dev_assign_gmb() above).

Note that npu2 was simpler in this regard. Because there was only one 
NPU (instead of three), you could find the number of link peers just by 
parsing all the link bdfs.

>What does guarantee the order?

It works because the loop in __pci_init_slots() walks the phbs[] array 
the same way npu3_next_nvlink_npu() does. Should I maybe add a comment 
or something?
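
Something like this in npu3_phb_final_fixup(), maybe:

	/*
	 * Our caller, __pci_init_slots(), runs the phb_final_fixup
	 * hooks in the same phbs[] order that npu3_next_nvlink_npu()
	 * iterates, so when we reach npu3_last_npu(), every link on
	 * every NPU has been bound to its GPU.
	 */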

>Cannot swift_npu3_fixup() do this? Looks fragile.

It could, but this is core npu code and I don't think it makes sense to 
make each platform do it themselves.

>> +		npu3_final_fixup();
>> +}
>> +
>> +static int64_t npu3_set_pe(struct phb *phb,
>> +			   uint64_t pe_num,
>> +			   uint64_t bdfn,
>> +			   uint8_t bcompare,
>> +			   uint8_t dcompare,
>> +			   uint8_t fcompare,
>> +			   uint8_t action)
>> +{
>> +	struct npu3 *npu = npu3_phb_to_npu(phb);
>> +	struct npu3_dev *dev;
>> +	uint64_t val;
>> +
>> +	dev = npu3_bdfn_to_dev(npu, bdfn);
>> +	if (!dev)
>> +		return OPAL_PARAMETER;
>> +
>> +	if (action != OPAL_MAP_PE && action != OPAL_UNMAP_PE)
>> +		return OPAL_PARAMETER;
>> +
>> +	if (pe_num >= NPU3_MAX_PE_NUM)
>> +		return OPAL_PARAMETER;
>> +
>> +	if (bcompare != OpalPciBusAll ||
>> +	    dcompare != OPAL_COMPARE_RID_DEVICE_NUMBER ||
>> +	    fcompare != OPAL_COMPARE_RID_FUNCTION_NUMBER)
>> +		return OPAL_UNSUPPORTED;
>> +
>> +	if (!dev->nvlink.gpu)
>> +		return OPAL_SUCCESS;
>> +
>> +	val = NPU3_CTL_BDF2PE_CFG_ENABLE;
>> +	val = SETFIELD(NPU3_CTL_BDF2PE_CFG_PE, val, pe_num);
>> +	val = SETFIELD(NPU3_CTL_BDF2PE_CFG_BDF, val, dev->nvlink.gpu->bdfn);
>> +	npu3_write(npu, NPU3_CTL_BDF2PE_CFG(pe_num), val);
>> +
>> +	val = NPU3_MISC_BDF2PE_CFG_ENABLE;
>> +	val = SETFIELD(NPU3_MISC_BDF2PE_CFG_PE, val, pe_num);
>> +	val = SETFIELD(NPU3_MISC_BDF2PE_CFG_BDF, val, dev->nvlink.gpu->bdfn);
>> +	npu3_write(npu, NPU3_MISC_BDF2PE_CFG(pe_num), val);
>> +
>> +	return OPAL_SUCCESS;
>> +}
>> +
>> +static int64_t npu3_tce_kill_pages(struct npu3 *npu,
>> +				   uint64_t pe_num,
>> +				   uint32_t tce_size,
>> +				   uint64_t dma_addr,
>> +				   uint32_t npages)
>> +{
>> +	uint32_t check_tce_size;
>> +	uint64_t val;
>> +
>> +	if (pe_num >= NPU3_MAX_PE_NUM)
>> +		return OPAL_PARAMETER;
>> +
>> +	npu3_ioda_sel(npu, NPU3_ATS_IODA_ADDR_TBL_TVT, pe_num);
>> +	val = npu3_read(npu, NPU3_ATS_IODA_DATA);
>> +
>> +	check_tce_size = 0x800 << GETFIELD(NPU3_ATS_IODA_TVT_PAGE_SIZE, val);
>> +	if (check_tce_size != tce_size) {
>> +		NPU3ERR(npu, "%s: Unexpected TCE size (got 0x%x, expected 0x%x)\n",
>> +			__func__, tce_size, check_tce_size);
>> +
>> +		return OPAL_PARAMETER;
>> +	}
>> +
>> +	val = NPU3_ATS_TCE_KILL_ONE;
>> +	val = SETFIELD(NPU3_ATS_TCE_KILL_PE_NUMBER, val, pe_num);
>> +
>> +	while (npages--) {
>> +		val = SETFIELD(NPU3_ATS_TCE_KILL_ADDRESS, val, dma_addr >> 12);
>> +		npu3_write(npu, NPU3_ATS_TCE_KILL, val);
>> +
>> +		dma_addr += tce_size;
>> +	}
>> +
>> +	return OPAL_SUCCESS;
>> +}
>> +
>> +static int64_t npu3_tce_kill(struct phb *phb,
>> +			     uint32_t kill_type,
>> +			     uint64_t pe_num,
>> +			     uint32_t tce_size,
>> +			     uint64_t dma_addr,
>> +			     uint32_t npages)
>> +{
>> +	struct npu3 *npu = npu3_phb_to_npu(phb);
>> +
>> +	sync();
>> +
>> +	switch(kill_type) {
>> +	case OPAL_PCI_TCE_KILL_PAGES:
>> +		return npu3_tce_kill_pages(npu, pe_num, tce_size,
>> +					   dma_addr, npages);
>> +	case OPAL_PCI_TCE_KILL_PE:
>> +		/*
>> +		 * NPU doesn't support killing a PE so fall through
>> +		 * and do a kill all instead.
>> +		 */
>> +	case OPAL_PCI_TCE_KILL_ALL:
>> +		npu3_write(npu, NPU3_ATS_TCE_KILL, NPU3_ATS_TCE_KILL_ALL);
>> +		return OPAL_SUCCESS;
>> +	}
>> +
>> +	return OPAL_PARAMETER;
>> +}
>> +
>> +static const struct phb_ops npu_ops = {
>> +	.cfg_read8		= npu3_cfg_read8,
>> +	.cfg_read16		= npu3_cfg_read16,
>> +	.cfg_read32		= npu3_cfg_read32,
>> +	.cfg_write8		= npu3_cfg_write8,
>> +	.cfg_write16		= npu3_cfg_write16,
>> +	.cfg_write32		= npu3_cfg_write32,
>> +	.eeh_freeze_status	= npu3_eeh_freeze_status,
>> +	.ioda_reset		= npu3_ioda_reset,
>> +	.map_pe_dma_window	= npu3_map_pe_dma_window,
>> +	.map_pe_dma_window_real	= npu3_map_pe_dma_window_real,
>> +	.next_error		= npu3_next_error,
>> +	.phb_final_fixup	= npu3_phb_final_fixup,
>> +	.set_pe			= npu3_set_pe,
>> +	.tce_kill		= npu3_tce_kill,
>> +};
>> +
>> +static int64_t npu3_reset(struct pci_slot *slot)
>> +{
>> +	struct npu3 *npu = npu3_phb_to_npu(slot->phb);
>> +	struct npu3_dev *dev;
>> +	int64_t rc = OPAL_SUCCESS;
>> +	bool purge = false;
>> +
>> +	npu3_for_each_nvlink_dev(dev, npu) {
>> +		rc = npu3_dev_reset(dev);
>> +		if (rc)
>> +			break;
>> +
>> +		purge = true;
>> +	}
>> +
>> +	/* No devices reset; don't purge, just return */
>> +	if (!purge)
>> +		return rc;
>> +
>> +	/* All devices reset */
>> +	if (!rc)
>> +		return purge_l2_l3_caches();
>> +
>> +	/* Some devices successfully reset; purge, but still return error */
>> +	purge_l2_l3_caches();
>> +	return rc;
>> +}
>> +
>> +static int64_t npu3_freset(struct pci_slot *slot __unused)
>> +{
>> +	return OPAL_SUCCESS;
>> +}
>> +
>> +static int64_t npu3_get_link_state(struct pci_slot *slot __unused,
>> +				   uint8_t *val)
>> +{
>> +	*val = OPAL_SHPC_LINK_UP_x1;
>> +	return OPAL_SUCCESS;
>> +}
>> +
>> +static int64_t npu3_get_power_state(struct pci_slot *slot __unused,
>> +				    uint8_t *val)
>> +{
>> +	*val = PCI_SLOT_POWER_ON;
>> +	return OPAL_SUCCESS;
>> +}
>> +
>> +static void npu3_create_phb_slot(struct npu3 *npu)
>> +{
>> +	struct pci_slot *slot;
>> +
>> +	slot = pci_slot_alloc(&npu->nvlink.phb, NULL);
>> +	if (!slot) {
>> +		/**
>> +		 * @fwts-label NPUCannotCreatePHBSlot
>> +		 * @fwts-advice Firmware probably ran out of memory creating
>> +		 * NPU3 slot. NVLink functionality could be broken.
>> +		 */
>> +		NPU3ERR(npu, "Cannot create PHB slot\n");
>
>
>No need in this one - pci_slot_alloc() prints errors itself.

Will remove.

>> +		return;
>> +	}
>> +
>> +	/* Elementary functions */
>> +	slot->ops.creset		= npu3_reset;
>> +	slot->ops.freset		= npu3_freset;
>> +	slot->ops.hreset		= npu3_reset;
>> +	slot->ops.get_link_state	= npu3_get_link_state;
>> +	slot->ops.get_power_state	= npu3_get_power_state;
>> +}
>> +
>> +static void npu3_create_phb(struct npu3 *npu)
>> +{
>> +	struct phb *phb = &npu->nvlink.phb;
>> +
>> +	phb->phb_type = phb_type_npu_v3;
>> +	phb->ops = &npu_ops;
>> +	phb->dt_node = dt_new_addr(dt_root, "pciex", npu->regs[0]);
>> +	assert(phb->dt_node);
>> +
>> +	list_head_init(&phb->virt_devices);
>> +	pci_register_phb(phb, OPAL_DYNAMIC_PHB_ID);
>> +	npu3_create_phb_slot(npu);
>> +	npu3_ioda_reset(phb, true);
>> +}
>> +
>> +static void npu3_dev_init_hw(struct npu3_dev *dev)
>> +{
>> +	struct npu3 *npu = dev->npu;
>> +	uint64_t reg, val;
>> +
>> +	reg = NPU3_RELAXED_CFG2(dev->index);
>> +	val = npu3_read(npu, reg);
>> +	val |= NPU3_RELAXED_CFG2_CMD_CL_DMA_W |
>> +	       NPU3_RELAXED_CFG2_CMD_CL_DMA_W_HP |
>> +	       NPU3_RELAXED_CFG2_CMD_CL_DMA_INJ |
>> +	       NPU3_RELAXED_CFG2_CMD_PR_DMA_INJ |
>> +	       NPU3_RELAXED_CFG2_CMD_DMA_PR_W |
>> +	       NPU3_RELAXED_CFG2_CMD_CL_RD_NC_F0 |
>> +	       NPU3_RELAXED_CFG2_SRC_RDENA(0);
>> +	npu3_write(npu, reg, val);
>> +
>> +	reg = NPU3_NTL_MISC_CFG2(dev->index);
>> +	val = npu3_read(npu, reg);
>> +	val |= NPU3_NTL_MISC_CFG2_BRICK_ENABLE |
>> +	       NPU3_NTL_MISC_CFG2_NDL_TX_PARITY_ENA |
>> +	       NPU3_NTL_MISC_CFG2_NDL_PRI_PARITY_ENA |
>> +	       NPU3_NTL_MISC_CFG2_RCV_CREDIT_OVERFLOW_ENA;
>> +	npu3_write(npu, reg, val);
>> +}
>> +
>> +static void npu3_init_hw(struct npu3 *npu)
>> +{
>> +	struct npu3_dev *dev;
>> +	uint64_t reg, val;
>> +
>> +	reg = NPU3_XTS_CFG;
>> +	val = npu3_read(npu, reg);
>> +	val |= NPU3_XTS_CFG_MMIOSD | NPU3_XTS_CFG_TRY_ATR_RO;
>> +	npu3_write(npu, reg, val);
>> +
>> +	reg = NPU3_XTS_CFG2;
>> +	val = npu3_read(npu, reg);
>
>Either npu3_read() does not need to store to @val or...
>
>
>> +	val = NPU3_XTS_CFG2_NO_FLUSH_ENA;
>
>... something is missing here (NPU2 does
>"val | NPU2_XTS_CFG2_NO_FLUSH_ENA").

Doh! Should be

  val |= NPU3_XTS_CFG2_NO_FLUSH_ENA;
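
That keeps the preceding npu3_read() meaningful, so the full
read-modify-write becomes:

  reg = NPU3_XTS_CFG2;
  val = npu3_read(npu, reg);
  val |= NPU3_XTS_CFG2_NO_FLUSH_ENA;
  npu3_write(npu, reg, val);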

>> +	npu3_write(npu, reg, val);
>> +
>> +	reg = NPU3_RELAXED_SRC(0);
>> +	val = NPU3_RELAXED_SRC_MASK_NPU;
>> +	npu3_write(npu, reg, val);
>> +
>> +	npu3_for_each_nvlink_dev(dev, npu)
>> +		npu3_dev_init_hw(dev);
>> +}
>> +
>> +/* PCI command register (BAR enable/disable) */
>> +static int64_t npu3_cfg_cmd(void *pvd,
>> +			    struct pci_cfg_reg_filter *pcrf __unused,
>> +			    uint32_t offset, uint32_t size,
>> +			    uint32_t *data, bool write)
>> +{
>> +	struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data;
>> +
>> +	if (!write)
>> +		return OPAL_PARTIAL;
>> +
>> +	if (offset != PCI_CFG_CMD)
>> +		return OPAL_PARAMETER;
>> +
>> +	if (size != 1 && size != 2 && size != 4)
>> +		return OPAL_PARAMETER;
>> +
>> +	npu3_dev_enable_bars(dev, !!(*data & PCI_CFG_CMD_MEM_EN));
>> +
>> +	return OPAL_PARTIAL;
>> +}
>> +
>> +static int64_t npu3_cfg_bar_write(struct npu3_bar *bar, uint64_t mask,
>> +				  uint32_t data)
>> +{
>> +	if (data != 0xffffffff)
>> +		return OPAL_HARDWARE;
>> +
>> +	/* Return BAR size on next read */
>> +	bar->trap |= mask;
>> +
>> +	return OPAL_SUCCESS;
>> +}
>> +
>> +static int64_t npu3_cfg_bar_read(struct npu3_bar *bar, uint64_t mask,
>> +				 uint32_t *data)
>> +{
>> +	if (!(bar->trap & mask))
>> +		return OPAL_PARTIAL;
>> +
>> +	*data = GETFIELD(mask, bar->size);
>> +	bar->trap &= ~mask;
>> +
>> +	return OPAL_SUCCESS;
>> +}
>> +
>> +/* PCI BAR registers (NTL/GENID) */
>> +static int64_t npu3_cfg_bar(void *pvd __unused,
>> +			    struct pci_cfg_reg_filter *pcrf,
>> +			    uint32_t offset, uint32_t size, uint32_t *data,
>> +			    bool write)
>> +{
>> +	struct npu3_bar *bar = (struct npu3_bar *)pcrf->data;
>> +	uint64_t mask;
>> +
>> +	if (size != 4)
>> +		return OPAL_PARAMETER;
>> +
>> +	if (offset == pcrf->start)
>> +		mask = 0xffffffff;
>> +	else if (offset == pcrf->start + 4)
>> +		mask = 0xffffffffull << 32;
>> +	else
>> +		return OPAL_PARAMETER;
>> +
>> +	if (write)
>> +		return npu3_cfg_bar_write(bar, mask, *data);
>> +
>> +	return npu3_cfg_bar_read(bar, mask, data);
>> +}
>> +
>> +/* PCI control register */
>> +static int64_t npu3_cfg_devctl(void *pvd,
>> +			       struct pci_cfg_reg_filter *pcrf __unused,
>> +			       uint32_t offset, uint32_t size,
>> +			       uint32_t *data, bool write)
>> +{
>> +	struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data;
>> +
>> +	if (!write)
>> +		return OPAL_HARDWARE;
>> +
>> +	if (size != 2 || offset & 1) {
>> +		NPU3DEVERR(dev, "Unsupported write to pcie control register\n");
>> +		return OPAL_PARAMETER;
>> +	}
>> +
>> +	if (*data & PCICAP_EXP_DEVCTL_FUNC_RESET)
>> +		if (!npu3_dev_reset(dev))
>> +			purge_l2_l3_caches();
>> +
>> +	return OPAL_PARTIAL;
>> +}
>> +
>> +static uint32_t npu3_cfg_populate_pcie_cap(struct npu3_dev *dev, uint32_t start,
>> +					   uint32_t prev_cap)
>> +{
>> +	struct pci_virt_device *pvd = dev->nvlink.pvd;
>> +	uint32_t val;
>> +
>> +	/* Add capability list */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
>> +	PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_EXP);
>> +
>> +	/* 0x00 - ID/PCIE capability */
>> +	val = PCI_CFG_CAP_ID_EXP;
>> +	val |= 0x2 << 16 | PCIE_TYPE_ENDPOINT << 20;
>> +	PCI_VIRT_CFG_INIT_RO(pvd, start, 4, val);
>> +
>> +	/* 0x04 - Device capability */
>> +	val = PCIE_MPSS_128 |
>> +	      PCIE_PHANTOM_NONE << 3 |
>> +	      PCIE_L0SL_MAX_NO_LIMIT << 6 |
>> +	      PCIE_L1L_MAX_NO_LIMIT << 9 |
>> +	      PCICAP_EXP_DEVCAP_FUNC_RESET;
>> +	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_DEVCAP, 4, val);
>> +
>> +	pci_virt_add_filter(pvd, start + PCICAP_EXP_DEVCTL, 2,
>> +			    PCI_REG_FLAG_WRITE,
>> +			    npu3_cfg_devctl, NULL);
>> +
>> +	/* 0x08 - Device control and status */
>> +	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DEVCTL, 4, 0x00002810,
>> +			  0xffff0000, 0x000f0000);
>> +
>> +	/* 0x0c - Link capability */
>> +	val = PCIE_LSPEED_VECBIT_2 | PCIE_LWIDTH_1X << 4;
>> +	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP, 4, val);
>> +
>> +	/* 0x10 - Link control and status */
>> +	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL, 4, 0x00130000,
>> +			  0xfffff000, 0xc0000000);
>> +
>> +	/* 0x14 - Slot capability */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCAP, 4, 0x00000000);
>> +
>> +	/* 0x18 - Slot control and status */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SLOTCTL, 4, 0x00000000);
>> +
>> +	/* 0x1c - Root control and capability */
>> +	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RC, 4, 0x00000000,
>> +			  0xffffffe0, 0x00000000);
>> +
>> +	/* 0x20 - Root status */
>> +	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_RSTAT, 4, 0x00000000,
>> +			  0xffffffff, 0x00010000);
>> +
>> +	/* 0x24 - Device capability 2 */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, start + PCIECAP_EXP_DCAP2, 4, 0x00000000);
>> +
>> +	/* 0x28 - Device Control and status 2 */
>> +	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_DCTL2, 4, 0x00070000,
>> +			  0xffff0000, 0x00000000);
>> +
>> +	/* 0x2c - Link capability 2 */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_LCAP2, 4, 0x00000007);
>> +
>> +	/* 0x30 - Link control and status 2 */
>> +	PCI_VIRT_CFG_INIT(pvd, start + PCICAP_EXP_LCTL2, 4, 0x00000003,
>> +			  0xffff0000, 0x00200000);
>> +
>> +	/* 0x34 - Slot capability 2 */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCAP2, 4, 0x00000000);
>> +
>> +	/* 0x38 - Slot control and status 2 */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, start + PCICAP_EXP_SCTL2, 4, 0x00000000);
>> +
>> +	return start + PCICAP_EXP_SCTL2 + 8;
>> +}
>> +
>> +static int64_t npu3_dev_procedure_write(struct npu3_dev *dev, uint32_t offset,
>> +					uint32_t data)
>> +{
>> +	switch (offset) {
>> +	case 0:
>> +		NPU3DEVINF(dev, "Ignoring write to status register\n");
>> +		break;
>> +	case 4:
>> +		npu3_dev_procedure_init(dev, data);
>> +		break;
>> +	default:
>> +		return OPAL_PARAMETER;
>> +	}
>> +
>> +	return OPAL_SUCCESS;
>> +}
>> +
>> +static int64_t npu3_dev_procedure_read(struct npu3_dev *dev, uint32_t offset,
>> +				       uint32_t *data)
>> +{
>> +	switch (offset) {
>> +	case 0:
>> +		*data = npu3_dev_procedure_status(dev);
>> +		break;
>> +	case 4:
>> +		*data = dev->proc.number;
>> +		break;
>> +	default:
>> +		*data = 0;
>> +		return OPAL_PARAMETER;
>> +	}
>> +
>> +	return OPAL_SUCCESS;
>> +}
>> +
>> +/* Hardware procedure control/status registers */
>> +static int64_t npu3_dev_procedure(void *pvd, struct pci_cfg_reg_filter *pcrf,
>> +				  uint32_t offset, uint32_t size,
>> +				  uint32_t *data, bool write)
>> +{
>> +	struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data;
>> +
>> +	if (size != 4)
>> +		return OPAL_PARAMETER;
>> +
>> +	offset -= pcrf->start;
>> +
>> +	if (write)
>> +		return npu3_dev_procedure_write(dev, offset, *data);
>> +
>> +	return npu3_dev_procedure_read(dev, offset, data);
>> +}
>> +
>> +/* PPE SRAM access is indirect via CSAR/CSDR */
>> +static void npu3_dev_ppe_sram_sel(struct npu3_dev *dev, uint32_t reg)
>> +{
>> +	uint64_t val;
>> +
>> +	val = SETFIELD(OB_PPE_CSAR_SRAM_ADDR, 0ull, reg);
>> +	xscom_write(dev->npu->chip_id, OB_PPE_CSAR(dev->ob_chiplet), val);
>> +}
>> +
>> +static void npu3_dev_ppe_sram_write(struct npu3_dev *dev, uint32_t reg,
>> +				    uint64_t val)
>> +{
>> +	npu3_dev_ppe_sram_sel(dev, reg);
>> +	xscom_write(dev->npu->chip_id, OB_PPE_CSDR(dev->ob_chiplet), val);
>> +}
>> +
>> +static uint64_t npu3_dev_ppe_sram_read(struct npu3_dev *dev, uint32_t reg)
>> +{
>> +	uint64_t val;
>> +
>> +	npu3_dev_ppe_sram_sel(dev, reg);
>> +	xscom_read(dev->npu->chip_id, OB_PPE_CSDR(dev->ob_chiplet), &val);
>> +
>> +	return val;
>> +}
>> +
>> +/* Software-initiated autonomous link training (SALT) */
>> +static int64_t npu3_dev_salt(void *pvd, struct pci_cfg_reg_filter *pcrf,
>> +			     uint32_t offset, uint32_t size, uint32_t *data,
>> +			     bool write)
>> +{
>> +	struct npu3_dev *dev = ((struct pci_virt_device *)pvd)->data;
>> +	unsigned long timeout;
>> +	uint32_t cmd_reg;
>> +	uint64_t val;
>> +
>> +	if (size != 4 || offset != pcrf->start)
>> +		return OPAL_PARAMETER;
>> +
>> +	/* The config register before this one holds CMD_REG */
>> +	pci_virt_cfg_read_raw(pvd, PCI_VIRT_CFG_NORMAL, pcrf->start - 4,
>> +			      4, &cmd_reg);
>> +
>> +	/* Check for another command in progress */
>> +	val = npu3_dev_ppe_sram_read(dev, OB_PPE_SALT_CMD);
>> +	if (GETFIELD(OB_PPE_SALT_CMD_READY, val))
>> +		return OPAL_BUSY;
>> +
>> +	val = OB_PPE_SALT_CMD_READY;
>> +	val = SETFIELD(OB_PPE_SALT_CMD_RW, val, write);
>> +	val = SETFIELD(OB_PPE_SALT_CMD_LINKNUM, val, npu3_chip_dev_index(dev));
>> +	val = SETFIELD(OB_PPE_SALT_CMD_REG, val, cmd_reg);
>> +	if (write)
>> +		val = SETFIELD(OB_PPE_SALT_CMD_DATA, val, *data);
>> +
>> +	npu3_dev_ppe_sram_write(dev, OB_PPE_SALT_CMD, val);
>> +
>> +	/* Wait for the go bit to clear */
>> +	timeout = mftb() + msecs_to_tb(1000);
>> +
>> +	while (GETFIELD(OB_PPE_SALT_CMD_READY, val)) {
>> +		if (tb_compare(mftb(), timeout) == TB_AAFTERB) {
>> +			NPU3DEVINF(dev, "SALT_CMD 0x%x: timeout\n", cmd_reg);
>> +			return OPAL_BUSY;
>> +		}
>> +
>> +		val = npu3_dev_ppe_sram_read(dev, OB_PPE_SALT_CMD);
>> +	}
>> +
>> +	if (GETFIELD(OB_PPE_SALT_CMD_ERR, val))
>> +		NPU3DEVINF(dev, "SALT_CMD 0x%x: error\n", cmd_reg);
>> +
>> +	if (!write)
>> +		*data = GETFIELD(OB_PPE_SALT_CMD_DATA, val);
>> +
>> +	return OPAL_SUCCESS;
>> +}
>> +
>> +#define VENDOR_CAP_LEN		0x1c
>> +#define VENDOR_CAP_VERSION	0x02
>> +
>> +static uint32_t npu3_cfg_populate_vendor_cap(struct npu3_dev *dev,
>> +					     uint32_t start, uint32_t prev_cap)
>> +{
>> +	struct pci_virt_device *pvd = dev->nvlink.pvd;
>> +
>> +	/* Capabilities list */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, prev_cap, 1, start);
>> +	PCI_VIRT_CFG_INIT_RO(pvd, start, 1, PCI_CFG_CAP_ID_VENDOR);
>> +
>> +	/* Length and version */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, start + 2, 1, VENDOR_CAP_LEN);
>> +	PCI_VIRT_CFG_INIT_RO(pvd, start + 3, 1, VENDOR_CAP_VERSION);
>> +
>> +	/*
>> +	 * Defaults when the trap can't handle the read/write (eg. due to
>> +	 * reading/writing less than 4 bytes).
>> +	 */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, start + 4, 4, 0);
>> +	PCI_VIRT_CFG_INIT_RO(pvd, start + 8, 4, 0);
>> +
>> +	/* PHY procedure trap */
>> +	pci_virt_add_filter(pvd, start + 4, 8,
>> +			    PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
>> +			    npu3_dev_procedure, NULL);
>> +
>> +	/* Link index */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, start + 0xc, 1, npu3_chip_dev_index(dev));
>> +
>> +	/* SALT registers */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, start + 0x10, 4, 0);
>> +	PCI_VIRT_CFG_INIT_RO(pvd, start + 0x14, 4, 0);
>> +
>> +	pci_virt_add_filter(pvd, start + 0x14, 4,
>> +			    PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
>> +			    npu3_dev_salt, NULL);
>> +
>> +	return start + VENDOR_CAP_LEN;
>> +}
>> +
>> +static void npu3_cfg_populate(struct npu3_dev *dev)
>> +{
>> +	struct pci_virt_device *pvd = dev->nvlink.pvd;
>> +	uint64_t addr;
>> +	uint32_t pos;
>> +
>> +	/* 0x00 - Vendor/Device ID */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_VENDOR_ID, 4, 0x04ea1014);
>> +
>> +	/* 0x04 - Command/Status */
>> +	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_CMD, 4, 0x00100000, 0xffb802b8,
>> +			  0xf9000000);
>> +
>> +	pci_virt_add_filter(pvd, PCI_CFG_CMD, 1, PCI_REG_FLAG_WRITE,
>> +			    npu3_cfg_cmd, NULL);
>> +
>> +	/* 0x08 - Rev/Class/Cache */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_REV_ID, 4, 0x06800102);
>> +
>> +	/* 0x0c - CLS/Latency Timer/Header/BIST */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CACHE_LINE_SIZE, 4, 0x00800000);
>> +
>> +	/* 0x10/14 - NTL BAR */
>> +	addr = SETFIELD(0xf, dev->ntl_bar.addr,
>> +			PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64);
>> +	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR0, 4, lo32(addr), 0xf, 0);
>> +	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR1, 4, hi32(addr), 0, 0);
>> +
>> +	pci_virt_add_filter(pvd, PCI_CFG_BAR0, 8,
>> +			    PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
>> +			    npu3_cfg_bar, &dev->ntl_bar);
>> +
>> +	/* 0x18/1c - GENID BAR */
>> +	addr = SETFIELD(0xf, dev->genid_bar.addr,
>> +			PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64);
>> +	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR2, 4, lo32(addr), 0xf, 0);
>> +	PCI_VIRT_CFG_INIT(pvd, PCI_CFG_BAR3, 4, hi32(addr), 0, 0);
>> +
>> +	pci_virt_add_filter(pvd, PCI_CFG_BAR2, 8,
>> +			    PCI_REG_FLAG_READ | PCI_REG_FLAG_WRITE,
>> +			    npu3_cfg_bar, &dev->genid_bar);
>> +
>> +	/* 0x20/0x24 - BARs, disabled */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR4, 4, 0x00000000);
>> +	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_BAR5, 4, 0x00000000);
>> +
>> +	/* 0x28 - Cardbus CIS pointer */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CARDBUS_CIS, 4, 0x00000000);
>> +
>> +	/* 0x2c - Subsystem ID */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000);
>> +
>> +	/* 0x30 - ROM BAR, zero sized */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_ROMBAR, 4, 0xffffffff);
>> +
>> +	/* 0x34 - PCI Capability */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_CAP, 4, 0x00000000);
>> +
>> +	/* 0x38 - Reserved */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, 0x38, 4, 0x00000000);
>> +
>> +	/* 0x3c - INT line/pin/Minimal grant/Maximal latency */
>> +	PCI_VIRT_CFG_INIT_RO(pvd, PCI_CFG_INT_LINE, 4, 0x00000100); /* INT A */
>> +
>> +	/* PCIE and vendor specific capability */
>> +	pos = npu3_cfg_populate_pcie_cap(dev, 0x40, PCI_CFG_CAP);
>> +	pos = npu3_cfg_populate_vendor_cap(dev, pos, 0x41);
>> +	PCI_VIRT_CFG_INIT_RO(pvd, pos + 1, 1, 0);
>> +}
>> +
>> +static void npu3_dev_create_pvd(struct npu3_dev *dev)
>> +{
>> +	struct npu3 *npu = dev->npu;
>> +	struct phb *phb = &npu->nvlink.phb;
>> +
>> +	dev->nvlink.pvd = pci_virt_add_device(phb, dev->index, 0x100, dev);
>> +	if (!dev->nvlink.pvd)
>> +		return;
>> +
>> +	phb->scan_map |= 0x1 << GETFIELD(0xf8, dev->nvlink.pvd->bdfn);
>> +	npu3_cfg_populate(dev);
>> +}
>> +
>> +static void npu3_dt_add_mmio_window(struct npu3 *npu)
>> +{
>> +	struct dt_node *dn = npu->nvlink.phb.dt_node;
>> +	uint32_t ntl0_index = npu->index * NPU3_LINKS_PER_NPU;
>> +	uint64_t addr, size, win[2];
>> +
>> +	/* Device MMIO window (NTL/GENID regs only) */
>> +	phys_map_get(npu->chip_id, NPU_NTL, ntl0_index, &win[0], NULL);
>> +	phys_map_get(npu->chip_id, NPU_GENID, npu->index, &addr, &size);
>> +	win[1] = addr + size - win[0];
>> +
>> +	dt_add_property(dn, "ibm,mmio-window", win, sizeof(win));
>> +	dt_add_property_cells(dn, "ranges", 0x02000000,
>> +			      hi32(win[0]), lo32(win[0]),
>> +			      hi32(win[0]), lo32(win[0]),
>> +			      hi32(win[1]), lo32(win[1]));
>> +}
>> +
>> +/* NDL No-Stall Event level */
>> +static uint32_t npu3_dev_interrupt_level(struct npu3_dev *dev)
>> +{
>> +	const uint32_t level[12] = {  1,  3,  5,  7,  9, 11,
>> +				     43, 45, 47, 49, 51, 53 };
>> +
>> +	return level[npu3_chip_dev_index(dev)];
>> +}
>> +
>> +static void npu3_dt_add_interrupts(struct npu3 *npu)
>> +{
>> +	struct dt_node *dn = npu->nvlink.phb.dt_node;
>> +	uint32_t *map, icsp, i = 0;
>> +	struct npu3_dev *dev;
>> +	size_t map_size = 0;
>> +
>> +	npu3_for_each_nvlink_dev(dev, npu)
>> +		map_size += sizeof(*map) * 7;
>> +
>> +	if (!map_size)
>> +		return;
>> +
>> +	icsp = get_ics_phandle();
>> +	map = zalloc(map_size);
>> +	assert(map);
>> +
>> +	npu3_for_each_nvlink_dev(dev, npu) {
>> +		map[i] = dev->nvlink.pvd->bdfn << 8;
>> +		map[i + 3] = 1;		/* INT A */
>> +		map[i + 4] = icsp;	/* interrupt-parent */
>> +		map[i + 5] = npu->irq_base + npu3_dev_interrupt_level(dev);
>> +		map[i + 6] = 0;		/* 0 = EDGE, 1 = LEVEL */
>> +		i += 7;
>> +	}
>> +
>> +	dt_add_property_cells(dn, "interrupt-parent", icsp);
>> +	dt_add_property(dn, "interrupt-map", map, map_size);
>> +	dt_add_property_cells(dn, "interrupt-map-mask", 0xff00, 0x0, 0x0, 0x7);
>> +
>> +	free(map);
>> +}
>> +
>> +/* Populate PCI root device node */
>> +static void npu3_dt_add_props(struct npu3 *npu)
>> +{
>> +	struct dt_node *dn = npu->nvlink.phb.dt_node;
>> +
>> +	dt_add_property_cells(dn, "#address-cells", 3);
>> +	dt_add_property_cells(dn, "#size-cells", 2);
>> +	dt_add_property_cells(dn, "#interrupt-cells", 1);
>> +	dt_add_property_cells(dn, "bus-range", 0, 0xff);
>> +	dt_add_property_cells(dn, "clock-frequency", 0x200, 0);
>> +
>> +	dt_add_property_strings(dn, "device_type", "pciex");
>> +	/* To the OS, npu2 and npu3 are both ibm,ioda2-npu2-phb */
>> +	dt_add_property_strings(dn, "compatible",
>> +				"ibm,power9-npu-pciex",
>> +				"ibm,ioda2-npu2-phb");
>> +
>> +	dt_add_property_cells(dn, "ibm,phb-index",
>> +			      dt_prop_get_u32(npu->dt_node, "ibm,phb-index"));
>> +	dt_add_property_cells(dn, "ibm,phb-diag-data-size", 0);
>> +	dt_add_property_cells(dn, "ibm,opal-num-pes", NPU3_MAX_PE_NUM);
>> +	dt_add_property_cells(dn, "ibm,opal-reserved-pe", NPU3_RESERVED_PE_NUM);
>> +	dt_add_property_cells(dn, "ibm,supported-tce-sizes",
>> +			      12, /* 4K */
>> +			      16, /* 64K */
>> +			      24, /* 16M */
>> +			      28); /* 256M */
>
>Still only these 4 sizes, no 2M/1G?

Yes, according to the P9P NPU Workbook.
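
(The cells are log2 of the supported sizes, so 2M/1G would be 21/30; the
workbook just doesn't list those.)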

>> +
>> +	dt_add_property_cells(dn, "ibm,chip-id", npu->chip_id);
>> +	dt_add_property_cells(dn, "ibm,npu-index", npu->index);
>> +	dt_add_property_cells(dn, "ibm,npcq", npu->dt_node->phandle);
>> +	dt_add_property_cells(dn, "ibm,xscom-base", npu->xscom_base);
>> +	dt_add_property_cells(dn, "ibm,links", NPU3_LINKS_PER_NPU);
>> +
>> +	dt_add_property(dn, "reg", npu->regs, sizeof(npu->regs));
>> +	dt_add_property_u64s(dn, "ibm,mmio-atsd",
>> +			     npu->regs[0] + NPU3_XTS_ATSD_LAUNCH(0),
>> +			     npu->regs[0] + NPU3_XTS_ATSD_LAUNCH(1),
>> +			     npu->regs[0] + NPU3_XTS_ATSD_LAUNCH(2),
>> +			     npu->regs[0] + NPU3_XTS_ATSD_LAUNCH(3),
>> +			     npu->regs[0] + NPU3_XTS_ATSD_LAUNCH(4),
>> +			     npu->regs[0] + NPU3_XTS_ATSD_LAUNCH(5),
>> +			     npu->regs[0] + NPU3_XTS_ATSD_LAUNCH(6),
>> +			     npu->regs[0] + NPU3_XTS_ATSD_LAUNCH(7));
>> +
>> +	npu3_dt_add_mmio_window(npu);
>> +	npu3_dt_add_interrupts(npu);
>> +}
>> +
>> +void npu3_init_nvlink(struct npu3 *npu)
>> +{
>> +	struct npu3_dev *dev;
>> +
>> +	if (!npu3_next_dev(npu, NULL, NPU3_DEV_TYPE_NVLINK))
>> +		return;
>> +
>> +	npu3_init_hw(npu);
>> +	npu3_create_phb(npu);
>> +
>> +	npu3_for_each_nvlink_dev(dev, npu)
>> +		npu3_dev_create_pvd(dev);
>> +
>> +	npu3_dt_add_props(npu);
>> +
>> +	/* TODO: Sort out if/why we still can't enable this */
>> +	disable_fast_reboot("NVLink device enabled");
>> +}
>> +
>> +static int64_t npu3_init_context_pid(struct npu3 *npu, uint32_t index,
>> +				     uint64_t msr)
>> +{
>> +	uint64_t map, old_map;
>> +
>> +	/* Unfiltered XTS mode; index is lparshort */
>> +	map = SETFIELD(NPU3_XTS_PID_MAP_LPARSHORT, 0ull, index);
>> +
>> +	/* Enable this mapping for both real and virtual addresses */
>> +	map |= NPU3_XTS_PID_MAP_VALID_ATRGPA0 | NPU3_XTS_PID_MAP_VALID_ATRGPA1;
>> +
>> +	/* Enable TLBIE/MMIOSD forwarding for this entry */
>> +	map |= NPU3_XTS_PID_MAP_VALID_ATSD;
>> +
>> +	/* Set the relevant MSR bits */
>> +	if (msr & MSR_DR)
>> +		map |= NPU3_XTS_PID_MAP_MSR_DR;
>> +
>> +	if (msr & MSR_HV)
>> +		map |= NPU3_XTS_PID_MAP_MSR_HV;
>> +
>> +	if (msr & MSR_PR)
>> +		map |= NPU3_XTS_PID_MAP_MSR_PR;
>> +
>> +	/* We don't support anything other than 64-bit so hardcode it here */
>> +	map |= NPU3_XTS_PID_MAP_MSR_SF;
>> +
>> +	old_map = npu3_read(npu, NPU3_XTS_PID_MAP(index));
>> +
>> +	/* Error out if this entry is already set with different msr bits */
>> +	if (old_map && GETFIELD(NPU3_XTS_PID_MAP_MSR, old_map) !=
>> +		       GETFIELD(NPU3_XTS_PID_MAP_MSR, map)) {
>> +		NPU3ERR(npu, "%s: Unexpected MSR value\n", __func__);
>> +		return OPAL_PARAMETER;
>> +	}
>> +
>> +	if (!old_map) {
>> +		NPU3DBG(npu, "XTS_PID_MAP[%03d] = 0x%08llx\n", index, map);
>> +		npu3_write(npu, NPU3_XTS_PID_MAP(index), map);
>> +	}
>> +
>> +	npu->nvlink.context_refcount[index]++;
>> +
>> +	return OPAL_SUCCESS;
>> +}
>> +
>> +#define NPU3_VALID_ATS_MSR_BITS (MSR_DR | MSR_HV | MSR_PR | MSR_SF)
>> +
>> +/*
>> + * Allocate a context ID and initialize the tables with the relevant
>> + * information. Returns the ID or error if one couldn't be allocated.
>> + */
>> +int64_t npu3_init_context(struct phb *phb, uint64_t msr, uint64_t bdf)
>> +{
>> +	struct npu3 *npu = npu3_phb_to_npu(phb);
>> +	uint32_t lparshort, i;
>> +	uint64_t map;
>> +	int64_t rc;
>> +
>> +	/*
>> +	 * MSR bits should be masked by the caller to allow for future
>> +	 * expansion if required.
>> +	 */
>> +	if (msr & ~NPU3_VALID_ATS_MSR_BITS)
>> +		return OPAL_UNSUPPORTED;
>> +
>> +	lock(&npu->lock);
>> +
>> +	for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++) {
>> +		map = npu3_read(npu, NPU3_XTS_BDF_MAP(i));
>> +
>> +		if (map && GETFIELD(NPU3_XTS_BDF_MAP_BDF, map) == bdf)
>> +			break;
>> +	}
>> +
>> +	if (i == NPU3_XTS_BDF_MAP_MAX) {
>> +		NPU3ERR(npu, "LPARID not associated with any GPU\n");
>> +		rc = OPAL_PARAMETER;
>> +		goto out;
>> +	}
>> +
>> +	lparshort = GETFIELD(NPU3_XTS_BDF_MAP_LPARSHORT, map);
>> +	NPU3DBG(npu, "Found LPARSHORT 0x%x for bdf %02llx:%02llx.%llx\n",
>> +		lparshort, bdf >> 8 & 0xff, bdf >> 3 & 0x1f, bdf & 0x7);
>> +
>> +	rc = npu3_init_context_pid(npu, lparshort, msr);
>> +	if (rc)
>> +		goto out;
>> +
>> +	if (!(map & NPU3_XTS_BDF_MAP_VALID)) {
>> +		map |= NPU3_XTS_BDF_MAP_VALID;
>> +		npu3_write(npu, NPU3_XTS_BDF_MAP(i), map);
>> +	}
>> +
>> +	rc = lparshort;
>> +
>> +out:
>> +	unlock(&npu->lock);
>> +	return rc;
>> +}
>> +
>> +static int64_t npu3_destroy_context_pid(struct npu3 *npu, uint32_t index)
>> +{
>> +	if (!npu->nvlink.context_refcount[index])
>> +		return OPAL_PARAMETER;
>> +
>> +	/* Only destroy when refcount hits 0 */
>> +	if (--npu->nvlink.context_refcount[index])
>> +		return OPAL_PARTIAL;
>> +
>> +	NPU3DBG(npu, "XTS_PID_MAP[%03d] = 0 (destroy)\n", index);
>> +	npu3_write(npu, NPU3_XTS_PID_MAP(index), 0ull);
>> +
>> +	return OPAL_SUCCESS;
>> +}
>> +
>> +int64_t npu3_destroy_context(struct phb *phb, uint64_t bdf)
>> +{
>> +	struct npu3 *npu = npu3_phb_to_npu(phb);
>> +	uint32_t lparshort, i;
>> +	int64_t map, rc;
>> +
>> +	lock(&npu->lock);
>> +
>> +	for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++) {
>> +		map = npu3_read(npu, NPU3_XTS_BDF_MAP(i));
>> +
>> +		if (map && GETFIELD(NPU3_XTS_BDF_MAP_BDF, map) == bdf)
>> +			break;
>> +	}
>> +
>> +	if (i == NPU3_XTS_BDF_MAP_MAX) {
>> +		NPU3ERR(npu, "LPARID not associated with any GPU\n");
>> +		rc = OPAL_PARAMETER;
>> +		goto out;
>> +	}
>> +
>> +	lparshort = GETFIELD(NPU3_XTS_BDF_MAP_LPARSHORT, map);
>> +	rc = npu3_destroy_context_pid(npu, lparshort);
>> +
>> +out:
>> +	unlock(&npu->lock);
>> +	return rc;
>> +}
>> +
>> +/* Map the given virtual bdf to lparid with given lpcr */
>> +int64_t npu3_map_lpar(struct phb *phb, uint64_t bdf, uint64_t lparid,
>> +		      uint64_t lpcr)
>> +{
>> +	struct npu3 *npu = npu3_phb_to_npu(phb);
>> +	struct npu3_dev *dev;
>> +	int64_t rc = OPAL_SUCCESS;
>> +	uint64_t map, val;
>> +	uint32_t i;
>> +
>> +	/*
>> +	 * The LPCR bits are only required for hash based ATS, which we don't
>> +	 * currently support, but may need to in the future.
>> +	 */
>> +	if (lpcr)
>> +		return OPAL_UNSUPPORTED;
>> +
>> +	lock(&npu->lock);
>> +
>> +	/* Update the entry if it already exists */
>> +	for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++) {
>> +		map = npu3_read(npu, NPU3_XTS_BDF_MAP(i));
>> +
>> +		if (map && GETFIELD(NPU3_XTS_BDF_MAP_BDF, map) == bdf)
>> +			break;
>> +	}
>> +
>> +	if (i == NPU3_XTS_BDF_MAP_MAX) {
>> +		/* No existing mapping found, find space for a new one */
>> +		for (i = 0; i < NPU3_XTS_BDF_MAP_MAX; i++)
>> +			if (!npu3_read(npu, NPU3_XTS_BDF_MAP(i)))
>> +				break;
>> +	}
>> +
>> +	if (i == NPU3_XTS_BDF_MAP_MAX) {
>> +		NPU3ERR(npu, "No free XTS_BDF[] entry\n");
>> +		rc = OPAL_RESOURCE;
>> +		goto out;
>> +	}
>> +
>> +	map = NPU3_XTS_BDF_MAP_UNFILT;
>> +	map = SETFIELD(NPU3_XTS_BDF_MAP_BDF, map, bdf);
>> +	map = SETFIELD(NPU3_XTS_BDF_MAP_LPARID, map, lparid);
>> +	map = SETFIELD(NPU3_XTS_BDF_MAP_LPARSHORT, map, i);
>> +
>> +	/* We only support radix at the moment */
>> +	map = SETFIELD(NPU3_XTS_BDF_MAP_XLAT, map, 0x3);
>> +
>> +	/* Find a link on which to send ATSDs for this device */
>> +	npu3_for_each_nvlink_dev(dev, npu)
>> +		if (dev->nvlink.gpu->bdfn == bdf)
>> +			break;
>> +
>> +	if (!dev || dev->nvlink.gpu->bdfn != bdf) {
>> +		NPU3ERR(npu, "Can't find a link for bdf %02llx:%02llx.%llx\n",
>> +			bdf >> 8 & 0xff, bdf >> 3 & 0x1f, bdf & 0x7);
>> +		rc = OPAL_PARAMETER;
>> +		goto out;
>> +	}
>> +
>> +	map = SETFIELD(NPU3_XTS_BDF_MAP_BRICK, map, dev->index);
>> +
>> +	NPU3DBG(npu, "XTS_BDF_MAP[%03d] = 0x%08llx\n", i, map);
>> +	npu3_write(npu, NPU3_XTS_BDF_MAP(i), map);
>> +
>> +	/* We need to allocate an ATSD per link */
>> +	val = SETFIELD(NPU3_XTS_ATSD_HYP_LPARID, 0ull, lparid);
>> +	if (!lparid)
>> +		val |= NPU3_XTS_ATSD_HYP_MSR_HV;
>> +
>> +	npu3_write(npu, NPU3_XTS_ATSD_HYP(dev->index), val);
>> +
>> +out:
>> +	unlock(&npu->lock);
>> +	return rc;
>> +}
>> +
>> +static int64_t npu3_relaxed_order_enable(struct npu3 *npu, uint64_t src)
>> +{
>> +	struct npu3_dev *dev;
>> +	uint32_t i;
>> +
>> +	for (i = 0; i < NPU3_RELAXED_SRC_MAX; i++)
>> +		if (npu3_read(npu, NPU3_RELAXED_SRC(i)) == src)
>> +			return OPAL_SUCCESS; /* Already enabled */
>> +
>> +	/* Find somewhere to write this source */
>> +	for (i = 0; i < NPU3_RELAXED_SRC_MAX; i++)
>> +		if (!npu3_read(npu, NPU3_RELAXED_SRC(i)))
>> +			break;
>> +
>> +	if (i == NPU3_RELAXED_SRC_MAX) {
>> +		NPU3ERR(npu, "Insufficient resources to activate relaxed ordering mode\n");
>> +		return OPAL_RESOURCE;
>> +	}
>> +
>> +	npu3_write(npu, NPU3_RELAXED_SRC(i), src);
>> +
>> +	npu3_for_each_nvlink_dev(dev, npu) {
>> +		uint64_t val = npu3_read(npu, NPU3_RELAXED_CFG2(dev->index));
>> +
>> +		val |= NPU3_RELAXED_CFG2_SRC_WRENA(i) |
>> +		       NPU3_RELAXED_CFG2_SRC_RDENA(i);
>> +		npu3_write(npu, NPU3_RELAXED_CFG2(dev->index), val);
>> +	}
>> +
>> +	return OPAL_SUCCESS;
>> +}
>> +
>> +static void npu3_relaxed_order_disable(struct npu3 *npu, uint64_t src)
>> +{
>> +	struct npu3_dev *dev;
>> +	uint32_t i;
>> +
>> +	for (i = 0; i < NPU3_RELAXED_SRC_MAX; i++)
>> +		if (npu3_read(npu, NPU3_RELAXED_SRC(i)) == src)
>> +			break;
>> +
>> +	if (i == NPU3_RELAXED_SRC_MAX)
>> +		return; /* Already disabled */
>> +
>> +	npu3_for_each_nvlink_dev(dev, npu) {
>> +		uint64_t val = npu3_read(npu, NPU3_RELAXED_CFG2(dev->index));
>> +
>> +		val &= ~NPU3_RELAXED_CFG2_SRC_WRENA(i);
>> +		val &= ~NPU3_RELAXED_CFG2_SRC_RDENA(i);
>> +		npu3_write(npu, NPU3_RELAXED_CFG2(dev->index), val);
>> +	}
>> +
>> +	npu3_write(npu, NPU3_RELAXED_SRC(i), 0ull);
>> +}
>> +
>> +/* Enable or disable relaxed ordering on all nvlinks for a given PEC. */
>> +int64_t npu3_set_relaxed_order(struct phb *phb, uint32_t gcid, int pec,
>> +			       bool enable)
>> +{
>> +	struct npu3 *npu = npu3_phb_to_npu(phb);
>> +	int64_t rc = OPAL_SUCCESS;
>> +	uint64_t src;
>> +
>> +	NPU3INF(npu, "%s relaxed ordering for PEC %d on chip %d\n",
>> +		enable ? "Enabling" : "Disabling",
>> +		pec, gcid);
>> +
>> +	lock(&npu->lock);
>> +
>> +	src = SETFIELD(NPU3_RELAXED_SRC_GRPCHP, 0ull, gcid);
>> +	src = SETFIELD(NPU3_RELAXED_SRC_PEC, src, pec);
>> +	src = SETFIELD(NPU3_RELAXED_SRC_RDSTART, src, 0);
>> +	src = SETFIELD(NPU3_RELAXED_SRC_RDEND, src, 47);
>> +	src = SETFIELD(NPU3_RELAXED_SRC_WRSTART, src, 0);
>> +	src = SETFIELD(NPU3_RELAXED_SRC_WREND, src, 23);
>> +
>> +	if (enable)
>> +		rc = npu3_relaxed_order_enable(npu, src);
>> +	else
>> +		npu3_relaxed_order_disable(npu, src);
>> +
>> +	unlock(&npu->lock);
>> +	return rc;
>> +}
>> diff --git a/hw/npu3.c b/hw/npu3.c
>> new file mode 100644
>> index 000000000000..22ccef2e01aa
>> --- /dev/null
>> +++ b/hw/npu3.c
>> @@ -0,0 +1,554 @@
>> +/* Copyright 2019 IBM Corp.
>> + *
>> + * Licensed under the Apache License, Version 2.0 (the "License");
>> + * you may not use this file except in compliance with the License.
>> + * You may obtain a copy of the License at
>> + *
>> + *	http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
>> + * implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +
>> +#include <io.h>
>> +#include <xscom.h>
>> +#include <npu3.h>
>> +#include <npu3-regs.h>
>> +#include <nvram.h>
>> +#include <interrupts.h>
>> +#include <xive.h>
>> +
>> +#define NPU3LOG(l, npu, fmt, a...) \
>> +	prlog(l, "NPU[%d:%d]: " fmt, (npu)->chip_id, (npu)->index, ##a)
>> +#define NPU3DBG(npu, fmt, a...) NPU3LOG(PR_DEBUG, npu, fmt, ##a)
>> +#define NPU3INF(npu, fmt, a...) NPU3LOG(PR_INFO, npu, fmt, ##a)
>> +#define NPU3ERR(npu, fmt, a...) NPU3LOG(PR_ERR, npu, fmt, ##a)
>> +
>> +#define NPU3DEVLOG(l, dev, fmt, a...)		\
>> +	prlog(l, "NPU[%d:%d:%d]: " fmt,		\
>> +	      (dev)->npu->chip_id,		\
>> +	      (dev)->npu->index,		\
>> +	      (dev)->index, ##a)
>> +#define NPU3DEVDBG(dev, fmt, a...) NPU3DEVLOG(PR_DEBUG, dev, fmt, ##a)
>> +#define NPU3DEVINF(dev, fmt, a...) NPU3DEVLOG(PR_INFO, dev, fmt, ##a)
>> +#define NPU3DEVERR(dev, fmt, a...) NPU3DEVLOG(PR_ERR, dev, fmt, ##a)
>> +
>> +static void npu3_dt_create_link(struct dt_node *npu, uint32_t npu_index,
>> +				uint32_t dev_index)
>> +{
>> +	struct dt_node *link;
>> +	uint32_t phy_lane_mask, ob_chiplet;
>> +
>> +	link = dt_new_addr(npu, "link", dev_index);
>> +
>> +	dt_add_property_string(link, "compatible", "ibm,npu-link");
>> +	dt_add_property_cells(link, "reg", dev_index);
>> +	dt_add_property_cells(link, "ibm,npu-link-index", dev_index);
>> +
>> +	switch (npu_index) {
>> +	case 0:
>> +		/* fall through */
>> +	case 2:
>> +		ob_chiplet = npu_index ? 3 : 0;
>> +
>> +		switch (dev_index) {
>> +		case 0:
>> +			phy_lane_mask = PPC_BITMASK32(0, 3);
>> +			break;
>> +		case 1:
>> +			phy_lane_mask = PPC_BITMASK32(13, 16);
>> +			break;
>> +		case 2:
>> +			phy_lane_mask = PPC_BITMASK32(7, 10);
>> +			break;
>> +		case 3:
>> +			phy_lane_mask = PPC_BITMASK32(20, 23);
>> +			break;
>> +		}
>> +
>> +		break;
>> +	case 1:
>> +		switch (dev_index) {
>> +		case 0:
>> +			ob_chiplet = 1;
>> +			phy_lane_mask = PPC_BITMASK32(0, 3);
>> +			break;
>> +		case 1:
>> +			ob_chiplet = 2;
>> +			phy_lane_mask = PPC_BITMASK32(0, 3);
>> +			break;
>> +		case 2:
>> +			ob_chiplet = 1;
>> +			phy_lane_mask = PPC_BITMASK32(7, 10);
>> +			break;
>> +		case 3:
>> +			ob_chiplet = 2;
>> +			phy_lane_mask = PPC_BITMASK32(7, 10);
>> +			break;
>> +		}
>> +
>> +		break;
>> +	default:
>> +		return;
>> +	}
>> +
>> +	dt_add_property_cells(link, "ibm,npu-phy", ob_chiplet);
>> +	dt_add_property_cells(link, "ibm,npu-lane-mask", phy_lane_mask);
>> +}
>> +
>> +static void npu3_dt_create_npu(struct dt_node *xscom, uint32_t npu_index)
>> +{
>> +	const uint32_t npu_base[] = { 0x5011000, 0x5011400, 0x3011c00 };
>> +	struct dt_node *npu;
>> +
>> +	npu = dt_new_addr(xscom, "npu", npu_base[npu_index]);
>> +
>> +	dt_add_property_cells(npu, "#size-cells", 0);
>> +	dt_add_property_cells(npu, "#address-cells", 1);
>> +	dt_add_property_cells(npu, "reg", npu_base[npu_index], 0x2c);
>> +	dt_add_property_string(npu, "compatible", "ibm,power9-npu3");
>> +	dt_add_property_cells(npu, "ibm,npu-index", npu_index);
>> +	dt_add_property_cells(npu, "ibm,phb-index", 7 + npu_index);
>> +
>> +	for (uint32_t i = 0; i < NPU3_LINKS_PER_NPU; i++)
>> +		npu3_dt_create_link(npu, npu_index, i);
>> +}
>> +
>> +/* This can be removed when/if we decide to use HDAT instead */
>> +static bool npu3_dt_create(void)
>> +{
>> +	struct proc_chip *chip = next_chip(NULL);
>> +	struct dt_node *xscom;
>> +
>> +	/* npu3 chips only */
>> +	if (proc_gen < proc_gen_p9 ||
>> +	    chip->type == PROC_CHIP_P9_NIMBUS ||
>> +	    chip->type == PROC_CHIP_P9_CUMULUS)
>> +		return false;
>> +
>> +	dt_for_each_compatible(dt_root, xscom, "ibm,xscom")
>> +		for (uint32_t i = 0; i < 3; i++)
>> +			npu3_dt_create_npu(xscom, i);
>> +
>> +	return true;
>> +}
>> +
>> +static struct npu3 *npu3_create(struct dt_node *dn)
>> +{
>> +	struct npu3 *npu;
>> +	struct dt_node *link;
>> +	struct npu3_dev *dev;
>> +	char *path;
>> +	uint32_t i;
>> +
>> +	npu = zalloc(sizeof(*npu));
>> +	assert(npu);
>> +
>> +	init_lock(&npu->lock);
>> +
>> +	npu->dt_node = dn;
>> +	npu->index = dt_prop_get_u32(dn, "ibm,npu-index");
>> +	npu->xscom_base = dt_get_address(dn, 0, NULL);
>> +
>> +	npu->chip_id = dt_get_chip_id(dn);
>> +	assert(get_chip(npu->chip_id));
>> +
>> +	dt_for_each_compatible(dn, link, "ibm,npu-link") {
>> +		i = dt_prop_get_u32(link, "ibm,npu-link-index");
>> +		assert(i < NPU3_LINKS_PER_NPU);
>> +
>> +		dev = &npu->devices[i];
>> +		dev->index = i;
>> +		dev->npu = npu;
>> +		dev->dn = link;
>> +		dev->ob_chiplet = dt_prop_get_u32(link, "ibm,npu-phy");
>> +		dev->phy_lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask");
>> +		dev->proc.status = NPU3_PROC_COMPLETE;
>> +	};
>> +
>> +	path = dt_get_path(dn);
>> +	NPU3INF(npu, "Found %s\n", path);
>> +	NPU3INF(npu, "SCOM base: 0x%llx\n", npu->xscom_base);
>> +	free(path);
>> +
>> +	return npu;
>> +}
>> +
>> +struct npu3_dev *npu3_next_dev(struct npu3 *npu, struct npu3_dev *dev,
>> +			       enum npu3_dev_type type)
>> +{
>> +	uint32_t i = 0;
>> +
>> +	if (dev)
>> +		i = dev->index + 1;
>> +
>> +	for (; i < NPU3_LINKS_PER_NPU; i++) {
>> +		dev = &npu->devices[i];
>> +
>> +		if (dev->type == type || type == NPU3_DEV_TYPE_ANY)
>> +			return dev;
>> +	}
>> +
>> +	return NULL;
>> +}
>> +
>> +static void npu3_device_detect_fixup(struct npu3_dev *dev)
>> +{
>> +	struct dt_node *dn = dev->dn;
>> +
>> +	if (dev->type == NPU3_DEV_TYPE_NVLINK) {
>> +		dt_add_property_strings(dn, "ibm,npu-link-type", "nvlink");
>> +		dev->link_speed = dt_prop_get_u32_def(
>> +					dn, "nvidia,link-speed", 0xff);
>> +		return;
>> +	}
>> +
>> +	NPU3DEVDBG(dev, "Link type unknown\n");
>> +	dt_add_property_strings(dn, "ibm,npu-link-type", "unknown");
>> +}
>> +
>> +/*
>> + * We use the indirect method because it uses the same addresses as
>> + * the MMIO offsets (NPU RING)
>> + */
>> +static void npu3_scom_sel(struct npu3 *npu, uint64_t reg, uint64_t size)
>> +{
>> +	uint64_t val;
>> +
>> +	val = SETFIELD(NPU3_MISC_DA_ADDR, 0ull, reg);
>> +	val = SETFIELD(NPU3_MISC_DA_LEN, val, size);
>> +	xscom_write(npu->chip_id,
>> +		    npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_ADDR,
>> +		    val);
>> +}
>> +
>> +static void npu3_scom_write(struct npu3 *npu, uint64_t reg, uint64_t size,
>> +			    uint64_t val)
>> +{
>> +	npu3_scom_sel(npu, reg, size);
>> +	xscom_write(npu->chip_id,
>> +		    npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_DATA,
>> +		    val);
>> +}
>> +
>> +static uint64_t npu3_scom_read(struct npu3 *npu, uint64_t reg, uint64_t size)
>> +{
>> +	uint64_t val;
>> +
>> +	npu3_scom_sel(npu, reg, size);
>> +	xscom_read(npu->chip_id,
>> +		   npu->xscom_base + NPU3_MISC_SCOM_IND_SCOM_DATA,
>> +		   &val);
>> +
>> +	return val;
>> +}
>> +
>> +void npu3_write(struct npu3 *npu, uint64_t reg, uint64_t val)
>> +{
>> +	void *mmio = (void *)npu->regs[0];
>> +
>> +	if (mmio)
>> +		out_be64(mmio + reg, val);
>> +	else
>> +		npu3_scom_write(npu, reg, NPU3_MISC_DA_LEN_8B, val);
>> +
>> +	/* CQ_SM writes should be mirrored in all four blocks */
>> +	if (NPU3_REG_BLOCK(reg) != NPU3_BLOCK_CQ_SM(0))
>> +		return;
>> +
>> +	for (uint32_t i = 1; i < 4; i++)
>> +		npu3_write(npu, NPU3_BLOCK_CQ_SM(i) + NPU3_REG_OFFSET(reg),
>> +			   val);
>> +}
>> +
>> +uint64_t npu3_read(struct npu3 *npu, uint64_t reg)
>> +{
>> +	void *mmio = (void *)npu->regs[0];
>> +
>> +	if (mmio)
>> +		return in_be64(mmio + reg);
>> +
>> +	return npu3_scom_read(npu, reg, NPU3_MISC_DA_LEN_8B);
>> +}
>> +
>> +void npu3_write_4b(struct npu3 *npu, uint64_t reg, uint32_t val)
>> +{
>> +	void *mmio = (void *)npu->regs[0];
>> +
>> +	if (mmio)
>> +		out_be32(mmio + reg, val);
>> +	else
>> +		npu3_scom_write(npu, reg, NPU3_MISC_DA_LEN_4B,
>> +				(uint64_t)val << 32);
>> +
>> +	if (NPU3_REG_BLOCK(reg) != NPU3_BLOCK_CQ_SM(0))
>> +		return;
>> +
>> +	for (uint32_t i = 1; i < 4; i++)
>> +		npu3_write_4b(npu, NPU3_BLOCK_CQ_SM(i) + NPU3_REG_OFFSET(reg),
>> +			      val);
>> +}
>> +
>> +uint32_t npu3_read_4b(struct npu3 *npu, uint64_t reg)
>> +{
>> +	void *mmio = (void *)npu->regs[0];
>> +
>> +	if (mmio)
>> +		return in_be32(mmio + reg);
>> +
>> +	return npu3_scom_read(npu, reg, NPU3_MISC_DA_LEN_4B) >> 32;
>> +}
>> +
>> +static void npu3_misc_config(struct npu3 *npu)
>> +{
>> +	struct npu3_dev *dev;
>> +	uint32_t typemap = 0;
>> +	uint64_t reg, val;
>> +
>> +	npu3_for_each_nvlink_dev(dev, npu)
>> +		typemap |= 0x10 >> dev->index;
>> +
>> +	reg = NPU3_SM_MISC_CFG0;
>> +	val = npu3_read(npu, reg);
>> +	val |= NPU3_SM_MISC_CFG0_ENABLE_PBUS;
>> +	val &= ~NPU3_SM_MISC_CFG0_ENABLE_SNARF_CPM;
>> +	val = SETFIELD(NPU3_SM_MISC_CFG0_NVLINK_MODE, val, typemap);
>> +	val = SETFIELD(NPU3_SM_MISC_CFG0_OCAPI_MODE, val, ~typemap);
>> +	npu3_write(npu, reg, val);
>> +
>> +	reg = NPU3_CTL_MISC_CFG2;
>> +	val = npu3_read(npu, reg);
>> +	val = SETFIELD(NPU3_CTL_MISC_CFG2_NVLINK_MODE, val, typemap);
>> +	val = SETFIELD(NPU3_CTL_MISC_CFG2_OCAPI_MODE, val, ~typemap);
>> +	npu3_write(npu, reg, val);
>> +
>> +	reg = NPU3_DAT_MISC_CFG1;
>> +	val = npu3_read(npu, reg);
>> +	val = SETFIELD(NPU3_DAT_MISC_CFG1_NVLINK_MODE, val, typemap);
>> +	val = SETFIELD(NPU3_DAT_MISC_CFG1_OCAPI_MODE, val, ~typemap);
>> +	npu3_write(npu, reg, val);
>> +}
>> +
>> +static void npu3_assign_bars(struct npu3 *npu)
>> +{
>> +	struct npu3_dev *dev;
>> +	uint64_t addr, size, val;
>> +
>> +	/* Global MMIO bar (per npu) */
>> +	phys_map_get(npu->chip_id, NPU_REGS, npu->index, &addr, &size);
>> +	val = SETFIELD(NPU3_MMIO_BAR_ADDR, 0ull, addr >> 24);
>> +	val |= NPU3_MMIO_BAR_ENABLE;
>> +	npu3_write(npu, NPU3_MMIO_BAR, val);
>> +
>> +	NPU3INF(npu, "MMIO base: 0x%016llx (%lldMB)\n", addr, size >> 20);
>> +	npu->regs[0] = addr;
>> +	npu->regs[1] = size;
>> +
>> +	/* NTL bar (per device) */
>> +	npu3_for_each_dev(dev, npu) {
>> +		phys_map_get(npu->chip_id, NPU_NTL, npu3_chip_dev_index(dev),
>> +			     &addr, &size);
>> +		val = SETFIELD(NPU3_NTL_BAR_ADDR, 0ull, addr >> 16);
>> +		val = SETFIELD(NPU3_NTL_BAR_SIZE, val, ilog2(size >> 16));
>> +		npu3_write(npu, NPU3_NTL_BAR(dev->index), val);
>> +
>> +		dev->ntl_bar.addr = addr;
>> +		dev->ntl_bar.size = size;
>> +	}
>> +
>> +	/* GENID bar (logically divided per device) */
>> +	phys_map_get(npu->chip_id, NPU_GENID, npu->index, &addr, NULL);
>> +	val = SETFIELD(NPU3_GENID_BAR_ADDR, 0ull, addr >> 19);
>> +	npu3_write(npu, NPU3_GENID_BAR, val);
>> +
>> +	npu3_for_each_dev(dev, npu) {
>> +		dev->genid_bar.addr = addr + (dev->index << 16);
>> +		dev->genid_bar.size = 64 << 10;
>> +	}
>> +}
>> +
>> +void npu3_dev_enable_bars(struct npu3_dev *dev, bool enable)
>> +{
>> +	struct npu3 *npu = dev->npu;
>> +	uint64_t reg, val;
>> +
>> +	if (dev->ntl_bar.enable == enable) /* No state change */
>> +		return;
>> +
>> +	dev->ntl_bar.enable = enable;
>> +	dev->genid_bar.enable = enable;
>> +
>> +	reg = NPU3_NTL_BAR(dev->index);
>> +	val = npu3_read(npu, reg);
>> +	val = SETFIELD(NPU3_NTL_BAR_ENABLE, val, enable);
>> +	npu3_write(npu, reg, val);
>> +
>> +	/*
>> +	 * Generation IDs are a single space in the hardware but we split them
>> +	 * per device. Only disable in hardware if every device has disabled.
>> +	 */
>> +	if (!enable)
>> +		npu3_for_each_dev(dev, npu)
>> +			if (dev->genid_bar.enable)
>> +				return;
>> +
>> +	reg = NPU3_GENID_BAR;
>> +	val = npu3_read(npu, reg);
>> +	val = SETFIELD(NPU3_GENID_BAR_ENABLE, val, enable);
>> +	npu3_write(npu, reg, val);
>> +}
>> +
>> +static uint64_t npu3_ipi_attributes(struct irq_source *is, uint32_t isn)
>> +{
>> +	struct npu3 *npu = is->data;
>> +	uint32_t level = isn - npu->irq_base;
>> +
>> +	/* TCE interrupt is used to detect a frozen PE */
>> +	if (level == 18)
>> +		return IRQ_ATTR_TARGET_OPAL |
>> +		       IRQ_ATTR_TARGET_RARE |
>> +		       IRQ_ATTR_TYPE_MSI;
>> +
>> +	return IRQ_ATTR_TARGET_LINUX;
>> +}
>> +
>> +static void npu3_ipi_interrupt(struct irq_source *is, uint32_t isn)
>> +{
>> +	struct npu3 *npu = is->data;
>> +	uint32_t level = isn - npu->irq_base;
>> +
>> +	if (level != 18) {
>> +		NPU3ERR(npu, "Received unknown interrupt %d\n", level);
>> +		return;
>> +	}
>> +
>> +	opal_update_pending_evt(OPAL_EVENT_PCI_ERROR, OPAL_EVENT_PCI_ERROR);
>> +}
>> +
>> +#define NPU3_IRQ_LEVELS 60
>> +
>> +static char *npu3_ipi_name(struct irq_source *is, uint32_t isn)
>> +{
>> +	struct npu3 *npu = is->data;
>> +	uint32_t level = isn - npu->irq_base;
>> +	static const char *names[NPU3_IRQ_LEVELS] = {
>> +		[0] = "NDL 0 Stall Event (brick 0)",
>> +		[1] = "NDL 0 No-Stall Event (brick 0)",
>> +		[2] = "NDL 1 Stall Event (brick 1)",
>> +		[3] = "NDL 1 No-Stall Event (brick 1)",
>> +		[4] = "NDL 2 Stall Event (brick 2)",
>> +		[5] = "NDL 2 No-Stall Event (brick 2)",
>> +		[6] = "NDL 3 Stall Event (brick 3)",
>> +		[7] = "NDL 3 No-Stall Event (brick 3)",
>> +		[8] = "NDL 4 Stall Event (brick 4)",
>> +		[9] = "NDL 4 No-Stall Event (brick 4)",
>> +		[10] = "NDL 5 Stall Event (brick 5)",
>> +		[11] = "NDL 5 No-Stall Event (brick 5)",
>> +		[12] = "NTL 0 Event",
>> +		[13] = "NTL 1 Event",
>> +		[14] = "NTL 2 Event",
>> +		[15] = "NTL 3 Event",
>> +		[16] = "NTL 4 Event",
>> +		[17] = "NTL 5 Event",
>> +		[18] = "TCE Event",
>> +		[19] = "ATS Event",
>> +		[20] = "CQ Event",
>> +		[21] = "MISC Event",
>> +		[41] = "Memory Controller Event",
>> +		[42] = "NDL 6 Stall Event (brick 6)",
>> +		[43] = "NDL 6 No-Stall Event (brick 6)",
>> +		[44] = "NDL 7 Stall Event (brick 7)",
>> +		[45] = "NDL 7 No-Stall Event (brick 7)",
>> +		[46] = "NDL 8 Stall Event (brick 8)",
>> +		[47] = "NDL 8 No-Stall Event (brick 8)",
>> +		[48] = "NDL 9 Stall Event (brick 9)",
>> +		[49] = "NDL 9 No-Stall Event (brick 9)",
>> +		[50] = "NDL 10 Stall Event (brick 10)",
>> +		[51] = "NDL 10 No-Stall Event (brick 10)",
>> +		[52] = "NDL 11 Stall Event (brick 11)",
>> +		[53] = "NDL 11 No-Stall Event (brick 11)",
>> +		[54] = "NTL 6 Event",
>> +		[55] = "NTL 7 Event",
>> +		[56] = "NTL 8 Event",
>> +		[57] = "NTL 9 Event",
>> +		[58] = "NTL 10 Event",
>> +		[59] = "NTL 11 Event",
>> +	};
>> +
>> +	if (level >= NPU3_IRQ_LEVELS || !names[level])
>> +		return strdup("Unknown");
>> +
>> +	return strdup(names[level]);
>> +}
>> +
>> +static const struct irq_source_ops npu3_ipi_ops = {
>> +	.attributes	= npu3_ipi_attributes,
>> +	.interrupt	= npu3_ipi_interrupt,
>> +	.name		= npu3_ipi_name,
>> +};
>> +
>> +static void npu3_setup_irqs(struct npu3 *npu)
>> +{
>> +	uint64_t reg, val;
>> +	uint32_t base;
>> +
>> +	base = xive_alloc_ipi_irqs(npu->chip_id, NPU3_IRQ_LEVELS, 64);
>> +	if (base == XIVE_IRQ_ERROR) {
>> +		NPU3ERR(npu, "Failed to allocate interrupt sources\n");
>> +		return;
>> +	}
>> +
>> +	xive_register_ipi_source(base, NPU3_IRQ_LEVELS, npu, &npu3_ipi_ops);
>> +
>> +	/* Set IPI configuration */
>> +	reg = NPU3_MISC_CFG;
>> +	val = npu3_read(npu, reg);
>> +	val = SETFIELD(NPU3_MISC_CFG_IPI_PS, val, NPU3_MISC_CFG_IPI_PS_64K);
>> +	val = SETFIELD(NPU3_MISC_CFG_IPI_OS, val, NPU3_MISC_CFG_IPI_OS_AIX);
>> +	npu3_write(npu, reg, val);
>> +
>> +	/* Set IRQ base */
>> +	reg = NPU3_MISC_INT_BAR;
>> +	val = SETFIELD(NPU3_MISC_INT_BAR_ADDR, 0ull,
>> +		       (uint64_t)xive_get_trigger_port(base) >> 12);
>> +	npu3_write(npu, reg, val);
>> +
>> +	npu->irq_base = base;
>> +}
>> +
>> +static void npu3_init(struct npu3 *npu)
>> +{
>> +	struct npu3_dev *dev;
>> +
>> +	platform.npu3_device_detect(npu);
>> +	npu3_for_each_dev(dev, npu)
>> +		npu3_device_detect_fixup(dev);
>> +
>> +	npu3_misc_config(npu);
>> +	npu3_assign_bars(npu);
>> +	npu3_setup_irqs(npu);
>> +	npu3_init_nvlink(npu);
>> +}
>> +
>> +void probe_npu3(void)
>> +{
>> +	struct dt_node *dn;
>> +	struct npu3 *npu;
>> +
>> +	if (!npu3_dt_create())
>> +		return;
>> +
>> +	if (!platform.npu3_device_detect) {
>> +		prlog(PR_INFO, "NPU: Platform does not support NPU\n");
>> +		return;
>> +	}
>> +
>> +	dt_for_each_compatible(dt_root, dn, "ibm,power9-npu3") {
>> +		npu = npu3_create(dn);
>> +		npu3_init(npu);
>> +	}
>> +}
>> diff --git a/include/npu3-regs.h b/include/npu3-regs.h
>> new file mode 100644
>> index 000000000000..ce76bf3dc59a
>> --- /dev/null
>> +++ b/include/npu3-regs.h
>> @@ -0,0 +1,247 @@
>> +/* Copyright 2019 IBM Corp.
>> + *
>> + * Licensed under the Apache License, Version 2.0 (the "License");
>> + * you may not use this file except in compliance with the License.
>> + * You may obtain a copy of the License at
>> + *
>> + *      http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
>> + * implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +
>> +#ifndef __NPU3_REGS_H
>> +#define __NPU3_REGS_H
>> +
>> +#define NPU3_FIR(n)				(0x2c00 + (n) * 0x40)
>> +#define NPU3_FIR_MASK(n)			(0x2c03 + (n) * 0x40)
>> +#define NPU3_FIR_ACTION0(n)			(0x2c06 + (n) * 0x40)
>> +#define NPU3_FIR_ACTION1(n)			(0x2c07 + (n) * 0x40)
>> +#define NPU3_FIR_MAX				3
>> +
>> +/* NPU RING: Indirect address/data port */
>> +#define NPU3_MISC_SCOM_IND_SCOM_ADDR		0x33e
>> +#define   NPU3_MISC_DA_ADDR			PPC_BITMASK(0, 23)
>> +#define   NPU3_MISC_DA_LEN			PPC_BITMASK(24, 25)
>> +#define     NPU3_MISC_DA_LEN_4B			2
>> +#define     NPU3_MISC_DA_LEN_8B			3
>> +#define NPU3_MISC_SCOM_IND_SCOM_DATA		0x33f
>> +
>> +/* NPU RING: Indirect register blocks */
>> +#define NPU3_BLOCK(nib0, nib1)			((nib0) << 20 | (nib1) << 16)
>> +#define NPU3_REG_BLOCK(reg)			((reg) & 0xff0000)
>> +#define NPU3_REG_OFFSET(reg)			((reg) & 0xffff)
>> +
>> +#define NPU3_BLOCK_NDL_U(brk)			NPU3_BLOCK(0 + (brk) / 2,\
>> +							   8 + (brk) % 2 * 2)
>> +#define NPU3_BLOCK_NTL_U(brk)			NPU3_BLOCK(0 + (brk) / 2,\
>> +							   9 + (brk) % 2 * 2)
>> +#define NPU3_BLOCK_CQ_SM(n)			NPU3_BLOCK(4, (n))
>> +#define NPU3_BLOCK_CQ_CTL			NPU3_BLOCK(4, 4)
>> +#define NPU3_BLOCK_CQ_DAT			NPU3_BLOCK(4, 5)
>> +#define NPU3_BLOCK_NDL(brk)			NPU3_BLOCK(4 + (brk) / 2,\
>> +							   8 + (brk) % 2 * 2)
>> +#define NPU3_BLOCK_NTL(brk)			NPU3_BLOCK(4 + (brk) / 2,\
>> +							   9 + (brk) % 2 * 2)
>> +#define NPU3_BLOCK_NPU_ATS			NPU3_BLOCK(7, 0)
>> +#define NPU3_BLOCK_NPU_XTS			NPU3_BLOCK(7, 1)
>> +#define NPU3_BLOCK_NPU_MISC			NPU3_BLOCK(7, 2)
>> +#define NPU3_BLOCK_NPU_XTS_ATSD(n)		NPU3_BLOCK(8, (n))
>> +
>> +/* NDL_U block registers */
>> +#define NPU3_DLPL_CTL(brk)			(NPU3_BLOCK_NDL_U(brk) + 0xfff4)
>> +#define   NPU3_DLPL_CTL_RESET_RX		PPC_BIT32(0)
>> +#define   NPU3_DLPL_CTL_RESET_MISC		PPC_BIT32(1)
>> +#define NPU3_DLPL_CFG(brk)			(NPU3_BLOCK_NDL_U(brk) + 0xfff8)
>> +#define   NPU3_DLPL_CFG_PRI_BYTESWAP		PPC_BIT32(0)
>> +
>> +/* NTL_U block registers */
>> +#define NPU3_NTL_MISC_CFG1(brk)			(NPU3_BLOCK_NTL_U(brk) + 0x0c0)
>> +#define   NPU3_NTL_MISC_CFG1_NTL_RESET		PPC_BITMASK(8, 9)
>> +#define NPU3_NTL_CREQ_HDR_CRED_SND(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x400)
>> +#define NPU3_NTL_PRB_HDR_CRED_SND(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x410)
>> +#define NPU3_NTL_ATR_HDR_CRED_SND(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x418)
>> +#define NPU3_NTL_RSP_HDR_CRED_SND(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x428)
>> +#define NPU3_NTL_CREQ_DAT_CRED_SND(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x430)
>> +#define NPU3_NTL_RSP_DAT_CRED_SND(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x438)
>> +#define NPU3_NTL_CREQ_HDR_CRED_RCV(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x440)
>> +#define NPU3_NTL_DGD_HDR_CRED_RCV(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x448)
>> +#define NPU3_NTL_ATSD_HDR_CRED_RCV(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x460)
>> +#define NPU3_NTL_RSP_HDR_CRED_RCV(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x468)
>> +#define NPU3_NTL_CREQ_DAT_CRED_RCV(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x470)
>> +#define NPU3_NTL_RSP_DAT_CRED_RCV(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x478)
>> +#define NPU3_NTL_CQ_FENCE_STATUS(brk)		(NPU3_BLOCK_NTL_U(brk) + 0x500)
>> +#define   NPU3_NTL_CQ_FENCE_STATUS_FIELD	PPC_BITMASK(0, 1)
>> +#define     NPU3_NTL_CQ_FENCE_STATUS_FULL	3
>> +#define     NPU3_NTL_CQ_FENCE_STATUS_HALF	2
>> +#define     NPU3_NTL_CQ_FENCE_STATUS_NONE	0
>> +
>> +/*
>> + * CQ_SM block registers
>> + *
>> + * Definitions here use NPU3_BLOCK_CQ_SM(0), but when npu3_write() is given
>> + * one of these, it will do corresponding writes to every CQ_SM block.
>> + */
>> +#define NPU3_SM_MISC_CFG0			(NPU3_BLOCK_CQ_SM(0) + 0x000)
>> +#define   NPU3_SM_MISC_CFG0_ENABLE_PBUS		PPC_BIT(26)
>> +#define   NPU3_SM_MISC_CFG0_ENABLE_SNARF_CPM	PPC_BIT(27)
>> +#define   NPU3_SM_MISC_CFG0_OCAPI_MODE		PPC_BITMASK(44, 48)
>> +#define   NPU3_SM_MISC_CFG0_NVLINK_MODE		PPC_BITMASK(49, 53)
>> +#define NPU3_SM_MISC_CFG1			(NPU3_BLOCK_CQ_SM(0) + 0x008)
>> +#define NPU3_SM_MISC_CFG2			(NPU3_BLOCK_CQ_SM(0) + 0x0f0)
>> +#define NPU3_GPU_MEM_BAR(brk)			(NPU3_BLOCK_CQ_SM(0) + 0x190 + (brk) * 8)
>> +#define   NPU3_GPU_MEM_BAR_ENABLE		PPC_BIT(0)
>> +#define   NPU3_GPU_MEM_BAR_ADDR_MASK		PPC_BITMASK(1, 35)
>> +#define     NPU3_GPU_MEM_BAR_ADDR		PPC_BITMASK(1, 21)
>> +#define     NPU3_GPU_MEM_BAR_SIZE		PPC_BITMASK(22, 35)
>> +#define   NPU3_GPU_MEM_BAR_SL_MODE		PPC_BIT(36)
>> +#define   NPU3_GPU_MEM_BAR_4T_LIMIT		PPC_BIT(37)
>> +#define   NPU3_GPU_MEM_BAR_4T_SELECT		PPC_BITMASK(38, 39)
>> +#define   NPU3_GPU_MEM_BAR_MODE			PPC_BITMASK(40, 43)
>> +#define   NPU3_GPU_MEM_BAR_POISON		PPC_BIT(45)
>> +#define   NPU3_GPU_MEM_BAR_CHIP_EQ_GROUP	PPC_BIT(49)
>> +#define NPU3_NTL_BAR(brk)			(NPU3_BLOCK_CQ_SM(0) + 0x1b8 + (brk) * 8)
>> +#define   NPU3_NTL_BAR_ENABLE			PPC_BIT(0)
>> +#define   NPU3_NTL_BAR_ADDR			PPC_BITMASK(3, 35)
>> +#define   NPU3_NTL_BAR_SIZE			PPC_BITMASK(39, 43)
>> +#define     NPU3_NTL_BAR_SIZE_128K		1
>> +#define NPU3_MMIO_BAR				(NPU3_BLOCK_CQ_SM(0) + 0x1e0)
>> +#define   NPU3_MMIO_BAR_ENABLE			PPC_BIT(0)
>> +#define   NPU3_MMIO_BAR_ADDR			PPC_BITMASK(3, 27)
>> +#define NPU3_GENID_BAR				(NPU3_BLOCK_CQ_SM(0) + 0x1e8)
>> +#define   NPU3_GENID_BAR_ENABLE			PPC_BIT(0)
>> +#define   NPU3_GENID_BAR_ADDR			PPC_BITMASK(3, 32)
>> +#define NPU3_RELAXED_SRC(n)			(NPU3_BLOCK_CQ_SM(0) + 0x1f0 + (n) * 8)
>> +#define   NPU3_RELAXED_SRC_MAX			4
>> +#define   NPU3_RELAXED_SRC_TAG			PPC_BITMASK(0, 13)
>> +#define     NPU3_RELAXED_SRC_GRPCHP		PPC_BITMASK(0, 6)
>> +#define     NPU3_RELAXED_SRC_PEC		PPC_BITMASK(12, 13)
>> +#define   NPU3_RELAXED_SRC_TAGMASK		PPC_BITMASK(14, 27)
>> +#define   NPU3_RELAXED_SRC_MASK_NPU		PPC_BIT(28)
>> +#define   NPU3_RELAXED_SRC_MASK_PCIE		PPC_BIT(29)
>> +#define   NPU3_RELAXED_SRC_MASK_L2L3		PPC_BIT(30)
>> +#define   NPU3_RELAXED_SRC_RDSTART		PPC_BITMASK(32, 39)
>> +#define   NPU3_RELAXED_SRC_RDEND		PPC_BITMASK(40, 47)
>> +#define   NPU3_RELAXED_SRC_WRSTART		PPC_BITMASK(48, 55)
>> +#define   NPU3_RELAXED_SRC_WREND		PPC_BITMASK(56, 63)
>> +#define NPU3_RELAXED_CFG2(brk)			(NPU3_BLOCK_CQ_SM(0) + 0x230 + (brk) * 8)
>> +#define   NPU3_RELAXED_CFG2_CMD_CL_DMA_W	PPC_BIT(0)
>> +#define   NPU3_RELAXED_CFG2_CMD_CL_DMA_W_HP	PPC_BIT(1)
>> +#define   NPU3_RELAXED_CFG2_CMD_CL_DMA_INJ	PPC_BIT(2)
>> +#define   NPU3_RELAXED_CFG2_CMD_PR_DMA_INJ	PPC_BIT(3)
>> +#define   NPU3_RELAXED_CFG2_CMD_DMA_PR_W	PPC_BIT(4)
>> +#define   NPU3_RELAXED_CFG2_CMD_CL_RD_NC_F0	PPC_BIT(5)
>> +#define   NPU3_RELAXED_CFG2_SRC_WRENA(src)	PPC_BIT(32 + (src) * 4)
>> +#define   NPU3_RELAXED_CFG2_SRC_RDENA(src)	PPC_BIT(33 + (src) * 4)
>> +#define   NPU3_RELAXED_CFG2_SRC_AWENA(src)	PPC_BIT(34 + (src) * 4)
>> +#define   NPU3_RELAXED_CFG2_SRC_ARENA(src)	PPC_BIT(35 + (src) * 4)
>> +
>> +/* CQ_CTL block registers */
>> +#define NPU3_CTL_MISC_CFG0			(NPU3_BLOCK_CQ_CTL + 0x000)
>> +#define NPU3_CTL_MISC_CFG1			(NPU3_BLOCK_CQ_CTL + 0x008)
>> +#define NPU3_CTL_MISC_CFG2			(NPU3_BLOCK_CQ_CTL + 0x010)
>> +#define   NPU3_CTL_MISC_CFG2_OCAPI_MODE		PPC_BITMASK(0, 4)
>> +#define   NPU3_CTL_MISC_CFG2_NVLINK_MODE	PPC_BITMASK(5, 9)
>> +#define NPU3_CTL_MISC_CFG3			(NPU3_BLOCK_CQ_CTL + 0x018)
>> +#define NPU3_CTL_BDF2PE_CFG(n)			(NPU3_BLOCK_CQ_CTL + 0x180 + (n) * 8)
>> +#define   NPU3_CTL_BDF2PE_CFG_ENABLE		PPC_BIT(0)
>> +#define   NPU3_CTL_BDF2PE_CFG_PE		PPC_BITMASK(4, 7)
>> +#define   NPU3_CTL_BDF2PE_CFG_BDF		PPC_BITMASK(8, 23)
>> +
>> +/* CQ_DAT block registers */
>> +#define NPU3_DAT_MISC_CFG1			(NPU3_BLOCK_CQ_DAT + 0x008)
>> +#define   NPU3_DAT_MISC_CFG1_OCAPI_MODE		PPC_BITMASK(40, 44)
>> +#define   NPU3_DAT_MISC_CFG1_NVLINK_MODE	PPC_BITMASK(45, 49)
>> +
>> +/* NTL block registers */
>> +#define NPU3_NTL_MISC_CFG2(brk)			(NPU3_BLOCK_NTL(brk) + 0x000)
>> +#define   NPU3_NTL_MISC_CFG2_BRICK_ENABLE	PPC_BIT(0)
>> +#define   NPU3_NTL_MISC_CFG2_NDL_RX_PARITY_ENA	PPC_BIT(16)
>> +#define   NPU3_NTL_MISC_CFG2_NDL_TX_PARITY_ENA	PPC_BIT(17)
>> +#define   NPU3_NTL_MISC_CFG2_NDL_PRI_PARITY_ENA	PPC_BIT(18)
>> +#define   NPU3_NTL_MISC_CFG2_RCV_CREDIT_OVERFLOW_ENA PPC_BIT(19)
>> +#define NPU3_NTL_PRI_CFG(brk)			(NPU3_BLOCK_NTL(brk) + 0x0b0)
>> +#define   NPU3_NTL_PRI_CFG_NDL			PPC_BITMASK(1, 2)
>> +
>> +/* NPU_ATS block registers */
>> +#define NPU3_ATS_IODA_ADDR			(NPU3_BLOCK_NPU_ATS + 0x108)
>> +#define   NPU3_ATS_IODA_ADDR_AUTO_INC		PPC_BIT(0)
>> +#define   NPU3_ATS_IODA_ADDR_TBL_SEL		PPC_BITMASK(11, 15)
>> +#define     NPU3_ATS_IODA_ADDR_TBL_TVT		9
>> +#define   NPU3_ATS_IODA_ADDR_TBL_ADDR		PPC_BITMASK(54, 63)
>> +#define NPU3_ATS_IODA_DATA			(NPU3_BLOCK_NPU_ATS + 0x110)
>> +#define   NPU3_ATS_IODA_TVT_XLAT_ADDR		PPC_BITMASK(0, 47)
>> +#define   NPU3_ATS_IODA_TVT_TABLE_LEVEL		PPC_BITMASK(48, 50)
>> +#define   NPU3_ATS_IODA_TVT_TABLE_SIZE		PPC_BITMASK(51, 55)
>> +#define   NPU3_ATS_IODA_TVT_PAGE_SIZE		PPC_BITMASK(59, 63)
>> +#define NPU3_ATS_TCE_KILL			(NPU3_BLOCK_NPU_ATS + 0x120)
>> +#define   NPU3_ATS_TCE_KILL_ALL			PPC_BIT(0)
>> +#define   NPU3_ATS_TCE_KILL_ONE			PPC_BIT(2)
>> +#define   NPU3_ATS_TCE_KILL_PE_NUMBER		PPC_BITMASK(4, 7)
>> +#define   NPU3_ATS_TCE_KILL_ADDRESS		PPC_BITMASK(15, 51)
>> +
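
The IODA address/data pair here follows the usual indirect pattern.
Assuming it behaves like its PHB counterpart, programming the TVT
would go something like this (table length and contents made up):

	uint64_t addr, tve[16] = { 0 };
	uint32_t i;

	addr = NPU3_ATS_IODA_ADDR_AUTO_INC;
	addr = SETFIELD(NPU3_ATS_IODA_ADDR_TBL_SEL, addr,
			NPU3_ATS_IODA_ADDR_TBL_TVT);
	addr = SETFIELD(NPU3_ATS_IODA_ADDR_TBL_ADDR, addr, 0);
	npu3_write(npu, NPU3_ATS_IODA_ADDR, addr);

	/* AUTO_INC advances TBL_ADDR after each data access */
	for (i = 0; i < 16; i++)
		npu3_write(npu, NPU3_ATS_IODA_DATA, tve[i]);
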
>> +/* NPU_XTS block registers */
>> +#define NPU3_XTS_CFG				(NPU3_BLOCK_NPU_XTS + 0x020)
>> +#define   NPU3_XTS_CFG_MMIOSD			PPC_BIT(1)
>> +#define   NPU3_XTS_CFG_TRY_ATR_RO		PPC_BIT(6)
>> +#define   NPU3_XTS_CFG_OPENCAPI			PPC_BIT(15)
>> +#define NPU3_XTS_CFG2				(NPU3_BLOCK_NPU_XTS + 0x028)
>> +#define   NPU3_XTS_CFG2_NO_FLUSH_ENA		PPC_BIT(49)
>> +#define   NPU3_XTS_CFG2_XSL2_ENA		PPC_BIT(55)
>> +#define NPU3_XTS_CFG3				(NPU3_BLOCK_NPU_XTS + 0x068)
>> +#define NPU3_XTS_ATSD_HYP(n)			(NPU3_BLOCK_NPU_XTS + 0x100 + (n) * 8)
>> +#define   NPU3_XTS_ATSD_HYP_MSR_HV		PPC_BIT(51)
>> +#define   NPU3_XTS_ATSD_HYP_LPARID		PPC_BITMASK(52, 63)
>> +#define NPU3_XTS_BDF_MAP(n)			(NPU3_BLOCK_NPU_XTS + 0x4000 + (n) * 8)
>> +#define   NPU3_XTS_BDF_MAP_MAX			16
>> +#define   NPU3_XTS_BDF_MAP_VALID		PPC_BIT(0)
>> +#define   NPU3_XTS_BDF_MAP_UNFILT		PPC_BIT(1)
>> +#define   NPU3_XTS_BDF_MAP_STACK		PPC_BITMASK(4, 6)
>> +#define   NPU3_XTS_BDF_MAP_BRICK		PPC_BITMASK(7, 9)
>> +#define   NPU3_XTS_BDF_MAP_BDF			PPC_BITMASK(16, 31)
>> +#define   NPU3_XTS_BDF_MAP_XLAT			PPC_BITMASK(39, 40)
>> +#define   NPU3_XTS_BDF_MAP_LPCR_PS		PPC_BITMASK(41, 43)
>> +#define   NPU3_XTS_BDF_MAP_LPCR_ISL		PPC_BIT(44)
>> +#define   NPU3_XTS_BDF_MAP_LPCR_TC		PPC_BIT(45)
>> +#define   NPU3_XTS_BDF_MAP_LPCR_SC		PPC_BIT(46)
>> +#define   NPU3_XTS_BDF_MAP_LPCR_BOT		PPC_BIT(47)
>> +#define   NPU3_XTS_BDF_MAP_LPARSHORT		PPC_BITMASK(48, 51)
>> +#define   NPU3_XTS_BDF_MAP_LPARID		PPC_BITMASK(52, 63)
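
To make the field layout concrete, composing one map entry with
SETFIELD() would look roughly like this (the slot, BDF, and LPAR
values are made up for the example):

	uint64_t map = NPU3_XTS_BDF_MAP_VALID;

	map = SETFIELD(NPU3_XTS_BDF_MAP_BRICK, map, dev->index);
	map = SETFIELD(NPU3_XTS_BDF_MAP_BDF, map, 0x0800); /* 01:00.0 */
	map = SETFIELD(NPU3_XTS_BDF_MAP_LPARSHORT, map, 0);
	map = SETFIELD(NPU3_XTS_BDF_MAP_LPARID, map, 0);

	npu3_write(npu, NPU3_XTS_BDF_MAP(0), map);
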
>> +#define NPU3_XTS_PID_MAP(n)			(NPU3_BLOCK_NPU_XTS + 0x8000 + (n) * 32)
>> +#define   NPU3_XTS_PID_MAP_VALID_ATRGPA0	PPC_BIT(0)
>> +#define   NPU3_XTS_PID_MAP_VALID_ATRGPA1	PPC_BIT(1)
>> +#define   NPU3_XTS_PID_MAP_VALID_ATSD		PPC_BIT(2)
>> +#define   NPU3_XTS_PID_MAP_MSR			PPC_BITMASK(25, 31)
>> +#define     NPU3_XTS_PID_MAP_MSR_DR		PPC_BIT(25)
>> +#define     NPU3_XTS_PID_MAP_MSR_TA		PPC_BIT(26)
>> +#define     NPU3_XTS_PID_MAP_MSR_HV		PPC_BIT(27)
>> +#define     NPU3_XTS_PID_MAP_MSR_PR		PPC_BIT(28)
>> +#define     NPU3_XTS_PID_MAP_MSR_US		PPC_BIT(29)
>> +#define     NPU3_XTS_PID_MAP_MSR_SF		PPC_BIT(30)
>> +#define     NPU3_XTS_PID_MAP_MSR_UV		PPC_BIT(31)
>> +#define   NPU3_XTS_PID_MAP_LPARSHORT		PPC_BITMASK(40, 43)
>> +#define   NPU3_XTS_PID_MAP_PID			PPC_BITMASK(44, 63)
>> +
>> +/* NPU_MISC block registers */
>> +#define NPU3_MISC_CFG				(NPU3_BLOCK_NPU_MISC + 0x030)
>> +#define   NPU3_MISC_CFG_IPI_PS			PPC_BIT(11)
>> +#define     NPU3_MISC_CFG_IPI_PS_64K		1
>> +#define   NPU3_MISC_CFG_IPI_OS			PPC_BIT(12)
>> +#define     NPU3_MISC_CFG_IPI_OS_AIX		0
>> +#define     NPU3_MISC_CFG_IPI_OS_LINUX		1
>> +#define NPU3_MISC_INT_BAR			(NPU3_BLOCK_NPU_MISC + 0x098)
>> +#define   NPU3_MISC_INT_BAR_ADDR		PPC_BITMASK(0, 39)
>> +#define NPU3_MISC_BDF2PE_CFG(n)			(NPU3_BLOCK_NPU_MISC + 0x100 + (n) * 8)
>> +#define   NPU3_MISC_BDF2PE_CFG_ENABLE		PPC_BIT(0)
>> +#define   NPU3_MISC_BDF2PE_CFG_PE		PPC_BITMASK(4, 7)
>> +#define   NPU3_MISC_BDF2PE_CFG_BDF		PPC_BITMASK(8, 23)
>> +#define NPU3_MISC_PESTB(pe)			(NPU3_BLOCK_NPU_MISC + 0x200 + (pe) * 8)
>> +
>> +/* NPU_XTS_ATSD block registers */
>> +#define NPU3_XTS_ATSD_LAUNCH(n)			(NPU3_BLOCK_NPU_XTS_ATSD(n) + 0x000)
>> +
>> +#endif /* __NPU3_REGS_H */
>> diff --git a/include/npu3.h b/include/npu3.h
>> new file mode 100644
>> index 000000000000..6a4ac6a2d442
>> --- /dev/null
>> +++ b/include/npu3.h
>> @@ -0,0 +1,180 @@
>> +/* Copyright 2019 IBM Corp.
>> + *
>> + * Licensed under the Apache License, Version 2.0 (the "License");
>> + * you may not use this file except in compliance with the License.
>> + * You may obtain a copy of the License at
>> + *
>> + *      http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
>> + * implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +
>> +#ifndef __NPU3_H
>> +#define __NPU3_H
>> +
>> +#include <phys-map.h>
>> +#include <pci.h>
>> +#include <npu3-regs.h>
>> +
>> +enum npu3_dev_type {
>> +	NPU3_DEV_TYPE_UNKNOWN = 0,
>> +	NPU3_DEV_TYPE_NVLINK,
>> +	NPU3_DEV_TYPE_ANY = INT_MAX
>> +};
>> +
>> +/* Information about a currently running hw procedure */
>> +struct npu3_procedure {
>> +	uint16_t		number;
>> +	uint16_t		step;
>> +	uint32_t		status;
>> +	unsigned long		timeout;
>> +};
>> +
>> +/* Used to expose a hardware BAR (or logical slice of it) outside skiboot */
>> +struct npu3_bar {
>> +	bool			enable;
>> +	uint64_t		addr;
>> +	uint64_t		size;
>> +	uint64_t		trap;
>> +};
>> +
>> +struct npu3_dev_nvlink {
>> +	/*
>> +	 * PCI virtual device. BDFN is allocated based on GPU association.
>> +	 * Links connected to the same GPU will be exposed as different
>> +	 * functions of the same bus/device.
>> +	 */
>> +	struct pci_virt_device	*pvd;
>> +
>> +	/* The PCI device created from pvd */
>> +	const char		*loc_code;
>> +	struct pci_device	*pd;
>> +
>> +	/* The associated GPU device */
>> +	struct pci_device	*gpu;
>> +};
>> +
>> +struct npu3_dev {
>> +	enum npu3_dev_type	type;
>> +	uint32_t		index;
>> +	struct dt_node		*dn;
>> +	struct npu3		*npu;
>> +	struct npu3_procedure	proc;
>> +	uint64_t		link_speed;
>> +
>> +	struct npu3_bar		ntl_bar;
>> +	struct npu3_bar		genid_bar;
>> +
>> +	/* Associated PHY information */
>> +	uint32_t		ob_chiplet;
>> +	uint32_t		phy_lane_mask;
>> +
>> +	/* For NPU3_DEV_TYPE_NVLINK */
>> +	struct npu3_dev_nvlink	nvlink;
>> +};
>> +
>> +struct npu3_nvlink {
>> +	struct phb		phb;
>> +	uint32_t		context_refcount[NPU3_XTS_BDF_MAP_MAX];
>
>Can we please have the same names for the same things? I do not care
>as much which one, just that they match?

I'll change it. "context_refcount" is a bit verbose anyway.

>> +};
>> +
>> +#define NPU3_LINKS_PER_NPU 4
>> +
>> +struct npu3 {
>> +	uint32_t		index;
>> +	struct dt_node		*dt_node;
>> +	uint32_t		chip_id;
>> +	uint64_t		xscom_base;
>> +
>> +	/* Global MMIO window (all NPU regs) */
>> +	uint64_t		regs[2];
>> +
>> +	uint32_t		irq_base;
>> +	struct lock		lock;
>> +	bool			tx_zcal_complete;
>> +
>> +	struct npu3_dev		devices[NPU3_LINKS_PER_NPU];
>> +
>> +	/* Shared by any NPU3_DEV_TYPE_NVLINK devices */
>> +	struct npu3_nvlink	nvlink;
>> +};
>> +
>> +static inline struct npu3 *npu3_phb_to_npu(struct phb *phb)
>> +{
>> +	assert(phb->phb_type == phb_type_npu_v3);
>> +	return container_of(phb, struct npu3, nvlink.phb);
>> +}
>> +
>> +/* Chip-scoped index of the link */
>> +static inline uint32_t npu3_chip_dev_index(struct npu3_dev *dev)
>> +{
>> +	return dev->npu->index * NPU3_LINKS_PER_NPU + dev->index;
>> +}
>> +
>> +struct npu3_dev *npu3_next_dev(struct npu3 *npu, struct npu3_dev *dev,
>> +			       enum npu3_dev_type type);
>> +
>> +#define npu3_for_each_dev_type(dev, npu, type) \
>> +	for (dev = NULL; (dev = npu3_next_dev(npu, dev, type));)
>> +
>> +#define npu3_for_each_nvlink_dev(dev, npu) \
>> +	npu3_for_each_dev_type(dev, npu, NPU3_DEV_TYPE_NVLINK)
>> +
>> +#define npu3_for_each_dev(dev, npu) \
>> +	npu3_for_each_dev_type(dev, npu, NPU3_DEV_TYPE_ANY)
>> +
>> +struct npu3 *npu3_next_nvlink_npu(struct npu3 *npu, uint32_t chip_id);
>> +
>> +#define npu3_for_each_chip_nvlink_npu(npu, chip_id) \
>> +	for (npu = NULL; (npu = npu3_next_nvlink_npu(npu, chip_id));)
>> +
>> +#define NPU3_ANY_CHIP INT_MAX
>> +#define npu3_for_each_nvlink_npu(npu) \
>> +	npu3_for_each_chip_nvlink_npu(npu, NPU3_ANY_CHIP)
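
In case the iterator naming isn't obvious from the macros alone,
typical usage looks like:

	struct npu3 *npu;
	struct npu3_dev *dev;

	npu3_for_each_nvlink_npu(npu)
		npu3_for_each_nvlink_dev(dev, npu)
			prlog(PR_INFO, "npu %d link %d\n",
			      npu->index, dev->index);
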
>> +
>> +void npu3_init_nvlink(struct npu3 *npu);
>> +void npu3_dev_enable_bars(struct npu3_dev *dev, bool enable);
>> +int64_t npu3_dev_reset(struct npu3_dev *dev);
>> +
>> +uint32_t npu3_chip_possible_gpus(void);
>> +uint32_t npu3_dev_gpu_index(struct npu3_dev *dev);
>> +
>> +/* NPU RING register access */
>> +void npu3_write(struct npu3 *npu, uint64_t reg, uint64_t val);
>> +uint64_t npu3_read(struct npu3 *npu, uint64_t reg);
>> +void npu3_write_4b(struct npu3 *npu, uint64_t reg, uint32_t val);
>> +uint32_t npu3_read_4b(struct npu3 *npu, uint64_t reg);
>> +
>> +/* Link flags */
>> +#define NPU3_DEV_PCI_LINKED	0x1
>> +#define NPU3_DEV_DL_RESET	0x2
>> +
>> +void npu3_pvd_flag_set(struct npu3_dev *dev, uint8_t flag);
>> +void npu3_pvd_flag_clear(struct npu3_dev *dev, uint8_t flag);
>> +
>> +/* PHY procedures */
>> +#define NPU3_PROC_STATUS_MASK	0xc000000f
>> +#define NPU3_PROC_INPROGRESS	(1u << 31)
>> +#define NPU3_PROC_COMPLETE	(1u << 30)
>> +#define NPU3_PROC_NEXT		(1u << 29)
>> +#define NPU3_PROC_FAILED	2
>> +#define NPU3_PROC_ABORTED	3
>> +#define NPU3_PROC_UNSUPPORTED	4
>> +
>> +void npu3_dev_procedure_init(struct npu3_dev *dev, uint32_t pnum);
>> +uint32_t npu3_dev_procedure_status(struct npu3_dev *dev);
>> +
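
The real polling is driven from the host side through the virtual
device's config space, but as a sketch of how the status encoding is
meant to be consumed (the procedure number is made up):

	uint32_t status;

	npu3_dev_procedure_init(dev, 5);

	do {
		status = npu3_dev_procedure_status(dev);
	} while (status & NPU3_PROC_INPROGRESS);

	if ((status & NPU3_PROC_STATUS_MASK) != NPU3_PROC_COMPLETE)
		prlog(PR_ERR, "procedure failed: 0x%x\n", status);
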
>> +/* OPAL entry points */
>> +int64_t npu3_init_context(struct phb *phb, uint64_t msr, uint64_t bdf);
>> +int64_t npu3_destroy_context(struct phb *phb, uint64_t bdf);
>> +int64_t npu3_map_lpar(struct phb *phb, uint64_t bdf, uint64_t lparid,
>> +		      uint64_t lpcr);
>> +int64_t npu3_set_relaxed_order(struct phb *phb, uint32_t gcid, int pec,
>> +			       bool enable);
>> +
>> +#endif /* __NPU3_H */
>> diff --git a/include/pci.h b/include/pci.h
>> index 2b7a3c2893d5..ff29010cbb71 100644
>> --- a/include/pci.h
>> +++ b/include/pci.h
>> @@ -366,6 +366,7 @@ enum phb_type {
>>  	phb_type_pcie_v4,
>>  	phb_type_npu_v2,
>>  	phb_type_npu_v2_opencapi,
>> +	phb_type_npu_v3,
>>  };
>>
>>  struct phb {
>> diff --git a/include/platform.h b/include/platform.h
>> index 4f8627a3a680..0b728a50075e 100644
>> --- a/include/platform.h
>> +++ b/include/platform.h
>> @@ -23,6 +23,7 @@ struct pci_device;
>>  struct pci_slot;
>>  struct errorlog;
>>  struct npu2;
>> +struct npu3;
>>
>>  enum resource_id {
>>  	RESOURCE_ID_KERNEL,
>> @@ -94,8 +95,9 @@ struct platform {
>>  	/* OpenCAPI platform-specific I2C information */
>>  	const struct platform_ocapi *ocapi;
>>
>> -	/* NPU2 device detection */
>> +	/* NPU device detection */
>>  	void		(*npu2_device_detect)(struct npu2 *npu);
>> +	void		(*npu3_device_detect)(struct npu3 *npu);
>>
>>  	/*
>>  	 * Probe platform, return true on a match, called before
>> diff --git a/include/skiboot.h b/include/skiboot.h
>> index 1b3bacbe73f6..2eafb1118dea 100644
>> --- a/include/skiboot.h
>> +++ b/include/skiboot.h
>> @@ -208,6 +208,7 @@ extern int preload_capp_ucode(void);
>>  extern void preload_io_vpd(void);
>>  extern void probe_npu(void);
>>  extern void probe_npu2(void);
>> +extern void probe_npu3(void);
>>  extern void uart_init(void);
>>  extern void mbox_init(void);
>>  extern void early_uart_init(void);
>> diff --git a/include/xscom-p9-regs.h b/include/xscom-p9-regs.h
>> index 5137d91838d6..856a92d9ab4f 100644
>> --- a/include/xscom-p9-regs.h
>> +++ b/include/xscom-p9-regs.h
>> @@ -82,4 +82,23 @@
>>  #define EC_PPM_SPECIAL_WKUP_OCC		0x010C
>>  #define EC_PPM_SPECIAL_WKUP_HYP		0x010D
>>
>> +#define OB_BASE(ob)				(((ob) + 9) << 24)
>> +#define OB_CPLT_CONF1(ob)			(OB_BASE(ob) + 0x9)
>> +#define   OB_CPLT_CONF1_NV_IOVALID(brk)		PPC_BIT(6 + (brk))
>> +#define OB_INDIRECT(ob)				((OB_BASE(ob) + 0x10c3f) | PPC_BIT(0))
>> +
>> +/* PPE SRAM: Indirect address/data port */
>> +#define OB_PPE_CSAR(ob)				(OB_BASE(ob) + 0x1104d)
>> +#define   OB_PPE_CSAR_SRAM_ADDR			PPC_BITMASK(16, 28)
>> +#define OB_PPE_CSDR(ob)				(OB_BASE(ob) + 0x1104e)
>> +
>> +/* PPE SRAM: Indirect registers */
>> +#define OB_PPE_SALT_CMD				0x1fe6
>> +#define   OB_PPE_SALT_CMD_READY			PPC_BIT(0)
>> +#define   OB_PPE_SALT_CMD_RW			PPC_BIT(1)
>> +#define   OB_PPE_SALT_CMD_ERR			PPC_BIT(2)
>> +#define   OB_PPE_SALT_CMD_LINKNUM		PPC_BITMASK(15, 18)
>> +#define   OB_PPE_SALT_CMD_REG			PPC_BITMASK(19, 31)
>> +#define   OB_PPE_SALT_CMD_DATA			PPC_BITMASK(32, 63)
>> +
>>  #endif /* __XSCOM_P9_REGS_H__ */
>>
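One last note, on the PPE SRAM defines above: CSAR/CSDR are the usual
address/data indirection, so a read helper would look something like
the sketch below. Error handling is omitted, and whether the SRAM
address needs any shifting before landing in the CSAR field is an
assumption on my part.

	static uint64_t ob_ppe_sram_read(uint32_t chip_id, uint32_t ob,
					 uint64_t sram_addr)
	{
		uint64_t val;

		/* Latch the SRAM address, then read through the data port */
		xscom_write(chip_id, OB_PPE_CSAR(ob),
			    SETFIELD(OB_PPE_CSAR_SRAM_ADDR, 0ull,
				     sram_addr));
		xscom_read(chip_id, OB_PPE_CSDR(ob), &val);

		return val;
	}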

-- 
Reza Arbab


