[Skiboot] [PATCH 5/6] Nvlink: Add NPU PHB functions

Alistair Popple alistair at popple.id.au
Fri Oct 16 16:08:19 AEDT 2015


This patch adds support for the NPU NVLink PHB type. It provides
access to each NVLink in the system by exposing it as a PCIe device
under an NPU PHB. Each PCIe device has a configuration space
implemented in software which indicates the base address of the
DL/TL/PL registers required by the device drivers.
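
For example, a device driver can locate the NTL/DL register block for
a link by reading the emulated device's standard BAR0/BAR1 pair,
which together form a single 64-bit non-prefetchable memory BAR. A
minimal sketch using the Linux config accessors (illustrative only,
not part of this patch):

	u32 lo, hi;
	u64 ntl_base;

	pci_read_config_dword(pdev, PCI_BASE_ADDRESS_0, &lo);
	pci_read_config_dword(pdev, PCI_BASE_ADDRESS_1, &hi);
	ntl_base = ((u64)hi << 32) | (lo & 0xfffffff0);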

It also presents one LSI per device which is used to signal device
drivers of changes in device status. The configuration space also
includes a vendor specific capability which is used primarily by
device drivers to power on and train the IBM PHY.
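
For example, a driver might start PHY procedure 4 (phy_reset) and
poll it to completion through this capability. A rough sketch
(illustrative only and not part of this patch; "cap" is the config
space offset of the vendor capability, with the procedure status and
control registers at cap + 4 and cap + 8 respectively):

	u32 stat;

	/* Select and start the procedure */
	pci_write_config_dword(pdev, cap + 8, 4);

	/* Poll the status register: bit 31 = in progress, bit 30 =
	 * complete, and the low bits carry the error code */
	do {
		pci_read_config_dword(pdev, cap + 4, &stat);
	} while (stat & (1u << 31));

	if ((stat & 0xf) == 2)
		dev_err(&pdev->dev, "PHY reset failed\n");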

Signed-off-by: Gavin Shan <gwshan at linux.vnet.ibm.com>
Signed-off-by: Alistair Popple <alistair at popple.id.au>
---
 core/init.c            |    3 +
 hw/Makefile.inc        |    2 +-
 hw/npu-hw-procedures.c |  598 +++++++++++++++++
 hw/npu.c               | 1718 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/npu-regs.h     |  235 +++++++
 include/npu.h          |  211 ++++++
 include/skiboot.h      |    2 +-
 7 files changed, 2767 insertions(+), 2 deletions(-)
 create mode 100644 hw/npu-hw-procedures.c
 create mode 100644 hw/npu.c
 create mode 100644 include/npu-regs.h
 create mode 100644 include/npu.h

diff --git a/core/init.c b/core/init.c
index 7ae4dee..6d21b55 100644
--- a/core/init.c
+++ b/core/init.c
@@ -740,6 +740,9 @@ void __noreturn main_cpu_entry(const void *fdt, u32 master_cpu)
 	/* Probe PHB3 on P8 */
 	probe_phb3();
 
+	/* Probe NPUs */
+	probe_npu();
+
 	/* Initialize PCI */
 	pci_init_slots();
 
diff --git a/hw/Makefile.inc b/hw/Makefile.inc
index 034947c..6eacb74 100644
--- a/hw/Makefile.inc
+++ b/hw/Makefile.inc
@@ -6,7 +6,7 @@ HW_OBJS += homer.o slw.o occ.o fsi-master.o centaur.o
 HW_OBJS += nx.o nx-rng.o nx-crypto.o nx-842.o
 HW_OBJS += p7ioc.o p7ioc-inits.o p7ioc-phb.o p5ioc2.o p5ioc2-phb.o
 HW_OBJS += phb3.o sfc-ctrl.o fake-rtc.o bt.o p8-i2c.o prd.o
-HW_OBJS += dts.o lpc-rtc.o
+HW_OBJS += dts.o lpc-rtc.o npu.o npu-hw-procedures.o
 HW=hw/built-in.o
 
 include $(SRC)/hw/fsp/Makefile.inc
diff --git a/hw/npu-hw-procedures.c b/hw/npu-hw-procedures.c
new file mode 100644
index 0000000..118ed6d
--- /dev/null
+++ b/hw/npu-hw-procedures.c
@@ -0,0 +1,598 @@
+/* Copyright 2013-2015 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <skiboot.h>
+#include <io.h>
+#include <timebase.h>
+#include <pci.h>
+#include <interrupts.h>
+#include <lock.h>
+#include <npu-regs.h>
+#include <npu.h>
+#include <xscom.h>
+
+typedef uint32_t (*step)(struct npu_dev *);
+
+struct procedure {
+	const char *name;
+	step steps[];
+};
+
+#define DEFINE_PROCEDURE(NAME, STEPS...)		\
+	struct procedure procedure_##NAME =		\
+	{.name = #NAME, .steps = {NAME, ##STEPS}}
+
+#define PROCEDURE_INPROGRESS	(1 << 31)
+#define PROCEDURE_COMPLETE	(1 << 30)
+#define PROCEDURE_NEXT		(1 << 29)
+#define PROCEDURE_FAILED	2
+#define PROCEDURE_ABORTED	3
+#define PROCEDURE_UNSUPPORTED	4
+
+/* Mask defining which status bits we want to expose */
+#define PROCEDURE_STATUS_MASK	0xc000000f
+
+/* Accessors for PHY registers. These can be accessed either via MMIO
+ * or SCOM. */
+static bool pl_use_scom = true;
+static void phy_write(struct npu_dev *npu_dev, uint64_t addr, uint32_t val)
+{
+	if (pl_use_scom)
+		xscom_write(npu_dev->npu->chip_id, npu_dev->pl_xscom_base | addr, val);
+	else
+		out_be16((void *) npu_dev->pl_base + PL_MMIO_ADDR(addr), val);
+}
+
+static uint16_t phy_read(struct npu_dev *npu_dev, uint64_t addr)
+{
+	uint64_t val;
+
+	if (pl_use_scom)
+		xscom_read(npu_dev->npu->chip_id, npu_dev->pl_xscom_base | addr, &val);
+	else
+		val = in_be16((void *) npu_dev->pl_base + PL_MMIO_ADDR(addr));
+
+	return val & 0xffff;
+}
+
+/* The DL registers can be accessed indirectly via the NTL */
+static void dl_write(struct npu_dev *npu_dev, uint32_t addr, uint32_t val)
+{
+	xscom_write(npu_dev->npu->chip_id,
+		    npu_dev->xscom + NX_DL_REG_ADDR, addr);
+	xscom_write(npu_dev->npu->chip_id,
+		    npu_dev->xscom + NX_DL_REG_DATA, val);
+}
+
+static uint64_t __unused dl_read(struct npu_dev *npu_dev, uint32_t addr)
+{
+	uint64_t val;
+
+	xscom_write(npu_dev->npu->chip_id,
+		    npu_dev->xscom + NX_DL_REG_ADDR, addr);
+	xscom_read(npu_dev->npu->chip_id,
+		   npu_dev->xscom + NX_DL_REG_DATA, &val);
+	return val;
+}
+
+/* Our hardware bits are backwards here. The lane vectors are 16-bit
+ * values represented in IBM bit ordering. This means lane 0 is
+ * represented by bit 15 in most of the registers. Internally we keep
+ * the numbering sane (ie. bit 0 of npu_dev->lane_mask == lane 0) as
+ * we need sane numbering for set_lane_reg() anyway. */
+static uint32_t phy_lane_mask(struct npu_dev *npu_dev)
+{
+	/* We only train 8 lanes at a time so we don't do a full
+	 * bit-swap */
+	assert(npu_dev->lane_mask == 0xff00 || npu_dev->lane_mask == 0xff);
+
+	return ~npu_dev->lane_mask & 0xffff;
+}
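+
+/* As an illustration, a lane_mask of 0x00ff (lanes 0-7) becomes the
+ * hardware lane vector 0xff00, placing lane 0 at bit 15 as described
+ * above. */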
+
+static void set_lane_reg(struct npu_dev *npu_dev, uint64_t base_reg,
+			 uint64_t data, uint64_t mask)
+{
+	uint64_t val, i;
+	uint32_t lane_mask = npu_dev->lane_mask;
+
+	for (i = 0; i <= 23; i++) {
+		if (lane_mask & (1ul << i)) {
+			uint64_t tx_rxcal_reg = base_reg + (i << 32);
+			val = phy_read(npu_dev, tx_rxcal_reg);
+			val = (val & ~mask) | data;
+			phy_write(npu_dev, tx_rxcal_reg, val);
+		}
+	}
+}
+
+static uint32_t stop(struct npu_dev *npu_dev __unused)
+{
+	return PROCEDURE_COMPLETE | PROCEDURE_ABORTED;
+}
+DEFINE_PROCEDURE(stop);
+
+static uint32_t nop(struct npu_dev *npu_dev __unused)
+{
+	return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(nop);
+
+/* Procedure 1.2.1 (RESET_NPU_DL) from opt_programmerguide.odt. Also
+ * incorporates AT reset. */
+static uint32_t reset_npu_dl(struct npu_dev *npu_dev)
+{
+	void *ntl_base = (void *) npu_dev->bar.base;
+	uint64_t val;
+
+	/* Assert NPU reset */
+	val = in_be64(ntl_base + NTL_CONTROL);
+	val |= NTL_CONTROL_RESET;
+	out_be64(ntl_base + NTL_CONTROL, val);
+
+	/* Put the Nvidia logic in reset */
+	dl_write(npu_dev, NDL_CONTROL, 0xe8000000);
+
+	/* Release Nvidia logic from reset */
+	dl_write(npu_dev, NDL_CONTROL, 0);
+
+	/* Release NPU from reset */
+	val &= ~NTL_CONTROL_RESET;
+	out_be64(ntl_base + NTL_CONTROL, val);
+
+	/* Set up TL credits */
+	out_be64(ntl_base + TL_CMD_CR, PPC_BIT(0));
+	out_be64(ntl_base + TL_CMD_D_CR, PPC_BIT(0));
+	out_be64(ntl_base + TL_RSP_CR, PPC_BIT(15));
+	out_be64(ntl_base + TL_RSP_D_CR, PPC_BIT(15));
+
+	return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(reset_npu_dl);
+
+/* Procedures 1.2.3 (reset_lanes) & 1.2.4
+ * (io_register_write_reset_values) */
+static uint32_t phy_reset(struct npu_dev *npu_dev)
+{
+	uint16_t val;
+
+	/* Lower run_lane inputs for lanes to be reset */
+	val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15);
+	val &= ~phy_lane_mask(npu_dev);
+	phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val);
+
+	return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_reset_wait(struct npu_dev *npu_dev)
+{
+	uint16_t val;
+
+	/* Wait for lane busy outputs to go to zero for lanes to be
+	 * reset */
+	val = phy_read(npu_dev, RX_LANE_BUSY_VEC_0_15);
+	if (val & phy_lane_mask(npu_dev))
+		return PROCEDURE_INPROGRESS;
+
+	return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_reset_complete(struct npu_dev *npu_dev)
+{
+	uint16_t val;
+	uint32_t lane_mask = phy_lane_mask(npu_dev);
+
+	/* Set ioreset_vec for the desired lanes bit positions */
+	val = phy_read(npu_dev, RX_IORESET_VEC_0_15);
+	phy_write(npu_dev, RX_IORESET_VEC_0_15, val | lane_mask);
+
+	val = phy_read(npu_dev, TX_IORESET_VEC_0_15);
+	phy_write(npu_dev, TX_IORESET_VEC_0_15, val | lane_mask);
+
+	/* Clear ioreset_vec */
+	val = phy_read(npu_dev, RX_IORESET_VEC_0_15);
+	phy_write(npu_dev, RX_IORESET_VEC_0_15, val & ~lane_mask);
+
+	val = phy_read(npu_dev, TX_IORESET_VEC_0_15);
+	phy_write(npu_dev, TX_IORESET_VEC_0_15, val & ~lane_mask);
+
+	/* Reset RX phase rotators */
+	set_lane_reg(npu_dev, RX_PR_CNTL_PL, RX_PR_RESET, RX_PR_RESET);
+	set_lane_reg(npu_dev, RX_PR_CNTL_PL, 0, RX_PR_RESET);
+
+	/* Restore registers from scominit that may have changed */
+	set_lane_reg(npu_dev, RX_PR_MODE, 0x8, RX_PR_PHASE_STEP);
+	set_lane_reg(npu_dev, RX_A_DAC_CNTL,
+		     0x7 << MASK_TO_LSH(RX_PR_IQ_RES_SEL),
+		     RX_PR_IQ_RES_SEL);
+	set_lane_reg(npu_dev, TX_MODE1_PL, 0, TX_LANE_PDWN);
+	set_lane_reg(npu_dev, RX_BANK_CONTROLS, 0, RX_LANE_ANA_PDWN);
+	set_lane_reg(npu_dev, RX_MODE, 0, RX_LANE_DIG_PDWN);
+
+	return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_reset, phy_reset_wait, phy_reset_complete);
+
+/* Round a fixed-point decimal number. frac is the number of
+ * fractional bits */
+static uint32_t round(uint32_t val, int frac)
+{
+	if (val >> (frac - 1) & 0x1)
+		return (val >> frac) + 1;
+	else
+		return val >> frac;
+}
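+
+/* For illustration, with frac = 9: round(768, 9) = 2 since 768/512 =
+ * 1.5 rounds up, while round(767, 9) = 1. */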
+
+#define ZCAL_MIN	(10 << 3)
+#define ZCAL_MAX	(40 << 3)
+#define ZCAL_K0		0x0
+#define ZCAL_M		128
+/* TODO: add a test case for the following values:
+
+   Initial values:
+     zcal_n = 0xda;
+     zcal_p = 0xc7;
+
+   Results:
+   	pre_p = 0x0
+	pre_n = 0x0
+	margin_p = 0x0
+	margin_n = 0x0
+	total_en_p = 0x32
+	total_en_n = 0x37
+ */
+
+static uint32_t phy_tx_zcal(struct npu_dev *npu_dev)
+{
+	uint64_t val;
+
+	if (npu_dev->index < 2 && npu_dev->npu->tx_zcal_complete[0])
+		return PROCEDURE_COMPLETE;
+
+	if (npu_dev->index >= 2 && npu_dev->npu->tx_zcal_complete[1])
+		return PROCEDURE_COMPLETE;
+
+	/* Start calibration */
+	val = phy_read(npu_dev, TX_IMPCAL_SWO1_PB);
+	val &= TX_ZCAL_SWO_EN;
+	phy_write(npu_dev, TX_IMPCAL_SWO1_PB, val);
+	phy_write(npu_dev, TX_IMPCAL_SWO2_PB, 0x50 << 2);
+	val = phy_read(npu_dev, TX_IMPCAL_PB);
+	val |= TX_ZCAL_REQ;
+	phy_write(npu_dev, TX_IMPCAL_PB, val);
+
+	return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_tx_zcal_wait(struct npu_dev *npu_dev)
+{
+	uint64_t val;
+
+	val = phy_read(npu_dev, TX_IMPCAL_PB);
+	if (!(val & TX_ZCAL_DONE))
+		return PROCEDURE_INPROGRESS;
+
+	if (val & TX_ZCAL_ERROR)
+		return PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+
+	return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_tx_zcal_calculate(struct npu_dev *npu_dev)
+{
+	uint64_t val;
+	uint64_t zcal_n;
+	uint64_t zcal_p;
+	uint64_t margin_n;
+	uint64_t margin_p;
+	uint64_t pre_n;
+	uint64_t pre_p;
+	uint64_t total_en_n;
+	uint64_t total_en_p;
+
+	val = phy_read(npu_dev, TX_IMPCAL_NVAL_PB);
+	zcal_n = GETFIELD(TX_ZCAL_N, val);
+	val = phy_read(npu_dev, TX_IMPCAL_PVAL_PB);
+	zcal_p = GETFIELD(TX_ZCAL_P, val);
+
+	if ((zcal_n < ZCAL_MIN) || (zcal_n > ZCAL_MAX) ||
+	    (zcal_p < ZCAL_MIN) || (zcal_p > ZCAL_MAX))
+		return PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+
+	margin_n = (0x80 - ZCAL_M) * zcal_n / 2;
+	margin_p = (0x80 - ZCAL_M) * zcal_p / 2;
+	pre_n = (((0x80 * zcal_n) - (2 * margin_n)) * ZCAL_K0) / 0x80;
+	pre_p = (((0x80 * zcal_p) - (2 * margin_p)) * ZCAL_K0) / 0x80;
+
+	total_en_n = 0x80 * zcal_n - (2 * margin_n) - (pre_n & 1023);
+	total_en_p = 0x80 * zcal_p - (2 * margin_p) - (pre_p & 1023);
+
+	pre_p = round(pre_p, 9);
+	pre_n = round(pre_n, 9);
+	margin_p = round(margin_p, 9);
+	margin_n = round(margin_n, 9);
+	total_en_p = round(total_en_p, 9);
+	total_en_n = round(total_en_n, 9);
+
+	val = SETFIELD(TX_FFE_TOTAL_ENABLE_N_ENC, 0, total_en_n);
+	val = SETFIELD(TX_FFE_TOTAL_ENABLE_P_ENC, val, total_en_p);
+	phy_write(npu_dev, TX_FFE_TOTAL_2RSTEP_EN, val);
+
+	val = SETFIELD(TX_FFE_PRE_N_SEL_ENC, 0, pre_n);
+	val = SETFIELD(TX_FFE_PRE_P_SEL_ENC, val, pre_p);
+	phy_write(npu_dev, TX_FFE_PRE_2RSTEP_SEL, val);
+
+	val = SETFIELD(TX_FFE_MARGIN_PD_N_SEL_ENC, 0, margin_n);
+	val = SETFIELD(TX_FFE_MARGIN_PU_P_SEL_ENC, val, margin_p);
+	phy_write(npu_dev, TX_FFE_MARGIN_2RSTEP_SEL, val);
+
+	if (npu_dev->index < 2)
+		npu_dev->npu->tx_zcal_complete[0] = true;
+	else
+		npu_dev->npu->tx_zcal_complete[1] = true;
+
+	return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_tx_zcal, phy_tx_zcal_wait, phy_tx_zcal_calculate);
+
+static uint32_t phy_enable_tx_rxcal(struct npu_dev *npu_dev)
+{
+	/* Turn common mode on */
+	set_lane_reg(npu_dev, TX_MODE2_PL, TX_RXCAL, TX_RXCAL);
+
+	return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_enable_tx_rxcal);
+
+static uint32_t phy_disable_tx_rxcal(struct npu_dev *npu_dev)
+{
+	/* Turn common mode off */
+	set_lane_reg(npu_dev, TX_MODE2_PL, 0, TX_RXCAL);
+
+	return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_disable_tx_rxcal);
+
+static uint32_t phy_rx_dccal(struct npu_dev *npu_dev)
+{
+	if (phy_read(npu_dev, RX_LANE_BUSY_VEC_0_15)
+	    & ~phy_read(npu_dev, RX_INIT_DONE_VEC_0_15))
+		return PROCEDURE_INPROGRESS;
+
+	return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_rx_dccal_start(struct npu_dev *npu_dev)
+{
+	uint64_t val;
+
+	/* Save EO step control */
+	val = phy_read(npu_dev, RX_EO_STEP_CNTL_PG);
+	npu_dev->procedure_data = val;
+
+	phy_write(npu_dev, RX_EO_STEP_CNTL_PG,
+		  RX_EO_ENABLE_LATCH_OFFSET_CAL
+		  | RX_EO_ENABLE_CM_COARSE_CAL);
+
+	val = phy_read(npu_dev, RX_RECAL_ABORT_VEC_0_15);
+	val |= phy_lane_mask(npu_dev);
+	phy_write(npu_dev, RX_RECAL_ABORT_VEC_0_15, val);
+
+	val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15);
+	val |= phy_lane_mask(npu_dev);
+	phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val);
+
+	return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_rx_dccal_complete(struct npu_dev *npu_dev)
+{
+	/* Poll for completion on relevant lanes */
+	if ((phy_read(npu_dev, RX_INIT_DONE_VEC_0_15) & phy_lane_mask(npu_dev))
+	    != phy_lane_mask(npu_dev))
+		return PROCEDURE_INPROGRESS;
+
+	return PROCEDURE_NEXT;
+}
+
+static uint32_t phy_rx_dccal_fifo_init(struct npu_dev *npu_dev)
+{
+	uint64_t val;
+
+	val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15);
+	val &= ~phy_lane_mask(npu_dev);
+	phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val);
+
+	/* Turn off recal abort */
+	val = phy_read(npu_dev, RX_RECAL_ABORT_VEC_0_15);
+	val &= ~phy_lane_mask(npu_dev);
+	phy_write(npu_dev, RX_RECAL_ABORT_VEC_0_15, val);
+
+	/* Restore original settings */
+	phy_write(npu_dev, RX_EO_STEP_CNTL_PG, npu_dev->procedure_data);
+
+	/* FIFO Init */
+	set_lane_reg(npu_dev, TX_MODE2_PL, 0, TX_UNLOAD_CLK_DISABLE);
+	set_lane_reg(npu_dev, TX_CNTL_STAT2, TX_FIFO_INIT, TX_FIFO_INIT);
+	set_lane_reg(npu_dev, TX_MODE2_PL, TX_UNLOAD_CLK_DISABLE,
+		     TX_UNLOAD_CLK_DISABLE);
+
+	return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_rx_dccal, phy_rx_dccal_start, phy_rx_dccal_complete,
+		 phy_rx_dccal_fifo_init);
+
+static uint32_t phy_rx_training(struct npu_dev *npu_dev)
+{
+	uint16_t val;
+
+	if (!npu_dev->procedure_data) {
+		val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15);
+		val |= phy_lane_mask(npu_dev);
+		phy_write(npu_dev, RX_RUN_LANE_VEC_0_15, val);
+	}
+
+	npu_dev->procedure_data++;
+	if (npu_dev->procedure_data >= 1000000)
+		return PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+
+	val = phy_read(npu_dev, RX_RUN_LANE_VEC_0_15);
+	if ((val & phy_lane_mask(npu_dev)) != phy_lane_mask(npu_dev))
+		return PROCEDURE_INPROGRESS;
+
+	return PROCEDURE_COMPLETE;
+}
+DEFINE_PROCEDURE(phy_rx_training);
+
+static struct procedure *npu_procedures[] = {
+	&procedure_stop,
+	&procedure_nop,
+	NULL,
+	NULL,
+	&procedure_phy_reset,
+	&procedure_phy_tx_zcal,
+	&procedure_phy_rx_dccal,
+	&procedure_phy_enable_tx_rxcal,
+	&procedure_phy_disable_tx_rxcal,
+	&procedure_phy_rx_training,
+	&procedure_reset_npu_dl,
+
+	/* Placeholders for pre-terminate and terminate procedures */
+	&procedure_nop,
+	&procedure_nop};
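+
+/* Note the procedure number written by the device driver indexes
+ * directly into the table above, eg. writing 10 selects
+ * procedure_reset_npu_dl. */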
+
+/* Run one or more procedure steps and return the status */
+static uint32_t get_procedure_status(struct npu_dev *dev)
+{
+	uint32_t result;
+	uint16_t procedure = dev->procedure_number;
+	uint16_t step = dev->procedure_step;
+	const char *name = npu_procedures[procedure]->name;
+
+	do {
+		result = npu_procedures[procedure]->steps[step](dev);
+
+		if (result & PROCEDURE_NEXT) {
+			step++;
+			NPUDEVINF(dev, "Running procedure %s step %d\n", name, step);
+		}
+	} while (result & PROCEDURE_NEXT);
+
+	dev->procedure_step = step;
+
+	if (result & PROCEDURE_COMPLETE)
+		NPUDEVINF(dev, "Procedure %s complete\n", name);
+	else if (mftb() > dev->procedure_tb + msecs_to_tb(100)) {
+		NPUDEVINF(dev, "Procedure %s timed out\n", name);
+		result = PROCEDURE_COMPLETE | PROCEDURE_FAILED;
+	}
+
+	/* Mask off internal state bits */
+	dev->procedure_status = result & PROCEDURE_STATUS_MASK;
+
+	return dev->procedure_status;
+}
+
+int64_t npu_dev_procedure_read(struct npu_dev_trap *trap,
+				      uint32_t offset,
+				      uint32_t size,
+				      uint32_t *data)
+{
+	struct npu_dev *dev = trap->dev;
+	int64_t rc = OPAL_SUCCESS;
+
+	if (size != 4) {
+		/* Short config reads are not supported */
+		NPUDEVERR(dev, "Short read of procedure register\n");
+		return OPAL_PARAMETER;
+	}
+
+	offset -= trap->start;
+	*data = 0;
+
+	switch (offset) {
+	case 0:
+		/* Only run the procedure if not already complete */
+		if (dev->procedure_status & PROCEDURE_COMPLETE)
+			*data = dev->procedure_status;
+		else
+			*data = get_procedure_status(dev);
+
+		break;
+
+	case 4:
+		*data = dev->procedure_number;
+		break;
+
+	default:
+		NPUDEVERR(dev, "Invalid vendor specific offset 0x%08x\n",
+			  offset);
+		rc = OPAL_PARAMETER;
+	}
+
+	return rc;
+}
+
+int64_t npu_dev_procedure_write(struct npu_dev_trap *trap,
+				      uint32_t offset,
+				      uint32_t size,
+				      uint32_t data)
+{
+	struct npu_dev *dev = trap->dev;
+	const char *name;
+	int64_t rc = OPAL_SUCCESS;
+
+	if (size != 4) {
+		/* Short config writes are not supported */
+		NPUDEVERR(dev, "Short read of procedure register\n");
+		return OPAL_PARAMETER;
+	}
+
+	offset -= trap->start;
+
+	switch (offset) {
+	case 0:
+		/* We ignore writes to the status register */
+		NPUDEVINF(dev, "Ignoring writes to status register\n");
+		break;
+
+	case 4:
+		if (data >= ARRAY_SIZE(npu_procedures) ||
+		    !npu_procedures[data]) {
+			NPUDEVINF(dev, "Unsupported procedure number %d\n", data);
+			dev->procedure_status = PROCEDURE_COMPLETE
+				| PROCEDURE_UNSUPPORTED;
+			break;
+		}
+
+		name = npu_procedures[data]->name;
+		if (dev->procedure_number == data
+		    && !(dev->procedure_status & PROCEDURE_COMPLETE))
+			NPUDEVINF(dev, "Restarting procuedure %s\n", name);
+		else
+			NPUDEVINF(dev, "Starting procedure %s\n", name);
+
+		dev->procedure_status = PROCEDURE_INPROGRESS;
+		dev->procedure_number = data;
+		dev->procedure_step = 0;
+		dev->procedure_data = 0;
+		dev->procedure_tb = mftb();
+		break;
+
+	default:
+		NPUDEVINF(dev, "Invalid vendor specific offset 0x%08x\n", offset);
+		rc = OPAL_PARAMETER;
+	}
+
+	return rc;
+}
diff --git a/hw/npu.c b/hw/npu.c
new file mode 100644
index 0000000..c9bc12b
--- /dev/null
+++ b/hw/npu.c
@@ -0,0 +1,1718 @@
+/* Copyright 2013-2015 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <skiboot.h>
+#include <io.h>
+#include <timebase.h>
+#include <pci.h>
+#include <pci-cfg.h>
+#include <interrupts.h>
+#include <opal.h>
+#include <opal-api.h>
+#include <cpu.h>
+#include <device.h>
+#include <ccan/str/str.h>
+#include <ccan/array_size/array_size.h>
+#include <affinity.h>
+#include <npu-regs.h>
+#include <npu.h>
+#include <lock.h>
+#include <xscom.h>
+
+/*
+ * Terminology:
+ *
+ *  Brick - A group of either 8 TX or 8 RX lanes
+ *  Link - A group of 8 TX and 8 RX lanes
+ *
+ * Each link is represented in system software as an emulated PCI
+ * device. Garrison has two chips each with 4 links, therefore there
+ * are 8 emulated PCI devices in total.
+ *
+ *  +----------------------------------------------------------------+
+ *  |              PBCQ3 (SCOM Base Address 0x2012c00)               |
+ *  |               PHB3 (SCOM Base Address 0x9012c00)               |
+ *  +----------------------------------------------------------------+
+ *                          ||||||||  ||||||||
+ *                          ||||||||  ||||||||
+ *                          ||||||||  ||||||||
+ *                          ||||||||  ||||||||
+ *  +----------------------------------------------------------------+
+ *  |                             PCIe x8                            |
+ *  +----------------------------------------------------------------+
+ *  |                               GPU0                             |
+ *  +--------------------------------+-------------------------------+
+ *  |           NV Link 1            |           NV Link 0           |
+ *  +---------------+----------------+---------------+---------------+
+ *  |      RX       |      TX        |      RX       |      TX       |
+ *  +---------------+----------------+---------------+---------------+
+ *      ||||||||        ||||||||         ||||||||        ||||||||
+ *      ||||||||        ||||||||         ||||||||        ||||||||
+ *      ||||||||        ||||||||         ||||||||        ||||||||
+ *      ||||||||        ||||||||         ||||||||        ||||||||
+ *  +---------------+----------------+---------------+---------------+
+ *  |      TX       |      RX        |      TX       |      RX       |
+ *  +---------------+----------------+---------------+---------------+
+ *  |           Lanes [0:7]         PHY 0       Lanes [8:15]         |
+ *  |               SCOM Base Address 0x8000080008010c3f             |
+ *  +--------------------------------+-------------------------------+
+ *  |          Link 0 NDL/NTL        |         Link 1 NTL/NDL        |
+ *  |   SCOM Base Address 0x8013c00  |  SCOM Base Address 0x8013c40  |
+ *  +--------------------------------+-------------------------------+
+ *  |                                                                |
+ *  |          Address Translation/AT (shared for all links)         |
+ *  |                 SCOM Base Address 0x8013d80                    |
+ *  |                                                                |
+ *  +--------------------------------+-------------------------------+
+ *  |          Link 4 NDL/NTL        |         Link 5 NTL/NDL        |
+ *  |   SCOM Base Address 0x8013d00  |  SCOM Base Address 0x8013d40  |
+ *  +--------------------------------+-------------------------------+
+ *  |           Lanes [8:15]        PHY 1       Lanes [0:7]          |
+ *  |               SCOM Base Address 0x8000080008010c7f             |
+ *  +---------------+----------------+---------------+---------------+
+ *  |      TX       |      RX        |      TX       |      RX       |
+ *  +---------------+----------------+---------------+---------------+
+ *      ||||||||        ||||||||         ||||||||        ||||||||
+ *      ||||||||        ||||||||         ||||||||        ||||||||
+ *      ||||||||        ||||||||         ||||||||        ||||||||
+ *      ||||||||        ||||||||         ||||||||        ||||||||
+ *  +---------------+----------------+---------------+---------------+
+ *  |      RX       |      TX        |      RX       |      TX       |
+ *  +---------------+----------------+---------------+---------------+
+ *  |           NV Link 2            |           NV Link 3           |
+ *  +--------------------------------+-------------------------------+
+ *  |                               GPU1                             |
+ *  +----------------------------------------------------------------+
+ *  |                             PCIe x8                            |
+ *  +----------------------------------------------------------------+
+ *                          ||||||||  ||||||||
+ *                          ||||||||  ||||||||
+ *                          ||||||||  ||||||||
+ *                          ||||||||  ||||||||
+ *  +----------------------------------------------------------------+
+ *  |               PHB2 (SCOM Base Address 0x9012800)               |
+ *  |              PBCQ2 (SCOM Base Address 0x2012800)               |
+ *  +----------------------------------------------------------------+
+ *
+ */
+
+static struct npu_dev_cap *npu_dev_find_capability(struct npu_dev *dev,
+						   uint16_t id);
+
+/* PCI config raw accessors */
+#define NPU_DEV_CFG_NORMAL_RD(d, o, s, v)	\
+	npu_dev_cfg_read_raw(d, NPU_DEV_CFG_NORMAL, o, s, v)
+#define NPU_DEV_CFG_NORMAL_WR(d, o, s, v)	\
+	npu_dev_cfg_write_raw(d, NPU_DEV_CFG_NORMAL, o, s, v)
+#define NPU_DEV_CFG_RDONLY_RD(d, o, s, v)	\
+	npu_dev_cfg_read_raw(d, NPU_DEV_CFG_RDONLY, o, s, v)
+#define NPU_DEV_CFG_RDONLY_WR(d, o, s, v)	\
+	npu_dev_cfg_write_raw(d, NPU_DEV_CFG_RDONLY, o, s, v)
+#define NPU_DEV_CFG_W1CLR_RD(d, o, s, v)		\
+	npu_dev_cfg_read_raw(d, NPU_DEV_CFG_W1CLR, o, s, v)
+#define NPU_DEV_CFG_W1CLR_WR(d, o, s, v)		\
+	npu_dev_cfg_write_raw(d, NPU_DEV_CFG_W1CLR, o, s, v)
+
+#define NPU_DEV_CFG_INIT(d, o, s, v, ro, w1)		\
+	do {						\
+		NPU_DEV_CFG_NORMAL_WR(d, o, s, v);	\
+		NPU_DEV_CFG_RDONLY_WR(d, o, s, ro);	\
+		NPU_DEV_CFG_W1CLR_WR(d, o, s, w1);	\
+	} while(0)
+
+#define NPU_DEV_CFG_INIT_RO(d, o, s, v)			\
+	NPU_DEV_CFG_INIT(d, o, s, v, 0xffffffff, 0)
+
+static void npu_dev_cfg_read_raw(struct npu_dev *dev,
+				 uint32_t index,
+				 uint32_t offset,
+				 uint32_t size,
+				 uint32_t *val)
+{
+	uint8_t *pcfg = dev->config[index];
+	uint32_t r, t, i;
+
+	r = 0;
+	for (i = 0; i < size; i++) {
+		t = pcfg[offset + i];
+		r |= (t << (i * 8));
+	}
+
+	*val = r;
+}
+
+static void npu_dev_cfg_write_raw(struct npu_dev *dev,
+				  uint32_t index,
+				  uint32_t offset,
+				  uint32_t size,
+				  uint32_t val)
+{
+	uint8_t *pcfg = dev->config[index];
+	uint32_t i;
+
+	for (i = offset; i < (offset + size); i++) {
+		pcfg[i] = val;
+		val = (val >> 8);
+	}
+}
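+
+/* The config space is little-endian, eg. npu_dev_cfg_write_raw(dev,
+ * NPU_DEV_CFG_NORMAL, 0, 4, 0x12345678) stores the bytes 0x78 0x56
+ * 0x34 0x12 at offsets 0 through 3. */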
+
+/* Returns the scom base for the given link index */
+static uint64_t npu_link_scom_base(struct dt_node *dn, uint32_t scom_base,
+				   int index)
+{
+	struct dt_node *link;
+	uint32_t link_index;
+	char namebuf[32];
+
+	snprintf(namebuf, sizeof(namebuf), "link@%x", index);
+	link = dt_find_by_name(dn, namebuf);
+	assert(link);
+	link_index = dt_prop_get_u32(link, "ibm,npu-link-index");
+	return scom_base + (link_index * NPU_LINK_SIZE);
+}
+
+static uint64_t get_bar_size(uint64_t bar)
+{
+	return (1 << GETFIELD(NX_MMIO_BAR_SIZE, bar)) * 0x10000;
+}
+
+static void npu_lock(struct phb *phb)
+{
+	struct npu *p = phb_to_npu(phb);
+
+	lock(&p->lock);
+}
+
+static void npu_unlock(struct phb *phb)
+{
+	struct npu *p = phb_to_npu(phb);
+
+	unlock(&p->lock);
+}
+
+/* Update the link BARs to reflect changes to the device BAR */
+static void npu_dev_bar_update(uint32_t gcid, struct npu_dev_bar *bar,
+			       bool enable)
+{
+	uint64_t val;
+
+	if (!bar->xscom)
+		return;
+
+	val = bar->base;
+	val = SETFIELD(NX_MMIO_BAR_SIZE, val, ilog2(bar->size / 0x10000));
+	if (enable)
+		val |= NX_MMIO_BAR_ENABLE;
+	xscom_write(gcid, bar->xscom, val);
+}
+
+/* Trap for PCI command (0x4) to enable or disable the device's BARs */
+static int64_t npu_dev_cfg_write_cmd(struct npu_dev_trap *trap,
+				     uint32_t offset,
+				     uint32_t size,
+				     uint32_t data)
+{
+	struct npu_dev *dev = trap->dev;
+	bool enable;
+
+	if (offset != PCI_CFG_CMD)
+		return OPAL_PARAMETER;
+	if (size != 1 && size != 2 && size != 4)
+		return OPAL_PARAMETER;
+
+	/* Update the device BARs; the link BARs will be synchronized
+	 * with the hardware automatically.
+	 */
+	enable = !!(data & PCI_CFG_CMD_MEM_EN);
+	npu_dev_bar_update(dev->npu->chip_id, &dev->bar, enable);
+
+	/* Normal path to update PCI config buffer */
+	return OPAL_PARAMETER;
+}
+
+/*
+ * Trap for memory BARs: 0xff's are written to the BAR register
+ * prior to reading back its size.
+ */
+static int64_t npu_dev_cfg_read_bar(struct npu_dev_trap *trap,
+				    uint32_t offset,
+				    uint32_t size,
+				    uint32_t *data)
+{
+	struct npu_dev_bar *bar = trap->data;
+
+	/* Revert to normal path if we weren't trapped for BAR size */
+	if (!bar->trapped)
+		return OPAL_PARAMETER;
+
+	if (offset != trap->start &&
+	    offset != trap->start + 4)
+		return OPAL_PARAMETER;
+	if (size != 4)
+		return OPAL_PARAMETER;
+
+	bar->trapped = false;
+	*data = bar->bar_sz;
+	return OPAL_SUCCESS;
+}
+
+static int64_t npu_dev_cfg_write_bar(struct npu_dev_trap *trap,
+				     uint32_t offset,
+				     uint32_t size,
+				     uint32_t data)
+{
+	struct npu_dev_bar *bar = trap->data;
+	struct npu_dev *dev = container_of(bar, struct npu_dev, bar);
+	uint32_t pci_cmd;
+
+	if (offset != trap->start &&
+	    offset != trap->start + 4)
+		return OPAL_PARAMETER;
+	if (size != 4)
+		return OPAL_PARAMETER;
+
+	/* Return BAR size on next read */
+	if (data == 0xffffffff) {
+		bar->trapped = true;
+		if (offset == trap->start)
+			bar->bar_sz = (bar->size & 0xffffffff);
+		else
+			bar->bar_sz = (bar->size >> 32);
+
+		return OPAL_SUCCESS;
+	}
+
+	/* Update BAR base address */
+	if (offset == trap->start) {
+		bar->base &= 0xffffffff00000000;
+		bar->base |= (data & 0xfffffff0);
+	} else {
+		bar->base &= 0x00000000ffffffff;
+		bar->base |= ((uint64_t)data << 32);
+
+		NPU_DEV_CFG_NORMAL_RD(dev, PCI_CFG_CMD, 4, &pci_cmd);
+		npu_dev_bar_update(dev->npu->chip_id, bar,
+				   !!(pci_cmd & PCI_CFG_CMD_MEM_EN));
+	}
+
+	/* We still depend on the normal path to update the
+	 * cached config buffer.
+	 */
+	return OPAL_PARAMETER;
+}
+
+static struct npu_dev *bdfn_to_npu_dev(struct npu *p, uint32_t bdfn)
+{
+	int i;
+
+	/* Sanity check */
+	if (bdfn & ~0xff)
+		return NULL;
+
+	for (i = 0; i < p->total_devices; i++) {
+		if (p->devices[i].bdfn == bdfn)
+			return &p->devices[i];
+	}
+
+	return NULL;
+}
+
+static struct npu_dev *npu_dev_cfg_check(struct npu *p,
+					 uint32_t bdfn,
+					 uint32_t offset,
+					 uint32_t size)
+{
+	/* Sanity check */
+	if (offset >= NPU_DEV_CFG_SIZE)
+		return NULL;
+	if (offset & (size - 1))
+		return NULL;
+
+	return bdfn_to_npu_dev(p, bdfn);
+}
+
+static struct npu_dev_trap *npu_dev_trap_check(struct npu_dev *dev,
+					       uint32_t offset,
+					       uint32_t size,
+					       bool read)
+{
+	struct npu_dev_trap *trap;
+
+	list_for_each(&dev->traps, trap, link) {
+		if (read && !trap->read)
+			continue;
+		if (!read && !trap->write)
+			continue;
+
+		/* If the requested region overlaps with the one
+		 * specified by the trap, pick the trap and let it
+		 * handle the request
+		 */
+		if (offset <= trap->end &&
+		    (offset + size - 1) >= trap->start)
+			return trap;
+	}
+
+	return NULL;
+}
+
+static int64_t _npu_dev_cfg_read(struct phb *phb, uint32_t bdfn,
+				uint32_t offset, uint32_t *data,
+				size_t size)
+{
+	struct npu *p = phb_to_npu(phb);
+	struct npu_dev *dev;
+	struct npu_dev_trap *trap;
+	int64_t ret;
+
+	/* Data returned upon errors */
+	*data = 0xffffffff;
+
+	/* Retrieve NPU device */
+	dev = npu_dev_cfg_check(p, bdfn, offset, size);
+	if (!dev)
+		return OPAL_PARAMETER;
+
+	/* Retrieve trap */
+	trap = npu_dev_trap_check(dev, offset, size, true);
+	if (trap) {
+		ret = trap->read(trap, offset,
+				 size, (uint32_t *)data);
+		if (ret == OPAL_SUCCESS)
+			return ret;
+	}
+
+	NPU_DEV_CFG_NORMAL_RD(dev, offset, size, data);
+
+	return OPAL_SUCCESS;
+}
+
+#define NPU_DEV_CFG_READ(size, type)					\
+static int64_t npu_dev_cfg_read##size(struct phb *phb, uint32_t bdfn,	\
+				      uint32_t offset, type *data)	\
+{									\
+	int64_t rc;							\
+	uint32_t val;							\
+									\
+	/* Data returned upon errors */					\
+	rc = _npu_dev_cfg_read(phb, bdfn, offset, &val, sizeof(*data));	\
+	*data = (type)val;						\
+	return rc;							\
+}
+
+static int64_t _npu_dev_cfg_write(struct phb *phb, uint32_t bdfn,
+				  uint32_t offset, uint32_t data,
+				  size_t size)
+{
+	struct npu *p = phb_to_npu(phb);
+	struct npu_dev *dev;
+	struct npu_dev_trap *trap;
+	uint32_t val, v, r, c, i;
+	int64_t ret;
+
+	/* Retrieve NPU device */
+	dev = npu_dev_cfg_check(p, bdfn, offset, size);
+	if (!dev)
+		return OPAL_PARAMETER;
+
+	/* Retrieve trap */
+	trap = npu_dev_trap_check(dev, offset, size, false);
+	if (trap) {
+		ret = trap->write(trap, offset,
+				  size, (uint32_t)data);
+		if (ret == OPAL_SUCCESS)
+			return ret;
+	}
+
+	/* Handle read-only and W1C bits */
+	val = data;
+	for (i = 0; i < size; i++) {
+		v = dev->config[NPU_DEV_CFG_NORMAL][offset + i];
+		r = dev->config[NPU_DEV_CFG_RDONLY][offset + i];
+		c = dev->config[NPU_DEV_CFG_W1CLR][offset + i];
+
+		/* Drop read-only bits */
+		val &= ~(r << (i * 8));
+		val |= (r & v) << (i * 8);
+
+		/* Drop W1C bits */
+		val &= ~(val & ((c & v) << (i * 8)));
+	}
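+
+	/* eg. writing 1 to a W1C bit that is currently set clears it,
+	 * while read-only bits keep their cached value regardless of
+	 * the value written. */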
+
+	NPU_DEV_CFG_NORMAL_WR(dev, offset, size, val);
+	return OPAL_SUCCESS;
+}
+
+#define NPU_DEV_CFG_WRITE(size, type)					\
+static int64_t npu_dev_cfg_write##size(struct phb *phb, uint32_t bdfn,	\
+				       uint32_t offset, type data)	\
+{									\
+	return _npu_dev_cfg_write(phb, bdfn, offset,			\
+				  data, sizeof(data));			\
+}
+
+NPU_DEV_CFG_READ(8, u8)
+NPU_DEV_CFG_READ(16, u16)
+NPU_DEV_CFG_READ(32, u32)
+NPU_DEV_CFG_WRITE(8, u8)
+NPU_DEV_CFG_WRITE(16, u16)
+NPU_DEV_CFG_WRITE(32, u32)
+
+/*
+ * Add handlers to trap reads and writes to an NPU config space.
+ */
+static void npu_dev_add_cfg_trap(struct npu_dev *dev, uint32_t start,
+				 uint32_t size, void *data,
+				 int64_t (*read)(struct npu_dev_trap *,
+						 uint32_t,
+						 uint32_t,
+						 uint32_t *),
+				 int64_t (*write)(struct npu_dev_trap *,
+						  uint32_t,
+						  uint32_t,
+						  uint32_t))
+{
+	struct npu_dev_trap *trap;
+
+	trap = zalloc(sizeof(struct npu_dev_trap));
+	assert(trap);
+	trap->dev   = dev;
+	trap->start = start;
+	trap->end   = start + size - 1;
+	trap->read  = read;
+	trap->write = write;
+	trap->data  = data;
+	list_add_tail(&dev->traps, &trap->link);
+}
+
+static int __npu_dev_bind_pci_dev(struct phb *phb __unused,
+				  struct pci_device *pd,
+				  void *data)
+{
+	struct npu_dev *dev = data;
+	struct dt_node *pci_dt_node;
+	uint32_t npu_npcq_phandle;
+
+	/* Ignore non-nvidia PCI devices */
+	if ((pd->vdid & 0xffff) != 0x10de)
+		return 0;
+
+	/* Find the PCI device's pbcq */
+	for (pci_dt_node = pd->dn->parent;
+	     pci_dt_node && !dt_find_property(pci_dt_node, "ibm,pbcq");
+	     pci_dt_node = pci_dt_node->parent);
+
+	if (!pci_dt_node)
+		return 0;
+
+	npu_npcq_phandle = dt_prop_get_u32(dev->dt_node, "ibm,npu-pbcq");
+
+	if (dt_prop_get_u32(pci_dt_node, "ibm,pbcq") == npu_npcq_phandle)
+		return 1;
+
+	return 0;
+}
+
+static void npu_dev_bind_pci_dev(struct npu_dev *dev)
+{
+	struct phb *phb;
+	uint32_t i;
+
+	if (dev->pd)
+		return;
+
+	for (i = 0; i < 64; i++) {
+		if (dev->npu->phb.opal_id == i)
+			continue;
+
+		phb = pci_get_phb(i);
+		if (!phb)
+			continue;
+
+		dev->pd = pci_walk_dev(phb, __npu_dev_bind_pci_dev, dev);
+		if (dev->pd) {
+			dev->phb = phb;
+			return;
+		}
+	}
+
+	prlog(PR_ERR, "%s: NPU device %04x:00:%02x.0 not binding to PCI device\n",
+	      __func__, dev->npu->phb.opal_id, dev->index);
+}
+
+static struct lock pci_npu_phandle_lock = LOCK_UNLOCKED;
+
+/* Appends an NPU phandle to the given PCI device node's ibm,npu
+ * property */
+static void npu_append_pci_phandle(struct dt_node *dn, u32 phandle)
+{
+	uint32_t *npu_phandles;
+	struct dt_property *pci_npu_phandle_prop;
+	size_t prop_len;
+
+	/* Use a lock to make sure no one else has a reference to an
+	 * ibm,npu property (this assumes this is the only function
+	 * that holds a reference to it). */
+	lock(&pci_npu_phandle_lock);
+
+	/* This function shouldn't be called unless ibm,npu exists */
+	pci_npu_phandle_prop = (struct dt_property *)
+		dt_require_property(dn, "ibm,npu", -1);
+
+	/* Need to append to the properties */
+	prop_len = pci_npu_phandle_prop->len;
+	prop_len += sizeof(*npu_phandles);
+	dt_resize_property(&pci_npu_phandle_prop, prop_len);
+	pci_npu_phandle_prop->len = prop_len;
+
+	npu_phandles = (uint32_t *) pci_npu_phandle_prop->prop;
+	npu_phandles[prop_len/sizeof(*npu_phandles) - 1] = phandle;
+	unlock(&pci_npu_phandle_lock);
+}
+
+static void npu_dn_fixup(struct phb *phb, struct pci_device *pd)
+{
+	struct npu *p = phb_to_npu(phb);
+	struct npu_dev *dev;
+
+	dev = bdfn_to_npu_dev(p, pd->bdfn);
+	assert(dev);
+
+	if (dev->phb || dev->pd)
+		return;
+
+	/* Bind the emulated PCI device with the real one, which can't
+	 * be done until the PCI devices are populated. Once the real
+	 * PCI device is identified, we also need to fix up the
+	 * device tree for it.
+	 */
+	npu_dev_bind_pci_dev(dev);
+	if (dev->phb && dev->pd && dev->pd->dn) {
+		if (dt_find_property(dev->pd->dn, "ibm,npu"))
+			npu_append_pci_phandle(dev->pd->dn, pd->dn->phandle);
+		else
+			dt_add_property_cells(dev->pd->dn, "ibm,npu", pd->dn->phandle);
+
+		dt_add_property_cells(pd->dn, "ibm,gpu", dev->pd->dn->phandle);
+	}
+}
+
+static void npu_ioda_init(struct npu *p)
+{
+	uint64_t *data64;
+	uint32_t i;
+
+	/* LXIVT - Disable all LSIs */
+	for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++) {
+		data64 = &p->lxive_cache[i];
+		*data64 = SETFIELD(NPU_IODA_LXIVT_PRIORITY, 0ul, 0xff);
+		*data64 = SETFIELD(NPU_IODA_LXIVT_SERVER, *data64, 0);
+	}
+
+	/* PCT - Reset to reserved PE# */
+	for (i = 0; i < ARRAY_SIZE(p->pce_cache); i++) {
+		data64 = &p->pce_cache[i];
+		*data64 = SETFIELD(NPU_IODA_PCT_PE, 0ul, NPU_NUM_OF_PES);
+		*data64 |= NPU_IODA_PCT_LINK_ENABLED;
+	}
+
+	/* Clear TVT */
+	memset(p->tve_cache, 0, sizeof(p->tve_cache));
+}
+
+static int64_t npu_ioda_reset(struct phb *phb, bool purge)
+{
+	struct npu *p = phb_to_npu(phb);
+	uint32_t i;
+
+	if (purge) {
+		NPUDBG(p, "Purging all IODA tables...\n");
+		npu_ioda_init(p);
+	}
+
+	/* LIST */
+	npu_ioda_sel(p, NPU_IODA_TBL_LIST, 0, true);
+	for (i = 0; i < 8; i++)
+		out_be64(p->at_regs + NPU_IODA_DATA0, 0x1);
+
+	/* LXIVT */
+	npu_ioda_sel(p, NPU_IODA_TBL_LXIVT, 0, true);
+	for (i = 0; i < ARRAY_SIZE(p->lxive_cache); i++)
+		out_be64(p->at_regs + NPU_IODA_DATA0, p->lxive_cache[i]);
+
+	/* PCT */
+	npu_ioda_sel(p, NPU_IODA_TBL_PCT, 0, true);
+	for (i = 0; i < ARRAY_SIZE(p->pce_cache); i++)
+		out_be64(p->at_regs + NPU_IODA_DATA0, p->pce_cache[i]);
+
+	/* TVT */
+	npu_ioda_sel(p, NPU_IODA_TBL_TVT, 0, true);
+	for (i = 0; i < ARRAY_SIZE(p->tve_cache); i++)
+		out_be64(p->at_regs + NPU_IODA_DATA0, p->tve_cache[i]);
+
+	return OPAL_SUCCESS;
+}
+
+static int npu_isn_valid(struct npu *p, uint32_t isn)
+{
+	if (p->chip_id != p8_irq_to_chip(isn) || p->index != 0 ||
+	    NPU_IRQ_NUM(isn) < NPU_LSI_IRQ_MIN ||
+	    NPU_IRQ_NUM(isn) > NPU_LSI_IRQ_MAX) {
+		NPUERR(p, "isn 0x%x not valid for this NPU\n", isn);
+		return false;
+	}
+
+	return true;
+}
+
+static int64_t npu_lsi_get_xive(void *data,
+				    uint32_t isn,
+				    uint16_t *server,
+				    uint8_t *prio)
+{
+	struct npu *p = data;
+	uint32_t irq = NPU_IRQ_NUM(isn);
+	uint64_t lxive;
+
+	if (!npu_isn_valid(p, isn))
+		return OPAL_PARAMETER;
+
+	/* The content is fetched from the cache, so the cache must
+	 * have been initialized with the default values beforehand
+	 */
+	irq -= NPU_LSI_IRQ_MIN;
+	lxive = p->lxive_cache[irq];
+	*server = GETFIELD(NPU_IODA_LXIVT_SERVER, lxive);
+	*prio = GETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive);
+
+	return OPAL_SUCCESS;
+}
+
+static int64_t npu_lsi_set_xive(void *data,
+				    uint32_t isn,
+				    uint16_t server,
+				    uint8_t prio)
+{
+	struct npu *p = data;
+	uint32_t irq = NPU_IRQ_NUM(isn);
+	uint64_t lxive;
+
+	if (!npu_isn_valid(p, isn))
+		return OPAL_PARAMETER;
+
+	/* Figure out LXIVT entry */
+	lxive = SETFIELD(NPU_IODA_LXIVT_SERVER, 0ul, server);
+	lxive = SETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive, prio);
+
+	/* Cache LXIVT entry */
+	irq -= NPU_LSI_IRQ_MIN;
+	p->lxive_cache[irq] = lxive;
+
+	/* Update to LXIVT entry */
+	npu_ioda_sel(p, NPU_IODA_TBL_LXIVT, irq, false);
+	lxive = in_be64(p->at_regs + NPU_IODA_DATA0);
+	lxive = SETFIELD(NPU_IODA_LXIVT_SERVER, lxive, server);
+	lxive = SETFIELD(NPU_IODA_LXIVT_PRIORITY, lxive, prio);
+	out_be64(p->at_regs + NPU_IODA_DATA0, lxive);
+
+	return OPAL_SUCCESS;
+}
+
+static void npu_err_interrupt(void *data, uint32_t isn)
+{
+	struct npu *p = data;
+	uint32_t irq = NPU_IRQ_NUM(isn);
+
+	if (!npu_isn_valid(p, isn))
+		return;
+
+	/* There are 4 LSIs used for error reporting: 4/5 report data
+	 * link errors while 6/7 report frozen PEs
+	 */
+	irq -= NPU_LSI_IRQ_MIN;
+	switch (irq) {
+	case 4 ... 5:
+		prerror("Invalid NPU error interrupt received\n");
+		break;
+	case 6 ... 7:
+		NPUERR(p, "Error handling not implemented\n");
+		opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
+					OPAL_EVENT_PCI_ERROR);
+	}
+}
+
+/* LSIs (OS owned) */
+static const struct irq_source_ops npu_lsi_irq_ops = {
+	.get_xive	= npu_lsi_get_xive,
+	.set_xive	= npu_lsi_set_xive,
+};
+
+/* Error LSIs (skiboot owned) */
+static const struct irq_source_ops npu_err_lsi_irq_ops = {
+	.get_xive	= npu_lsi_get_xive,
+	.set_xive	= npu_lsi_set_xive,
+	.interrupt	= npu_err_interrupt,
+};
+
+static void npu_register_irq(struct npu *p)
+{
+	register_irq_source(&npu_lsi_irq_ops, p,
+			    p->base_lsi, 4);
+	register_irq_source(&npu_err_lsi_irq_ops, p,
+			    p->base_lsi + 4, 4);
+}
+
+static void npu_hw_init(struct npu *p)
+{
+	/* 3 MMIO setup for AT */
+	out_be64(p->at_regs + NPU_LSI_SOURCE_ID,
+		 SETFIELD(NPU_LSI_SRC_ID_BASE, 0ul, 0x7f));
+	out_be64(p->at_regs + NPU_INTREP_TIMER, 0x0ul);
+	npu_ioda_reset(&p->phb, false);
+}
+
+static int64_t npu_map_pe_dma_window_real(struct phb *phb,
+					   uint16_t pe_num,
+					   uint16_t window_id,
+					   uint64_t pci_start_addr,
+					   uint64_t pci_mem_size)
+{
+	struct npu *p = phb_to_npu(phb);
+	uint64_t end;
+	uint64_t tve;
+
+	/* Sanity check. Each PE has one corresponding TVE */
+	if (pe_num >= NPU_NUM_OF_PES ||
+	    window_id != pe_num)
+		return OPAL_PARAMETER;
+
+	if (pci_mem_size) {
+		/* Enable */
+
+		end = pci_start_addr + pci_mem_size;
+
+		/* We have to be 16M aligned */
+		if ((pci_start_addr & 0x00ffffff) ||
+		    (pci_mem_size & 0x00ffffff))
+			return OPAL_PARAMETER;
+
+		/*
+		 * It *looks* like this is the max we can support (we need
+		 * to verify this). Also we are not checking for rollover,
+		 * but then we aren't trying too hard to protect ourselves
+		 * against a completely broken OS.
+		 */
+		if (end > 0x0003ffffffffffffull)
+			return OPAL_PARAMETER;
+
+		/*
+		 * Put start address bits 49:24 into TVE[52:53]||[0:23]
+		 * and end address bits 49:24 into TVE[54:55]||[24:47]
+		 * and set TVE[51]
+		 */
+		tve  = (pci_start_addr << 16) & (0xffffffull << 40);
+		tve |= (pci_start_addr >> 38) & (3ull << 10);
+		tve |= (end >>  8) & (0xfffffful << 16);
+		tve |= (end >> 40) & (3ull << 8);
+		tve |= PPC_BIT(51);
+	} else {
+		/* Disable */
+		tve = 0;
+	}
+
+	npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false);
+	out_be64(p->at_regs + NPU_IODA_DATA0, tve);
+	p->tve_cache[window_id] = tve;
+
+	return OPAL_SUCCESS;
+}
+
+static int64_t npu_map_pe_dma_window(struct phb *phb,
+					 uint16_t pe_num,
+					 uint16_t window_id,
+					 uint16_t tce_levels,
+					 uint64_t tce_table_addr,
+					 uint64_t tce_table_size,
+					 uint64_t tce_page_size)
+{
+	struct npu *p = phb_to_npu(phb);
+	uint64_t tts_encoded;
+	uint64_t data64 = 0;
+
+	/* Sanity check. Each PE has one corresponding TVE */
+	if (pe_num >= NPU_NUM_OF_PES ||
+	    window_id != pe_num)
+		return OPAL_PARAMETER;
+
+	/* As a special case, a zero TCE table size is used to
+	 * disable the TVE.
+	 */
+	if (!tce_table_size) {
+		npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false);
+		out_be64(p->at_regs + NPU_IODA_DATA0, 0ul);
+		p->tve_cache[window_id] = 0ul;
+		return OPAL_SUCCESS;
+	}
+
+	/* Additional arguments validation */
+	if (tce_levels < 1 ||
+	    tce_levels > 4 ||
+	    !is_pow2(tce_table_size) ||
+	    tce_table_size < 0x1000)
+		return OPAL_PARAMETER;
+
+	/* TCE table size */
+	data64 = SETFIELD(NPU_IODA_TVT_TTA, 0ul, tce_table_addr >> 12);
+	tts_encoded = ilog2(tce_table_size) - 11;
+	if (tts_encoded > 39)
+		return OPAL_PARAMETER;
+	data64 = SETFIELD(NPU_IODA_TVT_SIZE, data64, tts_encoded);
+
+	/* TCE page size */
+	switch (tce_page_size) {
+	case 0x10000:		/* 64K */
+		data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 5);
+		break;
+	case 0x1000000:		/* 16M */
+		data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 13);
+		break;
+	case 0x10000000:	/* 256M */
+		data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 17);
+		break;
+	case 0x1000:		/* 4K */
+	default:
+		data64 = SETFIELD(NPU_IODA_TVT_PSIZE, data64, 1);
+	}
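+	/* ie. the encoded page size is log2(page size) - 11, with 4K
+	 * (encoding 1) used as the fallback for unrecognized sizes. */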
+
+	/* Number of levels */
+	data64 = SETFIELD(NPU_IODA_TVT_LEVELS, data64, tce_levels - 1);
+
+	/* Update to hardware */
+	npu_ioda_sel(p, NPU_IODA_TBL_TVT, window_id, false);
+	out_be64(p->at_regs + NPU_IODA_DATA0, data64);
+	p->tve_cache[window_id] = data64;
+
+	return OPAL_SUCCESS;
+}
+
+static int64_t npu_set_pe(struct phb *phb,
+			      uint64_t pe_num,
+			      uint64_t bdfn,
+			      uint8_t bcompare,
+			      uint8_t dcompare,
+			      uint8_t fcompare,
+			      uint8_t action)
+{
+	struct npu *p = phb_to_npu(phb);
+	struct npu_dev *dev;
+	uint32_t link_idx;
+	uint64_t *data64;
+
+	/* Sanity check */
+	if (action != OPAL_MAP_PE &&
+	    action != OPAL_UNMAP_PE)
+		return OPAL_PARAMETER;
+	if (pe_num >= NPU_NUM_OF_PES)
+		return OPAL_PARAMETER;
+
+	/* All emulated PCI devices are hooked to the root bus, whose
+	 * bus number is zero.
+	 */
+	dev = bdfn_to_npu_dev(p, bdfn);
+	if ((bdfn >> 8) || !dev)
+		return OPAL_PARAMETER;
+
+	link_idx = dev->index;
+
+	/* Separate links will be mapped to different PEs */
+	if (bcompare != OpalPciBusAll ||
+	    dcompare != OPAL_COMPARE_RID_DEVICE_NUMBER ||
+	    fcompare != OPAL_COMPARE_RID_FUNCTION_NUMBER)
+		return OPAL_UNSUPPORTED;
+
+	/* Map the link to the corresponding PE */
+	data64 = &p->pce_cache[link_idx];
+	if (action == OPAL_MAP_PE)
+		*data64 = SETFIELD(NPU_IODA_PCT_PE, *data64,
+				   pe_num);
+	else
+		*data64 = SETFIELD(NPU_IODA_PCT_PE, *data64,
+				   NPU_NUM_OF_PES);
+
+	*data64 |= NPU_IODA_PCT_LINK_ENABLED;
+
+	npu_ioda_sel(p, NPU_IODA_TBL_PCT, link_idx, false);
+	out_be64(p->at_regs + NPU_IODA_DATA0, *data64);
+
+	return OPAL_SUCCESS;
+}
+
+static int64_t npu_link_state(struct phb *phb __unused)
+{
+	/* As we're emulating all the PCI stuff, the link bandwidth
+	 * isn't a big deal anyway.
+	 */
+	return OPAL_SHPC_LINK_UP_x1;
+}
+
+static int64_t npu_power_state(struct phb *phb __unused)
+{
+	return OPAL_SHPC_POWER_ON;
+}
+
+static int64_t npu_freset(struct phb *phb __unused)
+{
+	/* FIXME: PHB fundamental reset, which needs to be
+	 * figured out later. It's used by EEH recovery
+	 * upon a fenced AT.
+	 */
+	return OPAL_SUCCESS;
+}
+
+static int64_t npu_freeze_status(struct phb *phb __unused,
+				     uint64_t pe_number __unused,
+				     uint8_t *freeze_state,
+				     uint16_t *pci_error_type __unused,
+				     uint16_t *severity __unused,
+				     uint64_t *phb_status __unused)
+{
+	/* FIXME: When it's called by the skiboot PCI config accessor,
+	 * the PE number is fixed to 0, which is incorrect. We need to
+	 * introduce another PHB callback to translate it. For now,
+	 * this keeps the skiboot PCI enumeration going.
+	 */
+	*freeze_state = OPAL_EEH_STOPPED_NOT_FROZEN;
+	return OPAL_SUCCESS;
+}
+
+static const struct phb_ops npu_ops = {
+	.lock			= npu_lock,
+	.unlock			= npu_unlock,
+	.cfg_read8		= npu_dev_cfg_read8,
+	.cfg_read16		= npu_dev_cfg_read16,
+	.cfg_read32		= npu_dev_cfg_read32,
+	.cfg_write8		= npu_dev_cfg_write8,
+	.cfg_write16		= npu_dev_cfg_write16,
+	.cfg_write32		= npu_dev_cfg_write32,
+	.choose_bus		= NULL,
+	.device_init		= NULL,
+	.device_node_fixup	= npu_dn_fixup,
+	.presence_detect	= NULL,
+	.ioda_reset		= npu_ioda_reset,
+	.papr_errinjct_reset	= NULL,
+	.pci_reinit		= NULL,
+	.set_phb_mem_window	= NULL,
+	.phb_mmio_enable	= NULL,
+	.map_pe_mmio_window	= NULL,
+	.map_pe_dma_window	= npu_map_pe_dma_window,
+	.map_pe_dma_window_real	= npu_map_pe_dma_window_real,
+	.pci_msi_eoi		= NULL,
+	.set_xive_pe		= NULL,
+	.get_msi_32		= NULL,
+	.get_msi_64		= NULL,
+	.set_pe			= npu_set_pe,
+	.set_peltv		= NULL,
+	.link_state		= npu_link_state,
+	.power_state		= npu_power_state,
+	.slot_power_off		= NULL,
+	.slot_power_on		= NULL,
+	.hot_reset		= NULL,
+	.fundamental_reset	= npu_freset,
+	.complete_reset		= NULL,
+	.poll			= NULL,
+	.eeh_freeze_status	= npu_freeze_status,
+	.eeh_freeze_clear	= NULL,
+	.eeh_freeze_set		= NULL,
+	.next_error		= NULL,
+	.err_inject		= NULL,
+	.get_diag_data		= NULL,
+	.get_diag_data2		= NULL,
+	.set_capi_mode		= NULL,
+	.set_capp_recovery	= NULL,
+};
+
+static void assign_mmio_bars(uint32_t gcid, uint32_t xscom,
+			     struct dt_node *npu_dn, uint64_t mm_win[2])
+{
+	uint64_t mem_start, mem_end;
+	struct npu_dev_bar bar;
+	struct dt_node *link;
+
+	/* Configure BAR selection.
+	 *
+	 * Currently, each PHY contains 2 links and each link has 2
+	 * BARs. The first BAR is assigned to the DLTL region which is
+	 * what the kernel uses. The second BAR is assigned to either
+	 * the PL or AT region, or is left unassigned. The PL0/PL1/AT
+	 * MMIO regions are not exposed to the kernel so we assign
+	 * them at the start of the available memory area followed by
+	 * the DLTL regions. So we end up with the following memory
+	 * map (assuming we're given a memory region starting at
+	 * 0x3fff000000000):
+	 *
+	 * Link#0-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000420000
+	 * Link#0-BAR#1:     PL0 BAR (  2MB) - 0x3fff000000000
+	 * Link#1-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000440000
+	 * Link#1-BAR#1:      AT BAR ( 64KB) - 0x3fff000400000
+	 * Link#2-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000460000
+	 * Link#2-BAR#1:     PL1 BAR (  2MB) - 0x3fff000200000
+	 * Link#3-BAR#0: NTL/NDL BAR (128KB) - 0x3fff000480000
+	 * Link#3-BAR#1:  UNASSIGNED
+	 */
+	xscom_write(gcid, xscom + NPU_AT_SCOM_OFFSET + NX_BAR,
+		    0x0211000043500000);
+
+	xscom_read(gcid, npu_link_scom_base(npu_dn, xscom, 0) + NX_MMIO_BAR_0,
+		   &mem_start);
+	mem_start = GETFIELD(NX_MMIO_BAR_BASE, mem_start) << 12;
+
+	xscom_read(gcid, npu_link_scom_base(npu_dn, xscom, 5) + NX_MMIO_BAR_0,
+		   &mem_end);
+	mem_end = (GETFIELD(NX_MMIO_BAR_BASE, mem_end) << 12) +
+		get_bar_size(mem_end);
+
+	/* PL0 BAR comes first at 0x3fff000000000 */
+	bar.xscom = npu_link_scom_base(npu_dn, xscom, 0) + NX_MMIO_BAR_1;
+	bar.base = mem_start;
+	bar.size = NX_MMIO_PL_SIZE;
+	npu_dev_bar_update(gcid, &bar, true);
+
+	/* PL1 BAR */
+	bar.xscom = npu_link_scom_base(npu_dn, xscom, 4) + NX_MMIO_BAR_1;
+	bar.base += bar.size;
+	bar.size = NX_MMIO_PL_SIZE;
+	npu_dev_bar_update(gcid, &bar, true);
+
+	/* Then the AT BAR */
+	bar.xscom = npu_link_scom_base(npu_dn, xscom, 1) + NX_MMIO_BAR_1;
+	bar.base += bar.size;
+	bar.size = NX_MMIO_AT_SIZE;
+	npu_dev_bar_update(gcid, &bar, true);
+
+	/* Now we configure all the DLTL BARs. These are the ones
+	 * actually exposed to the kernel. */
+	mm_win[0] = bar.base + bar.size;
+	dt_for_each_node(npu_dn, link) {
+		uint32_t index;
+
+		index = dt_prop_get_u32(link, "ibm,npu-link-index");
+		bar.xscom = npu_link_scom_base(npu_dn, xscom, index) +
+			NX_MMIO_BAR_0;
+		bar.base += bar.size;
+		bar.size = NX_MMIO_DL_SIZE;
+		bar.base = ALIGN_UP(bar.base, bar.size);
+		npu_dev_bar_update(gcid, &bar, false);
+	}
+	mm_win[1] = (bar.base + bar.size) - mm_win[0];
+
+	/* If we weren't given enough room to set up all the BARs we
+	 * require, it's better to crash here than risk creating
+	 * overlapping BARs which will xstop the machine randomly in
+	 * the future. */
+	assert(bar.base + bar.size <= mem_end);
+}
+
+/* Probe the NPU device node and create the PCI root device node
+ * accordingly. The NPU device node should specify the number
+ * of links and the XSCOM base address used to access them.
+ */
+static void npu_probe_phb(struct dt_node *dn)
+{
+	struct dt_node *np;
+	uint32_t gcid, index, xscom;
+	uint64_t at_bar[2], mm_win[2], val;
+	uint32_t links = 0;
+	char *path;
+
+	/* Retrieve chip id */
+	path = dt_get_path(dn);
+	gcid = dt_get_chip_id(dn);
+	index = dt_prop_get_u32(dn, "ibm,npu-index");
+	dt_for_each_compatible(dn, np, "ibm,npu-link")
+		links++;
+
+	prlog(PR_INFO, "Chip %d Found NPU%d (%d links) at %s\n",
+	      gcid, index, links, path);
+	free(path);
+
+	/* Retrieve xscom base addr */
+	xscom = dt_get_address(dn, 0, NULL);
+	prlog(PR_INFO, "   XSCOM Base:  %08x\n", xscom);
+
+	assign_mmio_bars(gcid, xscom, dn, mm_win);
+
+	/* Retrieve AT BAR */
+	xscom_read(gcid, npu_link_scom_base(dn, xscom, 1) + NX_MMIO_BAR_1,
+		   &val);
+	if (!(val & NX_MMIO_BAR_ENABLE)) {
+		prlog(PR_ERR, "   AT BAR disabled!\n");
+		return;
+	}
+
+	at_bar[0] = GETFIELD(NX_MMIO_BAR_BASE, val) << 12;
+	at_bar[1] = get_bar_size(val);
+	prlog(PR_INFO, "   AT BAR:      %016llx (%lldKB)\n",
+	      at_bar[0], at_bar[1] / 0x400);
+
+	/* Create PCI root device node */
+	np = dt_new_addr(dt_root, "pciex", at_bar[0]);
+	if (!np) {
+		prlog(PR_ERR, "%s: Cannot create PHB device node\n",
+		      __func__);
+		return;
+	}
+
+	dt_add_property_strings(np, "compatible",
+				"ibm,power8-npu-pciex", "ibm,ioda2-npu-phb");
+	dt_add_property_strings(np, "device_type", "pciex");
+	dt_add_property(np, "reg", at_bar, sizeof(at_bar));
+
+	dt_add_property_cells(np, "ibm,phb-index", index);
+	dt_add_property_cells(np, "ibm,chip-id", gcid);
+	dt_add_property_cells(np, "ibm,xscom-base", xscom);
+	dt_add_property_cells(np, "ibm,npcq", dn->phandle);
+	dt_add_property_cells(np, "ibm,links", links);
+	dt_add_property(np, "ibm,mmio-window", mm_win, sizeof(mm_win));
+}
+
+static void npu_dev_populate_vendor_cap(struct npu_dev_cap *cap)
+{
+	struct npu_dev *dev = cap->dev;
+	uint32_t offset = cap->start;
+	uint32_t val;
+
+	/* Add version and length information */
+	val = (cap->end - cap->start) | 0x1 << 8;
+	NPU_DEV_CFG_INIT_RO(dev, offset + 2, 4, val);
+	offset += 4;
+
+	/* Defaults when the trap can't handle the read/write (eg. due
+	 * to reading/writing less than 4 bytes). */
+	val = 0x0;
+	NPU_DEV_CFG_INIT_RO(dev, offset, 4, val);
+	NPU_DEV_CFG_INIT_RO(dev, offset + 4, 4, val);
+
+	/* Create a trap for AT/PL procedures */
+	npu_dev_add_cfg_trap(dev, offset, 8, NULL, npu_dev_procedure_read,
+			     npu_dev_procedure_write);
+	offset += 8;
+
+	NPU_DEV_CFG_INIT_RO(dev, offset, 4, dev->index);
+}
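+
+/* The procedure interface created above therefore appears to the
+ * driver at the following offsets within the vendor capability:
+ *
+ *   +4  procedure status (reads run/poll the current procedure)
+ *   +8  procedure control (writes select and start a procedure)
+ *  +12  link index (read-only)
+ */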
+
+static void npu_dev_populate_pcie_cap(struct npu_dev_cap *cap)
+{
+	struct npu_dev *dev = cap->dev;
+	uint32_t base = cap->start;
+	uint32_t val;
+
+	/* Sanity check on capability ID */
+	if (cap->id != PCI_CFG_CAP_ID_EXP) {
+		prlog(PR_NOTICE, "%s: Invalid capability ID %d (%d)\n",
+		      __func__, cap->id, PCI_CFG_CAP_ID_EXP);
+		return;
+	}
+
+	/* Sanity check on spanned registers */
+	if ((cap->end - cap->start) < 0x40) {
+		prlog(PR_NOTICE, "%s: Invalid reg region [%x, %x] for cap %d\n",
+		      __func__, cap->start, cap->end, cap->id);
+		return;
+	}
+
+	/* 0x00 - ID/PCIE capability */
+	val = cap->id;
+	val |= ((0x2 << 16) | (PCIE_TYPE_ENDPOINT << 20));
+	NPU_DEV_CFG_INIT_RO(dev, base, 4, val);
+
+	/* 0x04 - Device capability
+	 *
+	 * We should support FLR. Otherwise, we might have
+	 * problems passing the device through to userland via
+	 * the Linux VFIO infrastructure
+	 */
+	val = ((PCIE_MPSS_128) |
+	       (PCIE_PHANTOM_NONE << 3) |
+	       (PCIE_L0SL_MAX_NO_LIMIT << 6) |
+	       (PCIE_L1L_MAX_NO_LIMIT << 9) |
+	       (PCICAP_EXP_DEVCAP_FUNC_RESET));
+	NPU_DEV_CFG_INIT_RO(dev, base + PCICAP_EXP_DEVCAP, 4, val);
+
+	/* 0x08 - Device control and status */
+	NPU_DEV_CFG_INIT(dev, base + PCICAP_EXP_DEVCTL, 4, 0x00002810,
+			 0xffff0000, 0x000f0000);
+
+	/* 0x0c - Link capability */
+	val = (PCIE_LSPEED_VECBIT_2 | (PCIE_LWIDTH_1X << 4));
+	NPU_DEV_CFG_INIT_RO(dev, base + PCICAP_EXP_LCAP, 4, val);
+
+	/* 0x10 - Link control and status */
+	NPU_DEV_CFG_INIT(dev, base + PCICAP_EXP_LCTL, 4, 0x00130000,
+			 0xfffff000, 0xc0000000);
+
+	/* 0x14 - Slot capability */
+	NPU_DEV_CFG_INIT_RO(dev, base + PCICAP_EXP_SLOTCAP, 4, 0x00000000);
+
+	/* 0x18 - Slot control and status */
+	NPU_DEV_CFG_INIT_RO(dev, base + PCICAP_EXP_SLOTCTL, 4, 0x00000000);
+
+	/* 0x1c - Root control and capability */
+	NPU_DEV_CFG_INIT(dev, base + PCICAP_EXP_RC, 4, 0x00000000,
+			 0xffffffe0, 0x00000000);
+
+	/* 0x20 - Root status */
+	NPU_DEV_CFG_INIT(dev, base + PCICAP_EXP_RSTAT, 4, 0x00000000,
+			 0xffffffff, 0x00010000);
+
+	/* 0x24 - Device capability 2 */
+	NPU_DEV_CFG_INIT_RO(dev, base + PCICAP_EXP_DCAP2, 4, 0x00000000);
+
+	/* 0x28 - Device Control and status 2 */
+	NPU_DEV_CFG_INIT(dev, base + PCICAP_EXP_DCTL2, 4, 0x00070000,
+			 0xffff0000, 0x00000000);
+
+	/* 0x2c - Link capability 2 */
+	NPU_DEV_CFG_INIT_RO(dev, base + PCICAP_EXP_LCAP2, 4, 0x00000007);
+
+	/* 0x30 - Link control and status 2 */
+	NPU_DEV_CFG_INIT(dev, base + PCICAP_EXP_LCTL2, 4, 0x00000003,
+			 0xffff0000, 0x00200000);
+
+	/* 0x34 - Slot capability 2 */
+	NPU_DEV_CFG_INIT_RO(dev, base + PCICAP_EXP_SCAP2, 4, 0x00000000);
+
+	/* 0x38 - Slot control and status 2 */
+	NPU_DEV_CFG_INIT_RO(dev, base + PCICAP_EXP_SCTL2, 4, 0x00000000);
+}
+
+static struct npu_dev_cap *npu_dev_create_capability(struct npu_dev *dev,
+				  void (*populate)(struct npu_dev_cap *),
+				  uint16_t id,
+				  uint16_t start,
+				  uint16_t end)
+{
+	struct npu_dev_cap *cap;
+
+	/* Check if the capability already exists */
+	cap = npu_dev_find_capability(dev, id);
+	if (cap)
+		return cap;
+
+	/* Allocate new one */
+	cap = zalloc(sizeof(struct npu_dev_cap));
+	assert(cap);
+
+	/* Put it into the pool */
+	cap->id         = id;
+	cap->start      = start;
+	cap->end        = end;
+	cap->dev        = dev;
+	cap->populate	= populate;
+	list_add_tail(&dev->capabilities, &cap->link);
+
+	return cap;
+}
+
+static struct npu_dev_cap *npu_dev_find_capability(struct npu_dev *dev,
+						   uint16_t id)
+{
+	struct npu_dev_cap *cap;
+
+	list_for_each(&dev->capabilities, cap, link) {
+		if (cap->id == id)
+			return cap;
+	}
+
+	return NULL;
+}
+
+/*
+ * All capabilities should be put into the device capability
+ * list in ascending order of register offset, for easy
+ * access later on.
+ */
+static void npu_dev_create_capabilities(struct npu_dev *dev)
+{
+	list_head_init(&dev->capabilities);
+
+	/* PCI express capability */
+	npu_dev_create_capability(dev, npu_dev_populate_pcie_cap,
+				  PCI_CFG_CAP_ID_EXP, 0x40, 0x80);
+
+	/* Vendor specific capability */
+	npu_dev_create_capability(dev, npu_dev_populate_vendor_cap,
+				  PCI_CFG_CAP_ID_VENDOR, 0x80, 0x90);
+}
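+
+/* The resulting config space layout is: standard type-0 header at
+ * 0x00-0x3f, PCI Express capability at 0x40-0x7f and the vendor
+ * specific capability at 0x80-0x8f. */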
+
+static void npu_dev_create_cfg(struct npu_dev *dev)
+{
+	struct npu_dev_cap *cap;
+	uint32_t offset;
+	uint32_t last_cap_offset;
+
+	/* Initialize config traps */
+	list_head_init(&dev->traps);
+
+	/* 0x00 - Vendor/Device ID */
+	NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_VENDOR_ID, 4, 0x04ea1014);
+
+	/* 0x04 - Command/Status
+	 *
+	 * Create one trap to trace toggling of the memory BAR
+	 * enable bit
+	 */
+	NPU_DEV_CFG_INIT(dev, PCI_CFG_CMD, 4, 0x00100000, 0xffb802b8,
+			 0xf9000000);
+
+	npu_dev_add_cfg_trap(dev, PCI_CFG_CMD, 1, NULL, NULL,
+			     npu_dev_cfg_write_cmd);
+
+	/* 0x08 - Rev/Class/Cache */
+	NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_REV_ID, 4, 0x06800100);
+
+	/* 0x0c - CLS/Latency Timer/Header/BIST */
+	NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_CACHE_LINE_SIZE, 4, 0x00800000);
+
+	/* 0x10 - BARs, always 64-bits non-prefetchable
+	 *
+	 * Each emulated device represents one link and therefore
+	 * there is one BAR for the associated DLTL region.
+	 */
+
+	/* Low 32-bits */
+	NPU_DEV_CFG_INIT(dev, PCI_CFG_BAR0, 4,
+			 (dev->bar.base & 0xfffffff0) | dev->bar.flags,
+			 0x0000000f, 0x00000000);
+
+	/* High 32-bits */
+	NPU_DEV_CFG_INIT(dev, PCI_CFG_BAR1, 4, (dev->bar.base >> 32),
+			 0x00000000, 0x00000000);
+
+	/*
+	 * Create trap. Writing 0xff's to the BAR registers is
+	 * trapped so that the size is returned on the next read.
+	 */
+	npu_dev_add_cfg_trap(dev, PCI_CFG_BAR0, 8, &dev->bar,
+			     npu_dev_cfg_read_bar, npu_dev_cfg_write_bar);
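+	/* As with a real device, the OS sizes the BAR by writing all
+	 * 1's and reading back the size mask together with the flag
+	 * bits. */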
+
+	/* 0x18/1c/20/24 - Disabled BAR#2/3/4/5
+	 *
+	 * Mark those BARs readonly so that 0x0 will be returned when
+	 * probing the length and the BARs will be skipped.
+	 */
+	NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_BAR2, 4, 0x00000000);
+	NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_BAR3, 4, 0x00000000);
+	NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_BAR4, 4, 0x00000000);
+	NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_BAR5, 4, 0x00000000);
+
+	/* 0x28 - Cardbus CIS pointer */
+	NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_CARDBUS_CIS, 4, 0x00000000);
+
+	/* 0x2c - Subsystem ID */
+	NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_SUBSYS_VENDOR_ID, 4, 0x00000000);
+
+	/* 0x30 - ROM BAR
+	 *
+	 * Force its size to be zero so that the kernel will skip
+	 * probing the ROM BAR. We needn't emulate ROM BAR.
+	 */
+	NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_ROMBAR, 4, 0xffffffff);
+
+	/* 0x34 - PCI Capability
+	 *
+	 * By default, we don't have any capabilities
+	 */
+	NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_CAP, 4, 0x00000000);
+
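+	/* Chain the capabilities: the first next-pointer write below
+	 * lands on PCI_CFG_CAP itself, each subsequent one on the
+	 * previous capability's next-pointer byte. */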
+	last_cap_offset = PCI_CFG_CAP - 1;
+	list_for_each(&dev->capabilities, cap, link) {
+		offset = cap->start;
+
+		/* Initialize config space for the capability */
+		if (cap->populate)
+			cap->populate(cap);
+
+		/* Add capability header */
+		NPU_DEV_CFG_INIT_RO(dev, offset, 2, cap->id);
+
+		/* Update the next capability pointer */
+		NPU_DEV_CFG_NORMAL_WR(dev, last_cap_offset + 1, 1, offset);
+
+		last_cap_offset = offset;
+	}
+
+	/* 0x38 - Reserved */
+	NPU_DEV_CFG_INIT_RO(dev, 0x38, 4, 0x00000000);
+
+	/* 0x3c - INT line/pin/Minimal grant/Maximal latency */
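+	/* Even-numbered links report INTA (pin 1), odd-numbered links
+	 * INTB (pin 2), matching the interrupt-map added in
+	 * npu_add_phb_properties(). */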
+	if (!(dev->index % 2))
+		NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_INT_LINE, 4, 0x00000100);
+	else
+		NPU_DEV_CFG_INIT_RO(dev, PCI_CFG_INT_LINE, 4, 0x00000200);
+}
+
+static uint32_t npu_allocate_bdfn(struct npu *p, uint32_t pbcq)
+{
+	int i;
+	int dev = -1;
+	int bdfn = -1;
+
+	/* Find the highest function number allocated to emulated PCI
+	 * devices associated with this GPU. */
+	for (i = 0; i < p->total_devices; i++) {
+		int dev_bdfn = p->devices[i].bdfn;
+		dev = MAX(dev, dev_bdfn & 0xf8);
+
+		if (dt_prop_get_u32(p->devices[i].dt_node,
+				    "ibm,npu-pbcq") == pbcq)
+			bdfn = MAX(bdfn, dev_bdfn);
+	}
+
+	if (bdfn >= 0)
+		/* A device has already been allocated for this GPU,
+		 * so assign the emulated PCI device the next
+		 * function number. */
+		return bdfn + 1;
+	else if (dev >= 0)
+		/* Otherwise allocate a new device number and use
+		 * function 0. */
+		return dev + (1 << 3);
+	else
+		return 0;
+}
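+
+/* For example, with two GPUs and two links per GPU, the allocation
+ * above yields bdfns 0x0 and 0x1 (device 0, functions 0 and 1) for
+ * the first GPU's links and 0x8 and 0x9 (device 1) for the second. */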
+
+static void npu_create_devices(struct dt_node *dn, struct npu *p)
+{
+	struct npu_dev *dev;
+	struct dt_node *npu_dn, *link;
+	uint32_t npu_phandle, index = 0;
+	uint64_t buid;
+	uint64_t lsisrcid;
+
+	lsisrcid = GETFIELD(NPU_LSI_SRC_ID_BASE,
+			    in_be64(p->at_regs + NPU_LSI_SOURCE_ID));
+	buid = SETFIELD(NP_BUID_BASE, 0ull,
+			(p8_chip_irq_block_base(p->chip_id, P8_IRQ_BLOCK_MISC) | lsisrcid));
+	buid |= NP_BUID_ENABLE;
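+	/* The BUID written to each brick's NX_NP_BUID register below
+	 * places the LSIs in the chip's MISC interrupt block, offset
+	 * by the LSI source ID read back from the AT above. */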
+
+	/* Get the NPU node which has the links that we expand here
+	 * into PCI-like devices attached to our emulated PHB. */
+	npu_phandle = dt_prop_get_u32(dn, "ibm,npcq");
+	npu_dn = dt_find_by_phandle(dt_root, npu_phandle);
+	assert(npu_dn);
+
+	/* Walk the link@x nodes to initialize devices */
+	p->total_devices = 0;
+	p->phb.scan_map = 0;
+	dt_for_each_compatible(npu_dn, link, "ibm,npu-link") {
+		struct npu_dev_bar *bar;
+		uint32_t pbcq;
+		uint64_t val;
+		uint32_t j;
+
+		dev = &p->devices[index];
+		dev->index = dt_prop_get_u32(link, "ibm,npu-link-index");
+		dev->xscom = npu_link_scom_base(npu_dn, p->xscom_base,
+						dev->index);
+
+		dev->npu = p;
+		dev->dt_node = link;
+
+		/* We don't support MMIO PHY access yet */
+		dev->pl_base = NULL;
+
+		pbcq = dt_prop_get_u32(link, "ibm,npu-pbcq");
+		dev->bdfn = npu_allocate_bdfn(p, pbcq);
+
+		/* This must be done after calling
+		 * npu_allocate_bdfn() */
+		p->total_devices++;
+		p->phb.scan_map |= 0x1 << ((dev->bdfn & 0xf8) >> 3);
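+		/* One scan_map bit per device number (set just above)
+		 * lets the generic PCI code find each emulated device
+		 * during bus scan. */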
+
+		dev->pl_xscom_base = dt_prop_get_u64(link, "ibm,npu-phy");
+		dev->lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask");
+
+		/* Setup BUID/ISRN */
+		xscom_write(p->chip_id, dev->xscom + NX_NP_BUID, buid);
+
+		/* Setup emulated config space */
+		for (j = 0; j < NPU_DEV_CFG_MAX; j++)
+			dev->config[j] = zalloc(NPU_DEV_CFG_SIZE);
+		bar = &dev->bar;
+		bar->flags = (PCI_CFG_BAR_TYPE_MEM |
+			      PCI_CFG_BAR_MEM64);
+
+		/* Update BAR info */
+		bar->xscom = dev->xscom + NX_MMIO_BAR_0;
+		xscom_read(p->chip_id, bar->xscom, &val);
+		bar->base  = GETFIELD(NX_MMIO_BAR_BASE, val) << 12;
+		bar->size = get_bar_size(val);
+
+		/*
+		 * The config space is initialised with the BARs
+		 * disabled, so make sure it is actually disabled in
+		 * hardware.
+		 */
+		npu_dev_bar_update(p->chip_id, bar, false);
+
+		/* Initialize capabilities */
+		npu_dev_create_capabilities(dev);
+
+		/* Initialize config space */
+		npu_dev_create_cfg(dev);
+
+		index++;
+	}
+}
+
+static void npu_add_phb_properties(struct npu *p)
+{
+	struct dt_node *np = p->phb.dt_node;
+	uint32_t icsp = get_ics_phandle();
+	uint64_t tkill, mm_base, mm_size;
+	uint32_t base_lsi = p->base_lsi;
+	uint32_t map[] = { 0x0, 0x0, 0x0, 0x1, icsp, base_lsi,
+			   0x0, 0x0, 0x0, 0x2, icsp, base_lsi + 1,
+			   0x800, 0x0, 0x0, 0x1, icsp, base_lsi + 2,
+			   0x800, 0x0, 0x0, 0x2, icsp, base_lsi + 3 };
+	uint32_t mask[] = {0xf800, 0x0, 0x0, 0x7};
+
+	/* Add various properties that hostboot (HB) doesn't have to
+	 * add, some of them simply because they result from policy
+	 * decisions made in skiboot rather than in HB, such as the
+	 * MMIO windows going to PCI, interrupts, etc.
+	 */
+	dt_add_property_cells(np, "#address-cells", 3);
+	dt_add_property_cells(np, "#size-cells", 2);
+	dt_add_property_cells(np, "#interrupt-cells", 1);
+	dt_add_property_cells(np, "bus-range", 0, 0xff);
+	dt_add_property_cells(np, "clock-frequency", 0x200, 0);
+	dt_add_property_cells(np, "interrupt-parent", icsp);
+
+	/* DLPL Interrupts */
+	p->phb.lstate.int_size = 1;
+	p->phb.lstate.int_val[0][0] = p->base_lsi + NPU_LSI_INT_DL0;
+	p->phb.lstate.int_val[1][0] = p->base_lsi + NPU_LSI_INT_DL1;
+	p->phb.lstate.int_val[2][0] = p->base_lsi + NPU_LSI_INT_DL2;
+	p->phb.lstate.int_val[3][0] = p->base_lsi + NPU_LSI_INT_DL3;
+	p->phb.lstate.int_parent[0] = icsp;
+	p->phb.lstate.int_parent[1] = icsp;
+	p->phb.lstate.int_parent[2] = icsp;
+	p->phb.lstate.int_parent[3] = icsp;
+
+	/* Due to the way the emulated PCI devices are structured in
+	 * the device tree, the core PCI layer doesn't do this for
+	 * us. Besides, the swizzling wouldn't suit our needs even if
+	 * it did. */
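+	/* Each interrupt-map entry is (phys.hi phys.mid phys.lo pin)
+	 * -> (ics phandle, irq); the mask selects the device number
+	 * and pin, so e.g. device 1 (0x800), pin INTB maps to
+	 * base_lsi + 3. */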
+	dt_add_property(np, "interrupt-map", map, sizeof(map));
+	dt_add_property(np, "interrupt-map-mask", mask, sizeof(mask));
+
+	/* NPU PHB properties */
+	/* TODO: Due to an erratum, TCE KILL only works when DMA
+	 * traffic has been stopped. We need to implement the
+	 * workaround, which is to do a TCE kill-all instead. */
+	tkill = cleanup_addr((uint64_t)p->at_regs) + NPU_TCE_KILL;
+	dt_add_property_cells(np, "ibm,opal-num-pes",
+			      NPU_NUM_OF_PES);
+	dt_add_property_cells(np, "ibm,opal-reserved-pe",
+			      NPU_NUM_OF_PES);
+	dt_add_property_cells(np, "ibm,opal-tce-kill",
+			      hi32(tkill), lo32(tkill));
+
+	/* The memory window is exposed as 32-bit non-prefetchable
+	 * because the kernel treats a 64-bit prefetchable window
+	 * specially.
+	 */
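+	/* The single ranges entry below is an identity mapping:
+	 * 0x02000000 selects 32-bit non-prefetchable memory space and
+	 * the PCI and CPU addresses are both mm_base. */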
+	mm_base = p->mm_base;
+	mm_size = p->mm_size;
+	dt_add_property_cells(np, "ranges", 0x02000000,
+			      hi32(mm_base), lo32(mm_base),
+			      hi32(mm_base), lo32(mm_base),
+			      hi32(mm_size), lo32(mm_size));
+}
+
+static void npu_create_phb(struct dt_node *dn)
+{
+	const struct dt_property *prop;
+	struct npu *p;
+	uint32_t links;
+	void *pmem;
+
+	/* Retrieve number of devices */
+	links = dt_prop_get_u32(dn, "ibm,links");
+	pmem = zalloc(sizeof(struct npu) + links * sizeof(struct npu_dev));
+	assert(pmem);
+
+	/* Populate PHB */
+	p = pmem;
+	p->index = dt_prop_get_u32(dn, "ibm,phb-index");
+	p->chip_id = dt_prop_get_u32(dn, "ibm,chip-id");
+	p->xscom_base = dt_prop_get_u32(dn, "ibm,xscom-base");
+	p->total_devices = links;
+
+	/* This is the AT base */
+	p->at_xscom = p->xscom_base + NPU_AT_SCOM_OFFSET;
+	p->at_regs = (void *)dt_get_address(dn, 0, NULL);
+
+	prop = dt_require_property(dn, "ibm,mmio-window", -1);
+	assert(prop->len >= (2 * sizeof(uint64_t)));
+	p->mm_base = ((const uint64_t *)prop->prop)[0];
+	p->mm_size = ((const uint64_t *)prop->prop)[1];
+
+	p->devices = pmem + sizeof(struct npu);
+
+	/* Interrupt */
+	p->base_lsi = p8_chip_irq_block_base(p->chip_id, P8_IRQ_BLOCK_MISC) +
+		NPU_LSI_IRQ_MIN;
+
+	/* Generic PHB */
+	p->phb.dt_node = dn;
+	p->phb.ops = &npu_ops;
+	p->phb.phb_type = phb_type_pcie_v3;
+
+	/* Populate devices */
+	npu_create_devices(dn, p);
+
+	/* Populate extra properties */
+	npu_add_phb_properties(p);
+
+	/* Register PHB */
+	pci_register_phb(&p->phb, -1);
+
+	/* Initialize IODA cache */
+	npu_ioda_init(p);
+
+	/* Register interrupt source */
+	npu_register_irq(p);
+
+	/* Initialize hardware */
+	npu_hw_init(p);
+}
+
+void probe_npu(void)
+{
+	struct dt_node *np;
+
+	/* Scan NPU XSCOM nodes */
+	dt_for_each_compatible(dt_root, np, "ibm,power8-npu")
+		npu_probe_phb(np);
+
+	/* Scan newly created PHB nodes */
+	dt_for_each_compatible(dt_root, np, "ibm,power8-npu-pciex")
+		npu_create_phb(np);
+}
diff --git a/include/npu-regs.h b/include/npu-regs.h
new file mode 100644
index 0000000..f663a98
--- /dev/null
+++ b/include/npu-regs.h
@@ -0,0 +1,235 @@
+/* Copyright 2013-2015 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NPU_REGS_H
+#define __NPU_REGS_H
+
+/* Size of a single link */
+#define NPU_LINK_SIZE			0x40
+
+/* Link registers */
+#define NX_PB_ERR_RPT_0			0x00
+#define NX_PB_ERR_RPT_1			0x01
+#define NX_MMIO_BAR_0			0x02
+#define NX_MMIO_BAR_1			0x03
+#define   NX_MMIO_BAR_BASE		PPC_BITMASK(14,51)
+#define   NX_MMIO_BAR_ENABLE		PPC_BIT(52)
+#define   NX_MMIO_BAR_SIZE		PPC_BITMASK(53,55)
+#define NX_NODAL_BAR0			0x04
+#define NX_NODAL_BAR1			0x05
+#define   NX_NODAL_BAR_ENABLE		PPC_BIT(0)
+#define   NX_NODAL_BAR_MASK		PPC_BITMASK(1,14)
+#define   NX_NODAL_BAR_BASE		PPC_BITMASK(15,32)
+#define NX_GROUP_BAR0			0x06
+#define NX_GROUP_BAR1			0x07
+#define   NX_GROUP_BAR_ENABLE		PPC_BIT(0)
+#define   NX_GROUP_BAR_MASK		PPC_BITMASK(1,14)
+#define   NX_GROUP_BAR_BASE		PPC_BITMASK(15,32)
+#define NX_EPSILON_COUN			0x08
+#define   NX_EPSILON_COUN_DISABLE	PPC_BIT(6)
+#define NX_MISC_CONTROL			0x09
+#define NX_PB_DEBUG			0x0a
+#define NX_PB_ECC			0x0b
+#define NX_DEBUG_SNAPSHOT_0		0x0c
+#define NX_DEBUG_SNAPSHOT_1		0x0d
+#define NX_CS_CTL			0x0e
+#define NX_CONFIG_CQ			0x0f
+#define NX_MRBO0			0x10
+#define NX_MRBO1			0x11
+#define NX_AS_CMD_CFG			0x12
+#define NX_NP_BUID			0x13
+#define   NP_BUID_ENABLE		PPC_BIT(0)
+#define   NP_BUID_BASE			PPC_BITMASK(1,23)
+#define NX_TL_CMD_CR			0x20
+#define NX_TL_CMD_D_CR			0x21
+#define NX_TL_RSP_CR			0x22
+#define NX_TL_RSP_D_CR			0x23
+#define NX_DL_REG_ADDR			0x24
+#define NX_DL_REG_DATA			0x25
+#define NX_NTL_CONTROL			0x26
+#define NX_NTL_PMU_CONTROL		0x27
+#define NX_NTL_PMU_COUNT		0x28
+#define NX_NTL_ER_HOLD			0x29
+#define NX_NTL_FST_ERR			0x2a
+#define NX_NTL_ECC			0x2b
+#define NX_NTL_FST_MSK			0x2c
+
+/* NP AT register */
+#define NX_FIR				0x00
+#define NX_FIR_CLEAR			0x01
+#define NX_FIR_SET			0x02
+#define NX_FIR_MASK			0x03
+#define NX_FIR_MASK_CLR			0x04
+#define NX_FIR_MASK_SET			0x05
+#define NX_FIR_ACTION0			0x06
+#define NX_FIR_ACTION1			0x07
+#define NX_FIR_WOF			0x08
+#define NX_AT_PMU_CTRL			0x26
+#define NX_AT_PMU_CNT			0x27
+#define NX_AT_ERR_HOLD			0x28
+#define   NX_AT_ERR_HOLD_RESET		PPC_BIT(63)
+#define NX_AT_DEBUG			0x29
+#define NX_AT_ECC			0x2a
+#define NX_BAR				0x2b
+
+/* AT MMIO registers */
+#define NPU_LSI_SOURCE_ID		0x00100
+#define   NPU_LSI_SRC_ID_BASE		PPC_BITMASK(5,11)
+#define NPU_DMA_CHAN_STATUS		0x00110
+#define NPU_INTREP_TIMER		0x001f8
+#define NPU_DMARD_SYNC			0x00200
+#define   NPU_DMARD_SYNC_START_RD	PPC_BIT(0)
+#define   NPU_DMARD_SYNC_RD		PPC_BIT(1)
+#define   NPU_DMARD_SYNC_START_WR	PPC_BIT(2)
+#define   NPU_DMARD_SYNC_WR		PPC_BIT(3)
+#define NPU_TCE_KILL			0x00210
+#define NPU_IODA_ADDR			0x00220
+#define   NPU_IODA_AD_AUTOINC		PPC_BIT(0)
+#define   NPU_IODA_AD_TSEL		PPC_BITMASK(11,15)
+#define   NPU_IODA_AD_TADR		PPC_BITMASK(54,63)
+#define NPU_IODA_DATA0			0x00228
+#define NPU_XIVE_UPD			0x00248
+#define NPU_GEN_CAP			0x00250
+#define NPU_TCE_CAP			0x00258
+#define NPU_INT_CAP			0x00260
+#define NPU_EEH_CAP			0x00268
+#define NPU_VR				0x00800
+#define NPU_CTRLR			0x00810
+#define NPU_TCR				0x00880
+#define NPU_Q_DMA_R			0x00888
+#define NPU_AT_ESR			0x00c80
+#define NPU_AT_FESR			0x00c88
+#define NPU_AT_LR_ER			0x00c98
+#define NPU_AT_SI_ER			0x00ca0
+#define NPU_AT_FR_ER			0x00ca8
+#define NPU_AT_FE_ER			0x00cb0
+#define NPU_AT_ESMR			0x00cd0
+#define NPU_AT_FESMR			0x00cd8
+#define NPU_AT_I_LR0			0x00d00
+#define NPU_AT_I_LR1			0x00d08
+#define NPU_AT_I_LR2			0x00d10
+#define NPU_AT_I_LR3			0x00d18
+
+/* AT */
+#define NPU_AT_SCOM_OFFSET		0x180
+
+/* NTL */
+#define TL_CMD_CR			0x10000
+#define TL_CMD_D_CR			0x10008
+#define TL_RSP_CR			0x10010
+#define TL_RSP_D_CR			0x10018
+#define NTL_CONTROL			0x10020
+#define   NTL_CONTROL_RESET		PPC_BIT(0)
+
+/* IODA tables */
+#define NPU_IODA_TBL_LIST	1
+#define NPU_IODA_TBL_LXIVT	2
+#define NPU_IODA_TBL_PCT	4
+#define NPU_IODA_TBL_PESTB	8
+#define NPU_IODA_TBL_TVT	9
+#define NPU_IODA_TBL_TCD	10
+#define NPU_IODA_TBL_TDR	11
+#define NPU_IODA_TBL_PESTB_ADDR	12
+#define NPU_IODA_TBL_EA		16
+
+/* LXIVT */
+#define NPU_IODA_LXIVT_SERVER		PPC_BITMASK(8,23)
+#define NPU_IODA_LXIVT_PRIORITY		PPC_BITMASK(24,31)
+
+/* PCT */
+#define NPU_IODA_PCT_LINK_ENABLED	PPC_BIT(0)
+#define NPU_IODA_PCT_PE			PPC_BITMASK(2,3)
+
+/* TVT */
+#define NPU_IODA_TVT_TTA		PPC_BITMASK(0,47)
+#define NPU_IODA_TVT_LEVELS		PPC_BITMASK(48,50)
+#define   NPU_IODA_TVE_1_LEVEL		0
+#define   NPU_IODA_TVE_2_LEVELS		1
+#define   NPU_IODA_TVE_3_LEVELS		2
+#define   NPU_IODA_TVE_4_LEVELS		3
+#define NPU_IODA_TVT_SIZE		PPC_BITMASK(51,55)
+#define NPU_IODA_TVT_PSIZE		PPC_BITMASK(59,63)
+
+/* NDL Registers */
+#define NDL_STATUS		0xfff0
+#define NDL_CONTROL		0xfff4
+
+/* BAR Sizes */
+#define NX_MMIO_PL_SIZE		0x200000
+#define NX_MMIO_AT_SIZE		0x10000
+#define NX_MMIO_DL_SIZE		0x20000
+
+/* Translates a PHY SCOM address to an MMIO offset */
+#define PL_MMIO_ADDR(reg) ((((reg) >> 32) & 0xfffffull) << 1)
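+/* e.g. PL_MMIO_ADDR(RX_PR_CNTL_PL) = (0x21800 << 1) = 0x43000 */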
+
+/* PHY register scom offsets & fields */
+#define RX_PR_CNTL_PL		0x0002180000000000
+#define   RX_PR_RESET		PPC_BIT(63)
+
+#define TX_MODE1_PL		0x0004040000000000
+#define   TX_LANE_PDWN		PPC_BIT(48)
+
+#define TX_MODE2_PL		0x00040c0000000000
+#define   TX_RXCAL		PPC_BIT(57)
+#define   TX_UNLOAD_CLK_DISABLE PPC_BIT(56)
+
+#define TX_CNTL_STAT2		0x00041c0000000000
+#define   TX_FIFO_INIT		PPC_BIT(48)
+
+#define RX_BANK_CONTROLS	0x0000f80000000000
+#define   RX_LANE_ANA_PDWN	PPC_BIT(54)
+
+#define RX_MODE			0x0002000000000000
+#define   RX_LANE_DIG_PDWN	PPC_BIT(48)
+
+#define RX_PR_MODE		0x0002100000000000
+#define   RX_PR_PHASE_STEP	PPC_BITMASK(60, 63)
+
+#define RX_A_DAC_CNTL		0x0000080000000000
+#define   RX_PR_IQ_RES_SEL	PPC_BITMASK(58, 60)
+
+#define RX_LANE_BUSY_VEC_0_15	0x000b000000000000
+#define TX_FFE_TOTAL_2RSTEP_EN	0x000c240000000000
+#define   TX_FFE_TOTAL_ENABLE_P_ENC	PPC_BITMASK(49,55)
+#define   TX_FFE_TOTAL_ENABLE_N_ENC	PPC_BITMASK(57,63)
+#define TX_FFE_PRE_2RSTEP_SEL	0x000c2c0000000000
+#define   TX_FFE_PRE_P_SEL_ENC		PPC_BITMASK(51,54)
+#define   TX_FFE_PRE_N_SEL_ENC		PPC_BITMASK(59,62)
+#define TX_FFE_MARGIN_2RSTEP_SEL 0x000c340000000000
+#define   TX_FFE_MARGIN_PU_P_SEL_ENC	PPC_BITMASK(51,55)
+#define   TX_FFE_MARGIN_PD_N_SEL_ENC	PPC_BITMASK(59,63)
+#define TX_IORESET_VEC_0_15	0x000d2c0000000000
+#define TX_IMPCAL_PB		0x000f040000000000
+#define   TX_ZCAL_REQ			PPC_BIT(49)
+#define   TX_ZCAL_DONE			PPC_BIT(50)
+#define   TX_ZCAL_ERROR			PPC_BIT(51)
+#define TX_IMPCAL_NVAL_PB	0x000f0c0000000000
+#define   TX_ZCAL_N			PPC_BITMASK(48,56)
+#define TX_IMPCAL_PVAL_PB	0x000f140000000000
+#define   TX_ZCAL_P			PPC_BITMASK(48,56)
+#define RX_EO_STEP_CNTL_PG	0x0008300000000000
+#define   RX_EO_ENABLE_LATCH_OFFSET_CAL	PPC_BIT(48)
+#define   RX_EO_ENABLE_CM_COARSE_CAL	PPC_BIT(57)
+#define RX_RUN_LANE_VEC_0_15   	0x0009b80000000000
+#define RX_RECAL_ABORT_VEC_0_15 0x0009c80000000000
+#define RX_IORESET_VEC_0_15	0x0009d80000000000
+#define RX_EO_RECAL_PG		0x000a800000000000
+#define RX_INIT_DONE_VEC_0_15	0x000ac00000000000
+#define TX_IMPCAL_SWO1_PB	0x000f240000000000
+#define   TX_ZCAL_SWO_EN		PPC_BIT(48)
+#define TX_IMPCAL_SWO2_PB	0x000f2c0000000000
+
+#endif /* __NPU_REGS_H */
diff --git a/include/npu.h b/include/npu.h
new file mode 100644
index 0000000..795b704
--- /dev/null
+++ b/include/npu.h
@@ -0,0 +1,211 @@
+/* Copyright 2013-2015 IBM Corp.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NPU_H
+#define __NPU_H
+
+/* Number of PEs supported */
+#define NPU_NUM_OF_PES	4
+
+/* Each brick has at most 2 MMIO BARs. BAR0 is always used to map
+ * the 128KB TL/DL registers. BAR1 is used to map either the PL or
+ * the AT registers, which are not exposed to the OS.
+ */
+#define NPU_BRICK_NUM_OF_BARS	2
+#define NPU_BRICK_TL_BAR_SIZE	0x20000
+#define NPU_BRICK_PL_BAR_SIZE	0x200000
+
+/* The config space of an NPU device is emulated. Separate shadow
+ * copies represent the config register properties: the normal
+ * values, a read-only mask and a write-one-to-clear mask.
+ */
+#define NPU_DEV_CFG_NORMAL      0
+#define NPU_DEV_CFG_RDONLY      1
+#define NPU_DEV_CFG_W1CLR       2
+#define NPU_DEV_CFG_MAX         3
+
+/* Size in bytes of the emulated NPU PCI device config space. We
+ * are emulating a PCI Express device, not a legacy one.
+ */
+#define NPU_DEV_CFG_SIZE	0x100
+
+/* Interrupt mapping
+ *
+ * The NPU PHB doesn't support MSI interrupts. It only supports
+ * 8 LSI interrupts: [0, 3] for the bricks' DL blocks, [4, 5]
+ * for reporting errors from the DL blocks, and [6, 7] for
+ * reporting errors from the TL blocks, NPCQs and AT.
+ */
+#define NPU_LSI_IRQ_COUNT	8
+#define NPU_LSI_INT_DL0         0
+#define NPU_LSI_INT_DL1         1
+#define NPU_LSI_INT_DL2         2
+#define NPU_LSI_INT_DL3         3
+#define NPU_LSI_IRQ_MIN		0x7F0
+#define NPU_LSI_IRQ_MAX		(NPU_LSI_IRQ_MIN + NPU_LSI_IRQ_COUNT - 1)
+#define NPU_LSI_IRQ_BASE(chip, phb)	(P8_CHIP_IRQ_PHB_BASE(chip, phb) | NPU_LSI_IRQ_MIN)
+#define NPU_IRQ_NUM(irq)		(irq & 0x7FF)
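+/* The 8 LSIs sit at offsets 0x7f0 - 0x7f7, near the top of the
+ * 11-bit per-PHB interrupt number block. */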
+
+/* NPU device capability descriptor. All PCI capabilities are
+ * organized as a linked list. Each PCI capability has a specific
+ * populate hook which is called when initializing the NPU device.
+ */
+struct npu_dev;
+struct npu_dev_cap {
+	uint16_t		id;
+	uint16_t		start;
+	uint16_t		end;
+	struct npu_dev		*dev;
+	void			(*populate)(struct npu_dev_cap *cap);
+	struct list_node	link;
+};
+
+/* Config space access trap. */
+struct npu_dev_trap {
+	struct npu_dev		*dev;
+	uint32_t		start;
+	uint32_t		end;
+	void			*data;
+	int64_t			(*read)(struct npu_dev_trap *trap,
+					uint32_t offset,
+					uint32_t size,
+					uint32_t *data);
+	int64_t			(*write)(struct npu_dev_trap *trap,
+					 uint32_t offset,
+					 uint32_t size,
+					 uint32_t data);
+	struct list_node	link;
+};
+
+struct npu_dev_bar {
+	uint32_t		flags;
+	uint32_t		xscom;
+	uint64_t		base;
+	uint64_t		size;
+	uint32_t		bar_sz;
+	bool			trapped;
+};
+
+/* Each device corresponds to one link. The device is exposed as a
+ * standard PCIe device and its config space is emulated by skiboot.
+ */
+struct npu_dev {
+	uint32_t		flags;
+	uint32_t		index;
+	uint64_t		xscom;
+	void			*pl_base;
+	uint64_t		pl_xscom_base;
+	struct npu_dev_bar	bar;
+	struct phb		*phb;
+
+	/* Device and function numbers are allocated based on GPU
+	 * association */
+	uint32_t		bdfn;
+
+	/* The link@x node */
+	struct dt_node		*dt_node;
+
+	/* The GPU PCI device this NPU device is associated with */
+	struct pci_device	*pd;
+
+	struct npu		*npu;
+	uint8_t			*config[NPU_DEV_CFG_MAX];
+	struct list_head	capabilities;
+	struct list_head	traps;
+
+	/* Which PHY lanes this device is associated with */
+	uint16_t		lane_mask;
+
+	/* Used to store the currently running procedure number for
+	 * this device. */
+	uint16_t		procedure_number;
+
+	/* Used to store the step within a procedure that we are up
+	 * to. */
+	uint16_t		procedure_step;
+
+	/* Arbitrary data used by each procedure to track status. */
+	uint64_t		procedure_data;
+
+	/* Used to timeout long running procedures. */
+	unsigned long		procedure_tb;
+
+	uint32_t		procedure_status;
+};
+
+/* NPU PHB descriptor */
+struct npu {
+	uint32_t		flags;
+	uint32_t		index;
+	struct lock		lock;
+	uint32_t		chip_id;
+	uint64_t		xscom_base;
+	uint64_t		at_xscom;
+	void			*at_regs;
+	uint32_t		base_lsi;
+	uint64_t		mm_base;
+	uint64_t		mm_size;
+	uint32_t		total_devices;
+	struct npu_dev		*devices;
+
+	/* IODA cache */
+	uint64_t		lxive_cache[8];
+	uint64_t		pce_cache[6];
+	uint64_t		tve_cache[NPU_NUM_OF_PES];
+
+	bool			tx_zcal_complete[2];
+
+	struct phb		phb;
+};
+
+static inline struct npu *phb_to_npu(struct phb *phb)
+{
+	return container_of(phb, struct npu, phb);
+}
+
+static inline void npu_ioda_sel(struct npu *p, uint32_t table,
+				    uint32_t addr, bool autoinc)
+{
+	out_be64(p->at_regs + NPU_IODA_ADDR,
+		 (autoinc ? NPU_IODA_AD_AUTOINC : 0)	|
+		 SETFIELD(NPU_IODA_AD_TSEL, 0ul, table)	|
+		 SETFIELD(NPU_IODA_AD_TADR, 0ul, addr));
+}
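+
+/* Typical IODA access pattern (illustrative sketch, not code in this
+ * patch): select a table entry with auto-increment, then stream the
+ * entries through the data register, e.g. to write back cached TVEs:
+ *
+ *	npu_ioda_sel(p, NPU_IODA_TBL_TVT, 0, true);
+ *	for (i = 0; i < NPU_NUM_OF_PES; i++)
+ *		out_be64(p->at_regs + NPU_IODA_DATA0, p->tve_cache[i]);
+ */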
+
+void npu_scom_init(struct npu_dev *dev);
+
+int64_t npu_dev_procedure_read(struct npu_dev_trap *trap,
+			       uint32_t offset,
+			       uint32_t size,
+			       uint32_t *data);
+
+int64_t npu_dev_procedure_write(struct npu_dev_trap *trap,
+				uint32_t offset,
+				uint32_t size,
+				uint32_t data);
+
+#define NPUDBG(p, fmt, a...)	prlog(PR_DEBUG, "NPU%d: " fmt, \
+				      (p)->phb.opal_id, ##a)
+#define NPUINF(p, fmt, a...)	prlog(PR_INFO,  "NPU%d: " fmt, \
+				      (p)->phb.opal_id, ##a)
+#define NPUERR(p, fmt, a...)	prlog(PR_ERR,   "NPU%d: " fmt, \
+				      (p)->phb.opal_id, ##a)
+
+#define NPUDEVDBG(p, fmt, a...)	NPUDBG((p)->npu, fmt, ##a)
+#define NPUDEVINF(p, fmt, a...)	NPUINF((p)->npu, fmt, ##a)
+#define NPUDEVERR(p, fmt, a...)	NPUERR((p)->npu, fmt, ##a)
+
+#endif /* __NPU_H */
diff --git a/include/skiboot.h b/include/skiboot.h
index 4eec6db..18db6cf 100644
--- a/include/skiboot.h
+++ b/include/skiboot.h
@@ -202,6 +202,7 @@ extern void probe_p7ioc(void);
 extern void probe_phb3(void);
 extern int phb3_preload_capp_ucode(void);
 extern void phb3_preload_vpd(void);
+extern void probe_npu(void);
 extern void uart_init(bool enable_interrupt);
 extern void homer_init(void);
 extern void occ_pstates_init(void);
@@ -264,4 +265,3 @@ extern bool slw_timer_ok(void);
 extern void fake_rtc_init(void);
 
 #endif /* __SKIBOOT_H */
-
-- 
2.1.4


