[Skiboot] [PATCH 2/7] npu2-opencapi: Setup perf counters to detect CRC errors
Frederic Barrat
fbarrat at linux.ibm.com
Sat Mar 2 00:52:34 AEDT 2019
It's possible to set up performance counters for the DLL to detect
various conditions for the links in nvlink or opencapi mode. Since
those counters are currently unused, let's configure them when an obus
is in opencapi mode to detect CRC errors on the link. Each link has
two counters:
- CRC error detected by the host
- CRC error detected by the DLx (NAK received by the host)
We also dump the counters shortly after the link trains, but they can
be read multiple times through cronus, pdbg or linux. The counters are
configured to be reset after each read.
Signed-off-by: Frederic Barrat <fbarrat at linux.ibm.com>
---
hw/npu2-opencapi.c | 62 +++++++++++++++++++++++++++++++++++++++++++++
include/npu2-regs.h | 17 +++++++++++++
2 files changed, 79 insertions(+)
diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c
index 6ad561c4..6d642cde 100644
--- a/hw/npu2-opencapi.c
+++ b/hw/npu2-opencapi.c
@@ -909,6 +909,66 @@ static void reset_odl(uint32_t gcid, struct npu2_dev *dev)
xscom_write(gcid, config_xscom, reg);
}
+/*
+ * Set up the DLL perf counters of the obus hosting this brick to check
+ * CRC errors detected by the NPU or the adapter.
+ *
+ * Counter 0: link 0/ODL0, CRC error detected by ODL
+ * Counter 1: link 0/ODL0, CRC error detected by DLx
+ * Counter 2: link 1/ODL1, CRC error detected by ODL
+ * Counter 3: link 1/ODL1, CRC error detected by DLx
+ *
+ * Only the two counters of the link matching dev->brick_index are
+ * assigned here. Both registers are updated with a read-modify-write, so
+ * the per-counter fields belonging to the other link keep whatever value
+ * was read back. The return codes of the xscom accesses are not checked.
+ */
+static void setup_perf_counters(struct npu2_dev *dev)
+{
+	uint64_t addr, reg, link;
+
+	/* Bricks 2 and 5 are treated as link 0, any other brick as link 1 */
+	if ((dev->brick_index == 2) || (dev->brick_index == 5))
+		link = 0;
+	else
+		link = 1;
+
+	/*
+	 * Config register: one 2-bit enable field per counter, each located
+	 * 2 bits further right than the previous one. The two counters of
+	 * our link get the value identifying that link.
+	 */
+	addr = OB_DLL_PERF_MONITOR_CONFIG(dev->brick_index);
+	xscom_read(dev->npu->chip_id, addr, &reg);
+	if (link == 0) {
+		reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE, reg,
+			       OB_DLL_PERF_MONITOR_CONFIG_LINK0);
+		reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE >> 2, reg,
+			       OB_DLL_PERF_MONITOR_CONFIG_LINK0);
+	} else {
+		reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE >> 4, reg,
+			       OB_DLL_PERF_MONITOR_CONFIG_LINK1);
+		reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE >> 6, reg,
+			       OB_DLL_PERF_MONITOR_CONFIG_LINK1);
+	}
+	/* NOTE(review): SIZE16 presumably selects 16-bit counters - confirm */
+	reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_SIZE, reg,
+		       OB_DLL_PERF_MONITOR_CONFIG_SIZE16);
+	xscom_write(dev->npu->chip_id, addr, reg);
+	OCAPIDBG(dev, "perf counter config %llx = %llx\n", addr, reg);
+
+	/*
+	 * Select register: one 8-bit event selector per counter. The pair
+	 * of selectors for our link starts (link * 16) bits to the right;
+	 * the first counts CRC errors seen by the ODL, the second the ones
+	 * seen by the DLx.
+	 */
+	addr = OB_DLL_PERF_MONITOR_SELECT(dev->brick_index);
+	xscom_read(dev->npu->chip_id, addr, &reg);
+	reg = SETFIELD(OB_DLL_PERF_MONITOR_SELECT_COUNTER >> (link * 16),
+		       reg, OB_DLL_PERF_MONITOR_SELECT_CRC_ODL);
+	reg = SETFIELD(OB_DLL_PERF_MONITOR_SELECT_COUNTER >> ((link * 16) + 8),
+		       reg, OB_DLL_PERF_MONITOR_SELECT_CRC_DLX);
+	xscom_write(dev->npu->chip_id, addr, reg);
+	OCAPIDBG(dev, "perf counter select %llx = %llx\n", addr, reg);
+}
+
+/*
+ * Read the perf counter register of the obus hosting this brick and log
+ * an error if any CRC error has been counted.
+ *
+ * The 64-bit value is split in two 32-bit halves: PPC bits 0-31 are
+ * reported as link0 and PPC bits 32-63 as link1. Nothing is logged when
+ * both halves are zero. The return code of the xscom read is not checked.
+ *
+ * NOTE(review): the patch description says the counters are configured
+ * to reset after each read, so calling this is presumably destructive -
+ * confirm against the hardware documentation.
+ */
+static void check_perf_counters(struct npu2_dev *dev)
+{
+	uint64_t addr, reg, link0, link1;
+
+	addr = OB_DLL_PERF_COUNTER0(dev->brick_index);
+	xscom_read(dev->npu->chip_id, addr, &reg);
+	link0 = GETFIELD(PPC_BITMASK(0, 31), reg);
+	link1 = GETFIELD(PPC_BITMASK(32, 63), reg);
+	if (link0 || link1)
+		OCAPIERR(dev, "CRC error count link0=%08llx link1=%08llx\n",
+			 link0, link1);
+}
+
static void set_init_pattern(uint32_t gcid, struct npu2_dev *dev)
{
uint64_t reg, config_xscom;
@@ -1048,6 +1108,7 @@ static int64_t npu2_opencapi_poll_link(struct pci_slot *slot)
case OCAPI_SLOT_LINK_TRAINED:
otl_enabletx(chip_id, dev->npu->xscom_base, dev);
pci_slot_set_state(slot, OCAPI_SLOT_NORMAL);
+ check_perf_counters(dev);
dev->phb_ocapi.scan_map = 1;
return OPAL_SUCCESS;
@@ -1569,6 +1630,7 @@ static void setup_device(struct npu2_dev *dev)
setup_afu_mmio_bars(dev->npu->chip_id, dev->npu->xscom_base, dev);
/* Procedure 13.1.3.9 - AFU Config BARs */
setup_afu_config_bars(dev->npu->chip_id, dev->npu->xscom_base, dev);
+ setup_perf_counters(dev);
set_fence_control(dev->npu->chip_id, dev->npu->xscom_base, dev->brick_index, 0b00);
diff --git a/include/npu2-regs.h b/include/npu2-regs.h
index 5190aeb7..ca311097 100644
--- a/include/npu2-regs.h
+++ b/include/npu2-regs.h
@@ -725,6 +725,23 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
#define PU_IOE_PB_FP_CFG_FP1_FMR_DISABLE PPC_BIT(52)
#define PU_IOE_PB_FP_CFG_FP1_PRS_DISABLE PPC_BIT(57)
+/*
+ * OBUS DLL performance monitor registers.
+ *
+ * The address only depends on ((brick_index - 2) >> 1): bricks 2 and 3
+ * resolve to the base address, bricks 4 and 5 to base + 0x3000000.
+ * The macro argument is parenthesised so that an expression can be
+ * passed safely.
+ */
+
+/* Config register: 2-bit enable field per counter plus a size field */
+#define OB_DLL_PERF_MONITOR_CONFIG(brick_index)			\
+	(0x901081C + (((brick_index) - 2) >> 1) * 0x3000000)
+#define OB_DLL_PERF_MONITOR_CONFIG_ENABLE	PPC_BITMASK(0, 1)
+#define OB_DLL_PERF_MONITOR_CONFIG_LINK0	0b10
+#define OB_DLL_PERF_MONITOR_CONFIG_LINK1	0b01
+#define OB_DLL_PERF_MONITOR_CONFIG_SIZE		PPC_BITMASK(16, 23)
+#define OB_DLL_PERF_MONITOR_CONFIG_SIZE16	0xFF
+
+/* Select register: 8-bit event selector per counter */
+#define OB_DLL_PERF_MONITOR_SELECT(brick_index)			\
+	(0x901081D + (((brick_index) - 2) >> 1) * 0x3000000)
+#define OB_DLL_PERF_MONITOR_SELECT_COUNTER	PPC_BITMASK(0, 7)
+/* Event codes: CRC error detected by the ODL / by the DLx */
+#define OB_DLL_PERF_MONITOR_SELECT_CRC_ODL	0x44
+#define OB_DLL_PERF_MONITOR_SELECT_CRC_DLX	0x45
+
+/* Counter register holding the values of the perf counters */
+#define OB_DLL_PERF_COUNTER0(brick_index)			\
+	(0x901081E + (((brick_index) - 2) >> 1) * 0x3000000)
+#define OB_DLL_PERF_COUNTER0_VAL		PPC_BITMASK(0, 31)
+
+
#define OB_ODL_OFFSET(brick_index) \
((((brick_index - 2) >> 1) * 0x3000000) + ((brick_index == 3 || brick_index == 4) ? 1 : 0))
--
2.19.1
More information about the Skiboot
mailing list