[Skiboot] [PATCH 2/7] npu2-opencapi: Setup perf counters to detect CRC errors

Andrew Donnellan andrew.donnellan at au1.ibm.com
Wed Mar 6 13:51:12 AEDT 2019


On 2/3/19 12:52 am, Frederic Barrat wrote:
> It's possible to set up performance counters for the DLL to detect
> various conditions for the links in nvlink or opencapi mode. Since
> those counters are currently unused, let's configure them when an obus
> is in opencapi mode to detect CRC errors on the link. Each link has
> two counters:
>   - CRC error detected by the host
>   - CRC error detected by the DLx (NAK received by the host)
> 
> We also dump the counters shortly after the link trains, but they can
> be read multiple times through Cronus, pdbg or Linux. The counters are
> configured to be reset after each read.
> 
> Signed-off-by: Frederic Barrat <fbarrat at linux.ibm.com>

I haven't checked the SCOMs used here in detail, but they don't look 
obviously wrong.

Reviewed-by: Andrew Donnellan <andrew.donnellan at au1.ibm.com>

> ---
>   hw/npu2-opencapi.c  | 62 +++++++++++++++++++++++++++++++++++++++++++++
>   include/npu2-regs.h | 17 +++++++++++++
>   2 files changed, 79 insertions(+)
> 
> diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c
> index 6ad561c4..6d642cde 100644
> --- a/hw/npu2-opencapi.c
> +++ b/hw/npu2-opencapi.c
> @@ -909,6 +909,66 @@ static void reset_odl(uint32_t gcid, struct npu2_dev *dev)
>   	xscom_write(gcid, config_xscom, reg);
>   }
>   
> +static void setup_perf_counters(struct npu2_dev *dev)
> +{
> +	uint64_t addr, reg, link;
> +
> +	/*
> +	 * setup the DLL perf counters to check CRC errors detected by
> +	 * the NPU or the adapter.
> +	 *
> +	 * Counter 0: link 0/ODL0, CRC error detected by ODL
> +	 * Counter 1: link 0/ODL0, CRC error detected by DLx
> +	 * Counter 2: link 1/ODL1, CRC error detected by ODL
> +	 * Counter 3: link 1/ODL1, CRC error detected by DLx
> +	 */
> +	if ((dev->brick_index == 2) || (dev->brick_index == 5))
> +		link = 0;
> +	else
> +		link = 1;
> +
> +	addr = OB_DLL_PERF_MONITOR_CONFIG(dev->brick_index);
> +	xscom_read(dev->npu->chip_id, addr, &reg);
> +	if (link == 0) {
> +		reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE, reg,
> +			OB_DLL_PERF_MONITOR_CONFIG_LINK0);
> +		reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE >> 2, reg,
> +			OB_DLL_PERF_MONITOR_CONFIG_LINK0);
> +	} else {
> +		reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE >> 4, reg,
> +			OB_DLL_PERF_MONITOR_CONFIG_LINK1);	
> +		reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_ENABLE >> 6, reg,
> +			OB_DLL_PERF_MONITOR_CONFIG_LINK1);
> +	}
> +	reg = SETFIELD(OB_DLL_PERF_MONITOR_CONFIG_SIZE, reg,
> +		OB_DLL_PERF_MONITOR_CONFIG_SIZE16);
> +	xscom_write(dev->npu->chip_id,
> +		OB_DLL_PERF_MONITOR_CONFIG(dev->brick_index), reg);
> +	OCAPIDBG(dev, "perf counter config %llx = %llx\n", addr, reg);
> +
> +	addr = OB_DLL_PERF_MONITOR_SELECT(dev->brick_index);
> +	xscom_read(dev->npu->chip_id, addr, &reg);
> +	reg = SETFIELD(OB_DLL_PERF_MONITOR_SELECT_COUNTER >> (link * 16),
> +		reg, OB_DLL_PERF_MONITOR_SELECT_CRC_ODL);
> +	reg = SETFIELD(OB_DLL_PERF_MONITOR_SELECT_COUNTER >> ((link * 16) + 8),
> +		reg, OB_DLL_PERF_MONITOR_SELECT_CRC_DLX);
> +	xscom_write(dev->npu->chip_id, addr, reg);
> +	OCAPIDBG(dev, "perf counter select %llx = %llx\n", addr, reg);
> +}
> +
> +static void check_perf_counters(struct npu2_dev *dev)
> +{
> +	uint64_t addr, reg, link0, link1;
> +
> +	addr = OB_DLL_PERF_COUNTER0(dev->brick_index);
> +	xscom_read(dev->npu->chip_id, addr, &reg);
> +	link0 = GETFIELD(PPC_BITMASK(0, 31), reg);
> +	link1 = GETFIELD(PPC_BITMASK(32, 63), reg);
> +	if (link0 || link1)
> +		OCAPIERR(dev, "CRC error count link0=%08llx link1=%08llx\n",
> +			link0, link1);
> +}
> +
>   static void set_init_pattern(uint32_t gcid, struct npu2_dev *dev)
>   {
>   	uint64_t reg, config_xscom;
> @@ -1048,6 +1108,7 @@ static int64_t npu2_opencapi_poll_link(struct pci_slot *slot)
>   	case OCAPI_SLOT_LINK_TRAINED:
>   		otl_enabletx(chip_id, dev->npu->xscom_base, dev);
>   		pci_slot_set_state(slot, OCAPI_SLOT_NORMAL);
> +		check_perf_counters(dev);
>   		dev->phb_ocapi.scan_map = 1;
>   		return OPAL_SUCCESS;
>   
> @@ -1569,6 +1630,7 @@ static void setup_device(struct npu2_dev *dev)
>   	setup_afu_mmio_bars(dev->npu->chip_id, dev->npu->xscom_base, dev);
>   	/* Procedure 13.1.3.9 - AFU Config BARs */
>   	setup_afu_config_bars(dev->npu->chip_id, dev->npu->xscom_base, dev);
> +	setup_perf_counters(dev);
>   
>   	set_fence_control(dev->npu->chip_id, dev->npu->xscom_base, dev->brick_index, 0b00);
>   
> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
> index 5190aeb7..ca311097 100644
> --- a/include/npu2-regs.h
> +++ b/include/npu2-regs.h
> @@ -725,6 +725,23 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
>   #define    PU_IOE_PB_FP_CFG_FP1_FMR_DISABLE	PPC_BIT(52)
>   #define    PU_IOE_PB_FP_CFG_FP1_PRS_DISABLE	PPC_BIT(57)
>   
> +#define OB_DLL_PERF_MONITOR_CONFIG(brick_index) \
> +	(0x901081C + ((brick_index - 2) >> 1) * 0x3000000)
> +#define   OB_DLL_PERF_MONITOR_CONFIG_ENABLE	PPC_BITMASK(0, 1)
> +#define   OB_DLL_PERF_MONITOR_CONFIG_LINK0	0b10
> +#define   OB_DLL_PERF_MONITOR_CONFIG_LINK1	0b01
> +#define   OB_DLL_PERF_MONITOR_CONFIG_SIZE	PPC_BITMASK(16, 23)
> +#define   OB_DLL_PERF_MONITOR_CONFIG_SIZE16	0xFF
> +#define OB_DLL_PERF_MONITOR_SELECT(brick_index) \
> +	(0x901081D + ((brick_index - 2) >> 1) * 0x3000000)
> +#define   OB_DLL_PERF_MONITOR_SELECT_COUNTER	PPC_BITMASK(0, 7)
> +#define   OB_DLL_PERF_MONITOR_SELECT_CRC_ODL	0x44
> +#define   OB_DLL_PERF_MONITOR_SELECT_CRC_DLX	0x45
> +#define OB_DLL_PERF_COUNTER0(brick_index) \
> +	(0x901081E + ((brick_index - 2) >> 1) * 0x3000000)
> +#define   OB_DLL_PERF_COUNTER0_VAL		PPC_BITMASK(0, 31)
> +
> +
>   #define OB_ODL_OFFSET(brick_index) \
>   	((((brick_index - 2) >> 1) * 0x3000000) + ((brick_index == 3 || brick_index == 4) ? 1 : 0))
>   
> 

-- 
Andrew Donnellan              OzLabs, ADL Canberra
andrew.donnellan at au1.ibm.com  IBM Australia Limited



More information about the Skiboot mailing list