[Skiboot] [PATCH v3 4/7] hw/npu2: Setup an error interrupt on some opencapi FIRs

Andrew Donnellan andrew.donnellan at au1.ibm.com
Mon Apr 8 17:19:29 AEST 2019


On 6/4/19 1:33 am, Frederic Barrat wrote:
> Many errors reported in the NPU FIR2 register, mostly catching
> unexpected errors on the opencapi link, are defined as 'brick fatal' in
> the workbook, yet the default action is set to system checkstop. It's
> possible to see those errors during AFU development, where the AFU may
> send unexpected packets on the link, therefore triggering those
> errors. Checkstopping the system in this case is clearly extreme, as
> the error could be contained to the brick and proper analysis of a
> checkstop is not trivial outside of a bringup environment.
> 
> This patch changes the default action of those errors so that the NPU
> will raise an interrupt instead. Follow-up patches will log
> proper information so that the error can be debugged and Linux can
> catch the event.
> 
> Signed-off-by: Frederic Barrat <fbarrat at linux.ibm.com>

Adding my:

Reviewed-by: Andrew Donnellan <andrew.donnellan at au1.ibm.com>

again.

> ---
> Changelog
> v2, v3: no change
> 
>   hw/npu2-common.c    | 27 +++++++++++++++++++++------
>   hw/npu2-opencapi.c  | 39 ++++++++++++++++++++++++++++++++-------
>   include/npu2-regs.h |  5 ++++-
>   3 files changed, 57 insertions(+), 14 deletions(-)
> 
> diff --git a/hw/npu2-common.c b/hw/npu2-common.c
> index 0b46f68c..ccbbbbca 100644
> --- a/hw/npu2-common.c
> +++ b/hw/npu2-common.c
> @@ -108,8 +108,12 @@ static uint64_t npu2_ipi_attributes(struct irq_source *is __unused, uint32_t isn
>   	struct npu2 *p = is->data;
>   	uint32_t idx = isn - p->base_lsi;
>   
> -	if (idx == 18)
> -		/* TCE Interrupt - used to detect a frozen PE */
> +	if ((idx == 18) || (idx >= 27 && idx <= 34))
> +		/*
> +		 * level 18: TCE Interrupt - used to detect a frozen PE (nvlink)
> +		 * level 27-30: OTL interrupt (opencapi)
> +		 * level 31-34: XSL interrupt (opencapi)
> +		 */
>   		return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TARGET_RARE | IRQ_ATTR_TYPE_MSI;
>   	else
>   		return IRQ_ATTR_TARGET_LINUX;
> @@ -166,14 +170,25 @@ static void npu2_err_interrupt(struct irq_source *is, uint32_t isn)
>   {
>   	struct npu2 *p = is->data;
>   	uint32_t idx = isn - p->base_lsi;
> +	int brick;
>   
> -	if (idx != 18) {
> +	switch (idx) {
> +	case 18:
> +		opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
> +					OPAL_EVENT_PCI_ERROR);
> +		break;
> +	case 27 ... 34:
> +		/* opencapi only */
> +		brick = 2 + ((idx - 27) % 4);
> +		prlog(PR_ERR, "NPU[%d] error interrupt for brick %d\n",
> +			p->chip_id, brick);
> +		opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
> +					OPAL_EVENT_PCI_ERROR);
> +		break;
> +	default:
>   		prerror("OPAL received unknown NPU2 interrupt %d\n", idx);
>   		return;
>   	}
> -
> -	opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
> -				OPAL_EVENT_PCI_ERROR);
>   }
>   
>   static const struct irq_source_ops npu2_ipi_ops = {
> diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c
> index d32aaa53..285615a5 100644
> --- a/hw/npu2-opencapi.c
> +++ b/hw/npu2-opencapi.c
> @@ -1509,9 +1509,9 @@ static void mask_nvlink_fir(struct npu2 *p)
>   	 */
>   
>   	/* Mask FIRs */
> -	xscom_read(p->chip_id, p->xscom_base + NPU2_MISC_FIR_MASK1, &reg);
> +	xscom_read(p->chip_id, p->xscom_base + NPU2_MISC_FIR1_MASK, &reg);
>   	reg = SETFIELD(PPC_BITMASK(0, 11), reg, 0xFFF);
> -	xscom_write(p->chip_id, p->xscom_base + NPU2_MISC_FIR_MASK1, reg);
> +	xscom_write(p->chip_id, p->xscom_base + NPU2_MISC_FIR1_MASK, reg);
>   
>   	/* freeze disable */
>   	reg = npu2_scom_read(p->chip_id, p->xscom_base,
> @@ -1535,17 +1535,42 @@ static void mask_nvlink_fir(struct npu2 *p)
>   			NPU2_MISC_IRQ_ENABLE1, NPU2_MISC_DA_LEN_8B, reg);
>   }
>   
> -static int enable_xsl_irq(struct npu2 *p)
> +static int enable_interrupts(struct npu2 *p)
>   {
> -	uint64_t reg;
> +	uint64_t reg, val_xsl, val_override;
> +
> +	/*
> +	 * Enable translation interrupts for all bricks and override
> +	 * every brick-fatal error to send an interrupt instead of
> +	 * checkstopping.
> +	 *
> +	 * FIR bits configured to trigger an interrupt must have their
> +	 * default action masked
> +	 */
> +	val_xsl = PPC_BIT(0) | PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3);
> +	val_override = 0x0FFFEFC00FF1B000;
> +
> +	xscom_read(p->chip_id, p->xscom_base + NPU2_MISC_FIR2_MASK, &reg);
> +	reg |= val_xsl | val_override;
> +	xscom_write(p->chip_id, p->xscom_base + NPU2_MISC_FIR2_MASK, reg);
>   
> -	/* enable translation interrupts for all bricks */
>   	reg = npu2_scom_read(p->chip_id, p->xscom_base, NPU2_MISC_IRQ_ENABLE2,
>   			     NPU2_MISC_DA_LEN_8B);
> -	reg |= PPC_BIT(0) | PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3);
> +	reg |= val_xsl | val_override;
>   	npu2_scom_write(p->chip_id, p->xscom_base, NPU2_MISC_IRQ_ENABLE2,
>   			NPU2_MISC_DA_LEN_8B, reg);
>   
> +	/*
> +	 * Make sure the brick is fenced on those errors.
> +	 * Fencing is incompatible with freezing, but there's no
> +	 * freeze defined for FIR2, so we don't have to worry about it
> +	 */
> +	reg = npu2_scom_read(p->chip_id, p->xscom_base, NPU2_MISC_FENCE_ENABLE2,
> +			     NPU2_MISC_DA_LEN_8B);
> +	reg |= val_override;
> +	npu2_scom_write(p->chip_id, p->xscom_base, NPU2_MISC_FENCE_ENABLE2,
> +			NPU2_MISC_DA_LEN_8B, reg);
> +
>   	mask_nvlink_fir(p);
>   	return 0;
>   }
> @@ -1704,7 +1729,7 @@ int npu2_opencapi_init_npu(struct npu2 *npu)
>   		address_translation_config(npu->chip_id, npu->xscom_base, dev->brick_index);
>   	}
>   
> -	enable_xsl_irq(npu);
> +	enable_interrupts(npu);
>   
>   	for (int i = 0; i < npu->total_devices; i++) {
>   		dev = &npu->devices[i];
> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
> index ca311097..939a23f5 100644
> --- a/include/npu2-regs.h
> +++ b/include/npu2-regs.h
> @@ -480,10 +480,13 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
>   #define NPU2_MISC_IRQ_LOG13			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x368)
>   #define NPU2_MISC_IRQ_LOG14			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x370)
>   #define NPU2_MISC_IRQ_LOG15			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x378)
> +#define NPU2_MISC_FENCE_ENABLE2			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x400)
>   #define NPU2_MISC_IRQ_ENABLE2			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x408)
>   
>   /* Misc register, direct access only */
> -#define NPU2_MISC_FIR_MASK1		0x2C43
> +#define NPU2_MISC_FIR0_MASK		0x2C03
> +#define NPU2_MISC_FIR1_MASK		0x2C43
> +#define NPU2_MISC_FIR2_MASK		0x2C83
>   
>   /* ATS block registers */
>   #define NPU2_ATS_PMU_CTL			NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_ATS, 0x000)
> 

-- 
Andrew Donnellan              OzLabs, ADL Canberra
andrew.donnellan at au1.ibm.com  IBM Australia Limited



More information about the Skiboot mailing list