[Skiboot] [PATCH v2 4/7] hw/npu2: Setup an error interrupt on some opencapi FIRs
Andrew Donnellan
andrew.donnellan at au1.ibm.com
Thu Apr 4 16:03:05 AEDT 2019
On 26/3/19 5:29 am, Frederic Barrat wrote:
> Many errors reported in the NPU FIR2 register, mostly catching
> unexpected errors on the opencapi link, are defined as 'brick fatal' in
> the workbook, yet the default action is set to system checkstop. It's
> possible to see those errors during AFU development, where the AFU may
> send unexpected packets on the link, therefore triggering those
> errors. Checkstopping the system in this case is clearly extreme, as
> the error could be contained to the brick and proper analysis of a
> checkstop is not trivial outside of a bringup environment.
>
> This patch changes the default action of those errors so that the NPU
> will raise an interrupt instead. Follow-up patches will log
> proper information so that the error can be debugged and linux can
> catch the event.
>
> Signed-off-by: Frederic Barrat <fbarrat at linux.ibm.com>
Looks good to me
Reviewed-by: Andrew Donnellan <andrew.donnellan at au1.ibm.com>
> ---
> Changelog
> v2: no change
>
> hw/npu2-common.c | 27 +++++++++++++++++++++------
> hw/npu2-opencapi.c | 39 ++++++++++++++++++++++++++++++++-------
> include/npu2-regs.h | 5 ++++-
> 3 files changed, 57 insertions(+), 14 deletions(-)
>
> diff --git a/hw/npu2-common.c b/hw/npu2-common.c
> index 0b46f68c..ccbbbbca 100644
> --- a/hw/npu2-common.c
> +++ b/hw/npu2-common.c
> @@ -108,8 +108,12 @@ static uint64_t npu2_ipi_attributes(struct irq_source *is __unused, uint32_t isn
> struct npu2 *p = is->data;
> uint32_t idx = isn - p->base_lsi;
>
> - if (idx == 18)
> - /* TCE Interrupt - used to detect a frozen PE */
> + if ((idx == 18) || (idx >= 27 && idx <= 34))
> + /*
> + * level 18: TCE Interrupt - used to detect a frozen PE (nvlink)
> + * level 27-30: OTL interrupt (opencapi)
> + * level 31-34: XSL interrupt (opencapi)
> + */
> return IRQ_ATTR_TARGET_OPAL | IRQ_ATTR_TARGET_RARE | IRQ_ATTR_TYPE_MSI;
> else
> return IRQ_ATTR_TARGET_LINUX;
> @@ -166,14 +170,25 @@ static void npu2_err_interrupt(struct irq_source *is, uint32_t isn)
> {
> struct npu2 *p = is->data;
> uint32_t idx = isn - p->base_lsi;
> + int brick;
>
> - if (idx != 18) {
> + switch (idx) {
> + case 18:
> + opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
> + OPAL_EVENT_PCI_ERROR);
> + break;
> + case 27 ... 34:
> + /* opencapi only */
> + brick = 2 + ((idx - 27) % 4);
> + prlog(PR_ERR, "NPU[%d] error interrupt for brick %d\n",
> + p->chip_id, brick);
> + opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
> + OPAL_EVENT_PCI_ERROR);
> + break;
> + default:
> prerror("OPAL received unknown NPU2 interrupt %d\n", idx);
> return;
> }
> -
> - opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
> - OPAL_EVENT_PCI_ERROR);
> }
>
> static const struct irq_source_ops npu2_ipi_ops = {
> diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c
> index d32aaa53..285615a5 100644
> --- a/hw/npu2-opencapi.c
> +++ b/hw/npu2-opencapi.c
> @@ -1509,9 +1509,9 @@ static void mask_nvlink_fir(struct npu2 *p)
> */
>
> /* Mask FIRs */
> - xscom_read(p->chip_id, p->xscom_base + NPU2_MISC_FIR_MASK1, &reg);
> + xscom_read(p->chip_id, p->xscom_base + NPU2_MISC_FIR1_MASK, &reg);
> reg = SETFIELD(PPC_BITMASK(0, 11), reg, 0xFFF);
> - xscom_write(p->chip_id, p->xscom_base + NPU2_MISC_FIR_MASK1, reg);
> + xscom_write(p->chip_id, p->xscom_base + NPU2_MISC_FIR1_MASK, reg);
>
> /* freeze disable */
> reg = npu2_scom_read(p->chip_id, p->xscom_base,
> @@ -1535,17 +1535,42 @@ static void mask_nvlink_fir(struct npu2 *p)
> NPU2_MISC_IRQ_ENABLE1, NPU2_MISC_DA_LEN_8B, reg);
> }
>
> -static int enable_xsl_irq(struct npu2 *p)
> +static int enable_interrupts(struct npu2 *p)
> {
> - uint64_t reg;
> + uint64_t reg, val_xsl, val_override;
> +
> + /*
> + * Enable translation interrupts for all bricks and override
> + * every brick-fatal error to send an interrupt instead of
> + * checkstopping.
> + *
> + * FIR bits configured to trigger an interrupt must have their
> + * default action masked
> + */
> + val_xsl = PPC_BIT(0) | PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3);
> + val_override = 0x0FFFEFC00FF1B000;
> +
> + xscom_read(p->chip_id, p->xscom_base + NPU2_MISC_FIR2_MASK, &reg);
> + reg |= val_xsl | val_override;
> + xscom_write(p->chip_id, p->xscom_base + NPU2_MISC_FIR2_MASK, reg);
>
> - /* enable translation interrupts for all bricks */
> reg = npu2_scom_read(p->chip_id, p->xscom_base, NPU2_MISC_IRQ_ENABLE2,
> NPU2_MISC_DA_LEN_8B);
> - reg |= PPC_BIT(0) | PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3);
> + reg |= val_xsl | val_override;
> npu2_scom_write(p->chip_id, p->xscom_base, NPU2_MISC_IRQ_ENABLE2,
> NPU2_MISC_DA_LEN_8B, reg);
>
> + /*
> + * Make sure the brick is fenced on those errors.
> + * Fencing is incompatible with freezing, but there's no
> + * freeze defined for FIR2, so we don't have to worry about it
> + */
> + reg = npu2_scom_read(p->chip_id, p->xscom_base, NPU2_MISC_FENCE_ENABLE2,
> + NPU2_MISC_DA_LEN_8B);
> + reg |= val_override;
> + npu2_scom_write(p->chip_id, p->xscom_base, NPU2_MISC_FENCE_ENABLE2,
> + NPU2_MISC_DA_LEN_8B, reg);
> +
> mask_nvlink_fir(p);
> return 0;
> }
> @@ -1704,7 +1729,7 @@ int npu2_opencapi_init_npu(struct npu2 *npu)
> address_translation_config(npu->chip_id, npu->xscom_base, dev->brick_index);
> }
>
> - enable_xsl_irq(npu);
> + enable_interrupts(npu);
>
> for (int i = 0; i < npu->total_devices; i++) {
> dev = &npu->devices[i];
> diff --git a/include/npu2-regs.h b/include/npu2-regs.h
> index ca311097..939a23f5 100644
> --- a/include/npu2-regs.h
> +++ b/include/npu2-regs.h
> @@ -480,10 +480,13 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
> #define NPU2_MISC_IRQ_LOG13 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x368)
> #define NPU2_MISC_IRQ_LOG14 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x370)
> #define NPU2_MISC_IRQ_LOG15 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x378)
> +#define NPU2_MISC_FENCE_ENABLE2 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x400)
> #define NPU2_MISC_IRQ_ENABLE2 NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_MISC, 0x408)
>
> /* Misc register, direct access only */
> -#define NPU2_MISC_FIR_MASK1 0x2C43
> +#define NPU2_MISC_FIR0_MASK 0x2C03
> +#define NPU2_MISC_FIR1_MASK 0x2C43
> +#define NPU2_MISC_FIR2_MASK 0x2C83
>
> /* ATS block registers */
> #define NPU2_ATS_PMU_CTL NPU2_REG_OFFSET(NPU2_STACK_MISC, NPU2_BLOCK_ATS, 0x000)
>
--
Andrew Donnellan OzLabs, ADL Canberra
andrew.donnellan at au1.ibm.com IBM Australia Limited
More information about the Skiboot
mailing list