[Skiboot] [RFC 08/12] npu2-opencapi: Improve error reporting to the OS
Andrew Donnellan
ajd at linux.ibm.com
Fri Jun 21 17:20:23 AEST 2019
On 19/6/19 10:45 pm, Frederic Barrat wrote:
> When resetting an opencapi link, the brick will be fenced
> temporarily. Therefore we can't rely on the fencing state of the brick
> any more to check for the health of an opencapi PHB, as we could
> report errors if queried for a PHB state at the same time a link is
> being reset.
>
> Instead, we flag the device as 'broken' when an error interrupt is
> received, just before raising an event to the OS. When the OS is
> querying for the state of a PHB, we only have to look at the 'broken'
> attribute.
>
> Note that there's no recovery possible on P9 when an error interrupt
> is received unexpectedly, as recovery is not supported by hardware. So
> when a device/link is marked as 'broken', it stays broken. All the OS
> can do is log the error and notify the drivers.
>
> Signed-off-by: Frederic Barrat <fbarrat at linux.ibm.com>
Reviewed-by: Andrew Donnellan <ajd at linux.ibm.com>
> ---
> hw/npu2-common.c | 7 +++++++
> hw/npu2-opencapi.c | 21 +++++++++++++++++----
> include/npu2.h | 4 ++++
> 3 files changed, 28 insertions(+), 4 deletions(-)
>
> diff --git a/hw/npu2-common.c b/hw/npu2-common.c
> index f3f2f45a..a2563efc 100644
> --- a/hw/npu2-common.c
> +++ b/hw/npu2-common.c
> @@ -419,6 +419,13 @@ static void npu2_err_interrupt(struct irq_source *is, uint32_t isn)
> p->chip_id, irq_name);
> free(irq_name);
> show_all_regs(p, brick);
> + /*
> + * P9 NPU doesn't support recovering a link going down
> + * unexpectedly. So we mark the device as broken and
> + * report it to the OS, so that the error is logged
> + * and the drivers notified.
> + */
> + npu2_opencapi_set_broken(p, brick);
> opal_update_pending_evt(OPAL_EVENT_PCI_ERROR,
> OPAL_EVENT_PCI_ERROR);
> break;
> diff --git a/hw/npu2-opencapi.c b/hw/npu2-opencapi.c
> index 153f2c6f..c11c945f 100644
> --- a/hw/npu2-opencapi.c
> +++ b/hw/npu2-opencapi.c
> @@ -1476,14 +1476,12 @@ static int64_t npu2_opencapi_eeh_next_error(struct phb *phb,
> uint16_t *severity)
> {
> struct npu2_dev *dev = phb_to_npu2_dev_ocapi(phb);
> - uint64_t reg;
>
> if (!first_frozen_pe || !pci_error_type || !severity)
> return OPAL_PARAMETER;
>
> - reg = npu2_read(dev->npu, NPU2_MISC_FENCE_STATE);
> - if (reg & PPC_BIT(dev->brick_index)) {
> - OCAPIERR(dev, "Brick %d fenced!\n", dev->brick_index);
> + if (dev->flags & NPU2_DEV_BROKEN) {
> + OCAPIDBG(dev, "Reporting device as broken\n");
> *first_frozen_pe = dev->linux_pe;
> *pci_error_type = OPAL_EEH_PHB_ERROR;
> *severity = OPAL_EEH_SEV_PHB_DEAD;
> @@ -1833,6 +1831,21 @@ static const struct phb_ops npu2_opencapi_ops = {
> .tce_kill = NULL,
> };
>
> +void npu2_opencapi_set_broken(struct npu2 *npu, int brick)
> +{
> + struct phb *phb;
> + struct npu2_dev *dev;
> +
> + for_each_phb(phb) {
> + if (phb->phb_type == phb_type_npu_v2_opencapi) {
> + dev = phb_to_npu2_dev_ocapi(phb);
> + if (dev->npu == npu &&
> + dev->brick_index == brick)
> + dev->flags |= NPU2_DEV_BROKEN;
> + }
> + }
> +}
> +
> static int64_t opal_npu_spa_setup(uint64_t phb_id, uint32_t __unused bdfn,
> uint64_t addr, uint64_t PE_mask)
> {
> diff --git a/include/npu2.h b/include/npu2.h
> index 4648464b..b376d0ee 100644
> --- a/include/npu2.h
> +++ b/include/npu2.h
> @@ -131,6 +131,8 @@ struct npu2_dev_nvlink {
> const char *slot_label;
> };
>
> +#define NPU2_DEV_BROKEN 0x1
> +
> struct npu2_dev {
> enum npu2_dev_type type;
> uint32_t link_index;
> @@ -139,6 +141,7 @@ struct npu2_dev {
> struct dt_node *dt_node;
> struct npu2_pcie_bar bars[2];
> struct npu2 *npu;
> + long flags;
>
> uint32_t bdfn;
>
> @@ -256,4 +259,5 @@ int64_t npu2_freeze_status(struct phb *phb __unused,
> uint16_t *pci_error_type __unused,
> uint16_t *severity __unused);
> void npu2_dump_scoms(int chip_id);
> +void npu2_opencapi_set_broken(struct npu2 *npu, int brick);
> #endif /* __NPU2_H */
>
--
Andrew Donnellan OzLabs, ADL Canberra
ajd at linux.ibm.com IBM Australia Limited
More information about the Skiboot mailing list