[Skiboot] [PATCH 4/5] opal/eeh: Send an error callout on EEH error.

Thu Jul 30 11:24:35 AEST 2020

On Wed, Jul 29, 2020 at 1:01 PM Mahesh Salgaonkar <mahesh at linux.ibm.com> wrote:
>
> On EEH error send out an error log (eSEL) with hardware callout. To avoid
> generating multiple events for same error, use a bit flag in generic PHB
> structure. Use two bits i.e SEND and SENT bit. Whenever an EEH
> freeze/fence is detected, a SEND error log bit is set. Once the error log
> is queued, a SENT error log bit is set.

I don't think this is the right approach. There's a couple of
different events we need to be concerned with namely:

1) Full PHB Fences
2) Multi-PE freezes (PELT-V)
3) Single PE freezes

1) is the rarest kind and it looks like this is the only type this
patch actually addresses. 2) and 3) are the common cases so we need to
log those too and call out the correct slots when they occur. They're
a bit trickier since we'd need to track whether a log has been sent on
a per-PE basis. However, a bitmap that tracks it on a per-PE basis is
probably sufficent.

> These bits are sticky and gets reset when PHB is reinitialized to clear the EEH error.

Stickness is a hardware concept and I'd prefer we didn't refer to
random bits of software state as "sticky" since it just confuses
matters.

> The error log includes FRU details and PHB diag data
>
> Signed-off-by: Mahesh Salgaonkar <mahesh at linux.ibm.com>
> ---
>  core/pci-opal.c    |   49 +++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/phb3.c          |    6 ++++++
>  hw/phb4.c          |    6 ++++++
>  include/errorlog.h |    1 +
>  include/pci.h      |    5 +++++
>  5 files changed, 67 insertions(+)
>
> diff --git a/core/pci-opal.c b/core/pci-opal.c
> index aa375c6aa..47503d5d2 100644
> --- a/core/pci-opal.c
> +++ b/core/pci-opal.c
> @@ -13,6 +13,12 @@
>  #include <opal-msg.h>
>  #include <timebase.h>
>  #include <timer.h>
> +#include <errorlog.h>
> +#include <chip.h>
> +
> +DEFINE_LOG_ENTRY(OPAL_RC_PCI_RESET_PHB, OPAL_INPUT_OUTPUT_ERR_EVT,
> +               OPAL_PCI, OPAL_IO_DEVICES, OPAL_UNRECOVERABLE_ERR_GENERAL,
> +               OPAL_NA);
>
>  #define OPAL_PCICFG_ACCESS_READ(op, cb, type)  \
>  static int64_t opal_pci_config_##op(uint64_t phb_id,                   \
> @@ -984,6 +990,45 @@ static int64_t opal_pci_set_power_state(uint64_t async_token,
>  }
>  opal_call(OPAL_PCI_SET_POWER_STATE, opal_pci_set_power_state, 3);
>
> +static void send_eeh_serviceable_event(struct phb *phb, void *diag_buffer)
> +{
> +       struct errorlog *buf;
> +       const char *loc, *part, *serial;
> +       uint32_t chip_id, len;
> +       struct OpalIoPhbErrorCommon *common;
> +
> +       /* Generate and send an error log/eSEL */
> +       buf = opal_elog_create(&e_info(OPAL_RC_PCI_RESET_PHB), 0);

OPAL_RC_PCI_RESET_PHB is aliased with OPAL_RC_PCI_INIT_SLOT so eh...
I'd prefer we had a seperate elog type for reporting errors,
especially considering PE freezes don't require a PHB reset.

> +       if (!buf) {
> +               prerror("Unable to send EEH error log (eSEL)\n");
> +               return;
> +       }
> +       log_append_msg(buf, "PHB#%x Freeze/Fence detected!\n", phb->opal_id);
> +       log_mark_serviceable(buf);
> +
> +       /* Add PHB base location code */
> +       loc = phb->base_loc_code;

I'm fairly certain this isn't the right location code to be reporting.
I think this is populated from the io-base-loc code which is going to
refer to the system planar rather than the slot containing the device
which EEHed. The right thing would be to log the location codes
corresponding to slots with a frozen PE.

> +       log_add_callout_section(buf, loc, NULL, NULL);
> +
> +       /* Add FRU callout of associated chip id */
> +       chip_id = dt_get_chip_id(phb->dt_node);
> +       loc = chip_loc_code(chip_id);
> +       part = chip_part_number(chip_id);
> +       serial = chip_serial_number(chip_id);
> +       log_add_callout_section(buf, loc, part, serial);
> +
> +       /* Insert the phb diag data. */
> +       common = (struct OpalIoPhbErrorCommon *)diag_buffer;

don't cast void pointers

> +       len = be32_to_cpu(common->len);
> +
> +       log_add_section(buf, OPAL_ELOG_SEC_DIAG);
> +       log_append_data(buf, diag_buffer, len);
> +       log_commit(buf);
> +
> +       phb->flags |= PCI_EEH_ERR_LOG_SENT;
> +}
> +
>  static int64_t opal_pci_get_phb_diag_data2(uint64_t phb_id,
>                                            void *diag_buffer,
>                                            uint64_t diag_buffer_len)
> @@ -1000,6 +1045,10 @@ static int64_t opal_pci_get_phb_diag_data2(uint64_t phb_id,
>                 return OPAL_UNSUPPORTED;
>         phb_lock(phb);
>         rc = phb->ops->get_diag_data2(phb, diag_buffer, diag_buffer_len);
> +
> +       /* Send an error log if required */
> +       if ((phb->flags & PCI_EEH_ERR_LOG_MASK) == PCI_EEH_ERR_LOG_SEND)
> +               send_eeh_serviceable_event(phb, diag_buffer);
>         phb_unlock(phb);
>
>         return rc;
> diff --git a/hw/phb3.c b/hw/phb3.c
> index 8af6b6164..0d7370f52 100644
> --- a/hw/phb3.c
> +++ b/hw/phb3.c
> @@ -68,6 +68,9 @@ static bool phb3_fenced(struct phb3 *p)
>         if (nfir & PPC_BIT(16)) {
>                 p->flags |= PHB3_AIB_FENCED;
>
> +               /* Mark flag to send an error log */
> +               p->phb.flags |= PCI_EEH_ERR_LOG_SEND;
> +
>                 phb3_eeh_dump_regs(p, NULL);
>                 return true;
>         }
> @@ -2758,6 +2761,9 @@ static int64_t phb3_creset(struct pci_slot *slot)
>                  */
>                 p->flags &= ~PHB3_AIB_FENCED;
>                 p->flags &= ~PHB3_CAPP_RECOVERY;
> +
> +               /* Reset the error logging related flag */
> +               p->phb.flags &= ~PCI_EEH_ERR_LOG_MASK;
>                 phb3_init_hw(p, false);
>
>                 if (p->flags & PHB3_CAPP_DISABLING) {
> diff --git a/hw/phb4.c b/hw/phb4.c
> index 3f22a2c4d..8e59cdba4 100644
> --- a/hw/phb4.c
> +++ b/hw/phb4.c
> @@ -2550,6 +2550,9 @@ static bool phb4_fenced(struct phb4 *p)
>         /* Mark ourselves fenced */
>         p->flags |= PHB4_AIB_FENCED;
>
> +       /* Mark flag to send an error log */
> +       p->phb.flags |= PCI_EEH_ERR_LOG_SEND;
> +
>         PHBERR(p, "PHB Freeze/Fence detected !\n");
>         phb4_dump_pec_err_regs(p);
>
> @@ -3444,6 +3447,9 @@ static int64_t phb4_creset(struct pci_slot *slot)
>                 p->flags &= ~PHB4_AIB_FENCED;
>                 p->flags &= ~PHB4_CAPP_RECOVERY;
>                 p->flags &= ~PHB4_CFG_USE_ASB;
> +
> +               /* Reset the error logging related flag */
> +               p->phb.flags &= ~PCI_EEH_ERR_LOG_MASK;
>                 phb4_init_hw(p);
>                 pci_slot_set_state(slot, PHB4_SLOT_CRESET_FRESET);
>
> diff --git a/include/errorlog.h b/include/errorlog.h
> index 24d12c4d8..9490e4ec4 100644
> --- a/include/errorlog.h
> +++ b/include/errorlog.h
> @@ -341,6 +341,7 @@ enum opal_reasoncode {
>  };
>
>  #define OPAL_ELOG_SEC_DESC     0x44455343
> +#define OPAL_ELOG_SEC_DIAG     0x44494147      /* For EEH diag data */
>
>  #define DEFINE_LOG_ENTRY(reason, type, id, subsys,                     \
>  severity, subtype) static struct opal_err_info err_##reason =          \
> diff --git a/include/pci.h b/include/pci.h
> index eb23a6d9b..feadbf21d 100644
> --- a/include/pci.h
> +++ b/include/pci.h
> @@ -382,6 +382,11 @@ struct phb {
>
>         /* Additional data the platform might need to attach */
>         void                    *platform_data;
> +
> +       uint32_t                flags;
> +#define PCI_EEH_ERR_LOG_SEND   0x1
> +#define PCI_EEH_ERR_LOG_SENT   0x2
> +#define PCI_EEH_ERR_LOG_MASK   0x3

Why do we need seperate SEND and SENT bits?

>  };
>
>  static inline void phb_lock(struct phb *phb)
>
>