[Skiboot] [PATCH v2 07/10] opal/eeh: Send an error callout for EEH errors on PEs.
Mahesh Salgaonkar
mahesh at linux.ibm.com
Wed Oct 7 23:09:34 AEDT 2020
Send error log events for single-PE and multi-PE freezes/fences, generating
one error log per PE error. This patch adds a hook in the
freeze_status/next_error OPAL calls to capture single/multi-PE errors and
send an error log per affected PE. Add a bitmap sized for the maximum supported
PE number and use it to ensure the error log is reported only once per PE.
Signed-off-by: Mahesh Salgaonkar <mahesh at linux.ibm.com>
---
core/pci-opal.c | 28 ++++++++++++++++++++++++++++
core/pci.c | 11 +++++++++++
include/pci.h | 4 ++++
3 files changed, 43 insertions(+)
diff --git a/core/pci-opal.c b/core/pci-opal.c
index b0cb50069..333682a0b 100644
--- a/core/pci-opal.c
+++ b/core/pci-opal.c
@@ -124,6 +124,26 @@ static void send_phb_freeze_event(struct phb *phb, void *diag_buffer)
phb->flags &= ~PCI_EEH_ERR_LOG_SEND;
}
+static void send_phb_pe_freeze_event(struct phb *phb, uint64_t pe_number)
+{
+ struct errorlog *buf;
+
+ buf = opal_elog_create(&e_info(OPAL_RC_PCI_PHB_FREEZE), 0);
+ if (!buf) {
+ prerror("Unable to send EEH error log (eSEL)\n");
+ return;
+ }
+
+ log_append_msg(buf, "PHB#%x PE#%lld Freeze/Fence detected!\n",
+ phb->opal_id, pe_number);
+
+ /* TODO: Get PHB status data */
+ /* TODO: Add location info of slot with frozen PEs */
+
+ send_eeh_serviceable_event(phb, buf, NULL);
+ bitmap_set_bit(*phb->pe_freeze_reported, pe_number);
+}
+
static int64_t opal_pci_config_read_half_word_be(uint64_t phb_id,
uint64_t bus_dev_func,
uint64_t offset,
@@ -206,6 +226,10 @@ static int64_t opal_pci_eeh_freeze_status(uint64_t phb_id, uint64_t pe_number,
rc = phb->ops->eeh_freeze_status(phb, pe_number, freeze_state,
&pci_error_type, NULL);
*__pci_error_type = cpu_to_be16(pci_error_type);
+
+ if (*freeze_state &&
+ !bitmap_tst_bit(*phb->pe_freeze_reported, pe_number))
+ send_phb_pe_freeze_event(phb, pe_number);
phb_unlock(phb);
return rc;
@@ -1098,6 +1122,10 @@ static int64_t opal_pci_next_error(uint64_t phb_id, __be64 *__first_frozen_pe,
opal_pci_eeh_clear_evt(phb_id);
rc = phb->ops->next_error(phb, &first_frozen_pe, &pci_error_type,
&severity);
+ if ((first_frozen_pe != (uint64_t)-1) &&
+ !bitmap_tst_bit(*phb->pe_freeze_reported, first_frozen_pe))
+ send_phb_pe_freeze_event(phb, first_frozen_pe);
+
phb_unlock(phb);
*__first_frozen_pe = cpu_to_be64(first_frozen_pe);
diff --git a/core/pci.c b/core/pci.c
index e195ecbf4..ba3f294ae 100644
--- a/core/pci.c
+++ b/core/pci.c
@@ -1054,6 +1054,17 @@ int64_t pci_register_phb(struct phb *phb, int opal_id)
phb->filter_map = zalloc(BITMAP_BYTES(0x10000));
assert(phb->filter_map);
+ /*
+ * Allocate a bitmap to track PE errors that are already reported.
+ *
+ * TODO: PHB registration takes place way before it queries the
+ * capabilities hence phb4->max_num_pes is not known yet. Hence use 512
+ * as maximum supported PEs while allocating bitmap. Need to find a way
+ * to detect it at registration time.
+ */
+ phb->pe_freeze_reported = zalloc(BITMAP_BYTES(MAX_NUM_PES));
+ assert(phb->pe_freeze_reported);
+
return OPAL_SUCCESS;
}
diff --git a/include/pci.h b/include/pci.h
index b2a9af3e8..d8f712e72 100644
--- a/include/pci.h
+++ b/include/pci.h
@@ -31,6 +31,9 @@
PCI_BUS_NUM(_bdfn), \
PCI_DEV(_bdfn), PCI_FUNC(_bdfn), ## a)
+/* Maximum number of supported PEs */
+#define MAX_NUM_PES 512
+
struct pci_device;
struct pci_cfg_reg_filter;
@@ -373,6 +376,7 @@ struct phb {
struct pci_lsi_state lstate;
uint32_t mps;
bitmap_t *filter_map;
+ bitmap_t *pe_freeze_reported;
/* PCI-X only slot info, for PCI-E this is in the RC bridge */
struct pci_slot *slot;
More information about the Skiboot
mailing list