[Skiboot] [PATCH v2 07/10] opal/eeh: Send an error callout for EEH errors on PEs.

Wed Oct 7 23:09:34 AEDT 2020

Send error log events for single-PE and multi-PE freezes/fences. Generate
an error log once per PE error. This patch adds a hook in the
freeze_status/next_error OPAL calls to capture single/multi-PE errors and
send an error log per affected PE. Add a bitmap sized for the maximum
supported PE number and use it to report the error log only once per PE.

Signed-off-by: Mahesh Salgaonkar <mahesh at linux.ibm.com>
---
 core/pci-opal.c |   28 ++++++++++++++++++++++++++++
 core/pci.c      |   11 +++++++++++
 include/pci.h   |    4 ++++
 3 files changed, 43 insertions(+)

diff --git a/core/pci-opal.c b/core/pci-opal.c
index b0cb50069..333682a0b 100644
--- a/core/pci-opal.c
+++ b/core/pci-opal.c
@@ -124,6 +124,26 @@ static void send_phb_freeze_event(struct phb *phb, void *diag_buffer)
 	phb->flags &= ~PCI_EEH_ERR_LOG_SEND;
 }
 
+static void send_phb_pe_freeze_event(struct phb *phb, uint64_t pe_number)
+{
+	struct errorlog *buf;
+
+	buf = opal_elog_create(&e_info(OPAL_RC_PCI_PHB_FREEZE), 0);
+	if (!buf) {
+		prerror("Unable to send EEH error log (eSEL)\n");
+		return;
+	}
+
+	log_append_msg(buf, "PHB#%x PE#%lld Freeze/Fence detected!\n",
+					phb->opal_id, pe_number);
+
+	/* TODO: Get PHB status data */
+	/* TODO: Add location info of slot with frozen PEs */
+
+	send_eeh_serviceable_event(phb, buf, NULL);
+	bitmap_set_bit(*phb->pe_freeze_reported, pe_number);
+}
+
 static int64_t opal_pci_config_read_half_word_be(uint64_t phb_id,
 						 uint64_t bus_dev_func,
 						 uint64_t offset,
@@ -206,6 +226,10 @@ static int64_t opal_pci_eeh_freeze_status(uint64_t phb_id, uint64_t pe_number,
 	rc = phb->ops->eeh_freeze_status(phb, pe_number, freeze_state,
 					 &pci_error_type, NULL);
 	*__pci_error_type = cpu_to_be16(pci_error_type);
+
+	if (*freeze_state &&
+		!bitmap_tst_bit(*phb->pe_freeze_reported, pe_number))
+		send_phb_pe_freeze_event(phb, pe_number);
 	phb_unlock(phb);
 
 	return rc;
@@ -1098,6 +1122,10 @@ static int64_t opal_pci_next_error(uint64_t phb_id, __be64 *__first_frozen_pe,
 	opal_pci_eeh_clear_evt(phb_id);
 	rc = phb->ops->next_error(phb, &first_frozen_pe, &pci_error_type,
 				  &severity);
+	if ((first_frozen_pe != (uint64_t)-1) &&
+		!bitmap_tst_bit(*phb->pe_freeze_reported, first_frozen_pe))
+		send_phb_pe_freeze_event(phb, first_frozen_pe);
+
 	phb_unlock(phb);
 
 	*__first_frozen_pe = cpu_to_be64(first_frozen_pe);
diff --git a/core/pci.c b/core/pci.c
index e195ecbf4..ba3f294ae 100644
--- a/core/pci.c
+++ b/core/pci.c
@@ -1054,6 +1054,17 @@ int64_t pci_register_phb(struct phb *phb, int opal_id)
 	phb->filter_map = zalloc(BITMAP_BYTES(0x10000));
 	assert(phb->filter_map);
 
+	/*
+	 * Allocate a bitmap to track PE errors that are already reported.
+	 *
+	 * TODO: PHB registration takes place well before the capabilities are
+	 * queried, so phb4->max_num_pes is not known yet. Hence use 512 as the
+	 * maximum supported PEs while allocating the bitmap. Need to find a
+	 * way to detect it at registration time.
+	 */
+	phb->pe_freeze_reported = zalloc(BITMAP_BYTES(MAX_NUM_PES));
+	assert(phb->pe_freeze_reported);
+
 	return OPAL_SUCCESS;
 }
 
diff --git a/include/pci.h b/include/pci.h
index b2a9af3e8..d8f712e72 100644
--- a/include/pci.h
+++ b/include/pci.h
@@ -31,6 +31,9 @@
 	      PCI_BUS_NUM(_bdfn),			\
 	      PCI_DEV(_bdfn), PCI_FUNC(_bdfn), ## a)
 
+/* Maximum number of supported PEs */
+#define MAX_NUM_PES	512
+
 struct pci_device;
 struct pci_cfg_reg_filter;
 
@@ -373,6 +376,7 @@ struct phb {
 	struct pci_lsi_state	lstate;
 	uint32_t		mps;
 	bitmap_t		*filter_map;
+	bitmap_t		*pe_freeze_reported;
 
 	/* PCI-X only slot info, for PCI-E this is in the RC bridge */
 	struct pci_slot		*slot;