[Skiboot] [PATCH 4/5] opal/eeh: Send an error callout on EEH error.

Mahesh Salgaonkar mahesh at linux.ibm.com
Wed Jul 29 13:01:05 AEST 2020


On EEH error send out an error log (eSEL) with hardware callout. To avoid
generating multiple events for same error, use a bit flag in generic PHB
structure. Use two bits i.e SEND and SENT bit.  Whenever an EEH
freeze/fence is detected, a SEND error log bit is set. Once the error log
is queued, a SENT error log bit is set. These bits are sticky and gets
reset when PHB is reinitialized to clear the EEH error. The error log
includes FRU details and PHB diag data

Signed-off-by: Mahesh Salgaonkar <mahesh at linux.ibm.com>
---
 core/pci-opal.c    |   49 +++++++++++++++++++++++++++++++++++++++++++++++++
 hw/phb3.c          |    6 ++++++
 hw/phb4.c          |    6 ++++++
 include/errorlog.h |    1 +
 include/pci.h      |    5 +++++
 5 files changed, 67 insertions(+)

diff --git a/core/pci-opal.c b/core/pci-opal.c
index aa375c6aa..47503d5d2 100644
--- a/core/pci-opal.c
+++ b/core/pci-opal.c
@@ -13,6 +13,12 @@
 #include <opal-msg.h>
 #include <timebase.h>
 #include <timer.h>
+#include <errorlog.h>
+#include <chip.h>
+
+DEFINE_LOG_ENTRY(OPAL_RC_PCI_RESET_PHB, OPAL_INPUT_OUTPUT_ERR_EVT,
+		OPAL_PCI, OPAL_IO_DEVICES, OPAL_UNRECOVERABLE_ERR_GENERAL,
+		OPAL_NA);
 
 #define OPAL_PCICFG_ACCESS_READ(op, cb, type)	\
 static int64_t opal_pci_config_##op(uint64_t phb_id,			\
@@ -984,6 +990,45 @@ static int64_t opal_pci_set_power_state(uint64_t async_token,
 }
 opal_call(OPAL_PCI_SET_POWER_STATE, opal_pci_set_power_state, 3);
 
+static void send_eeh_serviceable_event(struct phb *phb, void *diag_buffer)
+{
+	struct errorlog *buf;
+	const char *loc, *part, *serial;
+	uint32_t chip_id, len;
+	struct OpalIoPhbErrorCommon *common;
+
+	/* Generate and send an error log/eSEL */
+	buf = opal_elog_create(&e_info(OPAL_RC_PCI_RESET_PHB), 0);
+	if (!buf) {
+		prerror("Unable to send EEH error log (eSEL)\n");
+		return;
+	}
+
+	log_append_msg(buf, "PHB#%x Freeze/Fence detected!\n", phb->opal_id);
+	log_mark_serviceable(buf);
+
+	/* Add PHB base location code */
+	loc = phb->base_loc_code;
+	log_add_callout_section(buf, loc, NULL, NULL);
+
+	/* Add FRU callout of associated chip id */
+	chip_id = dt_get_chip_id(phb->dt_node);
+	loc = chip_loc_code(chip_id);
+	part = chip_part_number(chip_id);
+	serial = chip_serial_number(chip_id);
+	log_add_callout_section(buf, loc, part, serial);
+
+	/* Insert the phb diag data. */
+	common = (struct OpalIoPhbErrorCommon *)diag_buffer;
+	len = be32_to_cpu(common->len);
+
+	log_add_section(buf, OPAL_ELOG_SEC_DIAG);
+	log_append_data(buf, diag_buffer, len);
+	log_commit(buf);
+
+	phb->flags |= PCI_EEH_ERR_LOG_SENT;
+}
+
 static int64_t opal_pci_get_phb_diag_data2(uint64_t phb_id,
 					   void *diag_buffer,
 					   uint64_t diag_buffer_len)
@@ -1000,6 +1045,10 @@ static int64_t opal_pci_get_phb_diag_data2(uint64_t phb_id,
 		return OPAL_UNSUPPORTED;
 	phb_lock(phb);
 	rc = phb->ops->get_diag_data2(phb, diag_buffer, diag_buffer_len);
+
+	/* Send an error log if required */
+	if ((phb->flags & PCI_EEH_ERR_LOG_MASK) == PCI_EEH_ERR_LOG_SEND)
+		send_eeh_serviceable_event(phb, diag_buffer);
 	phb_unlock(phb);
 
 	return rc;
diff --git a/hw/phb3.c b/hw/phb3.c
index 8af6b6164..0d7370f52 100644
--- a/hw/phb3.c
+++ b/hw/phb3.c
@@ -68,6 +68,9 @@ static bool phb3_fenced(struct phb3 *p)
 	if (nfir & PPC_BIT(16)) {
 		p->flags |= PHB3_AIB_FENCED;
 
+		/* Mark flag to send an error log */
+		p->phb.flags |= PCI_EEH_ERR_LOG_SEND;
+
 		phb3_eeh_dump_regs(p, NULL);
 		return true;
 	}
@@ -2758,6 +2761,9 @@ static int64_t phb3_creset(struct pci_slot *slot)
 		 */
 		p->flags &= ~PHB3_AIB_FENCED;
 		p->flags &= ~PHB3_CAPP_RECOVERY;
+
+		/* Reset the error logging related flag */
+		p->phb.flags &= ~PCI_EEH_ERR_LOG_MASK;
 		phb3_init_hw(p, false);
 
 		if (p->flags & PHB3_CAPP_DISABLING) {
diff --git a/hw/phb4.c b/hw/phb4.c
index 3f22a2c4d..8e59cdba4 100644
--- a/hw/phb4.c
+++ b/hw/phb4.c
@@ -2550,6 +2550,9 @@ static bool phb4_fenced(struct phb4 *p)
 	/* Mark ourselves fenced */
 	p->flags |= PHB4_AIB_FENCED;
 
+	/* Mark flag to send an error log */
+	p->phb.flags |= PCI_EEH_ERR_LOG_SEND;
+
 	PHBERR(p, "PHB Freeze/Fence detected !\n");
 	phb4_dump_pec_err_regs(p);
 
@@ -3444,6 +3447,9 @@ static int64_t phb4_creset(struct pci_slot *slot)
 		p->flags &= ~PHB4_AIB_FENCED;
 		p->flags &= ~PHB4_CAPP_RECOVERY;
 		p->flags &= ~PHB4_CFG_USE_ASB;
+
+		/* Reset the error logging related flag */
+		p->phb.flags &= ~PCI_EEH_ERR_LOG_MASK;
 		phb4_init_hw(p);
 		pci_slot_set_state(slot, PHB4_SLOT_CRESET_FRESET);
 
diff --git a/include/errorlog.h b/include/errorlog.h
index 24d12c4d8..9490e4ec4 100644
--- a/include/errorlog.h
+++ b/include/errorlog.h
@@ -341,6 +341,7 @@ enum opal_reasoncode {
 };
 
 #define OPAL_ELOG_SEC_DESC	0x44455343
+#define OPAL_ELOG_SEC_DIAG	0x44494147	/* For EEH diag data */
 
 #define DEFINE_LOG_ENTRY(reason, type, id, subsys,			\
 severity, subtype) static struct opal_err_info err_##reason =		\
diff --git a/include/pci.h b/include/pci.h
index eb23a6d9b..feadbf21d 100644
--- a/include/pci.h
+++ b/include/pci.h
@@ -382,6 +382,11 @@ struct phb {
 
 	/* Additional data the platform might need to attach */
 	void			*platform_data;
+
+	uint32_t		flags;
+#define PCI_EEH_ERR_LOG_SEND	0x1
+#define PCI_EEH_ERR_LOG_SENT	0x2
+#define PCI_EEH_ERR_LOG_MASK	0x3
 };
 
 static inline void phb_lock(struct phb *phb)




More information about the Skiboot mailing list