[Skiboot] [PATCH v2 06/10] opal/eeh: Send an error callout on EEH error.

Mahesh Salgaonkar mahesh at linux.ibm.com
Wed Oct 7 23:09:28 AEDT 2020


On an EEH error, send out an error log (eSEL) with a hardware callout. To
avoid generating multiple events for the same error, use a bit flag in the
generic PHB structure. Whenever an EEH freeze/fence is detected, the
PCI_EEH_ERR_LOG_SEND bit is set; the error log is sent when the PHB diag
data is next fetched, and the bit is cleared once the log has been sent.
The error log includes FRU details and PHB diag data. This patch addresses
full PHB fence events. Subsequent patches will address single-PE and
multi-PE freezes/fences.

As part of the FRU details, the log includes the slot location of the root
port, the I/O base location code, and the processor chip FRU details, as
shown below:

|                               Callout Section                            |
|                                                                          |
| Additional Sections      : Disabled                                      |
| Callout Count            : 3                                             |
|                                                                          |
|                             Normal Hardware FRU                          |
| Priority                 : Medium Priority                               |
| Location Code            : U78D2.001.RCH0060-P1-C2                       |
|                                                                          |
|                             Normal Hardware FRU                          |
| Priority                 : Medium Priority                               |
| Location Code            : U78D2.001.RCH0060-P1                          |
| Part Number              : 01EK968                                       |
| Serial Number            : Y230UF6C103M                                  |
|                                                                          |
|                             Normal Hardware FRU                          |
| Priority                 : Medium Priority                               |
| Location Code            : U78D2.001.RCH0060-P1-C48                      |
| Part Number              : 02CY253                                       |
| Serial Number            : YA1934460542                                  |
|                                                                          |

Signed-off-by: Mahesh Salgaonkar <mahesh at linux.ibm.com>
---
Change in v2:
- Introduce new elog type OPAL_RC_PCI_PHB_FREEZE.
---
 core/pci-opal.c    |   70 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 hw/phb3.c          |    3 ++
 hw/phb4.c          |    4 +++
 include/errorlog.h |    2 +
 include/pci.h      |    3 ++
 5 files changed, 82 insertions(+)

diff --git a/core/pci-opal.c b/core/pci-opal.c
index aa375c6aa..b0cb50069 100644
--- a/core/pci-opal.c
+++ b/core/pci-opal.c
@@ -13,6 +13,12 @@
 #include <opal-msg.h>
 #include <timebase.h>
 #include <timer.h>
+#include <errorlog.h>
+#include <chip.h>
+
+DEFINE_LOG_ENTRY(OPAL_RC_PCI_PHB_FREEZE, OPAL_INPUT_OUTPUT_ERR_EVT,
+		OPAL_PCI, OPAL_IO_DEVICES, OPAL_UNRECOVERABLE_ERR_GENERAL,
+		OPAL_NA);
 
 #define OPAL_PCICFG_ACCESS_READ(op, cb, type)	\
 static int64_t opal_pci_config_##op(uint64_t phb_id,			\
@@ -58,6 +64,66 @@ OPAL_PCICFG_ACCESS_WRITE(write_byte,		write8, uint8_t)
 OPAL_PCICFG_ACCESS_WRITE(write_half_word,	write16, uint16_t)
 OPAL_PCICFG_ACCESS_WRITE(write_word,		write32, uint32_t)
 
+/* Generate and send an error log/eSEL */
+static void send_eeh_serviceable_event(struct phb *phb, struct errorlog *buf,
+							void *diag_buffer)
+{
+	const char *loc, *part, *serial;
+	uint32_t chip_id, len;
+	struct OpalIoPhbErrorCommon *common;
+
+	log_mark_serviceable(buf);
+
+	/* Add FRU callout for PHB base (backplane) */
+	loc = phb->base_loc_code;
+	part = phb->base_part_no;
+	serial = phb->base_serial_no;
+	log_add_callout_section(buf, loc, part, serial);
+
+	/* Add FRU callout of associated chip id */
+	chip_id = dt_get_chip_id(phb->dt_node);
+	loc = chip_loc_code(chip_id);
+	part = chip_part_number(chip_id);
+	serial = chip_serial_number(chip_id);
+	log_add_callout_section(buf, loc, part, serial);
+
+	if (!diag_buffer)
+		goto skip_to_commit;
+
+	/* Insert the phb diag data. */
+	common = diag_buffer;
+	len = be32_to_cpu(common->len);
+
+	log_add_section(buf, OPAL_ELOG_SEC_DIAG);
+	log_append_data(buf, diag_buffer, len);
+
+skip_to_commit:
+	log_commit(buf);
+}
+
+static void send_phb_freeze_event(struct phb *phb, void *diag_buffer)
+{
+	struct errorlog *buf;
+	struct pci_device *pd;
+	const char *loc = NULL;
+
+	buf = opal_elog_create(&e_info(OPAL_RC_PCI_PHB_FREEZE), 0);
+	if (!buf) {
+		prerror("Unable to send EEH error log (eSEL)\n");
+		return;
+	}
+
+	log_append_msg(buf, "PHB#%x Freeze/Fence detected!\n", phb->opal_id);
+
+	/* Add slot location info of RootPort */
+	pd = list_entry(phb->devices.n.next, struct pci_device, link);
+	loc = dt_prop_get_def(pd->dn, "ibm,slot-location-code", NULL);
+	log_add_callout_section(buf, loc, NULL, NULL);
+
+	send_eeh_serviceable_event(phb, buf, diag_buffer);
+	phb->flags &= ~PCI_EEH_ERR_LOG_SEND;
+}
+
 static int64_t opal_pci_config_read_half_word_be(uint64_t phb_id,
 						 uint64_t bus_dev_func,
 						 uint64_t offset,
@@ -1000,6 +1066,10 @@ static int64_t opal_pci_get_phb_diag_data2(uint64_t phb_id,
 		return OPAL_UNSUPPORTED;
 	phb_lock(phb);
 	rc = phb->ops->get_diag_data2(phb, diag_buffer, diag_buffer_len);
+
+	/* Send an error log if required */
+	if (phb->flags & PCI_EEH_ERR_LOG_SEND)
+		send_phb_freeze_event(phb, diag_buffer);
 	phb_unlock(phb);
 
 	return rc;
diff --git a/hw/phb3.c b/hw/phb3.c
index 8af6b6164..5465b62ae 100644
--- a/hw/phb3.c
+++ b/hw/phb3.c
@@ -68,6 +68,9 @@ static bool phb3_fenced(struct phb3 *p)
 	if (nfir & PPC_BIT(16)) {
 		p->flags |= PHB3_AIB_FENCED;
 
+		/* Mark flag to send an error log */
+		p->phb.flags |= PCI_EEH_ERR_LOG_SEND;
+
 		phb3_eeh_dump_regs(p, NULL);
 		return true;
 	}
diff --git a/hw/phb4.c b/hw/phb4.c
index 79bfdbf9a..cd50361fc 100644
--- a/hw/phb4.c
+++ b/hw/phb4.c
@@ -2554,6 +2554,9 @@ static bool phb4_fenced(struct phb4 *p)
 	/* Mark ourselves fenced */
 	p->flags |= PHB4_AIB_FENCED;
 
+	/* Mark flag to send an error log */
+	p->phb.flags |= PCI_EEH_ERR_LOG_SEND;
+
 	PHBERR(p, "PHB Freeze/Fence detected !\n");
 	phb4_dump_pec_err_regs(p);
 
@@ -3448,6 +3451,7 @@ static int64_t phb4_creset(struct pci_slot *slot)
 		p->flags &= ~PHB4_AIB_FENCED;
 		p->flags &= ~PHB4_CAPP_RECOVERY;
 		p->flags &= ~PHB4_CFG_USE_ASB;
+
 		phb4_init_hw(p);
 		pci_slot_set_state(slot, PHB4_SLOT_CRESET_FRESET);
 
diff --git a/include/errorlog.h b/include/errorlog.h
index a9c3250e8..9bd200903 100644
--- a/include/errorlog.h
+++ b/include/errorlog.h
@@ -287,6 +287,7 @@ enum opal_reasoncode {
 	OPAL_RC_PCI_ADD_SLOT	    = OPAL_SRC_COMPONENT_PCI | 0x11,
 	OPAL_RC_PCI_SCAN	    = OPAL_SRC_COMPONENT_PCI | 0x12,
 	OPAL_RC_PCI_RESET_PHB	    = OPAL_SRC_COMPONENT_PCI | 0x10,
+	OPAL_RC_PCI_PHB_FREEZE	    = OPAL_SRC_COMPONENT_PCI | 0x13,
 /* ATTN */
 	OPAL_RC_ATTN		    = OPAL_SRC_COMPONENT_ATTN | 0x10,
 /* MEM_ERR */
@@ -341,6 +342,7 @@ enum opal_reasoncode {
 };
 
 #define OPAL_ELOG_SEC_DESC	0x44455343
+#define OPAL_ELOG_SEC_DIAG	0x44494147	/* For EEH diag data */
 
 #define DEFINE_LOG_ENTRY(reason, type, id, subsys,			\
 severity, subtype) static struct opal_err_info err_##reason =		\
diff --git a/include/pci.h b/include/pci.h
index 0b7a1f8a6..b2a9af3e8 100644
--- a/include/pci.h
+++ b/include/pci.h
@@ -384,6 +384,9 @@ struct phb {
 
 	/* Additional data the platform might need to attach */
 	void			*platform_data;
+
+	uint32_t		flags;
+#define PCI_EEH_ERR_LOG_SEND	0x1
 };
 
 static inline void phb_lock(struct phb *phb)




More information about the Skiboot mailing list