[Skiboot] [PATCH v2 08/10] opal/eeh: Add PHB diag data in error log for PE errors.

Mahesh Salgaonkar mahesh at linux.ibm.com
Wed Oct 7 23:09:41 AEDT 2020


Gather the PHB status (diag data) for the corresponding frozen PE and add it
to the error log. Since we query the PHB status during the
freeze_status/next_error OPAL calls, we can't use get_diag_data2(), which
clears the errors. Instead, introduce a new phb_ops callback that collects
only the PHB status needed for the error log. This ensures Linux still gets
proper diag data when it calls get_diag_data2().

Signed-off-by: Mahesh Salgaonkar <mahesh at linux.ibm.com>
---
 core/pci-opal.c |   21 +++++++++++++++++++--
 hw/phb3.c       |   25 +++++++++++++++++++++----
 hw/phb4.c       |   24 ++++++++++++++++++++----
 include/pci.h   |    2 ++
 4 files changed, 62 insertions(+), 10 deletions(-)

diff --git a/core/pci-opal.c b/core/pci-opal.c
index 333682a0b..34de05ddf 100644
--- a/core/pci-opal.c
+++ b/core/pci-opal.c
@@ -127,6 +127,9 @@ static void send_phb_freeze_event(struct phb *phb, void *diag_buffer)
 static void send_phb_pe_freeze_event(struct phb *phb, uint64_t pe_number)
 {
 	struct errorlog *buf;
+	void *diag_buffer;
+	uint32_t len;
+	int rc;
 
 	buf = opal_elog_create(&e_info(OPAL_RC_PCI_PHB_FREEZE), 0);
 	if (!buf) {
@@ -137,11 +140,25 @@ static void send_phb_pe_freeze_event(struct phb *phb, uint64_t pe_number)
 	log_append_msg(buf, "PHB#%x PE#%lld Freeze/Fence detected!\n",
 					phb->opal_id, pe_number);
 
-	/* TODO: Get PHB status data */
+	/* Get PHB status data */
+	len = dt_prop_get_u32(phb->dt_node, "ibm,phb-diag-data-size");
+	diag_buffer = zalloc(len);
+	if (diag_buffer) {
+		rc = phb->ops->get_phb_status(phb, diag_buffer, len);
+		if (rc != OPAL_SUCCESS) {
+			prerror("Failed to gather phb diag data\n");
+			free(diag_buffer);
+			diag_buffer = NULL;
+		}
+	} else
+		prerror("Failed to allocate size for phb diag data\n");
+
 	/* TODO: Add location info of slot with forzen PEs */
 
-	send_eeh_serviceable_event(phb, buf, NULL);
+	send_eeh_serviceable_event(phb, buf, diag_buffer);
 	bitmap_set_bit(*phb->pe_freeze_reported, pe_number);
+	if (diag_buffer)
+		free(diag_buffer);
 }
 
 static int64_t opal_pci_config_read_half_word_be(uint64_t phb_id,
diff --git a/hw/phb3.c b/hw/phb3.c
index 5465b62ae..1bd753bae 100644
--- a/hw/phb3.c
+++ b/hw/phb3.c
@@ -3457,13 +3457,13 @@ static int64_t phb3_err_inject(struct phb *phb, uint64_t pe_number,
 	return handler(p, pe_number, addr, mask, is_write);
 }
 
-static int64_t phb3_get_diag_data(struct phb *phb,
+/* Get phb diag data only. Do not clear the phb pending errors. */
+static int64_t phb3_get_phb_status(struct phb *phb,
 				  void *diag_buffer,
 				  uint64_t diag_buffer_len)
 {
 	struct phb3 *p = phb_to_phb3(phb);
 	struct OpalIoPhb3ErrorData *data = diag_buffer;
-	bool fenced;
 
 	if (diag_buffer_len < sizeof(struct OpalIoPhb3ErrorData))
 		return OPAL_PARAMETER;
@@ -3474,10 +3474,26 @@ static int64_t phb3_get_diag_data(struct phb *phb,
 	 * Dummy check for fence so that phb3_read_phb_status knows
 	 * whether to use ASB or AIB
 	 */
-	fenced = phb3_fenced(p);
+	phb3_fenced(p);
 	phb3_read_phb_status(p, data);
 
-	if (!fenced)
+	return OPAL_SUCCESS;
+}
+
+/* Get phb diag data and clear the phb pending errors. */
+static int64_t phb3_get_diag_data(struct phb *phb,
+				  void *diag_buffer,
+				  uint64_t diag_buffer_len)
+{
+	int64_t rc;
+	struct phb3 *p = phb_to_phb3(phb);
+	struct OpalIoPhb3ErrorData *data = diag_buffer;
+
+	rc = phb3_get_phb_status(phb, diag_buffer, diag_buffer_len);
+	if (rc != OPAL_SUCCESS)	/* bail out only when the status read failed */
+		return rc;
+
+	if (!(p->flags & PHB3_AIB_FENCED))
 		phb3_eeh_dump_regs(p, data);
 
 	/*
@@ -3873,6 +3889,7 @@ static const struct phb_ops phb3_ops = {
 	.next_error		= phb3_eeh_next_error,
 	.err_inject		= phb3_err_inject,
 	.get_diag_data2		= phb3_get_diag_data,
+	.get_phb_status		= phb3_get_phb_status,
 	.set_capi_mode		= phb3_set_capi_mode,
 	.set_capp_recovery	= phb3_set_capp_recovery,
 };
diff --git a/hw/phb4.c b/hw/phb4.c
index cd50361fc..a088a640e 100644
--- a/hw/phb4.c
+++ b/hw/phb4.c
@@ -4091,11 +4091,11 @@ static int64_t phb4_err_inject(struct phb *phb, uint64_t pe_number,
 	return handler(p, pe_number, addr, mask, is_write);
 }
 
-static int64_t phb4_get_diag_data(struct phb *phb,
+/* Get phb diag data only. Do not clear the phb pending errors. */
+static int64_t phb4_get_phb_status(struct phb *phb,
 				  void *diag_buffer,
 				  uint64_t diag_buffer_len)
 {
-	bool fenced;
 	struct phb4 *p = phb_to_phb4(phb);
 	struct OpalIoPhb4ErrorData *data = diag_buffer;
 
@@ -4108,10 +4108,25 @@ static int64_t phb4_get_diag_data(struct phb *phb,
 	 * Dummy check for fence so that phb4_read_phb_status knows
 	 * whether to use ASB or AIB
 	 */
-	fenced = phb4_fenced(p);
+	phb4_fenced(p);
 	phb4_read_phb_status(p, data);
 
-	if (!fenced)
+	return OPAL_SUCCESS;
+}
+
+/* Get phb diag data and clear the phb pending errors. */
+static int64_t phb4_get_diag_data(struct phb *phb,
+				  void *diag_buffer,
+				  uint64_t diag_buffer_len)
+{
+	int64_t rc;
+	struct phb4 *p = phb_to_phb4(phb);
+
+	rc = phb4_get_phb_status(phb, diag_buffer, diag_buffer_len);
+	if (rc != OPAL_SUCCESS)	/* bail out only when the status read failed */
+		return rc;
+
+	if (!(p->flags & PHB4_AIB_FENCED))
 		phb4_eeh_dump_regs(p);
 
 	/*
@@ -4938,6 +4953,7 @@ static const struct phb_ops phb4_ops = {
 	.next_error		= phb4_eeh_next_error,
 	.err_inject		= phb4_err_inject,
 	.get_diag_data2		= phb4_get_diag_data,
+	.get_phb_status		= phb4_get_phb_status,
 	.tce_kill		= phb4_tce_kill,
 	.set_capi_mode		= phb4_set_capi_mode,
 	.set_p2p		= phb4_set_p2p,
diff --git a/include/pci.h b/include/pci.h
index d8f712e72..8da57a954 100644
--- a/include/pci.h
+++ b/include/pci.h
@@ -254,6 +254,8 @@ struct phb_ops {
 			      uint64_t mask);
 	int64_t (*get_diag_data2)(struct phb *phb, void *diag_buffer,
 				  uint64_t diag_buffer_len);
+	int64_t (*get_phb_status)(struct phb *phb, void *diag_buffer,
+				  uint64_t diag_buffer_len);
 	int64_t (*next_error)(struct phb *phb, uint64_t *first_frozen_pe,
 			      uint16_t *pci_error_type, uint16_t *severity);
 




More information about the Skiboot mailing list