[Skiboot] [PATCH] FSP: Notify FSP of Platform Log ID after Host Initiated Reset Reload

Stewart Smith stewart at linux.vnet.ibm.com
Fri May 5 15:55:28 AEST 2017


Triggering a Host Initiated Reset (when the host detects the FSP has gone
out to lunch and should be rebooted) would cause "Unknown Command" messages
to appear in the OPAL log.

This patch implements handling for the FSP command behind those messages.

How to trigger an FSP R/R (HIR):

$ putmemproc 300000f8 0x00000000deadbeef
s1	k0:n0:s0:p00
ecmd_ppc putmemproc 300000f8 0x00000000deadbeef

Log showing unknown command:
/ # cat /sys/firmware/opal/msglog | grep -i ,3
[  110.232114723,3] FSP: fsp_trigger_reset() entry
[  188.431793837,3] FSP #0: Link down, starting R&R
[  464.109239162,3] FSP #0: Got XUP with no pending message !
[  466.340598554,3] FSP-DPO: Unknown command 0xce0900
[  466.340600126,3] FSP: Unhandled message ce0900

The message we need to handle is "Get PLID after host initiated FipS
reset/reload". When the FSP comes back from HIR, it asks "hey, so, which
error log explains why you rebooted me?". So, we tell it.

Reported-by: Pridhiviraj Paidipeddi <ppaidipe at linux.vnet.ibm.com>
Signed-off-by: Stewart Smith <stewart at linux.vnet.ibm.com>
---
 core/errorlog.c           | 13 ++++++----
 core/opal.c               | 12 +++++++++-
 hw/fsp/fsp-surveillance.c | 11 ++++-----
 hw/fsp/fsp.c              | 61 +++++++++++++++++++++++++++++++++++------------
 include/errorlog.h        | 13 ++++++----
 include/fsp.h             |  6 ++++-
 6 files changed, 83 insertions(+), 33 deletions(-)

diff --git a/core/errorlog.c b/core/errorlog.c
index 179e09f42f00..522dfcc2ee7c 100644
--- a/core/errorlog.c
+++ b/core/errorlog.c
@@ -1,4 +1,4 @@
-/* Copyright 2013-2016 IBM Corp.
+/* Copyright 2013-2017 IBM Corp.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -196,7 +196,7 @@ void log_append_msg(struct errorlog *buf, const char *fmt, ...)
 	log_append_data(buf, err_msg, strlen(err_msg));
 }
 
-void log_simple_error(struct opal_err_info *e_info, const char *fmt, ...)
+uint32_t log_simple_error(struct opal_err_info *e_info, const char *fmt, ...)
 {
 	struct errorlog *buf;
 	va_list list;
@@ -212,10 +212,13 @@ void log_simple_error(struct opal_err_info *e_info, const char *fmt, ...)
 	buf = opal_elog_create(e_info, 0);
 	if (buf == NULL) {
 		prerror("ELOG: Error getting buffer to log error\n");
-	} else {
-		log_append_data(buf, err_msg, strlen(err_msg));
-		log_commit(buf);
+		return -1;
 	}
+
+	log_append_data(buf, err_msg, strlen(err_msg));
+	log_commit(buf);
+
+	return buf->plid;
 }
 
 int elog_init(void)
diff --git a/core/opal.c b/core/opal.c
index 73223b1fe200..14357cc35f45 100644
--- a/core/opal.c
+++ b/core/opal.c
@@ -30,6 +30,7 @@
 #include <opal-msg.h>
 #include <timer.h>
 #include <elf-abi.h>
+#include <errorlog.h>
 
 /* Pending events to signal via opal_poll_events */
 uint64_t opal_pending_events;
@@ -51,6 +52,13 @@ static uint64_t opal_dynamic_events;
 extern uint32_t attn_trigger;
 extern uint32_t hir_trigger;
 
+/* We make this look like a Surveillance error, even though it really
+ * isn't one.
+ */
+DEFINE_LOG_ENTRY(OPAL_INJECTED_HIR, OPAL_MISC_ERR_EVT, OPAL_SURVEILLANCE,
+		OPAL_SURVEILLANCE_ERR, OPAL_PREDICTIVE_ERR_GENERAL,
+		OPAL_MISCELLANEOUS_INFO_ONLY);
+
 void opal_table_init(void)
 {
 	struct opal_table_entry *s = __opal_table_start;
@@ -408,7 +416,9 @@ static int64_t opal_poll_events(__be64 *outstanding_event_mask)
 
 	/* Test the host initiated reset */
 	if (hir_trigger == 0xdeadbeef) {
-		fsp_trigger_reset();
+		uint32_t plid = log_simple_error(&e_info(OPAL_INJECTED_HIR),
+			"SURV: Injected HIR, initiating FSP R/R\n");
+		fsp_trigger_reset(plid);
 		hir_trigger = 0;
 	}
 
diff --git a/hw/fsp/fsp-surveillance.c b/hw/fsp/fsp-surveillance.c
index d3e5c450c3b7..202b0932ab9f 100644
--- a/hw/fsp/fsp-surveillance.c
+++ b/hw/fsp/fsp-surveillance.c
@@ -82,15 +82,12 @@ static void fsp_surv_check_timeout(void)
 	 * just go ahead and check timeouts.
 	 */
 	if (tb_compare(now, surv_ack_timer) == TB_AAFTERB) {
-		/* XXX: We should be logging a PEL to the host, assuming
-		 * the FSP is dead, pending a R/R.
-		 */
-		log_simple_error(&e_info(OPAL_RC_SURVE_ACK),
+		uint32_t plid = log_simple_error(&e_info(OPAL_RC_SURVE_ACK),
 			"SURV: Surv ACK timed out; initiating R/R\n");
 
 		/* Reset the pending trigger too */
 		fsp_surv_ack_pending = false;
-		fsp_trigger_reset();
+		fsp_trigger_reset(plid);
 	}
 
 	return;
@@ -149,10 +146,10 @@ static void fsp_surv_got_param(uint32_t param_id __unused, int err_len,
 			       void *data __unused)
 {
 	if (err_len != 4) {
-		log_simple_error(&e_info(OPAL_RC_SURVE_STATUS),
+		uint32_t plid = log_simple_error(&e_info(OPAL_RC_SURVE_STATUS),
 		"SURV: Error (%d) retrieving surv status; initiating R/R\n",
 			err_len);
-		fsp_trigger_reset();
+		fsp_trigger_reset(plid);
 		return;
 	}
 
diff --git a/hw/fsp/fsp.c b/hw/fsp/fsp.c
index a0c5a788d543..162d9b4ff878 100644
--- a/hw/fsp/fsp.c
+++ b/hw/fsp/fsp.c
@@ -40,7 +40,13 @@
 #include <ccan/list/list.h>
 
 DEFINE_LOG_ENTRY(OPAL_RC_FSP_POLL_TIMEOUT, OPAL_PLATFORM_ERR_EVT, OPAL_FSP,
-		 OPAL_PLATFORM_FIRMWARE, OPAL_ERROR_PANIC, OPAL_NA);
+		 OPAL_PLATFORM_FIRMWARE, OPAL_RECOVERED_ERR_GENERAL, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_FSP_MBOX_ERR, OPAL_PLATFORM_ERR_EVT, OPAL_FSP,
+		 OPAL_PLATFORM_FIRMWARE, OPAL_RECOVERED_ERR_GENERAL, OPAL_NA);
+
+DEFINE_LOG_ENTRY(OPAL_RC_FSP_DISR_HIR_MASK, OPAL_PLATFORM_ERR_EVT, OPAL_FSP,
+		 OPAL_PLATFORM_FIRMWARE, OPAL_RECOVERED_ERR_GENERAL, OPAL_NA);
 
 #define FSP_TRACE_MSG
 #define FSP_TRACE_EVENT
@@ -545,9 +551,12 @@ static void __fsp_trigger_reset(void)
 		fsp_prep_for_reset(fsp);
 }
 
-void fsp_trigger_reset(void)
+static uint32_t fsp_hir_reason_plid;
+
+void fsp_trigger_reset(uint32_t plid)
 {
 	lock(&fsp_lock);
+	fsp_hir_reason_plid = plid;
 	__fsp_trigger_reset();
 	unlock(&fsp_lock);
 }
@@ -683,9 +692,11 @@ static void fsp_handle_errors(struct fsp *fsp)
 	 * quite rare.
 	 */
 	if (fsp->state == fsp_mbx_err) {
-		prerror("FSP #%d: Triggering HIR on mbx_err\n",
-				fsp->index);
-		fsp_trigger_reset();
+		uint32_t plid;
+		plid = log_simple_error(&e_info(OPAL_RC_FSP_MBOX_ERR),
+					"FSP #%d: Triggering HIR on mbx_err\n",
+					fsp->index);
+		fsp_trigger_reset(plid);
 		return;
 	}
 
@@ -736,16 +747,20 @@ static void fsp_handle_errors(struct fsp *fsp)
 	 * to trigger a HIR so it can try to recover via the DRCR route.
 	 */
 	if (disr & FSP_DISR_HIR_TRIGGER_MASK) {
+		const char *reason = "Unknown FSP_DISR_HIR_TRIGGER";
+		uint32_t plid;
 		fsp_trace_event(fsp, TRACE_FSP_EVT_SOFT_RR, disr, 0, 0, 0);
 
 		if (disr & FSP_DISR_FSP_UNIT_CHECK)
-			prlog(PR_DEBUG, "FSP: DISR Unit Check set\n");
+			reason = "DISR Unit Check set";
 		else if (disr & FSP_DISR_FSP_RUNTIME_TERM)
-			prlog(PR_DEBUG, "FSP: DISR Runtime Terminate set\n");
+			reason = "DISR Runtime Terminate set";
 		else if (disr & FSP_DISR_FSP_FLASH_TERM)
-			prlog(PR_DEBUG, "FSP: DISR Flash Terminate set\n");
-		prlog(PR_NOTICE, "FSP: Triggering host initiated reset"
-		      " sequence\n");
+			reason = "DISR Flash Terminate set";
+
+		plid = log_simple_error(&e_info(OPAL_RC_FSP_DISR_HIR_MASK),
+					"FSP: %s. Triggering host initiated "
+					"reset.", reason);
 
 		/* Clear all interrupt conditions */
 		fsp_wreg(fsp, FSP_HDIR_REG, FSP_DBIRQ_ALL);
@@ -753,7 +768,7 @@ static void fsp_handle_errors(struct fsp *fsp)
 		/* Make sure this happened */
 		fsp_rreg(fsp, FSP_HDIR_REG);
 
-		fsp_trigger_reset();
+		fsp_trigger_reset(plid);
 		return;
 	}
 
@@ -1318,6 +1333,21 @@ static bool fsp_local_command(u32 cmd_sub_mod, struct fsp_msg *msg)
 			}
 		}
 		return true;
+	case FSP_CMD_GET_HIR_PLID:
+		/* Get Platform Log Id with reason for Host Initiated Reset */
+		prlog(PR_DEBUG, "FSP: Sending PLID 0x%x as HIR reason\n",
+		      fsp_hir_reason_plid);
+		resp = fsp_mkmsg(FSP_RSP_GET_HIR_PLID, 1, fsp_hir_reason_plid);
+		if (!resp)
+			prerror("FSP: Failed to allocate GET_HIR_PLID response\n");
+		else {
+			if (fsp_queue_msg(resp, fsp_freemsg)) {
+				fsp_freemsg(resp);
+				prerror("FSP: Failed to queue GET_HIR_PLID resp\n");
+			}
+		}
+		fsp_hir_reason_plid = 0;
+		return true;
 	}
 	return false;
 }
@@ -1340,7 +1370,7 @@ static void fsp_handle_command(struct fsp_msg *msg)
 	cmd_sub_mod =  (msg->word0 & 0xff) << 16;
 	cmd_sub_mod |= (msg->word1 & 0xff) << 8;
 	cmd_sub_mod |= (msg->word1 >> 8) & 0xff;
-	
+
 	/* Some commands are handled locally */
 	if (fsp_local_command(cmd_sub_mod, msg))
 		goto free;
@@ -2148,9 +2178,10 @@ static void fsp_timeout_poll(void *data __unused)
 			fsp_complete_msg(req);
 			__fsp_trigger_reset();
 			unlock(&fsp_lock);
-			log_simple_error(&e_info(OPAL_RC_FSP_POLL_TIMEOUT),
-					 "FSP: Response from FSP timed out, word0 = %x,"
-					 "word1 = %x state: %d\n", w0, w1, mstate);
+			fsp_hir_reason_plid = log_simple_error(
+				&e_info(OPAL_RC_FSP_POLL_TIMEOUT),
+				"FSP: Response from FSP timed out, word0 = %x,"
+				"word1 = %x state: %d\n", w0, w1, mstate);
 		}
 	next_bit:
 		cmdclass_resp_bitmask = cmdclass_resp_bitmask >> 1;
diff --git a/include/errorlog.h b/include/errorlog.h
index 247198b3a8df..e9d5ad86f3d6 100644
--- a/include/errorlog.h
+++ b/include/errorlog.h
@@ -259,6 +259,7 @@ enum opal_reasoncode {
 	OPAL_RC_SURVE_INIT	    = OPAL_SRC_COMPONENT_SURVEILLANCE | 0x10,
 	OPAL_RC_SURVE_STATUS	    = OPAL_SRC_COMPONENT_SURVEILLANCE | 0x11,
 	OPAL_RC_SURVE_ACK	    = OPAL_SRC_COMPONENT_SURVEILLANCE | 0x12,
+	OPAL_INJECTED_HIR	    = OPAL_SRC_COMPONENT_SURVEILLANCE | 0x13,
 /* SYSPARAM */
 	OPAL_RC_SYSPARM_INIT	    = OPAL_SRC_COMPONENT_SYSPARAM | 0x10,
 	OPAL_RC_SYSPARM_MSG	    = OPAL_SRC_COMPONENT_SYSPARAM | 0x11,
@@ -313,8 +314,9 @@ enum opal_reasoncode {
 	OPAL_RC_SLW_GET		    = OPAL_SRC_COMPONENT_SLW | 0x12,
 	OPAL_RC_SLW_REG		    = OPAL_SRC_COMPONENT_SLW | 0x13,
 /* FSP	*/
-	OPAL_RC_FSP_POLL_TIMEOUT
-				    = OPAL_SRC_COMPONENT_FSP | 0x10,
+	OPAL_RC_FSP_POLL_TIMEOUT    = OPAL_SRC_COMPONENT_FSP | 0x10,
+	OPAL_RC_FSP_MBOX_ERR	    = OPAL_SRC_COMPONENT_FSP | 0x11,
+	OPAL_RC_FSP_DISR_HIR_MASK   = OPAL_SRC_COMPONENT_FSP | 0x12,
 /* I2C */
 	OPAL_RC_I2C_INIT	    = OPAL_SRC_COMPONENT_I2C | 0X10,
 	OPAL_RC_I2C_START_REQ	    = OPAL_SRC_COMPONENT_I2C | 0X11,
@@ -339,9 +341,12 @@ severity, subtype) static struct opal_err_info err_##reason =		\
 
 /* This is wrapper around the error log function, which creates
  * and commits the error to FSP.
- * Used for simple error logging
+ * Used for simple error logging.
+ * Returns a Log ID. If an error involves a service processor needing
+ * to be kicked, this Log ID can be sent to the service processor explaining
+ * *why* we kicked it. Log ID = -1 on error.
  */
-void log_simple_error(struct opal_err_info *e_info,
+uint32_t log_simple_error(struct opal_err_info *e_info,
 		const char *fmt, ...) __attribute__ ((format (printf, 2, 3)));
 
 #define e_info(reason_code) err_##reason_code
diff --git a/include/fsp.h b/include/fsp.h
index f75b6ad98f7c..e7f6a7be8c33 100644
--- a/include/fsp.h
+++ b/include/fsp.h
@@ -368,6 +368,10 @@
 #define FSP_CMD_DEEP_REBOOT	0x1ce4e04 /* HV->FSP: Deep IPL */
 #define FSP_CMD_INIT_DPO	0x0ce5b00 /* FSP->HV: Initialize Delayed Power Off */
 #define FSP_RSP_INIT_DPO	0x0cedb00 /* HV->FSP: Response for DPO init command */
+#define FSP_CMD_GET_HIR_PLID	0x0ce0900 /* FSP->HV: Get Platform Log ID with
+					   * reason for Host Initiated Reset.
+					   */
+#define FSP_RSP_GET_HIR_PLID	0x0ce8900 /* HV->FSP: Reply with PLID */
 #define FSP_CMD_PANELSTATUS	0x0ce5c00 /* FSP->HV */
 #define FSP_CMD_PANELSTATUS_EX1	0x0ce5c02 /* FSP->HV */
 #define FSP_CMD_PANELSTATUS_EX2	0x0ce5c03 /* FSP->HV */
@@ -808,7 +812,7 @@ extern void fsp_ipmi_init(void);
 
 /* Reset/Reload */
 extern void fsp_reinit_fsp(void);
-extern void fsp_trigger_reset(void);
+extern void fsp_trigger_reset(uint32_t plid);
 extern void fsp_reset_links(void);
 extern bool fsp_in_rr(void);
 
-- 
2.1.4



More information about the Skiboot mailing list