[Skiboot] [PATCH 06/10] fsi-master: Fix error handling

Benjamin Herrenschmidt benh at kernel.crashing.org
Tue Jun 23 14:25:56 AEST 2015


Don't consider hMFSI bits when using cMFSI or vice-versa, properly
reset the PIB2OPB bridge etc..

Signed-off-by: Benjamin Herrenschmidt <benh at kernel.crashing.org>
---
 hw/fsi-master.c | 128 +++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 75 insertions(+), 53 deletions(-)

diff --git a/hw/fsi-master.c b/hw/fsi-master.c
index 9f644aa..b8e5d6b 100644
--- a/hw/fsi-master.c
+++ b/hw/fsi-master.c
@@ -41,15 +41,18 @@
 #define   OPB_CMD_16BIT		0x20000000
 #define   OPB_CMD_32BIT		0x60000000
 #define PIB2OPB_REG_STAT	0x1
+#define   OPB_STAT_ANY_ERR	0x80000000
+#define   OPB_STAT_ERR_OPB      0x7FEC0000
+#define   OPB_STAT_ERRACK       0x00100000
 #define   OPB_STAT_BUSY		0x00010000
 #define   OPB_STAT_READ_VALID   0x00020000
-#define   OPB_STAT_ERR_OPB      0x09F00000
 #define   OPB_STAT_ERR_CMFSI    0x0000FC00
-#define   OPB_STAT_ERR_MFSI     0x000000FC
-#define   OPB_STAT_ERR_ANY      (OPB_STAT_ERR_OPB | \
-				 OPB_STAT_ERR_CMFSI | \
-				 OPB_STAT_ERR_MFSI)
+#define   OPB_STAT_ERR_HMFSI    0x000000FC
+#define   OPB_STAT_ERR_BASE	(OPB_STAT_ANY_ERR | \
+				 OPB_STAT_ERR_OPB | \
+				 OPB_STAT_ERRACK)
 #define PIB2OPB_REG_LSTAT	0x2
+#define PIB2OPB_REG_RESET	0x4
 
 /*
  * PIB2OPB 0 has 2 MFSIs, cMFSI and hMFSI, PIB2OPB 1 only
@@ -70,7 +73,7 @@
  * global lock
  */
 static struct lock fsi_lock = LOCK_UNLOCKED;
-static uint32_t mfsi_valid_err = OPB_STAT_ERR_ANY;
+static uint32_t mfsi_valid_err;
 
 /*
  * OPB accessors
@@ -78,37 +81,47 @@ static uint32_t mfsi_valid_err = OPB_STAT_ERR_ANY;
 
 #define MFSI_OPB_MAX_TRIES	120
 
-static int64_t mfsi_handle_opb_error(uint32_t chip, uint32_t xscom_base,
-				     uint32_t stat)
+static int64_t mfsi_pib2opb_reset(uint32_t chip, uint32_t xscom_base)
 {
+	uint64_t stat;
 	int64_t rc;
 
-	prerror("MFSI: Error status=0x%08x !\n", stat);
+	rc = xscom_write(chip, xscom_base + PIB2OPB_REG_RESET, (1ul << 63));
+	if (rc) {
+		prerror("MFSI: XSCOM error %lld resetting PIB2OPB\n", rc);
+		return rc;
+	}
+	rc = xscom_write(chip, xscom_base + PIB2OPB_REG_STAT, (1ul << 63));
+	if (rc) {
+		prerror("MFSI: XSCOM error %lld resetting status\n", rc);
+		return rc;
+	}
+	rc = xscom_read(chip, xscom_base + PIB2OPB_REG_STAT, &stat);
+	if (rc) {
+		prerror("MFSI: XSCOM error %lld reading status\n", rc);
+		return rc;
+	}
+	return 0;
+}
 
-	/* XXX Dump a bunch of data, create an error log ... */
 
-	/* Clean error */
-	rc = xscom_write(chip, xscom_base + PIB2OPB_REG_STAT, 0);
-	if (rc)
-		prerror("MFSI: XSCOM error %lld clearing status\n", rc);
-
-	/*
-	 * XXX HB resets the ports here, but that's broken as it will
-	 * re-enter the opb accessors ... the HW is a mess here, it mixes
-	 * the OPB stuff with the FSI stuff in horrible ways.
-	 * If we want to reset the port and generally handle FSI specific
-	 * errors we should do that at the upper level and leave only the
-	 * OPB error handling here.
-	 *
-	 * We probably need to return "stat" to the callers too for that
-	 * to work
+static int64_t mfsi_handle_opb_error(uint32_t chip, uint32_t xscom_base,
+				     uint32_t stat, uint32_t err_bits)
+{
+	prerror("MFSI: Error status=0x%08x (raw=0x%08x) !\n",
+		stat & err_bits, stat);
+
+	/* For now, just reset the PIB2OPB on error. We should collect more
+	 * info and look at the remote errors in the target as well but that
+	 * will be for another day.
 	 */
+	mfsi_pib2opb_reset(chip, xscom_base);
 	
 	return OPAL_HARDWARE;
 }
 
 static int64_t mfsi_opb_poll(uint32_t chip, uint32_t xscom_base,
-			     uint32_t *read_data)
+			     uint32_t *read_data, uint32_t err_bits)
 {
 	unsigned long retries = MFSI_OPB_MAX_TRIES;
 	uint64_t sval;
@@ -131,9 +144,6 @@ static int64_t mfsi_opb_poll(uint32_t chip, uint32_t xscom_base,
 		/* Complete */
 		if (!(stat & OPB_STAT_BUSY))
 			break;
-		/* Error */
-		if (stat & mfsi_valid_err)
-			break;
 		if (retries-- == 0) {
 			/* XXX What should we do here ? reset it ? */
 			prerror("MFSI: OPB POLL timeout !\n");
@@ -143,8 +153,8 @@ static int64_t mfsi_opb_poll(uint32_t chip, uint32_t xscom_base,
 	}
 
 	/* Did we have an error ? */
-	if (stat & mfsi_valid_err)
-		return mfsi_handle_opb_error(chip, xscom_base, stat);
+	if (stat & err_bits)
+		 return mfsi_handle_opb_error(chip, xscom_base, stat, err_bits);
 
 	if (read_data) {
 		if (!(stat & OPB_STAT_READ_VALID)) {
@@ -159,7 +169,8 @@ static int64_t mfsi_opb_poll(uint32_t chip, uint32_t xscom_base,
 }
 
 static int64_t mfsi_opb_read(uint32_t chip, uint32_t xscom_base,
-			     uint32_t addr, uint32_t *data)
+			     uint32_t addr, uint32_t *data,
+			     uint32_t err_bits)
 {
 	uint64_t opb_cmd = OPB_CMD_READ | OPB_CMD_32BIT;
 	int64_t rc;
@@ -178,11 +189,12 @@ static int64_t mfsi_opb_read(uint32_t chip, uint32_t xscom_base,
 		prerror("MFSI: XSCOM error %lld writing OPB CMD\n", rc);
 		return rc;
 	}
-	return mfsi_opb_poll(chip, xscom_base, data);
+	return mfsi_opb_poll(chip, xscom_base, data, err_bits);
 }
 
 static int64_t mfsi_opb_write(uint32_t chip, uint32_t xscom_base,
-			      uint32_t addr, uint32_t data)
+			      uint32_t addr, uint32_t data,
+			      uint32_t err_bits)
 {
 	uint64_t opb_cmd = OPB_CMD_WRITE | OPB_CMD_32BIT;
 	int64_t rc;
@@ -202,12 +214,12 @@ static int64_t mfsi_opb_write(uint32_t chip, uint32_t xscom_base,
 		prerror("MFSI: XSCOM error %lld writing OPB CMD\n", rc);
 		return rc;
 	}
-	return mfsi_opb_poll(chip, xscom_base, NULL);
+	return mfsi_opb_poll(chip, xscom_base, NULL, err_bits);
 }
 
 static int64_t mfsi_get_addrs(uint32_t mfsi, uint32_t port,
 			      uint32_t *xscom_base, uint32_t *port_base,
-			      uint32_t *reg_base)
+			      uint32_t *reg_base, uint32_t *err_bits)
 {
 	if (port > 7)
 		return OPAL_PARAMETER;
@@ -218,20 +230,25 @@ static int64_t mfsi_get_addrs(uint32_t mfsi, uint32_t port,
 		*xscom_base = PIB2OPB_MFSI0_ADDR;
 		*port_base = cMFSI_OPB_PORT_BASE + port * MFSI_OPB_PORT_STRIDE;
 		*reg_base = cMFSI_OPB_REG_BASE;
+		*err_bits = OPB_STAT_ERR_BASE | OPB_STAT_ERR_CMFSI;
 		break;
 	case MFSI_cMFSI1:
 		*xscom_base = PIB2OPB_MFSI1_ADDR;
 		*port_base = cMFSI_OPB_PORT_BASE + port * MFSI_OPB_PORT_STRIDE;
 		*reg_base = cMFSI_OPB_REG_BASE;
+		*err_bits = OPB_STAT_ERR_BASE | OPB_STAT_ERR_CMFSI;
 		break;
 	case MFSI_hMFSI0:
 		*xscom_base = PIB2OPB_MFSI0_ADDR;
 		*port_base = hMFSI_OPB_PORT_BASE + port * MFSI_OPB_PORT_STRIDE;
 		*reg_base = hMFSI_OPB_REG_BASE;
+		*err_bits = OPB_STAT_ERR_BASE | OPB_STAT_ERR_HMFSI;
 		break;
 	default:
 		return OPAL_PARAMETER;
 	}
+	*err_bits = *err_bits & mfsi_valid_err;
+
 	return OPAL_SUCCESS;
 }
 
@@ -239,13 +256,13 @@ int64_t mfsi_read(uint32_t chip, uint32_t mfsi, uint32_t port,
 		  uint32_t fsi_addr, uint32_t *data)
 {
 	int64_t rc;
-	uint32_t xscom, port_addr, reg;
+	uint32_t xscom, port_addr, reg, err_bits;
 
-	rc = mfsi_get_addrs(mfsi, port, &xscom, &port_addr, &reg);
+	rc = mfsi_get_addrs(mfsi, port, &xscom, &port_addr, &reg, &err_bits);
 	if (rc)
 		return rc;
 	lock(&fsi_lock);
-	rc = mfsi_opb_read(chip, xscom, port_addr + fsi_addr, data);
+	rc = mfsi_opb_read(chip, xscom, port_addr + fsi_addr, data, err_bits);
 	/* XXX Handle FSI level errors here, maybe reset port */
 	unlock(&fsi_lock);
 
@@ -256,13 +273,13 @@ int64_t mfsi_write(uint32_t chip, uint32_t mfsi, uint32_t port,
 		   uint32_t fsi_addr, uint32_t data)
 {
 	int64_t rc;
-	uint32_t xscom, port_addr, reg;
+	uint32_t xscom, port_addr, reg, err_bits;
 
-	rc = mfsi_get_addrs(mfsi, port, &xscom, &port_addr, &reg);
+	rc = mfsi_get_addrs(mfsi, port, &xscom, &port_addr, &reg, &err_bits);
 	if (rc)
 		return rc;
 	lock(&fsi_lock);
-	rc = mfsi_opb_write(chip, xscom, port_addr + fsi_addr, data);
+	rc = mfsi_opb_write(chip, xscom, port_addr + fsi_addr, data, err_bits);
 	/* XXX Handle FSI level errors here, maybe reset port */
 	unlock(&fsi_lock);
 
@@ -278,16 +295,21 @@ void mfsi_init(void)
 	 */
 	chip = next_chip(NULL);
 	assert(chip);
-	if (chip->type == PROC_CHIP_P8_MURANO) {
-		/* Hardware Bug HW222712 on Murano DD1.0 causes the
-		 * any_error bit to be un-clearable so we just
-		 * have to ignore it
- 		 */
-		if (chip->ec_level < 0x20) {
-			/* 16: cMFSI any-master-error */
-			/* 24: hMFSI any-master-error */
-			mfsi_valid_err &= 0xFFFF7F7F;
- 		}
-	}
+	mfsi_valid_err = OPB_STAT_ERR_BASE | OPB_STAT_ERR_CMFSI | OPB_STAT_ERR_HMFSI;
+
+
+	/* Hardware Bug HW222712 on Murano DD1.0 causes the
+	 * any_error bit to be un-clearable so we just
+	 * have to ignore it. Additionally, HostBoot applies
+	 * this to Venice too, though the comment there claims
+	 * this is a Simics workaround.
+	 *
+	 * The doc says that bit can be safely ignored, so let's
+	 * just not bother and always take it out.
+	 */
+
+	/* 16: cMFSI any-master-error */
+	/* 24: hMFSI any-master-error */
+	mfsi_valid_err &= 0xFFFF7F7F;
 }
 
-- 
2.1.4



More information about the Skiboot mailing list