[Skiboot] [PATCH 06/10] fsi-master: Fix error handling
Benjamin Herrenschmidt
benh at kernel.crashing.org
Tue Jun 23 14:25:56 AEST 2015
Don't consider hMFSI bits when using cMFSI or vice-versa, properly
reset the PIB2OPB bridge etc..
Signed-off-by: Benjamin Herrenschmidt <benh at kernel.crashing.org>
---
hw/fsi-master.c | 128 +++++++++++++++++++++++++++++++++-----------------------
1 file changed, 75 insertions(+), 53 deletions(-)
diff --git a/hw/fsi-master.c b/hw/fsi-master.c
index 9f644aa..b8e5d6b 100644
--- a/hw/fsi-master.c
+++ b/hw/fsi-master.c
@@ -41,15 +41,18 @@
#define OPB_CMD_16BIT 0x20000000
#define OPB_CMD_32BIT 0x60000000
#define PIB2OPB_REG_STAT 0x1
+#define OPB_STAT_ANY_ERR 0x80000000
+#define OPB_STAT_ERR_OPB 0x7FEC0000
+#define OPB_STAT_ERRACK 0x00100000
#define OPB_STAT_BUSY 0x00010000
#define OPB_STAT_READ_VALID 0x00020000
-#define OPB_STAT_ERR_OPB 0x09F00000
#define OPB_STAT_ERR_CMFSI 0x0000FC00
-#define OPB_STAT_ERR_MFSI 0x000000FC
-#define OPB_STAT_ERR_ANY (OPB_STAT_ERR_OPB | \
- OPB_STAT_ERR_CMFSI | \
- OPB_STAT_ERR_MFSI)
+#define OPB_STAT_ERR_HMFSI 0x000000FC
+#define OPB_STAT_ERR_BASE (OPB_STAT_ANY_ERR | \
+ OPB_STAT_ERR_OPB | \
+ OPB_STAT_ERRACK)
#define PIB2OPB_REG_LSTAT 0x2
+#define PIB2OPB_REG_RESET 0x4
/*
* PIB2OPB 0 has 2 MFSIs, cMFSI and hMFSI, PIB2OPB 1 only
@@ -70,7 +73,7 @@
* global lock
*/
static struct lock fsi_lock = LOCK_UNLOCKED;
-static uint32_t mfsi_valid_err = OPB_STAT_ERR_ANY;
+static uint32_t mfsi_valid_err;
/*
* OPB accessors
@@ -78,37 +81,47 @@ static uint32_t mfsi_valid_err = OPB_STAT_ERR_ANY;
#define MFSI_OPB_MAX_TRIES 120
-static int64_t mfsi_handle_opb_error(uint32_t chip, uint32_t xscom_base,
- uint32_t stat)
+static int64_t mfsi_pib2opb_reset(uint32_t chip, uint32_t xscom_base)
{
+ uint64_t stat;
int64_t rc;
- prerror("MFSI: Error status=0x%08x !\n", stat);
+ rc = xscom_write(chip, xscom_base + PIB2OPB_REG_RESET, (1ul << 63));
+ if (rc) {
+ prerror("MFSI: XSCOM error %lld resetting PIB2OPB\n", rc);
+ return rc;
+ }
+ rc = xscom_write(chip, xscom_base + PIB2OPB_REG_STAT, (1ul << 63));
+ if (rc) {
+ prerror("MFSI: XSCOM error %lld resetting status\n", rc);
+ return rc;
+ }
+ rc = xscom_read(chip, xscom_base + PIB2OPB_REG_STAT, &stat);
+ if (rc) {
+ prerror("MFSI: XSCOM error %lld reading status\n", rc);
+ return rc;
+ }
+ return 0;
+}
- /* XXX Dump a bunch of data, create an error log ... */
- /* Clean error */
- rc = xscom_write(chip, xscom_base + PIB2OPB_REG_STAT, 0);
- if (rc)
- prerror("MFSI: XSCOM error %lld clearing status\n", rc);
-
- /*
- * XXX HB resets the ports here, but that's broken as it will
- * re-enter the opb accessors ... the HW is a mess here, it mixes
- * the OPB stuff with the FSI stuff in horrible ways.
- * If we want to reset the port and generally handle FSI specific
- * errors we should do that at the upper level and leave only the
- * OPB error handling here.
- *
- * We probably need to return "stat" to the callers too for that
- * to work
+static int64_t mfsi_handle_opb_error(uint32_t chip, uint32_t xscom_base,
+ uint32_t stat, uint32_t err_bits)
+{
+ prerror("MFSI: Error status=0x%08x (raw=0x%08x) !\n",
+ stat & err_bits, stat);
+
+ /* For now, just reset the PIB2OPB on error. We should collect more
+ * info and look at the remote errors in the target as well but that
+ * will be for another day.
*/
+ mfsi_pib2opb_reset(chip, xscom_base);
return OPAL_HARDWARE;
}
static int64_t mfsi_opb_poll(uint32_t chip, uint32_t xscom_base,
- uint32_t *read_data)
+ uint32_t *read_data, uint32_t err_bits)
{
unsigned long retries = MFSI_OPB_MAX_TRIES;
uint64_t sval;
@@ -131,9 +144,6 @@ static int64_t mfsi_opb_poll(uint32_t chip, uint32_t xscom_base,
/* Complete */
if (!(stat & OPB_STAT_BUSY))
break;
- /* Error */
- if (stat & mfsi_valid_err)
- break;
if (retries-- == 0) {
/* XXX What should we do here ? reset it ? */
prerror("MFSI: OPB POLL timeout !\n");
@@ -143,8 +153,8 @@ static int64_t mfsi_opb_poll(uint32_t chip, uint32_t xscom_base,
}
/* Did we have an error ? */
- if (stat & mfsi_valid_err)
- return mfsi_handle_opb_error(chip, xscom_base, stat);
+ if (stat & err_bits)
+ return mfsi_handle_opb_error(chip, xscom_base, stat, err_bits);
if (read_data) {
if (!(stat & OPB_STAT_READ_VALID)) {
@@ -159,7 +169,8 @@ static int64_t mfsi_opb_poll(uint32_t chip, uint32_t xscom_base,
}
static int64_t mfsi_opb_read(uint32_t chip, uint32_t xscom_base,
- uint32_t addr, uint32_t *data)
+ uint32_t addr, uint32_t *data,
+ uint32_t err_bits)
{
uint64_t opb_cmd = OPB_CMD_READ | OPB_CMD_32BIT;
int64_t rc;
@@ -178,11 +189,12 @@ static int64_t mfsi_opb_read(uint32_t chip, uint32_t xscom_base,
prerror("MFSI: XSCOM error %lld writing OPB CMD\n", rc);
return rc;
}
- return mfsi_opb_poll(chip, xscom_base, data);
+ return mfsi_opb_poll(chip, xscom_base, data, err_bits);
}
static int64_t mfsi_opb_write(uint32_t chip, uint32_t xscom_base,
- uint32_t addr, uint32_t data)
+ uint32_t addr, uint32_t data,
+ uint32_t err_bits)
{
uint64_t opb_cmd = OPB_CMD_WRITE | OPB_CMD_32BIT;
int64_t rc;
@@ -202,12 +214,12 @@ static int64_t mfsi_opb_write(uint32_t chip, uint32_t xscom_base,
prerror("MFSI: XSCOM error %lld writing OPB CMD\n", rc);
return rc;
}
- return mfsi_opb_poll(chip, xscom_base, NULL);
+ return mfsi_opb_poll(chip, xscom_base, NULL, err_bits);
}
static int64_t mfsi_get_addrs(uint32_t mfsi, uint32_t port,
uint32_t *xscom_base, uint32_t *port_base,
- uint32_t *reg_base)
+ uint32_t *reg_base, uint32_t *err_bits)
{
if (port > 7)
return OPAL_PARAMETER;
@@ -218,20 +230,25 @@ static int64_t mfsi_get_addrs(uint32_t mfsi, uint32_t port,
*xscom_base = PIB2OPB_MFSI0_ADDR;
*port_base = cMFSI_OPB_PORT_BASE + port * MFSI_OPB_PORT_STRIDE;
*reg_base = cMFSI_OPB_REG_BASE;
+ *err_bits = OPB_STAT_ERR_BASE | OPB_STAT_ERR_CMFSI;
break;
case MFSI_cMFSI1:
*xscom_base = PIB2OPB_MFSI1_ADDR;
*port_base = cMFSI_OPB_PORT_BASE + port * MFSI_OPB_PORT_STRIDE;
*reg_base = cMFSI_OPB_REG_BASE;
+ *err_bits = OPB_STAT_ERR_BASE | OPB_STAT_ERR_CMFSI;
break;
case MFSI_hMFSI0:
*xscom_base = PIB2OPB_MFSI0_ADDR;
*port_base = hMFSI_OPB_PORT_BASE + port * MFSI_OPB_PORT_STRIDE;
*reg_base = hMFSI_OPB_REG_BASE;
+ *err_bits = OPB_STAT_ERR_BASE | OPB_STAT_ERR_HMFSI;
break;
default:
return OPAL_PARAMETER;
}
+ *err_bits = *err_bits & mfsi_valid_err;
+
return OPAL_SUCCESS;
}
@@ -239,13 +256,13 @@ int64_t mfsi_read(uint32_t chip, uint32_t mfsi, uint32_t port,
uint32_t fsi_addr, uint32_t *data)
{
int64_t rc;
- uint32_t xscom, port_addr, reg;
+ uint32_t xscom, port_addr, reg, err_bits;
- rc = mfsi_get_addrs(mfsi, port, &xscom, &port_addr, ®);
+ rc = mfsi_get_addrs(mfsi, port, &xscom, &port_addr, ®, &err_bits);
if (rc)
return rc;
lock(&fsi_lock);
- rc = mfsi_opb_read(chip, xscom, port_addr + fsi_addr, data);
+ rc = mfsi_opb_read(chip, xscom, port_addr + fsi_addr, data, err_bits);
/* XXX Handle FSI level errors here, maybe reset port */
unlock(&fsi_lock);
@@ -256,13 +273,13 @@ int64_t mfsi_write(uint32_t chip, uint32_t mfsi, uint32_t port,
uint32_t fsi_addr, uint32_t data)
{
int64_t rc;
- uint32_t xscom, port_addr, reg;
+ uint32_t xscom, port_addr, reg, err_bits;
- rc = mfsi_get_addrs(mfsi, port, &xscom, &port_addr, ®);
+ rc = mfsi_get_addrs(mfsi, port, &xscom, &port_addr, ®, &err_bits);
if (rc)
return rc;
lock(&fsi_lock);
- rc = mfsi_opb_write(chip, xscom, port_addr + fsi_addr, data);
+ rc = mfsi_opb_write(chip, xscom, port_addr + fsi_addr, data, err_bits);
/* XXX Handle FSI level errors here, maybe reset port */
unlock(&fsi_lock);
@@ -278,16 +295,21 @@ void mfsi_init(void)
*/
chip = next_chip(NULL);
assert(chip);
- if (chip->type == PROC_CHIP_P8_MURANO) {
- /* Hardware Bug HW222712 on Murano DD1.0 causes the
- * any_error bit to be un-clearable so we just
- * have to ignore it
- */
- if (chip->ec_level < 0x20) {
- /* 16: cMFSI any-master-error */
- /* 24: hMFSI any-master-error */
- mfsi_valid_err &= 0xFFFF7F7F;
- }
- }
+ mfsi_valid_err = OPB_STAT_ERR_BASE | OPB_STAT_ERR_CMFSI | OPB_STAT_ERR_HMFSI;
+
+
+ /* Hardware Bug HW222712 on Murano DD1.0 causes the
+ * any_error bit to be un-clearable so we just
+ * have to ignore it. Additionally, HostBoot applies
+ * this to Venice too, though the comment there claims
+ * this is a Simics workaround.
+ *
+ * The doc says that bit can be safely ignored, so let's
+ * just not bother and always take it out.
+ */
+
+ /* 16: cMFSI any-master-error */
+ /* 24: hMFSI any-master-error */
+ mfsi_valid_err &= 0xFFFF7F7F;
}
--
2.1.4
More information about the Skiboot
mailing list