[Skiboot] [PATCH 09/10] centaur: Improve FSI SCOM error handling
Benjamin Herrenschmidt
benh at kernel.crashing.org
Tue Jun 23 14:25:59 AEST 2015
Based on HostBoot, recovers from bad XSCOM addresses
Signed-off-by: Benjamin Herrenschmidt <benh at kernel.crashing.org>
---
hw/centaur.c | 122 ++++++++++++++++++++++++++++++++++--------------------
include/centaur.h | 1 +
2 files changed, 78 insertions(+), 45 deletions(-)
diff --git a/hw/centaur.c b/hw/centaur.c
index 24d2836..3df1291 100644
--- a/hw/centaur.c
+++ b/hw/centaur.c
@@ -59,6 +59,9 @@
#define SCAC_CONFIG_CLR 0x020115d0
#define SCAC_ENABLE_MSK PPC_BIT(0)
+#define cent_log(__lev, __c, __fmt, ...) \
+ prlog(__lev, "CENTAUR %x: " __fmt, __c->part_id, ##__VA_ARGS__)
+
static int64_t centaur_fsiscom_complete(struct centaur_chip *centaur)
{
int64_t rc;
@@ -67,17 +70,49 @@ static int64_t centaur_fsiscom_complete(struct centaur_chip *centaur)
rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
centaur->fsi_master_port, FSI_STATUS_REG, &stat);
if (rc) {
- /* XXX Improve logging */
- prerror("CENTAUR: MFSI read error %lld reading STAT\n", rc);
+ cent_log(PR_ERR, centaur, "MFSI read error %lld reading STAT\n", rc);
return rc;
}
if ((stat & (FSI_STATUS_ABORT | FSI_STATUS_ERRORS)) == 0)
return OPAL_SUCCESS;
- prerror("CENTAUR: Remote FSI error, stat=0x%08x\n", stat);
+ cent_log(PR_ERR, centaur, "Remote FSI SCOM error, status=0x%08x\n", stat);
- /* XXX Handle recovery */
+ /* All 1's ? Assume it's gone */
+ if (stat == 0xffffffffu) {
+ cent_log(PR_ERR, centaur, "Chip appears to be dead !\n");
+ centaur->valid = false;
+
+ /* Here, hostboot grabs a pile of FFDC from the FSI layer,
+ * we could do that too ...
+ */
+ return OPAL_HARDWARE;
+ }
+ /* Here HB prints the GPx registers which I believe are only
+ * in the host (FSI master). We skip that for now, we don't have
+ * a good API to them
+ */
+
+ /* Recovery sequence from HostBoot fsiscom.C
+ * if SCOM fails and FSI Master displays "MasterTimeOut"
+ * then 7,6 <covered by FSI driver>
+ * else if SCOM fails and FSI2PIB Status shows PIB abort
+ * then just perform unit reset (6) and wait 1 ms
+ * else (PIB_abort='0' but PIB error is unequal 0)
+ * then just perform unit reset (6) (wait not needed).
+ *
+ * Note: Waiting 1ms inside OPAL is a BIG NO NO !!! We have
+ * no choice but doing it at the moment but that will have
+ * to be fixed one way or another, possibly by returning some
+ * kind of busy status until the delay is expired.
+ */
+ rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+ centaur->fsi_master_port, FSI_ENG_RESET_REG, 0);
+ if (rc) {
+ cent_log(PR_ERR, centaur, "MFSI write error %lld resetting SCOM engine\n",
+ rc);
+ }
return OPAL_HARDWARE;
}
@@ -90,8 +125,7 @@ static int64_t centaur_fsiscom_read(struct centaur_chip *centaur, uint32_t pcb_a
rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
centaur->fsi_master_port, FSI_CMD_REG, pcb_addr | FSI_CMD_RD);
if (rc) {
- /* XXX Improve logging */
- prerror("CENTAUR: MFSI write error %lld writing CMD\n", rc);
+ cent_log(PR_ERR, centaur, "MFSI write error %lld writing CMD\n", rc);
return rc;
}
@@ -102,15 +136,13 @@ static int64_t centaur_fsiscom_read(struct centaur_chip *centaur, uint32_t pcb_a
rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
centaur->fsi_master_port, FSI_DATA0_REG, &data0);
if (rc) {
- /* XXX Improve logging */
- prerror("CENTAUR: MFSI read error %lld reading DATA0\n", rc);
+ cent_log(PR_ERR, centaur, "MFSI read error %lld reading DATA0\n", rc);
return rc;
}
rc = mfsi_read(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
centaur->fsi_master_port, FSI_DATA1_REG, &data1);
if (rc) {
- /* XXX Improve logging */
- prerror("CENTAUR: MFSI read error %lld readking DATA1\n", rc);
+ cent_log(PR_ERR, centaur, "MFSI read error %lld readking DATA1\n", rc);
return rc;
}
@@ -119,6 +151,33 @@ static int64_t centaur_fsiscom_read(struct centaur_chip *centaur, uint32_t pcb_a
return OPAL_SUCCESS;
}
+static int64_t centaur_fsiscom_write(struct centaur_chip *centaur, uint32_t pcb_addr,
+ uint64_t val)
+{
+ int64_t rc;
+
+ rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+ centaur->fsi_master_port, FSI_DATA0_REG, hi32(val));
+ if (rc) {
+ cent_log(PR_ERR, centaur, "MFSI write error %lld writing DATA0\n", rc);
+ return rc;
+ }
+ rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+ centaur->fsi_master_port, FSI_DATA1_REG, lo32(val));
+ if (rc) {
+ cent_log(PR_ERR, centaur, "MFSI write error %lld writing DATA1\n", rc);
+ return rc;
+ }
+ rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
+ centaur->fsi_master_port, FSI_CMD_REG, pcb_addr | FSI_CMD_WR);
+ if (rc) {
+ cent_log(PR_ERR, centaur, "MFSI write error %lld writing CMD\n", rc);
+ return rc;
+ }
+
+ return centaur_fsiscom_complete(centaur);
+}
+
struct centaur_chip *get_centaur(uint32_t part_id)
{
uint32_t hchip_id, mchan;
@@ -156,36 +215,6 @@ struct centaur_chip *get_centaur(uint32_t part_id)
return centaur;
}
-static int64_t centaur_fsiscom_write(struct centaur_chip *centaur, uint32_t pcb_addr,
- uint64_t val)
-{
- int64_t rc;
-
- rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
- centaur->fsi_master_port, FSI_DATA0_REG, hi32(val));
- if (rc) {
- /* XXX Improve logging */
- prerror("CENTAUR: MFSI write error %lld writing DATA0\n", rc);
- return rc;
- }
- rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
- centaur->fsi_master_port, FSI_DATA1_REG, lo32(val));
- if (rc) {
- /* XXX Improve logging */
- prerror("CENTAUR: MFSI write error %lld writing DATA1\n", rc);
- return rc;
- }
- rc = mfsi_write(centaur->fsi_master_chip_id, centaur->fsi_master_engine,
- centaur->fsi_master_port, FSI_CMD_REG, pcb_addr | FSI_CMD_WR);
- if (rc) {
- /* XXX Improve logging */
- prerror("CENTAUR: MFSI write error %lld writing CMD\n", rc);
- return rc;
- }
-
- return centaur_fsiscom_complete(centaur);
-}
-
int64_t centaur_xscom_read(uint32_t id, uint64_t pcb_addr, uint64_t *val)
{
struct centaur_chip *centaur = get_centaur(id);
@@ -223,8 +252,9 @@ static bool centaur_check_id(struct centaur_chip *centaur)
rc = centaur_fsiscom_read(centaur, 0xf000f, &val);
if (rc) {
- prerror("CENTAUR: FSISCOM error %lld reading ID register\n",
- rc);
+ cent_log(PR_ERR, centaur,
+ " FSISCOM error %lld reading ID register\n",
+ rc);
return false;
}
@@ -233,7 +263,8 @@ static bool centaur_check_id(struct centaur_chip *centaur)
/* Identify chip */
if ((val & 0xff) != 0xe9) {
- prerror("CENTAUR: CFAM ID 0x%02x is not a Centaur !\n",
+ cent_log(PR_ERR, centaur,
+ " CFAM ID 0x%02x is not a Centaur !\n",
(unsigned int)(val & 0xff));
return false;
}
@@ -287,6 +318,7 @@ static bool centaur_add(uint32_t part_id, uint32_t mchip, uint32_t meng,
prerror("CENTAUR: Duplicate centaur !\n");
return false;
}
+ centaur->part_id = part_id;
centaur->fsi_master_chip_id = mchip;
centaur->fsi_master_port = mport;
centaur->fsi_master_engine = meng ? MFSI_cMFSI1 : MFSI_cMFSI0;
@@ -295,7 +327,7 @@ static bool centaur_add(uint32_t part_id, uint32_t mchip, uint32_t meng,
if (!centaur_check_id(centaur))
return false;
- printf("CENTAUR: ChipID 0x%x [DD%x.%x]\n", part_id,
+ cent_log(PR_INFO, centaur, "Found DD%x.%x chip\n",
centaur->ec_level >> 4,
centaur->ec_level & 0xf);
@@ -343,7 +375,7 @@ int64_t centaur_enable_sensor_cache(uint32_t part_id)
lock(&centaur->lock);
if (centaur->scache_disable_count == 0) {
- prerror("CENTAUR: Cache count going negative !\n");
+ cent_log(PR_ERR, centaur, "Cache count going negative !\n");
backtrace();
goto bail;
}
diff --git a/include/centaur.h b/include/centaur.h
index f515da7..6453e13 100644
--- a/include/centaur.h
+++ b/include/centaur.h
@@ -25,6 +25,7 @@
struct centaur_chip {
bool valid;
uint8_t ec_level;
+ uint32_t part_id;
uint32_t fsi_master_chip_id;
uint32_t fsi_master_port;
uint32_t fsi_master_engine;
--
2.1.4
More information about the Skiboot
mailing list