[Skiboot] [PATCH] NPU2 HMIs: dump out a *LOT* of npu2 registers for debugging

Stewart Smith stewart at linux.vnet.ibm.com
Wed Feb 28 17:52:12 AEDT 2018


This is not the way we want to end up doing this.

This is a hack to keep folks happy by not requiring a crondump to
debug NVIDIA/NPU2 issues.

Cc: stable
Signed-off-by: Stewart Smith <stewart at linux.vnet.ibm.com>
---
 core/hmi.c          | 38 +++++++++++++++++++++++++++++++++++++-
 hw/slw.c            |  4 ++--
 hw/xscom.c          | 36 ++++++++++++++++++++++--------------
 include/npu2-regs.h |  7 ++++++-
 include/xscom.h     |  4 ++--
 5 files changed, 69 insertions(+), 20 deletions(-)

diff --git a/core/hmi.c b/core/hmi.c
index 00d0fb7b545d..17983a334d43 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -1,4 +1,4 @@
-/* Copyright 2013-2014 IBM Corp.
+/* Copyright 2013-2018 IBM Corp.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,6 +29,7 @@
 #include <npu2-regs.h>
 #include <npu.h>
 #include <capp.h>
+#include <nvram.h>
 
 /*
  * HMER register layout:
@@ -544,7 +545,10 @@ static void find_npu2_checkstop_reason(int flat_chip_id,
 	uint64_t npu2_fir_action0_addr;
 	uint64_t npu2_fir_action1_addr;
 	uint64_t fatal_errors;
+	uint64_t npu_scom_dump[2];
+	bool npu2_hmi_verbose;
 	int total_errors = 0;
+	uint64_t r;
 
 	/* Find the NPU on the chip associated with the HMI. */
 	for_each_phb(phb) {
@@ -596,6 +600,38 @@ static void find_npu2_checkstop_reason(int flat_chip_id,
 	if (!total_errors)
 		return;
 
+	npu2_hmi_verbose = nvram_query_eq("npu2-hmi-verbose", "true");
+	/* Force this for now until we sort out something better */
+	npu2_hmi_verbose = true;
+
+	if (npu2_hmi_verbose) {
+		_xscom_lock();
+		for (r = NPU2_DEBUG_REG_START; r < NPU2_DEBUG_REG_END; r++) {
+			npu_scom_dump[0] = npu_scom_dump[1] = 0;
+			_xscom_read(flat_chip_id, r++, &npu_scom_dump[0], false, true);
+			_xscom_read(flat_chip_id, r,   &npu_scom_dump[1], false, true);
+			prlog(PR_ERR, "NPU: 0x%016llx=0x%016llx 0x%016llx=0x%016llx\n",
+			      r-1, npu_scom_dump[0],
+			      r, npu_scom_dump[1]);
+		}
+		for (r = NPU2_FIR_REGISTER_0; r < NPU2_FIR_REGISTER_END; r++) {
+			npu_scom_dump[0] = npu_scom_dump[1] = 0;
+			_xscom_read(flat_chip_id, r++, &npu_scom_dump[0], false, true);
+			_xscom_read(flat_chip_id, r,   &npu_scom_dump[1], false, true);
+			prlog(PR_ERR, "NPU: 0x%016llx=0x%016llx 0x%016llx=0x%016llx\n",
+			      r-1, npu_scom_dump[0],
+			      r, npu_scom_dump[1]);
+		}
+		_xscom_unlock();
+		prlog(PR_ERR, " _________________________ \n");
+		prlog(PR_ERR, "< It's Driver Debug time! >\n");
+		prlog(PR_ERR, " ------------------------- \n");
+		prlog(PR_ERR, "       \\   ,__,            \n");
+		prlog(PR_ERR, "        \\  (oo)____        \n");
+		prlog(PR_ERR, "           (__)    )\\      \n");
+		prlog(PR_ERR, "              ||--|| *     \n");
+	}
+
 	/* Set up the HMI event */
 	hmi_evt->severity = OpalHMI_SEV_WARNING;
 	hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
diff --git a/hw/slw.c b/hw/slw.c
index f3c837423a01..db238ecb06b5 100644
--- a/hw/slw.c
+++ b/hw/slw.c
@@ -1620,7 +1620,7 @@ void slw_update_timer_expiry(uint64_t new_target)
 		/* Grab generation and spin if odd */
 		_xscom_lock();
 		for (;;) {
-			rc = _xscom_read(slw_timer_chip, 0xE0006, &gen, false);
+			rc = _xscom_read(slw_timer_chip, 0xE0006, &gen, false, false);
 			if (rc) {
 				prerror("SLW: Error %lld reading tmr gen "
 					" count\n", rc);
@@ -1664,7 +1664,7 @@ void slw_update_timer_expiry(uint64_t new_target)
 		}
 
 		/* Re-check gen count */
-		rc = _xscom_read(slw_timer_chip, 0xE0006, &gen2, false);
+		rc = _xscom_read(slw_timer_chip, 0xE0006, &gen2, false, false);
 		if (rc) {
 			prerror("SLW: Error %lld re-reading tmr gen "
 				" count\n", rc);
diff --git a/hw/xscom.c b/hw/xscom.c
index 05012780aafe..1bcfd475e737 100644
--- a/hw/xscom.c
+++ b/hw/xscom.c
@@ -215,8 +215,9 @@ static int xscom_clear_error(uint32_t gcid, uint32_t pcb_addr)
 }
 
 static int64_t xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_addr,
-			      bool is_write, int64_t retries,
-			      int64_t *xscom_clear_retries)
+				  bool is_write, int64_t retries,
+				  int64_t *xscom_clear_retries,
+				  bool ignore_error)
 {
 	unsigned int stat = GETFIELD(SPR_HMER_XSCOM_STATUS, hmer);
 	int64_t rc = OPAL_HARDWARE;
@@ -277,9 +278,12 @@ static int64_t xscom_handle_error(uint64_t hmer, uint32_t gcid, uint32_t pcb_add
 	}
 
 	/* XXX: Create error log entry ? */
-	log_simple_error(&e_info(OPAL_RC_XSCOM_RW),
-		"XSCOM: %s error gcid=0x%x pcb_addr=0x%x stat=0x%x\n",
-		is_write ? "write" : "read", gcid, pcb_addr, stat);
+	if (!ignore_error)
+		log_simple_error(&e_info(OPAL_RC_XSCOM_RW),
+				 "XSCOM: %s error gcid=0x%x "
+				 "pcb_addr=0x%x stat=0x%x\n",
+				 is_write ? "write" : "read", gcid,
+				 pcb_addr, stat);
 
 	/* We need to reset the XSCOM or we'll hang on the next access */
 	xscom_reset(gcid, false);
@@ -322,14 +326,16 @@ static inline bool xscom_is_multicast_addr(uint32_t addr)
  * Low level XSCOM access functions, perform a single direct xscom
  * access via MMIO
  */
-static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
+static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val,
+			bool ignore_error)
 {
 	uint64_t hmer;
 	int64_t ret, retries;
 	int64_t xscom_clear_retries = XSCOM_CLEAR_MAX_RETRIES;
 
 	if (!xscom_gcid_ok(gcid)) {
-		prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
+		if (!ignore_error)
+			prerror("%s: invalid XSCOM gcid 0x%x\n", __func__, gcid);
 		return OPAL_PARAMETER;
 	}
 
@@ -351,7 +357,7 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
 
 		/* Handle error and possibly eventually retry */
 		ret = xscom_handle_error(hmer, gcid, pcb_addr, false, retries,
-				&xscom_clear_retries);
+					 &xscom_clear_retries, ignore_error);
 		if (ret != OPAL_BUSY)
 			break;
 	}
@@ -370,7 +376,8 @@ static int __xscom_read(uint32_t gcid, uint32_t pcb_addr, uint64_t *val)
 	if (proc_gen == proc_gen_p9 && ret == OPAL_XSCOM_CHIPLET_OFF)
 		return ret;
 
-	prerror("XSCOM: Read failed, ret =  %lld\n", ret);
+	if (!ignore_error)
+		prerror("XSCOM: Read failed, ret =  %lld\n", ret);
 	return ret;
 }
 
@@ -403,7 +410,7 @@ static int __xscom_write(uint32_t gcid, uint32_t pcb_addr, uint64_t val)
 
 		/* Handle error and possibly eventually retry */
 		ret = xscom_handle_error(hmer, gcid, pcb_addr, true, retries,
-				&xscom_clear_retries);
+					 &xscom_clear_retries, false);
 		if (ret != OPAL_BUSY)
 			break;
 	}
@@ -451,7 +458,7 @@ static int xscom_indirect_read_form0(uint32_t gcid, uint64_t pcb_addr,
 
 	/* Wait for completion */
 	for (retries = 0; retries < XSCOM_IND_MAX_RETRIES; retries++) {
-		rc = __xscom_read(gcid, addr, &data);
+		rc = __xscom_read(gcid, addr, &data, false);
 		if (rc)
 			goto bail;
 		if ((data & XSCOM_DATA_IND_COMPLETE) &&
@@ -513,7 +520,7 @@ static int xscom_indirect_write_form0(uint32_t gcid, uint64_t pcb_addr,
 
 	/* Wait for completion */
 	for (retries = 0; retries < XSCOM_IND_MAX_RETRIES; retries++) {
-		rc = __xscom_read(gcid, addr, &data);
+		rc = __xscom_read(gcid, addr, &data, false);
 		if (rc)
 			goto bail;
 		if ((data & XSCOM_DATA_IND_COMPLETE) &&
@@ -588,7 +595,8 @@ void _xscom_unlock(void)
 /*
  * External API
  */
-int _xscom_read(uint32_t partid, uint64_t pcb_addr, uint64_t *val, bool take_lock)
+int _xscom_read(uint32_t partid, uint64_t pcb_addr, uint64_t *val,
+		bool take_lock, bool ignore_error)
 {
 	uint32_t gcid;
 	int rc;
@@ -635,7 +643,7 @@ int _xscom_read(uint32_t partid, uint64_t pcb_addr, uint64_t *val, bool take_loc
 	if (pcb_addr & XSCOM_ADDR_IND_FLAG)
 		rc = xscom_indirect_read(gcid, pcb_addr, val);
 	else
-		rc = __xscom_read(gcid, pcb_addr & 0x7fffffff, val);
+		rc = __xscom_read(gcid, pcb_addr & 0x7fffffff, val, ignore_error);
 
 	/* Unlock it */
 	if (take_lock)
diff --git a/include/npu2-regs.h b/include/npu2-regs.h
index c1092735d371..73925f9ee86d 100644
--- a/include/npu2-regs.h
+++ b/include/npu2-regs.h
@@ -1,4 +1,4 @@
-/* Copyright 2013-2016 IBM Corp.
+/* Copyright 2013-2018 IBM Corp.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,10 @@ uint64_t npu2_read(struct npu2 *p, uint64_t reg);
 void npu2_write(struct npu2 *p, uint64_t reg, uint64_t val);
 void npu2_write_mask(struct npu2 *p, uint64_t reg, uint64_t val, uint64_t mask);
 
+/* SCOM Registers to dump on HMI to aid in debugging */
+#define NPU2_DEBUG_REG_START 0x5011000
+#define NPU2_DEBUG_REG_END   0x50110FF
+
 /* These aren't really NPU specific registers but we initialise them in NPU
  * code */
 #define MCD0_BANK0_CN3 0x301100d
@@ -468,6 +472,7 @@ void npu2_write_mask(struct npu2 *p, uint64_t reg, uint64_t val, uint64_t mask);
 #define NPU2_FIR_REGISTER_0			0x0000000005013C00
 #define NPU2_FIR_REGISTER_1			0x0000000005013C40
 #define NPU2_FIR_REGISTER_2			0x0000000005013C80
+#define NPU2_FIR_REGISTER_END			0x0000000005013CFF
 
 #define NPU2_TOTAL_FIR_REGISTERS		3
 
diff --git a/include/xscom.h b/include/xscom.h
index 98532240b116..3193abdbb6e9 100644
--- a/include/xscom.h
+++ b/include/xscom.h
@@ -225,7 +225,7 @@
 
 /* Use only in select places where multiple SCOMs are time/latency sensitive */
 extern void _xscom_lock(void);
-extern int _xscom_read(uint32_t partid, uint64_t pcb_addr, uint64_t *val, bool take_lock);
+extern int _xscom_read(uint32_t partid, uint64_t pcb_addr, uint64_t *val, bool take_lock, bool ignore_error);
 extern int _xscom_write(uint32_t partid, uint64_t pcb_addr, uint64_t val, bool take_lock);
 extern void _xscom_unlock(void);
 
@@ -233,7 +233,7 @@ extern void _xscom_unlock(void);
 /* Targeted SCOM access */
 static inline int xscom_read(uint32_t partid, uint64_t pcb_addr, uint64_t *val)
 {
-	return _xscom_read(partid, pcb_addr, val, true);
+	return _xscom_read(partid, pcb_addr, val, true, false);
 }
 static inline int xscom_write(uint32_t partid, uint64_t pcb_addr, uint64_t val) {
 	return _xscom_write(partid, pcb_addr, val, true);
-- 
2.14.3



More information about the Skiboot mailing list