[Skiboot] [PATCH 2/2] NPU2: dump NPU2 registers on npu2 HMI

Stewart Smith stewart at linux.vnet.ibm.com
Tue Mar 27 15:42:58 AEDT 2018


Due to the nature of debugging npu2 issues, folk are wanting the
full list of NPU2 registers dumped when there's a problem.

We have to list out each register explicitly, because traversing the
range triggers FIR bits that confuse PRD.

Suggested-by: Ryan Black <rblack at us.ibm.com>
Signed-off-by: Stewart Smith <stewart at linux.vnet.ibm.com>
---
 core/hmi.c | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 73 insertions(+), 2 deletions(-)

diff --git a/core/hmi.c b/core/hmi.c
index 1a6d145c19db..162dd8a11253 100644
--- a/core/hmi.c
+++ b/core/hmi.c
@@ -1,4 +1,4 @@
-/* Copyright 2013-2014 IBM Corp.
+/* Copyright 2013-2018 IBM Corp.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,6 +29,7 @@
 #include <npu2-regs.h>
 #include <npu.h>
 #include <capp.h>
+#include <nvram.h>
 
 /*
  * HMER register layout:
@@ -567,6 +568,59 @@ static void find_nx_checkstop_reason(int flat_chip_id,
 	*event_generated = true;
 }
 
+/*
+ * NPU2 SCOM addresses to dump on an NPU2 HMI, terminated by a zero entry.
+ * If the year is 2018 and you still see all these hardcoded, you
+ * should really replace this with the neat macros that are in the
+ * NPU2 code rather than this horrible listing of every single
+ * NPU2 register hardcoded for a specific chip.
+ * I feel dirty having even written it.
+ */
+static uint32_t npu2_scom_dump[] = {
+	0x5011017, 0x5011047, 0x5011077, 0x50110A7,
+	0x5011217, 0x5011247, 0x5011277, 0x50112A7,
+	0x5011417, 0x5011447, 0x5011477, 0x50114A7,
+	0x50110DA, 0x50112DA, 0x50114DA,
+	0x50110DB, 0x50112DB, 0x50114DB,
+	0x5011011, 0x5011041, 0x5011071, 0x50110A1,
+	0x5011211, 0x5011241, 0x5011271, 0x50112A1,
+	0x5011411, 0x5011441, 0x5011471, 0x50114A1,
+	0x5011018, 0x5011048, 0x5011078, 0x50110A8,
+	0x5011218, 0x5011248, 0x5011278, 0x50112A8,
+	0x5011418, 0x5011448, 0x5011478, 0x50114A8,
+	0x5011640,
+	0x5011114, 0x5011134, 0x5011314, 0x5011334,
+	0x5011514, 0x5011534, 0x5011118, 0x5011138,
+	0x5011318, 0x5011338, 0x5011518, 0x5011538,
+	0x50110D8, 0x50112D8, 0x50114D8,
+	0x50110D9, 0x50112D9, 0x50114D9,
+	0x5011019, 0x5011049, 0x5011079, 0x50110A9,
+	0x5011219, 0x5011249, 0x5011279, 0x50112A9,
+	0x5011419, 0x5011449, 0x5011479, 0x50114A9,
+	0x50110F4, 0x50112F4, 0x50114F4,
+	0x50110F5, 0x50112F5, 0x50114F5,
+	0x50110F6, 0x50112F6, 0x50114F6,
+	0x50110FD, 0x50112FD, 0x50114FD,
+	0x50110FE, 0x50112FE, 0x50114FE,
+	0x00	/* sentinel: dump_scoms() stops at the first zero entry */
+};
+
+/* Log every SCOM in the zero-terminated list, skipping unreadable ones. */
+static void dump_scoms(int flat_chip_id, const char *unit, uint32_t *scoms)
+{
+	uint64_t value;
+
+	/* Step in the for-header so a failed read cannot loop forever. */
+	for (; *scoms != 0; scoms++) {
+		value = 0;
+		if (_xscom_read(flat_chip_id, *scoms, &value, false)
+		    != OPAL_SUCCESS)
+			continue;
+		prlog(PR_ERR, "%s: 0x%08x=0x%016llx\n",
+		      unit, *scoms, value);
+	}
+}
+
 static void find_npu2_checkstop_reason(int flat_chip_id,
 				      struct OpalHMIEvent *hmi_evt,
 				      bool *event_generated)
@@ -574,7 +628,7 @@ static void find_npu2_checkstop_reason(int flat_chip_id,
 	struct phb *phb;
 	struct npu *p = NULL;
 	int i;
-
+	bool npu2_hmi_verbose = false;
 	uint64_t npu2_fir;
 	uint64_t npu2_fir_mask;
 	uint64_t npu2_fir_action0;
@@ -636,6 +690,23 @@ static void find_npu2_checkstop_reason(int flat_chip_id,
 	if (!total_errors)
 		return;
 
+	npu2_hmi_verbose = nvram_query_eq("npu2-hmi-verbose", "true");
+	/* Force this for now until we sort out something better */
+	npu2_hmi_verbose = true;
+
+	if (npu2_hmi_verbose) {
+		_xscom_lock();
+		dump_scoms(flat_chip_id, "NPU2", npu2_scom_dump);
+		_xscom_unlock();
+		prlog(PR_ERR, " _________________________ \n");
+		prlog(PR_ERR, "< It's Driver Debug time! >\n");
+		prlog(PR_ERR, " ------------------------- \n");
+		prlog(PR_ERR, "       \\   ,__,            \n");
+		prlog(PR_ERR, "        \\  (oo)____        \n");
+		prlog(PR_ERR, "           (__)    )\\      \n");
+		prlog(PR_ERR, "              ||--|| *     \n");
+	}
+
 	/* Set up the HMI event */
 	hmi_evt->severity = OpalHMI_SEV_WARNING;
 	hmi_evt->type = OpalHMI_ERROR_MALFUNC_ALERT;
-- 
2.14.3



More information about the Skiboot mailing list