[Skiboot] [PATCH skiboot] npu2: Increase timeout for L2/L3 cache purging

Tue Jun 25 14:29:04 AEST 2019

On NVLink2 bridge reset, we purge all L2/L3 caches in the system.
This is an asynchronous operation for which we have a 2ms timeout. There
are reports that this is not enough and "PURGE L3 on core xxx timed out"
messages appear (for reference: on the test setup this takes
280us..780us).

This defines the timeout as a macro and changes this from 2ms to 20ms.

This adds a tracepoint to tell how long it took to purge all the caches.

Signed-off-by: Alexey Kardashevskiy <aik at ozlabs.ru>
---

It would be interesting to know how long it can possibly take and if it
depends on the actual GPU load and usage pattern.

To enable or disable traces, run "nvram" as shown below and then reboot
the host:

- enable traces:
sudo nvram  -p ibm,skiboot --update-config log-level-memory=trace
sudo nvram  -p ibm,skiboot --update-config log-level-driver=trace

- disable traces:
sudo nvram  -p ibm,skiboot --update-config log-level-memory=
sudo nvram  -p ibm,skiboot --update-config log-level-driver=
---
 include/npu2-regs.h |  2 ++
 hw/npu2.c           | 20 +++++++++++++-------
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/include/npu2-regs.h b/include/npu2-regs.h
index 3cb587adc354..00a72e685b6e 100644
--- a/include/npu2-regs.h
+++ b/include/npu2-regs.h
@@ -798,6 +798,8 @@ void npu2_scom_write(uint64_t gcid, uint64_t scom_base,
 #define L3_PRD_PURGE_TTYPE_MASK 		PPC_BIT(1) | PPC_BIT(2) | PPC_BIT(3) | PPC_BIT(4)
 #define L3_FULL_PURGE				0x0
 
+#define L2_L3_PRD_PURGE_TIMEOUT_MS		20
+
 /* Config registers for NPU2 */
 #define NPU_STCK0_CS_SM0_MISC_CONFIG0		0x5011000
 #define NPU_STCK0_CS_SM1_MISC_CONFIG0		0x5011030
diff --git a/hw/npu2.c b/hw/npu2.c
index 3a2808d7133c..e8ec5cc9a43d 100644
--- a/hw/npu2.c
+++ b/hw/npu2.c
@@ -336,7 +336,7 @@ static int wait_l2_purge(uint32_t chip_id, uint32_t core_id)
 	uint64_t val;
 	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L2_PRD_PURGE_CMD_REG);
 	unsigned long now = mftb();
-	unsigned long end = now + msecs_to_tb(2);
+	unsigned long end = now + msecs_to_tb(L2_L3_PRD_PURGE_TIMEOUT_MS);
 	int rc;
 
 	while (1) {
@@ -386,7 +386,7 @@ static int wait_l3_purge(uint32_t chip_id, uint32_t core_id)
 	uint64_t val;
 	uint64_t addr = XSCOM_ADDR_P9_EX(core_id, L3_PRD_PURGE_REG);
 	unsigned long now = mftb();
-	unsigned long end = now + msecs_to_tb(2);
+	unsigned long end = now + msecs_to_tb(L2_L3_PRD_PURGE_TIMEOUT_MS);
 	int rc;
 
 	/* Trigger bit is automatically set to zero when flushing is done */
@@ -414,6 +414,7 @@ static int64_t purge_l2_l3_caches(void)
 	struct cpu_thread *t;
 	uint64_t core_id, prev_core_id = (uint64_t)-1;
 	int rc;
+	unsigned long now = mftb();
 
 	for_each_ungarded_cpu(t) {
 		/* Only need to do it once per core chiplet */
@@ -423,10 +424,10 @@ static int64_t purge_l2_l3_caches(void)
 		prev_core_id = core_id;
 		rc = start_l2_purge(t->chip_id, core_id);
 		if (rc)
-			return rc;
+			goto trace_exit;
 		rc = start_l3_purge(t->chip_id, core_id);
 		if (rc)
-			return rc;
+			goto trace_exit;
 	}
 
 	prev_core_id = (uint64_t)-1;
@@ -439,12 +440,17 @@ static int64_t purge_l2_l3_caches(void)
 
 		rc = wait_l2_purge(t->chip_id, core_id);
 		if (rc)
-			return rc;
+			goto trace_exit;
 		rc = wait_l3_purge(t->chip_id, core_id);
 		if (rc)
-			return rc;
+			goto trace_exit;
 	}
-	return OPAL_SUCCESS;
+
+trace_exit:
+	prlog(PR_TRACE, "L2/L3 purging took %ldus\n",
+			tb_to_usecs(mftb() - now));
+
+	return rc;
 }
 
 static int64_t npu2_dev_cfg_exp_devcap(void *dev,
-- 
2.17.1