[Skiboot] [RFC PATCH] direct-controls: p9 stop/cont fixes

Nicholas Piggin npiggin at gmail.com
Wed May 2 12:04:39 AEST 2018


Some recent work on pdbg and closer reading of the workbook has
shown we're not quite doing the right thing with thread starting
and stopping. These things were definitely broken before, and
work better with these new sequences when testing with pdbg, see:

https://lists.ozlabs.org/pipermail/pdbg/2018-May/000174.html

Fortunately at the moment breakage is mostly confined to error
paths, and to the dctl_cont_thread case, but errors are ecountered
here, and we may start using dctl_cont_thread more if we start
doig ramming or stepping or other interesting debugging things, so
it's good to tighten up.

I can split these out, just want to get feedback on it:

- p9_cont_thread is broken vs stop states. A quiesced thread in stop
  must not be hit with a thread cont scom because that causes it to
  execute the instruction after stop (rather than do the powersave
  wakeup sequence). We must do a clear maintainence mode instead.

- p9_cont_thread should check that the thread actually was quiesced
  in the first place. Anything could happen if we don't do this.

- If p9_stop_thread times out waiting for the thread to quiesce, do
  not hit it with a cont scom because again we don't know what state
  things are in. There seems to be no good way to de-assert the thread
  stop and recover here, so the thread may eventually quiesce and get
  stuck. But that's better than undefined behaviour that causes random
  crashes.

- The important non-error-case fix is to not proceed with direct
  controls if the thread was already quiesced. This implies someone
  else is doing direct controls (e.g., pdbg), so proceeding is going
  to cause state to get trampled on and things to go south. There
  is still a small race condition where two users could quiesce the
  thread at the same time. Not sure how we could arbitrate exclusive
  access to direct controls, it would be good to fix, but it's a
  much less concerning window. The common case is you stop a thread
  with pdbg, then the host's NMI IPI watchdog tries to hit it. This
  should fix that case.
---
 core/direct-controls.c | 100 ++++++++++++++++++++++++++++++-----------
 1 file changed, 75 insertions(+), 25 deletions(-)

diff --git a/core/direct-controls.c b/core/direct-controls.c
index c7b9c9b7..227608df 100644
--- a/core/direct-controls.c
+++ b/core/direct-controls.c
@@ -273,10 +273,14 @@ static int p8_sreset_thread(struct cpu_thread *cpu)
 #define P9_QUIESCE_POLL_INTERVAL	100
 #define P9_QUIESCE_TIMEOUT		100000
 
+#define P9_CORE_THREAD_STATE		0x10ab3
+#define P9_THREAD_INFO			0x10a9b
+
 #define P9_EC_DIRECT_CONTROLS		0x10a9c
 #define P9_THREAD_STOP(t)		PPC_BIT(7 + 8*(t))
 #define P9_THREAD_CONT(t)		PPC_BIT(6 + 8*(t))
 #define P9_THREAD_SRESET(t)		PPC_BIT(4 + 8*(t))
+#define P9_THREAD_CLEAR_MAINT(t)	PPC_BIT(3 + 8*(t))
 #define P9_THREAD_PWR(t)		PPC_BIT(32 + 8*(t))
 
 /* EC_PPM_SPECIAL_WKUP_HYP */
@@ -412,6 +416,72 @@ static int p9_thread_quiesced(struct cpu_thread *cpu)
 	return 0;
 }
 
+static int p9_cont_thread(struct cpu_thread *cpu)
+{
+	uint32_t chip_id = pir_to_chip_id(cpu->pir);
+	uint32_t core_id = pir_to_core_id(cpu->pir);
+	uint32_t thread_id = pir_to_thread_id(cpu->pir);
+	uint32_t cts_addr;
+	uint32_t ti_addr;
+	uint32_t dctl_addr;
+	uint64_t core_thread_state;
+	uint64_t thread_info;
+	bool active, stop;
+	int rc;
+
+	rc = p9_thread_quiesced(cpu);
+	if (rc < 0)
+		return rc;
+	if (!rc) {
+		prlog(PR_ERR, "Could not cont thread %u:%u:%u:"
+				" Thread is not quiesced.\n",
+				chip_id, core_id, thread_id);
+		return OPAL_BUSY;
+	}
+
+	cts_addr = XSCOM_ADDR_P9_EC(core_id, P9_CORE_THREAD_STATE);
+	ti_addr = XSCOM_ADDR_P9_EC(core_id, P9_THREAD_INFO);
+	dctl_addr = XSCOM_ADDR_P9_EC(core_id, P9_EC_DIRECT_CONTROLS);
+
+	if (xscom_read(chip_id, cts_addr, &core_thread_state)) {
+		prlog(PR_ERR, "Could not resume thread %u:%u:%u:"
+				" Unable to read CORE_THREAD_STATE.\n",
+				chip_id, core_id, thread_id);
+		return OPAL_HARDWARE;
+	}
+	if (core_thread_state & PPC_BIT(56 + thread_id))
+		stop = true;
+	else
+		stop = false;
+
+	if (xscom_read(chip_id, ti_addr, &thread_info)) {
+		prlog(PR_ERR, "Could not resume thread %u:%u:%u:"
+				" Unable to read THREAD_INFO.\n",
+				chip_id, core_id, thread_id);
+		return OPAL_HARDWARE;
+	}
+	if (thread_info & PPC_BIT(thread_id))
+		active = true;
+	else
+		active = false;
+
+	if (!active || stop) {
+		if (xscom_write(chip_id, dctl_addr, P9_THREAD_CLEAR_MAINT(thread_id))) {
+			prlog(PR_ERR, "Could not resume thread %u:%u:%u:"
+				      " Unable to write EC_DIRECT_CONTROLS.\n",
+				      chip_id, core_id, thread_id);
+		}
+	} else {
+		if (xscom_write(chip_id, dctl_addr, P9_THREAD_CONT(thread_id))) {
+			prlog(PR_ERR, "Could not resume thread %u:%u:%u:"
+				      " Unable to write EC_DIRECT_CONTROLS.\n",
+				      chip_id, core_id, thread_id);
+		}
+	}
+
+	return 0;
+}
+
 static int p9_stop_thread(struct cpu_thread *cpu)
 {
 	uint32_t chip_id = pir_to_chip_id(cpu->pir);
@@ -426,10 +496,12 @@ static int p9_stop_thread(struct cpu_thread *cpu)
 	rc = p9_thread_quiesced(cpu);
 	if (rc < 0)
 		return rc;
-	if (rc)
-		prlog(PR_WARNING, "Stopping thread %u:%u:%u warning:"
-				" thread is quiesced already.\n",
+	if (rc) {
+		prlog(PR_ERR, "Could not stop thread %u:%u:%u:"
+				" Thread is quiesced already.\n",
 				chip_id, core_id, thread_id);
+		return OPAL_BUSY;
+	}
 
 	if (xscom_write(chip_id, dctl_addr, P9_THREAD_STOP(thread_id))) {
 		prlog(PR_ERR, "Could not stop thread %u:%u:%u:"
@@ -452,31 +524,9 @@ static int p9_stop_thread(struct cpu_thread *cpu)
 			" Unable to quiesce thread.\n",
 			chip_id, core_id, thread_id);
 
-	if (xscom_write(chip_id, dctl_addr, P9_THREAD_CONT(thread_id))) {
-		prlog(PR_ERR, "Could not resume thread %u:%u:%u:"
-				" Unable to write EC_DIRECT_CONTROLS.\n",
-				chip_id, core_id, thread_id);
-	}
-
 	return OPAL_HARDWARE;
 }
 
-static int p9_cont_thread(struct cpu_thread *cpu)
-{
-	uint32_t chip_id = pir_to_chip_id(cpu->pir);
-	uint32_t core_id = pir_to_core_id(cpu->pir);
-	uint32_t thread_id = pir_to_thread_id(cpu->pir);
-	uint32_t dctl_addr;
-
-	dctl_addr = XSCOM_ADDR_P9_EC(core_id, P9_EC_DIRECT_CONTROLS);
-	if (xscom_write(chip_id, dctl_addr, P9_THREAD_CONT(thread_id))) {
-		prlog(PR_ERR, "Could not resume thread %u:%u:%u:"
-				" Unable to write EC_DIRECT_CONTROLS.\n",
-				chip_id, core_id, thread_id);
-	}
-
-	return 0;
-}
 
 static int p9_sreset_thread(struct cpu_thread *cpu)
 {
-- 
2.17.0



More information about the Skiboot mailing list