[Skiboot] [RFC PATCH] direct-controls: p9 stop/cont fixes
Nicholas Piggin
npiggin at gmail.com
Wed May 2 12:04:39 AEST 2018
Some recent work on pdbg and closer reading of the workbook has
shown we're not quite doing the right thing with thread starting
and stopping. These things were definitely broken before, and
work better with these new sequences when testing with pdbg, see:
https://lists.ozlabs.org/pipermail/pdbg/2018-May/000174.html
Fortunately at the moment breakage is mostly confined to error
paths, and to the dctl_cont_thread case, but errors are ecountered
here, and we may start using dctl_cont_thread more if we start
doig ramming or stepping or other interesting debugging things, so
it's good to tighten up.
I can split these out, just want to get feedback on it:
- p9_cont_thread is broken vs stop states. A quiesced thread in stop
must not be hit with a thread cont scom because that causes it to
execute the instruction after stop (rather than do the powersave
wakeup sequence). We must do a clear maintainence mode instead.
- p9_cont_thread should check that the thread actually was quiesced
in the first place. Anything could happen if we don't do this.
- If p9_stop_thread times out waiting for the thread to quiesce, do
not hit it with a cont scom because again we don't know what state
things are in. There seems to be no good way to de-assert the thread
stop and recover here, so the thread may eventually quiesce and get
stuck. But that's better than undefined behaviour that causes random
crashes.
- The important non-error-case fix is to not proceed with direct
controls if the thread was already quiesced. This implies someone
else is doing direct controls (e.g., pdbg), so proceeding is going
to cause state to get trampled on and things to go south. There
is still a small race condition where two users could quiesce the
thread at the same time. Not sure how we could arbitrate exclusive
access to direct controls, it would be good to fix, but it's a
much less concerning window. The common case is you stop a thread
with pdbg, then the host's NMI IPI watchdog tries to hit it. This
should fix that case.
---
core/direct-controls.c | 100 ++++++++++++++++++++++++++++++-----------
1 file changed, 75 insertions(+), 25 deletions(-)
diff --git a/core/direct-controls.c b/core/direct-controls.c
index c7b9c9b7..227608df 100644
--- a/core/direct-controls.c
+++ b/core/direct-controls.c
@@ -273,10 +273,14 @@ static int p8_sreset_thread(struct cpu_thread *cpu)
#define P9_QUIESCE_POLL_INTERVAL 100
#define P9_QUIESCE_TIMEOUT 100000
+#define P9_CORE_THREAD_STATE 0x10ab3
+#define P9_THREAD_INFO 0x10a9b
+
#define P9_EC_DIRECT_CONTROLS 0x10a9c
#define P9_THREAD_STOP(t) PPC_BIT(7 + 8*(t))
#define P9_THREAD_CONT(t) PPC_BIT(6 + 8*(t))
#define P9_THREAD_SRESET(t) PPC_BIT(4 + 8*(t))
+#define P9_THREAD_CLEAR_MAINT(t) PPC_BIT(3 + 8*(t))
#define P9_THREAD_PWR(t) PPC_BIT(32 + 8*(t))
/* EC_PPM_SPECIAL_WKUP_HYP */
@@ -412,6 +416,72 @@ static int p9_thread_quiesced(struct cpu_thread *cpu)
return 0;
}
+static int p9_cont_thread(struct cpu_thread *cpu)
+{
+ uint32_t chip_id = pir_to_chip_id(cpu->pir);
+ uint32_t core_id = pir_to_core_id(cpu->pir);
+ uint32_t thread_id = pir_to_thread_id(cpu->pir);
+ uint32_t cts_addr;
+ uint32_t ti_addr;
+ uint32_t dctl_addr;
+ uint64_t core_thread_state;
+ uint64_t thread_info;
+ bool active, stop;
+ int rc;
+
+ rc = p9_thread_quiesced(cpu);
+ if (rc < 0)
+ return rc;
+ if (!rc) {
+ prlog(PR_ERR, "Could not cont thread %u:%u:%u:"
+ " Thread is not quiesced.\n",
+ chip_id, core_id, thread_id);
+ return OPAL_BUSY;
+ }
+
+ cts_addr = XSCOM_ADDR_P9_EC(core_id, P9_CORE_THREAD_STATE);
+ ti_addr = XSCOM_ADDR_P9_EC(core_id, P9_THREAD_INFO);
+ dctl_addr = XSCOM_ADDR_P9_EC(core_id, P9_EC_DIRECT_CONTROLS);
+
+ if (xscom_read(chip_id, cts_addr, &core_thread_state)) {
+ prlog(PR_ERR, "Could not resume thread %u:%u:%u:"
+ " Unable to read CORE_THREAD_STATE.\n",
+ chip_id, core_id, thread_id);
+ return OPAL_HARDWARE;
+ }
+ if (core_thread_state & PPC_BIT(56 + thread_id))
+ stop = true;
+ else
+ stop = false;
+
+ if (xscom_read(chip_id, ti_addr, &thread_info)) {
+ prlog(PR_ERR, "Could not resume thread %u:%u:%u:"
+ " Unable to read THREAD_INFO.\n",
+ chip_id, core_id, thread_id);
+ return OPAL_HARDWARE;
+ }
+ if (thread_info & PPC_BIT(thread_id))
+ active = true;
+ else
+ active = false;
+
+ if (!active || stop) {
+ if (xscom_write(chip_id, dctl_addr, P9_THREAD_CLEAR_MAINT(thread_id))) {
+ prlog(PR_ERR, "Could not resume thread %u:%u:%u:"
+ " Unable to write EC_DIRECT_CONTROLS.\n",
+ chip_id, core_id, thread_id);
+ }
+ } else {
+ if (xscom_write(chip_id, dctl_addr, P9_THREAD_CONT(thread_id))) {
+ prlog(PR_ERR, "Could not resume thread %u:%u:%u:"
+ " Unable to write EC_DIRECT_CONTROLS.\n",
+ chip_id, core_id, thread_id);
+ }
+ }
+
+ return 0;
+}
+
static int p9_stop_thread(struct cpu_thread *cpu)
{
uint32_t chip_id = pir_to_chip_id(cpu->pir);
@@ -426,10 +496,12 @@ static int p9_stop_thread(struct cpu_thread *cpu)
rc = p9_thread_quiesced(cpu);
if (rc < 0)
return rc;
- if (rc)
- prlog(PR_WARNING, "Stopping thread %u:%u:%u warning:"
- " thread is quiesced already.\n",
+ if (rc) {
+ prlog(PR_ERR, "Could not stop thread %u:%u:%u:"
+ " Thread is quiesced already.\n",
chip_id, core_id, thread_id);
+ return OPAL_BUSY;
+ }
if (xscom_write(chip_id, dctl_addr, P9_THREAD_STOP(thread_id))) {
prlog(PR_ERR, "Could not stop thread %u:%u:%u:"
@@ -452,31 +524,9 @@ static int p9_stop_thread(struct cpu_thread *cpu)
" Unable to quiesce thread.\n",
chip_id, core_id, thread_id);
- if (xscom_write(chip_id, dctl_addr, P9_THREAD_CONT(thread_id))) {
- prlog(PR_ERR, "Could not resume thread %u:%u:%u:"
- " Unable to write EC_DIRECT_CONTROLS.\n",
- chip_id, core_id, thread_id);
- }
-
return OPAL_HARDWARE;
}
-static int p9_cont_thread(struct cpu_thread *cpu)
-{
- uint32_t chip_id = pir_to_chip_id(cpu->pir);
- uint32_t core_id = pir_to_core_id(cpu->pir);
- uint32_t thread_id = pir_to_thread_id(cpu->pir);
- uint32_t dctl_addr;
-
- dctl_addr = XSCOM_ADDR_P9_EC(core_id, P9_EC_DIRECT_CONTROLS);
- if (xscom_write(chip_id, dctl_addr, P9_THREAD_CONT(thread_id))) {
- prlog(PR_ERR, "Could not resume thread %u:%u:%u:"
- " Unable to write EC_DIRECT_CONTROLS.\n",
- chip_id, core_id, thread_id);
- }
-
- return 0;
-}
static int p9_sreset_thread(struct cpu_thread *cpu)
{
--
2.17.0
More information about the Skiboot
mailing list