[Skiboot] [PATCH v2 3/3] lock: Add additional lock auditing code

Benjamin Herrenschmidt benh at kernel.crashing.org
Wed Dec 20 13:16:23 AEDT 2017


Keep track of the lock owner name and replace the lock_depth counter
with a per-cpu list of locks held by the CPU.

This allows us to print the actual locks held when we hit the
(in)famous warning about opal_run_pollers() being called with a
lock held.

It also allows us to warn about (and drop) any locks still held
when returning to the OS or completing a scheduled job.

Signed-off-by: Benjamin Herrenschmidt <benh at kernel.crashing.org>
---

v2. Move the list_del() in unlock() to before we clear the
    lock value; otherwise we can race with a concurrent locker:
    once lock_val is cleared, another CPU can immediately take
    the lock and list_add() it to its own held-locks list while
    we are still deleting it from ours.
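
As a quick illustration of the caller tracking (a sketch only; the
lock name, file and line number below are hypothetical):

	/* A call like this at, say, core/foo.c line 42 ... */
	lock(&req_lock);

	/* ... now expands via the LOCK_CALLER macro to: */
	lock_caller(&req_lock, "core/foo.c:42");

	/* "core/foo.c:42" is stored in the lock's owner field and is
	 * what dump_locks_list() prints if the lock is still held. */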
---
 core/cpu.c      |  6 ++++++
 core/lock.c     | 41 ++++++++++++++++++++++++++++++++---------
 core/opal.c     |  8 +++++++-
 core/stack.c    |  1 +
 core/timebase.c |  2 +-
 include/cpu.h   |  2 +-
 include/lock.h  | 32 +++++++++++++++++++++++++++-----
 7 files changed, 75 insertions(+), 17 deletions(-)
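
For reference, the poller warning now reads along these lines (the
owners shown are hypothetical, assuming two locks are held when
opal_run_pollers() is called):

	Running pollers with lock held !
	Locks held:
	  core/foo.c:123
	  core/bar.c:456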

diff --git a/core/cpu.c b/core/cpu.c
index 7dd7c863..3e110c3c 100644
--- a/core/cpu.c
+++ b/core/cpu.c
@@ -285,6 +285,11 @@ void cpu_process_jobs(void)
 		if (no_return)
 			free(job);
 		func(data);
+		if (!list_empty(&cpu->locks_held)) {
+			prlog(PR_ERR, "OPAL job %s returning with locks held\n",
+			      no_return ? "<freed>" : job->name);
+			drop_my_locks(true);
+		}
 		lock(&cpu->job_lock);
 		if (!no_return) {
 			cpu->job_count--;
@@ -822,6 +827,7 @@ static void init_cpu_thread(struct cpu_thread *t,
 	init_lock(&t->dctl_lock);
 	init_lock(&t->job_lock);
 	list_head_init(&t->job_queue);
+	list_head_init(&t->locks_held);
 	t->stack_guard = STACK_CHECK_GUARD_BASE ^ pir;
 	t->state = state;
 	t->pir = pir;
diff --git a/core/lock.c b/core/lock.c
index b5e3323f..edfe1c7e 100644
--- a/core/lock.c
+++ b/core/lock.c
@@ -57,8 +57,8 @@ static void unlock_check(struct lock *l)
 	if (l->in_con_path && this_cpu()->con_suspend == 0)
 		lock_error(l, "Unlock con lock with console not suspended", 3);
 
-	if (this_cpu()->lock_depth == 0)
-		lock_error(l, "Releasing lock with 0 depth", 4);
+	if (list_empty(&this_cpu()->locks_held))
+		lock_error(l, "Releasing lock with empty held list", 4);
 }
 
 #else
@@ -89,7 +89,7 @@ static inline bool __try_lock(struct cpu_thread *cpu, struct lock *l)
 	return false;
 }
 
-bool try_lock(struct lock *l)
+bool try_lock_caller(struct lock *l, const char *owner)
 {
 	struct cpu_thread *cpu = this_cpu();
 
@@ -97,22 +97,23 @@ bool try_lock(struct lock *l)
 		return true;
 
 	if (__try_lock(cpu, l)) {
+		l->owner = owner;
 		if (l->in_con_path)
 			cpu->con_suspend++;
-		cpu->lock_depth++;
+		list_add(&cpu->locks_held, &l->list);
 		return true;
 	}
 	return false;
 }
 
-void lock(struct lock *l)
+void lock_caller(struct lock *l, const char *owner)
 {
 	if (bust_locks)
 		return;
 
 	lock_check(l);
 	for (;;) {
-		if (try_lock(l))
+		if (try_lock_caller(l, owner))
 			break;
 		smt_lowest();
 		while (l->lock_val)
@@ -130,8 +131,9 @@ void unlock(struct lock *l)
 
 	unlock_check(l);
 
+	l->owner = NULL;
+	list_del(&l->list);
 	lwsync();
-	this_cpu()->lock_depth--;
 	l->lock_val = 0;
 
 	/* WARNING: On fast reboot, we can be reset right at that
@@ -144,7 +146,7 @@ void unlock(struct lock *l)
 	}
 }
 
-bool lock_recursive(struct lock *l)
+bool lock_recursive_caller(struct lock *l, const char *caller)
 {
 	if (bust_locks)
 		return false;
@@ -152,7 +154,7 @@ bool lock_recursive(struct lock *l)
 	if (lock_held_by_me(l))
 		return false;
 
-	lock(l);
+	lock_caller(l, caller);
 	return true;
 }
 
@@ -160,3 +162,24 @@ void init_locks(void)
 {
 	bust_locks = false;
 }
+
+void dump_locks_list(void)
+{
+	struct lock *l;
+
+	prlog(PR_ERR, "Locks held:\n");
+	list_for_each(&this_cpu()->locks_held, l, list)
+		prlog(PR_ERR, "  %s\n", l->owner);
+}
+
+void drop_my_locks(bool warn)
+{
+	struct lock *l;
+
+	while ((l = list_pop(&this_cpu()->locks_held, struct lock, list)) != NULL) {
+		if (warn)
+			prlog(PR_ERR, "  %s\n", l->owner);
+		unlock(l);
+	}
+}
+
diff --git a/core/opal.c b/core/opal.c
index d33d527c..1bca774c 100644
--- a/core/opal.c
+++ b/core/opal.c
@@ -182,6 +182,11 @@ int64_t opal_exit_check(int64_t retval, struct stack_frame *eframe)
 		printf("CPU UN-ACCOUNTED FIRMWARE ENTRY! PIR=%04lx cpu @%p -> pir=%04x token=%llu retval=%lld\n",
 		       mfspr(SPR_PIR), cpu, cpu->pir, token, retval);
 	} else {
+		if (!list_empty(&cpu->locks_held)) {
+			prlog(PR_ERR, "OPAL exiting with locks held, token=%llu retval=%lld\n",
+			      token, retval);
+			drop_my_locks(true);
+		}
 		sync(); /* release barrier vs quiescing */
 		cpu->in_opal_call--;
 	}
@@ -557,7 +562,7 @@ void opal_run_pollers(void)
 	}
 	this_cpu()->in_poller = true;
 
-	if (this_cpu()->lock_depth && pollers_with_lock_warnings < 64) {
+	if (!list_empty(&this_cpu()->locks_held) && pollers_with_lock_warnings < 64) {
 		/**
 		 * @fwts-label OPALPollerWithLock
 		 * @fwts-advice opal_run_pollers() was called with a lock
@@ -565,6 +570,7 @@ void opal_run_pollers(void)
 		 * lucky/careful.
 		 */
 		prlog(PR_ERR, "Running pollers with lock held !\n");
+		dump_locks_list();
 		backtrace();
 		pollers_with_lock_warnings++;
 		if (pollers_with_lock_warnings == 64) {
diff --git a/core/stack.c b/core/stack.c
index bb7fa625..af4d37d0 100644
--- a/core/stack.c
+++ b/core/stack.c
@@ -21,6 +21,7 @@
 #include <stack.h>
 #include <mem_region.h>
 #include <unistd.h>
+#include <lock.h>
 
 #define STACK_BUF_ENTRIES	60
 static struct bt_entry bt_buf[STACK_BUF_ENTRIES];
diff --git a/core/timebase.c b/core/timebase.c
index f2dff44b..777e4ba6 100644
--- a/core/timebase.c
+++ b/core/timebase.c
@@ -53,7 +53,7 @@ void time_wait(unsigned long duration)
 {
 	struct cpu_thread *c = this_cpu();
 
-	if (this_cpu()->lock_depth) {
+	if (!list_empty(&this_cpu()->locks_held)) {
 		time_wait_nopoll(duration);
 		return;
 	}
diff --git a/include/cpu.h b/include/cpu.h
index e3bc75f4..bec4b030 100644
--- a/include/cpu.h
+++ b/include/cpu.h
@@ -61,8 +61,8 @@ struct cpu_thread {
 	uint64_t			save_r1;
 	void				*icp_regs;
 	uint32_t			in_opal_call;
-	uint32_t			lock_depth;
 	uint32_t			con_suspend;
+	struct list_head		locks_held;
 	bool				con_need_flush;
 	bool				quiesce_opal_call;
 	bool				in_mcount;
diff --git a/include/lock.h b/include/lock.h
index 7bdfca28..061873ab 100644
--- a/include/lock.h
+++ b/include/lock.h
@@ -20,6 +20,7 @@
 #include <stdbool.h>
 #include <processor.h>
 #include <cmpxchg.h>
+#include <ccan/list/list.h>
 
 struct lock {
 	/* Lock value has bit 63 as lock bit and the PIR of the owner
@@ -32,10 +33,19 @@ struct lock {
 	 * in which case taking it will suspend console flushing
 	 */
 	bool in_con_path;
+
+	/* file/line of lock owner */
+	const char *owner;
+
+	/* linkage in per-cpu list of owned locks */
+	struct list_node list;
 };
 
-/* Initializer */
-#define LOCK_UNLOCKED	{ .lock_val = 0, .in_con_path = 0 }
+/* Initializer... not ideal but works for now. If we need different
+ * values for the fields and/or start getting warnings we'll have to
+ * play macro tricks
+ */
+#define LOCK_UNLOCKED	{ 0 }
 
 /* Note vs. libc and locking:
  *
@@ -65,8 +75,14 @@ static inline void init_lock(struct lock *l)
 	*l = (struct lock)LOCK_UNLOCKED;
 }
 
-extern bool try_lock(struct lock *l);
-extern void lock(struct lock *l);
+#define LOCK_CALLER	__FILE__ ":" stringify(__LINE__)
+
+#define try_lock(l)		try_lock_caller(l, LOCK_CALLER)
+#define lock(l)			lock_caller(l, LOCK_CALLER)
+#define lock_recursive(l)	lock_recursive_caller(l, LOCK_CALLER)
+
+extern bool try_lock_caller(struct lock *l, const char *caller);
+extern void lock_caller(struct lock *l, const char *caller);
 extern void unlock(struct lock *l);
 
 extern bool lock_held_by_me(struct lock *l);
@@ -77,9 +93,15 @@ extern bool lock_held_by_me(struct lock *l);
  * returns false if the lock was already held by this cpu. If it returns
  * true, then the caller shall release it when done.
  */
-extern bool lock_recursive(struct lock *l);
+extern bool lock_recursive_caller(struct lock *l, const char *caller);
 
 /* Called after per-cpu data structures are available */
 extern void init_locks(void);
 
+/* Dump the list of locks held by this CPU */
+extern void dump_locks_list(void);
+
+/* Clean all locks held by CPU (and warn if any) */
+extern void drop_my_locks(bool warn);
+
 #endif /* __LOCK_H */
-- 
2.14.3


