[PATCH v2 17/17] powerpc/qspinlock: provide accounting and options for sleepy locks

Nicholas Piggin <npiggin@gmail.com>
Mon Nov 14 13:31:37 AEDT 2022


Finding the lock owner or a queued waiter running on a preempted
vcpu indicates an oversubscribed guest that is causing the lock to
get into trouble. Provide some options to detect this situation and
have new CPUs avoid queueing for a longer time (more steal
iterations), to minimise the problems caused by vcpu preemption of
queued waiters, as sketched below.
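
In rough terms (a condensed sketch using this patch's names;
lock_is_sleepy() is invented for the sketch, the real code
open-codes the check in both spin loops):

    /*
     * A lock is treated as "sleepy" once a preempted owner or
     * queued waiter has been observed: either the _Q_SLEEPY_VAL bit
     * is already set in the lock word, or this CPU saw a preempted
     * vcpu within the last pv_sleepy_lock_interval_ns.
     */
    static bool lock_is_sleepy(u32 val)
    {
        if (val & _Q_SLEEPY_VAL)
            return true;
        return recently_sleepy();
    }

    /* Sleepy locks get a scaled-up spin budget before queueing. */
    static int get_steal_spins(bool paravirt, bool sleepy)
    {
        if (paravirt && sleepy)
            return steal_spins * pv_sleepy_lock_factor;
        return steal_spins;
    }

The same scaling applies to remote_steal_spins and head_spins, so
with the defaults below (head_spins = 1<<8, pv_sleepy_lock_factor =
256) a sleepy lock spins 256 * 256 = 65536 iterations at the head of
the queue before setting the must-queue bit.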

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
---
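All the new heuristics are runtime-tunable via debugfs. The files
are created under arch_debugfs_dir, which normally appears at
/sys/kernel/debug/powerpc (the mount point is an assumption; debugfs
must be mounted and root access is required). A minimal userspace
sketch for flipping one knob:

    #include <stdio.h>

    int main(void)
    {
        /* Path assumes debugfs is mounted at /sys/kernel/debug. */
        const char *knob =
            "/sys/kernel/debug/powerpc/qspl_pv_sleepy_lock";
        FILE *f = fopen(knob, "w");

        if (!f) {
            perror(knob);
            return 1;
        }
        fputs("1\n", f); /* "1" enables, "0" disables */
        fclose(f);
        return 0;
    }

The same pattern drives qspl_pv_sleepy_lock_sticky,
qspl_pv_sleepy_lock_interval_ns and qspl_pv_sleepy_lock_factor; all
of them take plain decimal values per their "%llu\n" attribute
format.
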
 arch/powerpc/include/asm/qspinlock_types.h |   7 +-
 arch/powerpc/lib/qspinlock.c               | 244 +++++++++++++++++++--
 2 files changed, 232 insertions(+), 19 deletions(-)
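
A note on the qspinlock_types.h change: _Q_SLEEPY_MASK is built with
the file's existing _Q_SET_MASK() token-pasting helper, so its
argument must be SLEEPY to line up with the _Q_SLEEPY_* defines.
Assuming the helper keeps its current definition, a standalone check
that the mask and the value name the same bit:

    #include <assert.h>

    /* Assumed to match the helper already in qspinlock_types.h. */
    #define _Q_SET_MASK(type) (((1U << _Q_ ## type ## _BITS) - 1) \
                               << _Q_ ## type ## _OFFSET)

    #define _Q_SLEEPY_OFFSET 15
    #define _Q_SLEEPY_BITS   1
    #define _Q_SLEEPY_MASK   _Q_SET_MASK(SLEEPY)
    #define _Q_SLEEPY_VAL    (1U << _Q_SLEEPY_OFFSET)

    int main(void)
    {
        assert(_Q_SLEEPY_MASK == 0x8000u);
        assert(_Q_SLEEPY_MASK == _Q_SLEEPY_VAL);
        return 0;
    }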

diff --git a/arch/powerpc/include/asm/qspinlock_types.h b/arch/powerpc/include/asm/qspinlock_types.h
index 35f9525381e6..4fbcc8a4230b 100644
--- a/arch/powerpc/include/asm/qspinlock_types.h
+++ b/arch/powerpc/include/asm/qspinlock_types.h
@@ -30,7 +30,7 @@ typedef struct qspinlock {
  *
  *     0: locked bit
  *  1-14: lock holder cpu
- *    15: unused bit
+ *    15: lock owner or queuer vcpus observed to be preempted bit
  *    16: must queue bit
  * 17-31: tail cpu (+1)
  */
@@ -49,6 +49,11 @@ typedef struct qspinlock {
 #error "qspinlock does not support such large CONFIG_NR_CPUS"
 #endif
 
+#define _Q_SLEEPY_OFFSET	15
+#define _Q_SLEEPY_BITS		1
+#define _Q_SLEEPY_MASK		_Q_SET_MASK(SLEEPY)
+#define _Q_SLEEPY_VAL		(1U << _Q_SLEEPY_OFFSET)
+
 #define _Q_MUST_Q_OFFSET	16
 #define _Q_MUST_Q_BITS		1
 #define _Q_MUST_Q_MASK		_Q_SET_MASK(MUST_Q)
diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c
index 7e6ab1f30d50..36afdfde41aa 100644
--- a/arch/powerpc/lib/qspinlock.c
+++ b/arch/powerpc/lib/qspinlock.c
@@ -5,6 +5,7 @@
 #include <linux/percpu.h>
 #include <linux/smp.h>
 #include <linux/topology.h>
+#include <linux/sched/clock.h>
 #include <asm/qspinlock.h>
 #include <asm/paravirt.h>
 
@@ -36,25 +37,56 @@ static int head_spins __read_mostly = (1<<8);
 static bool pv_yield_owner __read_mostly = true;
 static bool pv_yield_allow_steal __read_mostly = false;
 static bool pv_spin_on_preempted_owner __read_mostly = false;
+static bool pv_sleepy_lock __read_mostly = true;
+static bool pv_sleepy_lock_sticky __read_mostly = false;
+static u64 pv_sleepy_lock_interval_ns __read_mostly = 0;
+static int pv_sleepy_lock_factor __read_mostly = 256;
 static bool pv_yield_prev __read_mostly = true;
 static bool pv_yield_propagate_owner __read_mostly = true;
 static bool pv_prod_head __read_mostly = false;
 
 static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes);
+static DEFINE_PER_CPU_ALIGNED(u64, sleepy_lock_seen_clock);
 
-static __always_inline int get_steal_spins(bool paravirt)
+static __always_inline bool recently_sleepy(void)
 {
-	return steal_spins;
+	/* pv_sleepy_lock is true when this is called */
+	if (pv_sleepy_lock_interval_ns) {
+		u64 seen = this_cpu_read(sleepy_lock_seen_clock);
+
+		if (seen) {
+			u64 delta = sched_clock() - seen;
+			if (delta < pv_sleepy_lock_interval_ns)
+				return true;
+			this_cpu_write(sleepy_lock_seen_clock, 0);
+		}
+	}
+
+	return false;
 }
 
-static __always_inline int get_remote_steal_spins(bool paravirt)
+static __always_inline int get_steal_spins(bool paravirt, bool sleepy)
 {
-	return remote_steal_spins;
+	if (paravirt && sleepy)
+		return steal_spins * pv_sleepy_lock_factor;
+	else
+		return steal_spins;
 }
 
-static __always_inline int get_head_spins(bool paravirt)
+static __always_inline int get_remote_steal_spins(bool paravirt, bool sleepy)
 {
-	return head_spins;
+	if (paravirt && sleepy)
+		return remote_steal_spins * pv_sleepy_lock_factor;
+	else
+		return remote_steal_spins;
+}
+
+static __always_inline int get_head_spins(bool paravirt, bool sleepy)
+{
+	if (paravirt && sleepy)
+		return head_spins * pv_sleepy_lock_factor;
+	else
+		return head_spins;
 }
 
 static inline u32 encode_tail_cpu(int cpu)
@@ -187,6 +219,56 @@ static __always_inline u32 clear_mustq(struct qspinlock *lock)
 	return prev;
 }
 
+static __always_inline bool try_set_sleepy(struct qspinlock *lock, u32 old)
+{
+	u32 prev;
+	u32 new = old | _Q_SLEEPY_VAL;
+
+	BUG_ON(!(old & _Q_LOCKED_VAL));
+	BUG_ON(old & _Q_SLEEPY_VAL);
+
+	asm volatile(
+"1:	lwarx	%0,0,%1		# try_set_sleepy			\n"
+"	cmpw	0,%0,%2							\n"
+"	bne-	2f							\n"
+"	stwcx.	%3,0,%1							\n"
+"	bne-	1b							\n"
+"2:									\n"
+	: "=&r" (prev)
+	: "r" (&lock->val), "r"(old), "r" (new)
+	: "cr0", "memory");
+
+	return likely(prev == old);
+}
+
+static __always_inline void seen_sleepy_owner(struct qspinlock *lock, u32 val)
+{
+	if (pv_sleepy_lock) {
+		if (pv_sleepy_lock_interval_ns)
+			this_cpu_write(sleepy_lock_seen_clock, sched_clock());
+		if (!(val & _Q_SLEEPY_VAL))
+			try_set_sleepy(lock, val);
+	}
+}
+
+static __always_inline void seen_sleepy_lock(void)
+{
+	if (pv_sleepy_lock && pv_sleepy_lock_interval_ns)
+		this_cpu_write(sleepy_lock_seen_clock, sched_clock());
+}
+
+static __always_inline void seen_sleepy_node(struct qspinlock *lock, u32 val)
+{
+	if (pv_sleepy_lock) {
+		if (pv_sleepy_lock_interval_ns)
+			this_cpu_write(sleepy_lock_seen_clock, sched_clock());
+		if (val & _Q_LOCKED_VAL) {
+			if (!(val & _Q_SLEEPY_VAL))
+				try_set_sleepy(lock, val);
+		}
+	}
+}
+
 static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val)
 {
 	int cpu = decode_tail_cpu(val);
@@ -234,6 +316,7 @@ static __always_inline bool __yield_to_locked_owner(struct qspinlock *lock, u32
 
 	spin_end();
 
+	seen_sleepy_owner(lock, val);
 	preempted = true;
 
 	/*
@@ -308,11 +391,12 @@ static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int
 }
 
 /* Called inside spin_begin() */
-static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *node, u32 val, bool paravirt)
+static __always_inline bool yield_to_prev(struct qspinlock *lock, struct qnode *node, u32 val, bool paravirt)
 {
 	int prev_cpu = decode_tail_cpu(val);
 	u32 yield_count;
 	int yield_cpu;
+	bool preempted = false;
 
 	if (!paravirt)
 		goto relax;
@@ -334,6 +418,9 @@ static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *
 
 	spin_end();
 
+	preempted = true;
+	seen_sleepy_node(lock, val);
+
 	smp_rmb();
 
 	if (yield_cpu == node->yield_cpu) {
@@ -341,7 +428,7 @@ static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *
 			node->next->yield_cpu = yield_cpu;
 		yield_to_preempted(yield_cpu, yield_count);
 		spin_begin();
-		return;
+		return preempted;
 	}
 	spin_begin();
 
@@ -355,26 +442,31 @@ static __always_inline void yield_to_prev(struct qspinlock *lock, struct qnode *
 
 	spin_end();
 
+	preempted = true;
+	seen_sleepy_node(lock, val);
+
 	smp_rmb(); /* See __yield_to_locked_owner comment */
 
 	if (!node->locked) {
 		yield_to_preempted(prev_cpu, yield_count);
 		spin_begin();
-		return;
+		return preempted;
 	}
 	spin_begin();
 
 relax:
 	spin_cpu_relax();
+
+	return preempted;
 }
 
-static __always_inline bool steal_break(u32 val, int iters, bool paravirt)
+static __always_inline bool steal_break(u32 val, int iters, bool paravirt, bool sleepy)
 {
-	if (iters >= get_steal_spins(paravirt))
+	if (iters >= get_steal_spins(paravirt, sleepy))
 		return true;
 
 	if (IS_ENABLED(CONFIG_NUMA) &&
-			(iters >= get_remote_steal_spins(paravirt))) {
+			(iters >= get_remote_steal_spins(paravirt, sleepy))) {
 		int cpu = get_owner_cpu(val);
 		if (numa_node_id() != cpu_to_node(cpu))
 			return true;
@@ -384,6 +476,8 @@ static __always_inline bool steal_break(u32 val, int iters, bool paravirt)
 
 static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool paravirt)
 {
+	bool seen_preempted = false;
+	bool sleepy = false;
 	int iters = 0;
 	u32 val;
 
@@ -410,7 +504,25 @@ static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool parav
 			preempted = yield_to_locked_owner(lock, val, paravirt);
 		}
 
+		if (paravirt && pv_sleepy_lock) {
+			if (!sleepy) {
+				if (val & _Q_SLEEPY_VAL) {
+					seen_sleepy_lock();
+					sleepy = true;
+				} else if (recently_sleepy()) {
+					sleepy = true;
+				}
+			}
+			if (pv_sleepy_lock_sticky && seen_preempted &&
+					!(val & _Q_SLEEPY_VAL)) {
+				if (try_set_sleepy(lock, val))
+					val |= _Q_SLEEPY_VAL;
+			}
+		}
+
 		if (preempted) {
+			seen_preempted = true;
+			sleepy = true;
 			if (!pv_spin_on_preempted_owner)
 				iters++;
 			/*
@@ -424,7 +536,7 @@ static __always_inline bool try_to_steal_lock(struct qspinlock *lock, bool parav
 		} else {
 			iters++;
 		}
-	} while (!steal_break(val, iters, paravirt));
+	} while (!steal_break(val, iters, paravirt, sleepy));
 
 	spin_end();
 
@@ -436,6 +548,7 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
 	struct qnodes *qnodesp;
 	struct qnode *next, *node;
 	u32 val, old, tail;
+	bool seen_preempted = false;
 	int idx;
 
 	BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS));
@@ -477,8 +590,10 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
 
 		/* Wait for mcs node lock to be released */
 		spin_begin();
-		while (!node->locked)
-			yield_to_prev(lock, node, old, paravirt);
+		while (!node->locked) {
+			if (yield_to_prev(lock, node, old, paravirt))
+				seen_preempted = true;
+		}
 		spin_end();
 
 		/* Clear out stale propagated yield_cpu */
@@ -499,7 +614,8 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
 				break;
 
 			propagate_yield_cpu(node, val, &set_yield_cpu, paravirt);
-			yield_head_to_locked_owner(lock, val, paravirt);
+			if (yield_head_to_locked_owner(lock, val, paravirt))
+				seen_preempted = true;
 		}
 		spin_end();
 
@@ -515,7 +631,9 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
 	} else {
 		int set_yield_cpu = -1;
 		int iters = 0;
+		bool sleepy = false;
 		bool mustq = false;
+		bool preempted;
 
 again:
 		/* We're at the head of the waitqueue, wait for the lock. */
@@ -525,15 +643,37 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b
 			if (!(val & _Q_LOCKED_VAL))
 				break;
 
+			if (paravirt && pv_sleepy_lock) {
+				if (!sleepy) {
+					if (val & _Q_SLEEPY_VAL) {
+						seen_sleepy_lock();
+						sleepy = true;
+					} else if (recently_sleepy()) {
+						sleepy = true;
+					}
+				}
+				if (pv_sleepy_lock_sticky && seen_preempted &&
+						!(val & _Q_SLEEPY_VAL)) {
+					if (try_set_sleepy(lock, val))
+						val |= _Q_SLEEPY_VAL;
+				}
+			}
+
 			propagate_yield_cpu(node, val, &set_yield_cpu, paravirt);
-			if (yield_head_to_locked_owner(lock, val, paravirt)) {
+			preempted = yield_head_to_locked_owner(lock, val, paravirt);
+			if (preempted)
+				seen_preempted = true;
+
+			if (paravirt && preempted) {
+				sleepy = true;
+
 				if (!pv_spin_on_preempted_owner)
 					iters++;
 			} else {
 				iters++;
 			}
 
-			if (!mustq && iters >= get_head_spins(paravirt)) {
+			if (!mustq && iters >= get_head_spins(paravirt, sleepy)) {
 				mustq = true;
 				set_mustq(lock);
 				val |= _Q_MUST_Q_VAL;
@@ -733,6 +873,70 @@ static int pv_spin_on_preempted_owner_get(void *data, u64 *val)
 
 DEFINE_SIMPLE_ATTRIBUTE(fops_pv_spin_on_preempted_owner, pv_spin_on_preempted_owner_get, pv_spin_on_preempted_owner_set, "%llu\n");
 
+static int pv_sleepy_lock_set(void *data, u64 val)
+{
+	pv_sleepy_lock = !!val;
+
+	return 0;
+}
+
+static int pv_sleepy_lock_get(void *data, u64 *val)
+{
+	*val = pv_sleepy_lock;
+
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock, pv_sleepy_lock_get, pv_sleepy_lock_set, "%llu\n");
+
+static int pv_sleepy_lock_sticky_set(void *data, u64 val)
+{
+	pv_sleepy_lock_sticky = !!val;
+
+	return 0;
+}
+
+static int pv_sleepy_lock_sticky_get(void *data, u64 *val)
+{
+	*val = pv_sleepy_lock_sticky;
+
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_sticky, pv_sleepy_lock_sticky_get, pv_sleepy_lock_sticky_set, "%llu\n");
+
+static int pv_sleepy_lock_interval_ns_set(void *data, u64 val)
+{
+	pv_sleepy_lock_interval_ns = val;
+
+	return 0;
+}
+
+static int pv_sleepy_lock_interval_ns_get(void *data, u64 *val)
+{
+	*val = pv_sleepy_lock_interval_ns;
+
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_interval_ns, pv_sleepy_lock_interval_ns_get, pv_sleepy_lock_interval_ns_set, "%llu\n");
+
+static int pv_sleepy_lock_factor_set(void *data, u64 val)
+{
+	pv_sleepy_lock_factor = val;
+
+	return 0;
+}
+
+static int pv_sleepy_lock_factor_get(void *data, u64 *val)
+{
+	*val = pv_sleepy_lock_factor;
+
+	return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(fops_pv_sleepy_lock_factor, pv_sleepy_lock_factor_get, pv_sleepy_lock_factor_set, "%llu\n");
+
 static int pv_yield_prev_set(void *data, u64 val)
 {
 	pv_yield_prev = !!val;
@@ -790,6 +994,10 @@ static __init int spinlock_debugfs_init(void)
 		debugfs_create_file("qspl_pv_yield_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_owner);
 		debugfs_create_file("qspl_pv_yield_allow_steal", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_allow_steal);
 		debugfs_create_file("qspl_pv_spin_on_preempted_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_spin_on_preempted_owner);
+		debugfs_create_file("qspl_pv_sleepy_lock", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock);
+		debugfs_create_file("qspl_pv_sleepy_lock_sticky", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_sticky);
+		debugfs_create_file("qspl_pv_sleepy_lock_interval_ns", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_interval_ns);
+		debugfs_create_file("qspl_pv_sleepy_lock_factor", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_factor);
 		debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev);
 		debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner);
 		debugfs_create_file("qspl_pv_prod_head", 0600, arch_debugfs_dir, NULL, &fops_pv_prod_head);
-- 
2.37.2