[RFC PATCH v2 1/3] sched/fair: introduce new scheduler group type group_parked

Tobias Huschle huschle at linux.ibm.com
Mon Feb 17 22:32:50 AEDT 2025


A parked CPU is flagged as unsuitable to process workload at the
moment, but it might become usable again at any time, depending on
the need for additional compute capacity and/or the available
capacity of the underlying hardware.

A scheduler group is considered parked if tasks are queued on its
parked CPUs and it has no idle CPUs, i.e. all non-parked CPUs are
busy or the group consists only of parked CPUs. A scheduler group
with parked tasks is not considered parked if it has idle CPUs which
can pick up those tasks. Of two parked scheduler groups, the one
running more tasks on parked CPUs is considered busier.
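
For reference, the group classification and the "busier" comparison
described above reduce to the following checks, added to
group_classify() and update_sd_pick_busiest() in the diff below:

	/* group_classify(): tasks are stuck on parked CPUs, nobody idle */
	if (sgs->sum_nr_parked && !sgs->idle_cpus)
		return group_parked;

	/* update_sd_pick_busiest(): more parked tasks means busier */
	case group_parked:
		return sgs->sum_nr_parked > busiest->sum_nr_parked;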

A parked CPU must keep its scheduler tick running (or have it
re-enabled if necessary). Otherwise, a parked CPU running only a
single task which does not give up its runtime voluntarily would
enter NO_HZ and never be evacuated.
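
The corresponding check in sched_can_stop_tick() (see below) keeps
the tick alive on a parked CPU for as long as CFS tasks are queued
on it:

	/* A parked CPU with queued work must keep ticking to be evacuated. */
	if (rq->cfs.nr_running > 0 && arch_cpu_parked(cpu_of(rq)))
		return false;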

The status of the underlying hardware is architecture dependent.
Therefore, the check whether a CPU is parked is architecture
specific. For architectures not relying on this feature, the check
is a NOP (the default implementation always returns false).
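
As an illustration only (not part of this series), an architecture
could override the hook roughly as follows; cpu_parked_mask is a
made-up per-architecture cpumask that platform code would update
whenever the hardware hints change:

	/* hypothetical override, e.g. in an arch's asm/topology.h */
	#define arch_cpu_parked arch_cpu_parked
	static inline bool arch_cpu_parked(int cpu)
	{
		return cpumask_test_cpu(cpu, &cpu_parked_mask);
	}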

This is more efficient and less disruptive than CPU hotplug in
environments where such changes may be necessary on a frequent
basis.

Signed-off-by: Tobias Huschle <huschle at linux.ibm.com>
---
 include/linux/sched/topology.h | 19 ++++++++
 kernel/sched/core.c            | 13 ++++-
 kernel/sched/fair.c            | 86 +++++++++++++++++++++++++++++-----
 kernel/sched/syscalls.c        |  3 ++
 4 files changed, 109 insertions(+), 12 deletions(-)

diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7f3dbafe1817..2a4730729988 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -265,6 +265,25 @@ unsigned long arch_scale_cpu_capacity(int cpu)
 }
 #endif
 
+#ifndef arch_cpu_parked
+/**
+ * arch_cpu_parked - Check if a given CPU is currently parked.
+ *
+ * A parked CPU cannot run any kind of workload since the underlying
+ * physical CPU should not be used at the moment.
+ *
+ * @cpu: the CPU in question.
+ *
+ * By default, assume the CPU is not parked.
+ *
+ * Return: Parked state of the CPU.
+ */
+static __always_inline bool arch_cpu_parked(int cpu)
+{
+	return false;
+}
+#endif
+
 #ifndef arch_scale_hw_pressure
 static __always_inline
 unsigned long arch_scale_hw_pressure(int cpu)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 165c90ba64ea..9ed15911ec60 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1352,6 +1352,9 @@ bool sched_can_stop_tick(struct rq *rq)
 	if (rq->cfs.h_nr_queued > 1)
 		return false;
 
+	if (rq->cfs.nr_running > 0 && arch_cpu_parked(cpu_of(rq)))
+		return false;
+
 	/*
 	 * If there is one task and it has CFS runtime bandwidth constraints
 	 * and it's on the cpu now we don't want to stop the tick.
@@ -2443,7 +2446,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
 
 	/* Non kernel threads are not allowed during either online or offline. */
 	if (!(p->flags & PF_KTHREAD))
-		return cpu_active(cpu);
+		return !arch_cpu_parked(cpu) && cpu_active(cpu);
 
 	/* KTHREAD_IS_PER_CPU is always allowed. */
 	if (kthread_is_per_cpu(p))
@@ -2453,6 +2456,10 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
 	if (cpu_dying(cpu))
 		return false;
 
+	/* CPU should be avoided at the moment */
+	if (arch_cpu_parked(cpu))
+		return false;
+
 	/* But are allowed during online. */
 	return cpu_online(cpu);
 }
@@ -3930,6 +3937,10 @@ static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
 	if (task_on_scx(p))
 		return false;
 
+	/* The task should not be queued onto a parked CPU. */
+	if (arch_cpu_parked(cpu))
+		return false;
+
 	/*
 	 * Do not complicate things with the async wake_list while the CPU is
 	 * in hotplug state.
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1c0ef435a7aa..5eb1a3113704 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6871,6 +6871,8 @@ static int sched_idle_rq(struct rq *rq)
 #ifdef CONFIG_SMP
 static int sched_idle_cpu(int cpu)
 {
+	if (arch_cpu_parked(cpu))
+		return 0;
 	return sched_idle_rq(cpu_rq(cpu));
 }
 #endif
@@ -7399,6 +7401,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 {
 	int target = nr_cpumask_bits;
 
+	if (arch_cpu_parked(target))
+		return prev_cpu;
+
 	if (sched_feat(WA_IDLE))
 		target = wake_affine_idle(this_cpu, prev_cpu, sync);
 
@@ -9182,7 +9187,12 @@ enum group_type {
 	 * The CPU is overloaded and can't provide expected CPU cycles to all
 	 * tasks.
 	 */
-	group_overloaded
+	group_overloaded,
+	/*
+	 * The CPU should be avoided as it can't provide expected CPU cycles
+	 * even for small amounts of workload.
+	 */
+	group_parked
 };
 
 enum migration_type {
@@ -9902,6 +9912,7 @@ struct sg_lb_stats {
 	unsigned long group_runnable;		/* Total runnable time over the CPUs of the group */
 	unsigned int sum_nr_running;		/* Nr of all tasks running in the group */
 	unsigned int sum_h_nr_running;		/* Nr of CFS tasks running in the group */
+	unsigned int sum_nr_parked;
 	unsigned int idle_cpus;                 /* Nr of idle CPUs         in the group */
 	unsigned int group_weight;
 	enum group_type group_type;
@@ -10159,6 +10170,9 @@ group_type group_classify(unsigned int imbalance_pct,
 			  struct sched_group *group,
 			  struct sg_lb_stats *sgs)
 {
+	if (sgs->sum_nr_parked && !sgs->idle_cpus)
+		return group_parked;
+
 	if (group_is_overloaded(imbalance_pct, sgs))
 		return group_overloaded;
 
@@ -10354,6 +10368,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		if (cpu_overutilized(i))
 			*sg_overutilized = 1;
 
+		sgs->sum_nr_parked += arch_cpu_parked(i) * rq->cfs.h_nr_queued;
+
 		/*
 		 * No need to call idle_cpu() if nr_running is not 0
 		 */
@@ -10459,6 +10475,8 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	 */
 
 	switch (sgs->group_type) {
+	case group_parked:
+		return sgs->sum_nr_parked > busiest->sum_nr_parked;
 	case group_overloaded:
 		/* Select the overloaded group with highest avg_load. */
 		return sgs->avg_load > busiest->avg_load;
@@ -10622,6 +10640,9 @@ static int idle_cpu_without(int cpu, struct task_struct *p)
 {
 	struct rq *rq = cpu_rq(cpu);
 
+	if (arch_cpu_parked(cpu))
+		return 0;
+
 	if (rq->curr != rq->idle && rq->curr != p)
 		return 0;
 
@@ -10670,6 +10691,8 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
 		nr_running = rq->nr_running - local;
 		sgs->sum_nr_running += nr_running;
 
+		sgs->sum_nr_parked += arch_cpu_parked(i) * rq->cfs.h_nr_queued;
+
 		/*
 		 * No need to call idle_cpu_without() if nr_running is not 0
 		 */
@@ -10717,6 +10740,8 @@ static bool update_pick_idlest(struct sched_group *idlest,
 	 */
 
 	switch (sgs->group_type) {
+	case group_parked:
+		return false;
 	case group_overloaded:
 	case group_fully_busy:
 		/* Select the group with lowest avg_load. */
@@ -10767,7 +10792,7 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int
 	unsigned long imbalance;
 	struct sg_lb_stats idlest_sgs = {
 			.avg_load = UINT_MAX,
-			.group_type = group_overloaded,
+			.group_type = group_parked,
 	};
 
 	do {
@@ -10825,6 +10850,8 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int
 		return idlest;
 
 	switch (local_sgs.group_type) {
+	case group_parked:
+		return idlest;
 	case group_overloaded:
 	case group_fully_busy:
 
@@ -11076,6 +11103,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	local = &sds->local_stat;
 	busiest = &sds->busiest_stat;
 
+	if (busiest->group_type == group_parked) {
+		env->migration_type = migrate_task;
+		env->imbalance = busiest->sum_nr_parked;
+		return;
+	}
+
 	if (busiest->group_type == group_misfit_task) {
 		if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
 			/* Set imbalance to allow misfit tasks to be balanced. */
@@ -11244,13 +11277,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 /*
  * Decision matrix according to the local and busiest group type:
  *
- * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
- * has_spare        nr_idle   balanced   N/A    N/A  balanced   balanced
- * fully_busy       nr_idle   nr_idle    N/A    N/A  balanced   balanced
- * misfit_task      force     N/A        N/A    N/A  N/A        N/A
- * asym_packing     force     force      N/A    N/A  force      force
- * imbalanced       force     force      N/A    N/A  force      force
- * overloaded       force     force      N/A    N/A  force      avg_load
+ * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded parked
+ * has_spare        nr_idle   balanced   N/A    N/A  balanced   balanced  balanced
+ * fully_busy       nr_idle   nr_idle    N/A    N/A  balanced   balanced  balanced
+ * misfit_task      force     N/A        N/A    N/A  N/A        N/A       N/A
+ * asym_packing     force     force      N/A    N/A  force      force     balanced
+ * imbalanced       force     force      N/A    N/A  force      force     balanced
+ * overloaded       force     force      N/A    N/A  force      avg_load  balanced
+ * parked           force     force      N/A    N/A  force      force     balanced
  *
  * N/A :      Not Applicable because already filtered while updating
  *            statistics.
@@ -11259,6 +11293,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
  * avg_load : Only if imbalance is significant enough.
  * nr_idle :  dst_cpu is not busy and the number of idle CPUs is quite
  *            different in groups.
+ * nr_task :  balancing can go either way depending on the number of running tasks
+ *            per group
  */
 
 /**
@@ -11289,6 +11325,13 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
 		goto out_balanced;
 
 	busiest = &sds.busiest_stat;
+	local = &sds.local_stat;
+
+	if (local->group_type == group_parked)
+		goto out_balanced;
+
+	if (busiest->group_type == group_parked)
+		goto force_balance;
 
 	/* Misfit tasks should be dealt with regardless of the avg load */
 	if (busiest->group_type == group_misfit_task)
@@ -11310,7 +11353,6 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env)
 	if (busiest->group_type == group_imbalanced)
 		goto force_balance;
 
-	local = &sds.local_stat;
 	/*
 	 * If the local group is busier than the selected busiest group
 	 * don't try and pull any tasks.
@@ -11423,6 +11465,9 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env,
 		enum fbq_type rt;
 
 		rq = cpu_rq(i);
+		if (arch_cpu_parked(i) && rq->cfs.h_nr_queued)
+			return rq;
+
 		rt = fbq_classify_rq(rq);
 
 		/*
@@ -11593,6 +11638,9 @@ static int need_active_balance(struct lb_env *env)
 {
 	struct sched_domain *sd = env->sd;
 
+	if (arch_cpu_parked(env->src_cpu) && cpu_rq(env->src_cpu)->cfs.h_nr_queued)
+		return 1;
+
 	if (asym_active_balance(env))
 		return 1;
 
@@ -11626,6 +11674,14 @@ static int should_we_balance(struct lb_env *env)
 	struct sched_group *sg = env->sd->groups;
 	int cpu, idle_smt = -1;
 
+	if (arch_cpu_parked(env->dst_cpu))
+		return 0;
+
+	for_each_cpu(cpu, sched_domain_span(env->sd)) {
+		if (arch_cpu_parked(cpu) && cpu_rq(cpu)->cfs.h_nr_queued)
+			return 1;
+	}
+
 	/*
 	 * Ensure the balancing environment is consistent; can happen
 	 * when the softirq triggers 'during' hotplug.
@@ -11766,7 +11822,7 @@ static int sched_balance_rq(int this_cpu, struct rq *this_rq,
 	ld_moved = 0;
 	/* Clear this flag as soon as we find a pullable task */
 	env.flags |= LBF_ALL_PINNED;
-	if (busiest->nr_running > 1) {
+	if (busiest->nr_running > 1 || arch_cpu_parked(busiest->cpu)) {
 		/*
 		 * Attempt to move tasks. If sched_balance_find_src_group has found
 		 * an imbalance but busiest->nr_running <= 1, the group is
@@ -12356,6 +12412,11 @@ static void nohz_balancer_kick(struct rq *rq)
 	if (time_before(now, nohz.next_balance))
 		goto out;
 
+	if (!idle_cpu(rq->cpu)) {
+		flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
+		goto out;
+	}
+
 	if (rq->nr_running >= 2) {
 		flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
 		goto out;
@@ -12767,6 +12828,9 @@ static int sched_balance_newidle(struct rq *this_rq, struct rq_flags *rf)
 
 	update_misfit_status(NULL, this_rq);
 
+	if (arch_cpu_parked(this_cpu))
+		return 0;
+
 	/*
 	 * There is a task waiting to run. No need to search for one.
 	 * Return 0; the task will be enqueued when switching to idle.
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index 456d339be98f..7efd76a30be7 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -214,6 +214,9 @@ int idle_cpu(int cpu)
 		return 0;
 #endif
 
+	if (arch_cpu_parked(cpu))
+		return 0;
+
 	return 1;
 }
 
-- 
2.34.1


