[PATCH 5/5] powerpc/smp: Add Power9 scheduler topology

Oliver O'Halloran oohall at gmail.com
Thu Mar 2 11:49:20 AEDT 2017


In previous generations of Power processors, each core had a private L2
cache. The Power9 processor has a slightly different architecture: the L2
cache is shared between the two cores of a pair rather than being
completely private to each core.

Making the scheduler aware of this cache sharing allows it to make more
intelligent migration decisions. When one core in the pair is overloaded,
tasks can be migrated to its paired core to improve throughput without the
cache-refilling penalty typically associated with task migration.
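
For context, the change boils down to inserting a CACHE level between the
SMT and DIE levels of the scheduler topology. The following is just a
condensed view of the power9_topology table added below (the
CONFIG_SCHED_SMT guard is omitted here for brevity):

  static struct sched_domain_topology_level power9_topology[] = {
          /* threads of a single core */
          { cpu_smt_mask,     powerpc_smt_flags,          SD_INIT_NAME(SMT)   },
          /* the two cores sharing an L2 */
          { cpu_cache_mask_c, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
          /* everything else on the die */
          { cpu_cpu_mask,                                 SD_INIT_NAME(DIE)   },
          { NULL, },
  };

Since the CACHE level sets SD_SHARE_PKG_RESOURCES, it becomes the LLC
domain from the scheduler's point of view, so wake-up placement will
prefer idle CPUs within the L2-sharing core pair.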

Signed-off-by: Oliver O'Halloran <oohall at gmail.com>
---
 arch/powerpc/kernel/smp.c | 44 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 5571f30ff72d..5e1811b24415 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -673,7 +673,7 @@ static struct device_node *cpu_to_l2cache(int cpu)
 	return cache;
 }
 
-static bool update_core_mask_by_l2(int cpu, bool onlining)
+static bool update_mask_by_l2(int cpu, bool onlining, struct cpumask *(*mask_fn)(int))
 {
 	const struct cpumask *mask = onlining ? cpu_online_mask : cpu_present_mask;
 	struct device_node *l2_cache, *np;
@@ -689,7 +689,7 @@ static bool update_core_mask_by_l2(int cpu, bool onlining)
 			continue;
 
 		if (np == l2_cache)
-			set_cpus_related(cpu, i, onlining, cpu_core_mask);
+			set_cpus_related(cpu, i, onlining, mask_fn);
 
 		of_node_put(np);
 	}
@@ -724,10 +724,17 @@ static void update_cpu_masks(int cpu, bool onlining)
 
 	update_thread_mask(cpu, onlining);
 
+	/* we need the l2 cache mask for the power9 scheduler topology */
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		update_mask_by_l2(cpu, onlining, cpu_cache_mask);
+
+	/* now build the core mask */
+	set_cpus_related(cpu, cpu, onlining, cpu_core_mask);
+
 	if (update_core_mask_by_chip_id(cpu, onlining))
 		return;
 
-	if (update_core_mask_by_l2(cpu, onlining))
+	if (update_mask_by_l2(cpu, onlining, cpu_core_mask))
 		return;
 
 	/* if all else fails duplicate the sibling mask */
@@ -805,6 +812,32 @@ static struct sched_domain_topology_level powerpc_topology[] = {
 	{ NULL, },
 };
 
+
+/* P9 has a slightly odd architecture where pairs of four-thread cores share
+ * an L2 cache. For highly threaded workloads it makes sense to try to keep
+ * tasks inside the pair for better cache utilisation, so the scheduler needs
+ * to be aware of this. */
+static int powerpc_shared_cache_flags(void)
+{
+	return SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING;
+}
+
+/* this is kind of gross, but passing cpu_cache_mask directly
+ * causes the build to fail due to incompatible pointer types */
+static inline const struct cpumask *cpu_cache_mask_c(int cpu)
+{
+	return cpu_cache_mask(cpu);
+}
+
+static struct sched_domain_topology_level power9_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+	{ cpu_smt_mask, powerpc_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+	{ cpu_cache_mask_c, powerpc_shared_cache_flags, SD_INIT_NAME(CACHE) },
+	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ NULL, },
+};
+
 void __init smp_cpus_done(unsigned int max_cpus)
 {
 	cpumask_var_t old_mask;
@@ -829,7 +862,10 @@ void __init smp_cpus_done(unsigned int max_cpus)
 
 	dump_numa_cpu_topology();
 
-	set_sched_topology(powerpc_topology);
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		set_sched_topology(power9_topology);
+	else
+		set_sched_topology(powerpc_topology);
 
 }
 
-- 
2.9.3
