[PATCH] arm64: topology: Support SMT control on ACPI based system

Fri Oct 20 18:38:38 AEDT 2023

For ACPI we'll build the topology from PPTT and we cannot directly
get the SMT number of each core. Instead, use a temporary xarray
to record the heterogeneous information (from ACPI_PPTT_ACPI_IDENTICAL)
and the SMT information of the first core in each heterogeneous CPU
cluster when building the topology. Then we can know the largest SMT
number in the system. Warn if a heterogeneous SMT topology exists
(multiple heterogeneous CPU clusters with different SMT thread numbers)
since SMT control cannot handle this well. Then enable the support of
SMT control.

Signed-off-by: Yicong Yang <yangyicong at hisilicon.com>
---
 arch/arm64/kernel/topology.c | 60 ++++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
index 1a2c72f3e7f8..f6ec30fae70e 100644
--- a/arch/arm64/kernel/topology.c
+++ b/arch/arm64/kernel/topology.c
@@ -15,8 +15,10 @@
 #include <linux/arch_topology.h>
 #include <linux/cacheinfo.h>
 #include <linux/cpufreq.h>
+#include <linux/cpu_smt.h>
 #include <linux/init.h>
 #include <linux/percpu.h>
+#include <linux/xarray.h>

 #include <asm/cpu.h>
 #include <asm/cputype.h>
@@ -37,17 +39,29 @@ static bool __init acpi_cpu_is_threaded(int cpu)
 	return !!is_threaded;
 }

+struct cpu_smt_info {
+	int thread_num;
+	int core_id;
+	int cpu;
+};
+
 /*
  * Propagate the topology information of the processor_topology_node tree to the
  * cpu_topology array.
  */
 int __init parse_acpi_topology(void)
 {
+	int max_smt_thread_num = 1;
+	struct cpu_smt_info *entry;
+	struct xarray hetero_cpu;
+	unsigned long hetero_id;
 	int cpu, topology_id;

 	if (acpi_disabled)
 		return 0;

+	xa_init(&hetero_cpu);
+
 	for_each_possible_cpu(cpu) {
 		topology_id = find_acpi_cpu_topology(cpu, 0);
 		if (topology_id < 0)
@@ -57,6 +71,30 @@ int __init parse_acpi_topology(void)
 			cpu_topology[cpu].thread_id = topology_id;
 			topology_id = find_acpi_cpu_topology(cpu, 1);
 			cpu_topology[cpu].core_id   = topology_id;
+
+			/*
+			 * Build up the XArray using the heterogeneous ID of
+			 * the CPU cluster. Store the CPU and SMT information
+			 * of the first appeared CPU in the CPU cluster of this
+			 * heterogeneous ID since the SMT information should be
+			 * the same in this CPU cluster. Then we can know the
+			 * SMT information of each heterogeneous CPUs in the
+			 * system.
+			 */
+			hetero_id = find_acpi_cpu_topology_hetero_id(cpu);
+			entry = (struct cpu_smt_info *)xa_load(&hetero_cpu, hetero_id);
+			if (!entry) {
+				entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+				WARN_ON(!entry);
+
+				entry->cpu = cpu;
+				entry->core_id = topology_id;
+				entry->thread_num = 1;
+				xa_store(&hetero_cpu, hetero_id,
+					 entry, GFP_KERNEL);
+			} else if (entry->core_id == topology_id) {
+				entry->thread_num++;
+			}
 		} else {
 			cpu_topology[cpu].thread_id  = -1;
 			cpu_topology[cpu].core_id    = topology_id;
@@ -67,6 +105,28 @@ int __init parse_acpi_topology(void)
 		cpu_topology[cpu].package_id = topology_id;
 	}

+	/*
+	 * This should be a short loop depending on the number of heterogeneous
+	 * CPU clusters. Typically on a homogeneous system there's only one
+	 * entry in the XArray.
+	 */
+	xa_for_each(&hetero_cpu, hetero_id, entry) {
+		if (entry->thread_num == 1)
+			continue;
+
+		if (entry->thread_num != max_smt_thread_num &&
+		    max_smt_thread_num != 1)
+			pr_warn("Heterogeneous SMT topology not handled");
+
+		if (entry->thread_num > max_smt_thread_num)
+			max_smt_thread_num = entry->thread_num;
+
+		xa_erase(&hetero_cpu, hetero_id);
+		kfree(entry);
+	}
+
+	cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
+	xa_destroy(&hetero_cpu);
 	return 0;
 }
 #endif
-- 
2.24.0


> Regards,
> Pierre
> 
>>
>> Thanks.
>>
>>> +
>>>          cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
>>>
>>>          xa_destroy(&core_threads);
>>> diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
>>> index 95513abd664f..20d7f5b72ddd 100644
>>> --- a/drivers/base/arch_topology.c
>>> +++ b/drivers/base/arch_topology.c
>>> @@ -532,13 +532,15 @@ static int __init get_cpu_for_node(struct device_node *node)
>>>          return cpu;
>>>   }
>>>
>>> -static void __init update_smt_num_threads(unsigned int num_threads)
>>> +static void __init update_smt_num_threads(int num_threads)
>>>   {
>>> -       static unsigned int max_smt_thread_num = 1;
>>> +       static int max_smt_thread_num = -1;
>>>
>>> -       if (num_threads > max_smt_thread_num) {
>>> +       if (max_smt_thread_num < 0) {
>>>                  max_smt_thread_num = num_threads;
>>>                  cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
>>> +       } else if (num_threads != max_smt_thread_num) {
>>> +               pr_warn("Heterogeneous SMT topology not handled");
>>>          }
>>>   }
>>>
>>> diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
>>> index b721f360d759..afdfdc64a0a1 100644
>>> --- a/include/linux/arch_topology.h
>>> +++ b/include/linux/arch_topology.h
>>> @@ -87,6 +87,7 @@ extern struct cpu_topology cpu_topology[NR_CPUS];
>>>   #define topology_physical_package_id(cpu)      (cpu_topology[cpu].package_id)
>>>   #define topology_cluster_id(cpu)       (cpu_topology[cpu].cluster_id)
>>>   #define topology_core_id(cpu)          (cpu_topology[cpu].core_id)
>>> +#define topology_thread_id(cpu)                (cpu_topology[cpu].thread_id)
>>>   #define topology_core_cpumask(cpu)     (&cpu_topology[cpu].core_sibling)
>>>   #define topology_sibling_cpumask(cpu)  (&cpu_topology[cpu].thread_sibling)
>>>   #define topology_cluster_cpumask(cpu)  (&cpu_topology[cpu].cluster_sibling)
>>>
>>> ----------------------------
>>>
>>>
>>> Regards,
>>> Pierre
>>>
>>>>
>>>> Thanks,
>>>> Yicong
>>>>
>>>>>
>>>>> Same comment for the DT patch. If there is an assumption that all CPUs have
>>>>> the same number of threads, then I suppose update_smt_num_threads() would
>>>>> only need to be called once.
>>>>>
>>>>> Regards,
>>>>> Pierre
>>>>>
>>>>>
>>>>> On 8/6/24 10:53, Yicong Yang wrote:
>>>>>> From: Yicong Yang <yangyicong at hisilicon.com>
>>>>>>
>>>>>> For ACPI we'll build the topology from PPTT and we cannot directly
>>>>>> get the SMT number of each core. Instead, use a temporary xarray
>>>>>> to record the SMT number of each core when building the topology,
>>>>>> so that we can know the largest SMT number in the system. Then we can
>>>>>> enable the support of SMT control.
>>>>>>
>>>>>> Signed-off-by: Yicong Yang <yangyicong at hisilicon.com>
>>>>>> ---
>>>>>>     arch/arm64/kernel/topology.c | 24 ++++++++++++++++++++++++
>>>>>>     1 file changed, 24 insertions(+)
>>>>>>
>>>>>> diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c
>>>>>> index 1a2c72f3e7f8..f72e1e55b05e 100644
>>>>>> --- a/arch/arm64/kernel/topology.c
>>>>>> +++ b/arch/arm64/kernel/topology.c
>>>>>> @@ -15,8 +15,10 @@
>>>>>>     #include <linux/arch_topology.h>
>>>>>>     #include <linux/cacheinfo.h>
>>>>>>     #include <linux/cpufreq.h>
>>>>>> +#include <linux/cpu_smt.h>
>>>>>>     #include <linux/init.h>
>>>>>>     #include <linux/percpu.h>
>>>>>> +#include <linux/xarray.h>
>>>>>>
>>>>>>     #include <asm/cpu.h>
>>>>>>     #include <asm/cputype.h>
>>>>>> @@ -43,11 +45,16 @@ static bool __init acpi_cpu_is_threaded(int cpu)
>>>>>>      */
>>>>>>     int __init parse_acpi_topology(void)
>>>>>>     {
>>>>>> +    int thread_num, max_smt_thread_num = 1;
>>>>>> +    struct xarray core_threads;
>>>>>>         int cpu, topology_id;
>>>>>> +    void *entry;
>>>>>>
>>>>>>         if (acpi_disabled)
>>>>>>             return 0;
>>>>>>
>>>>>> +    xa_init(&core_threads);
>>>>>> +
>>>>>>         for_each_possible_cpu(cpu) {
>>>>>>             topology_id = find_acpi_cpu_topology(cpu, 0);
>>>>>>             if (topology_id < 0)
>>>>>> @@ -57,6 +64,20 @@ int __init parse_acpi_topology(void)
>>>>>>                 cpu_topology[cpu].thread_id = topology_id;
>>>>>>                 topology_id = find_acpi_cpu_topology(cpu, 1);
>>>>>>                 cpu_topology[cpu].core_id   = topology_id;
>>>>>> +
>>>>>> +            entry = xa_load(&core_threads, topology_id);
>>>>>> +            if (!entry) {
>>>>>> +                xa_store(&core_threads, topology_id,
>>>>>> +                     xa_mk_value(1), GFP_KERNEL);
>>>>>> +            } else {
>>>>>> +                thread_num = xa_to_value(entry);
>>>>>> +                thread_num++;
>>>>>> +                xa_store(&core_threads, topology_id,
>>>>>> +                     xa_mk_value(thread_num), GFP_KERNEL);
>>>>>> +
>>>>>> +                if (thread_num > max_smt_thread_num)
>>>>>> +                    max_smt_thread_num = thread_num;
>>>>>> +            }
>>>>>>             } else {
>>>>>>                 cpu_topology[cpu].thread_id  = -1;
>>>>>>                 cpu_topology[cpu].core_id    = topology_id;
>>>>>> @@ -67,6 +88,9 @@ int __init parse_acpi_topology(void)
>>>>>>             cpu_topology[cpu].package_id = topology_id;
>>>>>>         }
>>>>>>
>>>>>> +    cpu_smt_set_num_threads(max_smt_thread_num, max_smt_thread_num);
>>>>>> +
>>>>>> +    xa_destroy(&core_threads);
>>>>>>         return 0;
>>>>>>     }
>>>>>>     #endif
>>>>>
>>>>> .
> 
> .