[PATCH v5] pseries: prevent free CPU ids to be reused on another node

Laurent Dufour ldufour at linux.ibm.com
Thu Jul 1 18:35:24 AEST 2021


Hi Michael,

Do you mind taking this patch of 5.14?

Thanks,
Laurent.

Le 29/04/2021 à 19:49, Laurent Dufour a écrit :
> When a CPU is hot added, the CPU ids are taken from the available mask from
> the lower possible set. If that set of values was previously used for CPU
> attached to a different node, this seems to application like if these CPUs
> have migrated from a node to another one which is not expected in real
> life.
> 
> To prevent this, it is needed to record the CPU ids used for each node and
> to not reuse them on another node. However, to prevent CPU hot plug to
> fail, in the case the CPU ids is starved on a node, the capability to reuse
> other nodes’ free CPU ids is kept. A warning is displayed in such a case
> to warn the user.
> 
> A new CPU bit mask (node_recorded_ids_map) is introduced for each possible
> node. It is populated with the CPU onlined at boot time, and then when a
> CPU is hot plug to a node. The bits in that mask remain when the CPU is hot
> unplugged, to remind this CPU ids have been used for this node.
> 
> If no id set was found, a retry is made without removing the ids used on
> the other nodes to try reusing them. This is the way ids have been
> allocated prior to this patch.
> 
> The effect of this patch can be seen by removing and adding CPUs using the
> Qemu monitor. In the following case, the first CPU from the node 2 is
> removed, then the first one from the node 1 is removed too. Later, the
> first CPU of the node 2 is added back. Without that patch, the kernel will
> numbered these CPUs using the first CPU ids available which are the ones
> freed when removing the second CPU of the node 0. This leads to the CPU ids
> 16-23 to move from the node 1 to the node 2. With the patch applied, the
> CPU ids 32-39 are used since they are the lowest free ones which have not
> been used on another node.
> 
> At boot time:
> [root at vm40 ~]# numactl -H | grep cpus
> node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
> node 1 cpus: 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
> node 2 cpus: 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
> 
> Vanilla kernel, after the CPU hot unplug/plug operations:
> [root at vm40 ~]# numactl -H | grep cpus
> node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
> node 1 cpus: 24 25 26 27 28 29 30 31
> node 2 cpus: 16 17 18 19 20 21 22 23 40 41 42 43 44 45 46 47
> 
> Patched kernel, after the CPU hot unplug/plug operations:
> [root at vm40 ~]# numactl -H | grep cpus
> node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
> node 1 cpus: 24 25 26 27 28 29 30 31
> node 2 cpus: 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
> 
> Signed-off-by: Laurent Dufour <ldufour at linux.ibm.com>
> ---
> V5:
>   - Rework code structure
>   - Reintroduce the capability to reuse other node's ids.
> V4: addressing Nathan's comment
>   - Rename the local variable named 'nid' into 'assigned_node'
> V3: addressing Nathan's comments
>   - Remove the retry feature
>   - Reduce the number of local variables (removing 'i')
>   - Add comment about the cpu_add_remove_lock protecting the added CPU mask.
>   V2: (no functional changes)
>   - update the test's output in the commit's description
>   - node_recorded_ids_map should be static
> ---
>   arch/powerpc/platforms/pseries/hotplug-cpu.c | 171 ++++++++++++++-----
>   1 file changed, 132 insertions(+), 39 deletions(-)
> 
> diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> index 7e970f81d8ff..e1f224320102 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-cpu.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c
> @@ -39,6 +39,12 @@
>   /* This version can't take the spinlock, because it never returns */
>   static int rtas_stop_self_token = RTAS_UNKNOWN_SERVICE;
>   
> +/*
> + * Record the CPU ids used on each nodes.
> + * Protected by cpu_add_remove_lock.
> + */
> +static cpumask_var_t node_recorded_ids_map[MAX_NUMNODES];
> +
>   static void rtas_stop_self(void)
>   {
>   	static struct rtas_args args;
> @@ -139,72 +145,148 @@ static void pseries_cpu_die(unsigned int cpu)
>   	paca_ptrs[cpu]->cpu_start = 0;
>   }
>   
> +/**
> + * find_cpu_id_range - found a linear ranger of @nthreads free CPU ids.
> + * @nthreads : the number of threads (cpu ids)
> + * @assigned_node : the node it belongs to or NUMA_NO_NODE if free ids from any
> + *                  node can be peek.
> + * @cpu_mask: the returned CPU mask.
> + *
> + * Returns 0 on success.
> + */
> +static int find_cpu_id_range(unsigned int nthreads, int assigned_node,
> +			     cpumask_var_t *cpu_mask)
> +{
> +	cpumask_var_t candidate_mask;
> +	unsigned int cpu, node;
> +	int rc = -ENOSPC;
> +
> +	if (!zalloc_cpumask_var(&candidate_mask, GFP_KERNEL))
> +		return -ENOMEM;
> +
> +	cpumask_clear(*cpu_mask);
> +	for (cpu = 0; cpu < nthreads; cpu++)
> +		cpumask_set_cpu(cpu, *cpu_mask);
> +
> +	BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask));
> +
> +	/* Get a bitmap of unoccupied slots. */
> +	cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask);
> +
> +	if (assigned_node != NUMA_NO_NODE) {
> +		/*
> +		 * Remove free ids previously assigned on the other nodes. We
> +		 * can walk only online nodes because once a node became online
> +		 * it is not turned offlined back.
> +		 */
> +		for_each_online_node(node) {
> +			if (node == assigned_node)
> +				continue;
> +			cpumask_andnot(candidate_mask, candidate_mask,
> +				       node_recorded_ids_map[node]);
> +		}
> +	}
> +
> +	if (cpumask_empty(candidate_mask))
> +		goto out;
> +
> +	while (!cpumask_empty(*cpu_mask)) {
> +		if (cpumask_subset(*cpu_mask, candidate_mask))
> +			/* Found a range where we can insert the new cpu(s) */
> +			break;
> +		cpumask_shift_left(*cpu_mask, *cpu_mask, nthreads);
> +	}
> +
> +	if (!cpumask_empty(*cpu_mask))
> +		rc = 0;
> +
> +out:
> +	free_cpumask_var(candidate_mask);
> +	return rc;
> +}
> +
>   /*
>    * Update cpu_present_mask and paca(s) for a new cpu node.  The wrinkle
> - * here is that a cpu device node may represent up to two logical cpus
> + * here is that a cpu device node may represent multiple logical cpus
>    * in the SMT case.  We must honor the assumption in other code that
>    * the logical ids for sibling SMT threads x and y are adjacent, such
>    * that x^1 == y and y^1 == x.
>    */
>   static int pseries_add_processor(struct device_node *np)
>   {
> -	unsigned int cpu;
> -	cpumask_var_t candidate_mask, tmp;
> -	int err = -ENOSPC, len, nthreads, i;
> +	int len, nthreads, node, cpu, assigned_node;
> +	int rc = 0;
> +	cpumask_var_t cpu_mask;
>   	const __be32 *intserv;
>   
>   	intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len);
>   	if (!intserv)
>   		return 0;
>   
> -	zalloc_cpumask_var(&candidate_mask, GFP_KERNEL);
> -	zalloc_cpumask_var(&tmp, GFP_KERNEL);
> -
>   	nthreads = len / sizeof(u32);
> -	for (i = 0; i < nthreads; i++)
> -		cpumask_set_cpu(i, tmp);
>   
> -	cpu_maps_update_begin();
> +	if (!alloc_cpumask_var(&cpu_mask, GFP_KERNEL))
> +		return -ENOMEM;
>   
> -	BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask));
> +	/*
> +	 * Fetch from the DT nodes read by dlpar_configure_connector() the NUMA
> +	 * node id the added CPU belongs to.
> +	 */
> +	node = of_node_to_nid(np);
> +	if (node < 0 || !node_possible(node))
> +		node = first_online_node;
>   
> -	/* Get a bitmap of unoccupied slots. */
> -	cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask);
> -	if (cpumask_empty(candidate_mask)) {
> -		/* If we get here, it most likely means that NR_CPUS is
> -		 * less than the partition's max processors setting.
> +	BUG_ON(node == NUMA_NO_NODE);
> +	assigned_node = node;
> +
> +	cpu_maps_update_begin();
> +
> +	rc = find_cpu_id_range(nthreads, node, &cpu_mask);
> +	if (rc && nr_node_ids > 1) {
> +		/*
> +		 * Try again, considering the free CPU ids from the other node.
>   		 */
> -		printk(KERN_ERR "Cannot add cpu %pOF; this system configuration"
> -		       " supports %d logical cpus.\n", np,
> -		       num_possible_cpus());
> -		goto out_unlock;
> +		node = NUMA_NO_NODE;
> +		rc = find_cpu_id_range(nthreads, NUMA_NO_NODE, &cpu_mask);
>   	}
>   
> -	while (!cpumask_empty(tmp))
> -		if (cpumask_subset(tmp, candidate_mask))
> -			/* Found a range where we can insert the new cpu(s) */
> -			break;
> -		else
> -			cpumask_shift_left(tmp, tmp, nthreads);
> -
> -	if (cpumask_empty(tmp)) {
> -		printk(KERN_ERR "Unable to find space in cpu_present_mask for"
> -		       " processor %pOFn with %d thread(s)\n", np,
> -		       nthreads);
> -		goto out_unlock;
> +	if (rc) {
> +		pr_err("Cannot add cpu %pOF; this system configuration"
> +		       " supports %d logical cpus.\n", np, num_possible_cpus());
> +		goto out;
>   	}
>   
> -	for_each_cpu(cpu, tmp) {
> +	for_each_cpu(cpu, cpu_mask) {
>   		BUG_ON(cpu_present(cpu));
>   		set_cpu_present(cpu, true);
>   		set_hard_smp_processor_id(cpu, be32_to_cpu(*intserv++));
>   	}
> -	err = 0;
> -out_unlock:
> +
> +	/* Record the newly used CPU ids for the associate node. */
> +	cpumask_or(node_recorded_ids_map[assigned_node],
> +		   node_recorded_ids_map[assigned_node], cpu_mask);
> +
> +	/*
> +	 * If node is set to NUMA_NO_NODE, CPU ids have be reused from
> +	 * another node, remove them from its mask.
> +	 */
> +	if (node == NUMA_NO_NODE) {
> +		cpu = cpumask_first(cpu_mask);
> +		pr_warn("Reusing free CPU ids %d-%d from another node\n",
> +			cpu, cpu + nthreads - 1);
> +		for_each_online_node(node) {
> +			if (node == assigned_node)
> +				continue;
> +			cpumask_andnot(node_recorded_ids_map[node],
> +				       node_recorded_ids_map[node],
> +				       cpu_mask);
> +		}
> +	}
> +
> +out:
>   	cpu_maps_update_done();
> -	free_cpumask_var(candidate_mask);
> -	free_cpumask_var(tmp);
> -	return err;
> +	free_cpumask_var(cpu_mask);
> +	return rc;
>   }
>   
>   /*
> @@ -908,6 +990,7 @@ static struct notifier_block pseries_smp_nb = {
>   static int __init pseries_cpu_hotplug_init(void)
>   {
>   	int qcss_tok;
> +	unsigned int node;
>   
>   #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE
>   	ppc_md.cpu_probe = dlpar_cpu_probe;
> @@ -929,8 +1012,18 @@ static int __init pseries_cpu_hotplug_init(void)
>   	smp_ops->cpu_die = pseries_cpu_die;
>   
>   	/* Processors can be added/removed only on LPAR */
> -	if (firmware_has_feature(FW_FEATURE_LPAR))
> +	if (firmware_has_feature(FW_FEATURE_LPAR)) {
> +		for_each_node(node) {
> +			alloc_bootmem_cpumask_var(&node_recorded_ids_map[node]);
> +
> +			/* Record ids of CPU added at boot time */
> +			cpumask_or(node_recorded_ids_map[node],
> +				   node_recorded_ids_map[node],
> +				   node_to_cpumask_map[node]);
> +		}
> +
>   		of_reconfig_notifier_register(&pseries_smp_nb);
> +	}
>   
>   	return 0;
>   }
> 



More information about the Linuxppc-dev mailing list