[PATCH 3/3] powerpc: Use form 1 affinity to setup node distance

Benjamin Herrenschmidt benh at kernel.crashing.org
Thu May 6 16:50:42 EST 2010


On Fri, 2010-04-30 at 14:43 +1000, Anton Blanchard wrote:
> Form 1 affinity allows multiple entries in ibm,associativity-reference-points
> which represent affinity domains in decreasing order of importance. The
> Linux concept of a node is always the first entry, but using the other
> values as an input to node_distance() allows the memory allocator to make
> better decisions on which node to try first when local memory has been
> exhausted.
> 
> We keep things simple and create an array indexed by NUMA node, capped at
> 4 entries. Each time we look up an associativity property we initialise
> the array, which is overkill, but since we should only hit this path during
> boot it didn't seem worth adding a per-node valid bit.

OK, so please double-check my -next branch (I'm hopefully pushing a new one
out today) and respin :-) Patches 1 and 2 seem to be already there, and
patch 3 doesn't apply (the conflict is non-trivial).

Thanks !

Cheers,
Ben.

> Signed-off-by: Anton Blanchard <anton at samba.org>
> ---
> 
> Index: linux-2.6/arch/powerpc/include/asm/topology.h
> ===================================================================
> --- linux-2.6.orig/arch/powerpc/include/asm/topology.h	2010-04-29 15:58:58.000000000 +1000
> +++ linux-2.6/arch/powerpc/include/asm/topology.h	2010-04-29 15:59:00.000000000 +1000
> @@ -77,6 +77,9 @@ static inline int pcibus_to_node(struct 
>  	.balance_interval	= 1,					\
>  }
>  
> +extern int __node_distance(int, int);
> +#define node_distance(a, b) __node_distance(a, b)
> +
>  extern void __init dump_numa_cpu_topology(void);
>  
>  extern int sysfs_add_device_to_node(struct sys_device *dev, int nid);
> Index: linux-2.6/arch/powerpc/mm/numa.c
> ===================================================================
> --- linux-2.6.orig/arch/powerpc/mm/numa.c	2010-04-29 15:58:59.000000000 +1000
> +++ linux-2.6/arch/powerpc/mm/numa.c	2010-04-29 22:05:24.000000000 +1000
> @@ -42,6 +42,12 @@ EXPORT_SYMBOL(node_data);
>  
>  static int min_common_depth;
>  static int n_mem_addr_cells, n_mem_size_cells;
> +static int form1_affinity;
> +
> +#define MAX_DISTANCE_REF_POINTS 4
> +static int distance_ref_points_depth;
> +static const unsigned int *distance_ref_points;
> +static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
>  
>  static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
>  						unsigned int *nid)
> @@ -179,6 +185,39 @@ static const u32 *of_get_usable_memory(s
>  	return prop;
>  }
>  
> +int __node_distance(int a, int b)
> +{
> +	int i;
> +	int distance = LOCAL_DISTANCE;
> +
> +	if (!form1_affinity)
> +		return distance;
> +
> +	for (i = 0; i < distance_ref_points_depth; i++) {
> +		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
> +			break;
> +
> +		/* Double the distance for each NUMA level */
> +		distance *= 2;
> +	}
> +
> +	return distance;
> +}
> +
> +static void initialize_distance_lookup_table(int nid,
> +		const unsigned int *associativity)
> +{
> +	int i;
> +
> +	if (!form1_affinity)
> +		return;
> +
> +	for (i = 0; i < distance_ref_points_depth; i++) {
> +		distance_lookup_table[nid][i] =
> +			associativity[distance_ref_points[i]];
> +	}
> +}
> +
>  /* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
>   * info is found.
>   */
> @@ -200,6 +239,10 @@ static int of_node_to_nid_single(struct 
>  	/* POWER4 LPAR uses 0xffff as invalid node */
>  	if (nid == 0xffff || nid >= MAX_NUMNODES)
>  		nid = -1;
> +
> +	if (nid > 0 && tmp[0] >= distance_ref_points_depth)
> +		initialize_distance_lookup_table(nid, tmp);
> +
>  out:
>  	return nid;
>  }
> @@ -226,26 +269,10 @@ int of_node_to_nid(struct device_node *d
>  }
>  EXPORT_SYMBOL_GPL(of_node_to_nid);
>  
> -/*
> - * In theory, the "ibm,associativity" property may contain multiple
> - * associativity lists because a resource may be multiply connected
> - * into the machine.  This resource then has different associativity
> - * characteristics relative to its multiple connections.  We ignore
> - * this for now.  We also assume that all cpu and memory sets have
> - * their distances represented at a common level.  This won't be
> - * true for hierarchical NUMA.
> - *
> - * In any case the ibm,associativity-reference-points should give
> - * the correct depth for a normal NUMA system.
> - *
> - * - Dave Hansen <haveblue at us.ibm.com>
> - */
>  static int __init find_min_common_depth(void)
>  {
> -	int depth, index;
> -	const unsigned int *ref_points;
> +	int depth;
>  	struct device_node *rtas_root;
> -	unsigned int len;
>  	struct device_node *options;
>  
>  	rtas_root = of_find_node_by_path("/rtas");
> @@ -254,35 +281,62 @@ static int __init find_min_common_depth(
>  		return -1;
>  
>  	/*
> -	 * this property is 2 32-bit integers, each representing a level of
> -	 * depth in the associativity nodes.  The first is for an SMP
> -	 * configuration (should be all 0's) and the second is for a normal
> -	 * NUMA configuration.
> +	 * This property is a set of 32-bit integers, each representing
> +	 * an index into the ibm,associativity nodes.
> +	 *
> +	 * With form 0 affinity the first integer is for an SMP configuration
> +	 * (should be all 0's) and the second is for a normal NUMA
> +	 * configuration. We have only one level of NUMA.
> +	 *
> +	 * With form 1 affinity the first integer is the most significant
> +	 * NUMA boundary and the following are progressively less significant
> +	 * boundaries. There can be more than one level of NUMA.
>  	 */
> -	index = 1;
> -	ref_points = of_get_property(rtas_root,
> -			"ibm,associativity-reference-points", &len);
> +	distance_ref_points = of_get_property(rtas_root,
> +			"ibm,associativity-reference-points",
> +			&distance_ref_points_depth);
> +
> +	if (!distance_ref_points)
> +		goto err;
> +
> +	distance_ref_points_depth /= sizeof(int);
>  
> -	/*
> -	 * For type 1 affinity information we want the first field
> -	 */
>  	options = of_find_node_by_path("/options");
>  	if (options) {
>  		const char *str;
>  		str = of_get_property(options, "ibm,associativity-form", NULL);
>  		if (str && !strcmp(str, "1"))
> -                        index = 0;
> +			form1_affinity = 1;
>  	}
>  
> -	if ((len >= 2 * sizeof(unsigned int)) && ref_points) {
> -		depth = ref_points[index];
> +	if (form1_affinity) {
> +		depth = distance_ref_points[0];
>  	} else {
> -		dbg("NUMA: ibm,associativity-reference-points not found.\n");
> -		depth = -1;
> +		if (distance_ref_points_depth < 2)
> +			goto err;
> +
> +		depth = distance_ref_points[1];
>  	}
> +
> +	/*
> +	 * Warn and cap if the hardware supports more than
> +	 * MAX_DISTANCE_REF_POINTS domains.
> +	 */
> +	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
> +		printk(KERN_WARNING
> +		       "NUMA: distance array capped at %d entries\n",
> +			MAX_DISTANCE_REF_POINTS);
> +		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
> +	}
> +
>  	of_node_put(rtas_root);
>  
>  	return depth;
> +
> +err:
> +	dbg("NUMA: ibm,associativity-reference-points not found.\n");
> +	of_node_put(rtas_root);
> +	return -1;
>  }
>  
>  static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)




More information about the Linuxppc-dev mailing list