[PATCH v2 8/8] powerpc/xive: Map one IPI interrupt per node

Cédric Le Goater clg at kaod.org
Wed Mar 10 02:52:10 AEDT 2021


On 3/9/21 2:23 PM, Greg Kurz wrote:
> On Wed, 3 Mar 2021 18:48:57 +0100
> Cédric Le Goater <clg at kaod.org> wrote:
> 
>> ipistorm [*] can be used to benchmark the raw interrupt rate of an
>> interrupt controller by measuring the number of IPIs a system can
>> sustain. When applied to the XIVE interrupt controller of POWER9 and
>> POWER10 systems, a significant drop of the interrupt rate can be
>> observed when crossing the second node boundary.
>>
>> This is due to the fact that a single IPI interrupt is used for all
>> CPUs of the system. The structure is shared and the cache line updates
>> impact greatly the traffic between nodes and the overall IPI
>> performance.
>>
>> As a workaround, the impact can be reduced by deactivating the IRQ
>> lockup detector ("noirqdebug") which does a lot of accounting in the
>> Linux IRQ descriptor structure and is responsible for most of the
>> performance penalty.
>>
>> As a fix, this proposal allocates an IPI interrupt per node, to be
>> shared by all CPUs of that node. It solves the scaling issue, the IRQ
>> lockup detector still has an impact but the XIVE interrupt rate scales
>> linearly. It also improves the "noirqdebug" case as showed in the
>> tables below.
>>
>>  * P9 DD2.2 - 2s * 64 threads
>>
>>                                                "noirqdebug"
>>                         Mint/s                    Mint/s
>>  chips  cpus      IPI/sys   IPI/chip       IPI/chip    IPI/sys
>>  --------------------------------------------------------------
>>  1      0-15     4.984023   4.875405       4.996536   5.048892
>>         0-31    10.879164  10.544040      10.757632  11.037859
>>         0-47    15.345301  14.688764      14.926520  15.310053
>>         0-63    17.064907  17.066812      17.613416  17.874511
>>  2      0-79    11.768764  21.650749      22.689120  22.566508
>>         0-95    10.616812  26.878789      28.434703  28.320324
>>         0-111   10.151693  31.397803      31.771773  32.388122
>>         0-127    9.948502  33.139336      34.875716  35.224548
>>
>>  * P10 DD1 - 4s (not homogeneous) 352 threads
>>
>>                                                "noirqdebug"
>>                         Mint/s                    Mint/s
>>  chips  cpus      IPI/sys   IPI/chip       IPI/chip    IPI/sys
>>  --------------------------------------------------------------
>>  1      0-15     2.409402   2.364108       2.383303   2.395091
>>         0-31     6.028325   6.046075       6.089999   6.073750
>>         0-47     8.655178   8.644531       8.712830   8.724702
>>         0-63    11.629652  11.735953      12.088203  12.055979
>>         0-79    14.392321  14.729959      14.986701  14.973073
>>         0-95    12.604158  13.004034      17.528748  17.568095
>>  2      0-111    9.767753  13.719831      19.968606  20.024218
>>         0-127    6.744566  16.418854      22.898066  22.995110
>>         0-143    6.005699  19.174421      25.425622  25.417541
>>         0-159    5.649719  21.938836      27.952662  28.059603
>>         0-175    5.441410  24.109484      31.133915  31.127996
>>  3      0-191    5.318341  24.405322      33.999221  33.775354
>>         0-207    5.191382  26.449769      36.050161  35.867307
>>         0-223    5.102790  29.356943      39.544135  39.508169
>>         0-239    5.035295  31.933051      42.135075  42.071975
>>         0-255    4.969209  34.477367      44.655395  44.757074
>>  4      0-271    4.907652  35.887016      47.080545  47.318537
>>         0-287    4.839581  38.076137      50.464307  50.636219
>>         0-303    4.786031  40.881319      53.478684  53.310759
>>         0-319    4.743750  43.448424      56.388102  55.973969
>>         0-335    4.709936  45.623532      59.400930  58.926857
>>         0-351    4.681413  45.646151      62.035804  61.830057
>>
>> [*] https://github.com/antonblanchard/ipistorm
>>
>> Signed-off-by: Cédric Le Goater <clg at kaod.org>
>> ---
>>  arch/powerpc/sysdev/xive/xive-internal.h |  2 --
>>  arch/powerpc/sysdev/xive/common.c        | 39 ++++++++++++++++++------
>>  2 files changed, 30 insertions(+), 11 deletions(-)
>>
>> diff --git a/arch/powerpc/sysdev/xive/xive-internal.h b/arch/powerpc/sysdev/xive/xive-internal.h
>> index 9cf57c722faa..b3a456fdd3a5 100644
>> --- a/arch/powerpc/sysdev/xive/xive-internal.h
>> +++ b/arch/powerpc/sysdev/xive/xive-internal.h
>> @@ -5,8 +5,6 @@
>>  #ifndef __XIVE_INTERNAL_H
>>  #define __XIVE_INTERNAL_H
>>  
>> -#define XIVE_IPI_HW_IRQ		0 /* interrupt source # for IPIs */
>> -
>>  /*
>>   * A "disabled" interrupt should never fire, to catch problems
>>   * we set its logical number to this
>> diff --git a/arch/powerpc/sysdev/xive/common.c b/arch/powerpc/sysdev/xive/common.c
>> index 8eefd152b947..c27f7bb0494b 100644
>> --- a/arch/powerpc/sysdev/xive/common.c
>> +++ b/arch/powerpc/sysdev/xive/common.c
>> @@ -65,8 +65,16 @@ static struct irq_domain *xive_irq_domain;
>>  #ifdef CONFIG_SMP
>>  static struct irq_domain *xive_ipi_irq_domain;
>>  
>> -/* The IPIs all use the same logical irq number */
>> -static u32 xive_ipi_irq;
>> +/* The IPIs use the same logical irq number when on the same chip */
>> +static struct xive_ipi_desc {
>> +	unsigned int irq;
>> +	char name[8]; /* enough bytes to fit IPI-XXX */
> 
> So this assumes that the node number that node is <= 999 ? This
> is certainly the case for now since CONFIG_NODES_SHIFT is 8
> on ppc64 but starting with 10, you'd have truncated names.

It should be harmless though. I agree this is a useless optimization.

> What about deriving the size of name[] from CONFIG_NODES_SHIFT ?

Yes.
 
> Apart from that, LGTM. Probably not worth to respin just for
> this.
> 
> I also could give a try in a KVM guest.
> 
> Topology passed to QEMU:
> 
>   -smp 8,maxcpus=8,cores=2,threads=2,sockets=2 \
>   -numa node,nodeid=0,cpus=0-4 \
>   -numa node,nodeid=1,cpus=4-7
> 
> Topology observed in guest with lstopo :
> 
>   Package L#0
>     NUMANode L#0 (P#0 30GB)
>     L1d L#0 (32KB) + L1i L#0 (32KB) + Core L#0
>       PU L#0 (P#0)
>       PU L#1 (P#1)
>     L1d L#1 (32KB) + L1i L#1 (32KB) + Core L#1
>       PU L#2 (P#2)
>       PU L#3 (P#3)
>   Package L#1
>     NUMANode L#1 (P#1 32GB)
>     L1d L#2 (32KB) + L1i L#2 (32KB) + Core L#2
>       PU L#4 (P#4)
>       PU L#5 (P#5)
>     L1d L#3 (32KB) + L1i L#3 (32KB) + Core L#3
>       PU L#6 (P#6)
>       PU L#7 (P#7)
> 
> Interrupts in guest:
> 
> $ cat /proc/interrupts 
>            CPU0       CPU1       CPU2       CPU3       CPU4       CPU5       CPU6       CPU7       
>  16:       1023        871       1042        749          0          0          0          0  XIVE-IPI   0 Edge      IPI-0
>  17:          0          0          0          0       2123       1019       1263       1288  XIVE-IPI   1 Edge      IPI-1
> 
> IPIs are mapped to the appropriate nodes, and the numbers indicate
> that everything is working as expected.

You should see the same on 2 socket PowerNV QEMU machine.
 
> Reviewed-and-tested-by: Greg Kurz <groug at kaod.org>

Thanks,

C. 

> 
>> +} *xive_ipis;
>> +
>> +static unsigned int xive_ipi_cpu_to_irq(unsigned int cpu)
>> +{
>> +	return xive_ipis[cpu_to_node(cpu)].irq;
>> +}
>>  #endif
>>  
>>  /* Xive state for each CPU */
>> @@ -1106,25 +1114,36 @@ static const struct irq_domain_ops xive_ipi_irq_domain_ops = {
>>  
>>  static void __init xive_request_ipi(void)
>>  {
>> -	unsigned int virq;
>> +	unsigned int node;
>>  
>> -	xive_ipi_irq_domain = irq_domain_add_linear(NULL, 1,
>> +	xive_ipi_irq_domain = irq_domain_add_linear(NULL, nr_node_ids,
>>  						    &xive_ipi_irq_domain_ops, NULL);
>>  	if (WARN_ON(xive_ipi_irq_domain == NULL))
>>  		return;
>>  
>> -	/* Initialize it */
>> -	virq = irq_create_mapping(xive_ipi_irq_domain, XIVE_IPI_HW_IRQ);
>> -	xive_ipi_irq = virq;
>> +	xive_ipis = kcalloc(nr_node_ids, sizeof(*xive_ipis), GFP_KERNEL | __GFP_NOFAIL);
>> +	for_each_node(node) {
>> +		struct xive_ipi_desc *xid = &xive_ipis[node];
>> +		irq_hw_number_t node_ipi_hwirq = node;
>> +
>> +		/*
>> +		 * Map one IPI interrupt per node for all cpus of that node.
>> +		 * Since the HW interrupt number doesn't have any meaning,
>> +		 * simply use the node number.
>> +		 */
>> +		xid->irq = irq_create_mapping(xive_ipi_irq_domain, node_ipi_hwirq);
>> +		snprintf(xid->name, sizeof(xid->name), "IPI-%d", node);
>>  
>> -	WARN_ON(request_irq(virq, xive_muxed_ipi_action,
>> -			    IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL));
>> +		WARN_ON(request_irq(xid->irq, xive_muxed_ipi_action,
>> +				    IRQF_PERCPU | IRQF_NO_THREAD, xid->name, NULL));
>> +	}
>>  }
>>  
>>  static int xive_setup_cpu_ipi(unsigned int cpu)
>>  {
>>  	struct xive_cpu *xc;
>>  	int rc;
>> +	unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
>>  
>>  	pr_debug("Setting up IPI for CPU %d\n", cpu);
>>  
>> @@ -1165,6 +1184,8 @@ static int xive_setup_cpu_ipi(unsigned int cpu)
>>  
>>  static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc)
>>  {
>> +	unsigned int xive_ipi_irq = xive_ipi_cpu_to_irq(cpu);
>> +
>>  	/* Disable the IPI and free the IRQ data */
>>  
>>  	/* Already cleaned up ? */
> 



More information about the Linuxppc-dev mailing list