[RFC PATCH kernel 1/2] irq: Add reference counting to IRQ mappings

Alexey Kardashevskiy aik at ozlabs.ru
Thu Oct 29 16:04:16 AEDT 2020



On 28/10/2020 03:09, Marc Zyngier wrote:
> Hi Alexey,
> 
> On 2020-10-27 09:06, Alexey Kardashevskiy wrote:
>> PCI devices share 4 legacy INTx interrupts from the same PCI host bridge.
>> Device drivers map/unmap hardware interrupts via irq_create_mapping()/
>> irq_dispose_mapping(). The problem with that these interrupts are
>> shared and when performing hot unplug, we need to unmap the interrupt
>> only when the last device is released.
>>
>> This reuses already existing irq_desc::kobj for this purpose.
>> The refcounter is naturally 1 when the descriptor is allocated already;
>> this adds kobject_get() in places where already existing mapped virq
>> is returned.
> 
> That's quite interesting, as I was about to revive a patch series that
> rework the irqdomain subsystem to directly cache irq_desc instead of
> raw interrupt numbers. And for that, I needed some form of refcounting...
> 
>>
>> This reorganizes irq_dispose_mapping() to release the kobj and let
>> the release callback do the cleanup.
>>
>> If some driver or platform does its own reference counting, this expects
>> those parties to call irq_find_mapping() and call irq_dispose_mapping()
>> for every irq_create_fwspec_mapping()/irq_create_mapping().
>>
>> Signed-off-by: Alexey Kardashevskiy <aik at ozlabs.ru>
>> ---
>>  kernel/irq/irqdesc.c   | 35 +++++++++++++++++++++++------------
>>  kernel/irq/irqdomain.c | 27 +++++++++++++--------------
>>  2 files changed, 36 insertions(+), 26 deletions(-)
>>
>> diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
>> index 1a7723604399..dae096238500 100644
>> --- a/kernel/irq/irqdesc.c
>> +++ b/kernel/irq/irqdesc.c
>> @@ -419,20 +419,39 @@ static struct irq_desc *alloc_desc(int irq, int
>> node, unsigned int flags,
>>      return NULL;
>>  }
>>
>> +static void delayed_free_desc(struct rcu_head *rhp);
>>  static void irq_kobj_release(struct kobject *kobj)
>>  {
>>      struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj);
>> +    struct irq_domain *domain;
>> +    unsigned int virq = desc->irq_data.irq;
>>
>> -    free_masks(desc);
>> -    free_percpu(desc->kstat_irqs);
>> -    kfree(desc);
>> +    domain = desc->irq_data.domain;
>> +    if (domain) {
>> +        if (irq_domain_is_hierarchy(domain)) {
>> +            irq_domain_free_irqs(virq, 1);
> 
> How does this work with hierarchical domains? Each domain should
> contribute as a reference on the irq_desc. But if you got here,
> it means the refcount has already dropped to 0.
> 
> So either there is nothing to free here, or you don't track the
> references implied by the hierarchy. I suspect the latter.

This is correct, I did not look at hierarchy yet, looking now...



>> +        } else {
>> +            irq_domain_disassociate(domain, virq);
>> +            irq_free_desc(virq);
>> +        }
>> +    }
>> +
>> +    /*
>> +     * We free the descriptor, masks and stat fields via RCU. That
>> +     * allows demultiplex interrupts to do rcu based management of
>> +     * the child interrupts.
>> +     * This also allows us to use rcu in kstat_irqs_usr().
>> +     */
>> +    call_rcu(&desc->rcu, delayed_free_desc);
>>  }
>>
>>  static void delayed_free_desc(struct rcu_head *rhp)
>>  {
>>      struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu);
>>
>> -    kobject_put(&desc->kobj);
>> +    free_masks(desc);
>> +    free_percpu(desc->kstat_irqs);
>> +    kfree(desc);
>>  }
>>
>>  static void free_desc(unsigned int irq)
>> @@ -453,14 +472,6 @@ static void free_desc(unsigned int irq)
>>       */
>>      irq_sysfs_del(desc);
>>      delete_irq_desc(irq);
>> -
>> -    /*
>> -     * We free the descriptor, masks and stat fields via RCU. That
>> -     * allows demultiplex interrupts to do rcu based management of
>> -     * the child interrupts.
>> -     * This also allows us to use rcu in kstat_irqs_usr().
>> -     */
>> -    call_rcu(&desc->rcu, delayed_free_desc);
>>  }
>>
>>  static int alloc_descs(unsigned int start, unsigned int cnt, int node,
>> diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
>> index cf8b374b892d..02733ddc321f 100644
>> --- a/kernel/irq/irqdomain.c
>> +++ b/kernel/irq/irqdomain.c
>> @@ -638,6 +638,7 @@ unsigned int irq_create_mapping(struct irq_domain 
>> *domain,
>>  {
>>      struct device_node *of_node;
>>      int virq;
>> +    struct irq_desc *desc;
>>
>>      pr_debug("irq_create_mapping(0x%p, 0x%lx)\n", domain, hwirq);
>>
>> @@ -655,7 +656,9 @@ unsigned int irq_create_mapping(struct irq_domain 
>> *domain,
>>      /* Check if mapping already exists */
>>      virq = irq_find_mapping(domain, hwirq);
>>      if (virq) {
>> +        desc = irq_to_desc(virq);
>>          pr_debug("-> existing mapping on virq %d\n", virq);
>> +        kobject_get(&desc->kobj);
> 
> My worry with this is that there is probably a significant amount of
> code out there that relies on multiple calls to irq_create_mapping()
> with the same parameters not to have any side effects. They would
> expect a subsequent irq_dispose_mapping() to drop the translation
> altogether, and that's obviously not the case here.
> 
> Have you audited the various call sites to see what could break?


The vast majority calls one of irq..create_mapping in init/probe and 
then calls irq_dispose_mapping() right there if probing failed or when 
the driver is unloaded. I could not spot any reference counting 
anywhere, everyone seems to call irq_dispose_mapping() per every 
irq_of_parse_and_map() (or friends).

Then there is a minority (such as the code I am fixing in 
powerpc/pseries) but it is either broken (such as pseries/pci which does 
not call irq_dispose_mapping at all)  or  it is 1 disposal per 1 mapping 
(PPC KVM).

I did not spend awful amount of time though, git grep 
irq_dispose_mapping gives 200 lines...


> 
>>          return virq;
>>      }
>>
>> @@ -751,6 +754,7 @@ unsigned int irq_create_fwspec_mapping(struct
>> irq_fwspec *fwspec)
>>      irq_hw_number_t hwirq;
>>      unsigned int type = IRQ_TYPE_NONE;
>>      int virq;
>> +    struct irq_desc *desc;
>>
>>      if (fwspec->fwnode) {
>>          domain = irq_find_matching_fwspec(fwspec, DOMAIN_BUS_WIRED);
>> @@ -787,8 +791,11 @@ unsigned int irq_create_fwspec_mapping(struct
>> irq_fwspec *fwspec)
>>           * current trigger type then we are done so return the
>>           * interrupt number.
>>           */
>> -        if (type == IRQ_TYPE_NONE || type == irq_get_trigger_type(virq))
>> +        if (type == IRQ_TYPE_NONE || type == 
>> irq_get_trigger_type(virq)) {
>> +            desc = irq_to_desc(virq);
>> +            kobject_get(&desc->kobj);
>>              return virq;
>> +        }
>>
>>          /*
>>           * If the trigger type has not been set yet, then set
>> @@ -800,6 +807,8 @@ unsigned int irq_create_fwspec_mapping(struct
>> irq_fwspec *fwspec)
>>                  return 0;
>>
>>              irqd_set_trigger_type(irq_data, type);
>> +            desc = irq_to_desc(virq);
>> +            kobject_get(&desc->kobj);
>>              return virq;
>>          }
>>
>> @@ -852,22 +861,12 @@ EXPORT_SYMBOL_GPL(irq_create_of_mapping);
>>   */
>>  void irq_dispose_mapping(unsigned int virq)
>>  {
>> -    struct irq_data *irq_data = irq_get_irq_data(virq);
>> -    struct irq_domain *domain;
>> +    struct irq_desc *desc = irq_to_desc(virq);
>>
>> -    if (!virq || !irq_data)
>> +    if (!virq || !desc)
>>          return;
>>
>> -    domain = irq_data->domain;
>> -    if (WARN_ON(domain == NULL))
>> -        return;
>> -
>> -    if (irq_domain_is_hierarchy(domain)) {
>> -        irq_domain_free_irqs(virq, 1);
>> -    } else {
>> -        irq_domain_disassociate(domain, virq);
>> -        irq_free_desc(virq);
>> -    }
>> +    kobject_put(&desc->kobj);
>>  }
>>  EXPORT_SYMBOL_GPL(irq_dispose_mapping);
> 
> Thanks,
> 
>          M.

-- 
Alexey


More information about the Linuxppc-dev mailing list