* Alexander Gordeev <agord...@redhat.com> wrote:

> The MSI specification has several constraints in comparison with MSI-X,
> the most notable of which is the inability to configure MSIs independently.
> As a result, it is impossible to dispatch interrupts from different
> queues to different CPUs. This largely devalues the support of
> multiple MSIs in SMP systems.
> 
> Also, the need to allocate a contiguous block of vector numbers for
> devices capable of multiple MSIs might put considerable pressure on
> the x86 interrupt vector allocator and could lead to fragmentation of
> the interrupt vector space.
> 
> This patch overcomes both drawbacks in the presence of IRQ remapping and
> lets devices take advantage of multiple queues and per-IRQ affinity
> assignments.
> 
> Signed-off-by: Alexander Gordeev <agord...@redhat.com>
> ---
>  arch/x86/kernel/apic/io_apic.c |  174 +++++++++++++++++++++++++++++++++------
>  include/linux/irq.h            |    6 ++
>  kernel/irq/chip.c              |   30 +++++--
>  kernel/irq/irqdesc.c           |   31 +++++++
>  4 files changed, 206 insertions(+), 35 deletions(-)
> 
> diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
> index c265593..d5cb13c 100644
> --- a/arch/x86/kernel/apic/io_apic.c
> +++ b/arch/x86/kernel/apic/io_apic.c
> @@ -305,6 +305,11 @@ static int alloc_irq_from(unsigned int from, int node)
>       return irq_alloc_desc_from(from, node);
>  }
>  
> +static int alloc_irqs_from(unsigned int from, unsigned int count, int node)
> +{
> +     return irq_alloc_descs_from(from, count, node);
> +}
> +
>  static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
>  {
>       free_irq_cfg(at, cfg);
> @@ -2991,37 +2996,58 @@ device_initcall(ioapic_init_ops);
>  /*
>   * Dynamic irq allocate and deallocation
>   */
> -unsigned int create_irq_nr(unsigned int from, int node)
> +unsigned int __create_irqs(unsigned int from, unsigned int count, int node)
>  {
> -     struct irq_cfg *cfg;
> +     struct irq_cfg **cfg;
>       unsigned long flags;
> -     unsigned int ret = 0;
> -     int irq;
> +     int irq, i;
>  
>       if (from < nr_irqs_gsi)
>               from = nr_irqs_gsi;
>  
> -     irq = alloc_irq_from(from, node);
> -     if (irq < 0)
> -             return 0;
> -     cfg = alloc_irq_cfg(irq, node);
> -     if (!cfg) {
> -             free_irq_at(irq, NULL);
> +     cfg = kzalloc_node(count * sizeof(cfg[0]), GFP_KERNEL, node);
> +     if (!cfg)
>               return 0;
> +
> +     irq = alloc_irqs_from(from, count, node);
> +     if (irq < 0)
> +             goto out_cfgs;
> +
> +     for (i = 0; i < count; i++) {
> +             cfg[i] = alloc_irq_cfg(irq + i, node);
> +             if (!cfg[i])
> +                     goto out_irqs;
>       }
>  
>       raw_spin_lock_irqsave(&vector_lock, flags);
> -     if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
> -             ret = irq;
> +     for (i = 0; i < count; i++)
> +             if (__assign_irq_vector(irq + i, cfg[i], apic->target_cpus()))
> +                     goto out_vecs;
>       raw_spin_unlock_irqrestore(&vector_lock, flags);
>  
> -     if (ret) {
> -             irq_set_chip_data(irq, cfg);
> -             irq_clear_status_flags(irq, IRQ_NOREQUEST);
> -     } else {
> -             free_irq_at(irq, cfg);
> +     for (i = 0; i < count; i++) {
> +             irq_set_chip_data(irq + i, cfg[i]);
> +             irq_clear_status_flags(irq + i, IRQ_NOREQUEST);
>       }
> -     return ret;
> +
> +     kfree(cfg);
> +     return irq;
> +
> +out_vecs:
> +     for (; i; i--)
> +             __clear_irq_vector(irq + i - 1, cfg[i - 1]);
> +     raw_spin_unlock_irqrestore(&vector_lock, flags);
> +out_irqs:
> +     for (i = 0; i < count; i++)
> +             free_irq_at(irq + i, cfg[i]);
> +out_cfgs:
> +     kfree(cfg);
> +     return 0;
> +}
> +
> +unsigned int create_irq_nr(unsigned int from, int node)
> +{
> +     return __create_irqs(from, 1, node);
>  }
>  
>  int create_irq(void)
> @@ -3054,6 +3080,27 @@ void destroy_irq(unsigned int irq)
>       free_irq_at(irq, cfg);
>  }
>  
> +static inline void destroy_irqs(unsigned int irq, unsigned int count)
> +{
> +     unsigned int i;
> +     for (i = 0; i < count; i++)
Missing newline: please add a blank line after the variable declaration.

> +             destroy_irq(irq + i);
> +}
> +
> +static inline int
> +can_create_pow_of_two_irqs(unsigned int from, unsigned int count)
> +{
> +     if ((count > 1) && (count % 2))
> +             return -EINVAL;
> +
> +     for (; count; count = count / 2) {
> +             if (!irq_can_alloc_irqs(from, count))
> +                     return count;
> +     }
> +
> +     return -ENOSPC;
> +}
> +
>  /*
>   * MSI message composition
>   */
> @@ -3145,18 +3192,25 @@ static struct irq_chip msi_chip = {
>       .irq_retrigger          = ioapic_retrigger_irq,
>  };
>  
> -static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int 
> irq)
> +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc,
> +                      unsigned int irq_base, unsigned int irq_offset)
>  {
>       struct irq_chip *chip = &msi_chip;
>       struct msi_msg msg;
> +     unsigned int irq = irq_base + irq_offset;
>       int ret;
>  
>       ret = msi_compose_msg(dev, irq, &msg, -1);
>       if (ret < 0)
>               return ret;
>  
> -     irq_set_msi_desc(irq, msidesc);
> -     write_msi_msg(irq, &msg);
> +     irq_set_msi_desc_off(irq_base, irq_offset, msidesc);
> +
> +     /* MSI-X message is written per-IRQ, the offset is always 0.
> +      * MSI message denotes a contiguous group of IRQs, written for 0th IRQ.
> +      */

Please use the customary (multi-line) comment style:

  /*
   * Comment .....
   * ...... goes here.
   */

specified in Documentation/CodingStyle.


> +     if (!irq_offset)
> +             write_msi_msg(irq, &msg);
>  
>       if (irq_remapped(irq_get_chip_data(irq))) {
>               irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
> @@ -3170,16 +3224,12 @@ static int setup_msi_irq(struct pci_dev *dev, struct 
> msi_desc *msidesc, int irq)
>       return 0;
>  }
>  
> -int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
> +int setup_msix_irqs(struct pci_dev *dev, int nvec)
>  {
>       int node, ret, sub_handle, index = 0;
>       unsigned int irq, irq_want;
>       struct msi_desc *msidesc;
>  
> -     /* x86 doesn't support multiple MSI yet */
> -     if (type == PCI_CAP_ID_MSI && nvec > 1)
> -             return 1;
> -
>       node = dev_to_node(&dev->dev);
>       irq_want = nr_irqs_gsi;
>       sub_handle = 0;
> @@ -3208,7 +3258,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int 
> nvec, int type)
>                               goto error;
>               }
>  no_ir:
> -             ret = setup_msi_irq(dev, msidesc, irq);
> +             ret = setup_msi_irq(dev, msidesc, irq, 0);
>               if (ret < 0)
>                       goto error;
>               sub_handle++;
> @@ -3220,6 +3270,76 @@ error:
>       return ret;
>  }
>  
> +int setup_msi_irqs(struct pci_dev *dev, int nvec)
> +{
> +     int node, ret, sub_handle, index = 0;
> +     unsigned int irq;
> +     struct msi_desc *msidesc;
> +
> +     if (nvec > 1 && !irq_remapping_enabled)
> +             return 1;
> +
> +     nvec = __roundup_pow_of_two(nvec);
> +     ret = can_create_pow_of_two_irqs(nr_irqs_gsi, nvec);
> +     if (ret != nvec)
> +             return ret;
> +
> +     WARN_ON(!list_is_singular(&dev->msi_list));
> +     msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
> +     WARN_ON(msidesc->irq);
> +     WARN_ON(msidesc->msi_attrib.multiple);
> +
> +     node = dev_to_node(&dev->dev);
> +     irq = __create_irqs(nr_irqs_gsi, nvec, node);
> +     if (irq == 0)
> +             return -ENOSPC;
> +
> +     if (!irq_remapping_enabled) {
> +             ret = setup_msi_irq(dev, msidesc, irq, 0);
> +             if (ret < 0)
> +                     goto error;
> +             return 0;
> +     }
> +
> +     msidesc->msi_attrib.multiple = ilog2(nvec);
> +     for (sub_handle = 0; sub_handle < nvec; sub_handle++) {
> +             if (!sub_handle) {
> +                     index = msi_alloc_remapped_irq(dev, irq, nvec);
> +                     if (index < 0) {
> +                             ret = index;
> +                             goto error;
> +                     }
> +             } else {
> +                     ret = msi_setup_remapped_irq(dev, irq + sub_handle,
> +                                                  index, sub_handle);
> +                     if (ret < 0)
> +                             goto error;
> +             }
> +             ret = setup_msi_irq(dev, msidesc, irq, sub_handle);
> +             if (ret < 0)
> +                     goto error;
> +     }
> +     return 0;
> +
> +error:
> +     destroy_irqs(irq, nvec);
> +
> +     /* Restore altered MSI descriptor fields and prevent just destroyed
> +      * IRQs from tearing down again in default_teardown_msi_irqs()
> +      */

Ditto.

> +     msidesc->irq = 0;
> +     msidesc->msi_attrib.multiple = 0;
> +
> +     return ret;
> +}
> +
> +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
> +{
> +     if (type == PCI_CAP_ID_MSI)
> +             return setup_msi_irqs(dev, nvec);
> +     return setup_msix_irqs(dev, nvec);
> +}
> +
>  void native_teardown_msi_irq(unsigned int irq)
>  {
>       destroy_irq(irq);
> diff --git a/include/linux/irq.h b/include/linux/irq.h
> index 216b0ba..c3ba39f 100644
> --- a/include/linux/irq.h
> +++ b/include/linux/irq.h
> @@ -522,6 +522,8 @@ extern int irq_set_handler_data(unsigned int irq, void 
> *data);
>  extern int irq_set_chip_data(unsigned int irq, void *data);
>  extern int irq_set_irq_type(unsigned int irq, unsigned int type);
>  extern int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry);
> +extern int irq_set_msi_desc_off(unsigned int irq_base, unsigned int 
> irq_offset,
> +                             struct msi_desc *entry);
>  extern struct irq_data *irq_get_irq_data(unsigned int irq);
>  
>  static inline struct irq_chip *irq_get_chip(unsigned int irq)
> @@ -584,8 +586,12 @@ int __irq_alloc_descs(int irq, unsigned int from, 
> unsigned int cnt, int node,
>  #define irq_alloc_desc_from(from, node)              \
>       irq_alloc_descs(-1, from, 1, node)
>  
> +#define irq_alloc_descs_from(from, cnt, node)        \
> +     irq_alloc_descs(-1, from, cnt, node)
> +

Please use inline functions instead of macros. You might convert the
one above it (irq_alloc_desc_from) as well in the process.

>  void irq_free_descs(unsigned int irq, unsigned int cnt);
>  int irq_reserve_irqs(unsigned int from, unsigned int cnt);
> +int irq_can_alloc_irqs(unsigned int from, unsigned int cnt);
>  
>  static inline void irq_free_desc(unsigned int irq)
>  {
> diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
> index 57d86d0..2230389 100644
> --- a/kernel/irq/chip.c
> +++ b/kernel/irq/chip.c
> @@ -90,27 +90,41 @@ int irq_set_handler_data(unsigned int irq, void *data)
>  EXPORT_SYMBOL(irq_set_handler_data);
>  
>  /**
> - *   irq_set_msi_desc - set MSI descriptor data for an irq
> - *   @irq:   Interrupt number
> - *   @entry: Pointer to MSI descriptor data
> + *   irq_set_msi_desc_off - set MSI descriptor data for an irq at offset
> + *   @irq_base:      Interrupt number base
> + *   @irq_offset:    Interrupt number offset
> + *   @entry:         Pointer to MSI descriptor data
>   *
> - *   Set the MSI descriptor entry for an irq
> + *   Set the MSI descriptor entry for an irq at offset
>   */
> -int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
> +int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset,
> +                      struct msi_desc *entry)
>  {
>       unsigned long flags;
> -     struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 
> IRQ_GET_DESC_CHECK_GLOBAL);
> +     struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, 
> &flags, IRQ_GET_DESC_CHECK_GLOBAL);
>  
>       if (!desc)
>               return -EINVAL;
>       desc->irq_data.msi_desc = entry;
> -     if (entry)
> -             entry->irq = irq;
> +     if (entry && !irq_offset)
> +             entry->irq = irq_base;
>       irq_put_desc_unlock(desc, flags);
>       return 0;
>  }
>  
>  /**
> + *   irq_set_msi_desc - set MSI descriptor data for an irq
> + *   @irq:   Interrupt number
> + *   @entry: Pointer to MSI descriptor data
> + *
> + *   Set the MSI descriptor entry for an irq
> + */
> +int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
> +{
> +     return irq_set_msi_desc_off(irq, 0, entry);
> +}
> +
> +/**
>   *   irq_set_chip_data - set irq chip data for an irq
>   *   @irq:   Interrupt number
>   *   @data:  Pointer to chip specific data
> diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
> index 192a302..8287b78 100644
> --- a/kernel/irq/irqdesc.c
> +++ b/kernel/irq/irqdesc.c
> @@ -210,6 +210,13 @@ static int irq_expand_nr_irqs(unsigned int nr)
>       return 0;
>  }
>  
> +static int irq_can_expand_nr_irqs(unsigned int nr)
> +{
> +     if (nr > IRQ_BITMAP_BITS)
> +             return -ENOMEM;
> +     return 0;
> +}
> +
>  int __init early_irq_init(void)
>  {
>       int i, initcnt, node = first_online_node;
> @@ -414,6 +421,30 @@ int irq_reserve_irqs(unsigned int from, unsigned int cnt)
>  }
>  
>  /**
> + * irq_can_alloc_irqs - checks if a range of irqs could be allocated
> + * @from:    check from irq number
> + * @cnt:     number of irqs to check
> + *
> + * Returns 0 on success or an appropriate error code
> + */
> +int irq_can_alloc_irqs(unsigned int from, unsigned int cnt)
> +{
> +     unsigned int start;
> +     int ret = 0;
> +
> +     if (!cnt)
> +             return -EINVAL;
> +
> +     mutex_lock(&sparse_irq_lock);
> +     start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
> +                                        from, cnt, 0);
> +     mutex_unlock(&sparse_irq_lock);
> +     if (start + cnt > nr_irqs)
> +             ret = irq_can_expand_nr_irqs(start + cnt);
> +     return ret;

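How is this supposed to work wrt. races? sparse_irq_lock is dropped
before the result is used, so the range found here may already be taken
by the time the caller actually allocates the IRQs.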

Thanks,

        Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to