On Wed, 2012-10-17 at 15:23 +0900, Takao Indoh wrote:
> This patch resets PCIe devices at boot time by hot reset when
> "reset_devices" is specified.
> 
> Signed-off-by: Takao Indoh <indou.ta...@jp.fujitsu.com>
> ---
>  arch/x86/include/asm/pci-direct.h |    1 
>  arch/x86/kernel/setup.c           |    3 
>  arch/x86/pci/early.c              |  353 ++++++++++++++++++++++++++++
>  include/linux/pci.h               |    2 
>  init/main.c                       |    4 
>  5 files changed, 361 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/include/asm/pci-direct.h 
> b/arch/x86/include/asm/pci-direct.h
> index b1e7a45..de30db2 100644
> --- a/arch/x86/include/asm/pci-direct.h
> +++ b/arch/x86/include/asm/pci-direct.h
> @@ -18,4 +18,5 @@ extern int early_pci_allowed(void);
>  extern unsigned int pci_early_dump_regs;
>  extern void early_dump_pci_device(u8 bus, u8 slot, u8 func);
>  extern void early_dump_pci_devices(void);
> +extern void early_reset_pcie_devices(void);
>  #endif /* _ASM_X86_PCI_DIRECT_H */
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index a2bb18e..73d3425 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -987,6 +987,9 @@ void __init setup_arch(char **cmdline_p)
>       generic_apic_probe();
>  
>       early_quirks();
> +#ifdef CONFIG_PCI
> +     early_reset_pcie_devices();
> +#endif
>  
>       /*
>        * Read APIC and some other early information from ACPI tables.
> diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
> index d1067d5..df7f4fc 100644
> --- a/arch/x86/pci/early.c
> +++ b/arch/x86/pci/early.c
> @@ -1,5 +1,6 @@
>  #include <linux/kernel.h>
>  #include <linux/pci.h>
> +#include <linux/bootmem.h>
>  #include <asm/pci-direct.h>
>  #include <asm/io.h>
>  #include <asm/pci_x86.h>
> @@ -109,3 +110,355 @@ void early_dump_pci_devices(void)
>               }
>       }
>  }
> +
> +#define PCI_EXP_SAVE_REGS    7
> +#define pcie_cap_has_devctl(type, flags)     1
> +#define pcie_cap_has_lnkctl(type, flags)             \
> +             ((flags & PCI_EXP_FLAGS_VERS) > 1 ||    \
> +              (type == PCI_EXP_TYPE_ROOT_PORT ||     \
> +               type == PCI_EXP_TYPE_ENDPOINT ||      \
> +               type == PCI_EXP_TYPE_LEG_END))
> +#define pcie_cap_has_sltctl(type, flags)             \
> +             ((flags & PCI_EXP_FLAGS_VERS) > 1 ||    \
> +              ((type == PCI_EXP_TYPE_ROOT_PORT) ||   \
> +               (type == PCI_EXP_TYPE_DOWNSTREAM &&   \
> +                (flags & PCI_EXP_FLAGS_SLOT))))
> +#define pcie_cap_has_rtctl(type, flags)                      \
> +             ((flags & PCI_EXP_FLAGS_VERS) > 1 ||    \
> +              (type == PCI_EXP_TYPE_ROOT_PORT ||     \
> +               type == PCI_EXP_TYPE_RC_EC))
> +
> +struct save_config {
> +     u32 pci[16];
> +     u16 pcie[PCI_EXP_SAVE_REGS];
> +};
> +
> +struct pcie_dev {
> +     int cap;   /* position of PCI Express capability */
> +     int flags; /* PCI_EXP_FLAGS */
> +     struct save_config save; /* saved configuration register */
> +};
> +
> +struct pcie_port {
> +     struct list_head dev;
> +     u8 bus;
> +     u8 slot;
> +     u8 func;
> +     u8 secondary;
> +     struct pcie_dev child[PCI_MAX_FUNCTIONS];
> +};
> +
> +static LIST_HEAD(device_list);
> +static void __init pci_udelay(int loops)
> +{
> +     while (loops--) {
> +             /* Approximately 1 us */
> +             native_io_delay();
> +     }
> +}
> +
> +/* Derived from drivers/pci/pci.c */
> +#define PCI_FIND_CAP_TTL     48
> +static int __init __pci_find_next_cap_ttl(u8 bus, u8 slot, u8 func,
> +                                       u8 pos, int cap, int *ttl)
> +{
> +     u8 id;
> +
> +     while ((*ttl)--) {
> +             pos = read_pci_config_byte(bus, slot, func, pos);
> +             if (pos < 0x40)
> +                     break;
> +             pos &= ~3;
> +             id = read_pci_config_byte(bus, slot, func,
> +                                     pos + PCI_CAP_LIST_ID);
> +             if (id == 0xff)
> +                     break;
> +             if (id == cap)
> +                     return pos;
> +             pos += PCI_CAP_LIST_NEXT;
> +     }
> +     return 0;
> +}
> +
> +static int __init __pci_find_next_cap(u8 bus, u8 slot, u8 func, u8 pos, int 
> cap)
> +{
> +     int ttl = PCI_FIND_CAP_TTL;
> +
> +     return __pci_find_next_cap_ttl(bus, slot, func, pos, cap, &ttl);
> +}
> +
> +static int __init __pci_bus_find_cap_start(u8 bus, u8 slot, u8 func,
> +                                        u8 hdr_type)
> +{
> +     u16 status;
> +
> +     status = read_pci_config_16(bus, slot, func, PCI_STATUS);
> +     if (!(status & PCI_STATUS_CAP_LIST))
> +             return 0;
> +
> +     switch (hdr_type) {
> +     case PCI_HEADER_TYPE_NORMAL:
> +     case PCI_HEADER_TYPE_BRIDGE:
> +             return PCI_CAPABILITY_LIST;
> +     case PCI_HEADER_TYPE_CARDBUS:
> +             return PCI_CB_CAPABILITY_LIST;
> +     default:
> +             return 0;
> +     }
> +
> +     return 0;
> +}
> +
> +static int __init early_pci_find_capability(u8 bus, u8 slot, u8 func, int 
> cap)
> +{
> +     int pos;
> +     u8 type = read_pci_config_byte(bus, slot, func, PCI_HEADER_TYPE);
> +
> +     pos = __pci_bus_find_cap_start(bus, slot, func, type & 0x7f);
> +     if (pos)
> +             pos = __pci_find_next_cap(bus, slot, func, pos, cap);
> +
> +     return pos;
> +}
> +
> +static void __init do_reset(u8 bus, u8 slot, u8 func)
> +{
> +     u16 ctrl;
> +
> +     printk(KERN_INFO "pci 0000:%02x:%02x.%d reset\n", bus, slot, func);
> +
> +     /* Assert Secondary Bus Reset */
> +     ctrl = read_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL);
> +     ctrl |= PCI_BRIDGE_CTL_BUS_RESET;
> +     write_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL, ctrl);
> +
> +     /*
> +      * PCIe spec requires software to ensure a minimum reset duration
> +      * (Trst == 1ms). We have here 5ms safety margin because pci_udelay is
> +      * not precise.
> +      */
> +     pci_udelay(5000);
> +
> +     /* De-assert Secondary Bus Reset */
> +     ctrl &= ~PCI_BRIDGE_CTL_BUS_RESET;
> +     write_pci_config_16(bus, slot, func, PCI_BRIDGE_CONTROL, ctrl);
> +}
> +
> +static void __init save_state(unsigned bus, unsigned slot, unsigned func,
> +             struct pcie_dev *dev)
> +{
> +     int i;
> +     int pcie, flags, pcie_type;
> +     struct save_config *save;
> +
> +     pcie = dev->cap;
> +     flags = dev->flags;
> +     pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
> +     save = &dev->save;
> +
> +     printk(KERN_INFO "pci 0000:%02x:%02x.%d save state\n", bus, slot, func);
> +
> +     for (i = 0; i < 16; i++)
> +             save->pci[i] = read_pci_config(bus, slot, func, i * 4);
> +     i = 0;
> +     if (pcie_cap_has_devctl(pcie_type, flags))
> +             save->pcie[i++] = read_pci_config_16(bus, slot, func,
> +                                                   pcie + PCI_EXP_DEVCTL);
> +     if (pcie_cap_has_lnkctl(pcie_type, flags))
> +             save->pcie[i++] = read_pci_config_16(bus, slot, func,
> +                                                   pcie + PCI_EXP_LNKCTL);
> +     if (pcie_cap_has_sltctl(pcie_type, flags))
> +             save->pcie[i++] = read_pci_config_16(bus, slot, func,
> +                                                   pcie + PCI_EXP_SLTCTL);
> +     if (pcie_cap_has_rtctl(pcie_type, flags))
> +             save->pcie[i++] = read_pci_config_16(bus, slot, func,
> +                                                   pcie + PCI_EXP_RTCTL);
> +
> +     if ((flags & PCI_EXP_FLAGS_VERS) >= 2) {
> +             save->pcie[i++] = read_pci_config_16(bus, slot, func,
> +                                                   pcie + PCI_EXP_DEVCTL2);
> +             save->pcie[i++] = read_pci_config_16(bus, slot, func,
> +                                                   pcie + PCI_EXP_LNKCTL2);
> +             save->pcie[i++] = read_pci_config_16(bus, slot, func,
> +                                                   pcie + PCI_EXP_SLTCTL2);
> +     }
> +}
> +
> +static void __init restore_state(unsigned bus, unsigned slot, unsigned func,
> +             struct pcie_dev *dev)
> +{
> +     int i = 0;
> +     int pcie, flags, pcie_type;
> +     struct save_config *save;
> +
> +     pcie = dev->cap;
> +     flags = dev->flags;
> +     pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
> +     save = &dev->save;
> +
> +     printk(KERN_INFO "pci 0000:%02x:%02x.%d restore state\n",
> +            bus, slot, func);
> +
> +     if (pcie_cap_has_devctl(pcie_type, flags))
> +             write_pci_config_16(bus, slot, func,
> +                                 pcie + PCI_EXP_DEVCTL, save->pcie[i++]);
> +     if (pcie_cap_has_lnkctl(pcie_type, flags))
> +             write_pci_config_16(bus, slot, func,
> +                                 pcie + PCI_EXP_LNKCTL, save->pcie[i++]);
> +     if (pcie_cap_has_sltctl(pcie_type, flags))
> +             write_pci_config_16(bus, slot, func,
> +                                 pcie + PCI_EXP_SLTCTL, save->pcie[i++]);
> +     if (pcie_cap_has_rtctl(pcie_type, flags))
> +             write_pci_config_16(bus, slot, func,
> +                                 pcie + PCI_EXP_RTCTL, save->pcie[i++]);
> +
> +     if ((flags & PCI_EXP_FLAGS_VERS) >= 2) {
> +             write_pci_config_16(bus, slot, func,
> +                                 pcie + PCI_EXP_DEVCTL2, save->pcie[i++]);
> +             write_pci_config_16(bus, slot, func,
> +                                 pcie + PCI_EXP_LNKCTL2, save->pcie[i++]);
> +             write_pci_config_16(bus, slot, func,
> +                                 pcie + PCI_EXP_SLTCTL2, save->pcie[i++]);
> +     }
> +
> +     for (i = 15; i >= 0; i--)
> +             write_pci_config(bus, slot, func, i * 4, save->pci[i]);
> +}
> +
> +static void __init find_pcie_device(unsigned bus, unsigned slot, unsigned 
> func)
> +{
> +     int f, count;
> +     int pcie, pcie_type;
> +     u8 type;
> +     u16 vendor, flags;
> +     u32 class;
> +     int secondary;
> +     struct pcie_port *port;
> +     int pcie_cap[PCI_MAX_FUNCTIONS];
> +     int pcie_flags[PCI_MAX_FUNCTIONS];
> +
> +     pcie = early_pci_find_capability(bus, slot, func, PCI_CAP_ID_EXP);
> +     if (!pcie)
> +             return;
> +
> +     flags = read_pci_config_16(bus, slot, func, pcie + PCI_EXP_FLAGS);
> +     pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
> +     if ((pcie_type != PCI_EXP_TYPE_ROOT_PORT) &&
> +         (pcie_type != PCI_EXP_TYPE_DOWNSTREAM))
> +             return;
> +
> +     type = read_pci_config_byte(bus, slot, func, PCI_HEADER_TYPE);
> +     if ((type & 0x7f) != PCI_HEADER_TYPE_BRIDGE)
> +             return;
> +     secondary = read_pci_config_byte(bus, slot, func, PCI_SECONDARY_BUS);
> +
> +     memset(pcie_cap, 0, sizeof(pcie_cap));
> +     memset(pcie_flags, 0, sizeof(pcie_flags));
> +     for (count = 0, f = 0; f < PCI_MAX_FUNCTIONS; f++) {
> +             vendor = read_pci_config_16(secondary, 0, f, PCI_VENDOR_ID);
> +             if (vendor == 0xffff)
> +                     continue;
> +
> +             pcie = early_pci_find_capability(secondary, 0, f,
> +                             PCI_CAP_ID_EXP);
> +             if (!pcie)
> +                     continue;
> +
> +             flags = read_pci_config_16(secondary, 0, f,
> +                             pcie + PCI_EXP_FLAGS);
> +             pcie_type = (flags & PCI_EXP_FLAGS_TYPE) >> 4;
> +             if ((pcie_type == PCI_EXP_TYPE_UPSTREAM) ||
> +                 (pcie_type == PCI_EXP_TYPE_PCI_BRIDGE))
> +                     /* Don't reset switch, bridge */
> +                     return;
> +
> +             class = read_pci_config(secondary, 0, f, PCI_CLASS_REVISION);
> +             if ((class >> 24) == PCI_BASE_CLASS_DISPLAY)
> +                     /* Don't reset VGA device */
> +                     return;
> +
> +             count++;
> +             pcie_cap[f] = pcie;
> +             pcie_flags[f] = flags;
> +     }
> +
> +     if (!count)
> +             return;
> +
> +     port = (struct pcie_port *)alloc_bootmem(sizeof(struct pcie_port));
> +     if (port == NULL) {
> +             printk(KERN_ERR "pci 0000:%02x:%02x.%d alloc_bootmem failed\n",
> +                    bus, slot, func);
> +             return;
> +     }
> +     memset(port, 0, sizeof(*port));
> +     port->bus = bus;
> +     port->slot = slot;
> +     port->func = func;
> +     port->secondary = secondary;
> +     for (f = 0; f < PCI_MAX_FUNCTIONS; f++) {
> +             if (pcie_cap[f] != 0) {
> +                     port->child[f].cap = pcie_cap[f];
> +                     port->child[f].flags = pcie_flags[f];
> +                     save_state(secondary, 0, f, &port->child[f]);
> +             }
> +     }
> +     list_add_tail(&port->dev, &device_list);
> +}
> +
> +void __init early_reset_pcie_devices(void)
> +{
> +     unsigned bus, slot, func;
> +     struct pcie_port *port, *tmp;
> +
> +     if (!early_pci_allowed() || !reset_devices)
> +             return;
> +
> +     /* Find PCIe port and save config registers of its downstream devices */
> +     for (bus = 0; bus < 256; bus++) {
> +             for (slot = 0; slot < 32; slot++) {
> +                     for (func = 0; func < PCI_MAX_FUNCTIONS; func++) {
> +                             u16 vendor;
> +                             u8 type;
> +                             vendor = read_pci_config_16(bus, slot, func,
> +                                             PCI_VENDOR_ID);
> +
> +                             if (vendor == 0xffff)
> +                                     continue;
> +
> +                             find_pcie_device(bus, slot, func);
> +
> +                             if (func == 0) {
> +                                     type = read_pci_config_byte(bus, slot,
> +                                                                 func,
> +                                                            PCI_HEADER_TYPE);
> +                                     if (!(type & 0x80))
> +                                             break;
> +                             }
> +                     }
> +             }
> +     }
> +
> +     if (list_empty(&device_list))
> +             return;
> +
> +     /* Do bus reset */
> +     list_for_each_entry(port, &device_list, dev)
> +             do_reset(port->bus, port->slot, port->func);
> +
> +     /*
> +      * According to PCIe spec, software must wait a minimum of 100 ms
> +      * before sending a configuration request. We have 500ms safety margin
> +      * here.
> +      */
> +     pci_udelay(500000);
> +
> +     /* Restore config registers and free memory */
> +     list_for_each_entry_safe(port, tmp, &device_list, dev) {
> +             for (func = 0; func < PCI_MAX_FUNCTIONS; func++)
> +                     if (port->child[func].cap)
> +                             restore_state(port->secondary, 0, func,
> +                                           &port->child[func]);
> +             free_bootmem(__pa(port), sizeof(*port));
> +     }
> +}
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index ee21795..eca3231 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -35,6 +35,8 @@
>  /* Include the ID list */
>  #include <linux/pci_ids.h>
>  
> +#define PCI_MAX_FUNCTIONS 8
> +
>  /* pci_slot represents a physical slot */
>  struct pci_slot {
>       struct pci_bus *bus;            /* The bus this slot is on */
> diff --git a/init/main.c b/init/main.c
> index 9cf77ab..0eb7430 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -144,10 +144,10 @@ EXPORT_SYMBOL(reset_devices);
>  static int __init set_reset_devices(char *str)
>  {
>       reset_devices = 1;
> -     return 1;
> +     return 0;
>  }
>  
> -__setup("reset_devices", set_reset_devices);
> +early_param("reset_devices", set_reset_devices);
>  
>  static const char * argv_init[MAX_INIT_ARGS+2] = { "init", NULL, };
>  const char * envp_init[MAX_INIT_ENVS+2] = { "HOME=/", "TERM=linux", NULL, };
> 
> 
> _______________________________________________
> kexec mailing list
> ke...@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec

This looks good. Good catch on access to downstream devices after reset.
It is certainly safer to save all config registers before any resets.

One thing I am concerned about is whether a reset would affect the SR-IOV
extended capability registers. If so, should save_state() save those
registers as well? Alex Williamson (cc'd) can possibly comment on that aspect.

--
Khalid

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to