On Mon, May 11, 2015 at 05:02:08PM +1000, Alexey Kardashevskiy wrote:
>On 05/11/2015 04:25 PM, Gavin Shan wrote:
>>On Sat, May 09, 2015 at 10:43:23PM +1000, Alexey Kardashevskiy wrote:
>>>On 05/01/2015 04:02 PM, Gavin Shan wrote:
>>>>The original code doesn't support releasing PEs dynamically, meaning
>>>>that PE and the associated resources (IO, M32, M64 and DMA) can't
>>>>be released when unplugging a PCI adapter from one hotpluggable slot.
>>>>
>>>>The patch takes object oriented methodology, introducs reference
>>>>count to PE, which is initialized to 1 and increased with 1 when a
>>>>new PCI device joins the PE. Once the last PCI device leaves the
>>>>PE, the PE is going to be release together with its associated
>>>>(IO, M32, M64, DMA) resources.
>>>
>>>
>>>Too little commit log for non-trivial non-cut-n-paste 30KB patch...
>>>
>>
>>Ok. I'll add more details in next revision.
>>
>>>>
>>>>Signed-off-by: Gavin Shan <gws...@linux.vnet.ibm.com>
>>>>---
>>>>  arch/powerpc/include/asm/pci-bridge.h     |   3 +
>>>>  arch/powerpc/kernel/pci-hotplug.c         |   5 +
>>>>  arch/powerpc/platforms/powernv/pci-ioda.c | 658 
>>>> +++++++++++++++++++-----------
>>>>  arch/powerpc/platforms/powernv/pci.h      |   4 +-
>>>>  4 files changed, 432 insertions(+), 238 deletions(-)
>>>>
>>>>diff --git a/arch/powerpc/include/asm/pci-bridge.h 
>>>>b/arch/powerpc/include/asm/pci-bridge.h
>>>>index 5367eb3..a6ad4b1 100644
>>>>--- a/arch/powerpc/include/asm/pci-bridge.h
>>>>+++ b/arch/powerpc/include/asm/pci-bridge.h
>>>>@@ -31,6 +31,9 @@ struct pci_controller_ops {
>>>>    resource_size_t (*window_alignment)(struct pci_bus *, unsigned long 
>>>> type);
>>>>    void            (*setup_bridge)(struct pci_bus *, unsigned long);
>>>>    void            (*reset_secondary_bus)(struct pci_dev *dev);
>>>>+
>>>>+   /* Called when PCI device is released */
>>>>+   void            (*release_device)(struct pci_dev *);
>>>>  };
>>>>
>>>>  /*
>>>>diff --git a/arch/powerpc/kernel/pci-hotplug.c 
>>>>b/arch/powerpc/kernel/pci-hotplug.c
>>>>index 7ed85a6..0040343 100644
>>>>--- a/arch/powerpc/kernel/pci-hotplug.c
>>>>+++ b/arch/powerpc/kernel/pci-hotplug.c
>>>>@@ -29,6 +29,11 @@
>>>>   */
>>>>  void pcibios_release_device(struct pci_dev *dev)
>>>>  {
>>>>+   struct pci_controller *hose = pci_bus_to_host(dev->bus);
>>>>+
>>>>+   if (hose->controller_ops.release_device)
>>>>+           hose->controller_ops.release_device(dev);
>>>>+
>>>>    eeh_remove_device(dev);
>>>>  }
>>>>
>>>>diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
>>>>b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>index 910fb67..ef8c216 100644
>>>>--- a/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
>>>>@@ -12,6 +12,8 @@
>>>>  #undef DEBUG
>>>>
>>>>  #include <linux/kernel.h>
>>>>+#include <linux/atomic.h>
>>>>+#include <linux/kref.h>
>>>>  #include <linux/pci.h>
>>>>  #include <linux/crash_dump.h>
>>>>  #include <linux/debugfs.h>
>>>>@@ -47,6 +49,8 @@
>>>>  /* 256M DMA window, 4K TCE pages, 8 bytes TCE */
>>>>  #define TCE32_TABLE_SIZE  ((0x10000000 / 0x1000) * 8)
>>>>
>>>>+static void pnv_ioda_release_pe(struct kref *kref);
>>>>+
>>>>  static void pe_level_printk(const struct pnv_ioda_pe *pe, const char 
>>>> *level,
>>>>                        const char *fmt, ...)
>>>>  {
>>>>@@ -123,25 +127,400 @@ static inline bool pnv_pci_is_mem_pref_64(unsigned 
>>>>long flags)
>>>>            (IORESOURCE_MEM_64 | IORESOURCE_PREFETCH));
>>>>  }
>>>>
>>>>-static void pnv_ioda_reserve_pe(struct pnv_phb *phb, int pe_no)
>>>>+static inline void pnv_ioda_pe_get(struct pnv_ioda_pe *pe)
>>>>  {
>>>>-   if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
>>>>-           pr_warn("%s: Invalid PE %d on PHB#%x\n",
>>>>-                   __func__, pe_no, phb->hose->global_number);
>>>>+   if (!pe)
>>>>+           return;
>>>>+
>>>>+   kref_get(&pe->kref);
>>>>+}
>>>>+
>>>>+static inline void pnv_ioda_pe_put(struct pnv_ioda_pe *pe)
>>>>+{
>>>>+   unsigned int count;
>>>>+
>>>>+   if (!pe)
>>>>            return;
>>>>+
>>>>+   /*
>>>>+    * The count is initialized to 1 and increased with 1 when
>>>>+    * a new PCI device is bound with the PE. Once the last PCI
>>>>+    * device is leaving from the PE, the PE is going to be
>>>>+    * released.
>>>>+    */
>>>>+   count = atomic_read(&pe->kref.refcount);
>>>>+   if (count == 2)
>>>>+           kref_sub(&pe->kref, 2, pnv_ioda_release_pe);
>>>>+   else
>>>>+           kref_put(&pe->kref, pnv_ioda_release_pe);
>>>
>>>
>>>What if pnv_ioda_pe_get() gets called between atomic_read() and kref_sub()?
>>>
>>
>>Yeah, that would have problem. But it shouldn't happen because the
>>PCI devices are joining the parent PE# in strictly serialized mode.
>>Same thing happens when detaching PCI devices from its parent PE.
>
>
>oookay. Another thing then - why is this kref counter initialized to 1?
>It would make sense if you did something special when the counter becomes 1
>after decrement but you do not.
>
>Also, this kref thing makes sense if you do kref_put() in multiple places and
>do not know which one will be the last one so you pass the callback to all of
>them. Here you do kref_put/sub in one place and you read the counter - so you
>can call pnv_ioda_release_pe() directly. And it feels like a simple atomic_t
>would do the job just fine. If you still feel that the counter should start
>from 1, there are atomic_dec_if_positive() and atomic_inc_not_zero() and
>others.
>

It's good question actually. The counter is initialized to 1 when the PE
is reserved because of M64 requirement or allocated for non-M64 case. If
we reserve or allocate PE#, there is one thing for sure: the PCI bus has
one PCI device (including PCI bridge) at least. After the PE# is reserved
or allocated, the PCI device joins the PE with the result of increasing
the counter with 1. It means the counter is 2 when PE contains one PCI
device, and 3 when there're 2 devices. One reason for this design is that
we just need decrease the counter if we have to release this PE in the
window between PE reservation/allocation and first PCI device joins. I
think you're correct that we can call pnv_ioda_release_pe() in this window.
In this way, the counter is always reflecting the number of PCI devices
the PE contains.

>>>>+}
>>>>+
>>>>+static void pnv_pci_release_device(struct pci_dev *pdev)
>>>>+{
>>>>+   struct pci_controller *hose = pci_bus_to_host(pdev->bus);
>>>>+   struct pnv_phb *phb = hose->private_data;
>>>>+   struct pci_dn *pdn = pci_get_pdn(pdev);
>>>>+   struct pnv_ioda_pe *pe;
>>>>+
>>>>+   if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>>>+           pe = &phb->ioda.pe_array[pdn->pe_number];
>>>>+           pnv_ioda_pe_put(pe);
>>>>+           pdn->pe_number = IODA_INVALID_PE;
>>>>    }
>>>>+}
>>>>
>>>>-   if (test_and_set_bit(pe_no, phb->ioda.pe_alloc)) {
>>>>-           pr_warn("%s: PE %d was assigned on PHB#%x\n",
>>>>-                   __func__, pe_no, phb->hose->global_number);
>>>>+static void pnv_ioda_release_pe_dma(struct pnv_ioda_pe *pe)
>>>>+{
>>>>+   struct pnv_phb *phb = pe->phb;
>>>>+   int index, count;
>>>>+   unsigned long tbl_addr, tbl_size;
>>>>+
>>>>+   /* No DMA capability for slave PEs */
>>>>+   if (pe->flags & PNV_IODA_PE_SLAVE)
>>>>+           return;
>>>>+
>>>>+   /* Bypass DMA window */
>>>>+   if (phb->type == PNV_PHB_IODA2 &&
>>>>+       pe->tce_bypass_enabled &&
>>>>+       pe->tce32_table &&
>>>>+       pe->tce32_table->set_bypass)
>>>>+           pe->tce32_table->set_bypass(pe->tce32_table, false);
>>>>+
>>>>+   /* 32-bits DMA window */
>>>>+   count = pe->tce32_seg_end - pe->tce32_seg_start;
>>>>+   tbl_addr = pe->tce32_table->it_base;
>>>>+   if (!count)
>>>>            return;
>>>>+
>>>>+   /* Free IOMMU table */
>>>>+   iommu_free_table(pe->tce32_table,
>>>>+                    of_node_full_name(phb->hose->dn));
>>>>+
>>>>+   /* Deconfigure TCE table */
>>>>+   switch (phb->type) {
>>>>+   case PNV_PHB_IODA1:
>>>>+           for (index = 0; index < count; index++)
>>>>+                   opal_pci_map_pe_dma_window(phb->opal_id,
>>>>+                                              pe->pe_number,
>>>>+                                              pe->tce32_seg_start + index,
>>>>+                                              1,
>>>>+                                              __pa(tbl_addr) +
>>>>+                                              index * TCE32_TABLE_SIZE,
>>>>+                                              0,
>>>>+                                              0x1000);
>>>>+           bitmap_clear(phb->ioda.tce32_segmap,
>>>>+                        pe->tce32_seg_start,
>>>>+                        count);
>>>>+           tbl_size = TCE32_TABLE_SIZE * count;
>>>>+           break;
>>>>+   case PNV_PHB_IODA2:
>>>>+           opal_pci_map_pe_dma_window(phb->opal_id,
>>>>+                                      pe->pe_number,
>>>>+                                      pe->pe_number << 1,
>>>>+                                      1,
>>>>+                                      __pa(tbl_addr),
>>>>+                                      0,
>>>>+                                      0x1000);
>>>>+           tbl_size = (1ul << ilog2(phb->ioda.m32_pci_base));
>>>>+           tbl_size = (tbl_size >> IOMMU_PAGE_SHIFT_4K) * 8;
>>>>+           break;
>>>>+   default:
>>>>+           pe_warn(pe, "Unsupported PHB type %d\n", phb->type);
>>>>+           return;
>>>>+   }
>>>>+
>>>>+   /* Free memory of IOMMU table */
>>>>+   free_pages(tbl_addr, get_order(tbl_size));
>>>
>>>
>>>You just programmed the table address to TVT and then you are releasing the
>>>pages. It does not seem right, it will leave garbage in TVT. Also, I am
>>>adding helpers to alloc/free TCE pages in DDW patchset, you could reuse bits
>>>from there (I'll post v10 soon, you'll be in copy and you'll have to review
>>>that ;) ).
>>>
>>
>>I assume you're talking about TVE. I don't understand how garbage will be left
>>in TVE. opal_pci_map_pe_dma_window(), which is handled by skiboot, clear TVE
>>with zero'ed "tce_table_size". The pages previously allocated for TCE table is
>>released to buddy system, which can be allocated by somebody else (from buddy
>>or slab).
>
>opal_pci_map_pe_dma_window() takes __pa(tbl_addr) which points to some memory
>which is still allocated. This value goes to a table (which has 2 entries per
>PE, one for 32bit DMA window and one for bypass/hugewindow) which PHB uses to
>get the actual TCE table address. What is the name of this table? :) Anyway,
>you write an address there and then you call free_pages() so after
>free_pages(), the value in that TVE/TVT/whatever table is a garbage.
>

I don't look into your DDW code yet. Before we have DDW patchset, the bypass
TVE (window) isn't supposed to have corresponding TCE table. I guess you might
change the behaviour in your DDW patchset and I'll take a close look on that.
For DMA32 window, which is the name of the table, the TVE is cleared by skiboot
when having zero "tce_table_size" argument.

        opal_pci_map_pe_dma_window(phb->opal_id,
                                   pe->pe_number,
                                   pe->pe_number << 1,
                                   1,
                                   __pa(tbl_addr),
                                   0,                   <<<< "tce_table_size".
                                   0x1000);

>
>>
>>Ok. Please put me into the cc list. I guess the whole series of patches is
>>better to rebased on your DDW patchset, which is to be merged first, I 
>>believe.
>>
>>>
>>>>+   pe->tce32_table = NULL;
>>>>+   pe->tce32_seg_start = 0;
>>>>+   pe->tce32_seg_end = 0;
>>>>+}
>>>>+
>>>>+static void pnv_ioda_release_pe_seg(struct pnv_ioda_pe *pe)
>>>>+{
>>>>+   struct pnv_phb *phb = pe->phb;
>>>>+   unsigned long *segmap = NULL, *pe_segmap = NULL;
>>>>+   int i;
>>>>+   uint16_t win, win_type[] = { OPAL_IO_WINDOW_TYPE,
>>>>+                                OPAL_M32_WINDOW_TYPE,
>>>>+                                OPAL_M64_WINDOW_TYPE };
>>>>+
>>>>+   for (win = 0; win < ARRAY_SIZE(win_type); win++) {
>>>>+           switch (win_type[win]) {
>>>>+           case OPAL_IO_WINDOW_TYPE:
>>>>+                   segmap = phb->ioda.io_segmap;
>>>>+                   pe_segmap = pe->io_segmap;
>>>>+                   break;
>>>>+           case OPAL_M32_WINDOW_TYPE:
>>>>+                   segmap = phb->ioda.m32_segmap;
>>>>+                   pe_segmap = pe->m32_segmap;
>>>>+                   break;
>>>>+           case OPAL_M64_WINDOW_TYPE:
>>>>+                   segmap = phb->ioda.m64_segmap;
>>>>+                   pe_segmap = pe->m64_segmap;
>>>>+                   break;
>>>>+           }
>>>>+           i = -1;
>>>>+           while ((i = find_next_bit(pe_segmap,
>>>>+                   phb->ioda.total_pe, i + 1)) < phb->ioda.total_pe) {
>>>>+                   if (win_type[win] == OPAL_IO_WINDOW_TYPE ||
>>>>+                       win_type[win] == OPAL_M32_WINDOW_TYPE)
>>>>+                           opal_pci_map_pe_mmio_window(phb->opal_id,
>>>>+                                           phb->ioda.reserved_pe,
>>>>+                                           win_type[win], 0, i);
>>>>+                   else if (phb->type == PNV_PHB_IODA1)
>>>>+                           opal_pci_map_pe_mmio_window(phb->opal_id,
>>>>+                                           phb->ioda.reserved_pe,
>>>>+                                           win_type[win],
>>>>+                                           i / 8, i % 8);
>>>
>>>The function is called ""release" but it programs something what looks like
>>>reasonable values, is it correct?
>>>
>>
>>It's out of problem, When the segment is deallocated, it's mapped to the
>>reserved PE#.
>>
>>>
>>>
>>>>+
>>>>+                   clear_bit(i, pe_segmap);
>>>>+                   clear_bit(i, segmap);
>>>>+           }
>>>>+   }
>>>>+}
>>>>+
>>>>+static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
>>>>+                             struct pnv_ioda_pe *parent,
>>>>+                             struct pnv_ioda_pe *child,
>>>>+                             bool is_add)
>>>>+{
>>>>+   const char *desc = is_add ? "adding" : "removing";
>>>>+   uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
>>>>+                         OPAL_REMOVE_PE_FROM_DOMAIN;
>>>>+   struct pnv_ioda_pe *slave;
>>>>+   long rc;
>>>>+
>>>>+   /* Parent PE affects child PE */
>>>>+   rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>>>+                           child->pe_number, op);
>>>>+   if (rc != OPAL_SUCCESS) {
>>>>+           pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
>>>>+                   rc, desc);
>>>>+           return -ENXIO;
>>>>+   }
>>>>+
>>>>+   if (!(child->flags & PNV_IODA_PE_MASTER))
>>>>+           return 0;
>>>>+
>>>>+   /* Compound case: parent PE affects slave PEs */
>>>>+   list_for_each_entry(slave, &child->slaves, list) {
>>>>+           rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>>>+                                   slave->pe_number, op);
>>>>+           if (rc != OPAL_SUCCESS) {
>>>>+                   pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
>>>>+                           rc, desc);
>>>>+                   return -ENXIO;
>>>>+           }
>>>>+   }
>>>>+
>>>>+   return 0;
>>>>+}
>>>>+
>>>>+static int pnv_ioda_set_peltv(struct pnv_ioda_pe *pe, bool is_add)
>>>>+{
>>>>+   struct pnv_phb *phb = pe->phb;
>>>>+   struct pnv_ioda_pe *slave;
>>>>+   struct pci_dev *pdev = NULL;
>>>>+   int ret;
>>>>+
>>>>+   /*
>>>>+    * Clear PE frozen state. If it's master PE, we need
>>>>+    * clear slave PE frozen state as well.
>>>>+    */
>>>>+   opal_pci_eeh_freeze_clear(phb->opal_id,
>>>>+                             pe->pe_number,
>>>>+                             OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>>>+   if (pe->flags & PNV_IODA_PE_MASTER) {
>>>>+           list_for_each_entry(slave, &pe->slaves, list) {
>>>>+                   opal_pci_eeh_freeze_clear(phb->opal_id,
>>>>+                                             slave->pe_number,
>>>>+                                             
>>>>OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>>>+           }
>>>>+   }
>>>>+
>>>>+   /*
>>>>+    * Associate PE in PELT. We need add the PE into the
>>>>+    * corresponding PELT-V as well. Otherwise, the error
>>>>+    * originated from the PE might contribute to other
>>>>+    * PEs.
>>>>+    */
>>>>+   ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
>>>>+   if (ret)
>>>>+           return ret;
>>>>+
>>>>+   /* For compound PEs, any one affects all of them */
>>>>+   if (pe->flags & PNV_IODA_PE_MASTER) {
>>>>+           list_for_each_entry(slave, &pe->slaves, list) {
>>>>+                   ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
>>>>+                   if (ret)
>>>>+                           return ret;
>>>>+           }
>>>>+   }
>>>>+
>>>>+   if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
>>>>+           pdev = pe->pbus->self;
>>>>+   else if (pe->flags & PNV_IODA_PE_DEV)
>>>>+           pdev = pe->pdev->bus->self;
>>>>+#ifdef CONFIG_PCI_IOV
>>>>+   else if (pe->flags & PNV_IODA_PE_VF)
>>>>+           pdev = pe->parent_dev->bus->self;
>>>>+#endif /* CONFIG_PCI_IOV */
>>>>+
>>>>+   while (pdev) {
>>>>+           struct pci_dn *pdn = pci_get_pdn(pdev);
>>>>+           struct pnv_ioda_pe *parent;
>>>>+
>>>>+           if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>>>+                   parent = &phb->ioda.pe_array[pdn->pe_number];
>>>>+                   ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
>>>>+                   if (ret)
>>>>+                           return ret;
>>>>+           }
>>>>+
>>>>+           pdev = pdev->bus->self;
>>>>+   }
>>>>+
>>>>+   return 0;
>>>>+}
>>>>+
>>>>+static void pnv_ioda_deconfigure_pe(struct pnv_ioda_pe *pe)
>>>
>>>
>>>It used to be under #ifdef CONFIG_PCI_IOV, now it is not. Looks like just
>>>moving of this function to a different place deserves a separate patch with a
>>>comment why ("it is going to be used now for non-SRIOV case too" may be?).
>>>
>>
>>Yeah, it makes sense to me. Will fix it up.
>>
>>>
>>>>+{
>>>>+   struct pnv_phb *phb = pe->phb;
>>>>+   struct pci_dev *parent;
>>>>+   uint8_t bcomp, dcomp, fcomp;
>>>>+   long rid_end, rid;
>>>>+   int64_t rc;
>>>>+
>>>>+   /* Tear down MVE */
>>>>+   if (phb->type == PNV_PHB_IODA1 &&
>>>>+       pe->mve_number != -1) {
>>>>+           rc = opal_pci_set_mve(phb->opal_id,
>>>>+                                 pe->mve_number,
>>>>+                                 phb->ioda.reserved_pe);
>>>>+           if (rc != OPAL_SUCCESS)
>>>>+                   pe_warn(pe, "Error %lld unmapping MVE#%d\n",
>>>>+                           rc, pe->mve_number);
>>>>+           rc = opal_pci_set_mve_enable(phb->opal_id,
>>>>+                                        pe->mve_number,
>>>>+                                        OPAL_DISABLE_MVE);
>>>>+           if (rc != OPAL_SUCCESS)
>>>>+                   pe_warn(pe, "Error %lld disabling MVE#%d\n",
>>>>+                           rc, pe->mve_number);
>>>>+           pe->mve_number = -1;
>>>>+   }
>>>>+
>>>>+   /* Unmapping PELTV */
>>>>+   pnv_ioda_set_peltv(pe, false);
>>>>+
>>>>+   /* To unmap PELTM */
>>>>+   if (pe->pbus) {
>>>>+           int count;
>>>>+
>>>>+           dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
>>>>+           fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
>>>>+           parent = pe->pbus->self;
>>>>+           if (pe->flags & PNV_IODA_PE_BUS_ALL)
>>>>+                   count = pe->pbus->busn_res.end -
>>>>+                           pe->pbus->busn_res.start + 1;
>>>>+           else
>>>>+                   count = 1;
>>>>+
>>>>+           switch(count) {
>>>>+           case  1: bcomp = OpalPciBusAll;   break;
>>>>+           case  2: bcomp = OpalPciBus7Bits; break;
>>>>+           case  4: bcomp = OpalPciBus6Bits; break;
>>>>+           case  8: bcomp = OpalPciBus5Bits; break;
>>>>+           case 16: bcomp = OpalPciBus4Bits; break;
>>>>+           case 32: bcomp = OpalPciBus3Bits; break;
>>>>+           default:
>>>>+                   /* Fail back to case of one bus */
>>>>+                   pe_warn(pe, "Cannot support %d buses\n", count);
>>>>+                   bcomp = OpalPciBusAll;
>>>>+           }
>>>>+           rid_end = pe->rid + (count << 8);
>>>>+   } else {
>>>>+#ifdef CONFIG_PCI_IOV
>>>>+           if (pe->flags & PNV_IODA_PE_VF)
>>>>+                   parent = pe->parent_dev;
>>>>+           else
>>>>+#endif
>>>>+                   parent = pe->pdev->bus->self;
>>>>+           bcomp = OpalPciBusAll;
>>>>+           dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
>>>>+           fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
>>>>+           rid_end = pe->rid + 1;
>>>>+   }
>>>>+
>>>>+   /* Clear RID mapping */
>>>>+   for (rid = pe->rid; rid < rid_end; rid++)
>>>>+           phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
>>>>+
>>>>+   /* Unmapping PELTM */
>>>>+   rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
>>>>+                        bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
>>>>+   if (rc)
>>>>+           pe_warn(pe, "Error %ld unmapping PELTM\n", rc);
>>>>+}
>>>>+
>>>>+static void pnv_ioda_release_pe(struct kref *kref)
>>>>+{
>>>>+   struct pnv_ioda_pe *pe = container_of(kref, struct pnv_ioda_pe, kref);
>>>>+   struct pnv_ioda_pe *tmp, *slave;
>>>>+   struct pnv_phb *phb = pe->phb;
>>>>+
>>>>+   pnv_ioda_release_pe_dma(pe);
>>>>+   pnv_ioda_release_pe_seg(pe);
>>>>+   pnv_ioda_deconfigure_pe(pe);
>>>>+
>>>>+   /* Release slave PEs for compound PE */
>>>>+   if (pe->flags & PNV_IODA_PE_MASTER) {
>>>>+           list_for_each_entry_safe(slave, tmp, &pe->slaves, list)
>>>>+                   pnv_ioda_pe_put(slave);
>>>>+   }
>>>>+
>>>>+   /* Remove the PE from various list. We need remove slave
>>>>+    * PE from master's list.
>>>>+    */
>>>>+   list_del(&pe->dma_link);
>>>>+   list_del(&pe->list);
>>>>+
>>>>+   /* Free PE number */
>>>>+   clear_bit(pe->pe_number, phb->ioda.pe_alloc);
>>>>+}
>>>>+
>>>>+static struct pnv_ioda_pe *pnv_ioda_init_pe(struct pnv_phb *phb,
>>>>+                                       int pe_no)
>>>>+{
>>>>+   struct pnv_ioda_pe *pe = &phb->ioda.pe_array[pe_no];
>>>>+
>>>>+   kref_init(&pe->kref);
>>>>+   pe->phb = phb;
>>>>+   pe->pe_number = pe_no;
>>>>+   INIT_LIST_HEAD(&pe->dma_link);
>>>>+   INIT_LIST_HEAD(&pe->list);
>>>>+
>>>>+   return pe;
>>>>+}
>>>>+
>>>>+static struct pnv_ioda_pe *pnv_ioda_reserve_pe(struct pnv_phb *phb,
>>>>+                                          int pe_no)
>>>>+{
>>>>+   if (!(pe_no >= 0 && pe_no < phb->ioda.total_pe)) {
>>>>+           pr_warn("%s: Invalid PE %d on PHB#%x\n",
>>>>+                   __func__, pe_no, phb->hose->global_number);
>>>>+           return NULL;
>>>>    }
>>>>
>>>>-   phb->ioda.pe_array[pe_no].phb = phb;
>>>>-   phb->ioda.pe_array[pe_no].pe_number = pe_no;
>>>>+   /*
>>>>+    * Same PE might be reserved for multiple times, which
>>>>+    * is out of problem actually.
>>>>+    */
>>>>+   set_bit(pe_no, phb->ioda.pe_alloc);
>>>>+   return pnv_ioda_init_pe(phb, pe_no);
>>>>  }
>>>>
>>>>-static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
>>>>+static struct pnv_ioda_pe *pnv_ioda_alloc_pe(struct pnv_phb *phb)
>>>>  {
>>>>    unsigned long pe_no;
>>>>    unsigned long limit = phb->ioda.total_pe - 1;
>>>>@@ -154,20 +533,10 @@ static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
>>>>                    break;
>>>>
>>>>            if (--limit >= phb->ioda.total_pe)
>>>>-                   return IODA_INVALID_PE;
>>>>+                   return NULL;
>>>>    } while(1);
>>>>
>>>>-   phb->ioda.pe_array[pe_no].phb = phb;
>>>>-   phb->ioda.pe_array[pe_no].pe_number = pe_no;
>>>>-   return pe_no;
>>>>-}
>>>>-
>>>>-static void pnv_ioda_free_pe(struct pnv_phb *phb, int pe)
>>>>-{
>>>>-   WARN_ON(phb->ioda.pe_array[pe].pdev);
>>>>-
>>>>-   memset(&phb->ioda.pe_array[pe], 0, sizeof(struct pnv_ioda_pe));
>>>>-   clear_bit(pe, phb->ioda.pe_alloc);
>>>>+   return pnv_ioda_init_pe(phb, pe_no);
>>>>  }
>>>>
>>>>  static int pnv_ioda1_init_m64(struct pnv_phb *phb)
>>>>@@ -382,8 +751,9 @@ static void pnv_ioda_reserve_m64_pe(struct pnv_phb *phb,
>>>>    }
>>>>  }
>>>>
>>>>-static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>>>-                           struct pci_bus *bus, int all)
>>>>+static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>>>+                                           struct pci_bus *bus,
>>>>+                                           int all)
>>>
>>>
>>>Mechanic changes like this could easily go to a separate patch.
>>>
>>
>>Indeed. I'll see how I can split the patches up in next revision.
>>Thanks for the suggestion.
>>
>>>>  {
>>>>    resource_size_t segsz = phb->ioda.m64_segsize;
>>>>    struct pci_dev *pdev;
>>>>@@ -394,14 +764,14 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>>>    int i;
>>>>
>>>>    if (!pnv_ioda_need_m64_pe(phb, bus))
>>>>-           return IODA_INVALID_PE;
>>>>+           return NULL;
>>>>
>>>>          /* Allocate bitmap */
>>>>    size = _ALIGN_UP(phb->ioda.total_pe / 8, sizeof(unsigned long));
>>>>    pe_bitsmap = kzalloc(size, GFP_KERNEL);
>>>>    if (!pe_bitsmap) {
>>>>            pr_warn("%s: Out of memory !\n", __func__);
>>>>-           return IODA_INVALID_PE;
>>>>+           return NULL;
>>>>    }
>>>>
>>>>    /* The bridge's M64 window might be extended to PHB's M64
>>>>@@ -438,7 +808,7 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>>>    /* No M64 window found ? */
>>>>    if (bitmap_empty(pe_bitsmap, phb->ioda.total_pe)) {
>>>>            kfree(pe_bitsmap);
>>>>-           return IODA_INVALID_PE;
>>>>+           return NULL;
>>>>    }
>>>>
>>>>    /* Figure out the master PE and put all slave PEs
>>>>@@ -491,7 +861,7 @@ static int pnv_ioda_pick_m64_pe(struct pnv_phb *phb,
>>>>    }
>>>>
>>>>    kfree(pe_bitsmap);
>>>>-   return master_pe->pe_number;
>>>>+   return master_pe;
>>>>  }
>>>>
>>>>  static void __init pnv_ioda_parse_m64_window(struct pnv_phb *phb)
>>>>@@ -695,7 +1065,7 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, 
>>>>int pe_no)
>>>>   * but in the meantime, we need to protect them to avoid warnings
>>>>   */
>>>>  #ifdef CONFIG_PCI_MSI
>>>>-static struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
>>>>+static struct pnv_ioda_pe *pnv_ioda_pci_dev_to_pe(struct pci_dev *dev)
>>>>  {
>>>>    struct pci_controller *hose = pci_bus_to_host(dev->bus);
>>>>    struct pnv_phb *phb = hose->private_data;
>>>>@@ -709,191 +1079,6 @@ static struct pnv_ioda_pe *pnv_ioda_get_pe(struct 
>>>>pci_dev *dev)
>>>>  }
>>>>  #endif /* CONFIG_PCI_MSI */
>>>>
>>>>-static int pnv_ioda_set_one_peltv(struct pnv_phb *phb,
>>>>-                             struct pnv_ioda_pe *parent,
>>>>-                             struct pnv_ioda_pe *child,
>>>>-                             bool is_add)
>>>>-{
>>>>-   const char *desc = is_add ? "adding" : "removing";
>>>>-   uint8_t op = is_add ? OPAL_ADD_PE_TO_DOMAIN :
>>>>-                         OPAL_REMOVE_PE_FROM_DOMAIN;
>>>>-   struct pnv_ioda_pe *slave;
>>>>-   long rc;
>>>>-
>>>>-   /* Parent PE affects child PE */
>>>>-   rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>>>-                           child->pe_number, op);
>>>>-   if (rc != OPAL_SUCCESS) {
>>>>-           pe_warn(child, "OPAL error %ld %s to parent PELTV\n",
>>>>-                   rc, desc);
>>>>-           return -ENXIO;
>>>>-   }
>>>>-
>>>>-   if (!(child->flags & PNV_IODA_PE_MASTER))
>>>>-           return 0;
>>>>-
>>>>-   /* Compound case: parent PE affects slave PEs */
>>>>-   list_for_each_entry(slave, &child->slaves, list) {
>>>>-           rc = opal_pci_set_peltv(phb->opal_id, parent->pe_number,
>>>>-                                   slave->pe_number, op);
>>>>-           if (rc != OPAL_SUCCESS) {
>>>>-                   pe_warn(slave, "OPAL error %ld %s to parent PELTV\n",
>>>>-                           rc, desc);
>>>>-                   return -ENXIO;
>>>>-           }
>>>>-   }
>>>>-
>>>>-   return 0;
>>>>-}
>>>>-
>>>>-static int pnv_ioda_set_peltv(struct pnv_phb *phb,
>>>>-                         struct pnv_ioda_pe *pe,
>>>>-                         bool is_add)
>>>>-{
>>>>-   struct pnv_ioda_pe *slave;
>>>>-   struct pci_dev *pdev = NULL;
>>>>-   int ret;
>>>>-
>>>>-   /*
>>>>-    * Clear PE frozen state. If it's master PE, we need
>>>>-    * clear slave PE frozen state as well.
>>>>-    */
>>>>-   if (is_add) {
>>>>-           opal_pci_eeh_freeze_clear(phb->opal_id, pe->pe_number,
>>>>-                                     OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>>>-           if (pe->flags & PNV_IODA_PE_MASTER) {
>>>>-                   list_for_each_entry(slave, &pe->slaves, list)
>>>>-                           opal_pci_eeh_freeze_clear(phb->opal_id,
>>>>-                                                     slave->pe_number,
>>>>-                                                     
>>>>OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>>>-           }
>>>>-   }
>>>>-
>>>>-   /*
>>>>-    * Associate PE in PELT. We need add the PE into the
>>>>-    * corresponding PELT-V as well. Otherwise, the error
>>>>-    * originated from the PE might contribute to other
>>>>-    * PEs.
>>>>-    */
>>>>-   ret = pnv_ioda_set_one_peltv(phb, pe, pe, is_add);
>>>>-   if (ret)
>>>>-           return ret;
>>>>-
>>>>-   /* For compound PEs, any one affects all of them */
>>>>-   if (pe->flags & PNV_IODA_PE_MASTER) {
>>>>-           list_for_each_entry(slave, &pe->slaves, list) {
>>>>-                   ret = pnv_ioda_set_one_peltv(phb, slave, pe, is_add);
>>>>-                   if (ret)
>>>>-                           return ret;
>>>>-           }
>>>>-   }
>>>>-
>>>>-   if (pe->flags & (PNV_IODA_PE_BUS_ALL | PNV_IODA_PE_BUS))
>>>>-           pdev = pe->pbus->self;
>>>>-   else if (pe->flags & PNV_IODA_PE_DEV)
>>>>-           pdev = pe->pdev->bus->self;
>>>>-#ifdef CONFIG_PCI_IOV
>>>>-   else if (pe->flags & PNV_IODA_PE_VF)
>>>>-           pdev = pe->parent_dev->bus->self;
>>>>-#endif /* CONFIG_PCI_IOV */
>>>>-   while (pdev) {
>>>>-           struct pci_dn *pdn = pci_get_pdn(pdev);
>>>>-           struct pnv_ioda_pe *parent;
>>>>-
>>>>-           if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>>>-                   parent = &phb->ioda.pe_array[pdn->pe_number];
>>>>-                   ret = pnv_ioda_set_one_peltv(phb, parent, pe, is_add);
>>>>-                   if (ret)
>>>>-                           return ret;
>>>>-           }
>>>>-
>>>>-           pdev = pdev->bus->self;
>>>>-   }
>>>>-
>>>>-   return 0;
>>>>-}
>>>>-
>>>>-#ifdef CONFIG_PCI_IOV
>>>>-static int pnv_ioda_deconfigure_pe(struct pnv_phb *phb, struct pnv_ioda_pe 
>>>>*pe)
>>>>-{
>>>>-   struct pci_dev *parent;
>>>>-   uint8_t bcomp, dcomp, fcomp;
>>>>-   int64_t rc;
>>>>-   long rid_end, rid;
>>>>-
>>>>-   /* Currently, we just deconfigure VF PE. Bus PE will always there.*/
>>>>-   if (pe->pbus) {
>>>>-           int count;
>>>>-
>>>>-           dcomp = OPAL_IGNORE_RID_DEVICE_NUMBER;
>>>>-           fcomp = OPAL_IGNORE_RID_FUNCTION_NUMBER;
>>>>-           parent = pe->pbus->self;
>>>>-           if (pe->flags & PNV_IODA_PE_BUS_ALL)
>>>>-                   count = pe->pbus->busn_res.end - 
>>>>pe->pbus->busn_res.start + 1;
>>>>-           else
>>>>-                   count = 1;
>>>>-
>>>>-           switch(count) {
>>>>-           case  1: bcomp = OpalPciBusAll;         break;
>>>>-           case  2: bcomp = OpalPciBus7Bits;       break;
>>>>-           case  4: bcomp = OpalPciBus6Bits;       break;
>>>>-           case  8: bcomp = OpalPciBus5Bits;       break;
>>>>-           case 16: bcomp = OpalPciBus4Bits;       break;
>>>>-           case 32: bcomp = OpalPciBus3Bits;       break;
>>>>-           default:
>>>>-                   dev_err(&pe->pbus->dev, "Number of subordinate buses %d 
>>>>unsupported\n",
>>>>-                           count);
>>>>-                   /* Do an exact match only */
>>>>-                   bcomp = OpalPciBusAll;
>>>>-           }
>>>>-           rid_end = pe->rid + (count << 8);
>>>>-   } else {
>>>>-           if (pe->flags & PNV_IODA_PE_VF)
>>>>-                   parent = pe->parent_dev;
>>>>-           else
>>>>-                   parent = pe->pdev->bus->self;
>>>>-           bcomp = OpalPciBusAll;
>>>>-           dcomp = OPAL_COMPARE_RID_DEVICE_NUMBER;
>>>>-           fcomp = OPAL_COMPARE_RID_FUNCTION_NUMBER;
>>>>-           rid_end = pe->rid + 1;
>>>>-   }
>>>>-
>>>>-   /* Clear the reverse map */
>>>>-   for (rid = pe->rid; rid < rid_end; rid++)
>>>>-           phb->ioda.pe_rmap[rid] = IODA_INVALID_PE;
>>>>-
>>>>-   /* Release from all parents PELT-V */
>>>>-   while (parent) {
>>>>-           struct pci_dn *pdn = pci_get_pdn(parent);
>>>>-           if (pdn && pdn->pe_number != IODA_INVALID_PE) {
>>>>-                   rc = opal_pci_set_peltv(phb->opal_id, pdn->pe_number,
>>>>-                                           pe->pe_number, 
>>>>OPAL_REMOVE_PE_FROM_DOMAIN);
>>>>-                   /* XXX What to do in case of error ? */
>>>
>>>
>>>Not much :) Free associated memory and mark it "dead" so it won't be used
>>>again till reboot. In what circumstance can this opal_pci_set_peltv() fail at
>>>all?
>>>
>>
>>Yeah, maybe. Until now, I didn't see this failure since the code is there
>>from the day. Note the code has been there for almost 4 years since the
>>day Ben wrote it.
>
>
>Sure. But if it starts failing, we won't even notice it - there is no even
>pr_err() or WARN_ON.
>

Agree. I'll see what I can do. At least I can have error message to alert.

>>
>>>
>>>>-           }
>>>>-           parent = parent->bus->self;
>>>>-   }
>>>>-
>>>>-   opal_pci_eeh_freeze_set(phb->opal_id, pe->pe_number,
>>>>-                             OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
>>>>-
>>>>-   /* Disassociate PE in PELT */
>>>>-   rc = opal_pci_set_peltv(phb->opal_id, pe->pe_number,
>>>>-                           pe->pe_number, OPAL_REMOVE_PE_FROM_DOMAIN);
>>>>-   if (rc)
>>>>-           pe_warn(pe, "OPAL error %ld remove self from PELTV\n", rc);
>>>>-   rc = opal_pci_set_pe(phb->opal_id, pe->pe_number, pe->rid,
>>>>-                        bcomp, dcomp, fcomp, OPAL_UNMAP_PE);
>>>>-   if (rc)
>>>>-           pe_err(pe, "OPAL error %ld trying to setup PELT table\n", rc);
>>>>-
>>>>-   pe->pbus = NULL;
>>>>-   pe->pdev = NULL;
>>>>-   pe->parent_dev = NULL;
>>>>-
>>>>-   return 0;
>>>>-}
>>>>-#endif /* CONFIG_PCI_IOV */
>>>>-
>>>>  static int pnv_ioda_configure_pe(struct pnv_phb *phb, struct pnv_ioda_pe 
>>>> *pe)
>>>>  {
>>>>    struct pci_dev *parent;
>>>>@@ -953,7 +1138,7 @@ static int pnv_ioda_configure_pe(struct pnv_phb *phb, 
>>>>struct pnv_ioda_pe *pe)
>>>>    }
>>>>
>>>>    /* Configure PELTV */
>>>>-   pnv_ioda_set_peltv(phb, pe, true);
>>>>+   pnv_ioda_set_peltv(pe, true);
>>>>
>>>>    /* Setup reverse map */
>>>>    for (rid = pe->rid; rid < rid_end; rid++)
>>>>@@ -1207,6 +1392,8 @@ static void pnv_ioda_setup_same_PE(struct pci_bus 
>>>>*bus, struct pnv_ioda_pe *pe)
>>>>            if (pdn->pe_number != IODA_INVALID_PE)
>>>>                    continue;
>>>>
>>>>+           /* Increase reference count of the parent PE */
>>>
>>>When you comment like this, I read it as the comment belongs to the whole
>>>next chunk till the first empty line, i.e. to all 5 lines below, which is not
>>>the case. I'd remove the comment as 1) "pe_get" in pnv_ioda_pe_get() name
>>>suggests incrementing the reference counter 2) "pe" is always parent in this
>>>function. I do not insist though.
>>>
>>
>>Agree on your explaining. I'll remove this unuseful comments.
>>
>>>
>>>>+           pnv_ioda_pe_get(pe);
>>>>            pdn->pe_number = pe->pe_number;
>>>>            pe->dma_weight += pnv_ioda_dev_dma_weight(dev);
>>>>            if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
>>>>@@ -1224,7 +1411,7 @@ static struct pnv_ioda_pe 
>>>>*pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>>>>  {
>>>>    struct pci_controller *hose = pci_bus_to_host(bus);
>>>>    struct pnv_phb *phb = hose->private_data;
>>>>-   struct pnv_ioda_pe *pe;
>>>>+   struct pnv_ioda_pe *pe = NULL;
>>>>    int pe_num = IODA_INVALID_PE;
>>>>
>>>>    /* For partial hotplug case, the PE instance hasn't been destroyed
>>>>@@ -1240,24 +1427,24 @@ static struct pnv_ioda_pe 
>>>>*pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>>>>    }
>>>>
>>>>    /* PE number for root bus should have been reserved */
>>>>-   if (pci_is_root_bus(bus))
>>>>-           pe_num = phb->ioda.root_pe_no;
>>>>+   if (pci_is_root_bus(bus) &&
>>>>+       phb->ioda.root_pe_no != IODA_INVALID_PE)
>>>>+           pe = &phb->ioda.pe_array[phb->ioda.root_pe_no];
>>>>
>>>>    /* Check if PE is determined by M64 */
>>>>-   if (pe_num == IODA_INVALID_PE && phb->pick_m64_pe)
>>>>-           pe_num = phb->pick_m64_pe(phb, bus, all);
>>>>+   if (!pe && phb->pick_m64_pe)
>>>>+           pe = phb->pick_m64_pe(phb, bus, all);
>>>>
>>>>    /* The PE number isn't pinned by M64 */
>>>>-   if (pe_num == IODA_INVALID_PE)
>>>>-           pe_num = pnv_ioda_alloc_pe(phb);
>>>>+   if (!pe)
>>>>+           pe = pnv_ioda_alloc_pe(phb);
>>>>
>>>>-   if (pe_num == IODA_INVALID_PE) {
>>>>-           pr_warning("%s: Not enough PE# available for PCI bus 
>>>>%04x:%02x\n",
>>>>+   if (!pe) {
>>>>+           pr_warn("%s: No enough PE# available for PCI bus %04x:%02x\n",
>>>>                    __func__, pci_domain_nr(bus), bus->number);
>>>>            return NULL;
>>>>    }
>>>>
>>>>-   pe = &phb->ioda.pe_array[pe_num];
>>>>    pe->flags |= (all ? PNV_IODA_PE_BUS_ALL : PNV_IODA_PE_BUS);
>>>>    pe->pbus = bus;
>>>>    pe->pdev = NULL;
>>>>@@ -1274,14 +1461,12 @@ static struct pnv_ioda_pe 
>>>>*pnv_ioda_setup_bus_PE(struct pci_bus *bus, int all)
>>>>
>>>>    if (pnv_ioda_configure_pe(phb, pe)) {
>>>>            /* XXX What do we do here ? */
>>>>-           if (pe_num)
>>>>-                   pnv_ioda_free_pe(phb, pe_num);
>>>>-           pe->pbus = NULL;
>>>>+           pnv_ioda_pe_put(pe);
>>>>            return NULL;
>>>>    }
>>>>
>>>>    pe->tce32_table = kzalloc_node(sizeof(struct iommu_table),
>>>>-                   GFP_KERNEL, hose->node);
>>>>+                                  GFP_KERNEL, hose->node);
>>>
>>>Seems like spaces change only - if you really want this change (which I hate
>>>- makes code look inaccurate to my taste but it seems I am in minority here
>>>:) ), please put it to the separate patch.
>>>
>>
>>Ok. Confirm with you: You prefer the original format? I don't know
>>why I prefer the later one. Maybe my eyes are quite broken :-)
>
>
>I prefer not to change existing whitespaces unless it is done once and for
>the entire file :) Just remove this change from the patch.
>

Sure.

>>>
>>>>    pe->tce32_table->data = pe;
>>>>
>>>>    /* Associate it with all child devices */
>>>>@@ -1521,9 +1706,9 @@ static void pnv_ioda_release_vf_PE(struct pci_dev 
>>>>*pdev, u16 num_vfs)
>>>>            list_del(&pe->list);
>>>>            mutex_unlock(&phb->ioda.pe_list_mutex);
>>>>
>>>>-           pnv_ioda_deconfigure_pe(phb, pe);
>>>>+           pnv_ioda_deconfigure_pe(pe);
>>>
>>>
>>>Is this change necessary to get "Release PEs dynamically" working? Move it to
>>>mechanical changes patch may be?
>>>
>>
>>Ok. I'll try to do that.
>>
>>>
>>>>
>>>>-           pnv_ioda_free_pe(phb, pe->pe_number);
>>>>+           pnv_ioda_pe_put(pe);
>>>>    }
>>>>  }
>>>>
>>>>@@ -1601,9 +1786,7 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev 
>>>>*pdev, u16 num_vfs)
>>>>
>>>>            if (pnv_ioda_configure_pe(phb, pe)) {
>>>>                    /* XXX What do we do here ? */
>>>>-                   if (pe_num)
>>>>-                           pnv_ioda_free_pe(phb, pe_num);
>>>>-                   pe->pdev = NULL;
>>>>+                   pnv_ioda_pe_put(pe);
>>>>                    continue;
>>>>            }
>>>>
>>>>@@ -2263,7 +2446,7 @@ int pnv_phb_to_cxl_mode(struct pci_dev *dev, uint64_t 
>>>>mode)
>>>>    struct pnv_ioda_pe *pe;
>>>>    int rc;
>>>>
>>>>-   pe = pnv_ioda_get_pe(dev);
>>>>+   pe = pnv_ioda_pci_dev_to_pe(dev);
>>>
>>>
>>>And this change could to separately. Not clear how this helps to "Release PEs
>>>dynamically".
>>>
>>>
>>
>>It's not related to "Release PEs dynamically". The change is introduced by
>>the function rename: Original pnv_ioda_get_pe() is renamed to 
>>pnv_ioda_pci_dev_to_pe().
>
>
>But the rename happened in this patch and the patch's subj is "Release PEs
>dynamically" so it should be related somehow or move it to a simple separate
>patch "let's give the lalala function a better name to reflect what it
>actually does" (but in this case the new name does not make any more sense
>than the old one).
>

Yeah, I'll try to split the patches to separate blala and walala :-)

>>>>    if (!pe)
>>>>            return -ENODEV;
>>>>
>>>>@@ -2379,7 +2562,7 @@ int pnv_cxl_ioda_msi_setup(struct pci_dev *dev, 
>>>>unsigned int hwirq,
>>>>    struct pnv_ioda_pe *pe;
>>>>    int rc;
>>>>
>>>>-   if (!(pe = pnv_ioda_get_pe(dev)))
>>>>+   if (!(pe = pnv_ioda_pci_dev_to_pe(dev)))
>>>>            return -ENODEV;
>>>>
>>>>    /* Assign XIVE to PE */
>>>>@@ -2401,7 +2584,7 @@ static int pnv_pci_ioda_msi_setup(struct pnv_phb 
>>>>*phb, struct pci_dev *dev,
>>>>                              unsigned int hwirq, unsigned int virq,
>>>>                              unsigned int is_64, struct msi_msg *msg)
>>>>  {
>>>>-   struct pnv_ioda_pe *pe = pnv_ioda_get_pe(dev);
>>>>+   struct pnv_ioda_pe *pe = pnv_ioda_pci_dev_to_pe(dev);
>>>>    unsigned int xive_num = hwirq - phb->msi_base;
>>>>    __be32 data;
>>>>    int rc;
>>>>@@ -3065,6 +3248,7 @@ static void __init pnv_pci_init_ioda_phb(struct 
>>>>device_node *np,
>>>>    pnv_pci_controller_ops.setup_bridge = pnv_pci_setup_bridge;
>>>>    pnv_pci_controller_ops.window_alignment = pnv_pci_window_alignment;
>>>>    pnv_pci_controller_ops.reset_secondary_bus = 
>>>> pnv_pci_reset_secondary_bus;
>>>>+   pnv_pci_controller_ops.release_device = pnv_pci_release_device;
>>>>    hose->controller_ops = pnv_pci_controller_ops;
>>>>
>>>>  #ifdef CONFIG_PCI_IOV
>>>>diff --git a/arch/powerpc/platforms/powernv/pci.h 
>>>>b/arch/powerpc/platforms/powernv/pci.h
>>>>index 1bea3a8..8b10f01 100644
>>>>--- a/arch/powerpc/platforms/powernv/pci.h
>>>>+++ b/arch/powerpc/platforms/powernv/pci.h
>>>>@@ -28,6 +28,7 @@ enum pnv_phb_model {
>>>>  /* Data associated with a PE, including IOMMU tracking etc.. */
>>>>  struct pnv_phb;
>>>>  struct pnv_ioda_pe {
>>>>+   struct kref             kref;
>>>>    unsigned long           flags;
>>>>    struct pnv_phb          *phb;
>>>>
>>>>@@ -120,7 +121,8 @@ struct pnv_phb {
>>>>    void (*shutdown)(struct pnv_phb *phb);
>>>>    int (*init_m64)(struct pnv_phb *phb);
>>>>    void (*reserve_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus);
>>>>-   int (*pick_m64_pe)(struct pnv_phb *phb, struct pci_bus *bus, int all);
>>>>+   struct pnv_ioda_pe *(*pick_m64_pe)(struct pnv_phb *phb,
>>>>+                                      struct pci_bus *bus, int all);
>>>>    int (*get_pe_state)(struct pnv_phb *phb, int pe_no);
>>>>    void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
>>>>    int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
>>>>

Thanks,
Gavin

_______________________________________________
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Reply via email to