On Wed, May 07, 2025 at 08:58:39AM -0700, Haiyang Zhang wrote: > To collaborate with hardware servicing events, upon receiving the special > EQE notification from the HW channel, remove the devices on this bus. > Then, after a waiting period based on the device specs, rescan the parent > bus to recover the devices. > > Signed-off-by: Haiyang Zhang <haiya...@microsoft.com> > --- > .../net/ethernet/microsoft/mana/gdma_main.c | 61 +++++++++++++++++++ > include/net/mana/gdma.h | 5 +- > 2 files changed, 65 insertions(+), 1 deletion(-) > > diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c > b/drivers/net/ethernet/microsoft/mana/gdma_main.c > index 4ffaf7588885..aa2ccf4d0ec6 100644 > --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c > +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c > @@ -352,11 +352,52 @@ void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit) > } > EXPORT_SYMBOL_NS(mana_gd_ring_cq, "NET_MANA"); > > +#define MANA_SERVICE_PERIOD 10 > + > +struct mana_serv_work { > + struct work_struct serv_work; > + struct pci_dev *pdev; > +}; > + > +static void mana_serv_func(struct work_struct *w) > +{ > + struct mana_serv_work *mns_wk = container_of(w, struct mana_serv_work, > serv_work); > + struct pci_dev *pdev = mns_wk->pdev; > + struct pci_bus *bus, *parent; > + > + if (!pdev) > + goto out; > + > + bus = pdev->bus; > + if (!bus) { > + dev_err(&pdev->dev, "MANA service: no bus\n"); > + goto out; > + } > + > + parent = bus->parent; > + if (!parent) { > + dev_err(&pdev->dev, "MANA service: no parent bus\n"); > + goto out; > + } > + > + pci_stop_and_remove_bus_device_locked(bus->self); > + > + msleep(MANA_SERVICE_PERIOD * 1000); > + > + pci_lock_rescan_remove(); > + pci_rescan_bus(parent); > + pci_unlock_rescan_remove(); > + > +out: > + kfree(mns_wk);
Shouldn't gc->in_service be set to false again? > +} > + > static void mana_gd_process_eqe(struct gdma_queue *eq) > { > u32 head = eq->head % (eq->queue_size / GDMA_EQE_SIZE); > struct gdma_context *gc = eq->gdma_dev->gdma_context; > struct gdma_eqe *eq_eqe_ptr = eq->queue_mem_ptr; > + struct mana_serv_work *mns_wk; > union gdma_eqe_info eqe_info; > enum gdma_eqe_type type; > struct gdma_event event; > @@ -400,6 +441,26 @@ static void mana_gd_process_eqe(struct gdma_queue *eq) > eq->eq.callback(eq->eq.context, eq, &event); > break; > > + case GDMA_EQE_HWC_FPGA_RECONFIG: > + case GDMA_EQE_HWC_SOCMANA_CRASH: Maybe we should also add a log (dev_dbg) here to indicate whether the servicing is for an FPGA reconfig or a socmana crash. > + if (gc->in_service) { > + dev_info(gc->dev, "Already in service\n"); > + break; > + } > + > + mns_wk = kzalloc(sizeof(*mns_wk), GFP_ATOMIC); > + if (!mns_wk) { > + dev_err(gc->dev, "Fail to alloc mana_serv_work\n"); > + break; > + } > + > + dev_info(gc->dev, "Start MANA service\n"); > + gc->in_service = true; > + mns_wk->pdev = to_pci_dev(gc->dev); > + INIT_WORK(&mns_wk->serv_work, mana_serv_func); > + schedule_work(&mns_wk->serv_work); > + break; > + > default: > break; > } > diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h > index 228603bf03f2..13cfbcf67815 100644 > --- a/include/net/mana/gdma.h > +++ b/include/net/mana/gdma.h > @@ -58,8 +58,9 @@ enum gdma_eqe_type { > GDMA_EQE_HWC_INIT_EQ_ID_DB = 129, > GDMA_EQE_HWC_INIT_DATA = 130, > GDMA_EQE_HWC_INIT_DONE = 131, > - GDMA_EQE_HWC_SOC_RECONFIG = 132, > + GDMA_EQE_HWC_FPGA_RECONFIG = 132, > GDMA_EQE_HWC_SOC_RECONFIG_DATA = 133, > + GDMA_EQE_HWC_SOCMANA_CRASH = 135, > GDMA_EQE_RNIC_QP_FATAL = 176, > }; > > @@ -388,6 +389,8 @@ struct gdma_context { > u32 test_event_eq_id; > > bool is_pf; > + bool in_service; > + > phys_addr_t bar0_pa; > void __iomem *bar0_va; > void __iomem *shm_base; > -- > 2.34.1