On Mon, May 12, 2025 at 12:57:54PM -0700, Haiyang Zhang wrote:
> To collaborate with hardware servicing events, upon receiving the special
> EQE notification from the HW channel, remove the devices on this bus.
> Then, after a waiting period based on the device specs, rescan the parent
> bus to recover the devices.
> 
> Signed-off-by: Haiyang Zhang <haiya...@microsoft.com>
> ---
> v3:
> Updated for checkpatch warnings as suggested by Simon Horman.
> 
> v2:
> Added dev_dbg for service type as suggested by Shradha Gupta.
> Added driver cap bit.
> 
> ---
>  .../net/ethernet/microsoft/mana/gdma_main.c   | 64 +++++++++++++++++++
>  include/net/mana/gdma.h                       | 11 +++-
>  2 files changed, 73 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> index 4ffaf7588885..3102bd2b875b 100644
> --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
> +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
> @@ -352,11 +352,55 @@ void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit)
>  }
>  EXPORT_SYMBOL_NS(mana_gd_ring_cq, "NET_MANA");
>  
> +#define MANA_SERVICE_PERIOD 10
> +
> +struct mana_serv_work {
> +     struct work_struct serv_work;
> +     struct pci_dev *pdev;
> +};
> +
> +static void mana_serv_func(struct work_struct *w)
> +{
> +     struct mana_serv_work *mns_wk;
> +     struct pci_bus *bus, *parent;
> +     struct pci_dev *pdev;
> +
> +     mns_wk = container_of(w, struct mana_serv_work, serv_work);
> +     pdev = mns_wk->pdev;
> +
> +     if (!pdev)
> +             goto out;
> +
> +     bus = pdev->bus;
> +     if (!bus) {
> +             dev_err(&pdev->dev, "MANA service: no bus\n");
> +             goto out;
> +     }
> +
> +     parent = bus->parent;
> +     if (!parent) {
> +             dev_err(&pdev->dev, "MANA service: no parent bus\n");
> +             goto out;
> +     }
> +
> +     pci_stop_and_remove_bus_device_locked(bus->self);
> +
> +     msleep(MANA_SERVICE_PERIOD * 1000);
> +
> +     pci_lock_rescan_remove();
> +     pci_rescan_bus(parent);
> +     pci_unlock_rescan_remove();
> +
> +out:
> +     kfree(mns_wk);
> +}
> +
>  static void mana_gd_process_eqe(struct gdma_queue *eq)
>  {
>       u32 head = eq->head % (eq->queue_size / GDMA_EQE_SIZE);
>       struct gdma_context *gc = eq->gdma_dev->gdma_context;
>       struct gdma_eqe *eq_eqe_ptr = eq->queue_mem_ptr;
> +     struct mana_serv_work *mns_wk;
>       union gdma_eqe_info eqe_info;
>       enum gdma_eqe_type type;
>       struct gdma_event event;
> @@ -400,6 +444,26 @@ static void mana_gd_process_eqe(struct gdma_queue *eq)
>               eq->eq.callback(eq->eq.context, eq, &event);
>               break;
>  
> +     case GDMA_EQE_HWC_FPGA_RECONFIG:
> +     case GDMA_EQE_HWC_SOCMANA_CRASH:
> +             dev_dbg(gc->dev, "Recv MANA service type:%d\n", type);
> +
> +             if (gc->in_service) {
> +                     dev_info(gc->dev, "Already in service\n");
> +                     break;
> +             }
> +
> +             mns_wk = kzalloc(sizeof(*mns_wk), GFP_ATOMIC);
> +             if (!mns_wk)
> +                     break;
> +
> +             dev_info(gc->dev, "Start MANA service type:%d\n", type);
> +             gc->in_service = true;
> +             mns_wk->pdev = to_pci_dev(gc->dev);
> +             INIT_WORK(&mns_wk->serv_work, mana_serv_func);
> +             schedule_work(&mns_wk->serv_work);
> +             break;
> +
>       default:
>               break;
>       }
> diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h
> index 228603bf03f2..d0fbc9c64cc8 100644
> --- a/include/net/mana/gdma.h
> +++ b/include/net/mana/gdma.h
> @@ -58,8 +58,9 @@ enum gdma_eqe_type {
>       GDMA_EQE_HWC_INIT_EQ_ID_DB      = 129,
>       GDMA_EQE_HWC_INIT_DATA          = 130,
>       GDMA_EQE_HWC_INIT_DONE          = 131,
> -     GDMA_EQE_HWC_SOC_RECONFIG       = 132,
> +     GDMA_EQE_HWC_FPGA_RECONFIG      = 132,
>       GDMA_EQE_HWC_SOC_RECONFIG_DATA  = 133,
> +     GDMA_EQE_HWC_SOCMANA_CRASH      = 135,
>       GDMA_EQE_RNIC_QP_FATAL          = 176,
>  };
>  
> @@ -388,6 +389,8 @@ struct gdma_context {
>       u32                     test_event_eq_id;
>  
>       bool                    is_pf;
> +     bool                    in_service;
> +
>       phys_addr_t             bar0_pa;
>       void __iomem            *bar0_va;
>       void __iomem            *shm_base;
> @@ -558,12 +561,16 @@ enum {
>  /* Driver can handle holes (zeros) in the device list */
>  #define GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP BIT(11)
>  
> +/* Driver can self reset on EQE notification */
> +#define GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE BIT(14)
> +
>  #define GDMA_DRV_CAP_FLAGS1 \
>       (GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT | \
>        GDMA_DRV_CAP_FLAG_1_NAPI_WKDONE_FIX | \
>        GDMA_DRV_CAP_FLAG_1_HWC_TIMEOUT_RECONFIG | \
>        GDMA_DRV_CAP_FLAG_1_VARIABLE_INDIRECTION_TABLE_SUPPORT | \
> -      GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP)
> +      GDMA_DRV_CAP_FLAG_1_DEV_LIST_HOLES_SUP | \
> +      GDMA_DRV_CAP_FLAG_1_SELF_RESET_ON_EQE)
>  
>  #define GDMA_DRV_CAP_FLAGS2 0
>  
> -- 
> 2.34.1

Reviewed-by: Shradha Gupta <shradhagu...@linux.microsoft.com>

Reply via email to