Sun, May 05, 2019 at 02:33:00AM CEST, sae...@mellanox.com wrote:
>From: Alex Vesker <va...@mellanox.com>
>
>Crdump allows the driver to create a snapshot of the FW PCI crspace.
>This is useful in case of catastrophic issues which may require FW
>reset. The snapshot can be used for later debug.
>
>The snapshot is exposed using devlink region_snapshot in downstream patch,
>cr-space address regions are registered on init and snapshots are attached
>once a new snapshot is collected by the driver.
>
>Signed-off-by: Alex Vesker <va...@mellanox.com>
>Signed-off-by: Moshe Shemesh <mo...@mellanox.com>
>Reviewed-by: Feras Daoud <fera...@mellanox.com>
>Signed-off-by: Saeed Mahameed <sae...@mellanox.com>
>---
> .../net/ethernet/mellanox/mlx5/core/Makefile  |   2 +-
> .../ethernet/mellanox/mlx5/core/diag/crdump.c | 179 ++++++++++++++++++
> .../net/ethernet/mellanox/mlx5/core/health.c  |   1 +
> .../ethernet/mellanox/mlx5/core/lib/mlx5.h    |   4 +
> .../net/ethernet/mellanox/mlx5/core/main.c    |   5 +
> include/linux/mlx5/driver.h                   |   4 +
> 6 files changed, 194 insertions(+), 1 deletion(-)
> create mode 100644 drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c
>
>diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile 
>b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
>index 34d9a079b608..5feed9e1bec7 100644
>--- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile
>+++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile
>@@ -16,7 +16,7 @@ mlx5_core-y :=       main.o cmd.o debugfs.o fw.o eq.o uar.o 
>pagealloc.o \
>               transobj.o vport.o sriov.o fs_cmd.o fs_core.o \
>               fs_counters.o rl.o lag.o dev.o events.o wq.o lib/gid.o \
>               lib/devcom.o lib/pci_vsc.o diag/fs_tracepoint.o \
>-              diag/fw_tracer.o devlink.o
>+              diag/fw_tracer.o diag/crdump.o devlink.o
> 
> #
> # Netdev basic
>diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c 
>b/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c
>new file mode 100644
>index 000000000000..6430ceeefb53
>--- /dev/null
>+++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/crdump.c
>@@ -0,0 +1,179 @@
>+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
>+/* Copyright (c) 2019 Mellanox Technologies */
>+
>+#include <linux/proc_fs.h>
>+#include <linux/mlx5/driver.h>
>+#include <net/devlink.h>
>+#include "mlx5_core.h"
>+#include "lib/pci_vsc.h"
>+#include "lib/mlx5.h"
>+
>+#define BAD_ACCESS                    0xBADACCE5
>+#define MLX5_PROTECTED_CR_SCAN_CRSPACE        0x7
>+#define MAX_NUM_OF_DUMPS_TO_STORE     (8)

Please make your local defines prefixed with "mlx5_". For example
"BAD_ACCESS" sounds way too generic.


>+
>+static const char *region_cr_space_str = "cr-space";
>+
>+struct mlx5_fw_crdump {
>+      u32                     size;
>+      struct devlink_region   *region_crspace;
>+};
>+
>+static bool mlx5_crdump_enbaled(struct mlx5_core_dev *dev)

Typo: s/enbaled/enabled/


>+{
>+      struct mlx5_priv *priv = &dev->priv;
>+
>+      return (!!priv->health.crdump);

Poitlesss brackets. Please remove.


>+}
>+
>+static int mlx5_crdump_fill(struct mlx5_core_dev *dev,
>+                          char *crdump_region, u32 *snapshot_id)
>+{
>+      struct devlink *devlink = priv_to_devlink(dev);
>+      struct mlx5_priv *priv = &dev->priv;
>+      struct mlx5_fw_crdump *crdump = priv->health.crdump;
>+      int i, ret = 0;
>+      u32 *cr_data;
>+      u32 id;
>+
>+      cr_data = kvmalloc(crdump->size, GFP_KERNEL);
>+      if (!cr_data)
>+              return -ENOMEM;
>+
>+      for (i = 0; i < (crdump->size / 4); i++)
>+              cr_data[i] = BAD_ACCESS;
>+
>+      ret = mlx5_vsc_gw_read_block_fast(dev, cr_data, crdump->size);
>+      if (ret <= 0) {
>+              if (ret == 0)
>+                      ret = -EIO;
>+              goto free_data;
>+      }
>+
>+      if (crdump->size != ret) {
>+              mlx5_core_warn(dev, "failed to read full dump, read %d out of 
>%u\n",
>+                             ret, crdump->size);
>+              ret = -EINVAL;
>+              goto free_data;
>+      }
>+
>+      /* Get the available snapshot ID for the dumps */
>+      id = devlink_region_shapshot_id_get(devlink);
>+      ret = devlink_region_snapshot_create(crdump->region_crspace,
>+                                           crdump->size, (u8 *)cr_data,
>+                                           id, &kvfree);
>+      if (ret) {
>+              mlx5_core_warn(dev, "crdump: devlink create %s snapshot id %d 
>err %d\n",
>+                             region_cr_space_str, id, ret);
>+              goto free_data;
>+      } else {
>+              *snapshot_id = id;
>+              strcpy(crdump_region, region_cr_space_str);
>+      }
>+      return 0;
>+
>+free_data:
>+      kvfree(cr_data);
>+      return ret;
>+}
>+
>+int mlx5_crdump_collect(struct mlx5_core_dev *dev,
>+                      char *crdump_region, u32 *snapshot_id)
>+{
>+      int ret = 0;
>+
>+      if (!mlx5_crdump_enbaled(dev))
>+              return -ENODEV;
>+
>+      ret = mlx5_vsc_gw_lock(dev);
>+      if (ret) {
>+              mlx5_core_warn(dev, "crdump: failed to lock vsc gw err %d\n",
>+                             ret);
>+              return ret;
>+      }
>+
>+      ret = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE, NULL);
>+      if (ret)
>+              goto unlock;
>+
>+      ret = mlx5_crdump_fill(dev, crdump_region, snapshot_id);
>+
>+unlock:
>+      mlx5_vsc_gw_unlock(dev);
>+      return ret;
>+}
>+
>+int mlx5_crdump_init(struct mlx5_core_dev *dev)
>+{
>+      struct devlink *devlink = priv_to_devlink(dev);
>+      struct mlx5_priv *priv = &dev->priv;
>+      struct mlx5_fw_crdump *crdump;
>+      u32 space_size;
>+      int ret;
>+
>+      if (!mlx5_core_is_pf(dev) || !mlx5_vsc_accessible(dev) ||
>+          mlx5_crdump_enbaled(dev))
>+              return 0;
>+
>+      ret = mlx5_vsc_gw_lock(dev);
>+      if (ret)
>+              return ret;
>+
>+      /* Check if space is supported and get space size */
>+      ret = mlx5_vsc_gw_set_space(dev, MLX5_VSC_SPACE_SCAN_CRSPACE,
>+                                  &space_size);
>+      if (ret) {
>+              /* Unlock and mask error since space is not supported */
>+              mlx5_vsc_gw_unlock(dev);
>+              return 0;
>+      }
>+
>+      if (!space_size) {
>+              mlx5_core_warn(dev, "Invalid Crspace size, zero\n");
>+              mlx5_vsc_gw_unlock(dev);
>+              return -EINVAL;
>+      }
>+
>+      ret = mlx5_vsc_gw_unlock(dev);
>+      if (ret)
>+              return ret;
>+
>+      crdump = kzalloc(sizeof(*crdump), GFP_KERNEL);
>+      if (!crdump)
>+              return -ENOMEM;
>+
>+      /* Create cr-space region */
>+      crdump->size = space_size;
>+      crdump->region_crspace =
>+              devlink_region_create(devlink,
>+                                    region_cr_space_str,
>+                                    MAX_NUM_OF_DUMPS_TO_STORE,
>+                                    space_size);

Unnecessary wraps.


>+      if (IS_ERR(crdump->region_crspace)) {
>+              mlx5_core_warn(dev,

Unnecessary wrap.


>+                             "crdump: create devlink region %s err %ld\n",
>+                             region_cr_space_str,
>+                             PTR_ERR(crdump->region_crspace));
>+              ret = PTR_ERR(crdump->region_crspace);
>+              goto free_crdump;
>+      }
>+      priv->health.crdump = crdump;
>+      return 0;
>+
>+free_crdump:
>+      kfree(crdump);
>+      return ret;
>+}
>+
>+void mlx5_crdump_cleanup(struct mlx5_core_dev *dev)
>+{
>+      struct mlx5_priv *priv = &dev->priv;
>+      struct mlx5_fw_crdump *crdump = priv->health.crdump;
>+
>+      if (!crdump)
>+              return;
>+
>+      devlink_region_destroy(crdump->region_crspace);
>+      kfree(crdump);
>+      priv->health.crdump = NULL;

Why do you need to have this NULL. Where do you check for NULL?


>+}
>diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c 
>b/drivers/net/ethernet/mellanox/mlx5/core/health.c
>index a2656f4008d9..90f3da6da7f9 100644
>--- a/drivers/net/ethernet/mellanox/mlx5/core/health.c
>+++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c
>@@ -388,6 +388,7 @@ int mlx5_health_init(struct mlx5_core_dev *dev)
>       spin_lock_init(&health->wq_lock);
>       INIT_WORK(&health->work, health_care);
>       INIT_DELAYED_WORK(&health->recover_work, health_recover);
>+      health->crdump = NULL;

Same here.


> 
>       return 0;
> }
>diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h 
>b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
>index 397a2847867a..3c9a6dedccaa 100644
>--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
>+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/mlx5.h
>@@ -41,6 +41,10 @@ int  mlx5_core_reserve_gids(struct mlx5_core_dev *dev, 
>unsigned int count);
> void mlx5_core_unreserve_gids(struct mlx5_core_dev *dev, unsigned int count);
> int  mlx5_core_reserved_gid_alloc(struct mlx5_core_dev *dev, int *gid_index);
> void mlx5_core_reserved_gid_free(struct mlx5_core_dev *dev, int gid_index);
>+int mlx5_crdump_init(struct mlx5_core_dev *dev);
>+void mlx5_crdump_cleanup(struct mlx5_core_dev *dev);
>+int mlx5_crdump_collect(struct mlx5_core_dev *dev,
>+                      char *crdump_region, u32 *snapshot_id);
> 
> /* TODO move to lib/events.h */
> 
>diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c 
>b/drivers/net/ethernet/mellanox/mlx5/core/main.c
>index 64eb2a558b30..43f5487de4c3 100644
>--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
>+++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
>@@ -1320,6 +1320,10 @@ static int init_one(struct pci_dev *pdev, const struct 
>pci_device_id *id)
>       if (err)
>               goto clean_load;
> 
>+      err = mlx5_crdump_init(dev);
>+      if (err)
>+              dev_err(&pdev->dev, "mlx5_crdump_init failed with error code 
>%d\n", err);
>+
>       pci_save_state(pdev);
>       return 0;
> 
>@@ -1341,6 +1345,7 @@ static void remove_one(struct pci_dev *pdev)
>       struct mlx5_core_dev *dev  = pci_get_drvdata(pdev);
>       struct devlink *devlink = priv_to_devlink(dev);
> 
>+      mlx5_crdump_cleanup(dev);
>       mlx5_devlink_unregister(devlink);
>       mlx5_unregister_device(dev);
> 
>diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
>index 56d0a116f575..ddf6f41a75d3 100644
>--- a/include/linux/mlx5/driver.h
>+++ b/include/linux/mlx5/driver.h
>@@ -53,6 +53,7 @@
> #include <linux/mlx5/eq.h>
> #include <linux/timecounter.h>
> #include <linux/ptp_clock_kernel.h>
>+#include <net/devlink.h>
> 
> enum {
>       MLX5_BOARD_ID_LEN = 64,
>@@ -427,6 +428,8 @@ struct mlx5_sq_bfreg {
>       unsigned int            offset;
> };
> 
>+struct mlx5_fw_crdump;
>+
> struct mlx5_core_health {
>       struct health_buffer __iomem   *health;
>       __be32 __iomem                 *health_counter;
>@@ -440,6 +443,7 @@ struct mlx5_core_health {
>       unsigned long                   flags;
>       struct work_struct              work;
>       struct delayed_work             recover_work;
>+      struct mlx5_fw_crdump          *crdump;
> };
> 
> struct mlx5_qp_table {
>-- 
>2.20.1
>

Reply via email to