On 5/9/2019 11:23 AM, Jiri Pirko wrote: > Tue, May 07, 2019 at 02:58:32PM CEST, a...@mellanox.com wrote: >> >> >> On 5/7/2019 3:41 PM, Jiri Pirko wrote: >>> Mon, Apr 29, 2019 at 04:17:39PM CEST, a...@mellanox.com wrote: >>>> TX reporter reports an error on two scenarios: >>>> - TX timeout on a specific tx queue >>>> - TX completion error on a specific send queue >>>> Prior to this patch, no dump data was supported by the tx reporter. This >>>> patch adds support for SW data dump of the related SQ context. The dump >>>> is simply the SQ's raw memory snapshot taken right after the error was >>>> reported, before any recovery procedure was launched. With this >>>> approach, no maintenance is needed as the driver fetch the actual data >>>> according to the layout on which the SQ was compiled with. By providing >>>> a SW context, one can easily debug error on a given SQ. >>>> >>>> In order to offline translate the raw memory into a human readable >>>> format, the user can use some out-of-kernel scripts which receives as an >>>> input the following: >>>> - Object raw memory >>>> - Driver object compiled with debug info (can be taken/generated at any >>>> time from the machine) >>>> - Object name >>>> >>>> An example of such script output can be seen below. >>>> Note: the script is not offered as part of this patch as it do not >>>> belong to the kernel, I just described it in order to grasp the general >>>> idea of how/what can be fetched from SW dump via devlink health. >>>> >>>> The output of the SW dump can be extracted by devlink health command: >>>> $ sudo devlink health dump show pci/0000:00:0b.0 reporter tx. >>>> mlx5e_txqsq: sqn: 6336 >>>> memory: >>>> 00 00 00 00 00 00 00 00 >>>> 01 00 00 00 00 00 00 00 >>>> 00 00 00 00 00 00 00 00 >>>> 45 f4 88 cb 09 00 00 00 >>>> 00 00 00 00 00 00 00 00 >>>> 00 00 00 00 00 00 00 00 >>>> c0 ff ff ff 1f 00 00 00 >>>> f8 18 1e 89 81 88 ff ff >>>> ... 
>>>> >>>> script output below, with struct members names and actual values: >>>> >>>> struct mlx5e_txqsq { >>>> short unsigned int cc 0x5 ; >>>> unsigned int dma_fifo_cc 0x5 ; >>>> struct net_dim { >>>> unsigned char state 0x1 ; >>>> struct net_dim_stats { >>>> int ppms 0x0 ; >>>> int bpms 0x0 ; >>>> int epms 0x0 ; >>>> } prev_stats; >>>> struct net_dim_sample { >>>> long long int time 0x90766ef9d ; >>>> unsigned int pkt_ctr 0x0 ; >>>> unsigned int byte_ctr 0x0 ; >>>> short unsigned int event_ctr 0x0 ; >>>> } start_sample; >>>> struct work_struct { >>>> struct { >>>> long int counter 0x1fffffffc0 ; >>>> } data; >>>> struct list_head { >>>> struct list_head * next >>>> 0xffff8881b08998f8 ; >>>> struct list_head * prev >>>> 0xffff8881b08998f8 ; >>>> } entry; >>>> void (*func)(struct work_struct *) >>>> 0xffffffffa02d0e30 ; >>>> } work; >>>> unsigned char profile_ix 0x60 ; >>>> unsigned char mode 0x72 ; >>>> unsigned char tune_state 0x35 ; >>>> unsigned char steps_right 0xa0 ; >>>> unsigned char steps_left 0xff ; >>>> unsigned char tired 0xff ; >>>> } dim; >>>> short unsigned int pc 0x0 ; >>>> unsigned int dma_fifo_pc 0x0 ; >>>> struct mlx5e_cq { >>>> struct mlx5_cqwq { >>>> struct mlx5_frag_buf_ctrl { >>>> struct mlx5_buf_list * frags 0x500000005 ; >>>> unsigned int sz_m1 0x0 ; >>>> short unsigned int frag_sz_m1 0x0 ; >>>> short unsigned int strides_offset 0x0 ; >>>> unsigned char log_sz 0x0 ; >>>> unsigned char log_stride 0x0 ; >>>> unsigned char log_frag_strides 0x0 ; >>>> } fbc; >>>> __be32 * db 0x0 ; >>>> unsigned int cc 0x0 ; >>>> } wq; >>>> short unsigned int event_ctr 0x0 ; >>>> struct napi_struct * napi 0x0 ; >>>> struct mlx5_core_cq { >>>> unsigned int cqn 0x0 ; >>>> int cqe_sz 0x0 ; >>>> __be32 * set_ci_db 0xffff8881b1aa4988 ; >>>> __be32 * arm_db 0x3f000003ff ; >>>> struct mlx5_uars_page * uar 0x6060a ; >>>> struct refcount_struct { >>>> struct { >>>> int counter 0xa1814500 ; >>>> } refs; >>>> } refcount; >>>> struct completion { >>>> unsigned int done 0x5 
; >>>> struct wait_queue_head { >>>> struct spinlock { >>>> union { >>>> struct raw_spinlock { >>>> struct >>>> qspinlock { >>>> union >>>> { >>>> >>>> struct { >>>> >>>> int counter >>>> 0x5 ; >>>> >>>> } val; >>>> >>>> struct { >>>> >>>> unsigned char locked >>>> 0x5 ; >>>> >>>> unsigned char pending >>>> 0x0 ; >>>> >>>> } ; >>>> >>>> struct { >>>> >>>> short unsigned int >>>> locked_pending 0x5 ; >>>> >>>> short unsigned int tail >>>> 0x0 ; >>>> >>>> } ; >>>> } ; >>>> } raw_lock; >>>> } rlock; >>>> } ; >>>> } lock; >>>> struct list_head { >>>> struct list_head * next >>>> 0xffff8881b089bb88 ; >>>> struct list_head * prev >>>> 0x4000000c0a ; >>>> } head; >>>> } wait; >>>> } free; >>>> unsigned int vector 0xa1814500 ; >>>> unsigned int irqn 0xffff8881 ; >>>> void (*comp)(struct mlx5_core_cq *) >>>> 0xffff8881a1814504 ; >>>> void (*event)(struct mlx5_core_cq *, enum >>>> mlx5_event) 0xffff8881a2cdea08 ; >>>> unsigned int cons_index 0x1 ; >>>> unsigned int arm_sn 0x0 ; >>>> struct mlx5_rsc_debug * dbg 0x0 ; >>>> int pid 0x0 ; >>>> struct { >>>> struct list_head { >>>> struct list_head * next >>>> 0xffffffff ; >>>> struct list_head * prev >>>> 0xffffffffffffffff ; >>>> } list; >>>> void (*comp)(struct mlx5_core_cq *) >>>> 0xffffffffa0356940 ; >>>> void * priv 0x0 ; >>>> } tasklet_ctx; >>>> int reset_notify_added 0x0 ; >>>> struct list_head { >>>> struct list_head * next >>>> 0xffffffffa0300700 ; >>>> struct list_head * prev 0xd ; >>>> } reset_notify; >>>> struct mlx5_eq_comp * eq 0x0 ; >>>> short unsigned int uid 0x9a70 ; >>>> } mcq; >>>> struct mlx5e_channel * channel 0xffff8881b0899a70 ; >>>> struct mlx5_core_dev * mdev 0x4800000001 ; >>>> struct mlx5_wq_ctrl { >>>> struct mlx5_core_dev * mdev 0xffffffffa02d5350 ; >>>> struct mlx5_frag_buf { >>>> struct mlx5_buf_list * frags >>>> 0xffffffffa02d5460 ; >>>> int npages 0x0 ; >>>> int size 0x5 ; >>>> unsigned char page_shift 0x8 ; >>>> } buf; >>>> struct mlx5_db { >>>> __be32 * db 0x1c6 ; >>>> union { >>>> struct 
mlx5_db_pgdir * pgdir 0x0 ; >>>> struct mlx5_ib_user_db_page * user_page >>>> 0x0 ; >>>> } u; >>>> long long unsigned int dma >>>> 0xffff8881b0899ab0 ; >>>> int index 0x0 ; >>>> } db; >>>> } wq_ctrl; >>>> } cq; >>>> struct mlx5_wq_cyc { >>>> struct mlx5_frag_buf_ctrl { >>>> struct mlx5_buf_list * frags 0xffff8881a7600160 ; >>>> unsigned int sz_m1 0xa7600160 ; >>>> short unsigned int frag_sz_m1 0x8881 ; >>>> short unsigned int strides_offset 0xffff ; >>>> unsigned char log_sz 0x88 ; >>>> unsigned char log_stride 0x49 ; >>>> unsigned char log_frag_strides 0xaa ; >>>> } fbc; >>>> __be32 * db 0x1000000000010 ; >>>> short unsigned int sz 0xc ; >>>> short unsigned int wqe_ctr 0x0 ; >>>> short unsigned int cur_sz 0x0 ; >>>> } wq; >>>> unsigned int dma_fifo_mask 0xa1814500 ; >>>> struct mlx5e_sq_stats * stats 0xffff8881a33a0348 ; >>>> struct { >>>> struct mlx5e_sq_dma * dma_fifo 0x1a1814500 ; >>>> struct mlx5e_tx_wqe_info * wqe_info 0x14 ; >>>> } db; >>>> void * uar_map 0x0 ; >>>> struct netdev_queue * txq 0x0 ; >>>> unsigned int sqn 0x18c0 ; >>>> unsigned char min_inline_mode 0x0 ; >>>> struct device * pdev 0x0 ; >>>> unsigned int mkey_be 0x0 ; >>>> long unsigned int state 0x0 ; >>>> struct hwtstamp_config * tstamp 0x0 ; >>>> struct mlx5_clock * clock 0xffff8881b1aa6f88 ; >>>> struct mlx5_wq_ctrl { >>>> struct mlx5_core_dev * mdev 0x3f000003ff ; >>>> struct mlx5_frag_buf { >>>> struct mlx5_buf_list * frags 0x6060a ; >>>> int npages 0xa1814604 ; >>>> int size 0xffff8881 ; >>>> unsigned char page_shift 0x0 ; >>>> } buf; >>>> struct mlx5_db { >>>> __be32 * db 0xfff ; >>>> union { >>>> struct mlx5_db_pgdir * pgdir 0x0 ; >>>> struct mlx5_ib_user_db_page * user_page >>>> 0x0 ; >>>> } u; >>>> long long unsigned int dma 0xffff888188440000 ; >>>> int index 0x8b074000 ; >>>> } db; >>>> } wq_ctrl; >>>> struct mlx5e_channel * channel 0xffffc9000010d800 ; >>>> int txq_ix 0xa0020180 ; >>>> unsigned int rate_limit 0xffff8881 ; >>>> struct work_struct { >>>> struct { >>>> long int 
counter 0x1000018c0 ; >>>> } data; >>>> struct list_head { >>>> struct list_head * next 0xffff8881c32b68e8 ; >>>> struct list_head * prev 0x800 ; >>>> } entry; >>>> void (*func)(struct work_struct *) 0x9 ; >>>> } recover_work; >>>> } ; >>> >>> I don't get it. You are dumping live kernel memory? There are already >>> facilities to do that in place. Why to replicate it? >> I am dumping the driver's memory under a lock so I can ensure it's >> consistency (as appose to /dev/mem) >> vmcore cannot be taken from a live kernel (without crashing). >> I need the memory's snapshot right after the error from the driver's >> context. > > Got it. However, this sounds like a generic problem not specific to > nic drivers. How other subsystems resolve this (if they do at all)? > > Correct, this is a suggested debugging solution for a generic problem: enabling the use of a run-time memory snapshot for kernel modules (at a given error event). My research shows that other subsystems deal with errors either by panicking (too much) or by debug/log prints (too little). This solution (a) is low in maintenance, (b) is consistent in memory, (c) has a small performance impact, and (d) uses an existing infrastructure between the kernel module and user space. It might be ported to other subsystems using their own user-space vs. kernel tools. Regardless of how the memory output was delivered to the user, the parsing script can work on it.
> >> Which other tools do you mean? >>> >>> >>>> >>>> Signed-off-by: Aya Levin <a...@mellanox.com> >>>> --- >>>> .../ethernet/mellanox/mlx5/core/en/reporter_tx.c | 100 >>>> +++++++++++++++++++++ >>>> 1 file changed, 100 insertions(+) >>>> >>>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c >>>> b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c >>>> index 476dd97f7f2f..8a39f5525e57 100644 >>>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c >>>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c >>>> @@ -9,6 +9,7 @@ >>>> >>>> struct mlx5e_tx_err_ctx { >>>> int (*recover)(struct mlx5e_txqsq *sq); >>>> + int (*dump)(struct mlx5e_txqsq *sq); >>>> struct mlx5e_txqsq *sq; >>>> }; >>>> >>>> @@ -281,10 +282,109 @@ static int mlx5e_tx_reporter_diagnose(struct >>>> devlink_health_reporter *reporter, >>>> return err; >>>> } >>>> >>>> +static int mlx5e_tx_reporter_sw_dump_from_ctx(struct mlx5e_priv *priv, >>>> + struct mlx5e_txqsq *sq, >>>> + struct devlink_fmsg *fmsg) >>>> +{ >>>> + u64 *ptr = (u64 *)sq; >>>> + int copy, err; >>>> + int i = 0; >>>> + >>>> + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) >>>> + return 0; >>>> + >>>> + err = devlink_fmsg_pair_nest_start(fmsg, "mlx5e_txqsq"); >>>> + if (err) >>>> + return err; >>>> + >>>> + err = devlink_fmsg_obj_nest_start(fmsg); >>>> + if (err) >>>> + return err; >>>> + >>>> + err = devlink_fmsg_arr_pair_nest_start(fmsg, "memory"); >>>> + if (err) >>>> + return err; >>>> + >>>> + while (i < sizeof(struct mlx5e_txqsq)) { >>>> + copy = sizeof(u64); >>>> + >>>> + if (i + copy > sizeof(struct mlx5e_txqsq)) >>>> + copy = sizeof(struct mlx5e_txqsq) - i; >>>> + >>>> + err = devlink_fmsg_binary_put(fmsg, ptr, copy); >>>> + if (err) >>>> + return err; >>>> + ptr++; >>>> + i += copy; >>>> + } >>>> + >>>> + err = devlink_fmsg_arr_pair_nest_end(fmsg); >>>> + if (err) >>>> + return err; >>>> + >>>> + err = devlink_fmsg_obj_nest_end(fmsg); >>>> + if (err) >>>> + return err; >>>> + 
>>>> + err = devlink_fmsg_pair_nest_end(fmsg); >>>> + >>>> + return err; >>>> +} >>>> + >>>> +static int mlx5e_tx_reporter_sw_dump_all(struct mlx5e_priv *priv, >>>> + struct devlink_fmsg *fmsg) >>>> +{ >>>> + int i, err = 0; >>>> + >>>> + mutex_lock(&priv->state_lock); >>>> + >>>> + if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) >>>> + goto unlock; >>>> + >>>> + err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs"); >>>> + if (err) >>>> + goto unlock; >>>> + >>>> + for (i = 0; i < priv->channels.num * priv->channels.params.num_tc; >>>> + i++) { >>>> + err = devlink_fmsg_obj_nest_start(fmsg); >>>> + if (err) >>>> + goto unlock; >>>> + >>>> + err = mlx5e_tx_reporter_sw_dump_from_ctx(priv, priv->txq2sq[i], >>>> + fmsg); >>>> + if (err) >>>> + goto unlock; >>>> + >>>> + err = devlink_fmsg_pair_nest_end(fmsg); >>>> + if (err) >>>> + goto unlock; >>>> + } >>>> + err = devlink_fmsg_arr_pair_nest_end(fmsg); >>>> + if (err) >>>> + goto unlock; >>>> + >>>> +unlock: >>>> + mutex_unlock(&priv->state_lock); >>>> + return err; >>>> +} >>>> + >>>> +static int mlx5e_tx_reporter_sw_dump(struct devlink_health_reporter >>>> *reporter, >>>> + struct devlink_fmsg *fmsg, void *context) >>>> +{ >>>> + struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); >>>> + struct mlx5e_tx_err_ctx *err_ctx = context; >>>> + >>>> + return err_ctx ? mlx5e_tx_reporter_sw_dump_from_ctx(priv, err_ctx->sq, >>>> + fmsg) : >>>> + mlx5e_tx_reporter_sw_dump_all(priv, fmsg); >>>> +} >>>> + >>>> static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { >>>> .name = "tx", >>>> .recover = mlx5e_tx_reporter_recover, >>>> .diagnose = mlx5e_tx_reporter_diagnose, >>>> + .dump = mlx5e_tx_reporter_sw_dump, >>>> }; >>>> >>>> #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 >>>> -- >>>> 2.14.1 >>>>