On 5/9/2019 11:23 AM, Jiri Pirko wrote:
> Tue, May 07, 2019 at 02:58:32PM CEST, a...@mellanox.com wrote:
>>
>>
>> On 5/7/2019 3:41 PM, Jiri Pirko wrote:
>>> Mon, Apr 29, 2019 at 04:17:39PM CEST, a...@mellanox.com wrote:
>>>> TX reporter reports an error on two scenarios:
>>>> - TX timeout on a specific tx queue
>>>> - TX completion error on a specific send queue
>>>> Prior to this patch, no dump data was supported by the tx reporter. This
>>>> patch adds support for SW data dump of the related SQ context. The dump
>>>> is simply the SQ's raw memory snapshot taken right after the error was
>>>> reported, before any recovery procedure was launched. With this
>>>> approach, no maintenance is needed, as the driver fetches the actual data
>>>> according to the layout with which the SQ was compiled. By providing
>>>> a SW context, one can easily debug an error on a given SQ.
>>>>
>>>> In order to translate the raw memory offline into a human-readable
>>>> format, the user can use an out-of-kernel script which receives the
>>>> following as input:
>>>> - Object raw memory
>>>> - Driver object compiled with debug info (can be taken/generated at any 
>>>> time from the machine)
>>>> - Object name
>>>>
>>>> An example of such script output can be seen below.
>>>> Note: the script is not offered as part of this patch as it does not
>>>> belong to the kernel; I just described it in order to convey the general
>>>> idea of how/what can be fetched from the SW dump via devlink health.
>>>>
>>>> The output of the SW dump can be extracted by devlink health command:
>>>> $ sudo devlink health dump show pci/0000:00:0b.0 reporter tx.
>>>> mlx5e_txqsq: sqn: 6336
>>>> memory:
>>>>     00 00 00 00 00 00 00 00
>>>>     01 00 00 00 00 00 00 00
>>>>     00 00 00 00 00 00 00 00
>>>>     45 f4 88 cb 09 00 00 00
>>>>     00 00 00 00 00 00 00 00
>>>>     00 00 00 00 00 00 00 00
>>>>     c0 ff ff ff 1f 00 00 00
>>>>     f8 18 1e 89 81 88 ff ff
>>>>     ...
>>>>
>>>> script output below, with struct members names and actual values:
>>>>
>>>> struct  mlx5e_txqsq {
>>>>    short unsigned int         cc    0x5 ;
>>>>    unsigned int               dma_fifo_cc   0x5 ;
>>>>    struct  net_dim {
>>>>            unsigned char      state         0x1 ;
>>>>            struct  net_dim_stats {
>>>>                    int        ppms          0x0 ;
>>>>                    int        bpms          0x0 ;
>>>>                    int        epms          0x0 ;
>>>>            } prev_stats;
>>>>            struct  net_dim_sample {
>>>>                    long long int time       0x90766ef9d ;
>>>>                    unsigned int pkt_ctr     0x0 ;
>>>>                    unsigned int byte_ctr    0x0 ;
>>>>                    short unsigned int event_ctr     0x0 ;
>>>>            } start_sample;
>>>>            struct  work_struct {
>>>>                    struct   {
>>>>                            long int counter         0x1fffffffc0 ;
>>>>                    } data;
>>>>                    struct  list_head {
>>>>                            struct list_head * next          
>>>> 0xffff8881b08998f8 ;
>>>>                            struct list_head * prev          
>>>> 0xffff8881b08998f8 ;
>>>>                    } entry;
>>>>                    void       (*func)(struct work_struct *)         
>>>> 0xffffffffa02d0e30 ;
>>>>            } work;
>>>>            unsigned char      profile_ix    0x60 ;
>>>>            unsigned char      mode          0x72 ;
>>>>            unsigned char      tune_state    0x35 ;
>>>>            unsigned char      steps_right   0xa0 ;
>>>>            unsigned char      steps_left    0xff ;
>>>>            unsigned char      tired         0xff ;
>>>>    } dim;
>>>>    short unsigned int         pc    0x0 ;
>>>>    unsigned int               dma_fifo_pc   0x0 ;
>>>>    struct  mlx5e_cq {
>>>>            struct  mlx5_cqwq {
>>>>                    struct  mlx5_frag_buf_ctrl {
>>>>                            struct mlx5_buf_list * frags     0x500000005 ;
>>>>                            unsigned int sz_m1       0x0 ;
>>>>                            short unsigned int frag_sz_m1    0x0 ;
>>>>                            short unsigned int strides_offset        0x0 ;
>>>>                            unsigned char log_sz     0x0 ;
>>>>                            unsigned char log_stride         0x0 ;
>>>>                            unsigned char log_frag_strides   0x0 ;
>>>>                    } fbc;
>>>>                    __be32 *   db    0x0 ;
>>>>                    unsigned int cc          0x0 ;
>>>>            } wq;
>>>>            short unsigned int event_ctr     0x0 ;
>>>>            struct napi_struct * napi        0x0 ;
>>>>            struct  mlx5_core_cq {
>>>>                    unsigned int cqn         0x0 ;
>>>>                    int        cqe_sz        0x0 ;
>>>>                    __be32 *   set_ci_db     0xffff8881b1aa4988 ;
>>>>                    __be32 *   arm_db        0x3f000003ff ;
>>>>                    struct mlx5_uars_page * uar      0x6060a ;
>>>>                    struct  refcount_struct {
>>>>                            struct   {
>>>>                                    int    counter   0xa1814500 ;
>>>>                            } refs;
>>>>                    } refcount;
>>>>                    struct  completion {
>>>>                            unsigned int done        0x5 ;
>>>>                            struct  wait_queue_head {
>>>>                                    struct  spinlock {
>>>>                                            union   {
>>>>                                                    struct  raw_spinlock {
>>>>                                                            struct  
>>>> qspinlock {
>>>>                                                                    union   
>>>> {
>>>>                                                                            
>>>> struct   {
>>>>                                                                            
>>>>         int                                                    counter   
>>>> 0x5 ;
>>>>                                                                            
>>>> } val;
>>>>                                                                            
>>>> struct   {
>>>>                                                                            
>>>>         unsigned char                                          locked    
>>>> 0x5 ;
>>>>                                                                            
>>>>         unsigned char                                          pending   
>>>> 0x0 ;
>>>>                                                                            
>>>> } ;
>>>>                                                                            
>>>> struct   {
>>>>                                                                            
>>>>         short unsigned int                                     
>>>> locked_pending    0x5 ;
>>>>                                                                            
>>>>         short unsigned int                                     tail      
>>>> 0x0 ;
>>>>                                                                            
>>>> } ;
>>>>                                                                    } ;
>>>>                                                            } raw_lock;
>>>>                                                    } rlock;
>>>>                                            } ;
>>>>                                    } lock;
>>>>                                    struct  list_head {
>>>>                                            struct list_head * next         
>>>>  0xffff8881b089bb88 ;
>>>>                                            struct list_head * prev         
>>>>  0x4000000c0a ;
>>>>                                    } head;
>>>>                            } wait;
>>>>                    } free;
>>>>                    unsigned int vector      0xa1814500 ;
>>>>                    unsigned int irqn        0xffff8881 ;
>>>>                    void       (*comp)(struct mlx5_core_cq *)        
>>>> 0xffff8881a1814504 ;
>>>>                    void       (*event)(struct mlx5_core_cq *, enum 
>>>> mlx5_event)      0xffff8881a2cdea08 ;
>>>>                    unsigned int cons_index          0x1 ;
>>>>                    unsigned int arm_sn      0x0 ;
>>>>                    struct mlx5_rsc_debug * dbg      0x0 ;
>>>>                    int        pid   0x0 ;
>>>>                    struct   {
>>>>                            struct  list_head {
>>>>                                    struct list_head * next          
>>>> 0xffffffff ;
>>>>                                    struct list_head * prev          
>>>> 0xffffffffffffffff ;
>>>>                            } list;
>>>>                            void (*comp)(struct mlx5_core_cq *)      
>>>> 0xffffffffa0356940 ;
>>>>                            void * priv      0x0 ;
>>>>                    } tasklet_ctx;
>>>>                    int        reset_notify_added    0x0 ;
>>>>                    struct  list_head {
>>>>                            struct list_head * next          
>>>> 0xffffffffa0300700 ;
>>>>                            struct list_head * prev          0xd ;
>>>>                    } reset_notify;
>>>>                    struct mlx5_eq_comp * eq         0x0 ;
>>>>                    short unsigned int uid   0x9a70 ;
>>>>            } mcq;
>>>>            struct mlx5e_channel * channel   0xffff8881b0899a70 ;
>>>>            struct mlx5_core_dev * mdev      0x4800000001 ;
>>>>            struct  mlx5_wq_ctrl {
>>>>                    struct mlx5_core_dev * mdev      0xffffffffa02d5350 ;
>>>>                    struct  mlx5_frag_buf {
>>>>                            struct mlx5_buf_list * frags     
>>>> 0xffffffffa02d5460 ;
>>>>                            int npages       0x0 ;
>>>>                            int size         0x5 ;
>>>>                            unsigned char page_shift         0x8 ;
>>>>                    } buf;
>>>>                    struct  mlx5_db {
>>>>                            __be32 * db      0x1c6 ;
>>>>                            union   {
>>>>                                    struct mlx5_db_pgdir * pgdir     0x0 ;
>>>>                                    struct mlx5_ib_user_db_page * user_page 
>>>>          0x0 ;
>>>>                            } u;
>>>>                            long long unsigned int dma       
>>>> 0xffff8881b0899ab0 ;
>>>>                            int index        0x0 ;
>>>>                    } db;
>>>>            } wq_ctrl;
>>>>    } cq;
>>>>    struct  mlx5_wq_cyc {
>>>>            struct  mlx5_frag_buf_ctrl {
>>>>                    struct mlx5_buf_list * frags     0xffff8881a7600160 ;
>>>>                    unsigned int sz_m1       0xa7600160 ;
>>>>                    short unsigned int frag_sz_m1    0x8881 ;
>>>>                    short unsigned int strides_offset        0xffff ;
>>>>                    unsigned char log_sz     0x88 ;
>>>>                    unsigned char log_stride         0x49 ;
>>>>                    unsigned char log_frag_strides   0xaa ;
>>>>            } fbc;
>>>>            __be32 *           db    0x1000000000010 ;
>>>>            short unsigned int sz    0xc ;
>>>>            short unsigned int wqe_ctr       0x0 ;
>>>>            short unsigned int cur_sz        0x0 ;
>>>>    } wq;
>>>>    unsigned int               dma_fifo_mask         0xa1814500 ;
>>>>    struct mlx5e_sq_stats *    stats         0xffff8881a33a0348 ;
>>>>    struct   {
>>>>            struct mlx5e_sq_dma * dma_fifo   0x1a1814500 ;
>>>>            struct mlx5e_tx_wqe_info * wqe_info      0x14 ;
>>>>    } db;
>>>>    void *                     uar_map       0x0 ;
>>>>    struct netdev_queue *      txq   0x0 ;
>>>>    unsigned int               sqn   0x18c0 ;
>>>>    unsigned char              min_inline_mode       0x0 ;
>>>>    struct device *            pdev          0x0 ;
>>>>    unsigned int               mkey_be       0x0 ;
>>>>    long unsigned int          state         0x0 ;
>>>>    struct hwtstamp_config *   tstamp        0x0 ;
>>>>    struct mlx5_clock *        clock         0xffff8881b1aa6f88 ;
>>>>    struct  mlx5_wq_ctrl {
>>>>            struct mlx5_core_dev * mdev      0x3f000003ff ;
>>>>            struct  mlx5_frag_buf {
>>>>                    struct mlx5_buf_list * frags     0x6060a ;
>>>>                    int        npages        0xa1814604 ;
>>>>                    int        size          0xffff8881 ;
>>>>                    unsigned char page_shift         0x0 ;
>>>>            } buf;
>>>>            struct  mlx5_db {
>>>>                    __be32 *   db    0xfff ;
>>>>                    union   {
>>>>                            struct mlx5_db_pgdir * pgdir     0x0 ;
>>>>                            struct mlx5_ib_user_db_page * user_page         
>>>>  0x0 ;
>>>>                    } u;
>>>>                    long long unsigned int dma       0xffff888188440000 ;
>>>>                    int        index         0x8b074000 ;
>>>>            } db;
>>>>    } wq_ctrl;
>>>>    struct mlx5e_channel *     channel       0xffffc9000010d800 ;
>>>>    int                        txq_ix        0xa0020180 ;
>>>>    unsigned int               rate_limit    0xffff8881 ;
>>>>    struct  work_struct {
>>>>            struct   {
>>>>                    long int   counter       0x1000018c0 ;
>>>>            } data;
>>>>            struct  list_head {
>>>>                    struct list_head * next          0xffff8881c32b68e8 ;
>>>>                    struct list_head * prev          0x800 ;
>>>>            } entry;
>>>>            void               (*func)(struct work_struct *)         0x9 ;
>>>>    } recover_work;
>>>> } ;
>>>
>>> I don't get it. You are dumping live kernel memory? There are already
>>> facilities in place to do that. Why replicate them?
>> I am dumping the driver's memory under a lock so I can ensure its
>> consistency (as opposed to /dev/mem).
>> A vmcore cannot be taken from a live kernel (without crashing).
>> I need the memory's snapshot right after the error, from the driver's
>> context.
> 
> Got it. However, this sounds like a generic problem, not specific to
> NIC drivers. How do other subsystems resolve this (if they do at all)?
> 
> 
Correct, this is a suggested debugging solution for a generic problem:
giving the user a run-time memory snapshot of a kernel module (at a
given error event). My research shows that other subsystems deal with
errors either by panicking (too much) or by debug/log prints (too little).
This solution (a) is low in maintenance, (b) is consistent in memory,
(c) has a small performance impact, and (d) uses an existing
infrastructure between the kernel module and user space.
It might be ported to other subsystems using their own kernel-to-user-space
interfaces. Regardless of how the memory output was delivered to the
user, the parsing script can work on it.

> 
>> Which other tools do you mean?
>>>
>>>
>>>>
>>>> Signed-off-by: Aya Levin <a...@mellanox.com>
>>>> ---
>>>> .../ethernet/mellanox/mlx5/core/en/reporter_tx.c   | 100 
>>>> +++++++++++++++++++++
>>>> 1 file changed, 100 insertions(+)
>>>>
>>>> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c 
>>>> b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>> index 476dd97f7f2f..8a39f5525e57 100644
>>>> --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>> +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c
>>>> @@ -9,6 +9,7 @@
>>>>
>>>> struct mlx5e_tx_err_ctx {
>>>>    int (*recover)(struct mlx5e_txqsq *sq);
>>>> +  int (*dump)(struct mlx5e_txqsq *sq);
>>>>    struct mlx5e_txqsq *sq;
>>>> };
>>>>
>>>> @@ -281,10 +282,109 @@ static int mlx5e_tx_reporter_diagnose(struct 
>>>> devlink_health_reporter *reporter,
>>>>    return err;
>>>> }
>>>>
>>>> +static int mlx5e_tx_reporter_sw_dump_from_ctx(struct mlx5e_priv *priv,
>>>> +                                        struct mlx5e_txqsq *sq,
>>>> +                                        struct devlink_fmsg *fmsg)
>>>> +{
>>>> +  u64 *ptr = (u64 *)sq;
>>>> +  int copy, err;
>>>> +  int i = 0;
>>>> +
>>>> +  if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>>>> +          return 0;
>>>> +
>>>> +  err = devlink_fmsg_pair_nest_start(fmsg, "mlx5e_txqsq");
>>>> +  if (err)
>>>> +          return err;
>>>> +
>>>> +  err = devlink_fmsg_obj_nest_start(fmsg);
>>>> +  if (err)
>>>> +          return err;
>>>> +
>>>> +  err = devlink_fmsg_arr_pair_nest_start(fmsg, "memory");
>>>> +  if (err)
>>>> +          return err;
>>>> +
>>>> +  while (i < sizeof(struct mlx5e_txqsq)) {
>>>> +          copy = sizeof(u64);
>>>> +
>>>> +          if (i + copy > sizeof(struct mlx5e_txqsq))
>>>> +                  copy = sizeof(struct mlx5e_txqsq) - i;
>>>> +
>>>> +          err = devlink_fmsg_binary_put(fmsg, ptr, copy);
>>>> +          if (err)
>>>> +                  return err;
>>>> +          ptr++;
>>>> +          i += copy;
>>>> +  }
>>>> +
>>>> +  err = devlink_fmsg_arr_pair_nest_end(fmsg);
>>>> +  if (err)
>>>> +          return err;
>>>> +
>>>> +  err = devlink_fmsg_obj_nest_end(fmsg);
>>>> +  if (err)
>>>> +          return err;
>>>> +
>>>> +  err = devlink_fmsg_pair_nest_end(fmsg);
>>>> +
>>>> +  return err;
>>>> +}
>>>> +
>>>> +static int mlx5e_tx_reporter_sw_dump_all(struct mlx5e_priv *priv,
>>>> +                                   struct devlink_fmsg *fmsg)
>>>> +{
>>>> +  int i, err = 0;
>>>> +
>>>> +  mutex_lock(&priv->state_lock);
>>>> +
>>>> +  if (!test_bit(MLX5E_STATE_OPENED, &priv->state))
>>>> +          goto unlock;
>>>> +
>>>> +  err = devlink_fmsg_arr_pair_nest_start(fmsg, "SQs");
>>>> +  if (err)
>>>> +          goto unlock;
>>>> +
>>>> +  for (i = 0; i < priv->channels.num * priv->channels.params.num_tc;
>>>> +       i++) {
>>>> +          err = devlink_fmsg_obj_nest_start(fmsg);
>>>> +          if (err)
>>>> +                  goto unlock;
>>>> +
>>>> +          err = mlx5e_tx_reporter_sw_dump_from_ctx(priv, priv->txq2sq[i],
>>>> +                                                   fmsg);
>>>> +          if (err)
>>>> +                  goto unlock;
>>>> +
>>>> +          err = devlink_fmsg_pair_nest_end(fmsg);
>>>> +          if (err)
>>>> +                  goto unlock;
>>>> +  }
>>>> +  err = devlink_fmsg_arr_pair_nest_end(fmsg);
>>>> +  if (err)
>>>> +          goto unlock;
>>>> +
>>>> +unlock:
>>>> +  mutex_unlock(&priv->state_lock);
>>>> +  return err;
>>>> +}
>>>> +
>>>> +static int mlx5e_tx_reporter_sw_dump(struct devlink_health_reporter 
>>>> *reporter,
>>>> +                               struct devlink_fmsg *fmsg, void *context)
>>>> +{
>>>> +  struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter);
>>>> +  struct mlx5e_tx_err_ctx *err_ctx = context;
>>>> +
>>>> +  return err_ctx ? mlx5e_tx_reporter_sw_dump_from_ctx(priv, err_ctx->sq,
>>>> +                                                      fmsg) :
>>>> +                   mlx5e_tx_reporter_sw_dump_all(priv, fmsg);
>>>> +}
>>>> +
>>>> static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = {
>>>>            .name = "tx",
>>>>            .recover = mlx5e_tx_reporter_recover,
>>>>            .diagnose = mlx5e_tx_reporter_diagnose,
>>>> +          .dump = mlx5e_tx_reporter_sw_dump,
>>>> };
>>>>
>>>> #define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500
>>>> -- 
>>>> 2.14.1
>>>>

Reply via email to