> Hello,
> the patch below is of interest for people who use Mellanox Connect-X/4
> and 5 ethernet adapters and configure it for jumbo frames.  The patch
> should improve the system behavior on low or fragmented memory situations.
> See the commit message for detailed description.  Also, the patch should
> not cause performance degradation for the normal 1500 MTU by keeping the
> same configuration as currently unpatched driver.
> 
> I am interested in hearing about regressions (or not) caused by the
> change.

I'll hopefully get  to test this early next week, but my schedule
is very full until then.

I do have a fist full of sytle(9) and other comments though.

You can probably ignore my nits on capitalize start of comments,
it seems that this may be the style of this code....

> commit f25c0c624177a1ca06ca051b0adb842acb66ec11
> Author: Konstantin Belousov <konstant...@mellanox.com>
> Date:   Wed Nov 22 20:14:53 2017 +0200
> 
>     mlx5en: Handle jumbo frames without requiring big clusters.
>     
>     The scatter list is formed by the chunks of MCLBYTES each, and larger
>     than default packets are returned to the stack as the mbuf chain.
>     
>     This is an improvements over the original patch by ha...@mellanox.com
>     which solves some busdma issues and sizes WQEs scatter list to only
>     have the minimal sufficient number of elements.  In particular, for
>     the default MTU 1500 bytes the receive queue format is not changed
>     comparing to the unpatched driver, avoiding a reported performance
>     regression.
>     
>     Issue: 1065432
>     Issue: 1184316
>     Change-Id: Ib63a5ef55abd6ef1ec9b296e2cef88e4604bbd29
>     Signed-off-by: Konstantin Belousov <konstant...@mellanox.com>
> 
> diff --git a/sys/dev/mlx5/mlx5_en/en.h b/sys/dev/mlx5/mlx5_en/en.h
> index b5000c32eaf..ff5ef446e34 100644
> --- a/sys/dev/mlx5/mlx5_en/en.h
> +++ b/sys/dev/mlx5/mlx5_en/en.h
> @@ -80,8 +80,19 @@
>  #define      MLX5E_PARAMS_DEFAULT_LOG_RQ_SIZE                0xa
>  #define      MLX5E_PARAMS_MAXIMUM_LOG_RQ_SIZE                0xe
>  
> -/* freeBSD HW LRO is limited by 16KB - the size of max mbuf */
Though it is tempting to fix spelling errors and stylish things
while working on code it usually helps to sepeate those into
either a pre or post commit to reduce the lines of code change
that need to be looked at when reviewing a diff.  This also helps
diff(1) to chunk things a bit better.

> +#define      MLX5E_MAX_RX_SEGS 7
> +
> +#ifndef MLX5E_MAX_RX_BYTES
> +#define      MLX5E_MAX_RX_BYTES MCLBYTES
> +#endif
> +
> +#if (MLX5E_MAX_RX_SEGS == 1)
> +/* FreeBSD HW LRO is limited by 16KB - the size of max mbuf */
>  #define      MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ                 MJUM16BYTES
> +#else
> +#define      MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ \
> +    MIN(65535, MLX5E_MAX_RX_SEGS * MLX5E_MAX_RX_BYTES)
> +#endif
>  #define      MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC      0x10
>  #define      MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_USEC_FROM_CQE     0x3
>  #define      MLX5E_PARAMS_DEFAULT_RX_CQ_MODERATION_PKTS      0x20
> @@ -530,6 +541,7 @@ struct mlx5e_rq {
>       struct mtx mtx;
>       bus_dma_tag_t dma_tag;
>       u32     wqe_sz;
> +     u32     nsegs;

Alphabetic order, and should just use modify the current u32 line:
        u32     nsegs, wqe_sz;

>       struct mlx5e_rq_mbuf *mbuf;
>       struct ifnet *ifp;
>       struct mlx5e_rq_stats stats;
> @@ -795,9 +807,12 @@ struct mlx5e_tx_wqe {
>  
>  struct mlx5e_rx_wqe {
>       struct mlx5_wqe_srq_next_seg next;
> -     struct mlx5_wqe_data_seg data;
> +     struct mlx5_wqe_data_seg data[];

None standard way to declare a pointer?

>  };
>  
> +/* the size of the structure above must be power of two */
the should be The.

> +CTASSERT(powerof2(sizeof(struct mlx5e_rx_wqe)));
> +
>  struct mlx5e_eeprom {
>       int     lock_bit;
>       int     i2c_addr;
> diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c 
> b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
> index c2c4b0d7744..25544e561e3 100644
> --- a/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
> +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_main.c
> @@ -33,9 +33,12 @@
>  #ifndef ETH_DRIVER_VERSION
>  #define      ETH_DRIVER_VERSION      "3.4.1"
>  #endif
> +
Adding white space is a style change, and actually unwanted right here
as the next line is tightly coupled to the previous line.

>  char mlx5e_version[] = "Mellanox Ethernet driver"
>      " (" ETH_DRIVER_VERSION ")";
>  
> +static int mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 
> *nsegs);
> +

Could you move the function before use and eliminate the need
for the forward declare?

>  struct mlx5e_channel_param {
>       struct mlx5e_rq_param rq;
>       struct mlx5e_sq_param sq;
> @@ -810,6 +813,11 @@ mlx5e_create_rq(struct mlx5e_channel *c,
>       int wq_sz;
>       int err;
>       int i;
> +     u32 nsegs, wqe_sz;
> +
> +     err = mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs);
> +     if (err != 0)
> +             goto done;
>  
>       /* Create DMA descriptor TAG */
>       if ((err = -bus_dma_tag_create(
> @@ -819,9 +827,9 @@ mlx5e_create_rq(struct mlx5e_channel *c,
>           BUS_SPACE_MAXADDR,          /* lowaddr */
>           BUS_SPACE_MAXADDR,          /* highaddr */
>           NULL, NULL,                 /* filter, filterarg */
> -         MJUM16BYTES,                /* maxsize */
> -         1,                          /* nsegments */
> -         MJUM16BYTES,                /* maxsegsize */
> +         nsegs * MLX5E_MAX_RX_BYTES, /* maxsize */
> +         nsegs,                      /* nsegments */
> +         nsegs * MLX5E_MAX_RX_BYTES, /* maxsegsize */
>           0,                          /* flags */
>           NULL, NULL,                 /* lockfunc, lockfuncarg */
>           &rq->dma_tag)))
> @@ -834,23 +842,9 @@ mlx5e_create_rq(struct mlx5e_channel *c,
>  
>       rq->wq.db = &rq->wq.db[MLX5_RCV_DBR];
>  
> -     if (priv->params.hw_lro_en) {
> -             rq->wqe_sz = priv->params.lro_wqe_sz;
> -     } else {
> -             rq->wqe_sz = MLX5E_SW2MB_MTU(priv->ifp->if_mtu);
> -     }
> -     if (rq->wqe_sz > MJUM16BYTES) {
> -             err = -ENOMEM;
> +     err = mlx5e_get_wqe_sz(priv, &rq->wqe_sz, &rq->nsegs);
> +     if (err != 0)
>               goto err_rq_wq_destroy;
> -     } else if (rq->wqe_sz > MJUM9BYTES) {
> -             rq->wqe_sz = MJUM16BYTES;
> -     } else if (rq->wqe_sz > MJUMPAGESIZE) {
> -             rq->wqe_sz = MJUM9BYTES;
> -     } else if (rq->wqe_sz > MCLBYTES) {
> -             rq->wqe_sz = MJUMPAGESIZE;
> -     } else {
> -             rq->wqe_sz = MCLBYTES;
> -     }
>  
>       wq_sz = mlx5_wq_ll_get_size(&rq->wq);
>  
> @@ -861,7 +855,11 @@ mlx5e_create_rq(struct mlx5e_channel *c,
>       rq->mbuf = malloc(wq_sz * sizeof(rq->mbuf[0]), M_MLX5EN, M_WAITOK | 
> M_ZERO);
>       for (i = 0; i != wq_sz; i++) {
>               struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
> +#if (MLX5E_MAX_RX_SEGS == 1)
>               uint32_t byte_count = rq->wqe_sz - MLX5E_NET_IP_ALIGN;
> +#else
> +             int j;
> +#endif
>  
>               err = -bus_dmamap_create(rq->dma_tag, 0, &rq->mbuf[i].dma_map);
>               if (err != 0) {
> @@ -869,8 +867,15 @@ mlx5e_create_rq(struct mlx5e_channel *c,
>                               bus_dmamap_destroy(rq->dma_tag, 
> rq->mbuf[i].dma_map);
>                       goto err_rq_mbuf_free;
>               }
> -             wqe->data.lkey = c->mkey_be;
> -             wqe->data.byte_count = cpu_to_be32(byte_count | 
> MLX5_HW_START_PADDING);
> +
> +             /* set value for constant fields */
> +#if (MLX5E_MAX_RX_SEGS == 1)
> +             wqe->data[0].lkey = c->mkey_be;
> +             wqe->data[0].byte_count = cpu_to_be32(byte_count | 
> MLX5_HW_START_PADDING);
> +#else
> +             for (j = 0; j < rq->nsegs; j++)
> +                     wqe->data[j].lkey = c->mkey_be;
> +#endif
>       }
>  
>       rq->ifp = c->ifp;
> @@ -1809,16 +1814,51 @@ mlx5e_close_channel_wait(struct mlx5e_channel 
> *volatile *pp)
>       free(c, M_MLX5EN);
>  }
>  
> +static int
> +mlx5e_get_wqe_sz(struct mlx5e_priv *priv, u32 *wqe_sz, u32 *nsegs)
> +{
> +     u32 r, n;
Alpah order on variables:
        u32 n, r;

> +
> +     r = priv->params.hw_lro_en ? priv->params.lro_wqe_sz :
> +         MLX5E_SW2MB_MTU(priv->ifp->if_mtu);
> +     if (r > MJUM16BYTES)
> +             return (-ENOMEM);
> +
> +     if (r > MJUM9BYTES)
> +             r = MJUM16BYTES;
> +     else if (r > MJUMPAGESIZE)
> +             r = MJUM9BYTES;
> +     else if (r > MCLBYTES)
> +             r = MJUMPAGESIZE;
> +     else
> +             r = MCLBYTES;
> +
> +     /*
> +      * n + 1 must be a power of two, because stride size must be.
> +      * Stride size is 16 * (n + 1), as the first segment is
> +      * control.
> +      */
> +     for (n = howmany(r, MLX5E_MAX_RX_BYTES); !powerof2(n + 1); n++)
> +             ;
> +
> +     *wqe_sz = r;
> +     *nsegs = n;
> +     return (0);
> +}
> +
>  static void
>  mlx5e_build_rq_param(struct mlx5e_priv *priv,
>      struct mlx5e_rq_param *param)
>  {
>       void *rqc = param->rqc;
>       void *wq = MLX5_ADDR_OF(rqc, rqc, wq);
> +     u32 wqe_sz, nsegs;
Variable order: nsegs, wqe_sz;

>  
> +     mlx5e_get_wqe_sz(priv, &wqe_sz, &nsegs);
>       MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_LINKED_LIST);
>       MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
> -     MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe)));
> +     MLX5_SET(wq, wq, log_wq_stride, ilog2(sizeof(struct mlx5e_rx_wqe) +
> +         nsegs * sizeof(struct mlx5_wqe_data_seg)));
>       MLX5_SET(wq, wq, log_wq_sz, priv->params.log_rq_size);
>       MLX5_SET(wq, wq, pd, priv->pdn);
>  
> diff --git a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c 
> b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
> index fb14be43b32..5868b0a2f55 100644
> --- a/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
> +++ b/sys/dev/mlx5/mlx5_en/mlx5_en_rx.c
> @@ -32,21 +32,47 @@ static inline int
>  mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq,
>      struct mlx5e_rx_wqe *wqe, u16 ix)
>  {
> -     bus_dma_segment_t segs[1];
> +     bus_dma_segment_t segs[rq->nsegs];
This code line compiles?   Or another odd pointer declare?

>       struct mbuf *mb;
>       int nsegs;
>       int err;
> -
> +#if (MLX5E_MAX_RX_SEGS != 1)
> +     struct mbuf *mb_head;
> +     int i;
> +#endif
>       if (rq->mbuf[ix].mbuf != NULL)
>               return (0);
>  
> +#if (MLX5E_MAX_RX_SEGS == 1)
>       mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rq->wqe_sz);
>       if (unlikely(!mb))
>               return (-ENOMEM);
>  
> -     /* set initial mbuf length */
>       mb->m_pkthdr.len = mb->m_len = rq->wqe_sz;
> +#else
> +     mb_head = mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR,
> +         MLX5E_MAX_RX_BYTES);
> +     if (unlikely(mb == NULL))
> +             return (-ENOMEM);
> +
> +     mb->m_len = MLX5E_MAX_RX_BYTES;
> +     mb->m_pkthdr.len = MLX5E_MAX_RX_BYTES;
>  
> +     for (i = 1; i < rq->nsegs; i++) {
> +             if (mb_head->m_pkthdr.len >= rq->wqe_sz)
> +                     break;
> +             mb = mb->m_next = m_getjcl(M_NOWAIT, MT_DATA, 0,
> +                 MLX5E_MAX_RX_BYTES);
> +             if (unlikely(mb == NULL)) {
> +                     m_freem(mb_head);
> +                     return (-ENOMEM);
> +             }
> +             mb->m_len = MLX5E_MAX_RX_BYTES;
> +             mb_head->m_pkthdr.len += MLX5E_MAX_RX_BYTES;
> +     }
> +     /* rewind to first mbuf in chain */
> +     mb = mb_head;
> +#endif
>       /* get IP header aligned */
>       m_adj(mb, MLX5E_NET_IP_ALIGN);
>  
> @@ -54,12 +80,26 @@ mlx5e_alloc_rx_wqe(struct mlx5e_rq *rq,
>           mb, segs, &nsegs, BUS_DMA_NOWAIT);
>       if (err != 0)
>               goto err_free_mbuf;
> -     if (unlikely(nsegs != 1)) {
> +     if (unlikely(nsegs == 0)) {
>               bus_dmamap_unload(rq->dma_tag, rq->mbuf[ix].dma_map);
>               err = -ENOMEM;
>               goto err_free_mbuf;
>       }
> -     wqe->data.addr = cpu_to_be64(segs[0].ds_addr);
> +#if (MLX5E_MAX_RX_SEGS == 1)
> +     wqe->data[0].addr = cpu_to_be64(segs[0].ds_addr);
> +#else
> +     wqe->data[0].addr = cpu_to_be64(segs[0].ds_addr);
> +     wqe->data[0].byte_count = cpu_to_be32(segs[0].ds_len |
> +         MLX5_HW_START_PADDING);
> +     for (i = 1; i != nsegs; i++) {
> +             wqe->data[i].addr = cpu_to_be64(segs[i].ds_addr);
> +             wqe->data[i].byte_count = cpu_to_be32(segs[i].ds_len);
> +     }
> +     for (; i < rq->nsegs; i++) {
> +             wqe->data[i].addr = 0;
> +             wqe->data[i].byte_count = 0;
> +     }
> +#endif
>  
>       rq->mbuf[ix].mbuf = mb;
>       rq->mbuf[ix].data = mb->m_data;
> @@ -214,6 +254,9 @@ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe,
>  {
>       struct ifnet *ifp = rq->ifp;
>       struct mlx5e_channel *c;
> +#if (MLX5E_MAX_RX_SEGS != 1)
> +     struct mbuf *mb_head;
> +#endif
>       int lro_num_seg;        /* HW LRO session aggregated packets counter */
>       uint64_t tstmp;
>  
> @@ -224,7 +267,26 @@ mlx5e_build_rx_mbuf(struct mlx5_cqe64 *cqe,
>               rq->stats.lro_bytes += cqe_bcnt;
>       }
>  
> +#if (MLX5E_MAX_RX_SEGS == 1)
>       mb->m_pkthdr.len = mb->m_len = cqe_bcnt;
> +#else
> +     mb->m_pkthdr.len = cqe_bcnt;
> +     for (mb_head = mb; mb != NULL; mb = mb->m_next) {
> +             if (mb->m_len > cqe_bcnt)
> +                     mb->m_len = cqe_bcnt;
> +             cqe_bcnt -= mb->m_len;
> +             if (likely(cqe_bcnt == 0)) {
> +                     if (likely(mb->m_next != NULL)) {
> +                             /* trim off empty mbufs */
Trim

> +                             m_freem(mb->m_next);
> +                             mb->m_next = NULL;
> +                     }
> +                     break;
> +             }
> +     }
> +     /* rewind to first mbuf in chain */
Rewind

> +     mb = mb_head;
> +#endif
>       /* check if a Toeplitz hash was computed */
>       if (cqe->rss_hash_type != 0) {
>               mb->m_pkthdr.flowid = be32_to_cpu(cqe->rss_hash_result);
> @@ -447,7 +509,10 @@ mlx5e_rx_cq_comp(struct mlx5_core_cq *mcq)
>       int i = 0;
>  
>  #ifdef HAVE_PER_CQ_EVENT_PACKET
> -     struct mbuf *mb = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, rq->wqe_sz);
> +#if (MHLEN < 15)
> +#error "MHLEN is too small"
> +#endif
> +     struct mbuf *mb = m_gethdr(M_NOWAIT, MT_DATA);
>  
>       if (mb != NULL) {
>               /* this code is used for debugging purpose only */
> _______________________________________________
> freebsd-net@freebsd.org mailing list
> https://lists.freebsd.org/mailman/listinfo/freebsd-net
> To unsubscribe, send any mail to "freebsd-net-unsubscr...@freebsd.org"
> 

-- 
Rod Grimes                                                 rgri...@freebsd.org
_______________________________________________
freebsd-net@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/freebsd-net
To unsubscribe, send any mail to "freebsd-net-unsubscr...@freebsd.org"

Reply via email to