Thursday, July 12, 2018 3:02 PM, Mordechay Haimovsky:
> Subject: [PATCH v3] net/mlx5: add support for 32bit systems
> 
> This patch adds support for building and running the mlx5 PMD on 32-bit
> systems such as i686.
> 
> The main issue to tackle was handling 32-bit access to the UAR, as quoted
> from the mlx5 PRM:
> QP and CQ DoorBells require 64-bit writes. For best performance, it is
> recommended to execute the QP/CQ DoorBell as a single 64-bit write
> operation. For platforms that do not support 64-bit writes, it is
> possible to issue the 64-bit DoorBell as two consecutive 32-bit writes,
> as described below:
> * The order of writing each of the Dwords is from lower to upper
>   addresses.
> * No other DoorBell can be rung (or even start ringing) in the midst of
>   an on-going write of a DoorBell over a given UAR page.
> The last rule implies that in a multi-threaded environment, the access to
> a UAR page (which can be accessible by all threads in the process) must
> be synchronized (for example, using a semaphore) unless an atomic write
> of 64 bits in a single bus operation is guaranteed. Such a
> synchronization is not required when ringing DoorBells on different UAR
> pages.
> 
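For reference, here is a minimal sketch of the split-write pattern the PRM
describes, using DPDK's rte_io/rte_spinlock helpers; the patch's own
__mlx5_uar_write64_relaxed() below is the actual implementation, and
doorbell_write64_split is just an illustrative name:

        /* Sketch only; assumes <rte_io.h> and <rte_spinlock.h>.
         * Ring a 64-bit DoorBell as two 32-bit writes, lower Dword
         * first, holding a per-UAR-page lock so that no other DoorBell
         * on the same page can start in the middle.
         */
        static inline void
        doorbell_write64_split(uint64_t val, volatile void *addr,
                               rte_spinlock_t *lock)
        {
                rte_spinlock_lock(lock);
                rte_write32_relaxed((uint32_t)val, addr);
                rte_io_wmb(); /* enforce lower-then-upper Dword order */
                rte_write32_relaxed((uint32_t)(val >> 32),
                                    (volatile char *)addr + 4);
                rte_spinlock_unlock(lock);
        }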
> Signed-off-by: Moti Haimovsky <mo...@mellanox.com>

Applied to next-net-mlx (again), thanks. 

Guidelines for 32-bit compilation and testing:
1. Fetch the latest rdma-core from GitHub. Make sure you have commit
   708c8242 ("mlx5: Fix compilation on 32 bit systems when sse3 is on");
   a check is sketched below.
2. Compile rdma-core for 32-bit:
        mkdir build32
        cd build32
        CFLAGS="-Werror -m32" cmake -GNinja .. -DENABLE_RESOLVE_NEIGH=0 -DIOCTL_MODE=both
        ninja (or ninja-build)
   (approach taken from the rdma-core Travis build:
   https://github.com/linux-rdma/rdma-core/blob/master/buildlib/travis-build#L20)
3. Compile and run DPDK against the build32 directory, for example as
   sketched below.
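
A sketch only; the rdma-core path is a placeholder, and the classic
make-based DPDK build with a 32-bit-capable toolchain is assumed:

        # step 1: verify the required rdma-core commit is an ancestor of HEAD
        git merge-base --is-ancestor 708c8242 HEAD && echo "commit present"
        # step 3: build 32-bit DPDK against the build32 tree
        export EXTRA_CFLAGS="-m32 -I/path/to/rdma-core/build32/include"
        export EXTRA_LDFLAGS="-L/path/to/rdma-core/build32/lib"
        make config T=i686-native-linuxapp-gcc
        make
        # point the runtime linker at the 32-bit libibverbs/libmlx5
        export LD_LIBRARY_PATH=/path/to/rdma-core/build32/lib:$LD_LIBRARY_PATH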
        

> ---
> v3:
> * Rebased upon latest changes in mlx5 PMD and rdma-core.
> 
> v2:
> * Fixed coding style issues.
> * Modified documentation according to review inputs.
> * Fixed merge conflicts.
> ---
>  doc/guides/nics/features/mlx5.ini |  1 +
>  doc/guides/nics/mlx5.rst          |  6 +++-
>  drivers/net/mlx5/mlx5.c           |  8 ++++-
>  drivers/net/mlx5/mlx5.h           |  5 +++
>  drivers/net/mlx5/mlx5_defs.h      | 18 ++++++++--
>  drivers/net/mlx5/mlx5_rxq.c       |  6 +++-
>  drivers/net/mlx5/mlx5_rxtx.c      | 22 +++++++------
>  drivers/net/mlx5/mlx5_rxtx.h      | 69 ++++++++++++++++++++++++++++++++++++++-
>  drivers/net/mlx5/mlx5_txq.c       | 13 +++++++-
>  9 files changed, 131 insertions(+), 17 deletions(-)
> 
> diff --git a/doc/guides/nics/features/mlx5.ini b/doc/guides/nics/features/mlx5.ini
> index e75b14b..b28b43e 100644
> --- a/doc/guides/nics/features/mlx5.ini
> +++ b/doc/guides/nics/features/mlx5.ini
> @@ -43,5 +43,6 @@ Multiprocess aware   = Y
>  Other kdrv           = Y
>  ARMv8                = Y
>  Power8               = Y
> +x86-32               = Y
>  x86-64               = Y
>  Usage doc            = Y
> diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
> index 0d0d217..ebf2336 100644
> --- a/doc/guides/nics/mlx5.rst
> +++ b/doc/guides/nics/mlx5.rst
> @@ -49,7 +49,7 @@ libibverbs.
>  Features
>  --------
> 
> -- Multi arch support: x86_64, POWER8, ARMv8.
> +- Multi arch support: x86_64, POWER8, ARMv8, i686.
>  - Multiple TX and RX queues.
>  - Support for scattered TX and RX frames.
>  - IPv4, IPv6, TCPv4, TCPv6, UDPv4 and UDPv6 RSS on any number of queues.
> @@ -489,6 +489,10 @@ RMDA Core with Linux Kernel
>  - Minimal kernel version : v4.14 or the most recent 4.14-rc (see `Linux installation documentation`_)
>  - Minimal rdma-core version: v15+ commit 0c5f5765213a ("Merge pull request #227 from yishaih/tm")
>    (see `RDMA Core installation documentation`_)
> +- When building for i686 use:
> +
> +  - rdma-core version 18.0 or above built with 32bit support.
> +  - Kernel version 4.14.41 or above.
> 
>  .. _`Linux installation documentation`: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/plain/Documentation/admin-guide/README.rst
>  .. _`RDMA Core installation documentation`: https://raw.githubusercontent.com/linux-rdma/rdma-core/master/README.md
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> index dda50b8..15f1a17 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -598,7 +598,7 @@
>       rte_memseg_walk(find_lower_va_bound, &addr);
> 
>       /* keep distance to hugepages to minimize potential conflicts. */
> -     addr = RTE_PTR_SUB(addr, MLX5_UAR_OFFSET + MLX5_UAR_SIZE);
> +     addr = RTE_PTR_SUB(addr, (uintptr_t)(MLX5_UAR_OFFSET + MLX5_UAR_SIZE));
>       /* anonymous mmap, no real memory consumption. */
>       addr = mmap(addr, MLX5_UAR_SIZE,
>                   PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
> @@ -939,6 +939,12 @@
>       priv->device_attr = attr;
>       priv->pd = pd;
>       priv->mtu = ETHER_MTU;
> +#ifndef RTE_ARCH_64
> +     /* Initialize UAR access locks for 32bit implementations. */
> +     rte_spinlock_init(&priv->uar_lock_cq);
> +     for (i = 0; i < MLX5_UAR_PAGE_NUM_MAX; i++)
> +             rte_spinlock_init(&priv->uar_lock[i]);
> +#endif
>       /* Some internal functions rely on Netlink sockets, open them now. */
>       priv->nl_socket_rdma = mlx5_nl_init(0, NETLINK_RDMA);
>       priv->nl_socket_route = mlx5_nl_init(RTMGRP_LINK, NETLINK_ROUTE);
> diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
> index 131be33..896158a 100644
> --- a/drivers/net/mlx5/mlx5.h
> +++ b/drivers/net/mlx5/mlx5.h
> @@ -215,6 +215,11 @@ struct priv {
>       int nl_socket_rdma; /* Netlink socket (NETLINK_RDMA). */
>       int nl_socket_route; /* Netlink socket (NETLINK_ROUTE). */
>       uint32_t nl_sn; /* Netlink message sequence number. */
> +#ifndef RTE_ARCH_64
> +     rte_spinlock_t uar_lock_cq; /* CQs share a common distinct UAR */
> +     rte_spinlock_t uar_lock[MLX5_UAR_PAGE_NUM_MAX];
> +     /* UAR same-page access control required in 32bit implementations. */
> +#endif
>  };
> 
>  #define PORT_ID(priv) ((priv)->dev_data->port_id)
> diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
> index 5bbbec2..f6ec415 100644
> --- a/drivers/net/mlx5/mlx5_defs.h
> +++ b/drivers/net/mlx5/mlx5_defs.h
> @@ -87,14 +87,28 @@
>  #define MLX5_LINK_STATUS_TIMEOUT 10
> 
>  /* Reserved address space for UAR mapping. */
> -#define MLX5_UAR_SIZE (1ULL << 32)
> +#define MLX5_UAR_SIZE (1ULL << (sizeof(uintptr_t) * 4))
> 
>  /* Offset of reserved UAR address space to hugepage memory. Offset is used here
>   * to minimize possibility of address next to hugepage being used by other code
>   * in either primary or secondary process, failing to map TX UAR would make TX
>   * packets invisible to HW.
>   */
> -#define MLX5_UAR_OFFSET (1ULL << 32)
> +#define MLX5_UAR_OFFSET (1ULL << (sizeof(uintptr_t) * 4))
> +
> +/* Maximum number of UAR pages used by a port,
> + * These are the size and mask for an array of mutexes used to synchronize
> + * the access to port's UARs on platforms that do not support 64 bit writes.
> + * In such systems it is possible to issue the 64 bits DoorBells through two
> + * consecutive writes, each write 32 bits. The access to a UAR page (which can
> + * be accessible by all threads in the process) must be synchronized
> + * (for example, using a semaphore). Such a synchronization is not required
> + * when ringing DoorBells on different UAR pages.
> + * A port with 512 Tx queues uses 8, 4kBytes, UAR pages which are shared
> + * among the ports.
> + */
> +#define MLX5_UAR_PAGE_NUM_MAX 64
> +#define MLX5_UAR_PAGE_NUM_MASK ((MLX5_UAR_PAGE_NUM_MAX) - 1)
> 
>  /* Log 2 of the default number of strides per WQE for Multi-Packet RQ. */
>  #define MLX5_MPRQ_STRIDE_NUM_N 6U
> diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
> index 071740b..16e1641 100644
> --- a/drivers/net/mlx5/mlx5_rxq.c
> +++ b/drivers/net/mlx5/mlx5_rxq.c
> @@ -647,7 +647,8 @@
>       doorbell = (uint64_t)doorbell_hi << 32;
>       doorbell |=  rxq->cqn;
>       rxq->cq_db[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(doorbell_hi);
> -     rte_write64(rte_cpu_to_be_64(doorbell), cq_db_reg);
> +     mlx5_uar_write64(rte_cpu_to_be_64(doorbell),
> +                      cq_db_reg, rxq->uar_lock_cq);
>  }
> 
>  /**
> @@ -1449,6 +1450,9 @@ struct mlx5_rxq_ctrl *
>       tmpl->rxq.elts_n = log2above(desc);
>       tmpl->rxq.elts =
>               (struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);
> +#ifndef RTE_ARCH_64
> +     tmpl->rxq.uar_lock_cq = &priv->uar_lock_cq;
> +#endif
>       tmpl->idx = idx;
>       rte_atomic32_inc(&tmpl->refcnt);
>       LIST_INSERT_HEAD(&priv->rxqsctrl, tmpl, next);
> diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
> index a7ed8d8..52a1074 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.c
> +++ b/drivers/net/mlx5/mlx5_rxtx.c
> @@ -495,6 +495,7 @@
>       volatile struct mlx5_wqe_ctrl *last_wqe = NULL;
>       unsigned int segs_n = 0;
>       const unsigned int max_inline = txq->max_inline;
> +     uint64_t addr_64;
> 
>       if (unlikely(!pkts_n))
>               return 0;
> @@ -711,12 +712,12 @@
>                       ds = 3;
>  use_dseg:
>                       /* Add the remaining packet as a simple ds. */
> -                     addr = rte_cpu_to_be_64(addr);
> +                     addr_64 = rte_cpu_to_be_64(addr);
>                       *dseg = (rte_v128u32_t){
>                               rte_cpu_to_be_32(length),
>                               mlx5_tx_mb2mr(txq, buf),
> -                             addr,
> -                             addr >> 32,
> +                             addr_64,
> +                             addr_64 >> 32,
>                       };
>                       ++ds;
>                       if (!segs_n)
> @@ -750,12 +751,12 @@
>               total_length += length;
>  #endif
>               /* Store segment information. */
> -             addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t));
> +             addr_64 = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t));
>               *dseg = (rte_v128u32_t){
>                       rte_cpu_to_be_32(length),
>                       mlx5_tx_mb2mr(txq, buf),
> -                     addr,
> -                     addr >> 32,
> +                     addr_64,
> +                     addr_64 >> 32,
>               };
>               (*txq->elts)[++elts_head & elts_m] = buf;
>               if (--segs_n)
> @@ -1450,6 +1451,7 @@
>       unsigned int mpw_room = 0;
>       unsigned int inl_pad = 0;
>       uint32_t inl_hdr;
> +     uint64_t addr_64;
>       struct mlx5_mpw mpw = {
>               .state = MLX5_MPW_STATE_CLOSED,
>       };
> @@ -1586,13 +1588,13 @@
>                                       ((uintptr_t)mpw.data.raw +
>                                        inl_pad);
>                       (*txq->elts)[elts_head++ & elts_m] = buf;
> -                     addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
> -                                                              uintptr_t));
> +                     addr_64 = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
> +                                                                 uintptr_t));
>                       *dseg = (rte_v128u32_t) {
>                               rte_cpu_to_be_32(length),
>                               mlx5_tx_mb2mr(txq, buf),
> -                             addr,
> -                             addr >> 32,
> +                             addr_64,
> +                             addr_64 >> 32,
>                       };
>                       mpw.data.raw = (volatile void *)(dseg + 1);
>                       mpw.total_len += (inl_pad + sizeof(*dseg));
> diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
> index a04a84f..992e977 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.h
> +++ b/drivers/net/mlx5/mlx5_rxtx.h
> @@ -26,6 +26,8 @@
>  #include <rte_common.h>
>  #include <rte_hexdump.h>
>  #include <rte_atomic.h>
> +#include <rte_spinlock.h>
> +#include <rte_io.h>
> 
>  #include "mlx5_utils.h"
>  #include "mlx5.h"
> @@ -118,6 +120,10 @@ struct mlx5_rxq_data {
>       void *cq_uar; /* CQ user access region. */
>       uint32_t cqn; /* CQ number. */
>       uint8_t cq_arm_sn; /* CQ arm seq number. */
> +#ifndef RTE_ARCH_64
> +     rte_spinlock_t *uar_lock_cq;
> +     /* CQ (UAR) access lock required for 32bit implementations */
> +#endif
>       uint32_t tunnel; /* Tunnel information. */
>  } __rte_cache_aligned;
> 
> @@ -198,6 +204,10 @@ struct mlx5_txq_data {
>       volatile void *bf_reg; /* Blueflame register remapped. */
>       struct rte_mbuf *(*elts)[]; /* TX elements. */
>       struct mlx5_txq_stats stats; /* TX queue counters. */
> +#ifndef RTE_ARCH_64
> +     rte_spinlock_t *uar_lock;
> +     /* UAR access lock required for 32bit implementations */
> +#endif
>  } __rte_cache_aligned;
> 
>  /* Verbs Rx queue elements. */
> @@ -353,6 +363,63 @@ uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts,
>  uint32_t mlx5_rx_addr2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr);
>  uint32_t mlx5_tx_addr2mr_bh(struct mlx5_txq_data *txq, uintptr_t addr);
> 
> +/**
> + * Provide safe 64bit store operation to mlx5 UAR region for both 32bit and
> + * 64bit architectures.
> + *
> + * @param val
> + *   value to write in CPU endian format.
> + * @param addr
> + *   Address to write to.
> + * @param lock
> + *   Address of the lock to use for that UAR access.
> + */
> +static __rte_always_inline void
> +__mlx5_uar_write64_relaxed(uint64_t val, volatile void *addr,
> +                          rte_spinlock_t *lock __rte_unused)
> +{
> +#ifdef RTE_ARCH_64
> +     rte_write64_relaxed(val, addr);
> +#else /* !RTE_ARCH_64 */
> +     rte_spinlock_lock(lock);
> +     rte_write32_relaxed(val, addr);
> +     rte_io_wmb();
> +     rte_write32_relaxed(val >> 32,
> +                         (volatile void *)((volatile char *)addr + 4));
> +     rte_spinlock_unlock(lock);
> +#endif
> +}
> +
> +/**
> + * Provide safe 64bit store operation to mlx5 UAR region for both 32bit and
> + * 64bit architectures while guaranteeing the order of execution with the
> + * code being executed.
> + *
> + * @param val
> + *   value to write in CPU endian format.
> + * @param addr
> + *   Address to write to.
> + * @param lock
> + *   Address of the lock to use for that UAR access.
> + */
> +static __rte_always_inline void
> +__mlx5_uar_write64(uint64_t val, volatile void *addr, rte_spinlock_t *lock)
> +{
> +     rte_io_wmb();
> +     __mlx5_uar_write64_relaxed(val, addr, lock);
> +}
> +
> +/* Assist macros, used instead of directly calling the functions they wrap. */
> +#ifdef RTE_ARCH_64
> +#define mlx5_uar_write64_relaxed(val, dst, lock) \
> +             __mlx5_uar_write64_relaxed(val, dst, NULL)
> +#define mlx5_uar_write64(val, dst, lock) __mlx5_uar_write64(val, dst, NULL)
> +#else
> +#define mlx5_uar_write64_relaxed(val, dst, lock) \
> +             __mlx5_uar_write64_relaxed(val, dst, lock)
> +#define mlx5_uar_write64(val, dst, lock) __mlx5_uar_write64(val, dst, lock)
> +#endif
> +
>  #ifndef NDEBUG
>  /**
>   * Verify or set magic value in CQE.
> @@ -619,7 +686,7 @@ uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts,
>       *txq->qp_db = rte_cpu_to_be_32(txq->wqe_ci);
>       /* Ensure ordering between DB record and BF copy. */
>       rte_wmb();
> -     *dst = *src;
> +     mlx5_uar_write64_relaxed(*src, dst, txq->uar_lock);
>       if (cond)
>               rte_wmb();
>  }
> diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
> index 5057561..f9bc473 100644
> --- a/drivers/net/mlx5/mlx5_txq.c
> +++ b/drivers/net/mlx5/mlx5_txq.c
> @@ -255,6 +255,9 @@
>       struct mlx5_txq_ctrl *txq_ctrl;
>       int already_mapped;
>       size_t page_size = sysconf(_SC_PAGESIZE);
> +#ifndef RTE_ARCH_64
> +     unsigned int lock_idx;
> +#endif
> 
>       memset(pages, 0, priv->txqs_n * sizeof(uintptr_t));
>       /*
> @@ -281,7 +284,7 @@
>               }
>               /* new address in reserved UAR address space. */
>               addr = RTE_PTR_ADD(priv->uar_base,
> -                                uar_va & (MLX5_UAR_SIZE - 1));
> +                                uar_va & (uintptr_t)(MLX5_UAR_SIZE - 1));
>               if (!already_mapped) {
>                       pages[pages_n++] = uar_va;
>                       /* fixed mmap to specified address in reserved
> @@ -305,6 +308,12 @@
>               else
>                       assert(txq_ctrl->txq.bf_reg ==
>                              RTE_PTR_ADD((void *)addr, off));
> +#ifndef RTE_ARCH_64
> +             /* Assign a UAR lock according to UAR page number */
> +             lock_idx = (txq_ctrl->uar_mmap_offset / page_size) &
> +                        MLX5_UAR_PAGE_NUM_MASK;
> +             txq->uar_lock = &priv->uar_lock[lock_idx];
> +#endif
>       }
>       return 0;
>  }
> @@ -511,6 +520,8 @@ struct mlx5_txq_ibv *
>       rte_atomic32_inc(&txq_ibv->refcnt);
>       if (qp.comp_mask & MLX5DV_QP_MASK_UAR_MMAP_OFFSET) {
>               txq_ctrl->uar_mmap_offset = qp.uar_mmap_offset;
> +             DRV_LOG(DEBUG, "port %u: uar_mmap_offset 0x%lx",
> +                     dev->data->port_id, txq_ctrl->uar_mmap_offset);
>       } else {
>               DRV_LOG(ERR,
>                       "port %u failed to retrieve UAR info, invalid"
> --
> 1.8.3.1
