The branch main has been updated by whu:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=a18e99945414fb1f9d455b780c6fcf2d09cc68d8

commit a18e99945414fb1f9d455b780c6fcf2d09cc68d8
Author:     Wei Hu <w...@freebsd.org>
AuthorDate: 2025-02-24 13:56:06 +0000
Commit:     Wei Hu <w...@freebsd.org>
CommitDate: 2025-02-24 13:56:06 +0000

    mana: Increase default tx and rx ring size to 1024
    
    TCP performance tests show a high number of retries under heavy tx
    traffic. The numbers of queue stops and wakeups also increase.
    Further analysis suggests the FreeBSD network stack tends to send
    TSO packets with multiple sg entries, typically ranging from
    10 to 16. On mana, every two sgs takes one unit of tx ring.
    Therefore, adding up one unit for the head, it takes 6 to 9 units
    of tx ring to send a typical TSO packet.
    
    Current default tx ring size is 256, which can get filled up
    quickly under heavy load. When tx ring is full, the send queue
    is stopped waiting for the ring space to be freed. This could
    cause the network stack to drop packets, and lead to TCP
    retransmissions.
    
    Increase the default tx and rx ring size to 1024 units. Also
    introduce two tunables allowing users to request tx and rx ring
    size in loader.conf:
            hw.mana.rx_req_size
            hw.mana.tx_req_size
    When mana is loading, the driver checks these two values and
    rounds them up to a power of 2. If they are not set, or the
    requested values are out of the allowable range, it sets the
    default ring size instead.
    
    Also change the tx and rx single loop completion budget to 8.
    
    Tested by:      whu
    MFC after:      2 weeks
    Sponsored by:   Microsoft
---
 sys/dev/mana/mana.h        | 23 +++++++++--
 sys/dev/mana/mana_en.c     | 96 +++++++++++++++++++++++++++++++++++++++-------
 sys/dev/mana/mana_sysctl.c | 16 ++++++++
 3 files changed, 119 insertions(+), 16 deletions(-)

diff --git a/sys/dev/mana/mana.h b/sys/dev/mana/mana.h
index 906b28eb56b6..a805aa047b9d 100644
--- a/sys/dev/mana/mana.h
+++ b/sys/dev/mana/mana.h
@@ -106,9 +106,23 @@ enum TRI_STATE {
 #define DEFAULT_FRAME_SIZE             (ADAPTER_MTU_SIZE + 14)
 #define MAX_FRAME_SIZE                 4096
 
-#define RX_BUFFERS_PER_QUEUE           512
-
-#define MAX_SEND_BUFFERS_PER_QUEUE     256
+/* Unit number of RX buffers. Must be power of two
+ * Higher number could fail at allocation.
+ */
+#define MAX_RX_BUFFERS_PER_QUEUE       8192
+#define DEF_RX_BUFFERS_PER_QUEUE       1024
+#define MIN_RX_BUFFERS_PER_QUEUE       128
+
+/* Unit number of TX buffers. Must be power of two
+ * Higher number could fail at allocation.
+ * The max value is derived as the maximum
+ * allocatable pages supported on host per guest
+ * through testing. TX buffer size beyond this
+ * value is rejected by the hardware.
+ */
+#define MAX_SEND_BUFFERS_PER_QUEUE     16384
+#define DEF_SEND_BUFFERS_PER_QUEUE     1024
+#define MIN_SEND_BUFFERS_PER_QUEUE     128
 
 #define EQ_SIZE                                (8 * PAGE_SIZE)
 #define LOG2_EQ_THROTTLE               3
@@ -507,6 +521,9 @@ struct mana_port_context {
        unsigned int            max_queues;
        unsigned int            num_queues;
 
+       unsigned int            tx_queue_size;
+       unsigned int            rx_queue_size;
+
        mana_handle_t           port_handle;
 
        int                     vport_use_count;
diff --git a/sys/dev/mana/mana_en.c b/sys/dev/mana/mana_en.c
index 735b94bba6cd..a1d2d1015b89 100644
--- a/sys/dev/mana/mana_en.c
+++ b/sys/dev/mana/mana_en.c
@@ -67,6 +67,9 @@
 static int mana_up(struct mana_port_context *apc);
 static int mana_down(struct mana_port_context *apc);
 
+extern unsigned int mana_tx_req_size;
+extern unsigned int mana_rx_req_size;
+
 static void
 mana_rss_key_fill(void *k, size_t size)
 {
@@ -492,6 +495,7 @@ mana_xmit(struct mana_txq *txq)
        if_t ndev = txq->ndev;
        struct mbuf *mbuf;
        struct mana_port_context *apc = if_getsoftc(ndev);
+       unsigned int tx_queue_size = apc->tx_queue_size;
        struct mana_port_stats *port_stats = &apc->port_stats;
        struct gdma_dev *gd = apc->ac->gdma_dev;
        uint64_t packets, bytes;
@@ -635,7 +639,7 @@ mana_xmit(struct mana_txq *txq)
                }
 
                next_to_use =
-                   (next_to_use + 1) % MAX_SEND_BUFFERS_PER_QUEUE;
+                   (next_to_use + 1) % tx_queue_size;
 
                (void)atomic_inc_return(&txq->pending_sends);
 
@@ -1423,6 +1427,7 @@ mana_poll_tx_cq(struct mana_cq *cq)
        unsigned int wqe_unit_cnt = 0;
        struct mana_txq *txq = cq->txq;
        struct mana_port_context *apc;
+       unsigned int tx_queue_size;
        uint16_t next_to_complete;
        if_t ndev;
        int comp_read;
@@ -1436,6 +1441,7 @@ mana_poll_tx_cq(struct mana_cq *cq)
 
        ndev = txq->ndev;
        apc = if_getsoftc(ndev);
+       tx_queue_size = apc->tx_queue_size;
 
        comp_read = mana_gd_poll_cq(cq->gdma_cq, completions,
            CQE_POLLING_BUFFER);
@@ -1521,7 +1527,7 @@ mana_poll_tx_cq(struct mana_cq *cq)
                mb();
 
                next_to_complete =
-                   (next_to_complete + 1) % MAX_SEND_BUFFERS_PER_QUEUE;
+                   (next_to_complete + 1) % tx_queue_size;
 
                pkt_transmitted++;
        }
@@ -1867,9 +1873,9 @@ mana_cq_handler(void *context, struct gdma_queue 
*gdma_queue)
        mana_gd_ring_cq(gdma_queue, arm_bit);
 }
 
-#define MANA_POLL_BUDGET       8
-#define MANA_RX_BUDGET         256
-#define MANA_TX_BUDGET         MAX_SEND_BUFFERS_PER_QUEUE
+#define MANA_POLL_BUDGET       256
+#define MANA_RX_BUDGET         8
+#define MANA_TX_BUDGET         8
 
 static void
 mana_poll(void *arg, int pending)
@@ -1976,7 +1982,7 @@ mana_deinit_txq(struct mana_port_context *apc, struct 
mana_txq *txq)
 
        if (txq->tx_buf_info) {
                /* Free all mbufs which are still in-flight */
-               for (i = 0; i < MAX_SEND_BUFFERS_PER_QUEUE; i++) {
+               for (i = 0; i < apc->tx_queue_size; i++) {
                        txbuf_info = &txq->tx_buf_info[i];
                        if (txbuf_info->mbuf) {
                                mana_tx_unmap_mbuf(apc, txbuf_info);
@@ -2034,15 +2040,19 @@ mana_create_txq(struct mana_port_context *apc, if_t net)
            M_DEVBUF, M_WAITOK | M_ZERO);
 
        /*  The minimum size of the WQE is 32 bytes, hence
-        *  MAX_SEND_BUFFERS_PER_QUEUE represents the maximum number of WQEs
+        *  apc->tx_queue_size represents the maximum number of WQEs
         *  the SQ can store. This value is then used to size other queues
         *  to prevent overflow.
+        *  Also note that the txq_size is always going to be page aligned,
+        *  as min val of apc->tx_queue_size is 128 and that would make
+        *  txq_size 128 * 32 = 4096 and the other higher values of
+        *  apc->tx_queue_size are always power of two.
         */
-       txq_size = MAX_SEND_BUFFERS_PER_QUEUE * 32;
+       txq_size = apc->tx_queue_size * 32;
        KASSERT(IS_ALIGNED(txq_size, PAGE_SIZE),
            ("txq size not page aligned"));
 
-       cq_size = MAX_SEND_BUFFERS_PER_QUEUE * COMP_ENTRY_SIZE;
+       cq_size = apc->tx_queue_size * COMP_ENTRY_SIZE;
        cq_size = ALIGN(cq_size, PAGE_SIZE);
 
        gc = gd->gdma_context;
@@ -2125,7 +2135,7 @@ mana_create_txq(struct mana_port_context *apc, if_t net)
                gc->cq_table[cq->gdma_id] = cq->gdma_cq;
 
                /* Initialize tx specific data */
-               txq->tx_buf_info = malloc(MAX_SEND_BUFFERS_PER_QUEUE *
+               txq->tx_buf_info = malloc(apc->tx_queue_size *
                    sizeof(struct mana_send_buf_info),
                    M_DEVBUF, M_WAITOK | M_ZERO);
 
@@ -2133,7 +2143,7 @@ mana_create_txq(struct mana_port_context *apc, if_t net)
                    "mana:tx(%d)", i);
                mtx_init(&txq->txq_mtx, txq->txq_mtx_name, NULL, MTX_DEF);
 
-               txq->txq_br = buf_ring_alloc(4 * MAX_SEND_BUFFERS_PER_QUEUE,
+               txq->txq_br = buf_ring_alloc(4 * apc->tx_queue_size,
                    M_DEVBUF, M_WAITOK, &txq->txq_mtx);
 
                /* Allocate taskqueue for deferred send */
@@ -2323,10 +2333,10 @@ mana_create_rxq(struct mana_port_context *apc, uint32_t 
rxq_idx,
        gc = gd->gdma_context;
 
        rxq = malloc(sizeof(*rxq) +
-           RX_BUFFERS_PER_QUEUE * sizeof(struct mana_recv_buf_oob),
+           apc->rx_queue_size * sizeof(struct mana_recv_buf_oob),
            M_DEVBUF, M_WAITOK | M_ZERO);
        rxq->ndev = ndev;
-       rxq->num_rx_buf = RX_BUFFERS_PER_QUEUE;
+       rxq->num_rx_buf = apc->rx_queue_size;
        rxq->rxq_idx = rxq_idx;
        /*
         * Minimum size is MCLBYTES(2048) bytes for a mbuf cluster.
@@ -2763,6 +2773,62 @@ mana_detach(if_t ndev)
        return err;
 }
 
+static unsigned int
+mana_get_tx_queue_size(int port_idx, unsigned int request_size)
+{
+       unsigned int new_size;
+
+       if (request_size == 0)
+               /* Uninitialized */
+               new_size = DEF_SEND_BUFFERS_PER_QUEUE;
+       else
+               new_size = roundup_pow_of_two(request_size);
+
+       if (new_size < MIN_SEND_BUFFERS_PER_QUEUE ||
+           new_size > MAX_SEND_BUFFERS_PER_QUEUE) {
+               mana_info(NULL, "mana port %d: requested tx buffer "
+                   "size %u out of allowable range (%u - %u), "
+                   "setting to default\n",
+                   port_idx, request_size,
+                   MIN_SEND_BUFFERS_PER_QUEUE,
+                   MAX_SEND_BUFFERS_PER_QUEUE);
+               new_size = DEF_SEND_BUFFERS_PER_QUEUE;
+       }
+       mana_info(NULL, "mana port %d: tx buffer size %u "
+           "(%u requested)\n",
+           port_idx, new_size, request_size);
+
+       return (new_size);
+}
+
+static unsigned int
+mana_get_rx_queue_size(int port_idx, unsigned int request_size)
+{
+       unsigned int new_size;
+
+       if (request_size == 0)
+               /* Uninitialized */
+               new_size = DEF_RX_BUFFERS_PER_QUEUE;
+       else
+               new_size = roundup_pow_of_two(request_size);
+
+       if (new_size < MIN_RX_BUFFERS_PER_QUEUE ||
+           new_size > MAX_RX_BUFFERS_PER_QUEUE) {
+               mana_info(NULL, "mana port %d: requested rx buffer "
+                   "size %u out of allowable range (%u - %u), "
+                   "setting to default\n",
+                   port_idx, request_size,
+                   MIN_RX_BUFFERS_PER_QUEUE,
+                   MAX_RX_BUFFERS_PER_QUEUE);
+               new_size = DEF_RX_BUFFERS_PER_QUEUE;
+       }
+       mana_info(NULL, "mana port %d: rx buffer size %u "
+           "(%u requested)\n",
+           port_idx, new_size, request_size);
+
+       return (new_size);
+}
+
 static int
 mana_probe_port(struct mana_context *ac, int port_idx,
     if_t *ndev_storage)
@@ -2782,6 +2848,10 @@ mana_probe_port(struct mana_context *ac, int port_idx,
        apc->max_queues = gc->max_num_queues;
        apc->num_queues = min_t(unsigned int,
            gc->max_num_queues, MANA_MAX_NUM_QUEUES);
+       apc->tx_queue_size = mana_get_tx_queue_size(port_idx,
+           mana_tx_req_size);
+       apc->rx_queue_size = mana_get_rx_queue_size(port_idx,
+           mana_rx_req_size);
        apc->port_handle = INVALID_MANA_HANDLE;
        apc->port_idx = port_idx;
        apc->frame_size = DEFAULT_FRAME_SIZE;
diff --git a/sys/dev/mana/mana_sysctl.c b/sys/dev/mana/mana_sysctl.c
index 844a05040595..acb3628f09bc 100644
--- a/sys/dev/mana/mana_sysctl.c
+++ b/sys/dev/mana/mana_sysctl.c
@@ -34,9 +34,17 @@ static int 
mana_sysctl_cleanup_thread_cpu(SYSCTL_HANDLER_ARGS);
 
 int mana_log_level = MANA_ALERT | MANA_WARNING | MANA_INFO;
 
+unsigned int mana_tx_req_size;
+unsigned int mana_rx_req_size;
+
 SYSCTL_NODE(_hw, OID_AUTO, mana, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
     "MANA driver parameters");
 
+SYSCTL_UINT(_hw_mana, OID_AUTO, tx_req_size, CTLFLAG_RWTUN,
+    &mana_tx_req_size, 0, "requested number of unit of tx queue");
+SYSCTL_UINT(_hw_mana, OID_AUTO, rx_req_size, CTLFLAG_RWTUN,
+    &mana_rx_req_size, 0, "requested number of unit of rx queue");
+
 /*
  * Logging level for changing verbosity of the output
  */
@@ -166,6 +174,14 @@ mana_sysctl_add_port(struct mana_port_context *apc)
            "enable_altq", CTLFLAG_RW, &apc->enable_tx_altq, 0,
            "Choose alternative txq under heavy load");
 
+       SYSCTL_ADD_UINT(ctx, apc->port_list, OID_AUTO,
+           "tx_queue_size", CTLFLAG_RD, &apc->tx_queue_size, 0,
+           "number of unit of tx queue");
+
+       SYSCTL_ADD_UINT(ctx, apc->port_list, OID_AUTO,
+           "rx_queue_size", CTLFLAG_RD, &apc->rx_queue_size, 0,
+           "number of unit of rx queue");
+
        SYSCTL_ADD_PROC(ctx, apc->port_list, OID_AUTO,
            "bind_cleanup_thread_cpu",
            CTLTYPE_U8 | CTLFLAG_RW | CTLFLAG_MPSAFE,

Reply via email to