When searching LKEY, if search key is mempool pointer, the 2nd cacheline has to be accessed and it even requires to check whether a buffer is indirect per every search. Instead, using address for search key can reduce cycles taken. And caching the last hit entry is beneficial as well.
Signed-off-by: Yongseok Koh <ys...@mellanox.com> --- drivers/net/mlx5/mlx5_mr.c | 2 ++ drivers/net/mlx5/mlx5_rxtx.c | 37 ++++++++++++++++++++----------------- drivers/net/mlx5/mlx5_rxtx.h | 3 +++ 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c index 0a3638460..f2484cb06 100644 --- a/drivers/net/mlx5/mlx5_mr.c +++ b/drivers/net/mlx5/mlx5_mr.c @@ -207,6 +207,8 @@ txq_mp2mr_reg(struct txq *txq, struct rte_mempool *mp, unsigned int idx) sizeof(txq_ctrl->txq.mp2mr[0]))); } /* Store the new entry. */ + txq_ctrl->txq.mp2mr[idx].start = (uintptr_t)mr->addr; + txq_ctrl->txq.mp2mr[idx].end = (uintptr_t)mr->addr + mr->length; txq_ctrl->txq.mp2mr[idx].mp = mp; txq_ctrl->txq.mp2mr[idx].mr = mr; txq_ctrl->txq.mp2mr[idx].lkey = htonl(mr->lkey); diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c index d81d630f7..5cc38ccb7 100644 --- a/drivers/net/mlx5/mlx5_rxtx.c +++ b/drivers/net/mlx5/mlx5_rxtx.c @@ -77,7 +77,7 @@ static __rte_always_inline void txq_complete(struct txq *txq); static __rte_always_inline uint32_t -txq_mp2mr(struct txq *txq, struct rte_mempool *mp); +txq_mb2mr(struct txq *txq, struct rte_mbuf *mb); static __rte_always_inline void mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe); @@ -355,7 +355,7 @@ txq_mb2mp(struct rte_mbuf *buf) } /** - * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[]. + * Get Memory Region (MR) <-> rte_mbuf association from txq->mp2mr[]. * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full, * remove an entry first. * @@ -368,27 +368,30 @@ txq_mb2mp(struct rte_mbuf *buf) * mr->lkey on success, (uint32_t)-1 on failure. */ static inline uint32_t -txq_mp2mr(struct txq *txq, struct rte_mempool *mp) +txq_mb2mr(struct txq *txq, struct rte_mbuf *mb) { - unsigned int i; - uint32_t lkey = (uint32_t)-1; + uint16_t i = txq->mr_cache_idx; + uintptr_t addr = rte_pktmbuf_mtod(mb, uintptr_t); + assert(i < RTE_DIM(txq->mp2mr)); + if (likely(txq->mp2mr[i].start <= addr && txq->mp2mr[i].end >= addr)) + return txq->mp2mr[i].lkey; for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) { if (unlikely(txq->mp2mr[i].mp == NULL)) { /* Unknown MP, add a new MR for it. */ break; } - if (txq->mp2mr[i].mp == mp) { + if (txq->mp2mr[i].start <= addr && + txq->mp2mr[i].end >= addr) { assert(txq->mp2mr[i].lkey != (uint32_t)-1); assert(htonl(txq->mp2mr[i].mr->lkey) == txq->mp2mr[i].lkey); - lkey = txq->mp2mr[i].lkey; - break; + txq->mr_cache_idx = i; + return txq->mp2mr[i].lkey; } } - if (unlikely(lkey == (uint32_t)-1)) - lkey = txq_mp2mr_reg(txq, mp, i); - return lkey; + txq->mr_cache_idx = 0; + return txq_mp2mr_reg(txq, txq_mb2mp(mb), i); } /** @@ -773,7 +776,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) naddr = htonll(addr); *dseg = (rte_v128u32_t){ htonl(length), - txq_mp2mr(txq, txq_mb2mp(buf)), + txq_mb2mr(txq, buf), naddr, naddr >> 32, }; @@ -812,7 +815,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) naddr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)); *dseg = (rte_v128u32_t){ htonl(length), - txq_mp2mr(txq, txq_mb2mp(buf)), + txq_mb2mr(txq, buf), naddr, naddr >> 32, }; @@ -1054,7 +1057,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) addr = rte_pktmbuf_mtod(buf, uintptr_t); *dseg = (struct mlx5_wqe_data_seg){ .byte_count = htonl(DATA_LEN(buf)), - .lkey = txq_mp2mr(txq, txq_mb2mp(buf)), + .lkey = txq_mb2mr(txq, buf), .addr = htonll(addr), }; #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) @@ -1300,7 +1303,7 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts, addr = rte_pktmbuf_mtod(buf, uintptr_t); *dseg = (struct mlx5_wqe_data_seg){ .byte_count = htonl(DATA_LEN(buf)), - .lkey = txq_mp2mr(txq, txq_mb2mp(buf)), + .lkey = txq_mb2mr(txq, buf), .addr = htonll(addr), }; #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) @@ -1607,7 +1610,7 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) addr = rte_pktmbuf_mtod(buf, uintptr_t); *dseg = (struct mlx5_wqe_data_seg){ .byte_count = htonl(DATA_LEN(buf)), - .lkey = txq_mp2mr(txq, txq_mb2mp(buf)), + .lkey = txq_mb2mr(txq, buf), .addr = htonll(addr), }; #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG) @@ -1690,7 +1693,7 @@ mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n) naddr = htonll(addr); *dseg = (rte_v128u32_t) { htonl(length), - txq_mp2mr(txq, txq_mb2mp(buf)), + txq_mb2mr(txq, buf), naddr, naddr >> 32, }; diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h index f9b738b4e..a86f41abc 100644 --- a/drivers/net/mlx5/mlx5_rxtx.h +++ b/drivers/net/mlx5/mlx5_rxtx.h @@ -267,10 +267,13 @@ struct txq { volatile uint32_t *cq_db; /* Completion queue doorbell. */ volatile void *bf_reg; /* Blueflame register. */ struct { + uintptr_t start; /* Start address of MR */ + uintptr_t end; /* End address of MR */ const struct rte_mempool *mp; /* Cached Memory Pool. */ struct ibv_mr *mr; /* Memory Region (for mp). */ uint32_t lkey; /* htonl(mr->lkey) */ } mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */ + uint16_t mr_cache_idx; /* Index of last hit entry. */ struct rte_mbuf *(*elts)[]; /* TX elements. */ struct mlx5_txq_stats stats; /* TX queue counters. */ } __rte_cache_aligned; -- 2.11.0