This patch shows how it is possible to have both the driver-local page
cache, which uses elevated refcnt for "catching"/avoiding SKB
put_page(), and, at the same time, have pages returned to the
page_pool from the ndo_xdp_xmit DMA-TX completion path.
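To make the two recycle paths concrete, here is a minimal sketch of the
release policy (illustrative only; example_release() and the napi_ctx
flag are invented for this sketch, while the two page_pool calls are
the real API used/described in this patch):

	#include <net/page_pool.h>

	/* Sketch: the two ways a page gets back to the pool.
	 * - RX softirq path (napi_ctx == true): lockless, same-CPU
	 *   recycle directly into the pool cache.
	 * - Remote path (e.g. ixgbe ndo_xdp_xmit DMA-TX completion,
	 *   reached via xdp_return_frame()): page_pool_put_page(),
	 *   which crosses CPUs through the pool's internal ptr_ring.
	 */
	static void example_release(struct page_pool *pool,
				    struct page *page, bool napi_ctx)
	{
		if (napi_ctx)
			page_pool_recycle_direct(pool, page);
		else
			page_pool_put_page(pool, page);
	}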
Performance is surprisingly good. Tested DMA-TX completion on ixgbe,
which calls "xdp_return_frame", which in turn calls
page_pool_put_page(). Stats show DMA-TX-completion runs on CPU#9 and
mlx5 RX runs on CPU#5. (Internally page_pool uses ptr_ring, which is
what gives the good cross-CPU performance.)

Show adapter(s) (ixgbe2 mlx5p2) statistics (ONLY that changed!)
Ethtool(ixgbe2 ) stat:  732863573 (  732,863,573) <= tx_bytes /sec
Ethtool(ixgbe2 ) stat:  781724427 (  781,724,427) <= tx_bytes_nic /sec
Ethtool(ixgbe2 ) stat:   12214393 (   12,214,393) <= tx_packets /sec
Ethtool(ixgbe2 ) stat:   12214435 (   12,214,435) <= tx_pkts_nic /sec
Ethtool(mlx5p2 ) stat:   12211786 (   12,211,786) <= rx3_cache_empty /sec
Ethtool(mlx5p2 ) stat:   36506736 (   36,506,736) <= rx_64_bytes_phy /sec
Ethtool(mlx5p2 ) stat: 2336430575 (2,336,430,575) <= rx_bytes_phy /sec
Ethtool(mlx5p2 ) stat:   12211786 (   12,211,786) <= rx_cache_empty /sec
Ethtool(mlx5p2 ) stat:   22823073 (   22,823,073) <= rx_discards_phy /sec
Ethtool(mlx5p2 ) stat:    1471860 (    1,471,860) <= rx_out_of_buffer /sec
Ethtool(mlx5p2 ) stat:   36506715 (   36,506,715) <= rx_packets_phy /sec
Ethtool(mlx5p2 ) stat: 2336542282 (2,336,542,282) <= rx_prio0_bytes /sec
Ethtool(mlx5p2 ) stat:   13683921 (   13,683,921) <= rx_prio0_packets /sec
Ethtool(mlx5p2 ) stat:  821015537 (  821,015,537) <= rx_vport_unicast_bytes /sec
Ethtool(mlx5p2 ) stat:   13683608 (   13,683,608) <= rx_vport_unicast_packets /sec

Before this patch: single flow performance was 6Mpps, and if I started
two flows the collective performance dropped to 4Mpps, because we hit
the page allocator lock (further negative scaling occurs with more
flows).

V2: Adjustments requested by Tariq
 - Changed page_pool_create() to return ERR_PTR codes instead of NULL,
   as this simplifies err handling in drivers.
 - Save a branch in mlx5e_page_release
 - Correct page_pool size calc for MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ

Signed-off-by: Jesper Dangaard Brouer <bro...@redhat.com>
---
 drivers/net/ethernet/mellanox/mlx5/core/en.h      |  3 ++
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 41 +++++++++++++++++----
 drivers/net/ethernet/mellanox/mlx5/core/en_rx.c   | 16 ++++++--
 net/core/page_pool.c                              |  2 +
 4 files changed, 49 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 28cc26debeda..ab91166f7c5a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -53,6 +53,8 @@
 #include "mlx5_core.h"
 #include "en_stats.h"
 
+struct page_pool;
+
 #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
 
 #define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
@@ -535,6 +537,7 @@ struct mlx5e_rq {
 	/* XDP */
 	struct bpf_prog       *xdp_prog;
 	struct mlx5e_xdpsq     xdpsq;
+	struct page_pool      *page_pool;
 
 	/* control */
 	struct mlx5_wq_ctrl    wq_ctrl;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 2e4ca0f15b62..bf17e6d614d6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -35,6 +35,7 @@
 #include <linux/mlx5/fs.h>
 #include <net/vxlan.h>
 #include <linux/bpf.h>
+#include <net/page_pool.h>
 #include "eswitch.h"
 #include "en.h"
 #include "en_tc.h"
@@ -387,10 +388,11 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 			  struct mlx5e_rq_param *rqp,
 			  struct mlx5e_rq *rq)
 {
+	struct page_pool_params pp_params = { 0 };
 	struct mlx5_core_dev *mdev = c->mdev;
 	void *rqc = rqp->rqc;
 	void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
-	u32 byte_count;
+	u32 byte_count, pool_size;
 	int npages;
 	int wq_sz;
 	int err;
@@ -429,10 +431,13 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 	rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
 	rq->buff.headroom = params->rq_headroom;
 
+	pool_size = 1 << params->log_rq_size;
 	switch (rq->wq_type) {
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+		pool_size = pool_size * MLX5_MPWRQ_PAGES_PER_WQE;
+
 		rq->post_wqes = mlx5e_post_rx_mpwqes;
 		rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe;
@@ -506,13 +511,31 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		rq->mkey_be = c->mkey_be;
 	}
 
-	/* This must only be activate for order-0 pages */
-	if (rq->xdp_prog) {
-		err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
-						 MEM_TYPE_PAGE_ORDER0, NULL);
-		if (err)
-			goto err_rq_wq_destroy;
+	/* Create a page_pool and register it with rxq */
+	pp_params.size = PAGE_POOL_PARAMS_SIZE;
+	pp_params.order = rq->buff.page_order;
+	pp_params.dev = c->pdev;
+	pp_params.nid = cpu_to_node(c->cpu);
+	pp_params.dma_dir = rq->buff.map_dir;
+	pp_params.pool_size = pool_size;
+	pp_params.flags = 0; /* No-internal DMA mapping in page_pool */
+
+	/* page_pool can be used even when there is no rq->xdp_prog,
+	 * given page_pool does not handle DMA mapping there is no
+	 * required state to clear. And page_pool gracefully handle
+	 * elevated refcnt.
+	 */
+	rq->page_pool = page_pool_create(&pp_params);
+	if (IS_ERR(rq->page_pool)) {
+		kfree(rq->wqe.frag_info);
+		err = PTR_ERR(rq->page_pool);
+		rq->page_pool = NULL;
+		goto err_rq_wq_destroy;
 	}
+	err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
+					 MEM_TYPE_PAGE_POOL, rq->page_pool);
+	if (err)
+		goto err_rq_wq_destroy;
 
 	for (i = 0; i < wq_sz; i++) {
 		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
@@ -550,6 +573,8 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 	if (rq->xdp_prog)
 		bpf_prog_put(rq->xdp_prog);
 	xdp_rxq_info_unreg(&rq->xdp_rxq);
+	if (rq->page_pool)
+		page_pool_destroy_rcu(rq->page_pool);
 	mlx5_wq_destroy(&rq->wq_ctrl);
 
 	return err;
@@ -563,6 +588,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
 		bpf_prog_put(rq->xdp_prog);
 
 	xdp_rxq_info_unreg(&rq->xdp_rxq);
+	if (rq->page_pool)
+		page_pool_destroy_rcu(rq->page_pool);
 
 	switch (rq->wq_type) {
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
index 6dcc3e8fbd3e..2ac78b88fc3d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -37,6 +37,7 @@
 #include <linux/bpf_trace.h>
 #include <net/busy_poll.h>
 #include <net/ip6_checksum.h>
+#include <net/page_pool.h>
 #include "en.h"
 #include "en_tc.h"
 #include "eswitch.h"
@@ -221,7 +222,7 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
 	if (mlx5e_rx_cache_get(rq, dma_info))
 		return 0;
 
-	dma_info->page = dev_alloc_pages(rq->buff.page_order);
+	dma_info->page = page_pool_dev_alloc_pages(rq->page_pool);
 	if (unlikely(!dma_info->page))
 		return -ENOMEM;
 
@@ -246,11 +247,16 @@ static inline void mlx5e_page_dma_unmap(struct mlx5e_rq *rq,
 void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
 			bool recycle)
 {
-	if (likely(recycle) && mlx5e_rx_cache_put(rq, dma_info))
-		return;
+	if (likely(recycle)) {
+		if (mlx5e_rx_cache_put(rq, dma_info))
+			return;
 
-	mlx5e_page_dma_unmap(rq, dma_info);
-	put_page(dma_info->page);
+		mlx5e_page_dma_unmap(rq, dma_info);
+		page_pool_recycle_direct(rq->page_pool, dma_info->page);
+	} else {
+		mlx5e_page_dma_unmap(rq, dma_info);
+		put_page(dma_info->page);
+	}
 }
 
 static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq,
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 8bf84d2a6a60..9706ff78a4c9 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -78,7 +78,7 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
 
 	if (params->size < offsetof(struct page_pool_params, nid)) {
 		WARN(1, "Fix page_pool_params->size code\n");
-		return NULL;
+		return ERR_PTR(-EBADR);
 	}
 
 	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
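
Note on the V2 ERR_PTR change: with page_pool_create() no longer
returning NULL on failure, a caller's error handling reduces to the
IS_ERR()/PTR_ERR() pattern, e.g. (sketch mirroring the mlx5e code
above):

	struct page_pool *pool;

	pool = page_pool_create(&pp_params);
	if (IS_ERR(pool))
		return PTR_ERR(pool);	/* no separate NULL check needed */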