[Intel-wired-lan] [PATCH RFC iwl-next 05/12] idpf: strictly assert cachelines of queue and queue vector structures

Alexander Lobakin Fri, 10 May 2024 08:27:43 -0700

Now that the queue and queue vector structures are separated and laid
out optimally, group the fields as read-mostly, read-write, and cold
cachelines and add size assertions to make sure new features won't push
something out of its place and provoke a perf regression.
Despite looking innocent, this gives up to a 2% perf bump on Rx.


Signed-off-by: Alexander Lobakin <aleksander.loba...@intel.com>
---
 drivers/net/ethernet/intel/idpf/idpf_txrx.h | 370 +++++++++++---------
 1 file changed, 205 insertions(+), 165 deletions(-)

diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h
index 428b82b4de80..0192d33744ff 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h
+++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h
@@ -6,6 +6,7 @@
 
 #include <linux/dim.h>
 
+#include <net/libeth/cache.h>
 #include <net/page_pool/helpers.h>
 #include <net/tcp.h>
 #include <net/netdev_queues.h>
@@ -528,35 +529,43 @@ struct idpf_intr_reg {
  * @affinity_mask: CPU affinity mask
  */
 struct idpf_q_vector {
-       struct idpf_vport *vport;
-       struct napi_struct napi;
-       u16 v_idx;
-       struct idpf_intr_reg intr_reg;
-
-       u16 num_txq;
-       u16 num_complq;
-       struct idpf_tx_queue **tx;
-       struct idpf_compl_queue **complq;
-
-       struct dim tx_dim;
-       u16 tx_itr_value;
-       bool tx_intr_mode;
-       u32 tx_itr_idx;
-
-       u16 num_rxq;
-       struct idpf_rx_queue **rx;
-       struct dim rx_dim;
-       u16 rx_itr_value;
-       bool rx_intr_mode;
-       u32 rx_itr_idx;
-
-       u16 num_bufq;
-       struct idpf_buf_queue **bufq;
-
-       u16 total_events;
-
-       cpumask_var_t affinity_mask;
+       libeth_cacheline_group(read_mostly,
+               struct idpf_vport *vport;
+
+               u16 num_rxq;
+               u16 num_txq;
+               u16 num_bufq;
+               u16 num_complq;
+               struct idpf_rx_queue **rx;
+               struct idpf_tx_queue **tx;
+               struct idpf_buf_queue **bufq;
+               struct idpf_compl_queue **complq;
+
+               struct idpf_intr_reg intr_reg;
+       );
+       libeth_cacheline_group(read_write,
+               struct napi_struct napi;
+               u16 total_events;
+
+               struct dim tx_dim;
+               u16 tx_itr_value;
+               bool tx_intr_mode;
+               u32 tx_itr_idx;
+
+               struct dim rx_dim;
+               u16 rx_itr_value;
+               bool rx_intr_mode;
+               u32 rx_itr_idx;
+       );
+       libeth_cacheline_group(cold,
+               u16 v_idx;
+
+               cpumask_var_t affinity_mask;
+       );
 };
+libeth_cacheline_set_assert(struct idpf_q_vector, 104,
+                           424 + 2 * sizeof(struct dim),
+                           8 + sizeof(cpumask_var_t));
 
 struct idpf_rx_queue_stats {
        u64_stats_t packets;
@@ -641,52 +650,59 @@ struct idpf_txq_stash {
  * @rx_max_pkt_size: RX max packet size
  */
 struct idpf_rx_queue {
-       union {
-               union virtchnl2_rx_desc *rx;
-               struct virtchnl2_singleq_rx_buf_desc *single_buf;
+       libeth_cacheline_group(read_mostly,
+               union {
+                       union virtchnl2_rx_desc *rx;
+                       struct virtchnl2_singleq_rx_buf_desc *single_buf;
 
-               void *desc_ring;
-       };
-       union {
-               struct {
-                       struct idpf_bufq_set *bufq_sets;
-                       struct napi_struct *napi;
+                       void *desc_ring;
                };
-               struct {
-                       struct idpf_rx_buf *rx_buf;
-                       struct page_pool *pp;
+               union {
+                       struct {
+                               struct idpf_bufq_set *bufq_sets;
+                               struct napi_struct *napi;
+                       };
+                       struct {
+                               struct idpf_rx_buf *rx_buf;
+                               struct page_pool *pp;
+                       };
                };
-       };
-       struct net_device *netdev;
-       void __iomem *tail;
-
-       DECLARE_BITMAP(flags, __IDPF_Q_FLAGS_NBITS);
-       u16 idx;
-       u16 desc_count;
-       u16 next_to_use;
-       u16 next_to_clean;
-       u16 next_to_alloc;
-
-       u32 rxdids;
-
-       const struct idpf_rx_ptype_decoded *rx_ptype_lkup;
-       struct sk_buff *skb;
-
-       struct u64_stats_sync stats_sync;
-       struct idpf_rx_queue_stats q_stats;
-
-       /* Slowpath */
-       u32 q_id;
-       u32 size;
-       dma_addr_t dma;
-
-       struct idpf_q_vector *q_vector;
-
-       u16 rx_buffer_low_watermark;
-       u16 rx_hbuf_size;
-       u16 rx_buf_size;
-       u16 rx_max_pkt_size;
-} ____cacheline_aligned;
+               struct net_device *netdev;
+               void __iomem *tail;
+
+               DECLARE_BITMAP(flags, __IDPF_Q_FLAGS_NBITS);
+               u16 idx;
+               u16 desc_count;
+
+               u32 rxdids;
+               const struct idpf_rx_ptype_decoded *rx_ptype_lkup;
+       );
+       libeth_cacheline_group(read_write,
+               u16 next_to_use;
+               u16 next_to_clean;
+               u16 next_to_alloc;
+
+               struct sk_buff *skb;
+
+               struct u64_stats_sync stats_sync;
+               struct idpf_rx_queue_stats q_stats;
+       );
+       libeth_cacheline_group(cold,
+               u32 q_id;
+               u32 size;
+               dma_addr_t dma;
+
+               struct idpf_q_vector *q_vector;
+
+               u16 rx_buffer_low_watermark;
+               u16 rx_hbuf_size;
+               u16 rx_buf_size;
+               u16 rx_max_pkt_size;
+       );
+};
+libeth_cacheline_set_assert(struct idpf_rx_queue, 64,
+                           72 + sizeof(struct u64_stats_sync),
+                           32);
 
 /**
  * struct idpf_tx_queue - software structure represting a transmit queue
@@ -750,54 +766,60 @@ struct idpf_rx_queue {
  * @q_vector: Backreference to associated vector
  */
 struct idpf_tx_queue {
-       union {
-               struct idpf_base_tx_desc *base_tx;
-               struct idpf_base_tx_ctx_desc *base_ctx;
-               union idpf_tx_flex_desc *flex_tx;
-               struct idpf_flex_tx_ctx_desc *flex_ctx;
-
-               void *desc_ring;
-       };
-       struct idpf_tx_buf *tx_buf;
-       struct idpf_txq_group *txq_grp;
-       struct device *dev;
-       void __iomem *tail;
-
-       DECLARE_BITMAP(flags, __IDPF_Q_FLAGS_NBITS);
-       u16 idx;
-       u16 desc_count;
-       u16 next_to_use;
-       u16 next_to_clean;
-
-       struct net_device *netdev;
-
-       union {
-               u32 cleaned_bytes;
-               u32 clean_budget;
-       };
-       u16 cleaned_pkts;
-
-       u16 tx_max_bufs;
-       u16 tx_min_pkt_len;
-
-       u16 compl_tag_bufid_m;
-       u16 compl_tag_gen_s;
-
-       u16 compl_tag_cur_gen;
-       u16 compl_tag_gen_max;
+       libeth_cacheline_group(read_mostly,
+               union {
+                       struct idpf_base_tx_desc *base_tx;
+                       struct idpf_base_tx_ctx_desc *base_ctx;
+                       union idpf_tx_flex_desc *flex_tx;
+                       struct idpf_flex_tx_ctx_desc *flex_ctx;
+
+                       void *desc_ring;
+               };
+               struct idpf_tx_buf *tx_buf;
+               struct idpf_txq_group *txq_grp;
+               struct device *dev;
+               void __iomem *tail;
+
+               DECLARE_BITMAP(flags, __IDPF_Q_FLAGS_NBITS);
+               u16 idx;
+               u16 desc_count;
+
+               u16 tx_min_pkt_len;
+               u16 compl_tag_gen_s;
+
+               struct net_device *netdev;
+       );
+       libeth_cacheline_group(read_write,
+               u16 next_to_use;
+               u16 next_to_clean;
+
+               union {
+                       u32 cleaned_bytes;
+                       u32 clean_budget;
+               };
+               u16 cleaned_pkts;
 
-       struct idpf_txq_stash *stash;
+               u16 tx_max_bufs;
+               struct idpf_txq_stash *stash;
 
-       struct u64_stats_sync stats_sync;
-       struct idpf_tx_queue_stats q_stats;
+               u16 compl_tag_bufid_m;
+               u16 compl_tag_cur_gen;
+               u16 compl_tag_gen_max;
 
-       /* Slowpath */
-       u32 q_id;
-       u32 size;
-       dma_addr_t dma;
+               struct u64_stats_sync stats_sync;
+               struct idpf_tx_queue_stats q_stats;
+       );
+       libeth_cacheline_group(cold,
+               u32 q_id;
+               u32 size;
+               dma_addr_t dma;
 
-       struct idpf_q_vector *q_vector;
-} ____cacheline_aligned;
+               struct idpf_q_vector *q_vector;
+       );
+};
+libeth_cacheline_set_assert(struct idpf_tx_queue, 64,
+                           88 + sizeof(struct u64_stats_sync),
+                           24);
 
 /**
  * struct idpf_buf_queue - software structure represting a buffer queue
@@ -822,32 +844,37 @@ struct idpf_tx_queue {
  * @rx_buf_size: Buffer size
  */
 struct idpf_buf_queue {
-       struct virtchnl2_splitq_rx_buf_desc *split_buf;
-       struct {
-               struct idpf_rx_buf *buf;
-               dma_addr_t hdr_buf_pa;
-               void *hdr_buf_va;
-       } rx_buf;
-       struct page_pool *pp;
-       void __iomem *tail;
-
-       DECLARE_BITMAP(flags, __IDPF_Q_FLAGS_NBITS);
-       u16 desc_count;
-       u16 next_to_use;
-       u16 next_to_clean;
-       u16 next_to_alloc;
-
-       /* Slowpath */
-       u32 q_id;
-       u32 size;
-       dma_addr_t dma;
-
-       struct idpf_q_vector *q_vector;
-
-       u16 rx_buffer_low_watermark;
-       u16 rx_hbuf_size;
-       u16 rx_buf_size;
-} ____cacheline_aligned;
+       libeth_cacheline_group(read_mostly,
+               struct virtchnl2_splitq_rx_buf_desc *split_buf;
+               struct {
+                       struct idpf_rx_buf *buf;
+                       dma_addr_t hdr_buf_pa;
+                       void *hdr_buf_va;
+               } rx_buf;
+               struct page_pool *pp;
+               void __iomem *tail;
+
+               DECLARE_BITMAP(flags, __IDPF_Q_FLAGS_NBITS);
+               u32 desc_count;
+       );
+       libeth_cacheline_group(read_write,
+               u32 next_to_use;
+               u32 next_to_clean;
+               u32 next_to_alloc;
+       );
+       libeth_cacheline_group(cold,
+               u32 q_id;
+               u32 size;
+               dma_addr_t dma;
+
+               struct idpf_q_vector *q_vector;
+
+               u16 rx_buffer_low_watermark;
+               u16 rx_hbuf_size;
+               u16 rx_buf_size;
+       );
+};
+libeth_cacheline_set_assert(struct idpf_buf_queue, 60, 12, 32);
 
 /**
  * struct idpf_compl_queue - software structure represting a completion queue
@@ -870,25 +897,31 @@ struct idpf_buf_queue {
  * @q_vector: Backreference to associated vector
  */
 struct idpf_compl_queue {
-       struct idpf_splitq_tx_compl_desc *comp;
-       struct idpf_txq_group *txq_grp;
-
-       DECLARE_BITMAP(flags, __IDPF_Q_FLAGS_NBITS);
-       u16 desc_count;
-       u16 next_to_use;
-       u16 next_to_clean;
-
-       struct net_device *netdev;
-       u32 clean_budget;
-       u32 num_completions;
+       libeth_cacheline_group(read_mostly,
+               struct idpf_splitq_tx_compl_desc *comp;
+               struct idpf_txq_group *txq_grp;
 
-       /* Slowpath */
-       u32 q_id;
-       u32 size;
-       dma_addr_t dma;
+               DECLARE_BITMAP(flags, __IDPF_Q_FLAGS_NBITS);
+               u32 desc_count;
 
-       struct idpf_q_vector *q_vector;
-} ____cacheline_aligned;
+               u32 clean_budget;
+               struct net_device *netdev;
+       );
+       libeth_cacheline_group(read_write,
+               u32 next_to_use;
+               u32 next_to_clean;
+
+               u32 num_completions;
+       );
+       libeth_cacheline_group(cold,
+               u32 q_id;
+               u32 size;
+               dma_addr_t dma;
+
+               struct idpf_q_vector *q_vector;
+       );
+};
+libeth_cacheline_set_assert(struct idpf_compl_queue, 40, 12, 24);
 
 /**
  * struct idpf_sw_queue
@@ -903,13 +936,20 @@ struct idpf_compl_queue {
  * lockless buffer management system and are strictly software only constructs.
  */
 struct idpf_sw_queue {
-       u32 *ring;
-
-       DECLARE_BITMAP(flags, __IDPF_Q_FLAGS_NBITS);
-       u16 desc_count;
-       u16 next_to_use;
-       u16 next_to_clean;
-} ____cacheline_aligned;
+       libeth_cacheline_group(read_mostly,
+               u32 *ring;
+
+               DECLARE_BITMAP(flags, __IDPF_Q_FLAGS_NBITS);
+               u32 desc_count;
+       );
+       libeth_cacheline_group(read_write,
+               u32 next_to_use;
+               u32 next_to_clean;
+       );
+};
+libeth_cacheline_group_assert(struct idpf_sw_queue, read_mostly, 20);
+libeth_cacheline_group_assert(struct idpf_sw_queue, read_write, 8);
+libeth_cacheline_struct_assert(struct idpf_sw_queue, 20, 8);
 
 /**
  * struct idpf_rxq_set
-- 
2.45.0

[Intel-wired-lan] [PATCH RFC iwl-next 05/12] idpf: strictly assert cachelines of queue and queue vector structures

Reply via email to