This patch counts the ticks spent in the rx_burst and tx_burst routines
in dedicated counters and displays the gathered profiling statistics.

The feature is enabled only if CONFIG_RTE_TEST_PMD_RECORD_CORE_CYCLES
is configured as 'Y'. The "set fwdprof (flags)" command can be used
to select which counters should be involved.

Signed-off-by: Viacheslav Ovsiienko <viachesl...@mellanox.com>
---
 app/test-pmd/csumonly.c   | 21 +++++++++------------
 app/test-pmd/flowgen.c    | 21 +++++++++------------
 app/test-pmd/icmpecho.c   | 21 +++++++++------------
 app/test-pmd/iofwd.c      | 21 +++++++++------------
 app/test-pmd/macfwd.c     | 21 +++++++++------------
 app/test-pmd/macswap.c    | 21 +++++++++------------
 app/test-pmd/rxonly.c     | 14 ++++----------
 app/test-pmd/softnicfwd.c | 21 +++++++++------------
 app/test-pmd/testpmd.c    | 18 +++++++++++++++++-
 app/test-pmd/testpmd.h    | 34 ++++++++++++++++++++++++++++++++--
 app/test-pmd/txonly.c     | 20 ++++++++------------
 11 files changed, 124 insertions(+), 109 deletions(-)

diff --git a/app/test-pmd/csumonly.c b/app/test-pmd/csumonly.c
index 25091de..4104737 100644
--- a/app/test-pmd/csumonly.c
+++ b/app/test-pmd/csumonly.c
@@ -789,18 +789,15 @@ struct simple_gre_hdr {
        int ret;
 
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       uint64_t start_tsc;
-       uint64_t end_tsc;
-       uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       start_tsc = rte_rdtsc();
+       uint64_t start_rx_tsc = 0;
+       uint64_t start_tx_tsc = 0;
 #endif
 
        /* receive a burst of packet */
+       TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
        nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
                                 nb_pkt_per_burst);
+       TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
        if (unlikely(nb_rx == 0))
                return;
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
@@ -1067,8 +1064,10 @@ struct simple_gre_hdr {
                printf("Preparing packet burst to transmit failed: %s\n",
                                rte_strerror(rte_errno));
 
+       TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
        nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, tx_pkts_burst,
                        nb_prep);
+       TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 
        /*
         * Retry if necessary
@@ -1077,8 +1076,10 @@ struct simple_gre_hdr {
                retry = 0;
                while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
                        rte_delay_us(burst_tx_delay_time);
+                       TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
                        nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
                                        &tx_pkts_burst[nb_tx], nb_rx - nb_tx);
+                       TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
                }
        }
        fs->tx_packets += nb_tx;
@@ -1096,11 +1097,7 @@ struct simple_gre_hdr {
                } while (++nb_tx < nb_rx);
        }
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       end_tsc = rte_rdtsc();
-       core_cycles = (end_tsc - start_tsc);
-       fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+       TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine csum_fwd_engine = {
diff --git a/app/test-pmd/flowgen.c b/app/test-pmd/flowgen.c
index 4bd351e..51e87b0 100644
--- a/app/test-pmd/flowgen.c
+++ b/app/test-pmd/flowgen.c
@@ -98,19 +98,16 @@
        uint32_t retry;
        uint64_t tx_offloads;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       uint64_t start_tsc;
-       uint64_t end_tsc;
-       uint64_t core_cycles;
+       uint64_t start_rx_tsc = 0;
+       uint64_t start_tx_tsc = 0;
 #endif
        static int next_flow = 0;
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       start_tsc = rte_rdtsc();
-#endif
-
        /* Receive a burst of packets and discard them. */
+       TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
        nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
                                 nb_pkt_per_burst);
+       TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
        fs->rx_packets += nb_rx;
 
        for (i = 0; i < nb_rx; i++)
@@ -180,7 +177,9 @@
                next_flow = (next_flow + 1) % cfg_n_flows;
        }
 
+       TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
        nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
+       TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
        /*
         * Retry if necessary
         */
@@ -188,8 +187,10 @@
                retry = 0;
                while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
                        rte_delay_us(burst_tx_delay_time);
+                       TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
                        nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
                                        &pkts_burst[nb_tx], nb_rx - nb_tx);
+                       TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
                }
        }
        fs->tx_packets += nb_tx;
@@ -207,11 +208,7 @@
                        rte_pktmbuf_free(pkts_burst[nb_tx]);
                } while (++nb_tx < nb_pkt);
        }
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       end_tsc = rte_rdtsc();
-       core_cycles = (end_tsc - start_tsc);
-       fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+       TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine flow_gen_engine = {
diff --git a/app/test-pmd/icmpecho.c b/app/test-pmd/icmpecho.c
index 65aece16..8843183 100644
--- a/app/test-pmd/icmpecho.c
+++ b/app/test-pmd/icmpecho.c
@@ -294,20 +294,17 @@
        uint8_t  i;
        int l2_len;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       uint64_t start_tsc;
-       uint64_t end_tsc;
-       uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       start_tsc = rte_rdtsc();
+       uint64_t start_rx_tsc = 0;
+       uint64_t start_tx_tsc = 0;
 #endif
 
        /*
         * First, receive a burst of packets.
         */
+       TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
        nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
                                 nb_pkt_per_burst);
+       TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
        if (unlikely(nb_rx == 0))
                return;
 
@@ -492,8 +489,10 @@
 
        /* Send back ICMP echo replies, if any. */
        if (nb_replies > 0) {
+               TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
                nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst,
                                         nb_replies);
+               TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
                /*
                 * Retry if necessary
                 */
@@ -502,10 +501,12 @@
                        while (nb_tx < nb_replies &&
                                        retry++ < burst_tx_retry_num) {
                                rte_delay_us(burst_tx_delay_time);
+                               TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
                                nb_tx += rte_eth_tx_burst(fs->tx_port,
                                                fs->tx_queue,
                                                &pkts_burst[nb_tx],
                                                nb_replies - nb_tx);
+                               TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
                        }
                }
                fs->tx_packets += nb_tx;
@@ -520,11 +521,7 @@
                }
        }
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       end_tsc = rte_rdtsc();
-       core_cycles = (end_tsc - start_tsc);
-       fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+       TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine icmp_echo_engine = {
diff --git a/app/test-pmd/iofwd.c b/app/test-pmd/iofwd.c
index 9dce76e..9ff6531 100644
--- a/app/test-pmd/iofwd.c
+++ b/app/test-pmd/iofwd.c
@@ -52,20 +52,17 @@
        uint32_t retry;
 
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       uint64_t start_tsc;
-       uint64_t end_tsc;
-       uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       start_tsc = rte_rdtsc();
+       uint64_t start_rx_tsc = 0;
+       uint64_t start_tx_tsc = 0;
 #endif
 
        /*
         * Receive a burst of packets and forward them.
         */
+       TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
        nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
                        pkts_burst, nb_pkt_per_burst);
+       TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
        if (unlikely(nb_rx == 0))
                return;
        fs->rx_packets += nb_rx;
@@ -73,8 +70,10 @@
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
        fs->rx_burst_stats.pkt_burst_spread[nb_rx]++;
 #endif
+       TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
        nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
                        pkts_burst, nb_rx);
+       TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
        /*
         * Retry if necessary
         */
@@ -82,8 +81,10 @@
                retry = 0;
                while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
                        rte_delay_us(burst_tx_delay_time);
+                       TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
                        nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
                                        &pkts_burst[nb_tx], nb_rx - nb_tx);
+                       TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
                }
        }
        fs->tx_packets += nb_tx;
@@ -96,11 +97,7 @@
                        rte_pktmbuf_free(pkts_burst[nb_tx]);
                } while (++nb_tx < nb_rx);
        }
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       end_tsc = rte_rdtsc();
-       core_cycles = (end_tsc - start_tsc);
-       fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+       TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine io_fwd_engine = {
diff --git a/app/test-pmd/macfwd.c b/app/test-pmd/macfwd.c
index d2ebb11..f4a213e 100644
--- a/app/test-pmd/macfwd.c
+++ b/app/test-pmd/macfwd.c
@@ -57,20 +57,17 @@
        uint64_t ol_flags = 0;
        uint64_t tx_offloads;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       uint64_t start_tsc;
-       uint64_t end_tsc;
-       uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       start_tsc = rte_rdtsc();
+       uint64_t start_rx_tsc = 0;
+       uint64_t start_tx_tsc = 0;
 #endif
 
        /*
         * Receive a burst of packets and forward them.
         */
+       TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
        nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
                                 nb_pkt_per_burst);
+       TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
        if (unlikely(nb_rx == 0))
                return;
 
@@ -103,7 +100,9 @@
                mb->vlan_tci = txp->tx_vlan_id;
                mb->vlan_tci_outer = txp->tx_vlan_id_outer;
        }
+       TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
        nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
+       TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
        /*
         * Retry if necessary
         */
@@ -111,8 +110,10 @@
                retry = 0;
                while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
                        rte_delay_us(burst_tx_delay_time);
+                       TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
                        nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
                                        &pkts_burst[nb_tx], nb_rx - nb_tx);
+                       TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
                }
        }
 
@@ -126,11 +127,7 @@
                        rte_pktmbuf_free(pkts_burst[nb_tx]);
                } while (++nb_tx < nb_rx);
        }
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       end_tsc = rte_rdtsc();
-       core_cycles = (end_tsc - start_tsc);
-       fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+       TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine mac_fwd_engine = {
diff --git a/app/test-pmd/macswap.c b/app/test-pmd/macswap.c
index 8428c26..5cb3133 100644
--- a/app/test-pmd/macswap.c
+++ b/app/test-pmd/macswap.c
@@ -58,20 +58,17 @@
        uint16_t nb_tx;
        uint32_t retry;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       uint64_t start_tsc;
-       uint64_t end_tsc;
-       uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       start_tsc = rte_rdtsc();
+       uint64_t start_rx_tsc = 0;
+       uint64_t start_tx_tsc = 0;
 #endif
 
        /*
         * Receive a burst of packets and forward them.
         */
+       TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
        nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
                                 nb_pkt_per_burst);
+       TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
        if (unlikely(nb_rx == 0))
                return;
 
@@ -83,7 +80,9 @@
 
        do_macswap(pkts_burst, nb_rx, txp);
 
+       TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
        nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_rx);
+       TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
        /*
         * Retry if necessary
         */
@@ -91,8 +90,10 @@
                retry = 0;
                while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
                        rte_delay_us(burst_tx_delay_time);
+                       TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
                        nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
                                        &pkts_burst[nb_tx], nb_rx - nb_tx);
+                       TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
                }
        }
        fs->tx_packets += nb_tx;
@@ -105,11 +106,7 @@
                        rte_pktmbuf_free(pkts_burst[nb_tx]);
                } while (++nb_tx < nb_rx);
        }
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       end_tsc = rte_rdtsc();
-       core_cycles = (end_tsc - start_tsc);
-       fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+       TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine mac_swap_engine = {
diff --git a/app/test-pmd/rxonly.c b/app/test-pmd/rxonly.c
index 5c65fc4..2820d7f 100644
--- a/app/test-pmd/rxonly.c
+++ b/app/test-pmd/rxonly.c
@@ -51,18 +51,16 @@
        uint16_t i;
 
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       uint64_t start_tsc;
-       uint64_t end_tsc;
-       uint64_t core_cycles;
-
-       start_tsc = rte_rdtsc();
+       uint64_t start_rx_tsc = 0;
 #endif
 
        /*
         * Receive a burst of packets.
         */
+       TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
        nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue, pkts_burst,
                                 nb_pkt_per_burst);
+       TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
        if (unlikely(nb_rx == 0))
                return;
 
@@ -73,11 +71,7 @@
        for (i = 0; i < nb_rx; i++)
                rte_pktmbuf_free(pkts_burst[i]);
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       end_tsc = rte_rdtsc();
-       core_cycles = (end_tsc - start_tsc);
-       fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+       TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 struct fwd_engine rx_only_engine = {
diff --git a/app/test-pmd/softnicfwd.c b/app/test-pmd/softnicfwd.c
index e9d4373..b78f2ce 100644
--- a/app/test-pmd/softnicfwd.c
+++ b/app/test-pmd/softnicfwd.c
@@ -88,34 +88,35 @@ struct tm_hierarchy {
        uint32_t retry;
 
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       uint64_t start_tsc;
-       uint64_t end_tsc;
-       uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       start_tsc = rte_rdtsc();
+       uint64_t start_rx_tsc = 0;
+       uint64_t start_tx_tsc = 0;
 #endif
 
        /*  Packets Receive */
+       TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
        nb_rx = rte_eth_rx_burst(fs->rx_port, fs->rx_queue,
                        pkts_burst, nb_pkt_per_burst);
+       TEST_PMD_CORE_CYC_RX_ADD(fs, start_rx_tsc);
        fs->rx_packets += nb_rx;
 
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
        fs->rx_burst_stats.pkt_burst_spread[nb_rx]++;
 #endif
 
+       TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
        nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
                        pkts_burst, nb_rx);
+       TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
 
        /* Retry if necessary */
        if (unlikely(nb_tx < nb_rx) && fs->retry_enabled) {
                retry = 0;
                while (nb_tx < nb_rx && retry++ < burst_tx_retry_num) {
                        rte_delay_us(burst_tx_delay_time);
+                       TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
                        nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
                                        &pkts_burst[nb_tx], nb_rx - nb_tx);
+                       TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
                }
        }
        fs->tx_packets += nb_tx;
@@ -130,11 +131,7 @@ struct tm_hierarchy {
                        rte_pktmbuf_free(pkts_burst[nb_tx]);
                } while (++nb_tx < nb_rx);
        }
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       end_tsc = rte_rdtsc();
-       core_cycles = (end_tsc - start_tsc);
-       fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+       TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 static void
diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c
index c93fa35..b195880 100644
--- a/app/test-pmd/testpmd.c
+++ b/app/test-pmd/testpmd.c
@@ -1625,6 +1625,8 @@ struct extmem_param {
        struct rte_eth_stats stats;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
        uint64_t fwd_cycles = 0;
+       uint64_t rx_cycles = 0;
+       uint64_t tx_cycles = 0;
 #endif
        uint64_t total_recv = 0;
        uint64_t total_xmit = 0;
@@ -1655,6 +1657,8 @@ struct extmem_param {
 
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
                fwd_cycles += fs->core_cycles;
+               rx_cycles += fs->core_rx_cycles;
+               tx_cycles += fs->core_tx_cycles;
 #endif
        }
        for (i = 0; i < cur_fwd_config.nb_fwd_ports; i++) {
@@ -1785,11 +1789,21 @@ struct extmem_param {
               "%s\n",
               acc_stats_border, acc_stats_border);
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       if (total_recv > 0)
+       if (fwdprof_flags & RECORD_CORE_CYCLES_FWD && total_recv > 0)
                printf("\n  CPU cycles/packet=%u (total cycles="
                       "%"PRIu64" / total RX packets=%"PRIu64")\n",
                       (unsigned int)(fwd_cycles / total_recv),
                       fwd_cycles, total_recv);
+       if (fwdprof_flags & RECORD_CORE_CYCLES_RX && total_recv > 0)
+               printf("\n  rx CPU cycles/packet=%u (total cycles="
+                      "%"PRIu64" / total RX packets=%"PRIu64")\n",
+                      (unsigned int)(rx_cycles / total_recv),
+                      rx_cycles, total_recv);
+       if (fwdprof_flags & RECORD_CORE_CYCLES_TX && total_xmit > 0)
+               printf("\n  tx CPU cycles/packet=%u (total cycles="
+                      "%"PRIu64" / total TX packets=%"PRIu64")\n",
+                      (unsigned int)(tx_cycles / total_xmit),
+                      tx_cycles, total_xmit);
 #endif
 }
 
@@ -1820,6 +1834,8 @@ struct extmem_param {
 #endif
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
                fs->core_cycles = 0;
+               fs->core_rx_cycles = 0;
+               fs->core_tx_cycles = 0;
 #endif
        }
 }
diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h
index 466e611..6177a50 100644
--- a/app/test-pmd/testpmd.h
+++ b/app/test-pmd/testpmd.h
@@ -136,7 +136,9 @@ struct fwd_stream {
        /**< received packets has bad outer l4 checksum */
        unsigned int gro_times; /**< GRO operation times */
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       uint64_t     core_cycles; /**< used for RX and TX processing */
+       uint64_t core_cycles; /**< used for RX and TX processing */
+       uint64_t core_tx_cycles; /**< used for tx_burst processing */
+       uint64_t core_rx_cycles; /**< used for rx_burst processing */
 #endif
 #ifdef RTE_TEST_PMD_RECORD_BURST_STATS
        struct pkt_burst_stats rx_burst_stats;
@@ -325,7 +327,35 @@ struct queue_stats_mappings {
 #define RECORD_CORE_CYCLES_FWD (1<<0)
 #define RECORD_CORE_CYCLES_RX (1<<1)
 #define RECORD_CORE_CYCLES_TX (1<<2)
-#endif
+
+/* Macros to gather profiling statistics, gated by fwdprof_flags. */
+#define TEST_PMD_CORE_CYC_TX_START(a) \
+do { if (fwdprof_flags & RECORD_CORE_CYCLES_TX) (a) = rte_rdtsc(); } while (0)
+/* The RX start stamp is also taken when only FWD profiling is selected. */
+#define TEST_PMD_CORE_CYC_RX_START(a) \
+do { if (fwdprof_flags & (RECORD_CORE_CYCLES_FWD | RECORD_CORE_CYCLES_RX)) \
+       (a) = rte_rdtsc(); } while (0)
+/* The *_ADD macros add the ticks elapsed since (s) to the fs counter. */
+#define TEST_PMD_CORE_CYC_FWD_ADD(fs, s) \
+do { if (fwdprof_flags & RECORD_CORE_CYCLES_FWD) \
+       (fs)->core_cycles += rte_rdtsc() - (s); } while (0)
+
+#define TEST_PMD_CORE_CYC_TX_ADD(fs, s) \
+do { if (fwdprof_flags & RECORD_CORE_CYCLES_TX) \
+       (fs)->core_tx_cycles += rte_rdtsc() - (s); } while (0)
+
+#define TEST_PMD_CORE_CYC_RX_ADD(fs, s) \
+do { if (fwdprof_flags & RECORD_CORE_CYCLES_RX) \
+       (fs)->core_rx_cycles += rte_rdtsc() - (s); } while (0)
+
+#else
+/* No profiling statistics are configured: the macros expand to no-ops. */
+#define TEST_PMD_CORE_CYC_TX_START(a) do { } while (0)
+#define TEST_PMD_CORE_CYC_RX_START(a) do { } while (0)
+#define TEST_PMD_CORE_CYC_FWD_ADD(fs, s) do { } while (0)
+#define TEST_PMD_CORE_CYC_TX_ADD(fs, s) do { } while (0)
+#define TEST_PMD_CORE_CYC_RX_ADD(fs, s) do { } while (0)
+#endif /* RTE_TEST_PMD_RECORD_CORE_CYCLES */
 
 /* globals used for configuration */
 extern uint16_t verbose_level; /**< Drives messages being displayed, if any. */
diff --git a/app/test-pmd/txonly.c b/app/test-pmd/txonly.c
index 8a1989f..8ff7410 100644
--- a/app/test-pmd/txonly.c
+++ b/app/test-pmd/txonly.c
@@ -241,15 +241,11 @@
        uint64_t ol_flags = 0;
        uint64_t tx_offloads;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       uint64_t start_tsc;
-       uint64_t end_tsc;
-       uint64_t core_cycles;
-#endif
-
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       start_tsc = rte_rdtsc();
+       uint64_t start_rx_tsc = 0;
+       uint64_t start_tx_tsc = 0;
 #endif
 
+       TEST_PMD_CORE_CYC_RX_START(start_rx_tsc);
        mbp = current_fwd_lcore()->mbp;
        txp = &ports[fs->tx_port];
        tx_offloads = txp->dev_conf.txmode.offloads;
@@ -301,7 +297,9 @@
        if (nb_pkt == 0)
                return;
 
+       TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
        nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
+       TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
        /*
         * Retry if necessary
         */
@@ -309,8 +307,10 @@
                retry = 0;
                while (nb_tx < nb_pkt && retry++ < burst_tx_retry_num) {
                        rte_delay_us(burst_tx_delay_time);
+                       TEST_PMD_CORE_CYC_TX_START(start_tx_tsc);
                        nb_tx += rte_eth_tx_burst(fs->tx_port, fs->tx_queue,
                                        &pkts_burst[nb_tx], nb_pkt - nb_tx);
+                       TEST_PMD_CORE_CYC_TX_ADD(fs, start_tx_tsc);
                }
        }
        fs->tx_packets += nb_tx;
@@ -334,11 +334,7 @@
                } while (++nb_tx < nb_pkt);
        }
 
-#ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
-       end_tsc = rte_rdtsc();
-       core_cycles = (end_tsc - start_tsc);
-       fs->core_cycles = (uint64_t) (fs->core_cycles + core_cycles);
-#endif
+       TEST_PMD_CORE_CYC_FWD_ADD(fs, start_rx_tsc);
 }
 
 static void
-- 
1.8.3.1

Reply via email to