Hi Jianbo, Thanks for version v2. Addition of the prefetch instructions is definitely helping performance on ThunderX. But still performance is slightly less than that of scalar.
I tried few small tweaks which helped improve performance on my Thunderx setup. For details see comments inline. On Wed, 2017-05-10 at 10:30 +0800, Jianbo Liu wrote: > Use ARM NEON intrinsics to accelerate l3 fowarding. > > Signed-off-by: Jianbo Liu <jianbo....@linaro.org> > --- > examples/l3fwd/l3fwd_em.c | 4 +- > examples/l3fwd/l3fwd_em_hlm.h | 19 ++- > examples/l3fwd/l3fwd_em_hlm_neon.h | 74 ++++++++++ > examples/l3fwd/l3fwd_em_sequential.h | 20 ++- > examples/l3fwd/l3fwd_lpm.c | 4 +- > examples/l3fwd/l3fwd_lpm_neon.h | 165 ++++++++++++++++++++++ > examples/l3fwd/l3fwd_neon.h | 259 > +++++++++++++++++++++++++++++++++++ > 7 files changed, 539 insertions(+), 6 deletions(-) > create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h > create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h > create mode 100644 examples/l3fwd/l3fwd_neon.h > > [...] > diff --git a/examples/l3fwd/l3fwd_em_hlm.h > b/examples/l3fwd/l3fwd_em_hlm.h > index 636dea4..4ec600a 100644 > --- a/examples/l3fwd/l3fwd_em_hlm.h > +++ b/examples/l3fwd/l3fwd_em_hlm.h > @@ -35,8 +35,13 @@ > #ifndef __L3FWD_EM_HLM_H__ > #define __L3FWD_EM_HLM_H__ > > +#if defined(__SSE4_1__) > #include "l3fwd_sse.h" > #include "l3fwd_em_hlm_sse.h" > +#elif defined(RTE_MACHINE_CPUFLAG_NEON) > +#include "l3fwd_neon.h" > +#include "l3fwd_em_hlm_neon.h" > +#endif > > static inline __attribute__((always_inline)) void > em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf > *m[8], > @@ -238,7 +243,7 @@ static inline __attribute__((always_inline)) > uint16_t > l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, > uint8_t portid, struct lcore_conf *qconf) > { > - int32_t j; > + int32_t i, j, pos; > uint16_t dst_port[MAX_PKT_BURST]; > > /* > @@ -247,6 +252,12 @@ static inline __attribute__((always_inline)) > uint16_t > */ > int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8); > > + for (j = 0; j < 8 && j < nb_rx; j++) { > + rte_prefetch0(pkts_burst[j]); The above prefetch of rte_mbuf struct is unnecessary. With this we wont see any performance improvement as the contents of rte_mbuf (buf_addr and data_off) is used in right next instruction. Removing the above prefetch and similar prefetches at multiple places was improving performance on my ThunderX setup. > + rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j], > + struct ether_hdr *) + > 1); Better to prefetch at eth_hdr itself and not at eth_hdr + 1. In process_packet in l3fwd_neon.h, eth_header is accessed. > + } > + > for (j = 0; j < n; j += 8) { > > uint32_t pkt_type = > @@ -263,6 +274,12 @@ static inline __attribute__((always_inline)) > uint16_t > uint32_t tcp_or_udp = pkt_type & > (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP); > > + for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++, > pos++) { > + rte_prefetch0(pkts_burst[pos]); The above prefetch of rte_mbuf struct is unnecessary. > + rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[po > s], > + struct > ether_hdr *) + 1); Better to prefetch at eth_hdr itself and not at eth_hdr + 1 > + } > + > if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) { > > em_get_dst_port_ipv4x8(qconf, > &pkts_burst[j], portid, > > [...] > diff --git a/examples/l3fwd/l3fwd_em_sequential.h > b/examples/l3fwd/l3fwd_em_sequential.h > index c0a9725..c3df473 100644 > --- a/examples/l3fwd/l3fwd_em_sequential.h > +++ b/examples/l3fwd/l3fwd_em_sequential.h > @@ -43,7 +43,11 @@ > * compilation time. > */ > > +#if defined(__SSE4_1__) > #include "l3fwd_sse.h" > +#elif defined(RTE_MACHINE_CPUFLAG_NEON) > +#include "l3fwd_neon.h" > +#endif > > static inline __attribute__((always_inline)) uint16_t > em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf > *pkt, > @@ -101,11 +105,23 @@ static inline __attribute__((always_inline)) > uint16_t > l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, > uint8_t portid, struct lcore_conf *qconf) > { > - int32_t j; > + int32_t i, j; > uint16_t dst_port[MAX_PKT_BURST]; > > - for (j = 0; j < nb_rx; j++) > + if (nb_rx > 0) { > + rte_prefetch0(pkts_burst[0]); The above prefetch of rte_mbuf struct is unnecessary. > + rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[0], > + struct ether_hdr *) + > 1); Better to prefetch at eth_hdr itself and not at eth_hdr + 1 > + } > + > + for (i = 1, j = 0; j < nb_rx; i++, j++) { > + if (i < nb_rx) { > + rte_prefetch0(pkts_burst[i]); The above prefetch of rte_mbuf struct is unnecessary. > + rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i] > , > + struct > ether_hdr *) + 1); Better to prefetch at eth_hdr itself and not at eth_hdr + 1 > + } > dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], > portid); > + } > > send_packets_multi(qconf, pkts_burst, dst_port, nb_rx); > } > [...] > diff --git a/examples/l3fwd/l3fwd_lpm_neon.h > b/examples/l3fwd/l3fwd_lpm_neon.h > new file mode 100644 > index 0000000..2f047b3 > --- /dev/null > +++ b/examples/l3fwd/l3fwd_lpm_neon.h > > [...] > +/* > + * Buffer optimized handling of packets, invoked > + * from main_loop. > + */ > +static inline void > +l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst, > + uint8_t portid, struct lcore_conf *qconf) > +{ > + int32_t i, j, pos; > + uint16_t dst_port[MAX_PKT_BURST]; > + int32x4_t dip[MAX_PKT_BURST / FWDSTEP]; If you see carefully, we dont need an array of dip. We just need a single element. dip value is calculated in processx4_step1 and consumed in processx4_step2, and thats it. No need to save it in an array. > + uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP]; Same as dip. We dont need an array of ipv4_flag. > + const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP); > + > + for (j = 0; j < FWDSTEP && j < nb_rx; j++) { > + rte_prefetch0(pkts_burst[j]); The above prefetch of rte_mbuf struct is unnecessary. > + rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j], > + struct ether_hdr *) + > 1); Better to prefetch at eth_hdr itself and not at eth_hdr + 1 > + } > + > + for (j = 0; j != k; j += FWDSTEP) { > + for (i = 0, pos = j + FWDSTEP; i < FWDSTEP && pos < > nb_rx; > + i++, pos++) { > + rte_prefetch0(pkts_burst[pos]); The above prefetch of rte_mbuf struct is unnecessary. > + rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[po > s], > + struct > ether_hdr *) + 1); Better to prefetch at eth_hdr itself and not at eth_hdr + 1 > + } > + processx4_step1(&pkts_burst[j], &dip[j / FWDSTEP], > + &ipv4_flag[j / FWDSTEP]); > + > + processx4_step2(qconf, dip[j / FWDSTEP], > + ipv4_flag[j / FWDSTEP], portid, > &pkts_burst[j], > + &dst_port[j]); > + } > + > + /* Classify last up to 3 packets one by one */ > + switch (nb_rx % FWDSTEP) { > + case 3: > + dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], > portid); > + j++; > + /* fallthrough */ > + case 2: > + dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], > portid); > + j++; > + /* fallthrough */ > + case 1: > + dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], > portid); > + j++; > + } > + > + send_packets_multi(qconf, pkts_burst, dst_port, nb_rx); > +} > + > +#endif /* __L3FWD_LPM_NEON_H__ */ > diff --git a/examples/l3fwd/l3fwd_neon.h > b/examples/l3fwd/l3fwd_neon.h > new file mode 100644 > index 0000000..75c8976 > --- /dev/null > +++ b/examples/l3fwd/l3fwd_neon.h > [...] > + > +/** > + * Process one packet: > + * Update source and destination MAC addresses in the ethernet > header. > + * Perform RFC1812 checks and updates for IPV4 packets. > + */ > +static inline void > +process_packet(struct rte_mbuf *pkt, uint16_t *dst_port) > +{ > + struct ether_hdr *eth_hdr; > + uint32x4_t te, ve; > + > + eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *); eth_hdr accessed here. Hence the earlier comments about prefetching at eth header. > + > + te = vld1q_u32((uint32_t *)eth_hdr); > + ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]); > + > + > + rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port, > + pkt->packet_type); > + > + ve = vsetq_lane_u32(vgetq_lane_u32(te, 3), ve, 3); Use vcopyq_laneq_u32 for easily doing the above. > + vst1q_u32((uint32_t *)eth_hdr, ve); > +} > + > [...] > +#endif /* _L3FWD_NEON_H_ */ Combining all the above comments, I made some changes on top of your patch. These changes are giving 3-4% improvement over your version. You may find the changes at https://gist.github.com/ashwinyes/34cbdd999784402c859c71613587fafc Please check it out and let me know your comments. Thanks Ashwin