Hi Ray,

I have tried avoiding the hand-unrolled loops and made the following observations.
1. Although it decreases LOC, it also takes away readability. For example, avoiding the unrolled code below:

	priv[0].u64[0] = rte_node_mbuf_priv1(mbuf[0])->u;
	priv[0].u64[1] = rte_node_mbuf_priv1(mbuf[1])->u;
	priv[1].u64[0] = rte_node_mbuf_priv1(mbuf[2])->u;
	priv[1].u64[1] = rte_node_mbuf_priv1(mbuf[3])->u;

	/* Increment checksum by one. */
	priv[0].u32[1] += rte_cpu_to_be_16(0x0100);
	priv[0].u32[3] += rte_cpu_to_be_16(0x0100);
	priv[1].u32[1] += rte_cpu_to_be_16(0x0100);
	priv[1].u32[3] += rte_cpu_to_be_16(0x0100);

	d = rte_pktmbuf_mtod(mbuf[0], void *);
	rte_memcpy(d, nh[priv[0].u16[0]].rewrite_data,
		   nh[priv[0].u16[0]].rewrite_len);
	next[0] = nh[priv[0].u16[0]].tx_node;
	ip = (struct rte_ipv4_hdr *)((uint8_t *)d +
				     sizeof(struct rte_ether_hdr));
	ip->time_to_live = priv[0].u16[1] - 1;
	ip->hdr_checksum = priv[0].u16[2] + priv[0].u16[3];

	d = rte_pktmbuf_mtod(mbuf[1], void *);
	rte_memcpy(d, nh[priv[0].u16[4]].rewrite_data,
		   nh[priv[0].u16[4]].rewrite_len);
	next[1] = nh[priv[0].u16[4]].tx_node;
	ip = (struct rte_ipv4_hdr *)((uint8_t *)d +
				     sizeof(struct rte_ether_hdr));
	ip->time_to_live = priv[0].u16[5] - 1;
	ip->hdr_checksum = priv[0].u16[6] + priv[0].u16[7];

	d = rte_pktmbuf_mtod(mbuf[2], void *);
	rte_memcpy(d, nh[priv[1].u16[0]].rewrite_data,
		   nh[priv[1].u16[0]].rewrite_len);
	next[2] = nh[priv[1].u16[0]].tx_node;
	ip = (struct rte_ipv4_hdr *)((uint8_t *)d +
				     sizeof(struct rte_ether_hdr));
	ip->time_to_live = priv[1].u16[1] - 1;
	ip->hdr_checksum = priv[1].u16[2] + priv[1].u16[3];

	d = rte_pktmbuf_mtod(mbuf[3], void *);
	rte_memcpy(d, nh[priv[1].u16[4]].rewrite_data,
		   nh[priv[1].u16[4]].rewrite_len);
	next[3] = nh[priv[1].u16[4]].tx_node;
	ip = (struct rte_ipv4_hdr *)((uint8_t *)d +
				     sizeof(struct rte_ether_hdr));
	ip->time_to_live = priv[1].u16[5] - 1;
	ip->hdr_checksum = priv[1].u16[6] + priv[1].u16[7];

leads to something like:

	for (i = 0, j = 0; i < BUF_PER_LOOP; i += 2, j++) {
		priv[j].u64[0] = rte_node_mbuf_priv1(mbuf[i])->u;
		priv[j].u32[1] += rte_cpu_to_be_16(0x0100);
		d = rte_pktmbuf_mtod(mbuf[i], void *);
		ip = (struct rte_ipv4_hdr *)((uint8_t *)d +
					     sizeof(struct rte_ether_hdr));
		ip->time_to_live = priv[j].u16[1] - 1;
		ip->hdr_checksum = priv[j].u16[2] + priv[j].u16[3];
		rte_memcpy(d, nh[priv[j].u16[0]].rewrite_data,
			   nh[priv[j].u16[0]].rewrite_len);
		next[i] = nh[priv[j].u16[0]].tx_node;

		priv[j].u64[1] = rte_node_mbuf_priv1(mbuf[i + 1])->u;
		priv[j].u32[3] += rte_cpu_to_be_16(0x0100);
		d = rte_pktmbuf_mtod(mbuf[i + 1], void *);
		ip = (struct rte_ipv4_hdr *)((uint8_t *)d +
					     sizeof(struct rte_ether_hdr));
		ip->time_to_live = priv[j].u16[5] - 1;
		ip->hdr_checksum = priv[j].u16[6] + priv[j].u16[7];
		rte_memcpy(d, nh[priv[j].u16[4]].rewrite_data,
			   nh[priv[j].u16[4]].rewrite_len);
		next[i + 1] = nh[priv[j].u16[4]].tx_node;
	}

which is kind of unreadable.

2. Not all compilers are made equal. I found that most compilers don't unroll the loop above even when compiled with `-funroll-all-loops`. I have checked with the following compilers:

	GCC 9.2.0
	Clang 9.0.1
	Aarch64 GCC 7.3.0
	Aarch64 GCC 9.2.0
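For reference, the closest alternative to `-funroll-all-loops` that I am aware of is a per-loop unroll pragma, sketched below. This is only illustrative, not something benchmarked here: NODE_LOOP_UNROLL is a made-up name (not an existing DPDK macro), and `#pragma GCC unroll` only exists from GCC 8 onwards, so it would not cover the Aarch64 GCC 7.3.0 build anyway:

	/* Sketch only: per-compiler unroll hints. NODE_LOOP_UNROLL is an
	 * illustrative name used here, not an existing DPDK macro.
	 */
	#if defined(__clang__)
	#define NODE_LOOP_UNROLL _Pragma("clang loop unroll_count(2)")
	#elif defined(__GNUC__) && __GNUC__ >= 8
	#define NODE_LOOP_UNROLL _Pragma("GCC unroll 2")
	#else
	#define NODE_LOOP_UNROLL /* no per-loop unroll hint available */
	#endif

	NODE_LOOP_UNROLL
	for (i = 0, j = 0; i < BUF_PER_LOOP; i += 2, j++) {
		/* loop body from observation 1 above */
	}

Even with such hints the compiler is not obliged to emit the same code as the hand-unrolled version; the pragma is a request, not a guarantee.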
3. Performance-wise, I see a lot of degradation on our platform: at least 13%. On IA, with a Broadwell (Xeon E5-2690) and i40e, the performance remains the same w.r.t. Rx/Tx, since the hotspot is in the Tx path of the driver, which limits the per-core capability.

But the performance difference in the number of cycles per node can be seen below.

Hand unrolling:

+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
|Node                           |calls          |objs           |realloc_count  |objs/call      |objs/sec(10E6) |cycles/call|
+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
|ip4_lookup                     |7765918        |248509344      |1              |32.000         |27.725408      |779.0000   |
|ip4_rewrite                    |7765925        |248509568      |1              |32.000         |27.725408      |425.0000   |
|ethdev_tx-1                    |7765927        |204056223      |1              |26.000         |22.762720      |597.0000   |
|pkt_drop                       |1389170        |44453409       |1              |32.000         |4.962688       |298.0000   |
|ethdev_rx-0-0                  |63604111       |248509792      |2              |32.000         |27.725408      |982.0000   |
+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+

W/o unrolling:

+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
|Node                           |calls          |objs           |realloc_count  |objs/call      |objs/sec(10E6) |cycles/call|
+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+
|ip4_lookup                     |18864640       |603668448      |1              |32.000         |26.051328      |828.0000   |
|ip4_rewrite                    |18864646       |603668640      |1              |32.000         |26.051328      |534.0000   |
|ethdev_tx-1                    |18864648       |527874175      |1              |27.000         |22.780256      |633.0000   |
|pkt_drop                       |2368580        |75794529       |1              |32.000         |3.271072       |286.0000   |
|ethdev_rx-0-0                  |282058226      |603668864      |2              |32.000         |26.051328      |994.0000   |
+-------------------------------+---------------+---------------+---------------+---------------+---------------+-----------+

For example, ip4_rewrite goes from 534 down to 425 cycles/call (one call being a burst of 32 packets) with hand unrolling, i.e. roughly 20% fewer cycles per packet.

Considering the above findings, we would like to continue unrolling the loops by hand.

Regards,
Pavan.

>-----Original Message-----
>From: Ray Kinsella <m...@ashroe.eu>
>Sent: Friday, March 20, 2020 2:44 PM
>To: Pavan Nikhilesh Bhagavatula <pbhagavat...@marvell.com>; Jerin Jacob Kollanukkaran <jer...@marvell.com>; Nithin Kumar Dabilpuram <ndabilpu...@marvell.com>
>Cc: dev@dpdk.org; tho...@monjalon.net; david.march...@redhat.com; mattias.ronnb...@ericsson.com; Kiran Kumar Kokkilagadda <kirankum...@marvell.com>
>Subject: Re: [EXT] Re: [dpdk-dev] [PATCH v1 20/26] node: ipv4 lookup for x86
>
>On 19/03/2020 16:13, Pavan Nikhilesh Bhagavatula wrote:
>>
>>> -----Original Message-----
>>> From: Ray Kinsella <m...@ashroe.eu>
>>> Sent: Thursday, March 19, 2020 9:21 PM
>>> To: Pavan Nikhilesh Bhagavatula <pbhagavat...@marvell.com>; Jerin Jacob Kollanukkaran <jer...@marvell.com>; Nithin Kumar Dabilpuram <ndabilpu...@marvell.com>
>>> Cc: dev@dpdk.org; tho...@monjalon.net; david.march...@redhat.com; mattias.ronnb...@ericsson.com; Kiran Kumar Kokkilagadda <kirankum...@marvell.com>
>>> Subject: Re: [EXT] Re: [dpdk-dev] [PATCH v1 20/26] node: ipv4 lookup for x86
>>>
>>> On 19/03/2020 14:22, Pavan Nikhilesh Bhagavatula wrote:
>>>>> On 18/03/2020 21:35, jer...@marvell.com wrote:
>>>>>> From: Pavan Nikhilesh <pbhagavat...@marvell.com>
>>>>>>
>>>>>> Add IPv4 lookup process function for ip4_lookup
>>>>>> rte_node. This node performs LPM lookup using the x86_64
>>>>>> vector supported RTE_LPM API on every packet received
>>>>>> and forwards it to a next node that is identified by the
>>>>>> lookup result.
>>>>>>
>>>>>> Signed-off-by: Pavan Nikhilesh <pbhagavat...@marvell.com>
>>>>>> Signed-off-by: Nithin Dabilpuram <ndabilpu...@marvell.com>
>>>>>> Signed-off-by: Kiran Kumar K <kirankum...@marvell.com>
>>>>>> ---
>>>>>>  lib/librte_node/ip4_lookup.c | 245 +++++++++++++++++++++++++++++++++++
>>>>>>  1 file changed, 245 insertions(+)
>>>>>>
>>>>>> diff --git a/lib/librte_node/ip4_lookup.c b/lib/librte_node/ip4_lookup.c
>>>>>> index d7fcd1158..c003e9c91 100644
>>>>>> --- a/lib/librte_node/ip4_lookup.c
>>>>>> +++ b/lib/librte_node/ip4_lookup.c
>>>>>> @@ -264,6 +264,251 @@ ip4_lookup_node_process(struct rte_graph *graph, struct rte_node *node,
>>>>>>  	return nb_objs;
>>>>>>  }
>>>>>>
>>>>>> +#elif defined(RTE_ARCH_X86)
>>>>>> +
>>>>>> +/* X86 SSE */
>>>>>> +static uint16_t
>>>>>> +ip4_lookup_node_process(struct rte_graph *graph, struct rte_node *node,
>>>>>> +			void **objs, uint16_t nb_objs)
>>>>>> +{
>>>>>> +	struct rte_mbuf *mbuf0, *mbuf1, *mbuf2, *mbuf3, **pkts;
>>>>>> +	rte_edge_t next0, next1, next2, next3, next_index;
>>>>>> +	struct rte_ipv4_hdr *ipv4_hdr;
>>>>>> +	struct rte_ether_hdr *eth_hdr;
>>>>>> +	uint32_t ip0, ip1, ip2, ip3;
>>>>>> +	void **to_next, **from;
>>>>>> +	uint16_t last_spec = 0;
>>>>>> +	uint16_t n_left_from;
>>>>>> +	struct rte_lpm *lpm;
>>>>>> +	uint16_t held = 0;
>>>>>> +	uint32_t drop_nh;
>>>>>> +	rte_xmm_t dst;
>>>>>> +	__m128i dip; /* SSE register */
>>>>>> +	int rc, i;
>>>>>> +
>>>>>> +	/* Speculative next */
>>>>>> +	next_index = RTE_NODE_IP4_LOOKUP_NEXT_REWRITE;
>>>>>> +	/* Drop node */
>>>>>> +	drop_nh = ((uint32_t)RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP) << 16;
>>>>>> +
>>>>>> +	/* Get socket specific LPM from ctx */
>>>>>> +	lpm = *((struct rte_lpm **)node->ctx);
>>>>>> +
>>>>>> +	pkts = (struct rte_mbuf **)objs;
>>>>>> +	from = objs;
>>>>>> +	n_left_from = nb_objs;
>>>>>
>>>>> I doubt this initial prefetch of the first 4 packets has any benefit.
>>>>
>>>> Ack, will remove in v2 for x86.
>>>>
>>>>>
>>>>>> +	if (n_left_from >= 4) {
>>>>>> +		for (i = 0; i < 4; i++) {
>>>>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[i],
>>>>>> +						       struct rte_ether_hdr *) +
>>>>>> +				      1);
>>>>>> +		}
>>>>>> +	}
>>>>>> +
>>>>>> +	/* Get stream for the speculated next node */
>>>>>> +	to_next = rte_node_next_stream_get(graph, node, next_index, nb_objs);
>>>>>
>>>>> Suggest you don't reuse the hand-unrolling optimization from FD.io VPP.
>>>>> I have never found any performance benefit from them, and they
>>>>> make the code unnecessarily verbose.
>>>>>
>>>>
>>>> How would we take the benefit of rte_lpm_lookupx4 without unrolling the loop?
>>>> Also, in future, if we are using rte_rib and fib with a CPU supporting
>>>> wider SIMD, we might need to unroll them further (AVX256 and 512; currently
>>>> rte_lpm_lookup uses only 128 bit since it only uses the SSE extension).
>>>
>>> Let the compiler do it for you, but using a constant vector length.
>>> for (int i = 0; i < 4; ++i) { ... }
>>>
>>
>> Ok, I think I misunderstood the previous comment.
>> It was only for the prefetches in the loop, right?
>
>no, it was for all the needless repetition.
>hand-unrolling loops serve no purpose but to add verbosity.
>
>>
>>>>
>>>>>
>>>>>> +	while (n_left_from >= 4) {
>>>>>> +		/* Prefetch next-next mbufs */
>>>>>> +		if (likely(n_left_from >= 11)) {
>>>>>> +			rte_prefetch0(pkts[8]);
>>>>>> +			rte_prefetch0(pkts[9]);
>>>>>> +			rte_prefetch0(pkts[10]);
>>>>>> +			rte_prefetch0(pkts[11]);
>>>>>> +		}
>>>>>> +
>>>>>> +		/* Prefetch next mbuf data */
>>>>>> +		if (likely(n_left_from >= 7)) {
>>>>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[4],
>>>>>> +						       struct rte_ether_hdr *) + 1);
>>>>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[5],
>>>>>> +						       struct rte_ether_hdr *) + 1);
>>>>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[6],
>>>>>> +						       struct rte_ether_hdr *) + 1);
>>>>>> +			rte_prefetch0(rte_pktmbuf_mtod(pkts[7],
>>>>>> +						       struct rte_ether_hdr *) + 1);
>>>>>> +		}
>>>>>> +
>>>>>> +		mbuf0 = pkts[0];
>>>>>> +		mbuf1 = pkts[1];
>>>>>> +		mbuf2 = pkts[2];
>>>>>> +		mbuf3 = pkts[3];
>>>>>> +
>>>>>> +		pkts += 4;
>>>>>> +		n_left_from -= 4;
>>>>>> +
>>>>>> +		/* Extract DIP of mbuf0 */
>>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf0, struct rte_ether_hdr *);
>>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>>> +		ip0 = ipv4_hdr->dst_addr;
>>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>>> +		rte_node_mbuf_priv1(mbuf0)->cksum = ipv4_hdr->hdr_checksum;
>>>>>> +		rte_node_mbuf_priv1(mbuf0)->ttl = ipv4_hdr->time_to_live;
>>>>>> +
>>>>>> +		/* Extract DIP of mbuf1 */
>>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf1, struct rte_ether_hdr *);
>>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>>> +		ip1 = ipv4_hdr->dst_addr;
>>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>>> +		rte_node_mbuf_priv1(mbuf1)->cksum = ipv4_hdr->hdr_checksum;
>>>>>> +		rte_node_mbuf_priv1(mbuf1)->ttl = ipv4_hdr->time_to_live;
>>>>>> +
>>>>>> +		/* Extract DIP of mbuf2 */
>>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf2, struct rte_ether_hdr *);
>>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>>> +		ip2 = ipv4_hdr->dst_addr;
>>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>>> +		rte_node_mbuf_priv1(mbuf2)->cksum = ipv4_hdr->hdr_checksum;
>>>>>> +		rte_node_mbuf_priv1(mbuf2)->ttl = ipv4_hdr->time_to_live;
>>>>>> +
>>>>>> +		/* Extract DIP of mbuf3 */
>>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf3, struct rte_ether_hdr *);
>>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>>> +		ip3 = ipv4_hdr->dst_addr;
>>>>>> +
>>>>>> +		/* Prepare for lookup x4 */
>>>>>> +		dip = _mm_set_epi32(ip3, ip2, ip1, ip0);
>>>>>> +
>>>>>> +		/* Byte swap 4 IPV4 addresses. */
>>>>>> +		const __m128i bswap_mask = _mm_set_epi8(
>>>>>> +			12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
>>>>>> +		dip = _mm_shuffle_epi8(dip, bswap_mask);
>>>>>> +
>>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>>> +		rte_node_mbuf_priv1(mbuf3)->cksum = ipv4_hdr->hdr_checksum;
>>>>>> +		rte_node_mbuf_priv1(mbuf3)->ttl = ipv4_hdr->time_to_live;
>>>>>> +
>>>>>> +		/* Perform LPM lookup to get NH and next node */
>>>>>> +		rte_lpm_lookupx4(lpm, dip, dst.u32, drop_nh);
>>>>>> +
>>>>>> +		/* Extract next node id and NH */
>>>>>> +		rte_node_mbuf_priv1(mbuf0)->nh = dst.u32[0] & 0xFFFF;
>>>>>> +		next0 = (dst.u32[0] >> 16);
>>>>>> +
>>>>>> +		rte_node_mbuf_priv1(mbuf1)->nh = dst.u32[1] & 0xFFFF;
>>>>>> +		next1 = (dst.u32[1] >> 16);
>>>>>> +
>>>>>> +		rte_node_mbuf_priv1(mbuf2)->nh = dst.u32[2] & 0xFFFF;
>>>>>> +		next2 = (dst.u32[2] >> 16);
>>>>>> +
>>>>>> +		rte_node_mbuf_priv1(mbuf3)->nh = dst.u32[3] & 0xFFFF;
>>>>>> +		next3 = (dst.u32[3] >> 16);
>>>>>> +
>>>>>> +		/* Enqueue four to next node */
>>>>>> +		rte_edge_t fix_spec =
>>>>>> +			(next_index ^ next0) | (next_index ^ next1) |
>>>>>> +			(next_index ^ next2) | (next_index ^ next3);
>>>>>> +
>>>>>> +		if (unlikely(fix_spec)) {
>>>>>> +			/* Copy things successfully speculated till now */
>>>>>> +			rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
>>>>>> +			from += last_spec;
>>>>>> +			to_next += last_spec;
>>>>>> +			held += last_spec;
>>>>>> +			last_spec = 0;
>>>>>> +
>>>>>> +			/* Next0 */
>>>>>> +			if (next_index == next0) {
>>>>>> +				to_next[0] = from[0];
>>>>>> +				to_next++;
>>>>>> +				held++;
>>>>>> +			} else {
>>>>>> +				rte_node_enqueue_x1(graph, node, next0, from[0]);
>>>>>> +			}
>>>>>> +
>>>>>> +			/* Next1 */
>>>>>> +			if (next_index == next1) {
>>>>>> +				to_next[0] = from[1];
>>>>>> +				to_next++;
>>>>>> +				held++;
>>>>>> +			} else {
>>>>>> +				rte_node_enqueue_x1(graph, node, next1, from[1]);
>>>>>> +			}
>>>>>> +
>>>>>> +			/* Next2 */
>>>>>> +			if (next_index == next2) {
>>>>>> +				to_next[0] = from[2];
>>>>>> +				to_next++;
>>>>>> +				held++;
>>>>>> +			} else {
>>>>>> +				rte_node_enqueue_x1(graph, node, next2, from[2]);
>>>>>> +			}
>>>>>> +
>>>>>> +			/* Next3 */
>>>>>> +			if (next_index == next3) {
>>>>>> +				to_next[0] = from[3];
>>>>>> +				to_next++;
>>>>>> +				held++;
>>>>>> +			} else {
>>>>>> +				rte_node_enqueue_x1(graph, node, next3, from[3]);
>>>>>> +			}
>>>>>> +
>>>>>> +			from += 4;
>>>>>> +
>>>>>> +		} else {
>>>>>> +			last_spec += 4;
>>>>>> +		}
>>>>>> +	}
>>>>>> +
>>>>>> +	while (n_left_from > 0) {
>>>>>> +		uint32_t next_hop;
>>>>>> +
>>>>>> +		mbuf0 = pkts[0];
>>>>>> +
>>>>>> +		pkts += 1;
>>>>>> +		n_left_from -= 1;
>>>>>> +
>>>>>> +		/* Extract DIP of mbuf0 */
>>>>>> +		eth_hdr = rte_pktmbuf_mtod(mbuf0, struct rte_ether_hdr *);
>>>>>> +		ipv4_hdr = (struct rte_ipv4_hdr *)(eth_hdr + 1);
>>>>>> +		/* Extract cksum, ttl as ipv4 hdr is in cache */
>>>>>> +		rte_node_mbuf_priv1(mbuf0)->cksum = ipv4_hdr->hdr_checksum;
>>>>>> +		rte_node_mbuf_priv1(mbuf0)->ttl = ipv4_hdr->time_to_live;
>>>>>> +
>>>>>> +		rc = rte_lpm_lookup(lpm, rte_be_to_cpu_32(ipv4_hdr->dst_addr),
>>>>>> +				    &next_hop);
>>>>>> +		next_hop = (rc == 0) ? next_hop : drop_nh;
>>>>>> +
>>>>>> +		rte_node_mbuf_priv1(mbuf0)->nh = next_hop & 0xFFFF;
>>>>>> +		next0 = (next_hop >> 16);
>>>>>> +
>>>>>> +		if (unlikely(next_index ^ next0)) {
>>>>>> +			/* Copy things successfully speculated till now */
>>>>>> +			rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
>>>>>> +			from += last_spec;
>>>>>> +			to_next += last_spec;
>>>>>> +			held += last_spec;
>>>>>> +			last_spec = 0;
>>>>>> +
>>>>>> +			rte_node_enqueue_x1(graph, node, next0, from[0]);
>>>>>> +			from += 1;
>>>>>> +		} else {
>>>>>> +			last_spec += 1;
>>>>>> +		}
>>>>>> +	}
>>>>>> +
>>>>>> +	/* !!! Home run !!! */
>>>>>> +	if (likely(last_spec == nb_objs)) {
>>>>>> +		rte_node_next_stream_move(graph, node, next_index);
>>>>>> +		return nb_objs;
>>>>>> +	}
>>>>>> +
>>>>>> +	held += last_spec;
>>>>>> +	/* Copy things successfully speculated till now */
>>>>>> +	rte_memcpy(to_next, from, last_spec * sizeof(from[0]));
>>>>>> +	rte_node_next_stream_put(graph, node, next_index, held);
>>>>>> +
>>>>>> +	return nb_objs;
>>>>>> +}
>>>>>> +
>>>>>>  #else
>>>>>>
>>>>>>  static uint16_t
>>>>>>