Hi Vladimir, >> diff --git a/lib/node/ip4_lookup_fib.c b/lib/node/ip4_lookup_fib.c >> index e87864e672..c535b191f8 100644 >> --- a/lib/node/ip4_lookup_fib.c >> +++ b/lib/node/ip4_lookup_fib.c >> @@ -40,6 +40,169 @@ static struct ip4_lookup_fib_node_main >ip4_lookup_fib_nm; >> #define IP4_LOOKUP_NODE_PRIV1_OFF(ctx) \ >> (((struct ip4_lookup_fib_node_ctx *)ctx)->mbuf_priv1_off) >> >> +static uint16_t >> +ip4_lookup_fib_node_process(struct rte_graph *graph, struct rte_node >*node, void **objs, >> + uint16_t nb_objs) >> +{ >> + struct rte_mbuf *mbuf0, *mbuf1, *mbuf2, *mbuf3, **pkts; >> + struct rte_fib *fib = IP4_LOOKUP_NODE_FIB(node->ctx); >> + const int dyn = IP4_LOOKUP_NODE_PRIV1_OFF(node->ctx); >> + struct rte_ipv4_hdr *ipv4_hdr; >> + uint64_t next_hop[nb_objs]; >> + uint16_t lookup_err = 0; >> + void **to_next, **from; >> + uint16_t last_spec = 0; >> + rte_edge_t next_index; >> + uint16_t n_left_from; >> + uint32_t ip[nb_objs]; >> + uint16_t held = 0; >> + uint32_t drop_nh; >> + uint16_t next; >> + int i, rc; >> + >> + /* Speculative next */ >> + next_index = RTE_NODE_IP4_LOOKUP_NEXT_REWRITE; >> + /* Drop node */ >> + drop_nh = ((uint32_t)RTE_NODE_IP4_LOOKUP_NEXT_PKT_DROP) << >16; >> + >> + pkts = (struct rte_mbuf **)objs; >> + from = objs; >> + n_left_from = nb_objs; >> + >> + /* Get stream for the speculated next node */ >> + to_next = rte_node_next_stream_get(graph, node, next_index, >> +nb_objs); >> + >> + for (i = OBJS_PER_CLINE; i < RTE_GRAPH_BURST_SIZE; i += >OBJS_PER_CLINE) >> + rte_prefetch0(&objs[i]); > >Does this prefetching loop make any sense? Unless objs are not passed across >threads this array likely to be in the cache already. > >And if objs are passed across threads, then why do you start prefetching from >the next cache line instead of the first, and why don't you stop at nb_objs? For example if cache size is 64 bytes. Then there will be 8 pointers to mbuf per cache line (if address size is 8 bytes). If nb_objs is 256, then the remaining pointers needs to be prefetched to cache. This loop helps to prefetch nb_objs pointers to cache. The loop can be changed to stop at nb_objs. for (i = OBJS_PER_CLINE; i < nb_objs; i += OBJS_PER_CLINE) { }
> >> + >> +#if RTE_GRAPH_BURST_SIZE > 64 >> + for (i = 0; i < 4 && i < n_left_from; i++) { >> + rte_prefetch0(pkts[i]); >> + rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[i], void *, >> + sizeof(struct rte_ether_hdr))); > >This construction does not make sense to me. Same as similar constructions >below > >The second prefetch has memory dependency of the data, that will be >prefetched by the first one. Does removing this prefetch affects performance? The first prefetches the data at mbuf address. The second prefetch is for the address containing ipv4 header. Both can be in separate cache lines, as ipv4 starts at mbuf->buf_addr + mbuf->data_off + sizeof(ethernet header). data_off is generally 64 bytes or 128 bytes depending on cache line size. > >> + } >> +#endif >> + >> + i = 0; >> + while (n_left_from >= 4) { >> +#if RTE_GRAPH_BURST_SIZE > 64 >> + if (likely(n_left_from > 7)) { >> + rte_prefetch0(pkts[4]); >> + rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[4], void >*, >> + sizeof(struct rte_ether_hdr))); >> + rte_prefetch0(pkts[5]); >> + rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[5], void >*, >> + sizeof(struct rte_ether_hdr))); >> + rte_prefetch0(pkts[6]); >> + rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[6], void >*, >> + sizeof(struct rte_ether_hdr))); >> + rte_prefetch0(pkts[7]); >> + rte_prefetch0(rte_pktmbuf_mtod_offset(pkts[7], void >*, >> + sizeof(struct rte_ether_hdr))); >> + } >> +#endif >> + >> + mbuf0 = pkts[0]; >> + mbuf1 = pkts[1]; >> + mbuf2 = pkts[2]; >> + mbuf3 = pkts[3]; >> + pkts += 4; >> + n_left_from -= 4; >> + /* Extract DIP of mbuf0 */ >> + ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf0, struct >rte_ipv4_hdr *, >> + sizeof(struct rte_ether_hdr)); >> + /* Extract cksum, ttl as ipv4 hdr is in cache */ >> + node_mbuf_priv1(mbuf0, dyn)->cksum = ipv4_hdr- >>hdr_checksum; >> + node_mbuf_priv1(mbuf0, dyn)->ttl = ipv4_hdr->time_to_live; >> + >> + ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr); >> + >> + /* Extract DIP of mbuf1 */ >> + ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf1, struct >rte_ipv4_hdr *, >> + sizeof(struct rte_ether_hdr)); >> + /* Extract cksum, ttl as ipv4 hdr is in cache */ >> + node_mbuf_priv1(mbuf1, dyn)->cksum = ipv4_hdr- >>hdr_checksum; >> + node_mbuf_priv1(mbuf1, dyn)->ttl = ipv4_hdr->time_to_live; >> + >> + ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr); >> + >> + /* Extract DIP of mbuf2 */ >> + ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf2, struct >rte_ipv4_hdr *, >> + sizeof(struct rte_ether_hdr)); >> + /* Extract cksum, ttl as ipv4 hdr is in cache */ >> + node_mbuf_priv1(mbuf2, dyn)->cksum = ipv4_hdr- >>hdr_checksum; >> + node_mbuf_priv1(mbuf2, dyn)->ttl = ipv4_hdr->time_to_live; >> + >> + ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr); >> + >> + /* Extract DIP of mbuf3 */ >> + ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf3, struct >rte_ipv4_hdr *, >> + sizeof(struct rte_ether_hdr)); >> + >> + /* Extract cksum, ttl as ipv4 hdr is in cache */ >> + node_mbuf_priv1(mbuf3, dyn)->cksum = ipv4_hdr- >>hdr_checksum; >> + node_mbuf_priv1(mbuf3, dyn)->ttl = ipv4_hdr->time_to_live; >> + >> + ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr); >> + } >> + while (n_left_from > 0) { >> + mbuf0 = pkts[0]; >> + pkts += 1; >> + n_left_from -= 1; >> + >> + /* Extract DIP of mbuf0 */ >> + ipv4_hdr = rte_pktmbuf_mtod_offset(mbuf0, struct >rte_ipv4_hdr *, >> + sizeof(struct rte_ether_hdr)); >> + /* Extract cksum, ttl as ipv4 hdr is in cache */ >> + node_mbuf_priv1(mbuf0, dyn)->cksum = ipv4_hdr- >>hdr_checksum; >> + node_mbuf_priv1(mbuf0, dyn)->ttl = ipv4_hdr->time_to_live; >> + >> + ip[i++] = rte_be_to_cpu_32(ipv4_hdr->dst_addr); >> + } >> + >> + rc = rte_fib_lookup_bulk(fib, ip, next_hop, nb_objs); >> + if (unlikely(rc != 0)) >> + return 0; >> + >> + for (i = 0; i < nb_objs; i++) { >> + if (unlikely(next_hop[i] == FIB_DEFAULT_NH)) { >> + next_hop[i] = drop_nh; >maybe it is worth just do define FIB_DEFAULT_NH as a drop_nh vaue to omit >these next_hop reassignments? Ack. >> + lookup_err += 1; >> + } >> + >> + mbuf0 = (struct rte_mbuf *)objs[i]; >> + node_mbuf_priv1(mbuf0, dyn)->nh = (uint16_t)next_hop[i]; >> + next = (uint16_t)(next_hop[i] >> 16); >> + >> + if (unlikely(next_index ^ next)) { >> + /* Copy things successfully speculated till now */ >> + rte_memcpy(to_next, from, last_spec * >sizeof(from[0])); >> + from += last_spec; >> + to_next += last_spec; >> + held += last_spec; >> + last_spec = 0; >> + >> + rte_node_enqueue_x1(graph, node, next, from[0]); >> + from += 1; >> + } else { >> + last_spec += 1; >> + } >> + } >> + >> + /* !!! Home run !!! */ >> + if (likely(last_spec == nb_objs)) { >> + rte_node_next_stream_move(graph, node, next_index); >> + return nb_objs; >> + } >> + >> + NODE_INCREMENT_XSTAT_ID(node, 0, lookup_err != 0, lookup_err); >> + held += last_spec; >> + rte_memcpy(to_next, from, last_spec * sizeof(from[0])); >> + rte_node_next_stream_put(graph, node, next_index, held); >> + >> + return nb_objs; >> +} >> + >> RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_node_ip4_fib_route_add, 25.07) >> int >> rte_node_ip4_fib_route_add(uint32_t ip, uint8_t depth, uint16_t >> next_hop, @@ -147,6 +310,7 @@ static struct rte_node_xstats >ip4_lookup_fib_xstats = { >> }; >> >> static struct rte_node_register ip4_lookup_fib_node = { >> + .process = ip4_lookup_fib_node_process, >> .name = "ip4_lookup_fib", >> >> .init = ip4_lookup_fib_node_init, > >-- >Regards, >Vladimir