Moved rfc1812 processing ahead of the NEON register stores.
On N1SDP, this reordering mitigates CPU frontend and backend stalls
when forwarding.

On N1SDP with MLX5 40G NIC, this change showed 10.2% performance gain
in single port single core MRR test.
On ThunderX2, this change showed no performance degradation.

Signed-off-by: Ruifeng Wang <ruifeng.w...@arm.com>
---
 examples/l3fwd/l3fwd_neon.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
index 86ac5971d7..ea7fe22d00 100644
--- a/examples/l3fwd/l3fwd_neon.h
+++ b/examples/l3fwd/l3fwd_neon.h
@@ -43,11 +43,6 @@ processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
        ve[2] = vsetq_lane_u32(vgetq_lane_u32(te[2], 3), ve[2], 3);
        ve[3] = vsetq_lane_u32(vgetq_lane_u32(te[3], 3), ve[3], 3);
 
-       vst1q_u32(p[0], ve[0]);
-       vst1q_u32(p[1], ve[1]);
-       vst1q_u32(p[2], ve[2]);
-       vst1q_u32(p[3], ve[3]);
-
        rfc1812_process((struct rte_ipv4_hdr *)
                        ((struct rte_ether_hdr *)p[0] + 1),
                        &dst_port[0], pkt[0]->packet_type);
@@ -60,6 +55,11 @@ processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
        rfc1812_process((struct rte_ipv4_hdr *)
                        ((struct rte_ether_hdr *)p[3] + 1),
                        &dst_port[3], pkt[3]->packet_type);
+
+       vst1q_u32(p[0], ve[0]);
+       vst1q_u32(p[1], ve[1]);
+       vst1q_u32(p[2], ve[2]);
+       vst1q_u32(p[3], ve[3]);
 }
 
 /*
-- 
2.25.1

Reply via email to