Hi Jerin,
The patch is below; Jie and I modified it from the original test case.
1. If all consumers (slave lcores) start to work only after the producer
has filled the ring in each test iteration, there is no problem.
2. With only one consumer and one producer, there is no problem either.
3. The more consumers are involved, the higher the reproduction rate.
4. With the rmb, no issue was found in more than 100,000 test iterations
(a simplified sketch of the barrier placement follows this list).
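For reference, the barrier mentioned in point 4 is the rte_smp_rmb() that is
commented out in the test patch below. As a minimal sketch (simplified from
__rte_ring_move_cons_head(), multi-consumer path only, not the full DPDK
function), the placement we tested is:

#include <rte_ring.h>
#include <rte_atomic.h>

/* Sketch only: simplified multi-consumer head move. The point is the read
 * barrier between the cons.head load and the prod.tail load, so a consumer
 * never pairs a fresh cons.head with a stale prod.tail. */
static inline uint32_t
move_cons_head_sketch(struct rte_ring *r, uint32_t n,
                      uint32_t *old_head, uint32_t *new_head)
{
        const uint32_t max = n;
        uint32_t entries;

        do {
                n = max;
                *old_head = r->cons.head;
                rte_smp_rmb();  /* order the two loads on weakly ordered CPUs */
                entries = r->prod.tail - *old_head;     /* modulo 32-bit */
                if (n > entries)
                        n = entries;
                if (n == 0)
                        return 0;
                *new_head = *old_head + n;
        } while (rte_atomic32_cmpset(&r->cons.head, *old_head,
                                     *new_head) == 0);

        return n;
}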
./test/test/test -l 4-11 -n 6
test_refcnt_iter(lcore=4, iter=0) completed, 10 references processed
test_refcnt_iter(lcore=4, iter=1) completed, 10 references processed
test_refcnt_iter(lcore=4, iter=2) completed, 10 references processed
test_refcnt_iter(lcore=4, iter=3) completed, 10 references processed
test_refcnt_iter(lcore=4, iter=4) completed, 10 references processed
m ref is 2
test_refcnt_iter(lcore=4, iter=5) completed, 10 references processed
test_refcnt_iter(lcore=4, iter=6) completed, 10 references processed
test_refcnt_iter(lcore=4, iter=7) completed, 10 references processed
PANIC in __rte_ring_move_cons_head():
test_refcnt_iter(lcore=4, iter=8) completed, 10 references processed
test_refcnt_iter(lcore=4, iter=9) completed, 10 references processed
test_refcnt_iter(lcore=4, iter=10) completed, 10 references processed
test_refcnt_iter(lcore=4, iter=11) completed, 10 references processed
m ref is 2
test_refcnt_iter(lcore=4, iter=12) completed, 10 references processed
m ref is 2
test_refcnt_iter(lcore=4, iter=13) completed, 10 references processed
test_refcnt_iter(lcore=4, iter=14) completed, 10 references processed
test_refcnt_iter(lcore=4, iter=15) completed, 10 references processed
bizhao 4294967295 0x50 0x51 0x4f 0x50 0x50
m ref is 2
Note: in the panic output only the PT read is stale with respect to the
value the producer just wrote (0x4f != 0x50), while the CH read is up to
date, so CH > PT. The heads/tails have not wrapped past the ring size (512),
so the unsigned subtraction PT - CH underflows (entries = 4294967295). Then,
since CH itself is correct, CH + 1 is taken as the new head, which moves
past PT.
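To make the arithmetic concrete, a tiny standalone snippet (values copied
from the panic line above) shows how the stale PT read makes the unsigned
subtraction wrap:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Values copied from the rte_panic output above. */
        uint32_t cons_head = 0x50;  /* CH: up to date */
        uint32_t prod_tail = 0x4f;  /* PT: stale, producer already wrote 0x50 */

        /* Same modulo-32-bit subtraction as in __rte_ring_move_cons_head(). */
        uint32_t entries = prod_tail - cons_head;

        printf("entries = %u\n", entries);      /* prints 4294967295 */
        return 0;
}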
Test Patch:
--------------------------------
diff --git a/lib/librte_mbuf/rte_mbuf.c b/lib/librte_mbuf/rte_mbuf.c
old mode 100644
new mode 100755
index 26a62b8..ba0a6db
--- a/lib/librte_mbuf/rte_mbuf.c
+++ b/lib/librte_mbuf/rte_mbuf.c
@@ -243,8 +243,8 @@ rte_pktmbuf_dump(FILE *f, const struct rte_mbuf *m, unsigned dump_len)
__rte_mbuf_sanity_check(m, 1);
- fprintf(f, "dump mbuf at %p, phys=%"PRIx64", buf_len=%u\n",
- m, (uint64_t)m->buf_physaddr, (unsigned)m->buf_len);
+ fprintf(f, "dump mbuf at %p, phys=%"PRIx64", buf_len=%u, ref=%u\n",
+ m, (uint64_t)m->buf_physaddr, (unsigned)m->buf_len, (unsigned)m->refcnt);
fprintf(f, " pkt_len=%"PRIu32", ol_flags=%"PRIx64", nb_segs=%u, "
"in_port=%u\n", m->pkt_len, m->ol_flags,
(unsigned)m->nb_segs, (unsigned)m->port);
diff --git a/lib/librte_ring/rte_ring.h b/lib/librte_ring/rte_ring.h
old mode 100644
new mode 100755
index 8f5a493..3456b69
--- a/lib/librte_ring/rte_ring.h
+++ b/lib/librte_ring/rte_ring.h
@@ -502,6 +502,7 @@ __rte_ring_do_enqueue(struct rte_ring *r, void * const *obj_table,
* - Actual number of objects dequeued.
* If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only.
*/
+#include <rte_debug.h>
static __rte_always_inline unsigned int
__rte_ring_move_cons_head(struct rte_ring *r, int is_sc,
unsigned int n, enum rte_ring_queue_behavior behavior,
@@ -517,6 +518,7 @@ __rte_ring_move_cons_head(struct rte_ring *r, int is_sc,
n = max;
*old_head = r->cons.head;
+ //rte_smp_rmb();
const uint32_t prod_tail = r->prod.tail;
/* The subtraction is done between two unsigned 32bits
value
* (the result is always modulo 32 bits even if we have
@@ -532,6 +534,8 @@ __rte_ring_move_cons_head(struct rte_ring *r, int is_sc,
return 0;
*new_head = *old_head + n;
+ if (*new_head > prod_tail) //|| r->cons.head > r->prod.tail)
+ rte_panic("bizhao %u %#x %#x %#x %#x %#x\n",
*entries, *old_head, *new_head, prod_tail, r->cons.head, r->prod.tail);
if (is_sc)
r->cons.head = *new_head, success = 1;
else
diff --git a/test/test/test_mbuf.c b/test/test/test_mbuf.c
old mode 100644
new mode 100755
index 3396b4a..731535c
--- a/test/test/test_mbuf.c
+++ b/test/test/test_mbuf.c
@@ -71,10 +71,12 @@
/* size of private data for mbuf in pktmbuf_pool2 */
#define MBUF2_PRIV_SIZE 128
-#define REFCNT_MAX_ITER 64
+// #define REFCNT_MAX_ITER 64
+#define REFCNT_MAX_ITER 32
#define REFCNT_MAX_TIMEOUT 10
-#define REFCNT_MAX_REF (RTE_MAX_LCORE)
-#define REFCNT_MBUF_NUM 64
+// #define REFCNT_MAX_REF (RTE_MAX_LCORE)
+#define REFCNT_MAX_REF 512
+#define REFCNT_MBUF_NUM 1
#define REFCNT_RING_SIZE (REFCNT_MBUF_NUM * REFCNT_MAX_REF)
#define MAGIC_DATA 0x42424242
@@ -683,12 +685,18 @@ test_refcnt_slave(void *arg)
lcore = rte_lcore_id();
printf("%s started at lcore %u\n", __func__, lcore);
+ uint32_t ava;
free = 0;
while (refcnt_stop_slaves == 0) {
- if (rte_ring_dequeue(refcnt_mbuf_ring, &mp) == 0) {
ava = rte_ring_dequeue_bulk(refcnt_mbuf_ring, &mp, 1, NULL);
+ if (ava > 0) {
+ if (ava > 10) {
+ rte_panic("OMG %u %u\n",
refcnt_mbuf_ring->prod.tail, refcnt_mbuf_ring->cons.tail);
+ }
free++;
- rte_pktmbuf_free(mp);
+ // rte_pktmbuf_free(mp);
+ rte_pktmbuf_refcnt_update(mp, -1);
}
}
@@ -719,6 +727,7 @@ test_refcnt_iter(unsigned int lcore, unsigned int iter,
i != n && (m = rte_pktmbuf_alloc(refcnt_pool)) != NULL;
i++) {
ref = RTE_MAX(rte_rand() % REFCNT_MAX_REF, 1UL);
+ ref = 10;
tref += ref;
if ((ref & 1) != 0) {
rte_pktmbuf_refcnt_update(m, ref);
@@ -730,7 +739,7 @@ test_refcnt_iter(unsigned int lcore, unsigned int iter,
rte_ring_enqueue(refcnt_mbuf_ring, m);
}
}
- rte_pktmbuf_free(m);
+ // rte_pktmbuf_free(m);
}
if (i != n)
@@ -741,6 +750,12 @@ test_refcnt_iter(unsigned int lcore, unsigned int iter,
while (!rte_ring_empty(refcnt_mbuf_ring))
;
+ if (NULL != m) {
+ if (1 != rte_mbuf_refcnt_read(m))
+ printf("m ref is %u\n", rte_mbuf_refcnt_read(m));
+ rte_pktmbuf_free(m);
+ }
+
/* check that all mbufs are back into mempool by now */
for (wn = 0; wn != REFCNT_MAX_TIMEOUT; wn++) {
if ((i = rte_mempool_avail_count(refcnt_pool)) == n) {
--------------------------------
BR. Bing
On 2017/10/13 11:23, Jia He wrote:
Hi Jerin
On 10/13/2017 9:49 AM, Jerin Jacob Wrote:
-----Original Message-----
Date: Fri, 13 Oct 2017 09:16:31 +0800
From: Jia He <hejia...@gmail.com>
To: Jerin Jacob <jerin.ja...@caviumnetworks.com>, "Ananyev, Konstantin"
<konstantin.anan...@intel.com>
Cc: Olivier MATZ <olivier.m...@6wind.com>, "dev@dpdk.org"
<dev@dpdk.org>,
"jia...@hxt-semitech.com" <jia...@hxt-semitech.com>,
"jie2....@hxt-semitech.com" <jie2....@hxt-semitech.com>,
"bing.z...@hxt-semitech.com" <bing.z...@hxt-semitech.com>
Subject: Re: [PATCH] ring: guarantee ordering of cons/prod loading when
doing enqueue/dequeue
User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:52.0) Gecko/20100101
Thunderbird/52.3.0
Hi
On 10/13/2017 9:02 AM, Jia He Wrote:
Hi Jerin
On 10/13/2017 1:23 AM, Jerin Jacob Wrote:
-----Original Message-----
Date: Thu, 12 Oct 2017 17:05:50 +0000
[...]
On the same lines,
Jia He, jie2.liu, bing.zhao,
Is this patch based on code review, or did you see this issue on any
of the arm/ppc targets? arm64 will have a performance impact with this
change.
Sorry, I missed one important piece of information:
our platform is an aarch64 server with 46 CPUs.
Is this an OOO (out-of-order execution) aarch64 CPU implementation?
I think so; it is a server CPU (ARMv8-A), but do you know how to
confirm it?
cat /proc/cpuinfo
processor : 0
BogoMIPS : 40.00
Features : fp asimd evtstrm aes pmull sha1 sha2 crc32 cpuid
asimdrdm
CPU implementer : 0x51
CPU architecture: 8
CPU variant : 0x0
CPU part : 0x800
CPU revision : 0
If we reduce the number of CPUs involved, the bug occurs less
frequently.
Yes, the memory barrier impacts the performance, but correctness is more
important, isn't it ;-)
Yes.
Maybe we can find some other lightweight barrier here?
Yes. Regarding the lightweight barrier, arm64 has native support for
acquire and release semantics, which is exposed through gcc as
architecture-agnostic functions.
https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
http://preshing.com/20130922/acquire-and-release-fences/
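As a rough, untested sketch of that idea (assumed structure and function
names, not the actual rte_ring code), the consumer-side loads could be
expressed with those builtins like this:

#include <stdint.h>

/* Sketch only: assumed field names; the real rte_ring layout differs. */
struct headtail_sketch {
        volatile uint32_t head;
        volatile uint32_t tail;
};

static inline uint32_t
consumer_entries_sketch(const struct headtail_sketch *cons,
                        const struct headtail_sketch *prod)
{
        uint32_t cons_head = __atomic_load_n(&cons->head, __ATOMIC_RELAXED);

        /* Acquire fence: keeps the cons.head load above from being reordered
         * after the prod.tail load below (the LoadLoad ordering that
         * rte_smp_rmb() provides). */
        __atomic_thread_fence(__ATOMIC_ACQUIRE);

        /* Load-acquire pairs with a store-release of prod.tail on the
         * producer side. */
        uint32_t prod_tail = __atomic_load_n(&prod->tail, __ATOMIC_ACQUIRE);

        return prod_tail - cons_head;   /* modulo 32-bit, as in rte_ring */
}

Whether the fence plus load-acquire is cheaper than rte_smp_rmb() on this
CPU would still need to be measured.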
Good to know.
1) How much overhead does this patch add on your platform? Just relative
numbers are enough.
I created a *standalone* test case for test_mbuf;
the debug patch is attached.
It is hard to believe, but the truth is that the performance after
adding the rmb barrier is better than without it.
With this patch (4 runs):
time ./test_good --no-huge -l 1-20
real 0m23.311s
user 7m21.870s
sys 0m0.021s
Without this patch:
time ./test_bad --no-huge -l 1-20
real 0m38.972s
user 12m35.271s
sys 0m0.030s
Cheers,
Jia
2) As a prototype, does changing to acquire and release semantics
reduce the overhead on your platform?
Reference: a FreeBSD ring/DPDK style ring implementation using acquire
and release semantics:
https://github.com/Linaro/odp/blob/master/platform/linux-generic/pktio/ring.c
I will also spend some cycles on this.
Cheers,
Jia
Based on mbuf_autotest, the rte_panic is invoked within seconds.
PANIC in test_refcnt_iter():
(lcore=0, iter=0): after 10s only 61 of 64 mbufs left free
1: [./test(rte_dump_stack+0x38) [0x58d868]]
Aborted (core dumped)
Cheers,
Jia
Konstantin