In rte_free, the most time-consuming part of the whole process is the
memset. We can do the memset without holding heap->lock; the benefit
is reduced lock contention when multiple threads try to alloc or free.
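
As a sketch of the idea (ptr, data_len and heap->lock as in
malloc_heap_free(); the point is the ordering, not the exact code):

    /* before: the payload is zeroed while heap->lock is held,
     * so concurrent alloc/free callers stall behind the memset */
    rte_spinlock_lock(&heap->lock);
    memset(ptr, 0, data_len);
    /* ... element bookkeeping ... */
    rte_spinlock_unlock(&heap->lock);

    /* after: zero the payload first, hold the lock only for the
     * cheap bookkeeping */
    memset(ptr, 0, data_len);
    rte_spinlock_lock(&heap->lock);
    /* ... element bookkeeping ... */
    rte_spinlock_unlock(&heap->lock);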

In my test on a 40-core machine, I added some code to account for the
whole-function cost in test_align_overlap_per_lcore with different
alloc sizes, under legacy memory mode without existing hugepage files.
These are the results (w/ = with this patch, w/o = without; a sketch
of the measurement follows the table):

size(B) w/      w/o
64      119us   118us
128     124us   118us
1024    137us   127us
4096    137us   140us
8192    142us   158us
16384   138us   186us
65536   139us   375us
131072  133us   627us
524277  694us   2973us
1048576 2117us  7685us
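
The accounting was along these lines (a simplified sketch, not the
exact test code; N and sz stand in for the per-case iteration count
and allocation size):

    uint64_t start, total_us;
    unsigned int i;

    start = rte_rdtsc();
    for (i = 0; i < N; i++) {
            void *p = rte_malloc(NULL, sz, 0);
            rte_free(p);
    }
    total_us = (rte_rdtsc() - start) * 1000000 / rte_get_tsc_hz();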

Signed-off-by: Fengnan Chang <changfeng...@bytedance.com>
---
 lib/eal/common/malloc_elem.c | 16 ----------------
 lib/eal/common/malloc_heap.c | 26 ++++++++++++++++++++++++--
 2 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/lib/eal/common/malloc_elem.c b/lib/eal/common/malloc_elem.c
index 35a2313d04..763bbe179b 100644
--- a/lib/eal/common/malloc_elem.c
+++ b/lib/eal/common/malloc_elem.c
@@ -569,12 +569,6 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem)
 struct malloc_elem *
 malloc_elem_free(struct malloc_elem *elem)
 {
-       void *ptr;
-       size_t data_len;
-
-       ptr = RTE_PTR_ADD(elem, MALLOC_ELEM_HEADER_LEN);
-       data_len = elem->size - MALLOC_ELEM_OVERHEAD;
-
        /*
         * Consider the element clean for the purposes of joining.
         * If both neighbors are clean or non-existent,
@@ -591,16 +585,6 @@ malloc_elem_free(struct malloc_elem *elem)
 
        /* decrease heap's count of allocated elements */
        elem->heap->alloc_count--;
-
-#ifndef RTE_MALLOC_DEBUG
-       /* Normally clear the memory when needed. */
-       if (!elem->dirty)
-               memset(ptr, 0, data_len);
-#else
-       /* Always poison the memory in debug mode. */
-       memset(ptr, MALLOC_POISON, data_len);
-#endif
-
        return elem;
 }
 
diff --git a/lib/eal/common/malloc_heap.c b/lib/eal/common/malloc_heap.c
index d25bdc98f9..a5fdc4cc6f 100644
--- a/lib/eal/common/malloc_heap.c
+++ b/lib/eal/common/malloc_heap.c
@@ -862,6 +862,8 @@ malloc_heap_free(struct malloc_elem *elem)
        unsigned int i, n_segs, before_space, after_space;
        int ret;
        bool unmapped = false;
+       void *ptr;
+       size_t data_len;
        const struct internal_config *internal_conf =
                eal_get_internal_configuration();
 
@@ -875,16 +877,36 @@ malloc_heap_free(struct malloc_elem *elem)
        msl = elem->msl;
        page_sz = (size_t)msl->page_sz;
 
-       rte_spinlock_lock(&(heap->lock));
-
        void *asan_ptr = RTE_PTR_ADD(elem, MALLOC_ELEM_HEADER_LEN + elem->pad);
        size_t asan_data_len = elem->size - MALLOC_ELEM_OVERHEAD - elem->pad;
 
+       ptr = RTE_PTR_ADD(elem, MALLOC_ELEM_HEADER_LEN);
+       data_len = elem->size - MALLOC_ELEM_OVERHEAD;
+
+       /* If orig_elem is clean, any child elem should be clean, so do
+        * the memset before taking the lock.
+        */
+       if (internal_conf->legacy_mem && !elem->orig_elem->dirty)
+               memset(ptr, 0, data_len);
+
+       rte_spinlock_lock(&(heap->lock));
        /* mark element as free */
        elem->state = ELEM_FREE;
 
        elem = malloc_elem_free(elem);
 
+#ifndef RTE_MALLOC_DEBUG
+       if (internal_conf->legacy_mem) {
+               /* If orig_elem is dirty but the joined elem is clean, do the memset now */
+               if (elem->orig_elem->dirty && !elem->dirty)
+                       memset(ptr, 0, data_len);
+       } else if (!elem->dirty) {
+               memset(ptr, 0, data_len);
+       }
+#else
+       /* Always poison the memory in debug mode. */
+       memset(ptr, MALLOC_POISON, data_len);
+#endif
        /* anything after this is a bonus */
        ret = 0;
 
-- 
2.20.1
