In rte_free(), the most time-consuming part of the whole process is the memset. We can do the memset without holding heap->lock; the benefit is reduced lock contention when multiple threads try to allocate or free concurrently.
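The idea, as a minimal hedged sketch (not DPDK code; struct region, free_lock and region_free() below are hypothetical stand-ins): clear the buffer before entering the critical section, so the lock only covers the cheap free-list manipulation.

	/* Sketch only, NOT DPDK code: all names here are hypothetical.
	 * The expensive memset is hoisted out of the spinlock-protected
	 * critical section. Assumes free_lock was initialized elsewhere
	 * with pthread_spin_init().
	 */
	#include <pthread.h>
	#include <stddef.h>
	#include <string.h>

	struct region {
		struct region *next;
		void *payload;
		size_t len;
		int dirty;	/* contents not guaranteed to be zeroed */
	};

	static pthread_spinlock_t free_lock;
	static struct region *free_head;

	static void region_free(struct region *r)
	{
		/* Clear the payload BEFORE taking the lock: for large
		 * buffers this memset dominates the cost of the free, so
		 * doing it here keeps concurrent alloc/free callers from
		 * serializing behind it.
		 */
		if (!r->dirty)
			memset(r->payload, 0, r->len);

		/* The critical section now covers only the cheap insert. */
		pthread_spin_lock(&free_lock);
		r->next = free_head;
		free_head = r;
		pthread_spin_unlock(&free_lock);
	}

Note the subtlety handled in the diff below: the pre-lock memset is only done when orig_elem is clean; the dirty-origin case is re-checked under the lock, after malloc_elem_free() has joined adjacent elements.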
In my test on a 40-core machine, I added some code to measure the cost of the
whole function in test_align_overlap_per_lcore with different alloc sizes,
under legacy memory mode without existing hugepage files. These are the
results (w/ = with this patch, w/o = without):

size      w/       w/o
64        119us    118us
128       124us    118us
1024      137us    127us
4096      137us    140us
8192      142us    158us
16384     138us    186us
65536     139us    375us
131072    133us    627us
524277    694us    2973us
1048576   2117us   7685us

Signed-off-by: Fengnan Chang <changfeng...@bytedance.com>
---
 lib/eal/common/malloc_elem.c | 16 ----------------
 lib/eal/common/malloc_heap.c | 26 ++++++++++++++++++++++++--
 2 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/lib/eal/common/malloc_elem.c b/lib/eal/common/malloc_elem.c
index 35a2313d04..763bbe179b 100644
--- a/lib/eal/common/malloc_elem.c
+++ b/lib/eal/common/malloc_elem.c
@@ -569,12 +569,6 @@ malloc_elem_join_adjacent_free(struct malloc_elem *elem)
 struct malloc_elem *
 malloc_elem_free(struct malloc_elem *elem)
 {
-	void *ptr;
-	size_t data_len;
-
-	ptr = RTE_PTR_ADD(elem, MALLOC_ELEM_HEADER_LEN);
-	data_len = elem->size - MALLOC_ELEM_OVERHEAD;
-
 	/*
 	 * Consider the element clean for the purposes of joining.
 	 * If both neighbors are clean or non-existent,
@@ -591,16 +585,6 @@ malloc_elem_free(struct malloc_elem *elem)
 
 	/* decrease heap's count of allocated elements */
 	elem->heap->alloc_count--;
-
-#ifndef RTE_MALLOC_DEBUG
-	/* Normally clear the memory when needed. */
-	if (!elem->dirty)
-		memset(ptr, 0, data_len);
-#else
-	/* Always poison the memory in debug mode. */
-	memset(ptr, MALLOC_POISON, data_len);
-#endif
-
 	return elem;
 }
 
diff --git a/lib/eal/common/malloc_heap.c b/lib/eal/common/malloc_heap.c
index d25bdc98f9..a5fdc4cc6f 100644
--- a/lib/eal/common/malloc_heap.c
+++ b/lib/eal/common/malloc_heap.c
@@ -862,6 +862,8 @@ malloc_heap_free(struct malloc_elem *elem)
 	unsigned int i, n_segs, before_space, after_space;
 	int ret;
 	bool unmapped = false;
+	void *ptr;
+	size_t data_len;
 
 	const struct internal_config *internal_conf =
 		eal_get_internal_configuration();
@@ -875,16 +877,36 @@ malloc_heap_free(struct malloc_elem *elem)
 	msl = elem->msl;
 	page_sz = (size_t)msl->page_sz;
 
-	rte_spinlock_lock(&(heap->lock));
-
 	void *asan_ptr = RTE_PTR_ADD(elem, MALLOC_ELEM_HEADER_LEN + elem->pad);
 	size_t asan_data_len = elem->size - MALLOC_ELEM_OVERHEAD - elem->pad;
 
+	ptr = RTE_PTR_ADD(elem, MALLOC_ELEM_HEADER_LEN);
+	data_len = elem->size - MALLOC_ELEM_OVERHEAD;
+
+	/* If orig_elem is clean, any child elem should be clean, so do the
+	 * memset before taking the lock.
+	 */
+	if (internal_conf->legacy_mem && !elem->orig_elem->dirty)
+		memset(ptr, 0, data_len);
+
+	rte_spinlock_lock(&(heap->lock));
 	/* mark element as free */
 	elem->state = ELEM_FREE;
 
 	elem = malloc_elem_free(elem);
 
+#ifndef RTE_MALLOC_DEBUG
+	if (internal_conf->legacy_mem) {
+		/* If orig_elem is dirty but the joined element is clean, do the memset now */
+		if (elem->orig_elem->dirty && !elem->dirty)
+			memset(ptr, 0, data_len);
+	} else if (!elem->dirty) {
+		memset(ptr, 0, data_len);
+	}
+#else
+	/* Always poison the memory in debug mode. */
+	memset(ptr, MALLOC_POISON, data_len);
+#endif
 	/* anything after this is a bonus */
 	ret = 0;
 
-- 
2.20.1