Hi,
A question from Theo made me think about realloc and come up with a
particularly bad case for performance. I do not know if it happens in
practice, but it was easy to create a test program to hit the case.
We're talking allocations >= a page here. Smaller allocations follow
different rules.
If an allocation is grown by realloc, it first tries to extend the
allocation by mapping pages next to the existing allocation. Since
the location of pages is randomized, chances are high that next to an
allocation there are unmapped pages so the grow will work out.
If realloc needs to shrink the allocation it puts the high pages no
longer needed in the malloc cache. There they can be re-used by other
allocations. But if that happens, a subsequent grow of the first allocation will
fail: the pages are already mapped. So realloc needs to do a new
allocation followed by a copy and a cleanup of the original.
So this strategy of a shrinking realloc putting unneeded pages into
the cache can work against us, plus it has the consequence that use of
realloc leads to allocations close to each other: no free guard pages.
The program below tests this scenario and runs awfully slow. The diff
fixes this by applying two strategies. The first already makes a huge
difference, but the second strategy will also reduce the total number
of syscalls at the cost of some more memory use.
1. I do not put high pages of shrinking reallocs into the cache, but
directly unmap.
2. For small shrinking reallocs realloc becomes a no-op. Pro: no
syscalls at all, cons: the actual allocation is larger, so less
overflow detection. So I do not do this if guard pages are active or
the reduction is larger than the cache size.
Some stats: the first run is -current, the second one is with (an earlier
version of) the diff on an armv7 machine. Other systems also show huge
differences.
[otto@wand:19]$ time ./a.out
0m31.68s real 0m10.02s user 0m21.65s system
[otto@wand:33]$ time ./a.out
0m00.16s real 0m00.12s user 0m00.03s system
I do not see any difference for builds. But I can imagine real-life
programs hitting the case.
-Otto
#include <err.h>
#include <stdlib.h>
#include <unistd.h>
/* Benchmark state: a large allocation that is repeatedly shrunk and regrown. */
void *p;
/* Page size, fetched once in main(). */
size_t psz;

/* Abort via err(3) if an allocation failed. */
#define E(x) do { if ((x) == NULL) err(1, NULL); } while (0)

/*
 * One iteration of the pathological pattern: shrink p by a page, let
 * other allocations grab pages (possibly the one just released next to
 * p), then grow p back by a page.
 */
void f(void)
{
	void *blocks[64];
	int n;

	/* Shrink by one page; the freed page may end up in the cache. */
	p = realloc(p, 1023*psz);
	E(p);

	/* Page-sized allocations that may reuse the just-released page. */
	for (n = 0; n < 64; n++) {
		blocks[n] = malloc(psz);
		E(blocks[n]);
	}

	/* Grow back; cheap only if the page next to p is still unmapped. */
	p = realloc(p, 1024*psz);
	E(p);

	for (n = 0; n < 64; n++)
		free(blocks[n]);
}
/*
 * Driver: create a 1024-page allocation, then run the shrink/grow
 * cycle many times to stress the allocator's page-cache handling.
 */
int
main(void)
{
	int i;

	psz = getpagesize();
	p = malloc(1024*psz);
	E(p);
	for (i = 0; i < 1000; i++)
		f();
	/* Release the test allocation before exit. */
	free(p);
	return 0;
}
Index: malloc.c
===================================================================
RCS file: /cvs/src/lib/libc/stdlib/malloc.c,v
retrieving revision 1.262
diff -u -p -r1.262 malloc.c
--- malloc.c 28 Jun 2019 13:32:42 -0000 1.262
+++ malloc.c 31 Aug 2020 06:01:40 -0000
@@ -728,28 +728,8 @@ unmap(struct dir_info *d, void *p, size_
wrterror(d, "malloc cache overflow");
}
-static void
-zapcacheregion(struct dir_info *d, void *p, size_t len)
-{
- u_int i;
- struct region_info *r;
- size_t rsz;
-
- for (i = 0; i < d->malloc_cache; i++) {
- r = &d->free_regions[i];
- if (r->p >= p && r->p <= (void *)((char *)p + len)) {
- rsz = r->size << MALLOC_PAGESHIFT;
- if (munmap(r->p, rsz))
- wrterror(d, "munmap %p", r->p);
- r->p = NULL;
- d->free_regions_size -= r->size;
- STATS_SUB(d->malloc_used, rsz);
- }
- }
-}
-
static void *
-map(struct dir_info *d, void *hint, size_t sz, int zero_fill)
+map(struct dir_info *d, size_t sz, int zero_fill)
{
size_t psz = sz >> MALLOC_PAGESHIFT;
struct region_info *r, *big = NULL;
@@ -762,7 +742,7 @@ map(struct dir_info *d, void *hint, size
if (sz != PAGEROUND(sz))
wrterror(d, "map round");
- if (hint == NULL && psz > d->free_regions_size) {
+ if (psz > d->free_regions_size) {
_MALLOC_LEAVE(d);
p = MMAP(sz, d->mmap_flag);
_MALLOC_ENTER(d);
@@ -774,8 +754,6 @@ map(struct dir_info *d, void *hint, size
for (i = 0; i < d->malloc_cache; i++) {
r = &d->free_regions[(i + d->rotor) & (d->malloc_cache - 1)];
if (r->p != NULL) {
- if (hint != NULL && r->p != hint)
- continue;
if (r->size == psz) {
p = r->p;
r->p = NULL;
@@ -807,8 +785,6 @@ map(struct dir_info *d, void *hint, size
memset(p, SOME_FREEJUNK, sz);
return p;
}
- if (hint != NULL)
- return MAP_FAILED;
if (d->free_regions_size > d->malloc_cache)
wrterror(d, "malloc cache");
_MALLOC_LEAVE(d);
@@ -892,7 +868,7 @@ omalloc_make_chunks(struct dir_info *d,
void *pp;
/* Allocate a new bucket */
- pp = map(d, NULL, MALLOC_PAGESIZE, 0);
+ pp = map(d, MALLOC_PAGESIZE, 0);
if (pp == MAP_FAILED)
return NULL;
@@ -1136,7 +1112,7 @@ omalloc(struct dir_info *pool, size_t sz
}
sz += mopts.malloc_guard;
psz = PAGEROUND(sz);
- p = map(pool, NULL, psz, zero_fill);
+ p = map(pool, psz, zero_fill);
if (p == MAP_FAILED) {
errno = ENOMEM;
return NULL;
@@ -1571,11 +1547,20 @@ orealloc(struct dir_info **argpool, void
forced = mopts.malloc_realloc || pool->mmap_flag;
if (newsz > MALLOC_MAXCHUNK && oldsz > MALLOC_MAXCHUNK && !forced) {
- /* First case: from n pages sized allocation to m pages sized
- allocation, m > n */
size_t roldsz = PAGEROUND(goldsz);
size_t rnewsz = PAGEROUND(gnewsz);
+ /* A (relatively and absolutely) small shrinking realloc
+ is a no-op but don't do that if guards are enabled */
+ if (rnewsz < roldsz && rnewsz > roldsz / 2 &&
+ roldsz - rnewsz <= pool->malloc_cache * MALLOC_PAGESIZE &&
+ !mopts.malloc_guard) {
+ ret = p;
+ goto done;
+ }
+
+ /* First case: from n pages sized allocation to m pages sized
+ allocation, m > n */
if (rnewsz > roldsz) {
/* try to extend existing region */
if (!mopts.malloc_guard) {
@@ -1583,10 +1568,6 @@ orealloc(struct dir_info **argpool, void
size_t needed = rnewsz - roldsz;
STATS_INC(pool->cheap_realloc_tries);
- q = map(pool, hint, needed, 0);
- if (q == hint)
- goto gotit;
- zapcacheregion(pool, hint, needed);
q = MQUERY(hint, needed, pool->mmap_flag);
if (q == hint)
q = MMAPA(hint, needed,
pool->mmap_flag);
@@ -1618,17 +1599,13 @@ gotit:
} else if (rnewsz < roldsz) {
/* shrink number of pages */
if (mopts.malloc_guard) {
- if (mprotect((char *)r->p + roldsz -
- mopts.malloc_guard, mopts.malloc_guard,
- PROT_READ | PROT_WRITE))
- wrterror(pool, "mprotect");
if (mprotect((char *)r->p + rnewsz -
mopts.malloc_guard, mopts.malloc_guard,
PROT_NONE))
wrterror(pool, "mprotect");
}
- unmap(pool, (char *)r->p + rnewsz, roldsz - rnewsz, 0,
- pool->malloc_junk);
+ if (munmap((char *)r->p + rnewsz, roldsz - rnewsz))
+ wrterror(pool, "munmap %p", (char *)r->p +
rnewsz);
r->size = gnewsz;
if (MALLOC_MOVE_COND(gnewsz)) {
void *pp = MALLOC_MOVE(r->p, gnewsz);
@@ -1800,9 +1777,18 @@ orecallocarray(struct dir_info **argpool
info->bits[info->offset + chunknum],
oldsize);
}
- } else if (oldsize != sz - mopts.malloc_guard)
- wrterror(pool, "recorded old size %zu != %zu",
- sz - mopts.malloc_guard, oldsize);
+ } else {
+ if (mopts.malloc_guard) {
+ if (oldsize != sz - mopts.malloc_guard)
+ wrterror(pool, "recorded old size %zu != %zu",
+ sz - mopts.malloc_guard, oldsize);
+ } else {
+ /* A no-op shrink by realloc could have been done */
+ if (oldsize < sz / 2 || oldsize > sz)
+ wrterror(pool, "recorded old size %zu "
+ "inconsistent with %zu", sz, oldsize);
+ }
+ }
newptr = omalloc(pool, newsize, 0, f);
if (newptr == NULL)
@@ -1937,7 +1923,7 @@ mapalign(struct dir_info *d, size_t alig
if (alignment > SIZE_MAX - sz)
return MAP_FAILED;
- p = map(d, NULL, sz + alignment, zero_fill);
+ p = map(d, sz + alignment, zero_fill);
if (p == MAP_FAILED)
return MAP_FAILED;
q = (char *)(((uintptr_t)p + alignment - 1) & ~(alignment - 1));