On Sat, Aug 24, 2019 at 6:24 PM, Mark Wielaard <m...@klomp.org> wrote:
Hi,
On Fri, Aug 16, 2019 at 02:24:28PM -0500, Jonathon Anderson wrote:
2. Adding a lock & array structure to the memory manager
(pseudo-TLS)
(libdwP.h, libdw_alloc.c)
Specifically concentrating on this part.
diff --git a/libdw/ChangeLog b/libdw/ChangeLog
index bf1f4857..87abf7a7 100644
--- a/libdw/ChangeLog
+++ b/libdw/ChangeLog
@@ -1,3 +1,12 @@
+2019-08-15 Jonathon Anderson <jm...@rice.edu>
+
+ * libdw_alloc.c (__libdw_allocate): Added thread-safe stack
allocator.
+ * libdwP.h (Dwarf): Likewise.
+ * dwarf_begin_elf.c (dwarf_begin_elf): Support for above.
+ * dwarf_end.c (dwarf_end): Likewise.
[...]
+ * Makefile.am: Link -lpthread to provide rwlocks.
diff --git a/libdw/Makefile.am b/libdw/Makefile.am
index 7a3d5322..6d0a0187 100644
--- a/libdw/Makefile.am
+++ b/libdw/Makefile.am
@@ -108,7 +108,7 @@ am_libdw_pic_a_OBJECTS =
$(libdw_a_SOURCES:.c=.os)
libdw_so_LIBS = libdw_pic.a ../libdwelf/libdwelf_pic.a \
../libdwfl/libdwfl_pic.a ../libebl/libebl.a
libdw_so_DEPS = ../lib/libeu.a ../libelf/libelf.so
-libdw_so_LDLIBS = $(libdw_so_DEPS) -ldl -lz $(argp_LDADD)
$(zip_LIBS)
+libdw_so_LDLIBS = $(libdw_so_DEPS) -ldl -lz $(argp_LDADD)
$(zip_LIBS)
-lpthread
libdw_so_SOURCES =
libdw.so$(EXEEXT): $(srcdir)/libdw.map $(libdw_so_LIBS)
$(libdw_so_DEPS)
# The rpath is necessary for libebl because its $ORIGIN use will
Do we also need -pthread for CFLAGS?
Yep, that would be the smart thing to do. Will be handled in the next
version of the patch (once I figure out how to autotools...).
diff --git a/libdw/dwarf_begin_elf.c b/libdw/dwarf_begin_elf.c
index 38c8f5c6..b3885bb5 100644
--- a/libdw/dwarf_begin_elf.c
+++ b/libdw/dwarf_begin_elf.c
@@ -417,11 +417,14 @@ dwarf_begin_elf (Elf *elf, Dwarf_Cmd cmd,
Elf_Scn
*scngrp)
/* Initialize the memory handling. */
result->mem_default_size = mem_default_size;
result->oom_handler = __libdw_oom;
- result->mem_tail = (struct libdw_memblock *) (result + 1);
- result->mem_tail->size = (result->mem_default_size
- - offsetof (struct libdw_memblock, mem));
- result->mem_tail->remaining = result->mem_tail->size;
- result->mem_tail->prev = NULL;
+ pthread_rwlock_init(&result->mem_rwl, NULL);
+ result->mem_stacks = 1;
+ result->mem_tails = malloc (sizeof (struct libdw_memblock *));
+ result->mem_tails[0] = (struct libdw_memblock *) (result + 1);
+ result->mem_tails[0]->size = (result->mem_default_size
+ - offsetof (struct libdw_memblock, mem));
+ result->mem_tails[0]->remaining = result->mem_tails[0]->size;
+ result->mem_tails[0]->prev = NULL;
Can't we just set mem_tails[0] = NULL?
Wouldn't __libdw_alloc_tail () then initialize it?
Mostly an artifact of preserving the previous behavior for
single-threaded programs, the malloc for the Dwarf then simplifies too
(and less memory usage for opened but otherwise unused Dwarfs).
Downside is an extra malloc on first usage (which shouldn't be an issue
for any real program).
diff --git a/libdw/dwarf_end.c b/libdw/dwarf_end.c
index 29795c10..6317bcda 100644
--- a/libdw/dwarf_end.c
+++ b/libdw/dwarf_end.c
@@ -94,14 +94,22 @@ dwarf_end (Dwarf *dwarf)
/* And the split Dwarf. */
tdestroy (dwarf->split_tree, noop_free);
- struct libdw_memblock *memp = dwarf->mem_tail;
- /* The first block is allocated together with the Dwarf
object. */
- while (memp->prev != NULL)
- {
- struct libdw_memblock *prevp = memp->prev;
- free (memp);
- memp = prevp;
- }
+ for (size_t i = 0; i < dwarf->mem_stacks; i++)
+ {
+ struct libdw_memblock *memp = dwarf->mem_tails[i];
+ /* The first block is allocated together with the Dwarf
object.
*/
Then this comment wouldn't be true anymore.
+ while (memp != NULL && memp->prev != NULL)
+ {
+ struct libdw_memblock *prevp = memp->prev;
+ free (memp);
+ memp = prevp;
+ }
+ /* Only for stack 0 though, the others are allocated
individually. */
+ if (memp != NULL && i > 0)
+ free (memp);
+ }
And then you don't need this special case.
+ free (dwarf->mem_tails);
+ pthread_rwlock_destroy (&dwarf->mem_rwl);
/* Free the pubnames helper structure. */
free (dwarf->pubnames_sets);
diff --git a/libdw/libdwP.h b/libdw/libdwP.h
index eebb7d12..442d493d 100644
--- a/libdw/libdwP.h
+++ b/libdw/libdwP.h
@@ -31,6 +31,7 @@
#include <libintl.h>
#include <stdbool.h>
+#include <pthread.h>
#include <libdw.h>
#include <dwarf.h>
@@ -218,16 +219,18 @@ struct Dwarf
/* Similar for addrx/constx, which will come from .debug_addr
section. */
struct Dwarf_CU *fake_addr_cu;
- /* Internal memory handling. This is basically a simplified
+ /* Internal memory handling. This is basically a simplified
thread-local
reimplementation of obstacks. Unfortunately the standard
obstack
implementation is not usable in libraries. */
+ pthread_rwlock_t mem_rwl;
+ size_t mem_stacks;
struct libdw_memblock
{
size_t size;
size_t remaining;
struct libdw_memblock *prev;
char mem[0];
- } *mem_tail;
+ } **mem_tails;
Please document what/how exactly mem_rwl protects which other fields.
Got it.
/* Default size of allocated memory blocks. */
size_t mem_default_size;
@@ -572,7 +575,7 @@ extern void __libdw_seterrno (int value)
internal_function;
/* Memory handling, the easy parts. This macro does not do any
locking. */
This comment isn't true anymore. __libdw_alloc_tail does locking.
Got it.
Also I think the locking should be extended beyond just that
functions, see below.
#define libdw_alloc(dbg, type, tsize, cnt) \
- ({ struct libdw_memblock *_tail = (dbg)->mem_tail;
\
+ ({ struct libdw_memblock *_tail = __libdw_alloc_tail(dbg);
\
size_t _required = (tsize) * (cnt); \
type *_result = (type *) (_tail->mem + (_tail->size -
_tail->remaining));\
size_t _padding = ((__alignof (type) \
@@ -591,6 +594,10 @@ extern void __libdw_seterrno (int value)
internal_function;
#define libdw_typed_alloc(dbg, type) \
libdw_alloc (dbg, type, sizeof (type), 1)
+/* Callback to choose a thread-local memory allocation stack. */
+extern struct libdw_memblock *__libdw_alloc_tail (Dwarf* dbg)
+ __nonnull_attribute__ (1);
+
/* Callback to allocate more. */
extern void *__libdw_allocate (Dwarf *dbg, size_t minsize, size_t
align)
__attribute__ ((__malloc__)) __nonnull_attribute__ (1);
diff --git a/libdw/libdw_alloc.c b/libdw/libdw_alloc.c
index f1e08714..c3c5e8a7 100644
--- a/libdw/libdw_alloc.c
+++ b/libdw/libdw_alloc.c
@@ -33,9 +33,73 @@
#include <errno.h>
#include <stdlib.h>
+#include <assert.h>
#include "libdwP.h"
#include "system.h"
+#include "stdatomic.h"
+#if USE_VG_ANNOTATIONS == 1
+#include <helgrind.h>
+#include <drd.h>
+#else
+#define ANNOTATE_HAPPENS_BEFORE(X)
+#define ANNOTATE_HAPPENS_AFTER(X)
+#endif
+
+
+#define thread_local __thread
+static thread_local int initialized = 0;
+static thread_local size_t thread_id;
+static atomic_size_t next_id = ATOMIC_VAR_INIT(0);
Could you initialize thread_id to (size_t) -1?
Then you don't need another thread_local to indicate not initialized?
Just a habit, easy enough to change for the next version.
+struct libdw_memblock *
+__libdw_alloc_tail (Dwarf *dbg)
+{
+ if (!initialized)
And change this to thread_id == (size_t) -1?
+ {
+ thread_id = atomic_fetch_add (&next_id, 1);
+ initialized = 1;
+ }
+
+ pthread_rwlock_rdlock (&dbg->mem_rwl);
+ if (thread_id >= dbg->mem_stacks)
+ {
+ pthread_rwlock_unlock (&dbg->mem_rwl);
+ pthread_rwlock_wrlock (&dbg->mem_rwl);
+
+ /* Another thread may have already reallocated. In theory
using an
+ atomic would be faster, but given that this only happens
once per
+ thread per Dwarf, some minor slowdown should be fine. */
+ if (thread_id >= dbg->mem_stacks)
+ {
+ dbg->mem_tails = realloc (dbg->mem_tails, (thread_id+1)
+ * sizeof (struct
libdw_memblock *));
+ assert(dbg->mem_tails);
Don't use assert here, use if (dbg->mem_tails == NULL)
dbg->oom_handler ();
Same as __libdw_allocate, got it. Apologies for not noticing sooner.
But what if realloc moves the block?
Then all dbg->mem_tails[thread_id] pointers become invalid.
And this function drops the lock before returning a libdw_memblock*.
Not quite, dbg->mem_tails is an array of pointers (note the ** in its
definition, and the use of the plural "tails"). So while the
dbg->mem_tails pointer becomes invalid, the dbg->mem_tails[thread_id]
pointers don't.
It would be a different story if dbg->mem_tails was an array of
libdw_memblocks, but it's not. That would increase the "memory leak"
issue mentioned previously (to ~4K per dead thread) and require more
work on the part of the reallocing thread to initialize the new entries
(which at the moment should reduce to a memset, assuming the compiler
is smart enough and NULL == 0).
So I think the lock needs to extend beyond this function somehow. Or
we need to prevent another thread reallocing while we are dealing with
the assigned memblock.
No need, in fact we want to drop the lock as soon as possible to let
new threads in for realloc's. Under the assumption that we don't need
to allocate new blocks (extremely) often, the extra cost to relock when
we do should be relatively small. Also see __libdw_allocate.
+ for (size_t i = dbg->mem_stacks; i <= thread_id; i++)
+ dbg->mem_tails[i] = NULL;
+ dbg->mem_stacks = thread_id + 1;
+ ANNOTATE_HAPPENS_BEFORE (&dbg->mem_tails);
+ }
+
+ pthread_rwlock_unlock (&dbg->mem_rwl);
+ pthread_rwlock_rdlock (&dbg->mem_rwl);
+ }
+
+ /* At this point, we have an entry in the tail array. */
+ ANNOTATE_HAPPENS_AFTER (&dbg->mem_tails);
+ struct libdw_memblock *result = dbg->mem_tails[thread_id];
+ if (result == NULL)
+ {
+ result = malloc (dbg->mem_default_size);
+ result->size = dbg->mem_default_size
+ - offsetof (struct libdw_memblock, mem);
+ result->remaining = result->size;
+ result->prev = NULL;
+ dbg->mem_tails[thread_id] = result;
+ }
+ pthread_rwlock_unlock (&dbg->mem_rwl);
+ return result;
Here the lock is dropped and result points into dbg->mem_tails
which means it cannot be used because the realloc above might
make it invalid.
result points to a (potentially newly created) memory block, accessible
only by this thread and never realloc'd.
+}
void *
__libdw_allocate (Dwarf *dbg, size_t minsize, size_t align)
@@ -52,8 +116,10 @@ __libdw_allocate (Dwarf *dbg, size_t minsize,
size_t
align)
newp->size = size - offsetof (struct libdw_memblock, mem);
newp->remaining = (uintptr_t) newp + size - (result + minsize);
- newp->prev = dbg->mem_tail;
- dbg->mem_tail = newp;
+ pthread_rwlock_rdlock (&dbg->mem_rwl);
+ newp->prev = dbg->mem_tails[thread_id];
+ dbg->mem_tails[thread_id] = newp;
+ pthread_rwlock_unlock (&dbg->mem_rwl);
Here we tinker with dbg->mem_tails under a read lock, which is mutually
excluded from the write lock held during realloc. The new block belongs
to this thread alone, so it doesn't need any locks to manage its usage.
return (void *) result;
}
Cheers,
Mark
As mentioned in other mails, this management scheme isn't (always)
optimally memory efficient, but its speed is good under parallel
stress. Far better than wrapping the whole thing with a single
pthread_mutex_t.
-Jonathon