On Sat, Aug 24, 2019 at 6:24 PM, Mark Wielaard <m...@klomp.org> wrote:
Hi,
On Fri, Aug 16, 2019 at 02:24:28PM -0500, Jonathon Anderson wrote:
2. Adding a lock & array structure to the memory manager
(pseudo-TLS)
(libdwP.h, libdw_alloc.c)
Specifically concentrating on this part.
diff --git a/libdw/ChangeLog b/libdw/ChangeLog
index bf1f4857..87abf7a7 100644
--- a/libdw/ChangeLog
+++ b/libdw/ChangeLog
@@ -1,3 +1,12 @@
+2019-08-15 Jonathon Anderson <jm...@rice.edu>
+
+ * libdw_alloc.c (__libdw_allocate): Added thread-safe stack
allocator.
+ * libdwP.h (Dwarf): Likewise.
+ * dwarf_begin_elf.c (dwarf_begin_elf): Support for above.
+ * dwarf_end.c (dwarf_end): Likewise.
[...]
+ * Makefile.am: Link -lpthread to provide rwlocks.
diff --git a/libdw/Makefile.am b/libdw/Makefile.am
index 7a3d5322..6d0a0187 100644
--- a/libdw/Makefile.am
+++ b/libdw/Makefile.am
@@ -108,7 +108,7 @@ am_libdw_pic_a_OBJECTS =
$(libdw_a_SOURCES:.c=.os)
libdw_so_LIBS = libdw_pic.a ../libdwelf/libdwelf_pic.a \
../libdwfl/libdwfl_pic.a ../libebl/libebl.a
libdw_so_DEPS = ../lib/libeu.a ../libelf/libelf.so
-libdw_so_LDLIBS = $(libdw_so_DEPS) -ldl -lz $(argp_LDADD)
$(zip_LIBS)
+libdw_so_LDLIBS = $(libdw_so_DEPS) -ldl -lz $(argp_LDADD)
$(zip_LIBS)
-lpthread
libdw_so_SOURCES =
libdw.so$(EXEEXT): $(srcdir)/libdw.map $(libdw_so_LIBS)
$(libdw_so_DEPS)
# The rpath is necessary for libebl because its $ORIGIN use will
Do we also need -pthread for CFLAGS?
Yep, that would be the smart thing to do. Will be handled in the next
version of the patch (once I figure out how to autotools...).
diff --git a/libdw/dwarf_begin_elf.c b/libdw/dwarf_begin_elf.c
index 38c8f5c6..b3885bb5 100644
--- a/libdw/dwarf_begin_elf.c
+++ b/libdw/dwarf_begin_elf.c
@@ -417,11 +417,14 @@ dwarf_begin_elf (Elf *elf, Dwarf_Cmd cmd,
Elf_Scn
*scngrp)
/* Initialize the memory handling. */
result->mem_default_size = mem_default_size;
result->oom_handler = __libdw_oom;
- result->mem_tail = (struct libdw_memblock *) (result + 1);
- result->mem_tail->size = (result->mem_default_size
- - offsetof (struct libdw_memblock, mem));
- result->mem_tail->remaining = result->mem_tail->size;
- result->mem_tail->prev = NULL;
+ pthread_rwlock_init(&result->mem_rwl, NULL);
+ result->mem_stacks = 1;
+ result->mem_tails = malloc (sizeof (struct libdw_memblock *));
+ result->mem_tails[0] = (struct libdw_memblock *) (result + 1);
+ result->mem_tails[0]->size = (result->mem_default_size
+ - offsetof (struct libdw_memblock, mem));
+ result->mem_tails[0]->remaining = result->mem_tails[0]->size;
+ result->mem_tails[0]->prev = NULL;
Can't we just set mem_tails[0] = NULL?
Wouldn't __libdw_alloc_tail () then initialize it?
Mostly an artifact of preserving the previous behavior for
single-threaded programs, the malloc for the Dwarf then simplifies too
(and less memory usage for opened but otherwise unused Dwarfs).
Downside is an extra malloc on first usage (which shouldn't be an issue
for any real program).
diff --git a/libdw/dwarf_end.c b/libdw/dwarf_end.c
index 29795c10..6317bcda 100644
--- a/libdw/dwarf_end.c
+++ b/libdw/dwarf_end.c
@@ -94,14 +94,22 @@ dwarf_end (Dwarf *dwarf)
/* And the split Dwarf. */
tdestroy (dwarf->split_tree, noop_free);
- struct libdw_memblock *memp = dwarf->mem_tail;
- /* The first block is allocated together with the Dwarf
object. */
- while (memp->prev != NULL)
- {
- struct libdw_memblock *prevp = memp->prev;
- free (memp);
- memp = prevp;
- }
+ for (size_t i = 0; i < dwarf->mem_stacks; i++)
+ {
+ struct libdw_memblock *memp = dwarf->mem_tails[i];
+ /* The first block is allocated together with the Dwarf
object.
*/
Then this comment wouldn't be true anymore.
+ while (memp != NULL && memp->prev != NULL)
+ {
+ struct libdw_memblock *prevp = memp->prev;
+ free (memp);
+ memp = prevp;
+ }
+ /* Only for stack 0 though, the others are allocated
individually. */
+ if (memp != NULL && i > 0)
+ free (memp);
+ }
And then you don't need this special case.
+ free (dwarf->mem_tails);
+ pthread_rwlock_destroy (&dwarf->mem_rwl);
/* Free the pubnames helper structure. */
free (dwarf->pubnames_sets);
diff --git a/libdw/libdwP.h b/libdw/libdwP.h
index eebb7d12..442d493d 100644
--- a/libdw/libdwP.h
+++ b/libdw/libdwP.h
@@ -31,6 +31,7 @@
#include <libintl.h>
#include <stdbool.h>
+#include <pthread.h>
#include <libdw.h>
#include <dwarf.h>
@@ -218,16 +219,18 @@ struct Dwarf
/* Similar for addrx/constx, which will come from .debug_addr
section. */
struct Dwarf_CU *fake_addr_cu;
- /* Internal memory handling. This is basically a simplified
+ /* Internal memory handling. This is basically a simplified
thread-local
reimplementation of obstacks. Unfortunately the standard
obstack
implementation is not usable in libraries. */
+ pthread_rwlock_t mem_rwl;
+ size_t mem_stacks;
struct libdw_memblock
{
size_t size;
size_t remaining;
struct libdw_memblock *prev;
char mem[0];
- } *mem_tail;
+ } **mem_tails;
Please document what/how exactly mem_rwl protects which other fields.
Got it.
/* Default size of allocated memory blocks. */
size_t mem_default_size;
@@ -572,7 +575,7 @@ extern void __libdw_seterrno (int value)
internal_function;
/* Memory handling, the easy parts. This macro does not do any
locking. */
This comment isn't true anymore. __libdw_alloc_tail does locking.
Got it.
Also I think the locking should be extended beyond just that
functions, see below.
#define libdw_alloc(dbg, type, tsize, cnt) \
- ({ struct libdw_memblock *_tail = (dbg)->mem_tail;
\
+ ({ struct libdw_memblock *_tail = __libdw_alloc_tail(dbg);
\
size_t _required = (tsize) * (cnt); \
type *_result = (type *) (_tail->mem + (_tail->size -
_tail->remaining));\
size_t _padding = ((__alignof (type) \
@@ -591,6 +594,10 @@ extern void __libdw_seterrno (int value)
internal_function;
#define libdw_typed_alloc(dbg, type) \
libdw_alloc (dbg, type, sizeof (type), 1)
+/* Callback to choose a thread-local memory allocation stack. */
+extern struct libdw_memblock *__libdw_alloc_tail (Dwarf* dbg)
+ __nonnull_attribute__ (1);
+
/* Callback to allocate more. */
extern void *__libdw_allocate (Dwarf *dbg, size_t minsize, size_t
align)
__attribute__ ((__malloc__)) __nonnull_attribute__ (1);
diff --git a/libdw/libdw_alloc.c b/libdw/libdw_alloc.c
index f1e08714..c3c5e8a7 100644
--- a/libdw/libdw_alloc.c
+++ b/libdw/libdw_alloc.c
@@ -33,9 +33,73 @@
#include <errno.h>
#include <stdlib.h>
+#include <assert.h>
#include "libdwP.h"
#include "system.h"
+#include "stdatomic.h"
+#if USE_VG_ANNOTATIONS == 1
+#include <helgrind.h>
+#include <drd.h>
+#else
+#define ANNOTATE_HAPPENS_BEFORE(X)
+#define ANNOTATE_HAPPENS_AFTER(X)
+#endif
+
+
+#define thread_local __thread
+static thread_local int initialized = 0;
+static thread_local size_t thread_id;
+static atomic_size_t next_id = ATOMIC_VAR_INIT(0);
Could you initialize thread_id to (size_t) -1?
Then you don't need another thread_local to indicate not initialized?
Just a habit, easy enough to change for the next version.
+struct libdw_memblock *
+__libdw_alloc_tail (Dwarf *dbg)
+{
+ if (!initialized)
And change this to thread_id == (size_t) -1?
+ {
+ thread_id = atomic_fetch_add (&next_id, 1);
+ initialized = 1;
+ }
+
+ pthread_rwlock_rdlock (&dbg->mem_rwl);
+ if (thread_id >= dbg->mem_stacks)
+ {
+ pthread_rwlock_unlock (&dbg->mem_rwl);
+ pthread_rwlock_wrlock (&dbg->mem_rwl);
+
+ /* Another thread may have already reallocated. In theory
using an
+ atomic would be faster, but given that this only happens
once per
+ thread per Dwarf, some minor slowdown should be fine. */
+ if (thread_id >= dbg->mem_stacks)
+ {
+ dbg->mem_tails = realloc (dbg->mem_tails, (thread_id+1)
+ * sizeof (struct
libdw_memblock *));
+ assert(dbg->mem_tails);
Don't use assert here, use if (dbg->mem_tails == NULL)
dbg->oom_handler ();
Same as __libdw_allocate, got it. Apologies for not noticing sooner.
But what if realloc moves the block?
Then all dbg->mem_tails[thread_id] pointers become invalid.
And this function drops the lock before returning a libdw_memblock*.
Not quite, dbg->mem_tails is an array of pointers (note the ** in its
definition, and the use of the plural "tails"). So while the
dbg->mem_tails pointer becomes invalid, the dbg->mem_tails[thread_id]
pointers don't.
It would be a different story if dbg->mem_tails was an array of
libdw_memblocks, but it's not. That would increase the "memory leak"
issue mentioned previously (to ~4K per dead thread) and require more
work on the part of the reallocing thread to initialize the new entries
(which at the moment should reduce to a memset, assuming the compiler
is smart enough and NULL == 0).
So I think the lock needs to extend beyond this function somehow. Or
we need to prevent another thread reallocing while we are dealing with
the assigned memblock.
No need, in fact we want to drop the lock as soon as possible to let
new threads in for realloc's. Under the assumption that we don't need
to allocate new blocks (extremely) often, the extra cost to relock when
we do should be relatively small. Also see __libdw_allocate.
+ for (size_t i = dbg->mem_stacks; i <= thread_id; i++)
+ dbg->mem_tails[i] = NULL;
+ dbg->mem_stacks = thread_id + 1;
+ ANNOTATE_HAPPENS_BEFORE (&dbg->mem_tails);
+ }
+
+ pthread_rwlock_unlock (&dbg->mem_rwl);
+ pthread_rwlock_rdlock (&dbg->mem_rwl);
+ }
+
+ /* At this point, we have an entry in the tail array. */
+ ANNOTATE_HAPPENS_AFTER (&dbg->mem_tails);
+ struct libdw_memblock *result = dbg->mem_tails[thread_id];
+ if (result == NULL)
+ {
+ result = malloc (dbg->mem_default_size);
+ result->size = dbg->mem_default_size
+ - offsetof (struct libdw_memblock, mem);
+ result->remaining = result->size;
+ result->prev = NULL;
+ dbg->mem_tails[thread_id] = result;
+ }
+ pthread_rwlock_unlock (&dbg->mem_rwl);
+ return result;
Here the lock is dropped and result points into dbg->mem_tails
which means it cannot be used because the realloc above might
make it invalid.
result points to a (potentially newly created) memory block, accessible
only by this thread and never realloc'd.
+}
void *
__libdw_allocate (Dwarf *dbg, size_t minsize, size_t align)
@@ -52,8 +116,10 @@ __libdw_allocate (Dwarf *dbg, size_t minsize,
size_t
align)
newp->size = size - offsetof (struct libdw_memblock, mem);
newp->remaining = (uintptr_t) newp + size - (result + minsize);
- newp->prev = dbg->mem_tail;
- dbg->mem_tail = newp;
+ pthread_rwlock_rdlock (&dbg->mem_rwl);
+ newp->prev = dbg->mem_tails[thread_id];
+ dbg->mem_tails[thread_id] = newp;
+ pthread_rwlock_unlock (&dbg->mem_rwl);
Here we tinker with dbg->mem_tails under a read lock, which is mutually
excluded from the write lock held during realloc. The new block belongs
to this thread alone, so it doesn't need any locks to manage its usage.
return (void *) result;
}
Cheers,
Mark
As mentioned in other mails, this management scheme isn't (always)
optimally memory efficient, but its speed is good under parallel
stress. Far better than wrapping the whole thing with a single
pthread_mutex_t.
-Jonathon