Am 03.03.19 um 18:59 schrieb Johannes Pfau:
Hello all,
I wanted to ask for some feedback on an issue which has affected GDC
for a long time now: Letting the D garbage collector scan
emulated TLS memory. Basically, for every variable instance in TLS
memory we need to be able to get the address and size of that instance,
so the GC can scan this memory for pointers.
Now with libgcc emutls there are essentially two problems:
1) Memory is allocated individually for each variable, so it's not
possible to simply get a memory range as in native TLS.
I can't think of a way to get the required information with the
current API, so I've attached proof-of-concept patch which
implements a __emutls_iterate_memory function. It is however rather
intrusive, so maybe there's a better solution. Currently it keeps
the size for all variables in every threads local memory, but an
optimization would be to keep the size only once in a separate array.
A completely different, less intrusive solution would be to allow the
frontend to specify different emutls functions (e.g.
__d_emutls_get_address instead of __emutls_get_address). The main
drawback here is that linking C/D emutls variables would not be
possible due to the incompatible ABI. However, this is also true
for the other D compilers' (DMD) emutls implementation, so this may
be fine. A D emutls implementation would also be much simpler and
probably faster (we can just allocate using the GC).
(Of course it's way too late for such changes in GCC9, so I'm just
asking for some general feedback.)
Ping.
In the meantime I've also implemented the second solution: I added
additional langhooks to make gdc/gcc call __emutls_d_get_addr instead
and reimplemented the emutls support in druntime. However, the druntime
implementation didn't turn out as I originally hoped: Because druntime
uses TLS before it even uses the GC, I can't use the GC to allocate
memory in the emutls implementation. And as core.thread uses TLS, I
can't store the per-thread array in the Thread class as I initially
wanted. Because of that, the D reimplementation is now 1:1 the same as
the original libgcc implementation. In that case, it might be better to
change libgcc, as we then still have ABI compatibility between C/D
emutls variables.
Attached is version 2 of the original patch. This keeps the sizes for
all threads' allocations in one array, so the memory overhead is
constant with respect to the number of threads.
When running the testsuite I realized there's one very unfortunate
regression with the attached patch: It breaks asan for emutls.c. The
reason is simple: I now allocate memory when the emutls mutex is locked.
Asan hooks into that memory allocation but it also internally uses TLS
so it calls __emutls_get_addr again and then deadlocks as it tries to
lock the emutls mutex again:
#2 0x00007ffff721b4d7 in __gthread_mutex_lock (__mutex=0x7ffff7221420
<emutls_mutex>) at ./gthr-default.h:749
#3 __emutls_get_address (obj=obj@entry=0x7ffff7725000
<__emutls_v._ZN6__lsan15disable_counterE>) at
../../../gcc/libgcc/emutls.c:294
#4 0x00007ffff76d4590 in __lsan::DisabledInThisThread () at
../../../../gcc/libsanitizer/lsan/lsan_common_linux.cc:42
#5 0x00007ffff75cdfa8 in __asan::Allocator::Allocate
(this=this@entry=0x7ffff77250c0 <__asan::instance>, size=size@entry=272,
alignment=<optimized out>,
alignment@entry=8, stack=<optimized out>,
alloc_type=alloc_type@entry=__asan::FROM_MALLOC,
can_fill=can_fill@entry=false)
at ../../../../gcc/libsanitizer/asan/asan_allocator.cc:537
#6 0x00007ffff75ca4fc in __asan::Allocator::Calloc
(stack=0x7fffffffd980, size=size@entry=272, nmemb=nmemb@entry=272,
this=0x7ffff77250c0 <__asan::instance>)
at ../../../../gcc/libsanitizer/asan/asan_allocator.cc:680
#7 __asan::asan_calloc (nmemb=nmemb@entry=272, size=size@entry=1,
stack=stack@entry=0x7fffffffd980) at
../../../../gcc/libsanitizer/asan/asan_allocator.cc:878
#8 0x00007ffff76a9e72 in __interceptor_calloc (nmemb=272, size=1) at
../../../../gcc/libsanitizer/asan/asan_malloc_linux.cc:154
#9 0x00007ffff721b700 in emutls_save_size (size=32, offset=1) at
../../../gcc/libgcc/emutls.c:153
Does anybody have an idea how to fix this or any comments in general
related to GC / emutls integration?
Best regards,
Johannes
>From cabe88b4711c451178d4211bfcd4e28cce9d5253 Mon Sep 17 00:00:00 2001
From: Johannes Pfau <johannesp...@gmail.com>
Date: Sun, 3 Mar 2019 15:18:05 +0100
Subject: [PATCH] libgcc: Introduce emutls iterate_memory hook
---
libgcc/emutls.c | 153 +++++++++++++++++++++++++++++++++++++++
libgcc/libgcc-std.ver.in | 5 ++
2 files changed, 158 insertions(+)
diff --git a/libgcc/emutls.c b/libgcc/emutls.c
index c725142e465..83c7572d20c 100644
--- a/libgcc/emutls.c
+++ b/libgcc/emutls.c
@@ -50,8 +50,17 @@ struct __emutls_array
void **data[];
};
+struct __emutls_word_array
+{
+ pointer size;
+ word data[];
+};
+
void *__emutls_get_address (struct __emutls_object *);
void __emutls_register_common (struct __emutls_object *, word, word, void *);
+// iterate all allocated TLS memory
+typedef void (*iterate_memory_cb)(void* mem, word size, void *user);
+void __emutls_iterate_memory (iterate_memory_cb cb, void *user);
#ifdef __GTHREADS
#ifdef __GTHREAD_MUTEX_INIT
@@ -62,10 +71,150 @@ static __gthread_mutex_t emutls_mutex;
static __gthread_key_t emutls_key;
static pointer emutls_size;
+// Array of allocated __emutls_array for all threads
+static struct __emutls_array *emutls_threads_array;
+// Size of allocated memory for variable
+static struct __emutls_word_array *emutls_size_array;
+
+static void
+emutls_array_register (struct __emutls_array *arr)
+{
+ __gthread_mutex_lock (&emutls_mutex);
+
+ if (emutls_threads_array == NULL)
+ {
+ emutls_threads_array = calloc (32 + 1, sizeof (void *));
+ emutls_threads_array->size = 32;
+ }
+
+ // Try to write to an empty slot
+ pointer slot_index = 0;
+ for (; slot_index < emutls_threads_array->size; slot_index++)
+ {
+ if (emutls_threads_array->data[slot_index] == NULL)
+ {
+ emutls_threads_array->data[slot_index] = (void *) arr;
+ break;
+ }
+ }
+ // No empty slot?
+ if (slot_index == emutls_threads_array->size)
+ {
+ emutls_threads_array = realloc (emutls_threads_array,
+ (slot_index + 2) * sizeof (void *));
+ if (emutls_threads_array == NULL)
+ abort ();
+ emutls_threads_array->size = slot_index + 1;
+ emutls_threads_array->data[slot_index] = (void *) arr;
+ }
+
+ __gthread_mutex_unlock (&emutls_mutex);
+}
+
+static void
+emutls_array_update (struct __emutls_array *old, struct __emutls_array *updated)
+{
+ if (updated == old)
+ return;
+
+ __gthread_mutex_lock (&emutls_mutex);
+
+ for (pointer slot_index = 0; slot_index < emutls_threads_array->size;
+ slot_index++)
+ {
+ if (emutls_threads_array->data[slot_index] == (void *) old)
+ emutls_threads_array->data[slot_index] = (void *) updated;
+ }
+
+ __gthread_mutex_unlock (&emutls_mutex);
+}
+
+static void
+emutls_array_unregister (struct __emutls_array *arr)
+{
+ __gthread_mutex_lock (&emutls_mutex);
+
+ for (pointer slot_index = 0; slot_index < emutls_threads_array->size;
+ slot_index++)
+ {
+ if (emutls_threads_array->data[slot_index] == (void *) arr)
+ emutls_threads_array->data[slot_index] = NULL;
+ }
+
+ __gthread_mutex_unlock (&emutls_mutex);
+}
+
+static void
+emutls_save_size (pointer offset, word size)
+{
+ if (__builtin_expect (emutls_size_array == NULL, 0))
+ {
+ pointer size = offset + 32;
+ emutls_size_array = calloc (sizeof(struct __emutls_word_array) +
+ size * sizeof (word), 1);
+ if (emutls_size_array == NULL)
+ abort ();
+ emutls_size_array->size = size;
+ }
+ else if (__builtin_expect (offset > emutls_size_array->size, 0))
+ {
+ pointer orig_size = emutls_size_array->size;
+ pointer size = orig_size * 2;
+ if (offset > size)
+ size = offset + 32;
+ emutls_size_array = realloc (emutls_size_array,
+ sizeof(struct __emutls_word_array) + size * sizeof (word));
+ if (emutls_size_array == NULL)
+ abort ();
+ emutls_size_array->size = size;
+ memset (emutls_size_array->data + orig_size, 0,
+ (size - orig_size) * sizeof (word));
+ }
+
+ emutls_size_array->data[offset - 1] = size;
+}
+
+static void
+emutls_array_iterate (struct __emutls_array *arr, iterate_memory_cb cb,
+ void *user)
+{
+ if (arr == NULL)
+ return;
+
+ for (pointer i = 0; i < arr->size; i++)
+ {
+ void *ptr = arr->data[i];
+ if (ptr)
+ {
+ word size = emutls_size_array->data[i];
+ cb (ptr, size, user);
+ }
+ }
+}
+
+void
+__emutls_iterate_memory (iterate_memory_cb cb, void *user)
+{
+ __gthread_mutex_lock (&emutls_mutex);
+ if (emutls_threads_array == NULL)
+ return;
+
+ for (pointer slot_index = 0; slot_index < emutls_threads_array->size;
+ slot_index++)
+ {
+ struct __emutls_array *arr =
+ (struct __emutls_array *) emutls_threads_array->data[slot_index];
+ emutls_array_iterate (arr, cb, user);
+ }
+
+ __gthread_mutex_unlock (&emutls_mutex);
+}
+
static void
emutls_destroy (void *ptr)
{
struct __emutls_array *arr = ptr;
+ emutls_array_unregister (arr);
pointer size = arr->size;
pointer i;
@@ -148,6 +297,7 @@ __emutls_get_address (struct __emutls_object *obj)
{
offset = ++emutls_size;
__atomic_store_n (&obj->loc.offset, offset, __ATOMIC_RELEASE);
+ emutls_save_size (offset, obj->size);
}
__gthread_mutex_unlock (&emutls_mutex);
}
@@ -161,9 +311,11 @@ __emutls_get_address (struct __emutls_object *obj)
abort ();
arr->size = size;
__gthread_setspecific (emutls_key, (void *) arr);
+ emutls_array_register (arr);
}
else if (__builtin_expect (offset > arr->size, 0))
{
+ struct __emutls_array *orig_arr = arr;
pointer orig_size = arr->size;
pointer size = orig_size * 2;
if (offset > size)
@@ -175,6 +327,7 @@ __emutls_get_address (struct __emutls_object *obj)
memset (arr->data + orig_size, 0,
(size - orig_size) * sizeof (void *));
__gthread_setspecific (emutls_key, (void *) arr);
+ emutls_array_update (orig_arr, arr);
}
void *ret = arr->data[offset - 1];
diff --git a/libgcc/libgcc-std.ver.in b/libgcc/libgcc-std.ver.in
index d5352b6f74e..b7c016f6a85 100644
--- a/libgcc/libgcc-std.ver.in
+++ b/libgcc/libgcc-std.ver.in
@@ -1944,3 +1944,8 @@ GCC_7.0.0 {
__PFX__divmoddi4
__PFX__divmodti4
}
+
+%inherit GCC_9.0.0 GCC_7.0.0
+GCC_9.0.0 {
+ __emutls_iterate_memory
+}
--
2.19.2