From: Marek Olšák <marek.ol...@amd.com>

not sure if this helps
---
 src/gallium/auxiliary/util/u_threaded_context.c | 11 +++++++++--
 src/gallium/auxiliary/util/u_threaded_context.h | 9 ++++++++-
 2 files changed, 17 insertions(+), 3 deletions(-)
diff --git a/src/gallium/auxiliary/util/u_threaded_context.c b/src/gallium/auxiliary/util/u_threaded_context.c index 8ea7f8a..34206bf 100644 --- a/src/gallium/auxiliary/util/u_threaded_context.c +++ b/src/gallium/auxiliary/util/u_threaded_context.c @@ -2118,21 +2118,21 @@ tc_destroy(struct pipe_context *_pipe) if (tc->base.const_uploader && tc->base.stream_uploader != tc->base.const_uploader) u_upload_destroy(tc->base.const_uploader); if (tc->base.stream_uploader) u_upload_destroy(tc->base.stream_uploader); slab_destroy_child(&tc->pool_transfers); pipe->destroy(pipe); - FREE(tc); + os_free_aligned(tc); } static const tc_execute execute_func[TC_NUM_CALLS] = { #define CALL(name) tc_call_##name, #include "u_threaded_context_calls.h" #undef CALL }; /** * Wrap an existing pipe_context into a threaded_context. @@ -2158,25 +2158,32 @@ threaded_context_create(struct pipe_context *pipe, STATIC_ASSERT(sizeof(struct tc_call) <= 16); if (!pipe) return NULL; util_cpu_detect(); if (!debug_get_bool_option("GALLIUM_THREAD", util_cpu_caps.nr_cpus > 1)) return pipe; - tc = CALLOC_STRUCT(threaded_context); + tc = os_malloc_aligned(sizeof(struct threaded_context), 16); if (!tc) { pipe->destroy(pipe); return NULL; } + memset(tc, 0, sizeof(*tc)); + + assert((uintptr_t)tc % 16 == 0); + STATIC_ASSERT(offsetof(struct threaded_context, batch_slots[0]) % 16 == 0); + STATIC_ASSERT(offsetof(struct threaded_context, batch_slots[0].call[0]) % 16 == 0); + STATIC_ASSERT(offsetof(struct threaded_context, batch_slots[0].call[1]) % 16 == 0); + STATIC_ASSERT(offsetof(struct threaded_context, batch_slots[1].call[0]) % 16 == 0); /* The driver context isn't wrapped, so set its "priv" to NULL. 
*/ pipe->priv = NULL; tc->pipe = pipe; tc->replace_buffer_storage = replace_buffer; tc->map_buffer_alignment = pipe->screen->get_param(pipe->screen, PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT); tc->base.priv = pipe; /* priv points to the wrapped driver context */ tc->base.screen = pipe->screen; diff --git a/src/gallium/auxiliary/util/u_threaded_context.h b/src/gallium/auxiliary/util/u_threaded_context.h index f139230..5d2a10c 100644 --- a/src/gallium/auxiliary/util/u_threaded_context.h +++ b/src/gallium/auxiliary/util/u_threaded_context.h @@ -266,21 +266,28 @@ struct threaded_query { * Most calls will typecast this to the type they need, typically larger * than 8 bytes. */ union tc_payload { struct pipe_query *query; struct pipe_resource *resource; struct pipe_transfer *transfer; uint64_t __use_8_bytes; }; -struct tc_call { +#ifdef _MSC_VER +#define ALIGN16 __declspec(align(16)) +#else +#define ALIGN16 __attribute__((aligned(16))) +#endif + +/* Each call slot should be aligned to its own size for optimal cache usage. */ +struct ALIGN16 tc_call { unsigned sentinel; ushort num_call_slots; ushort call_id; union tc_payload payload; }; struct tc_batch { struct pipe_context *pipe; unsigned sentinel; unsigned num_total_call_slots; -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev