From: Marek Olšák <marek.ol...@amd.com> v2: use set_context_param --- src/gallium/auxiliary/util/u_helpers.c | 42 +++++++++++++++++++ src/gallium/auxiliary/util/u_helpers.h | 4 ++ src/mesa/state_tracker/st_context.c | 3 ++ src/mesa/state_tracker/st_manager.c | 9 ++++ src/util/u_thread.h | 57 ++++++++++++++++++++++++++ 5 files changed, 115 insertions(+)
diff --git a/src/gallium/auxiliary/util/u_helpers.c b/src/gallium/auxiliary/util/u_helpers.c index 25d8fbce6f7..f773360adde 100644 --- a/src/gallium/auxiliary/util/u_helpers.c +++ b/src/gallium/auxiliary/util/u_helpers.c @@ -18,23 +18,25 @@ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS AND/OR THEIR SUPPLIERS BE LIABLE FOR * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * **************************************************************************/ +#include "util/u_cpu_detect.h" #include "util/u_helpers.h" #include "util/u_inlines.h" #include "util/u_upload_mgr.h" +#include "util/u_thread.h" #include <inttypes.h> /** * This function is used to copy an array of pipe_vertex_buffer structures, * while properly referencing the pipe_vertex_buffer::buffer member. * * enabled_buffers is updated such that the bits corresponding to the indices * of disabled buffers are set to 0 and the enabled ones are set to 1. * * \sa util_copy_framebuffer_state @@ -111,20 +113,60 @@ util_upload_index_buffer(struct pipe_context *pipe, u_upload_data(pipe->stream_uploader, start_offset, info->count * info->index_size, 4, (char*)info->index.user + start_offset, out_offset, out_buffer); u_upload_unmap(pipe->stream_uploader); *out_offset -= start_offset; return *out_buffer != NULL; } +/** + * Called by MakeCurrent. Used to notify the driver that the application + * thread may have been changed. + * + * The function pins the current thread and driver threads to a group of + * CPU cores that share the same L3 cache. This is needed for good multi- + * threading performance on AMD Zen CPUs. 
+ * + * \param upper_thread thread in the state tracker that also needs to be + * pinned. + */ +void +util_context_thread_changed(struct pipe_context *ctx, thrd_t *upper_thread) +{ + thrd_t current = thrd_current(); + int cache = util_get_L3_for_pinned_thread(current, + util_cpu_caps.cores_per_L3); + + /* If the main thread is not pinned, choose the L3 cache. */ + if (cache == -1) { + unsigned num_caches = util_cpu_caps.nr_cpus / + util_cpu_caps.cores_per_L3; + static unsigned last_cache; + + /* Choose a different L3 cache for each subsequent MakeCurrent. */ + cache = p_atomic_inc_return(&last_cache) % num_caches; + util_pin_thread_to_L3(current, cache, util_cpu_caps.cores_per_L3); + } + + /* Tell the driver to pin its threads to the same L3 cache. */ + if (ctx->set_context_param) { + ctx->set_context_param(ctx, PIPE_CONTEXT_PARAM_PIN_THREADS_TO_L3_CACHE, + cache); + } + + /* Do the same for the upper level thread if there is any (e.g. glthread) */ + if (upper_thread) + util_pin_thread_to_L3(*upper_thread, cache, util_cpu_caps.cores_per_L3); +} + /* This is a helper for hardware bring-up. Don't remove. */ struct pipe_query * util_begin_pipestat_query(struct pipe_context *ctx) { struct pipe_query *q = ctx->create_query(ctx, PIPE_QUERY_PIPELINE_STATISTICS, 0); if (!q) return NULL; ctx->begin_query(ctx, q); diff --git a/src/gallium/auxiliary/util/u_helpers.h b/src/gallium/auxiliary/util/u_helpers.h index e65e64d7781..38c47c1cc98 100644 --- a/src/gallium/auxiliary/util/u_helpers.h +++ b/src/gallium/auxiliary/util/u_helpers.h @@ -22,20 +22,21 @@ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
* **************************************************************************/ #ifndef U_HELPERS_H #define U_HELPERS_H #include "pipe/p_state.h" +#include "c11/threads.h" #include <stdio.h> #ifdef __cplusplus extern "C" { #endif void util_set_vertex_buffers_mask(struct pipe_vertex_buffer *dst, uint32_t *enabled_buffers, const struct pipe_vertex_buffer *src, unsigned start_slot, unsigned count); @@ -43,20 +44,23 @@ void util_set_vertex_buffers_mask(struct pipe_vertex_buffer *dst, void util_set_vertex_buffers_count(struct pipe_vertex_buffer *dst, unsigned *dst_count, const struct pipe_vertex_buffer *src, unsigned start_slot, unsigned count); bool util_upload_index_buffer(struct pipe_context *pipe, const struct pipe_draw_info *info, struct pipe_resource **out_buffer, unsigned *out_offset); +void +util_context_thread_changed(struct pipe_context *ctx, thrd_t *upper_thread); + struct pipe_query * util_begin_pipestat_query(struct pipe_context *ctx); void util_end_pipestat_query(struct pipe_context *ctx, struct pipe_query *q, FILE *f); void util_wait_for_idle(struct pipe_context *ctx); diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c index edcbd36a1bf..354876746f4 100644 --- a/src/mesa/state_tracker/st_context.c +++ b/src/mesa/state_tracker/st_context.c @@ -72,20 +72,21 @@ #include "st_draw.h" #include "st_extensions.h" #include "st_gen_mipmap.h" #include "st_pbo.h" #include "st_program.h" #include "st_sampler_view.h" #include "st_shader_cache.h" #include "st_vdpau.h" #include "st_texture.h" #include "pipe/p_context.h" +#include "util/u_cpu_detect.h" #include "util/u_inlines.h" #include "util/u_upload_mgr.h" #include "util/u_vbuf.h" #include "cso_cache/cso_context.h" DEBUG_GET_ONCE_BOOL_OPTION(mesa_mvp_dp4, "MESA_MVP_DP4", FALSE) /** @@ -561,20 +562,22 @@ st_create_context(gl_api api, struct pipe_context *pipe, const struct gl_config *visual, struct st_context *share, const struct st_config_options *options, bool no_error) { struct 
gl_context *ctx; struct gl_context *shareCtx = share ? share->ctx : NULL; struct dd_function_table funcs; struct st_context *st; + util_cpu_detect(); + memset(&funcs, 0, sizeof(funcs)); st_init_driver_functions(pipe->screen, &funcs); ctx = calloc(1, sizeof(struct gl_context)); if (!ctx) return NULL; if (!_mesa_initialize_context(ctx, api, visual, shareCtx, &funcs)) { free(ctx); return NULL; diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c index 69286b57916..7a37f9850f8 100644 --- a/src/mesa/state_tracker/st_manager.c +++ b/src/mesa/state_tracker/st_manager.c @@ -1056,20 +1056,29 @@ st_api_make_current(struct st_api *stapi, struct st_context_iface *stctxi, ret = _mesa_make_current(st->ctx, incomplete, incomplete); } st_framebuffer_reference(&stdraw, NULL); st_framebuffer_reference(&stread, NULL); /* Purge the context's winsys_buffers list in case any * of the referenced drawables no longer exist. */ st_framebuffers_purge(st); + + /* Notify the driver that the context thread may have been changed. + * This should pin all driver threads to a specific L3 cache for optimal + * performance on AMD Zen CPUs. + */ + struct glthread_state *glthread = st->ctx->GLThread; + thrd_t *upper_thread = glthread ? 
/**
 * An AMD Zen CPU consists of multiple modules where each module has its own L3
 * cache. Inter-thread communication such as locks and atomics between modules
 * is very expensive. It's desirable to pin a group of closely cooperating
 * threads to one group of cores sharing L3.
 *
 * The affinity mask covers cores
 * [L3_index * cores_per_L3, (L3_index + 1) * cores_per_L3), clipped to
 * CPU_SETSIZE. Pinning is best-effort: if the resulting mask is empty or the
 * affinity call fails, the thread's affinity is left as it was. Without
 * HAVE_PTHREAD the function does nothing.
 *
 * \param thread        thread to pin
 * \param L3_index      index of the L3 cache
 * \param cores_per_L3  number of CPU cores sharing one L3
 */
static inline void
util_pin_thread_to_L3(thrd_t thread, unsigned L3_index, unsigned cores_per_L3)
{
#if defined(HAVE_PTHREAD)
   cpu_set_t cpuset;
   unsigned num_set = 0;

   CPU_ZERO(&cpuset);
   for (unsigned i = 0; i < cores_per_L3; i++) {
      /* 64-bit math so that a bogus L3_index cannot wrap around and end up
       * selecting an unrelated, valid core number.
       */
      const uint64_t cpu = (uint64_t)L3_index * cores_per_L3 + i;

      /* cpu_set_t cannot represent cores at or beyond CPU_SETSIZE. */
      if (cpu >= CPU_SETSIZE)
         break;

      CPU_SET((unsigned)cpu, &cpuset);
      num_set++;
   }

   /* An empty mask is never a valid affinity; keep the current one. */
   if (num_set == 0)
      return;

   /* Best-effort: a failure only means the thread stays unpinned. */
   (void)pthread_setaffinity_np(thread, sizeof(cpuset), &cpuset);
#else
   (void)thread;
   (void)L3_index;
   (void)cores_per_L3;
#endif
}
/**
 * Return the index of L3 that the thread is pinned to.
 *
 * -1 is returned when no single L3 can be named: the thread's affinity mask
 * spans cores of more than one L3, the mask is empty, the affinity cannot be
 * queried (including builds without HAVE_PTHREAD), or cores_per_L3 is 0.
 *
 * \param thread        thread
 * \param cores_per_L3  number of CPU cores sharing one L3
 */
static inline int
util_get_L3_for_pinned_thread(thrd_t thread, unsigned cores_per_L3)
{
   /* An unknown topology would make the division below undefined. */
   if (cores_per_L3 == 0)
      return -1;

#if defined(HAVE_PTHREAD)
   cpu_set_t cpuset;

   if (pthread_getaffinity_np(thread, sizeof(cpuset), &cpuset) != 0)
      return -1;

   int L3_index = -1;

   for (unsigned i = 0; i < CPU_SETSIZE; i++) {
      if (!CPU_ISSET(i, &cpuset))
         continue;

      const int x = i / cores_per_L3;

      if (L3_index == -1)
         L3_index = x;  /* first allowed core decides the candidate L3 */
      else if (L3_index != x)
         return -1;     /* multiple L3s are set */
   }
   return L3_index;
#else
   (void)thread;
   return -1;
#endif
}