[Mesa-dev] [PATCH 15/20] radeonsi: add infrastr. for compiling optimized shader variants asynchronously

Marek Olšák Wed, 16 Nov 2016 10:40:55 -0800

From: Marek Olšák <marek.ol...@amd.com>

---
 src/gallium/drivers/radeonsi/si_shader.h        |   7 ++
 src/gallium/drivers/radeonsi/si_state_shaders.c | 136 ++++++++++++++++++------
 2 files changed, 109 insertions(+), 34 deletions(-)


diff --git a/src/gallium/drivers/radeonsi/si_shader.h 
b/src/gallium/drivers/radeonsi/si_shader.h
index 2ed0cb7..38aa361 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -428,20 +428,24 @@ struct si_shader_key {
        /* Flags for monolithic compilation only. */
        union {
                struct {
                        /* One pair of bits for every input: SI_FIX_FETCH_* 
enums. */
                        uint32_t        fix_fetch;
                } vs;
                struct {
                        uint64_t        inputs_to_copy; /* for fixed-func TCS */
                } tcs;
        } mono;
+
+       /* Optimization flags for asynchronous compilation only. */
+       union {
+       } opt;
 };
 
 struct si_shader_config {
        unsigned                        num_sgprs;
        unsigned                        num_vgprs;
        unsigned                        spilled_sgprs;
        unsigned                        spilled_vgprs;
        unsigned                        lds_size;
        unsigned                        spi_ps_input_ena;
        unsigned                        spi_ps_input_addr;
@@ -477,21 +481,24 @@ struct si_shader {
        struct si_shader_selector       *selector;
        struct si_shader                *next_variant;
 
        struct si_shader_part           *prolog;
        struct si_shader_part           *epilog;
 
        struct si_pm4_state             *pm4;
        struct r600_resource            *bo;
        struct r600_resource            *scratch_bo;
        struct si_shader_key            key;
+       struct util_queue_fence         optimized_ready;
+       bool                            compilation_failed;
        bool                            is_monolithic;
+       bool                            is_optimized;
        bool                            is_binary_shared;
        bool                            is_gs_copy_shader;
 
        /* The following data is all that's needed for binary shaders. */
        struct radeon_shader_binary     binary;
        struct si_shader_config         config;
        struct si_shader_info           info;
 
        /* Shader key + LLVM IR + disassembly + statistics.
         * Generated for debug contexts only.
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c 
b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 72d0518..00ccbbd 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1008,118 +1008,182 @@ static inline void si_shader_selector_key(struct 
pipe_context *ctx,
                }
 
                key->part.ps.epilog.alpha_func = si_get_alpha_test_func(sctx);
                break;
        }
        default:
                assert(0);
        }
 }
 
+static void si_build_shader_variant(void *job, int thread_index)
+{
+       struct si_shader *shader = (struct si_shader *)job;
+       struct si_shader_selector *sel = shader->selector;
+       struct si_screen *sscreen = sel->screen;
+       LLVMTargetMachineRef tm;
+       struct pipe_debug_callback *debug = &sel->debug;
+       int r;
+
+       if (thread_index >= 0) {
+               assert(thread_index < ARRAY_SIZE(sscreen->tm));
+               tm = sscreen->tm[thread_index];
+               if (!debug->async)
+                       debug = NULL;
+       } else {
+               tm = sel->tm;
+       }
+
+       r = si_shader_create(sscreen, tm, shader, debug);
+       if (unlikely(r)) {
+               R600_ERR("Failed to build shader variant (type=%u) %d\n",
+                        sel->type, r);
+               shader->compilation_failed = true;
+               return;
+       }
+
+       if (sel->is_debug_context) {
+               FILE *f = open_memstream(&shader->shader_log,
+                                        &shader->shader_log_size);
+               if (f) {
+                       si_shader_dump(sscreen, shader, NULL, sel->type, f);
+                       fclose(f);
+               }
+       }
+
+       si_shader_init_pm4_state(sscreen, shader);
+}
+
 /* Select the hw shader variant depending on the current state. */
 static int si_shader_select_with_key(struct si_screen *sscreen,
                                     struct si_shader_ctx_state *state,
                                     struct si_shader_key *key,
-                                    LLVMTargetMachineRef tm,
-                                    struct pipe_debug_callback *debug,
-                                    bool wait,
-                                    bool is_debug_context)
+                                    int thread_index)
 {
        static const struct si_shader_key zeroed;
        struct si_shader_selector *sel = state->cso;
        struct si_shader *current = state->current;
        struct si_shader *iter, *shader = NULL;
-       int r;
-
+again:
        /* Check if we don't need to change anything.
         * This path is also used for most shaders that don't need multiple
         * variants, it will cost just a computation of the key and this
         * test. */
-       if (likely(current && memcmp(&current->key, key, sizeof(*key)) == 0))
+       if (likely(current &&
+                  memcmp(&current->key, key, sizeof(*key)) == 0 &&
+                  (!current->is_optimized ||
+                   util_queue_fence_is_signalled(&current->optimized_ready))))
                return 0;
 
        /* This must be done before the mutex is locked, because async GS
         * compilation calls this function too, and therefore must enter
         * the mutex first.
+        *
+        * Only wait if we are in a draw call. Don't wait if we are
+        * in a compiler thread.
         */
-       if (wait)
+       if (thread_index < 0)
                util_queue_job_wait(&sel->ready);
 
        pipe_mutex_lock(sel->mutex);
 
        /* Find the shader variant. */
        for (iter = sel->first_variant; iter; iter = iter->next_variant) {
                /* Don't check the "current" shader. We checked it above. */
                if (current != iter &&
                    memcmp(&iter->key, key, sizeof(*key)) == 0) {
+                       /* If it's an optimized shader and its compilation has
+                        * been started but isn't done, use the unoptimized
+                        * shader so as not to cause a stall due to compilation.
+                        */
+                       if (iter->is_optimized &&
+                           
!util_queue_fence_is_signalled(&iter->optimized_ready)) {
+                               memset(&key->opt, 0, sizeof(key->opt));
+                               pipe_mutex_unlock(sel->mutex);
+                               goto again;
+                       }
+
+                       if (iter->compilation_failed) {
+                               pipe_mutex_unlock(sel->mutex);
+                               return -1; /* skip the draw call */
+                       }
+
                        state->current = iter;
                        pipe_mutex_unlock(sel->mutex);
                        return 0;
                }
        }
 
        /* Build a new shader. */
        shader = CALLOC_STRUCT(si_shader);
        if (!shader) {
                pipe_mutex_unlock(sel->mutex);
                return -ENOMEM;
        }
        shader->selector = sel;
        shader->key = *key;
+
+       /* Monolithic-only shaders don't make a distinction between optimized
+        * and unoptimized. */
        shader->is_monolithic =
                !sel->main_shader_part ||
                sel->main_shader_part->key.as_ls != key->as_ls ||
                sel->main_shader_part->key.as_es != key->as_es ||
+               memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0 ||
                memcmp(&key->mono, &zeroed.mono, sizeof(key->mono)) != 0;
 
-       r = si_shader_create(sscreen, tm, shader, debug);
-       if (unlikely(r)) {
-               R600_ERR("Failed to build shader variant (type=%u) %d\n",
-                        sel->type, r);
-               FREE(shader);
-               pipe_mutex_unlock(sel->mutex);
-               return r;
-       }
-
-       if (is_debug_context) {
-               FILE *f = open_memstream(&shader->shader_log,
-                                        &shader->shader_log_size);
-               if (f) {
-                       si_shader_dump(sscreen, shader, NULL, sel->type, f);
-                       fclose(f);
-               }
-       }
-
-       si_shader_init_pm4_state(sscreen, shader);
+       shader->is_optimized =
+               !sscreen->use_monolithic_shaders &&
+               memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0;
+       if (shader->is_optimized)
+               util_queue_fence_init(&shader->optimized_ready);
 
        if (!sel->last_variant) {
                sel->first_variant = shader;
                sel->last_variant = shader;
        } else {
                sel->last_variant->next_variant = shader;
                sel->last_variant = shader;
        }
-       state->current = shader;
+
+       /* If it's an optimized shader, compile it asynchronously. */
+       if (shader->is_optimized &&
+           thread_index < 0) {
+               /* Compile it asynchronously. */
+               util_queue_add_job(&sscreen->shader_compiler_queue,
+                                  shader, &shader->optimized_ready,
+                                  si_build_shader_variant, NULL);
+
+               /* Use the default (unoptimized) shader for now. */
+               memset(&key->opt, 0, sizeof(key->opt));
+               pipe_mutex_unlock(sel->mutex);
+               goto again;
+       }
+
+       assert(!shader->is_optimized);
+       si_build_shader_variant(shader, thread_index);
+
+       if (!shader->compilation_failed)
+               state->current = shader;
+
        pipe_mutex_unlock(sel->mutex);
-       return 0;
+       return shader->compilation_failed ? -1 : 0;
 }
 
 static int si_shader_select(struct pipe_context *ctx,
                            struct si_shader_ctx_state *state)
 {
        struct si_context *sctx = (struct si_context *)ctx;
        struct si_shader_key key;
 
        si_shader_selector_key(ctx, state->cso, &key);
-       return si_shader_select_with_key(sctx->screen, state, &key,
-                                        sctx->tm, &sctx->b.debug, true,
-                                        sctx->is_debug);
+       return si_shader_select_with_key(sctx->screen, state, &key, -1);
 }
 
 static void si_parse_next_shader_property(const struct tgsi_shader_info *info,
                                          struct si_shader_key *key)
 {
        unsigned next_shader = info->properties[TGSI_PROPERTY_NEXT_SHADER];
 
        switch (info->processor) {
        case PIPE_SHADER_VERTEX:
                switch (next_shader) {
@@ -1240,22 +1304,21 @@ void si_init_shader_selector_async(void *job, int 
thread_index)
                                sel->info.uses_linear_center &&
                                sel->info.uses_linear_centroid;
                        key.part.ps.epilog.alpha_func = PIPE_FUNC_ALWAYS;
                        for (i = 0; i < 8; i++)
                                if (sel->info.colors_written & (1 << i))
                                        
key.part.ps.epilog.spi_shader_col_format |=
                                                V_028710_SPI_SHADER_FP16_ABGR 
<< (i * 4);
                        break;
                }
 
-               if (si_shader_select_with_key(sscreen, &state, &key, tm, debug,
-                                             false, sel->is_debug_context))
+               if (si_shader_select_with_key(sscreen, &state, &key, 
thread_index))
                        fprintf(stderr, "radeonsi: can't create a monolithic 
shader\n");
        }
 
        /* The GS copy shader is always pre-compiled. */
        if (sel->type == PIPE_SHADER_GEOMETRY) {
                sel->gs_copy_shader = si_generate_gs_copy_shader(sscreen, tm, 
sel, debug);
                if (!sel->gs_copy_shader) {
                        fprintf(stderr, "radeonsi: can't create GS copy 
shader\n");
                        return;
                }
@@ -1517,20 +1580,25 @@ static void si_bind_ps_shader(struct pipe_context *ctx, 
void *state)
                return;
 
        sctx->ps_shader.cso = sel;
        sctx->ps_shader.current = sel ? sel->first_variant : NULL;
        sctx->do_update_shaders = true;
        si_mark_atom_dirty(sctx, &sctx->cb_render_state);
 }
 
 static void si_delete_shader(struct si_context *sctx, struct si_shader *shader)
 {
+       if (shader->is_optimized) {
+               util_queue_job_wait(&shader->optimized_ready);
+               util_queue_fence_destroy(&shader->optimized_ready);
+       }
+
        if (shader->pm4) {
                switch (shader->selector->type) {
                case PIPE_SHADER_VERTEX:
                        if (shader->key.as_ls)
                                si_pm4_delete_state(sctx, ls, shader->pm4);
                        else if (shader->key.as_es)
                                si_pm4_delete_state(sctx, es, shader->pm4);
                        else
                                si_pm4_delete_state(sctx, vs, shader->pm4);
                        break;
-- 
2.7.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 15/20] radeonsi: add infrastr. for compiling optimized shader variants asynchronously

Reply via email to