From: Nicolai Hähnle <nicolai.haeh...@amd.com>

There's a race condition between si_shader_select_with_key and si_bind_XX_shader:
Thread 1 Thread 2 -------- -------- si_shader_select_with_key begin compiling the first variant (guarded by sel->mutex) si_bind_XX_shader select first_variant by default as state->current si_shader_select_with_key match state->current and early-out Since thread 2 never takes sel->mutex, it may go on rendering without a PM4 for that shader, for example. The solution taken by this patch is to broaden the scope of shader->optimized_ready to a fence shader->ready that applies to all shaders. This does not hurt the fast path (if anything it makes it faster, because we don't explicitly check is_optimized). It will also allow reducing the scope of sel->mutex locks, but this is deferred to a later commit for better bisectability. Fixes dEQP-EGL.functional.sharing.gles2.multithread.simple.buffers.bufferdata_render Reviewed-by: Marek Olšák <marek.ol...@amd.com> --- src/gallium/drivers/radeonsi/si_shader.c | 3 + src/gallium/drivers/radeonsi/si_shader.h | 2 +- src/gallium/drivers/radeonsi/si_state_shaders.c | 88 ++++++++++++++++++------- 3 files changed, 67 insertions(+), 26 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 6bc08dd3890..195c1cd7d94 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -5388,20 +5388,23 @@ si_generate_gs_copy_shader(struct si_screen *sscreen, if (!outputs) return NULL; shader = CALLOC_STRUCT(si_shader); if (!shader) { FREE(outputs); return NULL; } + /* We can leave the fence as permanently signaled because the GS copy + * shader only becomes visible globally after it has been compiled. 
*/ + util_queue_fence_init(&shader->ready); shader->selector = gs_selector; shader->is_gs_copy_shader = true; si_init_shader_ctx(&ctx, sscreen, tm); ctx.shader = shader; ctx.type = PIPE_SHADER_VERTEX; builder = ctx.ac.builder; diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index b8f9d18f5c8..41851627a87 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -586,21 +586,21 @@ struct si_shader { struct si_shader_part *prolog; struct si_shader *previous_stage; /* for GFX9 */ struct si_shader_part *prolog2; struct si_shader_part *epilog; struct si_pm4_state *pm4; struct r600_resource *bo; struct r600_resource *scratch_bo; struct si_shader_key key; - struct util_queue_fence optimized_ready; + struct util_queue_fence ready; bool compilation_failed; bool is_monolithic; bool is_optimized; bool is_binary_shared; bool is_gs_copy_shader; /* The following data is all that's needed for binary shaders. */ struct ac_shader_binary binary; struct si_shader_config config; struct si_shader_info info; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index b72f6fc74a4..4c0292404e6 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1545,20 +1545,25 @@ static bool si_check_missing_main_part(struct si_screen *sscreen, struct si_shader_key *key) { struct si_shader **mainp = si_get_main_shader_part(sel, key); if (!*mainp) { struct si_shader *main_part = CALLOC_STRUCT(si_shader); if (!main_part) return false; + /* We can leave the fence as permanently signaled because the + * main part becomes visible globally only after it has been + * compiled. 
*/ + util_queue_fence_init(&main_part->ready); + main_part->selector = sel; main_part->key.as_es = key->as_es; main_part->key.as_ls = key->as_ls; if (si_compile_tgsi_shader(sscreen, compiler_state->tm, main_part, false, &compiler_state->debug) != 0) { FREE(main_part); return false; } @@ -1578,70 +1583,85 @@ static int si_shader_select_with_key(struct si_screen *sscreen, struct si_shader_selector *previous_stage_sel = NULL; struct si_shader *current = state->current; struct si_shader *iter, *shader = NULL; again: /* Check if we don't need to change anything. * This path is also used for most shaders that don't need multiple * variants, it will cost just a computation of the key and this * test. */ if (likely(current && - memcmp(¤t->key, key, sizeof(*key)) == 0 && - (!current->is_optimized || - util_queue_fence_is_signalled(¤t->optimized_ready)))) + memcmp(¤t->key, key, sizeof(*key)) == 0)) { + if (unlikely(!util_queue_fence_is_signalled(¤t->ready))) { + if (current->is_optimized) { + memset(&key->opt, 0, sizeof(key->opt)); + goto current_not_ready; + } + + util_queue_fence_wait(¤t->ready); + } + return current->compilation_failed ? -1 : 0; + } +current_not_ready: /* This must be done before the mutex is locked, because async GS * compilation calls this function too, and therefore must enter * the mutex first. * * Only wait if we are in a draw call. Don't wait if we are * in a compiler thread. */ if (thread_index < 0) util_queue_fence_wait(&sel->ready); mtx_lock(&sel->mutex); /* Find the shader variant. */ for (iter = sel->first_variant; iter; iter = iter->next_variant) { /* Don't check the "current" shader. We checked it above. */ if (current != iter && memcmp(&iter->key, key, sizeof(*key)) == 0) { - /* If it's an optimized shader and its compilation has - * been started but isn't done, use the unoptimized - * shader so as not to cause a stall due to compilation. 
- */ - if (iter->is_optimized && - !util_queue_fence_is_signalled(&iter->optimized_ready)) { - memset(&key->opt, 0, sizeof(key->opt)); - mtx_unlock(&sel->mutex); - goto again; + if (unlikely(!util_queue_fence_is_signalled(&iter->ready))) { + /* If it's an optimized shader and its compilation has + * been started but isn't done, use the unoptimized + * shader so as not to cause a stall due to compilation. + */ + if (iter->is_optimized) { + memset(&key->opt, 0, sizeof(key->opt)); + mtx_unlock(&sel->mutex); + goto again; + } + + util_queue_fence_wait(&iter->ready); } if (iter->compilation_failed) { mtx_unlock(&sel->mutex); return -1; /* skip the draw call */ } state->current = iter; mtx_unlock(&sel->mutex); return 0; } } /* Build a new shader. */ shader = CALLOC_STRUCT(si_shader); if (!shader) { mtx_unlock(&sel->mutex); return -ENOMEM; } + + util_queue_fence_init(&shader->ready); + shader->selector = sel; shader->key = *key; shader->compiler_ctx_state = *compiler_state; /* If this is a merged shader, get the first shader's selector. */ if (sscreen->b.chip_class >= GFX9) { if (sel->type == PIPE_SHADER_TESS_CTRL) previous_stage_sel = key->part.tcs.ls; else if (sel->type == PIPE_SHADER_GEOMETRY) previous_stage_sel = key->part.gs.es; @@ -1704,49 +1724,62 @@ again: /* Monolithic-only shaders don't make a distinction between optimized * and unoptimized. */ shader->is_monolithic = is_pure_monolithic || memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; shader->is_optimized = !is_pure_monolithic && memcmp(&key->opt, &zeroed.opt, sizeof(key->opt)) != 0; - if (shader->is_optimized) - util_queue_fence_init(&shader->optimized_ready); - - if (!sel->last_variant) { - sel->first_variant = shader; - sel->last_variant = shader; - } else { - sel->last_variant->next_variant = shader; - sel->last_variant = shader; - } /* If it's an optimized shader, compile it asynchronously. */ if (shader->is_optimized && !is_pure_monolithic && thread_index < 0) { /* Compile it asynchronously. 
*/ util_queue_add_job(&sscreen->shader_compiler_queue_low_priority, - shader, &shader->optimized_ready, + shader, &shader->ready, si_build_shader_variant_low_priority, NULL); + /* Add only after the ready fence was reset, to guard against a + * race with si_bind_XX_shader. */ + if (!sel->last_variant) { + sel->first_variant = shader; + sel->last_variant = shader; + } else { + sel->last_variant->next_variant = shader; + sel->last_variant = shader; + } + /* Use the default (unoptimized) shader for now. */ memset(&key->opt, 0, sizeof(key->opt)); mtx_unlock(&sel->mutex); goto again; } + /* Reset the fence before adding to the variant list. */ + util_queue_fence_reset(&shader->ready); + + if (!sel->last_variant) { + sel->first_variant = shader; + sel->last_variant = shader; + } else { + sel->last_variant->next_variant = shader; + sel->last_variant = shader; + } + assert(!shader->is_optimized); si_build_shader_variant(shader, thread_index, false); + util_queue_fence_signal(&shader->ready); + if (!shader->compilation_failed) state->current = shader; mtx_unlock(&sel->mutex); return shader->compilation_failed ? -1 : 0; } static int si_shader_select(struct pipe_context *ctx, struct si_shader_ctx_state *state, struct si_compiler_ctx_state *compiler_state) @@ -1822,20 +1855,24 @@ static void si_init_shader_selector_async(void *job, int thread_index) */ if (!sscreen->use_monolithic_shaders) { struct si_shader *shader = CALLOC_STRUCT(si_shader); void *tgsi_binary = NULL; if (!shader) { fprintf(stderr, "radeonsi: can't allocate a main shader part\n"); return; } + /* We can leave the fence signaled because use of the default + * main part is guarded by the selector's ready fence. */ + util_queue_fence_init(&shader->ready); + shader->selector = sel; si_parse_next_shader_property(&sel->info, sel->so.num_outputs != 0, &shader->key); if (sel->tokens) tgsi_binary = si_get_tgsi_binary(sel); /* Try to load the shader from the shader cache. 
*/ mtx_lock(&sscreen->shader_cache_mutex); @@ -2441,24 +2478,25 @@ static void si_bind_ps_shader(struct pipe_context *ctx, void *state) sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL])) si_mark_atom_dirty(sctx, &sctx->msaa_config); } si_set_active_descriptors_for_shader(sctx, sel); } static void si_delete_shader(struct si_context *sctx, struct si_shader *shader) { if (shader->is_optimized) { util_queue_drop_job(&sctx->screen->shader_compiler_queue_low_priority, - &shader->optimized_ready); - util_queue_fence_destroy(&shader->optimized_ready); + &shader->ready); } + util_queue_fence_destroy(&shader->ready); + if (shader->pm4) { switch (shader->selector->type) { case PIPE_SHADER_VERTEX: if (shader->key.as_ls) { assert(sctx->b.chip_class <= VI); si_pm4_delete_state(sctx, ls, shader->pm4); } else if (shader->key.as_es) { assert(sctx->b.chip_class <= VI); si_pm4_delete_state(sctx, es, shader->pm4); } else { -- 2.11.0 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev