On Thu, May 18, 2017 at 11:31 AM, Nicolai Hähnle <nhaeh...@gmail.com> wrote: > On 17.05.2017 21:38, Marek Olšák wrote: >> >> From: Marek Olšák <marek.ol...@amd.com> >> >> This decreases the size of CE RAM dumps to L2, or the size of descriptor >> uploads without CE. >> --- >> src/gallium/drivers/radeonsi/si_compute.c | 28 ++++++-- >> src/gallium/drivers/radeonsi/si_descriptors.c | 85 >> ++++++++++++++++++++----- >> src/gallium/drivers/radeonsi/si_state.h | 18 +++++- >> src/gallium/drivers/radeonsi/si_state_shaders.c | 6 ++ >> 4 files changed, 113 insertions(+), 24 deletions(-) >> >> diff --git a/src/gallium/drivers/radeonsi/si_compute.c >> b/src/gallium/drivers/radeonsi/si_compute.c >> index 22ef111..4c98066 100644 >> --- a/src/gallium/drivers/radeonsi/si_compute.c >> +++ b/src/gallium/drivers/radeonsi/si_compute.c >> @@ -201,21 +201,38 @@ static void *si_create_compute_state( >> return NULL; >> } >> } >> >> return program; >> } >> >> static void si_bind_compute_state(struct pipe_context *ctx, void *state) >> { >> struct si_context *sctx = (struct si_context*)ctx; >> - sctx->cs_shader_state.program = (struct si_compute*)state; >> + struct si_compute *program = (struct si_compute*)state; >> + >> + sctx->cs_shader_state.program = program; >> + if (!program) >> + return; >> + >> + /* Wait because we need active slot usage masks. 
*/ >> + if (program->ir_type == PIPE_SHADER_IR_TGSI) >> + util_queue_fence_wait(&program->ready); >> + >> + si_set_active_descriptors(sctx, >> + SI_DESCS_FIRST_COMPUTE + >> + >> SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS, >> + >> program->active_const_and_shader_buffers); >> + si_set_active_descriptors(sctx, >> + SI_DESCS_FIRST_COMPUTE + >> + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES, >> + program->active_samplers_and_images); >> } >> >> static void si_set_global_binding( >> struct pipe_context *ctx, unsigned first, unsigned n, >> struct pipe_resource **resources, >> uint32_t **handles) >> { >> unsigned i; >> struct si_context *sctx = (struct si_context*)ctx; >> struct si_compute *program = sctx->cs_shader_state.program; >> @@ -749,26 +766,23 @@ static void si_launch_grid( >> bool cs_regalloc_hang = >> (sctx->b.chip_class == SI || >> sctx->b.family == CHIP_BONAIRE || >> sctx->b.family == CHIP_KABINI) && >> info->block[0] * info->block[1] * info->block[2] > 256; >> >> if (cs_regalloc_hang) >> sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | >> SI_CONTEXT_CS_PARTIAL_FLUSH; >> >> - if (program->ir_type == PIPE_SHADER_IR_TGSI) { >> - util_queue_fence_wait(&program->ready); >> - >> - if (program->shader.compilation_failed) >> - return; >> - } >> + if (program->ir_type == PIPE_SHADER_IR_TGSI && >> + program->shader.compilation_failed) >> + return; >> >> si_decompress_compute_textures(sctx); >> >> /* Add buffer sizes for memory checking in need_cs_space. 
*/ >> r600_context_add_resource_size(ctx, &program->shader.bo->b.b); >> /* TODO: add the scratch buffer */ >> >> if (info->indirect) { >> r600_context_add_resource_size(ctx, info->indirect); >> >> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c >> b/src/gallium/drivers/radeonsi/si_descriptors.c >> index 38e4ae1..a2f40a8 100644 >> --- a/src/gallium/drivers/radeonsi/si_descriptors.c >> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c >> @@ -118,26 +118,28 @@ static void si_init_descriptors(struct >> si_descriptors *desc, >> } >> } >> >> static void si_release_descriptors(struct si_descriptors *desc) >> { >> r600_resource_reference(&desc->buffer, NULL); >> FREE(desc->list); >> } >> >> static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, >> unsigned size, >> - unsigned *out_offset, struct r600_resource >> **out_buf) { >> + unsigned *out_offset, struct r600_resource >> **out_buf) >> +{ >> uint64_t va; >> >> u_suballocator_alloc(sctx->ce_suballocator, size, >> - sctx->screen->b.info.tcc_cache_line_size, >> - out_offset, (struct pipe_resource**)out_buf); >> + si_optimal_tcc_alignment(sctx, size), >> + (unsigned*)out_offset, > > > The extra cast of out_offset is unnecessary. 
> > >> + (struct pipe_resource**)out_buf); >> if (!out_buf) >> return false; >> >> va = (*out_buf)->gpu_address + *out_offset; >> >> radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0)); >> radeon_emit(sctx->ce_ib, ce_offset); >> radeon_emit(sctx->ce_ib, size / 4); >> radeon_emit(sctx->ce_ib, va); >> radeon_emit(sctx->ce_ib, va >> 32); >> @@ -186,58 +188,70 @@ void si_ce_enable_loads(struct radeon_winsys_cs *ib) >> radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); >> radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) | >> CONTEXT_CONTROL_LOAD_CE_RAM(1)); >> radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1)); >> } >> >> static bool si_upload_descriptors(struct si_context *sctx, >> struct si_descriptors *desc, >> struct r600_atom * atom) >> { >> - unsigned list_size = desc->num_elements * desc->element_dw_size * >> 4; >> + unsigned slot_size = desc->element_dw_size * 4; >> + unsigned first_slot_offset = desc->first_active_slot * slot_size; >> + unsigned upload_size = desc->num_active_slots * slot_size; >> + >> + if (!upload_size) >> + return true; > > > The early-out here means that desc->num_active_slots *does* control what is > written to CE RAM, contrary to what its descriptive comment says. It needs > to be moved further down.
True, but I don't think it matters: dirty_mask stays dirty, so the descriptors will still be uploaded once there is a shader using them. Marek _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev