On Thu, May 18, 2017 at 12:41 PM, Marek Olšák <mar...@gmail.com> wrote: > On Thu, May 18, 2017 at 11:31 AM, Nicolai Hähnle <nhaeh...@gmail.com> wrote: >> On 17.05.2017 21:38, Marek Olšák wrote: >>> >>> From: Marek Olšák <marek.ol...@amd.com> >>> >>> This decreases the size of CE RAM dumps to L2, or the size of descriptor >>> uploads without CE. >>> --- >>> src/gallium/drivers/radeonsi/si_compute.c | 28 ++++++-- >>> src/gallium/drivers/radeonsi/si_descriptors.c | 85 >>> ++++++++++++++++++++----- >>> src/gallium/drivers/radeonsi/si_state.h | 18 +++++- >>> src/gallium/drivers/radeonsi/si_state_shaders.c | 6 ++ >>> 4 files changed, 113 insertions(+), 24 deletions(-) >>> >>> diff --git a/src/gallium/drivers/radeonsi/si_compute.c >>> b/src/gallium/drivers/radeonsi/si_compute.c >>> index 22ef111..4c98066 100644 >>> --- a/src/gallium/drivers/radeonsi/si_compute.c >>> +++ b/src/gallium/drivers/radeonsi/si_compute.c >>> @@ -201,21 +201,38 @@ static void *si_create_compute_state( >>> return NULL; >>> } >>> } >>> >>> return program; >>> } >>> >>> static void si_bind_compute_state(struct pipe_context *ctx, void *state) >>> { >>> struct si_context *sctx = (struct si_context*)ctx; >>> - sctx->cs_shader_state.program = (struct si_compute*)state; >>> + struct si_compute *program = (struct si_compute*)state; >>> + >>> + sctx->cs_shader_state.program = program; >>> + if (!program) >>> + return; >>> + >>> + /* Wait because we need active slot usage masks. 
*/ >>> + if (program->ir_type == PIPE_SHADER_IR_TGSI) >>> + util_queue_fence_wait(&program->ready); >>> + >>> + si_set_active_descriptors(sctx, >>> + SI_DESCS_FIRST_COMPUTE + >>> + >>> SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS, >>> + >>> program->active_const_and_shader_buffers); >>> + si_set_active_descriptors(sctx, >>> + SI_DESCS_FIRST_COMPUTE + >>> + SI_SHADER_DESCS_SAMPLERS_AND_IMAGES, >>> + program->active_samplers_and_images); >>> } >>> >>> static void si_set_global_binding( >>> struct pipe_context *ctx, unsigned first, unsigned n, >>> struct pipe_resource **resources, >>> uint32_t **handles) >>> { >>> unsigned i; >>> struct si_context *sctx = (struct si_context*)ctx; >>> struct si_compute *program = sctx->cs_shader_state.program; >>> @@ -749,26 +766,23 @@ static void si_launch_grid( >>> bool cs_regalloc_hang = >>> (sctx->b.chip_class == SI || >>> sctx->b.family == CHIP_BONAIRE || >>> sctx->b.family == CHIP_KABINI) && >>> info->block[0] * info->block[1] * info->block[2] > 256; >>> >>> if (cs_regalloc_hang) >>> sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | >>> SI_CONTEXT_CS_PARTIAL_FLUSH; >>> >>> - if (program->ir_type == PIPE_SHADER_IR_TGSI) { >>> - util_queue_fence_wait(&program->ready); >>> - >>> - if (program->shader.compilation_failed) >>> - return; >>> - } >>> + if (program->ir_type == PIPE_SHADER_IR_TGSI && >>> + program->shader.compilation_failed) >>> + return; >>> >>> si_decompress_compute_textures(sctx); >>> >>> /* Add buffer sizes for memory checking in need_cs_space. 
*/ >>> r600_context_add_resource_size(ctx, &program->shader.bo->b.b); >>> /* TODO: add the scratch buffer */ >>> >>> if (info->indirect) { >>> r600_context_add_resource_size(ctx, info->indirect); >>> >>> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c >>> b/src/gallium/drivers/radeonsi/si_descriptors.c >>> index 38e4ae1..a2f40a8 100644 >>> --- a/src/gallium/drivers/radeonsi/si_descriptors.c >>> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c >>> @@ -118,26 +118,28 @@ static void si_init_descriptors(struct >>> si_descriptors *desc, >>> } >>> } >>> >>> static void si_release_descriptors(struct si_descriptors *desc) >>> { >>> r600_resource_reference(&desc->buffer, NULL); >>> FREE(desc->list); >>> } >>> >>> static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, >>> unsigned size, >>> - unsigned *out_offset, struct r600_resource >>> **out_buf) { >>> + unsigned *out_offset, struct r600_resource >>> **out_buf) >>> +{ >>> uint64_t va; >>> >>> u_suballocator_alloc(sctx->ce_suballocator, size, >>> - sctx->screen->b.info.tcc_cache_line_size, >>> - out_offset, (struct pipe_resource**)out_buf); >>> + si_optimal_tcc_alignment(sctx, size), >>> + (unsigned*)out_offset, >> >> >> The extra cast of out_offset is unnecessary. 
>> >> >>> + (struct pipe_resource**)out_buf); >>> if (!out_buf) >>> return false; >>> >>> va = (*out_buf)->gpu_address + *out_offset; >>> >>> radeon_emit(sctx->ce_ib, PKT3(PKT3_DUMP_CONST_RAM, 3, 0)); >>> radeon_emit(sctx->ce_ib, ce_offset); >>> radeon_emit(sctx->ce_ib, size / 4); >>> radeon_emit(sctx->ce_ib, va); >>> radeon_emit(sctx->ce_ib, va >> 32); >>> @@ -186,58 +188,70 @@ void si_ce_enable_loads(struct radeon_winsys_cs *ib) >>> radeon_emit(ib, PKT3(PKT3_CONTEXT_CONTROL, 1, 0)); >>> radeon_emit(ib, CONTEXT_CONTROL_LOAD_ENABLE(1) | >>> CONTEXT_CONTROL_LOAD_CE_RAM(1)); >>> radeon_emit(ib, CONTEXT_CONTROL_SHADOW_ENABLE(1)); >>> } >>> >>> static bool si_upload_descriptors(struct si_context *sctx, >>> struct si_descriptors *desc, >>> struct r600_atom * atom) >>> { >>> - unsigned list_size = desc->num_elements * desc->element_dw_size * >>> 4; >>> + unsigned slot_size = desc->element_dw_size * 4; >>> + unsigned first_slot_offset = desc->first_active_slot * slot_size; >>> + unsigned upload_size = desc->num_active_slots * slot_size; >>> + >>> + if (!upload_size) >>> + return true; >> >> >> The early-out here means that desc->num_active_slots *does* control what is >> written to CE RAM, contrary to what its descriptive comment says. It needs >> to be moved further down. > > True, but I think it doesn't matter, because dirty_mask stays dirty > and the descriptors will be uploaded when there is a shader using > them.
Is this enough?

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 58d2723..89588c3 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -136,7 +136,7 @@ static bool si_ce_upload(struct si_context *sctx, unsigned ce_offset, unsigned s
 
 	u_suballocator_alloc(sctx->ce_suballocator, size,
 			     si_optimal_tcc_alignment(sctx, size),
-			     (unsigned*)out_offset,
+			     out_offset,
 			     (struct pipe_resource**)out_buf);
 	if (!out_buf)
 		return false;
@@ -204,6 +204,10 @@ static bool si_upload_descriptors(struct si_context *sctx,
 	unsigned first_slot_offset = desc->first_active_slot * slot_size;
 	unsigned upload_size = desc->num_active_slots * slot_size;
 
+	/* Skip the upload if no shader is using the descriptors. dirty_mask
+	 * will stay dirty and the descriptors will be uploaded when there is
+	 * a shader using them.
+	 */
 	if (!upload_size)
 		return true;
 
Marek
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev