1) It's quite a lot of CPU work, so it shouldn't be done unconditionally. Please see how si_pm4_state_changed is used to check if a "shader variant" has been changed. I'd like to have an early exit path for performance reasons, so: a) If no shader state has been changed (see si_pm4_state_changed), return. b) If all current shaders don't use the scratch buffer, return.
2) Please put it in si_state_shaders.c. It's the new place for all non-trivial shader stuff. 3) Since the CP runs in parallel with the graphics engine, shaders can be busy while WRITE_DATA is changing their code. (no good can come out of that) Therefore, the partial flushes (= partial "WAIT_UNTIL") must be emitted before the WRITE_DATA packet. The other thing is that the code is missing checking if there is enough space in the CS for the WRITE_DATA packets. In order to simplify everything, I recommend this solution: - Forget about partial flushes, forget about WRITE_DATA. - If the shader relocs need updating, copy the current shader bo to a new shader bo using the CPU (map the current shader bo with read+unsynchronized flags). - Update the relocs using the CPU in the new shader bo. - Then replace the current shader bo with the new one and mark the whole shader state as dirty. 4) Like Michel said, si_begin_new_cs should set some flag that SPI_TMPRING_SIZE should be emitted and the buffer reloc should be added. Marek On Fri, Jan 16, 2015 at 6:54 PM, Tom Stellard <thomas.stell...@amd.com> wrote: > v2: > - Only emit write SPI_TMPRING_SIZE once per packet. > - Use context global scratch buffer. > > v3: > - Patch shaders using WRITE_DATA packet instead of map/unmap. > - Emit ICACHE_FLUSH, CS_PARTIAL_FLUSH, PS_PARTIAL_FLUSH, and > VS_PARTIAL_FLUSH when patching shaders. > --- > src/gallium/drivers/radeonsi/si_compute.c | 44 ++-------- > src/gallium/drivers/radeonsi/si_pipe.c | 6 ++ > src/gallium/drivers/radeonsi/si_pipe.h | 3 + > src/gallium/drivers/radeonsi/si_shader.c | 68 ++++++++++++++- > src/gallium/drivers/radeonsi/si_shader.h | 9 +- > src/gallium/drivers/radeonsi/si_state_draw.c | 105 > ++++++++++++++++++++++++ > src/gallium/drivers/radeonsi/si_state_shaders.c | 12 ++- > 7 files changed, 200 insertions(+), 47 deletions(-) > > diff --git a/src/gallium/drivers/radeonsi/si_compute.c > b/src/gallium/drivers/radeonsi/si_compute.c > index 4427d3b..e407fb9 100644 > --- a/src/gallium/drivers/radeonsi/si_compute.c > +++ b/src/gallium/drivers/radeonsi/si_compute.c > @@ -42,12 +42,6 @@ > #define NUM_USER_SGPRS 4 > #endif > > -static const char *scratch_rsrc_dword0_symbol = > - "SCRATCH_RSRC_DWORD0"; > - > -static const char *scratch_rsrc_dword1_symbol = > - "SCRATCH_RSRC_DWORD1"; > - > struct si_compute { > struct si_context *ctx; > > @@ -183,35 +177,6 @@ static unsigned compute_num_waves_for_scratch( > return scratch_waves; > } > > -static void apply_scratch_relocs(const struct si_screen *sscreen, > - const struct radeon_shader_binary *binary, > - struct si_shader *shader, uint64_t scratch_va) { > - unsigned i; > - char *ptr; > - uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff; > - uint32_t scratch_rsrc_dword1 = > - S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) > - | S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64); > - > - if (!binary->reloc_count) { > - return; > - } > - > - ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL, > - PIPE_TRANSFER_READ_WRITE); > - for (i = 0 ; i < binary->reloc_count; i++) { > - const struct radeon_shader_reloc *reloc = &binary->relocs[i]; > - if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) { > - util_memcpy_cpu_to_le32(ptr + reloc->offset, > - &scratch_rsrc_dword0, 4); > - } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { > - util_memcpy_cpu_to_le32(ptr + reloc->offset, > - &scratch_rsrc_dword1, 4); > - } > - } > - sscreen->b.ws->buffer_unmap(shader->bo->cs_buf); > -} > - > static void si_launch_grid( > struct pipe_context *ctx, > const uint *block_layout, const uint *grid_layout, > @@ -256,7 +221,8 @@ static void si_launch_grid( > > #if HAVE_LLVM >= 0x0306 > /* Read the config information */ > - si_shader_binary_read_config(&program->binary, &program->program, pc); > + si_shader_binary_read_config(sctx->screen, &program->binary, > + &program->program, pc); > #endif > > /* Upload the kernel arguments */ > @@ -295,8 +261,10 @@ static void si_launch_grid( > RADEON_PRIO_SHADER_RESOURCE_RW); > > /* Patch the shader with the scratch buffer address. */ > - apply_scratch_relocs(sctx->screen, > - &program->binary, shader, scratch_buffer_va); > + si_shader_apply_scratch_relocs(sctx, > + shader, program->binary.relocs, > + program->binary.reloc_count, > + scratch_buffer_va); > > } > > diff --git a/src/gallium/drivers/radeonsi/si_pipe.c > b/src/gallium/drivers/radeonsi/si_pipe.c > index e3f8fcf..8e89f1e 100644 > --- a/src/gallium/drivers/radeonsi/si_pipe.c > +++ b/src/gallium/drivers/radeonsi/si_pipe.c > @@ -46,6 +46,7 @@ static void si_destroy_context(struct pipe_context *context) > pipe_resource_reference(&sctx->gsvs_ring, NULL); > pipe_resource_reference(&sctx->null_const_buf.buffer, NULL); > r600_resource_reference(&sctx->border_color_table, NULL); > + r600_resource_reference(&sctx->scratch_buffer, NULL); > > si_pm4_free_state(sctx, sctx->init_config, ~0); > si_pm4_delete_state(sctx, gs_rings, sctx->gs_rings); > @@ -158,6 +159,11 @@ static struct pipe_context *si_create_context(struct > pipe_screen *screen, void * > sctx->null_const_buf.buffer->width0, 0, > false); > } > > + /* XXX: This is the maximum value allowed. I'm not sure how compute > + * this for non-cs shaders. > + */ > + sctx->scratch_waves = 32 * sscreen->b.info.max_compute_units; > + > return &sctx->b.b; > fail: > si_destroy_context(&sctx->b.b); > diff --git a/src/gallium/drivers/radeonsi/si_pipe.h > b/src/gallium/drivers/radeonsi/si_pipe.h > index dfb1cd6..6051763 100644 > --- a/src/gallium/drivers/radeonsi/si_pipe.h > +++ b/src/gallium/drivers/radeonsi/si_pipe.h > @@ -174,6 +174,7 @@ struct si_context { > struct si_buffer_resources const_buffers[SI_NUM_SHADERS]; > struct si_buffer_resources rw_buffers[SI_NUM_SHADERS]; > struct si_textures_info samplers[SI_NUM_SHADERS]; > + struct r600_resource *scratch_buffer; > struct r600_resource *border_color_table; > unsigned border_color_offset; > > @@ -221,6 +222,8 @@ struct si_context { > int last_prim; > int last_multi_vgt_param; > int last_rast_prim; > + > + unsigned scratch_waves; > }; > > /* si_blit.c */ > diff --git a/src/gallium/drivers/radeonsi/si_shader.c > b/src/gallium/drivers/radeonsi/si_shader.c > index 81cb2ef..fd5b92b 100644 > --- a/src/gallium/drivers/radeonsi/si_shader.c > +++ b/src/gallium/drivers/radeonsi/si_shader.c > @@ -32,6 +32,7 @@ > #include "gallivm/lp_bld_logic.h" > #include "gallivm/lp_bld_arit.h" > #include "gallivm/lp_bld_flow.h" > +#include "radeon/r600_cs.h" > #include "radeon/radeon_llvm.h" > #include "radeon/radeon_elf_util.h" > #include "radeon/radeon_llvm_emit.h" > @@ -46,6 +47,12 @@ > > #include <errno.h> > > +static const char *scratch_rsrc_dword0_symbol = > + "SCRATCH_RSRC_DWORD0"; > + > +static const char *scratch_rsrc_dword1_symbol = > + "SCRATCH_RSRC_DWORD1"; > + > struct si_shader_output_values > { > LLVMValueRef values[4]; > @@ -2517,7 +2524,8 @@ static void preload_ring_buffers(struct > si_shader_context *si_shader_ctx) > } > } > > -void si_shader_binary_read_config(const struct radeon_shader_binary *binary, > +void si_shader_binary_read_config(const struct si_screen *sscreen, > + const struct radeon_shader_binary *binary, > struct si_shader *shader, > unsigned symbol_offset) > { > @@ -2549,6 +2557,7 @@ void si_shader_binary_read_config(const struct > radeon_shader_binary *binary, > case R_0286CC_SPI_PS_INPUT_ENA: > shader->spi_ps_input_ena = value; > break; > + case R_0286E8_SPI_TMPRING_SIZE: > case R_00B860_COMPUTE_TMPRING_SIZE: > /* WAVESIZE is in units of 256 dwords. */ > shader->scratch_bytes_per_wave = > @@ -2562,6 +2571,54 @@ void si_shader_binary_read_config(const struct > radeon_shader_binary *binary, > } > } > > +void si_shader_apply_scratch_relocs(struct si_context *sctx, > + struct si_shader *shader, > + const struct radeon_shader_reloc *relocs, > + unsigned num_relocs, uint64_t scratch_va) > +{ > + unsigned i; > + struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; > + uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff; > + uint32_t scratch_rsrc_dword1 = > + S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) > + | S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64); > + > + if (num_relocs == 0) { > + return; > + } > + > + assert(relocs); > + > + r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, shader->bo, > + RADEON_USAGE_WRITE, RADEON_PRIO_SHADER_DATA); > + > + for (i = 0 ; i < num_relocs; i++) { > + const struct radeon_shader_reloc *reloc = &relocs[i]; > + uint64_t addr = shader->bo->gpu_address + reloc->offset; > + uint32_t value; > + if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) { > + value = util_cpu_to_le32(scratch_rsrc_dword0); > + } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { > + value = util_cpu_to_le32(scratch_rsrc_dword1); > + } else { > + continue; > + } > + > + radeon_emit(cs, PKT3(PKT3_WRITE_DATA, 3, 0)); > + radeon_emit(cs, > PKT3_WRITE_DATA_DST_SEL(PKT3_WRITE_DATA_DST_SEL_MEM_SYNC) | > + PKT3_WRITE_DATA_WR_CONFIRM | > + > PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME)); > + radeon_emit(cs, addr & 0xFFFFFFFFUL); > + radeon_emit(cs, (addr >> 32UL) & 0xFFFFFFFFUL); > + radeon_emit(cs, value); > + } > + sctx->b.flags |= SI_CONTEXT_INV_ICACHE | > + SI_CONTEXT_CS_PARTIAL_FLUSH | > + SI_CONTEXT_PS_PARTIAL_FLUSH | > + SI_CONTEXT_VS_PARTIAL_FLUSH; > +} > + > + > int si_shader_binary_read(struct si_screen *sscreen, > struct si_shader *shader, > const struct radeon_shader_binary *binary) > @@ -2582,7 +2639,7 @@ int si_shader_binary_read(struct si_screen *sscreen, > } > } > > - si_shader_binary_read_config(binary, shader, 0); > + si_shader_binary_read_config(sscreen, binary, shader, 0); > > /* copy new shader */ > code_size = binary->code_size + binary->rodata_size; > @@ -2621,7 +2678,10 @@ int si_compile_llvm(struct si_screen *sscreen, struct > si_shader *shader, > return r; > } > r = si_shader_binary_read(sscreen, shader, &binary); > - radeon_shader_binary_free_members(&binary, true); > + > + shader->relocs = binary.relocs; > + shader->num_relocs = binary.reloc_count; > + radeon_shader_binary_free_members(&binary, false); > return r; > } > > @@ -2857,6 +2917,6 @@ void si_shader_destroy(struct pipe_context *ctx, struct > si_shader *shader) > if (shader->gs_copy_shader) > si_shader_destroy(ctx, shader->gs_copy_shader); > > + radeon_shader_binary_free_relocs(shader->relocs, shader->num_relocs); > r600_resource_reference(&shader->bo, NULL); > - r600_resource_reference(&shader->scratch_bo, NULL); > } > diff --git a/src/gallium/drivers/radeonsi/si_shader.h > b/src/gallium/drivers/radeonsi/si_shader.h > index 08e344a..f33f021 100644 > --- a/src/gallium/drivers/radeonsi/si_shader.h > +++ b/src/gallium/drivers/radeonsi/si_shader.h > @@ -34,6 +34,7 @@ > #include "si_state.h" > > struct radeon_shader_binary; > +struct radeon_shader_reloc; > > #define SI_SGPR_RW_BUFFERS 0 /* rings (& stream-out, VS only) */ > #define SI_SGPR_CONST 2 > @@ -142,6 +143,8 @@ struct si_shader { > struct si_pm4_state *pm4; > struct r600_resource *bo; > struct r600_resource *scratch_bo; > + struct radeon_shader_reloc *relocs; > + unsigned num_relocs; > unsigned num_sgprs; > unsigned num_vgprs; > unsigned lds_size; > @@ -185,7 +188,11 @@ void si_shader_destroy(struct pipe_context *ctx, struct > si_shader *shader); > unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned > index); > int si_shader_binary_read(struct si_screen *sscreen, struct si_shader > *shader, > const struct radeon_shader_binary *binary); > -void si_shader_binary_read_config(const struct radeon_shader_binary *binary, > +void si_shader_apply_scratch_relocs(struct si_context *sctx, > + struct si_shader *shader, const struct radeon_shader_reloc *relocs, > + unsigned num_relocs, uint64_t scratch_va); > +void si_shader_binary_read_config(const struct si_screen *sscreen, > + const struct radeon_shader_binary *binary, > struct si_shader *shader, > unsigned symbol_offset); > > diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c > b/src/gallium/drivers/radeonsi/si_state_draw.c > index cd4880b..0cd0492 100644 > --- a/src/gallium/drivers/radeonsi/si_state_draw.c > +++ b/src/gallium/drivers/radeonsi/si_state_draw.c > @@ -173,6 +173,110 @@ static void si_emit_rasterizer_prim_state(struct > si_context *sctx, unsigned mode > sctx->last_rast_prim = mode; > } > > +static void si_update_scratch_buffer(struct si_context *sctx, > + struct si_shader_selector *sel) > +{ > + struct si_shader *shader; > + unsigned scratch_bytes; > + > + if (!sel) { > + return; > + } > + > + shader = sel->current; > + scratch_bytes = shader->scratch_bytes_per_wave * > + sctx->scratch_waves; > + > + /* This shader doesn't need a scratch buffer */ > + if (scratch_bytes == 0) { > + return; > + } > + > + /* This shader is already configured to use the current > + * scratch buffer. */ > + if (shader->scratch_bo == sctx->scratch_buffer) { > + return; > + } > + > + assert(sctx->scratch_buffer); > + > + si_shader_apply_scratch_relocs(sctx, shader, > + shader->relocs, shader->num_relocs, > + sctx->scratch_buffer->gpu_address); > + > + shader->scratch_bo = sctx->scratch_buffer; > +} > + > +static unsigned si_get_current_scratch_buffer_size(struct si_context *sctx) > +{ > + if (!sctx->scratch_buffer) { > + return 0; > + } > + > + return sctx->scratch_buffer->b.b.width0; > +} > + > +static unsigned si_get_scratch_buffer_size(struct si_context *sctx, > + struct si_shader_selector *sel) > +{ > + if (!sel) { > + return 0; > + } > + > + return sel->current->scratch_bytes_per_wave * > + sctx->scratch_waves; > + > +} > + > +static unsigned si_get_max_scratch_size_needed(struct si_context *sctx) > +{ > + > + return MAX3(si_get_scratch_buffer_size(sctx, sctx->ps_shader), > + si_get_scratch_buffer_size(sctx, sctx->gs_shader), > + si_get_scratch_buffer_size(sctx, sctx->vs_shader)); > +} > + > +static void si_emit_spi_tmpring_state(struct si_context *sctx) > +{ > + struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; > + unsigned current_scratch_buffer_size = > + si_get_current_scratch_buffer_size(sctx); > + unsigned scratch_needed_size = > + si_get_max_scratch_size_needed(sctx); > + unsigned scratch_bytes_per_wave = scratch_needed_size / > + sctx->scratch_waves; > + > + if (scratch_needed_size > current_scratch_buffer_size) { > + /* Create a bigger scratch buffer */ > + struct r600_resource *new_scratch_buffer = > + si_resource_create_custom(&sctx->screen->b.b, > + PIPE_USAGE_DEFAULT, scratch_needed_size); > + > + pipe_resource_reference( > + (struct pipe_resource**)&sctx->scratch_buffer, > + &new_scratch_buffer->b.b); > + } > + > + /* Update the shaders, so they are using the latest scratch buffer. */ > + si_update_scratch_buffer(sctx, sctx->ps_shader); > + si_update_scratch_buffer(sctx, sctx->gs_shader); > + si_update_scratch_buffer(sctx, sctx->vs_shader); > + > + /* The LLVM shader backend should be reporting aligned scratch_sizes. > */ > + assert((scratch_needed_size & ~0x3FF) == scratch_needed_size && > + "scratch size should already be aligned correctly."); > + > + r600_write_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE, > + S_0286E8_WAVES(sctx->scratch_waves) | > + S_0286E8_WAVESIZE(scratch_bytes_per_wave >> > 10)); > + > + if (scratch_needed_size > 0) { > + r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, > + sctx->scratch_buffer, RADEON_USAGE_READWRITE, > + RADEON_PRIO_SHADER_RESOURCE_RW); > + } > +} > + > static void si_emit_draw_registers(struct si_context *sctx, > const struct pipe_draw_info *info, > const struct pipe_index_buffer *ib) > @@ -581,6 +685,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct > pipe_draw_info *info) > } > } > > + si_emit_spi_tmpring_state(sctx); > si_pm4_emit_dirty(sctx); > si_emit_rasterizer_prim_state(sctx, info->mode); > si_emit_draw_registers(sctx, info, &ib); > diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c > b/src/gallium/drivers/radeonsi/si_state_shaders.c > index 817a990..c24573c 100644 > --- a/src/gallium/drivers/radeonsi/si_state_shaders.c > +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c > @@ -67,7 +67,8 @@ static void si_shader_es(struct si_shader *shader) > S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) | > S_00B328_DX10_CLAMP(shader->dx10_clamp_mode)); > si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES, > - S_00B32C_USER_SGPR(num_user_sgprs)); > + S_00B32C_USER_SGPR(num_user_sgprs) | > + S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > > 0)); > } > > static void si_shader_gs(struct si_shader *shader) > @@ -136,7 +137,8 @@ static void si_shader_gs(struct si_shader *shader) > S_00B228_SGPRS((num_sgprs - 1) / 8) | > S_00B228_DX10_CLAMP(shader->dx10_clamp_mode)); > si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, > - S_00B22C_USER_SGPR(num_user_sgprs)); > + S_00B22C_USER_SGPR(num_user_sgprs) | > + S_00B22C_SCRATCH_EN(shader->scratch_bytes_per_wave > > 0)); > } > > static void si_shader_vs(struct si_shader *shader) > @@ -216,7 +218,8 @@ static void si_shader_vs(struct si_shader *shader) > S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) > | > S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) > | > S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) > | > - S_00B12C_SO_EN(!!shader->selector->so.num_outputs)); > + S_00B12C_SO_EN(!!shader->selector->so.num_outputs) | > + S_00B12C_SCRATCH_EN(shader->scratch_bytes_per_wave > > 0)); > if (window_space) > si_pm4_set_reg(pm4, R_028818_PA_CL_VTE_CNTL, > S_028818_VTX_XY_FMT(1) | > S_028818_VTX_Z_FMT(1)); > @@ -307,7 +310,8 @@ static void si_shader_ps(struct si_shader *shader) > S_00B028_DX10_CLAMP(shader->dx10_clamp_mode)); > si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS, > S_00B02C_EXTRA_LDS_SIZE(shader->lds_size) | > - S_00B02C_USER_SGPR(num_user_sgprs)); > + S_00B02C_USER_SGPR(num_user_sgprs) | > + S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > > 0)); > } > > static void si_shader_init_pm4_state(struct si_shader *shader) > -- > 1.8.5.5 > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev