v2: - Only emit SPI_TMPRING_SIZE once per packet. - Use context global scratch buffers, one for each shader type. --- src/gallium/drivers/radeon/radeon_elf_util.c | 10 ++- src/gallium/drivers/radeon/radeon_elf_util.h | 5 +- src/gallium/drivers/radeonsi/si_compute.c | 46 ++--------- src/gallium/drivers/radeonsi/si_pipe.c | 7 ++ src/gallium/drivers/radeonsi/si_pipe.h | 3 + src/gallium/drivers/radeonsi/si_shader.c | 53 +++++++++++- src/gallium/drivers/radeonsi/si_shader.h | 9 +- src/gallium/drivers/radeonsi/si_state_draw.c | 105 ++++++++++++++++++++++++ src/gallium/drivers/radeonsi/si_state_shaders.c | 12 ++- 9 files changed, 199 insertions(+), 51 deletions(-)
diff --git a/src/gallium/drivers/radeon/radeon_elf_util.c b/src/gallium/drivers/radeon/radeon_elf_util.c index 9e8c53f..fd0632b 100644 --- a/src/gallium/drivers/radeon/radeon_elf_util.c +++ b/src/gallium/drivers/radeon/radeon_elf_util.c @@ -208,9 +208,15 @@ void radeon_shader_binary_free_relocs(struct radeon_shader_reloc *relocs, FREE(relocs); } -void radeon_shader_binary_free_members(struct radeon_shader_binary *binary) { +void radeon_shader_binary_free_members(struct radeon_shader_binary *binary, + unsigned free_relocs) +{ FREE(binary->code); FREE(binary->config); FREE(binary->rodata); - radeon_shader_binary_free_relocs(binary->relocs, binary->reloc_count); + + if (free_relocs) { + radeon_shader_binary_free_relocs(binary->relocs, + binary->reloc_count); + } } diff --git a/src/gallium/drivers/radeon/radeon_elf_util.h b/src/gallium/drivers/radeon/radeon_elf_util.h index cff07f2..ab83f98 100644 --- a/src/gallium/drivers/radeon/radeon_elf_util.h +++ b/src/gallium/drivers/radeon/radeon_elf_util.h @@ -50,8 +50,11 @@ const unsigned char *radeon_shader_binary_config_start( /** * Free all memory allocated for members of \p binary. This function does * not free \p binary. + * + * @param free_relocs If false, reloc information will not be freed. */ -void radeon_shader_binary_free_members(struct radeon_shader_binary *binary); +void radeon_shader_binary_free_members(struct radeon_shader_binary *binary, + unsigned free_relocs); /** * Free \p relocs and all member data. 
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c index 9571a3e..82ce43b 100644 --- a/src/gallium/drivers/radeonsi/si_compute.c +++ b/src/gallium/drivers/radeonsi/si_compute.c @@ -42,12 +42,6 @@ #define NUM_USER_SGPRS 4 #endif -static const char *scratch_rsrc_dword0_symbol = - "SCRATCH_RSRC_DWORD0"; - -static const char *scratch_rsrc_dword1_symbol = - "SCRATCH_RSRC_DWORD1"; - struct si_compute { struct si_context *ctx; @@ -183,35 +177,6 @@ static unsigned compute_num_waves_for_scratch( return scratch_waves; } -static void apply_scratch_relocs(const struct si_screen *sscreen, - const struct radeon_shader_binary *binary, - struct si_shader *shader, uint64_t scratch_va) { - unsigned i; - char *ptr; - uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff; - uint32_t scratch_rsrc_dword1 = - S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) - | S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64); - - if (!binary->reloc_count) { - return; - } - - ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL, - PIPE_TRANSFER_READ_WRITE); - for (i = 0 ; i < binary->reloc_count; i++) { - const struct radeon_shader_reloc *reloc = &binary->relocs[i]; - if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) { - util_memcpy_cpu_to_le32(ptr + reloc->offset, - &scratch_rsrc_dword0, 4); - } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { - util_memcpy_cpu_to_le32(ptr + reloc->offset, - &scratch_rsrc_dword1, 4); - } - } - sscreen->b.ws->buffer_unmap(shader->bo->cs_buf); -} - static void si_launch_grid( struct pipe_context *ctx, const uint *block_layout, const uint *grid_layout, @@ -256,7 +221,8 @@ static void si_launch_grid( #if HAVE_LLVM >= 0x0306 /* Read the config information */ - si_shader_binary_read_config(&program->binary, &program->program, pc); + si_shader_binary_read_config(sctx->screen, &program->binary, + &program->program, pc); #endif /* Upload the kernel arguments */ @@ -295,8 +261,10 @@ static void si_launch_grid( 
RADEON_PRIO_SHADER_RESOURCE_RW); /* Patch the shader with the scratch buffer address. */ - apply_scratch_relocs(sctx->screen, - &program->binary, shader, scratch_buffer_va); + si_shader_apply_scratch_relocs(sctx->screen, + shader, program->binary.relocs, + program->binary.reloc_count, + scratch_buffer_va); } @@ -481,7 +449,7 @@ static void si_delete_compute_state(struct pipe_context *ctx, void* state){ pipe_resource_reference( (struct pipe_resource **)&program->input_buffer, NULL); - radeon_shader_binary_free_members(&program->binary); + radeon_shader_binary_free_members(&program->binary, true); FREE(program); } diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index e3f8fcf..deb0d25 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -46,6 +46,7 @@ static void si_destroy_context(struct pipe_context *context) pipe_resource_reference(&sctx->gsvs_ring, NULL); pipe_resource_reference(&sctx->null_const_buf.buffer, NULL); r600_resource_reference(&sctx->border_color_table, NULL); + r600_resource_reference(&sctx->scratch_buffer, NULL); si_pm4_free_state(sctx, sctx->init_config, ~0); si_pm4_delete_state(sctx, gs_rings, sctx->gs_rings); @@ -55,6 +56,7 @@ static void si_destroy_context(struct pipe_context *context) if (sctx->dummy_pixel_shader) { sctx->b.b.delete_fs_state(&sctx->b.b, sctx->dummy_pixel_shader); } + sctx->b.b.delete_depth_stencil_alpha_state(&sctx->b.b, sctx->custom_dsa_flush); sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_resolve); sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_decompress); @@ -158,6 +160,11 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void * sctx->null_const_buf.buffer->width0, 0, false); } + /* XXX: This is the maximum value allowed. I'm not sure how compute + * this for non-cs shaders. 
+ */ + sctx->scratch_waves = 32 * sscreen->b.info.max_compute_units; + return &sctx->b.b; fail: si_destroy_context(&sctx->b.b); diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index dfb1cd6..6051763 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -174,6 +174,7 @@ struct si_context { struct si_buffer_resources const_buffers[SI_NUM_SHADERS]; struct si_buffer_resources rw_buffers[SI_NUM_SHADERS]; struct si_textures_info samplers[SI_NUM_SHADERS]; + struct r600_resource *scratch_buffer; struct r600_resource *border_color_table; unsigned border_color_offset; @@ -221,6 +222,8 @@ struct si_context { int last_prim; int last_multi_vgt_param; int last_rast_prim; + + unsigned scratch_waves; }; /* si_blit.c */ diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index a891bc6..cefb18b 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -46,6 +46,12 @@ #include <errno.h> +static const char *scratch_rsrc_dword0_symbol = + "SCRATCH_RSRC_DWORD0"; + +static const char *scratch_rsrc_dword1_symbol = + "SCRATCH_RSRC_DWORD1"; + struct si_shader_output_values { LLVMValueRef values[4]; @@ -2517,7 +2523,8 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx) } } -void si_shader_binary_read_config(const struct radeon_shader_binary *binary, +void si_shader_binary_read_config(const struct si_screen *sscreen, + const struct radeon_shader_binary *binary, struct si_shader *shader, unsigned symbol_offset) { @@ -2549,6 +2556,7 @@ void si_shader_binary_read_config(const struct radeon_shader_binary *binary, case R_0286CC_SPI_PS_INPUT_ENA: shader->spi_ps_input_ena = value; break; + case R_0286E8_SPI_TMPRING_SIZE: case R_00B860_COMPUTE_TMPRING_SIZE: /* WAVESIZE is in units of 256 dwords. 
*/ shader->scratch_bytes_per_wave = @@ -2562,6 +2570,39 @@ void si_shader_binary_read_config(const struct radeon_shader_binary *binary, } } +void si_shader_apply_scratch_relocs(const struct si_screen *sscreen, + struct si_shader *shader, + const struct radeon_shader_reloc *relocs, + unsigned num_relocs, uint64_t scratch_va) +{ + unsigned i; + char *ptr; + uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff; + uint32_t scratch_rsrc_dword1 = + S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) + | S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64); + + if (num_relocs == 0) { + return; + } + + assert(relocs); + ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL, + PIPE_TRANSFER_READ_WRITE); + for (i = 0 ; i < num_relocs; i++) { + const struct radeon_shader_reloc *reloc = &relocs[i]; + if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) { + util_memcpy_cpu_to_le32(ptr + reloc->offset, + &scratch_rsrc_dword0, 4); + } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { + util_memcpy_cpu_to_le32(ptr + reloc->offset, + &scratch_rsrc_dword1, 4); + } + } + sscreen->b.ws->buffer_unmap(shader->bo->cs_buf); +} + + int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader, const struct radeon_shader_binary *binary) @@ -2582,7 +2623,7 @@ int si_shader_binary_read(struct si_screen *sscreen, } } - si_shader_binary_read_config(binary, shader, 0); + si_shader_binary_read_config(sscreen, binary, shader, 0); /* copy new shader */ code_size = binary->code_size + binary->rodata_size; @@ -2601,6 +2642,7 @@ int si_shader_binary_read(struct si_screen *sscreen, util_memcpy_cpu_to_le32(ptr, binary->rodata, binary->rodata_size); } + sscreen->b.ws->buffer_unmap(shader->bo->cs_buf); return 0; @@ -2621,7 +2663,10 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader, return r; } r = si_shader_binary_read(sscreen, shader, &binary); - radeon_shader_binary_free_members(&binary); + + shader->relocs = binary.relocs; + shader->num_relocs = 
binary.reloc_count; + radeon_shader_binary_free_members(&binary, false); return r; } @@ -2857,6 +2902,6 @@ void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader) if (shader->gs_copy_shader) si_shader_destroy(ctx, shader->gs_copy_shader); + radeon_shader_binary_free_relocs(shader->relocs, shader->num_relocs); r600_resource_reference(&shader->bo, NULL); - r600_resource_reference(&shader->scratch_bo, NULL); } diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 08e344a..0021d6c 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -34,6 +34,7 @@ #include "si_state.h" struct radeon_shader_binary; +struct radeon_shader_reloc; #define SI_SGPR_RW_BUFFERS 0 /* rings (& stream-out, VS only) */ #define SI_SGPR_CONST 2 @@ -142,6 +143,8 @@ struct si_shader { struct si_pm4_state *pm4; struct r600_resource *bo; struct r600_resource *scratch_bo; + struct radeon_shader_reloc *relocs; + unsigned num_relocs; unsigned num_sgprs; unsigned num_vgprs; unsigned lds_size; @@ -185,7 +188,11 @@ void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader); unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index); int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader, const struct radeon_shader_binary *binary); -void si_shader_binary_read_config(const struct radeon_shader_binary *binary, +void si_shader_apply_scratch_relocs(const struct si_screen *sscreen, + struct si_shader *shader, const struct radeon_shader_reloc *relocs, + unsigned num_relocs, uint64_t scratch_va); +void si_shader_binary_read_config(const struct si_screen *sscreen, + const struct radeon_shader_binary *binary, struct si_shader *shader, unsigned symbol_offset); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index cd4880b..4604c67 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ 
b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -173,6 +173,110 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx, unsigned mode sctx->last_rast_prim = mode; } +static void si_update_scratch_buffer(struct si_context *sctx, + struct si_shader_selector *sel) +{ + struct si_shader *shader; + unsigned scratch_bytes; + + if (!sel) { + return; + } + + shader = sel->current; + scratch_bytes = shader->scratch_bytes_per_wave * + sctx->scratch_waves; + + /* This shader doesn't need a scratch buffer */ + if (scratch_bytes == 0) { + return; + } + + /* This shader is already configured to use the current + * scratch buffer. */ + if (shader->scratch_bo == sctx->scratch_buffer) { + return; + } + + assert(sctx->scratch_buffer); + + si_shader_apply_scratch_relocs(sctx->screen, shader, + shader->relocs, shader->num_relocs, + sctx->scratch_buffer->gpu_address); + + shader->scratch_bo = sctx->scratch_buffer; +} + +static unsigned si_get_current_scratch_buffer_size(struct si_context *sctx) +{ + if (!sctx->scratch_buffer) { + return 0; + } + + return sctx->scratch_buffer->b.b.width0; +} + +static unsigned si_get_scratch_buffer_size(struct si_context *sctx, + struct si_shader_selector *sel) +{ + if (!sel) { + return 0; + } + + return sel->current->scratch_bytes_per_wave * + sctx->scratch_waves; + +} + +static unsigned si_get_max_scratch_size_needed(struct si_context *sctx) +{ + + return MAX3(si_get_scratch_buffer_size(sctx, sctx->ps_shader), + si_get_scratch_buffer_size(sctx, sctx->gs_shader), + si_get_scratch_buffer_size(sctx, sctx->vs_shader)); +} + +static void si_emit_spi_tmpring_state(struct si_context *sctx) +{ + struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs; + unsigned current_scratch_buffer_size = + si_get_current_scratch_buffer_size(sctx); + unsigned scratch_needed_size = + si_get_max_scratch_size_needed(sctx); + unsigned scratch_bytes_per_wave = scratch_needed_size / + sctx->scratch_waves; + + if (scratch_needed_size > current_scratch_buffer_size) { + 
/* Create a bigger scratch buffer */ + struct r600_resource *new_scratch_buffer = + si_resource_create_custom(&sctx->screen->b.b, + PIPE_USAGE_DEFAULT, scratch_needed_size); + + pipe_resource_reference( + (struct pipe_resource**)&sctx->scratch_buffer, + &new_scratch_buffer->b.b); + } + + /* Update the shaders, so they are using the latest scratch buffer. */ + si_update_scratch_buffer(sctx, sctx->ps_shader); + si_update_scratch_buffer(sctx, sctx->gs_shader); + si_update_scratch_buffer(sctx, sctx->vs_shader); + + /* The LLVM shader backend should be reporting aligned scratch_sizes. */ + assert((scratch_needed_size & ~0x3FF) == scratch_needed_size && + "scratch size should already be aligned correctly."); + + r600_write_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE, + S_0286E8_WAVES(sctx->scratch_waves) | + S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10)); + + if (scratch_needed_size > 0) { + r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, + sctx->scratch_buffer, RADEON_USAGE_READWRITE, + RADEON_PRIO_SHADER_RESOURCE_RW); + } +} + static void si_emit_draw_registers(struct si_context *sctx, const struct pipe_draw_info *info, const struct pipe_index_buffer *ib) @@ -583,6 +687,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) si_pm4_emit_dirty(sctx); si_emit_rasterizer_prim_state(sctx, info->mode); + si_emit_spi_tmpring_state(sctx); si_emit_draw_registers(sctx, info, &ib); si_emit_draw_packets(sctx, info, &ib); diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 817a990..c24573c 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -67,7 +67,8 @@ static void si_shader_es(struct si_shader *shader) S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) | S_00B328_DX10_CLAMP(shader->dx10_clamp_mode)); si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES, - S_00B32C_USER_SGPR(num_user_sgprs)); + S_00B32C_USER_SGPR(num_user_sgprs) | + 
S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); } static void si_shader_gs(struct si_shader *shader) @@ -136,7 +137,8 @@ static void si_shader_gs(struct si_shader *shader) S_00B228_SGPRS((num_sgprs - 1) / 8) | S_00B228_DX10_CLAMP(shader->dx10_clamp_mode)); si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, - S_00B22C_USER_SGPR(num_user_sgprs)); + S_00B22C_USER_SGPR(num_user_sgprs) | + S_00B22C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); } static void si_shader_vs(struct si_shader *shader) @@ -216,7 +218,8 @@ static void si_shader_vs(struct si_shader *shader) S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) | S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) | S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) | - S_00B12C_SO_EN(!!shader->selector->so.num_outputs)); + S_00B12C_SO_EN(!!shader->selector->so.num_outputs) | + S_00B12C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); if (window_space) si_pm4_set_reg(pm4, R_028818_PA_CL_VTE_CNTL, S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1)); @@ -307,7 +310,8 @@ static void si_shader_ps(struct si_shader *shader) S_00B028_DX10_CLAMP(shader->dx10_clamp_mode)); si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS, S_00B02C_EXTRA_LDS_SIZE(shader->lds_size) | - S_00B02C_USER_SGPR(num_user_sgprs)); + S_00B02C_USER_SGPR(num_user_sgprs) | + S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0)); } static void si_shader_init_pm4_state(struct si_shader *shader) -- 1.8.5.5 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev