v2:
  - Only emit the SPI_TMPRING_SIZE write once per packet.
  - Use context global scratch buffers, one for each shader type.
---
 src/gallium/drivers/radeon/radeon_elf_util.c    |  10 ++-
 src/gallium/drivers/radeon/radeon_elf_util.h    |   5 +-
 src/gallium/drivers/radeonsi/si_compute.c       |  46 ++---------
 src/gallium/drivers/radeonsi/si_pipe.c          |   7 ++
 src/gallium/drivers/radeonsi/si_pipe.h          |   3 +
 src/gallium/drivers/radeonsi/si_shader.c        |  53 +++++++++++-
 src/gallium/drivers/radeonsi/si_shader.h        |   9 +-
 src/gallium/drivers/radeonsi/si_state_draw.c    | 105 ++++++++++++++++++++++++
 src/gallium/drivers/radeonsi/si_state_shaders.c |  12 ++-
 9 files changed, 199 insertions(+), 51 deletions(-)

diff --git a/src/gallium/drivers/radeon/radeon_elf_util.c b/src/gallium/drivers/radeon/radeon_elf_util.c
index 9e8c53f..fd0632b 100644
--- a/src/gallium/drivers/radeon/radeon_elf_util.c
+++ b/src/gallium/drivers/radeon/radeon_elf_util.c
@@ -208,9 +208,15 @@ void radeon_shader_binary_free_relocs(struct radeon_shader_reloc *relocs,
        FREE(relocs);
 }
 
-void radeon_shader_binary_free_members(struct radeon_shader_binary *binary) {
+void radeon_shader_binary_free_members(struct radeon_shader_binary *binary,
+                                       unsigned free_relocs)
+{
        FREE(binary->code);
        FREE(binary->config);
        FREE(binary->rodata);
-       radeon_shader_binary_free_relocs(binary->relocs, binary->reloc_count);
+
+       if (free_relocs) {
+               radeon_shader_binary_free_relocs(binary->relocs,
+                                               binary->reloc_count);
+       }
 }
diff --git a/src/gallium/drivers/radeon/radeon_elf_util.h b/src/gallium/drivers/radeon/radeon_elf_util.h
index cff07f2..ab83f98 100644
--- a/src/gallium/drivers/radeon/radeon_elf_util.h
+++ b/src/gallium/drivers/radeon/radeon_elf_util.h
@@ -50,8 +50,11 @@ const unsigned char *radeon_shader_binary_config_start(
 /**
  * Free all memory allocated for members of \p binary.  This function does
  * not free \p binary.
+ *
+ * @param free_relocs If false, reloc information will not be freed.
  */
-void radeon_shader_binary_free_members(struct radeon_shader_binary *binary);
+void radeon_shader_binary_free_members(struct radeon_shader_binary *binary,
+       unsigned free_relocs);
 
 /**
  * Free \p relocs and all member data.
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 9571a3e..82ce43b 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -42,12 +42,6 @@
 #define NUM_USER_SGPRS 4
 #endif
 
-static const char *scratch_rsrc_dword0_symbol =
-       "SCRATCH_RSRC_DWORD0";
-
-static const char *scratch_rsrc_dword1_symbol =
-       "SCRATCH_RSRC_DWORD1";
-
 struct si_compute {
        struct si_context *ctx;
 
@@ -183,35 +177,6 @@ static unsigned compute_num_waves_for_scratch(
        return scratch_waves;
 }
 
-static void apply_scratch_relocs(const struct si_screen *sscreen,
-                       const struct radeon_shader_binary *binary,
-                       struct si_shader *shader, uint64_t scratch_va) {
-       unsigned i;
-       char *ptr;
-       uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff;
-       uint32_t scratch_rsrc_dword1 =
-               S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
-               |  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
-
-       if (!binary->reloc_count) {
-               return;
-       }
-
-       ptr = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL,
-                                       PIPE_TRANSFER_READ_WRITE);
-       for (i = 0 ; i < binary->reloc_count; i++) {
-               const struct radeon_shader_reloc *reloc = &binary->relocs[i];
-               if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
-                       util_memcpy_cpu_to_le32(ptr + reloc->offset,
-                               &scratch_rsrc_dword0, 4);
-               } else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
-                       util_memcpy_cpu_to_le32(ptr + reloc->offset,
-                               &scratch_rsrc_dword1, 4);
-               }
-       }
-       sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
-}
-
 static void si_launch_grid(
                struct pipe_context *ctx,
                const uint *block_layout, const uint *grid_layout,
@@ -256,7 +221,8 @@ static void si_launch_grid(
 
 #if HAVE_LLVM >= 0x0306
        /* Read the config information */
-       si_shader_binary_read_config(&program->binary, &program->program, pc);
+       si_shader_binary_read_config(sctx->screen, &program->binary,
+                                       &program->program, pc);
 #endif
 
        /* Upload the kernel arguments */
@@ -295,8 +261,10 @@ static void si_launch_grid(
                                RADEON_PRIO_SHADER_RESOURCE_RW);
 
                /* Patch the shader with the scratch buffer address. */
-               apply_scratch_relocs(sctx->screen,
-                       &program->binary, shader, scratch_buffer_va);
+               si_shader_apply_scratch_relocs(sctx->screen,
+                       shader, program->binary.relocs,
+                       program->binary.reloc_count,
+                       scratch_buffer_va);
 
        }
 
@@ -481,7 +449,7 @@ static void si_delete_compute_state(struct pipe_context *ctx, void* state){
        pipe_resource_reference(
                (struct pipe_resource **)&program->input_buffer, NULL);
 
-       radeon_shader_binary_free_members(&program->binary);
+       radeon_shader_binary_free_members(&program->binary, true);
        FREE(program);
 }
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index e3f8fcf..deb0d25 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -46,6 +46,7 @@ static void si_destroy_context(struct pipe_context *context)
        pipe_resource_reference(&sctx->gsvs_ring, NULL);
        pipe_resource_reference(&sctx->null_const_buf.buffer, NULL);
        r600_resource_reference(&sctx->border_color_table, NULL);
+       r600_resource_reference(&sctx->scratch_buffer, NULL);
 
        si_pm4_free_state(sctx, sctx->init_config, ~0);
        si_pm4_delete_state(sctx, gs_rings, sctx->gs_rings);
@@ -55,6 +56,7 @@ static void si_destroy_context(struct pipe_context *context)
        if (sctx->dummy_pixel_shader) {
                sctx->b.b.delete_fs_state(&sctx->b.b, sctx->dummy_pixel_shader);
        }
+
        sctx->b.b.delete_depth_stencil_alpha_state(&sctx->b.b, sctx->custom_dsa_flush);
        sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_resolve);
        sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_decompress);
@@ -158,6 +160,11 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void *
                                     sctx->null_const_buf.buffer->width0, 0, false);
        }
 
+       /* XXX: This is the maximum value allowed.  I'm not sure how to compute
+        * this for non-cs shaders.
+        */
+       sctx->scratch_waves = 32 * sscreen->b.info.max_compute_units;
+
        return &sctx->b.b;
 fail:
        si_destroy_context(&sctx->b.b);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index dfb1cd6..6051763 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -174,6 +174,7 @@ struct si_context {
        struct si_buffer_resources      const_buffers[SI_NUM_SHADERS];
        struct si_buffer_resources      rw_buffers[SI_NUM_SHADERS];
        struct si_textures_info         samplers[SI_NUM_SHADERS];
+       struct r600_resource            *scratch_buffer;
        struct r600_resource            *border_color_table;
        unsigned                        border_color_offset;
 
@@ -221,6 +222,8 @@ struct si_context {
        int                     last_prim;
        int                     last_multi_vgt_param;
        int                     last_rast_prim;
+
+       unsigned                scratch_waves;
 };
 
 /* si_blit.c */
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index a891bc6..cefb18b 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -46,6 +46,12 @@
 
 #include <errno.h>
 
+static const char *scratch_rsrc_dword0_symbol =
+       "SCRATCH_RSRC_DWORD0";
+
+static const char *scratch_rsrc_dword1_symbol =
+       "SCRATCH_RSRC_DWORD1";
+
 struct si_shader_output_values
 {
        LLVMValueRef values[4];
@@ -2517,7 +2523,8 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
        }
 }
 
-void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
+void si_shader_binary_read_config(const struct si_screen *sscreen,
+                               const struct radeon_shader_binary *binary,
                                struct si_shader *shader,
                                unsigned symbol_offset)
 {
@@ -2549,6 +2556,7 @@ void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
                case R_0286CC_SPI_PS_INPUT_ENA:
                        shader->spi_ps_input_ena = value;
                        break;
+               case R_0286E8_SPI_TMPRING_SIZE:
                case R_00B860_COMPUTE_TMPRING_SIZE:
                        /* WAVESIZE is in units of 256 dwords. */
                        shader->scratch_bytes_per_wave =
@@ -2562,6 +2570,39 @@ void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
        }
 }
 
+/* Patch shader->bo: write the low scratch_va dword at each SCRATCH_RSRC_DWORD0
+ * reloc offset and the address-high/stride dword at each SCRATCH_RSRC_DWORD1. */
+void si_shader_apply_scratch_relocs(const struct si_screen *sscreen,
+                       struct si_shader *shader,
+                       const struct radeon_shader_reloc *relocs,
+                       unsigned num_relocs, uint64_t scratch_va)
+{
+       const struct radeon_shader_reloc *reloc, *end;
+       char *code;
+       uint32_t rsrc_lo = scratch_va & 0xffffffff;
+       uint32_t rsrc_hi = S_008F04_BASE_ADDRESS_HI(scratch_va >> 32) |
+               S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
+
+       if (!num_relocs)
+               return;
+
+       assert(relocs);
+       end = relocs + num_relocs;
+       code = sscreen->b.ws->buffer_map(shader->bo->cs_buf, NULL,
+                                       PIPE_TRANSFER_READ_WRITE);
+       for (reloc = relocs; reloc != end; reloc++) {
+               const uint32_t *value = NULL;
+               if (strcmp(reloc->name, scratch_rsrc_dword0_symbol) == 0)
+                       value = &rsrc_lo;
+               else if (strcmp(reloc->name, scratch_rsrc_dword1_symbol) == 0)
+                       value = &rsrc_hi;
+               if (value)
+                       util_memcpy_cpu_to_le32(code + reloc->offset, value, 4);
+       }
+       sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
+}
+
+
 int si_shader_binary_read(struct si_screen *sscreen,
                        struct si_shader *shader,
                        const struct radeon_shader_binary *binary)
@@ -2582,7 +2623,7 @@ int si_shader_binary_read(struct si_screen *sscreen,
                }
        }
 
-       si_shader_binary_read_config(binary, shader, 0);
+       si_shader_binary_read_config(sscreen, binary, shader, 0);
 
        /* copy new shader */
        code_size = binary->code_size + binary->rodata_size;
@@ -2601,6 +2642,7 @@ int si_shader_binary_read(struct si_screen *sscreen,
                util_memcpy_cpu_to_le32(ptr, binary->rodata, binary->rodata_size);
        }
 
+
        sscreen->b.ws->buffer_unmap(shader->bo->cs_buf);
 
        return 0;
@@ -2621,7 +2663,10 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
                return r;
        }
        r = si_shader_binary_read(sscreen, shader, &binary);
-       radeon_shader_binary_free_members(&binary);
+
+       shader->relocs = binary.relocs;
+       shader->num_relocs = binary.reloc_count;
+       radeon_shader_binary_free_members(&binary, false);
        return r;
 }
 
@@ -2857,6 +2902,6 @@ void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader)
        if (shader->gs_copy_shader)
                si_shader_destroy(ctx, shader->gs_copy_shader);
 
+       radeon_shader_binary_free_relocs(shader->relocs, shader->num_relocs);
        r600_resource_reference(&shader->bo, NULL);
-       r600_resource_reference(&shader->scratch_bo, NULL);
 }
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 08e344a..0021d6c 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -34,6 +34,7 @@
 #include "si_state.h"
 
 struct radeon_shader_binary;
+struct radeon_shader_reloc;
 
 #define SI_SGPR_RW_BUFFERS     0  /* rings (& stream-out, VS only) */
 #define SI_SGPR_CONST          2
@@ -142,6 +143,8 @@ struct si_shader {
        struct si_pm4_state             *pm4;
        struct r600_resource            *bo;
        struct r600_resource            *scratch_bo;
+       struct radeon_shader_reloc      *relocs;
+       unsigned                        num_relocs;
        unsigned                        num_sgprs;
        unsigned                        num_vgprs;
        unsigned                        lds_size;
@@ -185,7 +188,11 @@ void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
 int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
                const struct radeon_shader_binary *binary);
-void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
+void si_shader_apply_scratch_relocs(const struct si_screen *sscreen,
+       struct si_shader *shader, const struct radeon_shader_reloc *relocs,
+       unsigned num_relocs, uint64_t scratch_va);
+void si_shader_binary_read_config(const struct si_screen *sscreen,
+                               const struct radeon_shader_binary *binary,
                                struct si_shader *shader,
                                unsigned symbol_offset);
 
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index cd4880b..4604c67 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -173,6 +173,110 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx, unsigned mode
        sctx->last_rast_prim = mode;
 }
 
+/* Make the selector's current shader use sctx->scratch_buffer: re-apply its
+ * scratch relocations against the buffer's GPU address and record the
+ * buffer in scratch_bo.  Does nothing when sel is NULL, the shader needs
+ * no scratch bytes, or it already targets this buffer. */
+static void si_update_scratch_buffer(struct si_context *sctx,
+                                   struct si_shader_selector *sel)
+{
+       struct si_shader *cur;
+       struct r600_resource *scratch = sctx->scratch_buffer;
+
+       /* No selector given: nothing to update. */
+       if (!sel)
+               return;
+
+       cur = sel->current;
+
+       /* Zero total scratch bytes: this shader needs no scratch buffer. */
+       if (cur->scratch_bytes_per_wave * sctx->scratch_waves == 0)
+               return;
+
+       /* Relocations were already applied for this exact buffer. */
+       if (cur->scratch_bo == scratch)
+               return;
+
+       assert(scratch);
+
+       /* Patch the shader code with the buffer's GPU address. */
+       si_shader_apply_scratch_relocs(sctx->screen, cur, cur->relocs,
+                                      cur->num_relocs,
+                                      scratch->gpu_address);
+
+       cur->scratch_bo = scratch;
+}
+
+/* Size (width0) of the context scratch buffer, or 0 when it is NULL. */
+static unsigned si_get_current_scratch_buffer_size(struct si_context *sctx)
+{
+       const struct r600_resource *scratch = sctx->scratch_buffer;
+
+       /* A missing buffer counts as zero bytes. */
+       return scratch ? scratch->b.b.width0 : 0;
+}
+
+/* Scratch bytes the selector's current shader needs for scratch_waves
+ * waves; 0 when no selector is given. */
+static unsigned si_get_scratch_buffer_size(struct si_context *sctx,
+                                       struct si_shader_selector *sel)
+{
+       unsigned bytes_per_wave;
+
+       bytes_per_wave = sel ? sel->current->scratch_bytes_per_wave : 0;
+
+       return bytes_per_wave * sctx->scratch_waves;
+}
+
+/* Largest scratch size needed among the context's PS, GS and VS selectors. */
+static unsigned si_get_max_scratch_size_needed(struct si_context *sctx)
+{
+       unsigned ps = si_get_scratch_buffer_size(sctx, sctx->ps_shader);
+       unsigned gs = si_get_scratch_buffer_size(sctx, sctx->gs_shader);
+       return MAX3(ps, gs, si_get_scratch_buffer_size(sctx, sctx->vs_shader));
+}
+
+/* Emit SPI_TMPRING_SIZE for the bound PS/GS/VS shaders, first growing the
+ * context scratch buffer if they need more than it holds and re-applying
+ * the scratch relocations of shaders that target an older buffer. */
+static void si_emit_spi_tmpring_state(struct si_context *sctx)
+{
+       struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+       unsigned cur_size = si_get_current_scratch_buffer_size(sctx);
+       unsigned needed = si_get_max_scratch_size_needed(sctx);
+       unsigned bytes_per_wave = needed / sctx->scratch_waves;
+
+       if (needed > cur_size) {
+               /* Drop the old buffer and keep the creation reference of
+                * the bigger one.  Taking an extra reference on top of it
+                * would leave a count that never drops to zero, so the
+                * buffer would be leaked. */
+               pipe_resource_reference(
+                       (struct pipe_resource**)&sctx->scratch_buffer, NULL);
+               sctx->scratch_buffer = si_resource_create_custom(
+                       &sctx->screen->b.b, PIPE_USAGE_DEFAULT, needed);
+       }
+
+       /* Update the shaders, so they are using the latest scratch buffer. */
+       si_update_scratch_buffer(sctx, sctx->ps_shader);
+       si_update_scratch_buffer(sctx, sctx->gs_shader);
+       si_update_scratch_buffer(sctx, sctx->vs_shader);
+
+       /* The LLVM shader backend should be reporting aligned scratch_sizes. */
+       assert((needed & ~0x3FF) == needed &&
+               "scratch size should already be aligned correctly.");
+
+       r600_write_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE,
+                               S_0286E8_WAVES(sctx->scratch_waves) |
+                               S_0286E8_WAVESIZE(bytes_per_wave >> 10));
+
+       if (needed > 0) {
+               r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
+                       sctx->scratch_buffer, RADEON_USAGE_READWRITE,
+                       RADEON_PRIO_SHADER_RESOURCE_RW);
+       }
+}
+
 static void si_emit_draw_registers(struct si_context *sctx,
                                   const struct pipe_draw_info *info,
                                   const struct pipe_index_buffer *ib)
@@ -583,6 +687,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 
        si_pm4_emit_dirty(sctx);
        si_emit_rasterizer_prim_state(sctx, info->mode);
+       si_emit_spi_tmpring_state(sctx);
        si_emit_draw_registers(sctx, info, &ib);
        si_emit_draw_packets(sctx, info, &ib);
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 817a990..c24573c 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -67,7 +67,8 @@ static void si_shader_es(struct si_shader *shader)
                       S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) |
                       S_00B328_DX10_CLAMP(shader->dx10_clamp_mode));
        si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
-                      S_00B32C_USER_SGPR(num_user_sgprs));
+                      S_00B32C_USER_SGPR(num_user_sgprs) |
+                      S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
 }
 
 static void si_shader_gs(struct si_shader *shader)
@@ -136,7 +137,8 @@ static void si_shader_gs(struct si_shader *shader)
                       S_00B228_SGPRS((num_sgprs - 1) / 8) |
                       S_00B228_DX10_CLAMP(shader->dx10_clamp_mode));
        si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
-                      S_00B22C_USER_SGPR(num_user_sgprs));
+                      S_00B22C_USER_SGPR(num_user_sgprs) |
+                      S_00B22C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
 }
 
 static void si_shader_vs(struct si_shader *shader)
@@ -216,7 +218,8 @@ static void si_shader_vs(struct si_shader *shader)
                       S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) |
                       S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) |
                       S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) |
-                      S_00B12C_SO_EN(!!shader->selector->so.num_outputs));
+                      S_00B12C_SO_EN(!!shader->selector->so.num_outputs) |
+                      S_00B12C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
        if (window_space)
                si_pm4_set_reg(pm4, R_028818_PA_CL_VTE_CNTL,
                               S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1));
@@ -307,7 +310,8 @@ static void si_shader_ps(struct si_shader *shader)
                       S_00B028_DX10_CLAMP(shader->dx10_clamp_mode));
        si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
                       S_00B02C_EXTRA_LDS_SIZE(shader->lds_size) |
-                      S_00B02C_USER_SGPR(num_user_sgprs));
+                      S_00B02C_USER_SGPR(num_user_sgprs) |
+                      S_00B02C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
 }
 
 static void si_shader_init_pm4_state(struct si_shader *shader)
-- 
1.8.5.5

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to