Re: [Mesa-dev] [PATCH] radeonsi: Enable VGPR spilling for all shader types v3

Tom Stellard Fri, 23 Jan 2015 18:57:13 -0800

On Thu, Jan 22, 2015 at 11:27:32AM +0900, Michel Dänzer wrote:
> On 21.01.2015 21:12, Marek Olšák wrote:
> > We also had a case when the CPU accidentally corrupted shaders,
> > because the shaders were mapped after textures and a CPU texture
> > upload overflowed and overwrote shaders. I suppose we should have
> > unmapped the shaders.
> 
> Sounds like a good idea.
> 
> 
> Tom, for now I suggest this solution, summarized from Marek's previous
> descriptions:
> 
> (At least) for shaders which have relocations, keep a copy of the
> machine code in malloced memory. When the relocated values change,
> update them in the malloced memory, allocate a new BO, map it, copy the
> machine code from the malloced memory to the BO, replace any existing
> shader BO with the new one and invalidate the shader state.
>


Hi,

Attached is a WIP patch attempting to implement it this way.
Unfortunately, I was unable to get it working, so I wanted to
submit it for review in case someone can spot what I'm doing wrong.

You can find the broken code wrapped in #if 0 in the
si_update_scratch_buffer() function in si_state_shaders.c.

Based on the dmesg output and other tests I've done, it appears
that the GPU is still executing the shader code from the old bo
which does not contain the relocations.

The code in the #else branch works fine, but it updates the existing
bo in place rather than creating a new one.

Any idea what I've done wrong?

Thanks,
Tom

From ba673155672756fb0bf9873b2ae76c3f5ccd02e2 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stell...@amd.com>
Date: Wed, 10 Dec 2014 09:13:59 -0500
Subject: [PATCH] radeonsi: Enable VGPR spilling for all shader types v5 (WIP)

v2:
  - Only emit the SPI_TMPRING_SIZE write once per packet.
  - Use context global scratch buffer.

v3:
  - Patch shaders using WRITE_DATA packet instead of map/unmap.
  - Emit ICACHE_FLUSH, CS_PARTIAL_FLUSH, PS_PARTIAL_FLUSH, and
    VS_PARTIAL_FLUSH when patching shaders.

v4:
  - Code cleanups.
  - Remove unnecessary multiplies.

v5:
  - Patch shaders in system memory and re-upload to vram.
---
 src/gallium/drivers/radeonsi/si_compute.c       |  42 +------
 src/gallium/drivers/radeonsi/si_hw_context.c    |   1 +
 src/gallium/drivers/radeonsi/si_pipe.c          |   9 +-
 src/gallium/drivers/radeonsi/si_pipe.h          |   6 +
 src/gallium/drivers/radeonsi/si_shader.c        |  54 +++++++--
 src/gallium/drivers/radeonsi/si_shader.h        |   8 +-
 src/gallium/drivers/radeonsi/si_state_draw.c    |  15 +++
 src/gallium/drivers/radeonsi/si_state_shaders.c | 141 +++++++++++++++++++++++-
 8 files changed, 227 insertions(+), 49 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 981bccb..4dd4379 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -42,12 +42,6 @@
 #define NUM_USER_SGPRS 4
 #endif
 
-static const char *scratch_rsrc_dword0_symbol =
-	"SCRATCH_RSRC_DWORD0";
-
-static const char *scratch_rsrc_dword1_symbol =
-	"SCRATCH_RSRC_DWORD1";
-
 struct si_compute {
 	struct si_context *ctx;
 
@@ -68,8 +62,6 @@ struct si_compute {
 #endif
 };
 
-static void apply_scratch_relocs(const struct si_screen *sscreen,
-			struct si_shader *shader, uint64_t scratch_va);
 static void init_scratch_buffer(struct si_context *sctx, struct si_compute *program)
 {
 	unsigned scratch_bytes = 0;
@@ -85,8 +77,8 @@ static void init_scratch_buffer(struct si_context *sctx, struct si_compute *prog
 				program->shader.binary.global_symbol_offsets[i];
 		unsigned scratch_bytes_needed;
 
-		si_shader_binary_read_config(&program->shader.binary,
-						&program->shader, offset);
+		si_shader_binary_read_config(sctx->screen, &program->shader.binary,
+				&program->shader, offset);
 		scratch_bytes_needed = scratch_waves *
 				program->shader.scratch_bytes_per_wave;
 		scratch_bytes = MAX2(scratch_bytes, scratch_bytes_needed);
@@ -101,7 +93,8 @@ static void init_scratch_buffer(struct si_context *sctx, struct si_compute *prog
 	scratch_buffer_va = program->scratch_bo->gpu_address;
 
 	/* Patch the shader with the scratch buffer address. */
-	apply_scratch_relocs(sctx->screen, &program->shader, scratch_buffer_va);
+	si_shader_apply_scratch_relocs(sctx,
+				&program->shader, scratch_buffer_va);
 
 }
 
@@ -226,30 +219,6 @@ static unsigned compute_num_waves_for_scratch(
 	return scratch_waves;
 }
 
-static void apply_scratch_relocs(const struct si_screen *sscreen,
-			struct si_shader *shader, uint64_t scratch_va) {
-	unsigned i;
-	uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff;
-	uint32_t scratch_rsrc_dword1 =
-		S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
-		|  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
-
-	if (!shader->binary.reloc_count) {
-		return;
-	}
-
-	for (i = 0 ; i < shader->binary.reloc_count; i++) {
-		const struct radeon_shader_reloc *reloc = &shader->binary.relocs[i];
-		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
-			util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
-				&scratch_rsrc_dword0, 4);
-		} else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
-			util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
-				&scratch_rsrc_dword1, 4);
-		}
-	}
-}
-
 static void si_launch_grid(
 		struct pipe_context *ctx,
 		const uint *block_layout, const uint *grid_layout,
@@ -294,7 +263,8 @@ static void si_launch_grid(
 
 #if HAVE_LLVM >= 0x0306
 	/* Read the config information */
-	si_shader_binary_read_config(&program->shader.binary, shader, pc);
+	si_shader_binary_read_config(sctx->screen, &shader->binary,
+				shader, pc);
 #endif
 
 	/* Upload the kernel arguments */
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index 21c3ebf..1cacc26 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -160,4 +160,5 @@ void si_begin_new_cs(struct si_context *ctx)
 	ctx->last_prim = -1;
 	ctx->last_multi_vgt_param = -1;
 	ctx->last_rast_prim = -1;
+	ctx->emit_scratch_reloc = true;
 }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index e3f8fcf..3580367 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -46,6 +46,7 @@ static void si_destroy_context(struct pipe_context *context)
 	pipe_resource_reference(&sctx->gsvs_ring, NULL);
 	pipe_resource_reference(&sctx->null_const_buf.buffer, NULL);
 	r600_resource_reference(&sctx->border_color_table, NULL);
+	r600_resource_reference(&sctx->scratch_buffer, NULL);
 
 	si_pm4_free_state(sctx, sctx->init_config, ~0);
 	si_pm4_delete_state(sctx, gs_rings, sctx->gs_rings);
@@ -158,6 +159,12 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, void *
 				     sctx->null_const_buf.buffer->width0, 0, false);
 	}
 
+	/* XXX: This is the maximum value allowed.  I'm not sure how to compute
+	 * this for non-cs shaders.  Using the wrong value here can result in
+	 * GPU lockups, but the maximum value seems to always work.
+	 */
+	sctx->scratch_waves = 32 * sscreen->b.info.max_compute_units;
+
 	return &sctx->b.b;
 fail:
 	si_destroy_context(&sctx->b.b);
@@ -525,7 +532,7 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws)
 	r600_target = radeon_llvm_get_r600_target(triple);
 	sscreen->tm = LLVMCreateTargetMachine(r600_target, triple,
 				r600_get_llvm_processor_name(sscreen->b.family),
-				"+DumpCode", LLVMCodeGenLevelDefault, LLVMRelocDefault,
+				"+DumpCode,+vgpr-spilling", LLVMCodeGenLevelDefault, LLVMRelocDefault,
 				LLVMCodeModelDefault);
 #endif
 	return &sscreen->b.b;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index dfb1cd6..568958c 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -174,6 +174,7 @@ struct si_context {
 	struct si_buffer_resources	const_buffers[SI_NUM_SHADERS];
 	struct si_buffer_resources	rw_buffers[SI_NUM_SHADERS];
 	struct si_textures_info		samplers[SI_NUM_SHADERS];
+	struct r600_resource		*scratch_buffer;
 	struct r600_resource		*border_color_table;
 	unsigned			border_color_offset;
 
@@ -221,6 +222,11 @@ struct si_context {
 	int			last_prim;
 	int			last_multi_vgt_param;
 	int			last_rast_prim;
+
+	/* Scratch buffer */
+	boolean                 emit_scratch_reloc ;
+	unsigned		scratch_waves;
+	unsigned		spi_tmpring_size;
 };
 
 /* si_blit.c */
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 0256004..fea8c19 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -32,6 +32,7 @@
 #include "gallivm/lp_bld_logic.h"
 #include "gallivm/lp_bld_arit.h"
 #include "gallivm/lp_bld_flow.h"
+#include "radeon/r600_cs.h"
 #include "radeon/radeon_llvm.h"
 #include "radeon/radeon_elf_util.h"
 #include "radeon/radeon_llvm_emit.h"
@@ -46,6 +47,12 @@
 
 #include <errno.h>
 
+static const char *scratch_rsrc_dword0_symbol =
+	"SCRATCH_RSRC_DWORD0";
+
+static const char *scratch_rsrc_dword1_symbol =
+	"SCRATCH_RSRC_DWORD1";
+
 struct si_shader_output_values
 {
 	LLVMValueRef values[4];
@@ -2517,7 +2524,8 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
 	}
 }
 
-void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
+void si_shader_binary_read_config(const struct si_screen *sscreen,
+				const struct radeon_shader_binary *binary,
 				struct si_shader *shader,
 				unsigned symbol_offset)
 {
@@ -2549,6 +2557,7 @@ void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
 		case R_0286CC_SPI_PS_INPUT_ENA:
 			shader->spi_ps_input_ena = value;
 			break;
+		case R_0286E8_SPI_TMPRING_SIZE:
 		case R_00B860_COMPUTE_TMPRING_SIZE:
 			/* WAVESIZE is in units of 256 dwords. */
 			shader->scratch_bytes_per_wave =
@@ -2562,6 +2571,29 @@ void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
 	}
 }
 
+void si_shader_apply_scratch_relocs(struct si_context *sctx,
+			struct si_shader *shader,
+			uint64_t scratch_va)
+{
+	unsigned i;
+	uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff;
+	uint32_t scratch_rsrc_dword1 =
+		S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
+		|  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);
+
+	for (i = 0 ; i < shader->binary.reloc_count; i++) {
+		const struct radeon_shader_reloc *reloc =
+					&shader->binary.relocs[i];
+		if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name)) {
+			util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
+			&scratch_rsrc_dword0, 4);
+		} else if (!strcmp(scratch_rsrc_dword1_symbol, reloc->name)) {
+			util_memcpy_cpu_to_le32(shader->binary.code + reloc->offset,
+			&scratch_rsrc_dword1, 4);
+		}
+	}
+}
+
 int si_shader_binary_read(struct si_screen *sscreen,
 			struct si_shader *shader,
 			const struct radeon_shader_binary *binary)
@@ -2582,7 +2614,7 @@ int si_shader_binary_read(struct si_screen *sscreen,
 		}
 	}
 
-	si_shader_binary_read_config(binary, shader, 0);
+	si_shader_binary_read_config(sscreen, binary, shader, 0);
 
 	/* copy new shader */
 	code_size = binary->code_size + binary->rodata_size;
@@ -2610,18 +2642,24 @@ int si_compile_llvm(struct si_screen *sscreen, struct si_shader *shader,
 							LLVMModuleRef mod)
 {
 	int r = 0;
-	struct radeon_shader_binary binary;
 	bool dump = r600_can_dump_shader(&sscreen->b,
 			shader->selector ? shader->selector->tokens : NULL);
-	memset(&binary, 0, sizeof(binary));
-	r = radeon_llvm_compile(mod, &binary,
+	r = radeon_llvm_compile(mod, &shader->binary,
 		r600_get_llvm_processor_name(sscreen->b.family), dump, sscreen->tm);
 
 	if (r) {
 		return r;
 	}
-	r = si_shader_binary_read(sscreen, shader, &binary);
-	radeon_shader_binary_free_members(&binary, true);
+	r = si_shader_binary_read(sscreen, shader, &shader->binary);
+
+	FREE(shader->binary.config);
+	FREE(shader->binary.rodata);
+	FREE(shader->binary.global_symbol_offsets);
+	if (shader->scratch_bytes_per_wave == 0) {
+		FREE(shader->binary.code);
+		FREE(shader->binary.relocs);
+		memset(&shader->binary, 0, sizeof(shader->binary));
+	}
 	return r;
 }
 
@@ -2857,5 +2895,7 @@ void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader)
 	if (shader->gs_copy_shader)
 		si_shader_destroy(ctx, shader->gs_copy_shader);
 
+	FREE(shader->binary.code);
+	FREE(shader->binary.relocs);
 	r600_resource_reference(&shader->bo, NULL);
 }
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 4a8a04d..70769af 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -34,6 +34,7 @@
 #include "si_state.h"
 
 struct radeon_shader_binary;
+struct radeon_shader_reloc;
 
 #define SI_SGPR_RW_BUFFERS	0  /* rings (& stream-out, VS only) */
 #define SI_SGPR_CONST		2
@@ -147,6 +148,7 @@ struct si_shader {
 	unsigned			lds_size;
 	unsigned			spi_ps_input_ena;
 	unsigned			scratch_bytes_per_wave;
+	uint64_t			scratch_va;
 	unsigned			spi_shader_col_format;
 	unsigned			spi_shader_z_format;
 	unsigned			db_shader_control;
@@ -185,7 +187,11 @@ void si_shader_destroy(struct pipe_context *ctx, struct si_shader *shader);
 unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index);
 int si_shader_binary_read(struct si_screen *sscreen, struct si_shader *shader,
 		const struct radeon_shader_binary *binary);
-void si_shader_binary_read_config(const struct radeon_shader_binary *binary,
+void si_shader_apply_scratch_relocs(struct si_context *sctx,
+			struct si_shader *shader,
+			uint64_t scratch_va);
+void si_shader_binary_read_config(const struct si_screen *sscreen,
+				const struct radeon_shader_binary *binary,
 				struct si_shader *shader,
 				unsigned symbol_offset);
 
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index cd4880b..fbecf15 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -571,6 +571,21 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	if (sctx->b.flags)
 		sctx->atoms.s.cache_flush->dirty = true;
 
+
+	if (sctx->emit_scratch_reloc) {
+		struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+		r600_write_context_reg(cs, R_0286E8_SPI_TMPRING_SIZE,
+				sctx->spi_tmpring_size);
+
+		if (sctx->scratch_buffer) {
+			 r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
+				sctx->scratch_buffer, RADEON_USAGE_READWRITE,
+				RADEON_PRIO_SHADER_RESOURCE_RW);
+
+		}
+		sctx->emit_scratch_reloc = false;
+	}
+
 	si_need_cs_space(sctx, 0, TRUE);
 
 	/* Emit states. */
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 817a990..05c8d36 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -67,7 +67,8 @@ static void si_shader_es(struct si_shader *shader)
 		       S_00B328_VGPR_COMP_CNT(vgpr_comp_cnt) |
 		       S_00B328_DX10_CLAMP(shader->dx10_clamp_mode));
 	si_pm4_set_reg(pm4, R_00B32C_SPI_SHADER_PGM_RSRC2_ES,
-		       S_00B32C_USER_SGPR(num_user_sgprs));
+		       S_00B32C_USER_SGPR(num_user_sgprs) |
+		       S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
 }
 
 static void si_shader_gs(struct si_shader *shader)
@@ -136,7 +137,8 @@ static void si_shader_gs(struct si_shader *shader)
 		       S_00B228_SGPRS((num_sgprs - 1) / 8) |
 		       S_00B228_DX10_CLAMP(shader->dx10_clamp_mode));
 	si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS,
-		       S_00B22C_USER_SGPR(num_user_sgprs));
+		       S_00B22C_USER_SGPR(num_user_sgprs) |
+		       S_00B22C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
 }
 
 static void si_shader_vs(struct si_shader *shader)
@@ -216,7 +218,8 @@ static void si_shader_vs(struct si_shader *shader)
 		       S_00B12C_SO_BASE1_EN(!!shader->selector->so.stride[1]) |
 		       S_00B12C_SO_BASE2_EN(!!shader->selector->so.stride[2]) |
 		       S_00B12C_SO_BASE3_EN(!!shader->selector->so.stride[3]) |
-		       S_00B12C_SO_EN(!!shader->selector->so.num_outputs));
+		       S_00B12C_SO_EN(!!shader->selector->so.num_outputs) |
+		       S_00B12C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
 	if (window_space)
 		si_pm4_set_reg(pm4, R_028818_PA_CL_VTE_CNTL,
 			       S_028818_VTX_XY_FMT(1) | S_028818_VTX_Z_FMT(1));
@@ -307,7 +310,8 @@ static void si_shader_ps(struct si_shader *shader)
 		       S_00B028_DX10_CLAMP(shader->dx10_clamp_mode));
 	si_pm4_set_reg(pm4, R_00B02C_SPI_SHADER_PGM_RSRC2_PS,
 		       S_00B02C_EXTRA_LDS_SIZE(shader->lds_size) |
-		       S_00B02C_USER_SGPR(num_user_sgprs));
+		       S_00B02C_USER_SGPR(num_user_sgprs) |
+		       S_00B32C_SCRATCH_EN(shader->scratch_bytes_per_wave > 0));
 }
 
 static void si_shader_init_pm4_state(struct si_shader *shader)
@@ -706,6 +710,130 @@ static void si_init_gs_rings(struct si_context *sctx)
 			   false, false, 0, 0);
 }
 
+/**
+ * @returns 1 if \p sel has been updated to use a new scratch buffer and 0
+ *          otherwise.
+ */
+static unsigned si_update_scratch_buffer(struct si_context *sctx,
+				    struct si_shader_selector *sel)
+{
+	struct si_shader *shader;
+	uint64_t scratch_va = sctx->scratch_buffer->gpu_address;
+	unsigned char *ptr;
+
+	if (!sel)
+		return 0;
+
+	shader = sel->current;
+
+	/* This shader doesn't need a scratch buffer */
+	if (shader->scratch_bytes_per_wave == 0)
+		return 0;
+
+	/* This shader is already configured to use the current
+	 * scratch buffer. */
+	if (shader->scratch_va == scratch_va)
+		return 0;
+
+	assert(sctx->scratch_buffer);
+
+
+	si_shader_apply_scratch_relocs(sctx, shader, scratch_va);
+
+#if 0
+	/* XXX: This was my attempt to create a new bo for the updated shader.
+	 * It did not work, and I think this is because the shader code from the
+	 * old bo was what was being executed.
+	 */
+	r600_resource_reference(&shader->bo, NULL);
+
+	shader->bo = si_resource_create_custom(&sctx->screen->b.b, PIPE_USAGE_IMMUTABLE,
+					       shader->binary.code_size);
+	ptr = sctx->screen->b.ws->buffer_map(shader->bo->cs_buf, NULL, PIPE_TRANSFER_WRITE);
+	util_memcpy_cpu_to_le32(ptr, shader->binary.code, shader->binary.code_size);
+	sctx->screen->b.ws->buffer_unmap(shader->bo->cs_buf);
+        si_shader_init_pm4_state(shader);
+#else
+	ptr = sctx->screen->b.ws->buffer_map(shader->bo->cs_buf, NULL, PIPE_TRANSFER_WRITE);
+	util_memcpy_cpu_to_le32(ptr, shader->binary.code, shader->binary.code_size);
+	sctx->screen->b.ws->buffer_unmap(shader->bo->cs_buf);
+	sctx->b.flags |= SI_CONTEXT_INV_ICACHE;
+#endif
+
+	shader->scratch_va = scratch_va;
+
+	return 1;
+}
+
+static unsigned si_get_current_scratch_buffer_size(struct si_context *sctx)
+{
+	if (!sctx->scratch_buffer)
+		return 0;
+
+	return sctx->scratch_buffer->b.b.width0;
+}
+
+static unsigned si_get_scratch_buffer_bytes_per_wave(struct si_context *sctx,
+					struct si_shader_selector *sel)
+{
+	if (!sel)
+		return 0;
+
+	return sel->current->scratch_bytes_per_wave;
+}
+
+static unsigned si_get_max_scratch_bytes_per_wave(struct si_context *sctx)
+{
+
+	return MAX3(si_get_scratch_buffer_bytes_per_wave(sctx, sctx->ps_shader),
+			si_get_scratch_buffer_bytes_per_wave(sctx, sctx->gs_shader),
+			si_get_scratch_buffer_bytes_per_wave(sctx, sctx->vs_shader));
+}
+
+static void si_update_spi_tmpring_size(struct si_context *sctx)
+{
+	unsigned current_scratch_buffer_size =
+		si_get_current_scratch_buffer_size(sctx);
+	unsigned scratch_bytes_per_wave =
+		si_get_max_scratch_bytes_per_wave(sctx);
+	unsigned scratch_needed_size = scratch_bytes_per_wave *
+		sctx->scratch_waves;
+
+	if (scratch_needed_size > 0) {
+
+		if (scratch_needed_size > current_scratch_buffer_size) {
+
+			/* Create a bigger scratch buffer */
+			pipe_resource_reference(
+					(struct pipe_resource**)&sctx->scratch_buffer,
+					NULL);
+
+			sctx->scratch_buffer =
+					si_resource_create_custom(&sctx->screen->b.b,
+	                                PIPE_USAGE_DEFAULT, scratch_needed_size);
+		}
+
+		/* Update the shaders, so they are using the latest scratch.  The
+		 * scratch buffer may have been changed since these shaders were
+		 * last used, so we still need to try to update them, even if
+		 * they require scratch buffers smaller than the current size.
+		 */
+		if (si_update_scratch_buffer(sctx, sctx->ps_shader))
+			sctx->emitted.named.ps = NULL;
+		if (si_update_scratch_buffer(sctx, sctx->gs_shader))
+			sctx->emitted.named.gs = NULL;
+		if (si_update_scratch_buffer(sctx, sctx->vs_shader))
+			sctx->emitted.named.vs = NULL;
+	}
+
+	/* The LLVM shader backend should be reporting aligned scratch_sizes. */
+	assert((scratch_needed_size & ~0x3FF) == scratch_needed_size &&
+		"scratch size should already be aligned correctly.");
+
+	sctx->spi_tmpring_size = S_0286E8_WAVES(sctx->scratch_waves) |
+				S_0286E8_WAVESIZE(scratch_bytes_per_wave >> 10);
+}
+
 void si_update_shaders(struct si_context *sctx)
 {
 	struct pipe_context *ctx = (struct pipe_context*)sctx;
@@ -790,6 +918,11 @@ void si_update_shaders(struct si_context *sctx)
 		si_update_spi_map(sctx);
 	}
 
+	if (si_pm4_state_changed(sctx, ps) || si_pm4_state_changed(sctx, vs) ||
+	    si_pm4_state_changed(sctx, gs)) {
+		si_update_spi_tmpring_size(sctx);
+	}
+
 	if (sctx->ps_db_shader_control != sctx->ps_shader->current->db_shader_control) {
 		sctx->ps_db_shader_control = sctx->ps_shader->current->db_shader_control;
 		sctx->db_render_state.dirty = true;
-- 
2.0.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH] radeonsi: Enable VGPR spilling for all shader types v3

Reply via email to