Hi Emil,

I'm nominating the attached patches for 10.6. They are quite big, but
they fix a lot of graphical corruption on SI cards, especially in
games from Valve (Portal, CS:GO, etc). I've checked with piglit that
there are no regressions.

There was a comment on Phoronix forums that this might fix about a
half of radeonsi bugs in the bugzilla. (I sincerely hope that's true,
but I don't know)

Marek
From d4c55a4bc4f64cca22c5eb267d2e2dabaef5e67d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.ol...@amd.com>
Date: Mon, 15 Sep 2014 23:34:28 +0200
Subject: [PATCH 1/2] radeonsi: rework how shader pointers to descriptors are
 set
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For 10.6: This is a prerequisite for the next fix. The below comment is from
the original commit.

This is mainly needed for tessellation where a VS can be bound as VS, ES,
or LS, and TES (tess. evaluationshader) can be bound as VS or ES or neither.
Therefore we need the ability to move pointers to descriptors between
shaders arbitrarily.

The idea is that the context has a mapping from PIPE_SHADER_x to
SPI_SHADER_USER_DATA_x. After a shader is enabled or disabled,
si_shader_change_notify should be called to update this mapping accordingly.

There is a dirty flag for each shader pointer, but only one emit function
for all pointers in the whole context, whose code and logic is separated
from descriptors.

Reviewed-by: Michel Dänzer <michel.daen...@amd.com>
(cherry picked from commit 3ce91c727f2a00a05f414351266b0b45d677611e)
---
 src/gallium/drivers/radeonsi/si_descriptors.c   | 222 +++++++++++++++---------
 src/gallium/drivers/radeonsi/si_pipe.h          |   5 +-
 src/gallium/drivers/radeonsi/si_state.h         |  13 +-
 src/gallium/drivers/radeonsi/si_state_draw.c    |   3 +-
 src/gallium/drivers/radeonsi/si_state_shaders.c |   4 +
 5 files changed, 156 insertions(+), 91 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index bbfd36d..f31cccb 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -141,7 +141,7 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
 
 static void si_init_descriptors(struct si_context *sctx,
 				struct si_descriptors *desc,
-				unsigned shader_userdata_reg,
+				unsigned shader_userdata_index,
 				unsigned element_dw_size,
 				unsigned num_elements,
 				void (*emit_func)(struct si_context *ctx, struct r600_atom *state))
@@ -150,7 +150,7 @@ static void si_init_descriptors(struct si_context *sctx,
 	assert(num_elements <= sizeof(desc->dirty_mask)*8);
 
 	desc->atom.emit = (void*)emit_func;
-	desc->shader_userdata_reg = shader_userdata_reg;
+	desc->shader_userdata_offset = shader_userdata_index * 4;
 	desc->element_dw_size = element_dw_size;
 	desc->num_elements = num_elements;
 	desc->context_size = num_elements * element_dw_size * 4;
@@ -181,14 +181,11 @@ static void si_update_descriptors(struct si_context *sctx,
 	if (desc->dirty_mask) {
 		desc->atom.num_dw =
 			7 + /* copy */
-			(4 + desc->element_dw_size) * util_bitcount64(desc->dirty_mask) + /* update */
-			4; /* pointer update */
-
-		if (desc->shader_userdata_reg >= R_00B130_SPI_SHADER_USER_DATA_VS_0 &&
-		    desc->shader_userdata_reg < R_00B230_SPI_SHADER_USER_DATA_GS_0)
-			desc->atom.num_dw += 4; /* second pointer update */
+			(4 + desc->element_dw_size) * util_bitcount(desc->dirty_mask); /* update */
 
 		desc->atom.dirty = true;
+		desc->pointer_dirty = true;
+		sctx->shader_userdata.atom.dirty = true;
 
 		/* TODO: Investigate if these flushes can be removed after
 		 * adding CE support. */
@@ -206,32 +203,6 @@ static void si_update_descriptors(struct si_context *sctx,
 	}
 }
 
-static void si_emit_shader_pointer(struct si_context *sctx,
-				   struct r600_atom *atom)
-{
-	struct si_descriptors *desc = (struct si_descriptors*)atom;
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-	uint64_t va = desc->buffer->gpu_address +
-		      desc->current_context_id * desc->context_size +
-		      desc->buffer_offset;
-
-	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
-	radeon_emit(cs, (desc->shader_userdata_reg - SI_SH_REG_OFFSET) >> 2);
-	radeon_emit(cs, va);
-	radeon_emit(cs, va >> 32);
-
-	if (desc->shader_userdata_reg >= R_00B130_SPI_SHADER_USER_DATA_VS_0 &&
-	    desc->shader_userdata_reg < R_00B230_SPI_SHADER_USER_DATA_GS_0) {
-		radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
-		radeon_emit(cs, (desc->shader_userdata_reg +
-				 (R_00B330_SPI_SHADER_USER_DATA_ES_0 -
-				  R_00B130_SPI_SHADER_USER_DATA_VS_0) -
-				 SI_SH_REG_OFFSET) >> 2);
-		radeon_emit(cs, va);
-		radeon_emit(cs, va >> 32);
-	}
-}
-
 static void si_emit_descriptors(struct si_context *sctx,
 				struct si_descriptors *desc,
 				uint32_t **descriptors)
@@ -295,24 +266,6 @@ static void si_emit_descriptors(struct si_context *sctx,
 
 	desc->dirty_mask = 0;
 	desc->current_context_id = new_context_id;
-
-	/* Now update the shader userdata pointer. */
-	si_emit_shader_pointer(sctx, &desc->atom);
-}
-
-static unsigned si_get_shader_user_data_base(unsigned shader)
-{
-	switch (shader) {
-	case PIPE_SHADER_VERTEX:
-		return R_00B130_SPI_SHADER_USER_DATA_VS_0;
-	case PIPE_SHADER_GEOMETRY:
-		return R_00B230_SPI_SHADER_USER_DATA_GS_0;
-	case PIPE_SHADER_FRAGMENT:
-		return R_00B030_SPI_SHADER_USER_DATA_PS_0;
-	default:
-		assert(0);
-		return 0;
-	}
 }
 
 /* SAMPLER VIEWS */
@@ -325,14 +278,11 @@ static void si_emit_sampler_views(struct si_context *sctx, struct r600_atom *ato
 }
 
 static void si_init_sampler_views(struct si_context *sctx,
-				  struct si_sampler_views *views,
-				  unsigned shader)
+				  struct si_sampler_views *views)
 {
 	int i;
 
-	si_init_descriptors(sctx, &views->desc,
-			    si_get_shader_user_data_base(shader) +
-			    SI_SGPR_RESOURCE * 4,
+	si_init_descriptors(sctx, &views->desc, SI_SGPR_RESOURCE,
 			    8, SI_NUM_SAMPLER_VIEWS, si_emit_sampler_views);
 
 	for (i = 0; i < views->desc.num_elements; i++) {
@@ -384,8 +334,6 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx,
 
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
-
-	si_emit_shader_pointer(sctx, &views->desc.atom);
 }
 
 static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
@@ -493,7 +441,6 @@ static void si_sampler_states_begin_new_cs(struct si_context *sctx,
 {
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
-	si_emit_shader_pointer(sctx, &states->desc.atom);
 }
 
 void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
@@ -536,7 +483,7 @@ static void si_emit_buffer_resources(struct si_context *sctx, struct r600_atom *
 
 static void si_init_buffer_resources(struct si_context *sctx,
 				     struct si_buffer_resources *buffers,
-				     unsigned num_buffers, unsigned shader,
+				     unsigned num_buffers,
 				     unsigned shader_userdata_index,
 				     enum radeon_bo_usage shader_usage,
 				     enum radeon_bo_priority priority)
@@ -556,10 +503,8 @@ static void si_init_buffer_resources(struct si_context *sctx,
 		buffers->desc_data[i] = &buffers->desc_storage[i*4];
 	}
 
-	si_init_descriptors(sctx, &buffers->desc,
-			    si_get_shader_user_data_base(shader) +
-			    shader_userdata_index*4, 4, num_buffers,
-			    si_emit_buffer_resources);
+	si_init_descriptors(sctx, &buffers->desc, shader_userdata_index, 4,
+			    num_buffers, si_emit_buffer_resources);
 }
 
 static void si_release_buffer_resources(struct si_buffer_resources *buffers)
@@ -593,8 +538,6 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 			      buffers->desc.buffer, RADEON_USAGE_READWRITE,
 			      RADEON_PRIO_SHADER_DATA);
-
-	si_emit_shader_pointer(sctx, &buffers->desc.atom);
 }
 
 /* VERTEX BUFFERS */
@@ -620,8 +563,6 @@ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 			      desc->buffer, RADEON_USAGE_READ,
 			      RADEON_PRIO_SHADER_DATA);
-
-	si_emit_shader_pointer(sctx, &desc->atom);
 }
 
 void si_update_vertex_buffers(struct si_context *sctx)
@@ -693,13 +634,12 @@ void si_update_vertex_buffers(struct si_context *sctx)
 		}
 	}
 
-	desc->atom.num_dw = 8; /* update 2 shader pointers (VS+ES) */
-	desc->atom.dirty = true;
-
 	/* Don't flush the const cache. It would have a very negative effect
 	 * on performance (confirmed by testing). New descriptors are always
 	 * uploaded to a fresh new buffer, so I don't think flushing the const
 	 * cache is needed. */
+	desc->pointer_dirty = true;
+	sctx->shader_userdata.atom.dirty = true;
 }
 
 
@@ -1277,6 +1217,112 @@ void si_copy_buffer(struct si_context *sctx,
 		r600_resource(dst)->TC_L2_dirty = true;
 }
 
+/* SHADER USER DATA */
+
+static void si_mark_shader_pointers_dirty(struct si_context *sctx,
+					  unsigned shader)
+{
+	sctx->const_buffers[shader].desc.pointer_dirty = true;
+	sctx->rw_buffers[shader].desc.pointer_dirty = true;
+	sctx->samplers[shader].views.desc.pointer_dirty = true;
+	sctx->samplers[shader].states.desc.pointer_dirty = true;
+
+	if (shader == PIPE_SHADER_VERTEX)
+		sctx->vertex_buffers.pointer_dirty = true;
+
+	sctx->shader_userdata.atom.dirty = true;
+}
+
+static void si_shader_userdata_begin_new_cs(struct si_context *sctx)
+{
+	int i;
+
+	for (i = 0; i < SI_NUM_SHADERS; i++) {
+		si_mark_shader_pointers_dirty(sctx, i);
+	}
+}
+
+/* Set a base register address for user data constants in the given shader.
+ * This assigns a mapping from PIPE_SHADER_* to SPI_SHADER_USER_DATA_*.
+ */
+static void si_set_user_data_base(struct si_context *sctx,
+				  unsigned shader, uint32_t new_base)
+{
+	uint32_t *base = &sctx->shader_userdata.sh_base[shader];
+
+	if (*base != new_base) {
+		*base = new_base;
+
+		if (new_base)
+			si_mark_shader_pointers_dirty(sctx, shader);
+	}
+}
+
+/* This must be called when these shaders are changed from non-NULL to NULL
+ * and vice versa:
+ * - geometry shader
+ * - tessellation control shader
+ * - tessellation evaluation shader
+ */
+void si_shader_change_notify(struct si_context *sctx)
+{
+	/* VS can be bound as VS or ES. */
+	if (sctx->gs_shader)
+		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
+				      R_00B330_SPI_SHADER_USER_DATA_ES_0);
+	else
+		si_set_user_data_base(sctx, PIPE_SHADER_VERTEX,
+				      R_00B130_SPI_SHADER_USER_DATA_VS_0);
+}
+
+static void si_emit_shader_pointer(struct si_context *sctx,
+				   struct si_descriptors *desc,
+				   unsigned sh_base, bool keep_dirty)
+{
+	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
+	uint64_t va;
+
+	if (!desc->pointer_dirty)
+		return;
+
+	va = desc->buffer->gpu_address +
+	     desc->current_context_id * desc->context_size +
+	     desc->buffer_offset;
+
+	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
+	radeon_emit(cs, (sh_base + desc->shader_userdata_offset - SI_SH_REG_OFFSET) >> 2);
+	radeon_emit(cs, va);
+	radeon_emit(cs, va >> 32);
+
+	desc->pointer_dirty = keep_dirty;
+}
+
+static void si_emit_shader_userdata(struct si_context *sctx,
+				    struct r600_atom *atom)
+{
+	unsigned i;
+	uint32_t *sh_base = sctx->shader_userdata.sh_base;
+
+	/* The VS copy shader needs these for clipping, streamout, and rings. */
+	if (sctx->gs_shader) {
+		unsigned base = R_00B130_SPI_SHADER_USER_DATA_VS_0;
+		unsigned i = PIPE_SHADER_VERTEX;
+
+		si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, base, true);
+		si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, base, true);
+	}
+
+	for (i = 0; i < SI_NUM_SHADERS; i++) {
+		unsigned base = sh_base[i];
+
+		si_emit_shader_pointer(sctx, &sctx->const_buffers[i].desc, base, false);
+		si_emit_shader_pointer(sctx, &sctx->rw_buffers[i].desc, base, false);
+		si_emit_shader_pointer(sctx, &sctx->samplers[i].views.desc, base, false);
+		si_emit_shader_pointer(sctx, &sctx->samplers[i].states.desc, base, false);
+	}
+	si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false);
+}
+
 /* INIT/DEINIT */
 
 void si_init_all_descriptors(struct si_context *sctx)
@@ -1285,19 +1331,17 @@ void si_init_all_descriptors(struct si_context *sctx)
 
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
 		si_init_buffer_resources(sctx, &sctx->const_buffers[i],
-					 SI_NUM_CONST_BUFFERS, i, SI_SGPR_CONST,
+					 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST,
 					 RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
 		si_init_buffer_resources(sctx, &sctx->rw_buffers[i],
-					 i == PIPE_SHADER_VERTEX ?
-					 SI_NUM_RW_BUFFERS : SI_NUM_RING_BUFFERS,
-					 i, SI_SGPR_RW_BUFFERS,
+					 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
 					 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW);
 
-		si_init_sampler_views(sctx, &sctx->samplers[i].views, i);
+		si_init_sampler_views(sctx, &sctx->samplers[i].views);
 
 		si_init_descriptors(sctx, &sctx->samplers[i].states.desc,
-				    si_get_shader_user_data_base(i) + SI_SGPR_SAMPLER * 4,
-				    4, SI_NUM_SAMPLER_STATES, si_emit_sampler_states);
+				    SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES,
+				    si_emit_sampler_states);
 
 		sctx->atoms.s.const_buffers[i] = &sctx->const_buffers[i].desc.atom;
 		sctx->atoms.s.rw_buffers[i] = &sctx->rw_buffers[i].desc.atom;
@@ -1305,11 +1349,8 @@ void si_init_all_descriptors(struct si_context *sctx)
 		sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom;
 	}
 
-	si_init_descriptors(sctx, &sctx->vertex_buffers,
-			    si_get_shader_user_data_base(PIPE_SHADER_VERTEX) +
-			    SI_SGPR_VERTEX_BUFFER*4, 4, SI_NUM_VERTEX_BUFFERS,
-			    si_emit_shader_pointer);
-	sctx->atoms.s.vertex_buffers = &sctx->vertex_buffers.atom;
+	si_init_descriptors(sctx, &sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER,
+			    4, SI_NUM_VERTEX_BUFFERS, NULL);
 
 	/* Set pipe_context functions. */
 	sctx->b.b.set_constant_buffer = si_set_constant_buffer;
@@ -1317,6 +1358,18 @@ void si_init_all_descriptors(struct si_context *sctx)
 	sctx->b.b.set_stream_output_targets = si_set_streamout_targets;
 	sctx->b.clear_buffer = si_clear_buffer;
 	sctx->b.invalidate_buffer = si_invalidate_buffer;
+
+	/* Shader user data. */
+	sctx->atoms.s.shader_userdata = &sctx->shader_userdata.atom;
+	sctx->shader_userdata.atom.emit = (void*)si_emit_shader_userdata;
+
+	/* Upper bound, 4 pointers per shader, +1 for vertex buffers, +2 for the VS copy shader. */
+	sctx->shader_userdata.atom.num_dw = (SI_NUM_SHADERS * 4 + 1 + 2) * 4;
+
+	/* Set default and immutable mappings. */
+	si_set_user_data_base(sctx, PIPE_SHADER_VERTEX, R_00B130_SPI_SHADER_USER_DATA_VS_0);
+	si_set_user_data_base(sctx, PIPE_SHADER_GEOMETRY, R_00B230_SPI_SHADER_USER_DATA_GS_0);
+	si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
 }
 
 void si_release_all_descriptors(struct si_context *sctx)
@@ -1343,4 +1396,5 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx)
 		si_sampler_states_begin_new_cs(sctx, &sctx->samplers[i].states);
 	}
 	si_vertex_buffers_begin_new_cs(sctx);
+	si_shader_userdata_begin_new_cs(sctx);
 }
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index f98c7a8..3f00c96 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -125,8 +125,6 @@ struct si_framebuffer {
 
 #define SI_NUM_ATOMS(sctx) (sizeof((sctx)->atoms)/sizeof((sctx)->atoms.array[0]))
 
-#define SI_NUM_SHADERS (PIPE_SHADER_GEOMETRY+1)
-
 struct si_context {
 	struct r600_common_context	b;
 	struct blitter_context		*blitter;
@@ -141,7 +139,6 @@ struct si_context {
 	union {
 		struct {
 			/* The order matters. */
-			struct r600_atom *vertex_buffers;
 			struct r600_atom *const_buffers[SI_NUM_SHADERS];
 			struct r600_atom *rw_buffers[SI_NUM_SHADERS];
 			struct r600_atom *sampler_views[SI_NUM_SHADERS];
@@ -156,6 +153,7 @@ struct si_context {
 			struct r600_atom *db_render_state;
 			struct r600_atom *msaa_config;
 			struct r600_atom *clip_regs;
+			struct r600_atom *shader_userdata;
 		} s;
 		struct r600_atom *array[0];
 	} atoms;
@@ -169,6 +167,7 @@ struct si_context {
 	struct si_shader_selector	*gs_shader;
 	struct si_shader_selector	*vs_shader;
 	struct si_cs_shader_state	cs_shader_state;
+	struct si_shader_data		shader_userdata;
 	/* shader information */
 	unsigned			sprite_coord_enable;
 	bool				flatshade;
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 2f8a943..a2a007e 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -30,6 +30,8 @@
 #include "si_pm4.h"
 #include "radeon/r600_pipe_common.h"
 
+#define SI_NUM_SHADERS (PIPE_SHADER_GEOMETRY+1)
+
 struct si_screen;
 struct si_shader;
 
@@ -111,6 +113,11 @@ union si_state {
 	struct si_pm4_state	*array[0];
 };
 
+struct si_shader_data {
+	struct r600_atom	atom;
+	uint32_t		sh_base[SI_NUM_SHADERS];
+};
+
 #define SI_NUM_USER_SAMPLERS            16 /* AKA OpenGL textures units per shader */
 #define SI_POLY_STIPPLE_SAMPLER         SI_NUM_USER_SAMPLERS
 #define SI_NUM_SAMPLERS                 (SI_POLY_STIPPLE_SAMPLER + 1)
@@ -172,9 +179,10 @@ struct si_descriptors {
 	/* The size of a context, should be equal to 4*element_dw_size*num_elements. */
 	unsigned context_size;
 
-	/* The shader userdata register where the 64-bit pointer to the descriptor
+	/* The shader userdata offset within a shader where the 64-bit pointer to the descriptor
 	 * array will be stored. */
-	unsigned shader_userdata_reg;
+	unsigned shader_userdata_offset;
+	bool pointer_dirty;
 };
 
 struct si_sampler_views {
@@ -246,6 +254,7 @@ void si_copy_buffer(struct si_context *sctx,
 		    uint64_t dst_offset, uint64_t src_offset, unsigned size, bool is_framebuffer);
 void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer,
 			    const uint8_t *ptr, unsigned size, uint32_t *const_offset);
+void si_shader_change_notify(struct si_context *sctx);
 
 /* si_state.c */
 struct si_shader_selector;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index e85ed15..abfae1c 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -249,8 +249,7 @@ static void si_emit_draw_packets(struct si_context *sctx,
 				 const struct pipe_index_buffer *ib)
 {
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-	unsigned sh_base_reg = (sctx->gs_shader ? R_00B330_SPI_SHADER_USER_DATA_ES_0 :
-						  R_00B130_SPI_SHADER_USER_DATA_VS_0);
+	unsigned sh_base_reg = sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX];
 
 	if (info->count_from_stream_output) {
 		struct r600_so_target *t =
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 16941e3..ce0d195 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -535,6 +535,7 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
 	struct si_shader_selector *sel = state;
+	bool enable_changed = !!sctx->gs_shader != !!sel;
 
 	if (sctx->gs_shader == sel)
 		return;
@@ -542,6 +543,9 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
 	sctx->gs_shader = sel;
 	sctx->clip_regs.dirty = true;
 	sctx->last_rast_prim = -1; /* reset this so that it gets updated */
+
+	if (enable_changed)
+		si_shader_change_notify(sctx);
 }
 
 static void si_make_dummy_ps(struct si_context *sctx)
-- 
2.1.4

From cc86439146dd158f511458b7972260ffa7b7a7cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.ol...@amd.com>
Date: Sat, 25 Jul 2015 00:53:16 +0200
Subject: [PATCH 2/2] radeonsi: completely rework updating descriptors without
 CP DMA
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For 10.6: This fixes graphical corruption occuring on most Southern Islands
Radeon GPUs. This will allow closing a lot of bugs in the bugzilla.

The patch has a better explanation. Just a summary here:
- The CPU always uploads a whole descriptor array to previously-unused memory.
- CP DMA isn't used.
- No caches need to be flushed.
- All descriptors are always up-to-date in memory even after a hang, because
  CP DMA doesn't serve as a middle man to update them.

This should bring:
- better hang recovery (descriptors are always up-to-date)
- better GPU performance (no KCACHE and TC flushes)
- worse CPU performance for partial updates (only whole arrays are uploaded)
- less used IB space (no CP_DMA and WRITE_DATA packets)
- simpler code
- corruption issues are fixed on SI cards

Reviewed-by: Michel Dänzer <michel.daen...@amd.com>
(cherry picked from commit b0528118dfb1af00e7d08cdb637191b80c14c2ba)
---
 src/gallium/drivers/radeonsi/si_descriptors.c | 354 +++++++++-----------------
 src/gallium/drivers/radeonsi/si_pipe.h        |   6 -
 src/gallium/drivers/radeonsi/si_state.h       |  32 +--
 src/gallium/drivers/radeonsi/si_state_draw.c  |   7 +-
 4 files changed, 128 insertions(+), 271 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index f31cccb..f6d2b2c 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -24,14 +24,23 @@
  *      Marek Olšák <marek.ol...@amd.com>
  */
 
-/* Resource binding slots and sampler states (each described with 8 or 4 dwords)
- * live in memory on SI.
+/* Resource binding slots and sampler states (each described with 8 or
+ * 4 dwords) are stored in lists in memory which is accessed by shaders
+ * using scalar load instructions.
  *
- * This file is responsible for managing lists of resources and sampler states
- * in memory and binding them, which means updating those structures in memory.
+ * This file is responsible for managing such lists. It keeps a copy of all
+ * descriptors in CPU memory and re-uploads a whole list if some slots have
+ * been changed.
  *
- * There is also code for updating shader pointers to resources and sampler
- * states. CP DMA functions are here too.
+ * This code is also reponsible for updating shader pointers to those lists.
+ *
+ * Note that CP DMA can't be used for updating the lists, because a GPU hang
+ * could leave the list in a mid-IB state and the next IB would get wrong
+ * descriptors and the whole context would be unusable at that point.
+ * (Note: The register shadowing can't be used due to the same reason)
+ *
+ * Also, uploading descriptors to newly allocated memory doesn't require
+ * a KCACHE flush.
  */
 
 #include "radeon/r600_cs.h"
@@ -42,7 +51,6 @@
 #include "util/u_memory.h"
 #include "util/u_upload_mgr.h"
 
-#define SI_NUM_CONTEXTS 16
 
 /* NULL image and buffer descriptor.
  *
@@ -139,159 +147,62 @@ static void si_emit_cp_dma_clear_buffer(struct si_context *sctx,
 	}
 }
 
-static void si_init_descriptors(struct si_context *sctx,
-				struct si_descriptors *desc,
+static void si_init_descriptors(struct si_descriptors *desc,
 				unsigned shader_userdata_index,
 				unsigned element_dw_size,
-				unsigned num_elements,
-				void (*emit_func)(struct si_context *ctx, struct r600_atom *state))
+				unsigned num_elements)
 {
+	int i;
+
 	assert(num_elements <= sizeof(desc->enabled_mask)*8);
-	assert(num_elements <= sizeof(desc->dirty_mask)*8);
 
-	desc->atom.emit = (void*)emit_func;
-	desc->shader_userdata_offset = shader_userdata_index * 4;
+	desc->list = CALLOC(num_elements, element_dw_size * 4);
 	desc->element_dw_size = element_dw_size;
 	desc->num_elements = num_elements;
-	desc->context_size = num_elements * element_dw_size * 4;
-
-	desc->buffer = (struct r600_resource*)
-		pipe_buffer_create(sctx->b.b.screen, PIPE_BIND_CUSTOM,
-				   PIPE_USAGE_DEFAULT,
-				   SI_NUM_CONTEXTS * desc->context_size);
-
-	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
-			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
+	desc->list_dirty = true; /* upload the list before the next draw */
+	desc->shader_userdata_offset = shader_userdata_index * 4;
 
-	/* We don't check for CS space here, because this should be called
-	 * only once at context initialization. */
-	si_emit_cp_dma_clear_buffer(sctx, desc->buffer->gpu_address,
-				    desc->buffer->b.b.width0, 0,
-				    R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);
+	/* Initialize the array to NULL descriptors if the element size is 8. */
+	if (element_dw_size == 8)
+		for (i = 0; i < num_elements; i++)
+			memcpy(desc->list + i*element_dw_size, null_descriptor,
+			       sizeof(null_descriptor));
 }
 
 static void si_release_descriptors(struct si_descriptors *desc)
 {
 	pipe_resource_reference((struct pipe_resource**)&desc->buffer, NULL);
+	FREE(desc->list);
 }
 
-static void si_update_descriptors(struct si_context *sctx,
+static bool si_upload_descriptors(struct si_context *sctx,
 				  struct si_descriptors *desc)
 {
-	if (desc->dirty_mask) {
-		desc->atom.num_dw =
-			7 + /* copy */
-			(4 + desc->element_dw_size) * util_bitcount(desc->dirty_mask); /* update */
-
-		desc->atom.dirty = true;
-		desc->pointer_dirty = true;
-		sctx->shader_userdata.atom.dirty = true;
-
-		/* TODO: Investigate if these flushes can be removed after
-		 * adding CE support. */
-
-		/* The descriptors are read with the K cache. */
-		sctx->b.flags |= SI_CONTEXT_INV_KCACHE;
-
-		/* Since SI uses uncached CP DMA to update descriptors,
-		 * we have to flush TC L2, which is used to fetch constants
-		 * along with KCACHE. */
-		if (sctx->b.chip_class == SI)
-			sctx->b.flags |= SI_CONTEXT_INV_TC_L2;
-	} else {
-		desc->atom.dirty = false;
-	}
-}
+	unsigned list_size = desc->num_elements * desc->element_dw_size * 4;
+	void *ptr;
 
-static void si_emit_descriptors(struct si_context *sctx,
-				struct si_descriptors *desc,
-				uint32_t **descriptors)
-{
-	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
-	uint64_t va_base;
-	int packet_start = 0;
-	int packet_size = 0;
-	int last_index = desc->num_elements; /* point to a non-existing element */
-	uint64_t dirty_mask = desc->dirty_mask;
-	unsigned new_context_id = (desc->current_context_id + 1) % SI_NUM_CONTEXTS;
-
-	assert(dirty_mask);
-
-	va_base = desc->buffer->gpu_address;
-
-	/* Copy the descriptors to a new context slot. */
-	si_emit_cp_dma_copy_buffer(sctx,
-				   va_base + new_context_id * desc->context_size,
-				   va_base + desc->current_context_id * desc->context_size,
-				   desc->context_size, R600_CP_DMA_SYNC | CIK_CP_DMA_USE_L2);
-
-	va_base += new_context_id * desc->context_size;
-
-	/* Update the descriptors.
-	 * Updates of consecutive descriptors are merged to one WRITE_DATA packet.
-	 *
-	 * XXX When unbinding lots of resources, consider clearing the memory
-	 *     with CP DMA instead of emitting zeros.
-	 */
-	while (dirty_mask) {
-		int i = u_bit_scan64(&dirty_mask);
-
-		assert(i < desc->num_elements);
+	if (!desc->list_dirty)
+		return true;
 
-		if (last_index+1 == i && packet_size) {
-			/* Append new data at the end of the last packet. */
-			packet_size += desc->element_dw_size;
-			cs->buf[packet_start] = PKT3(PKT3_WRITE_DATA, packet_size, 0);
-		} else {
-			/* Start a new packet. */
-			uint64_t va = va_base + i * desc->element_dw_size * 4;
-
-			packet_start = cs->cdw;
-			packet_size = 2 + desc->element_dw_size;
-
-			radeon_emit(cs, PKT3(PKT3_WRITE_DATA, packet_size, 0));
-			radeon_emit(cs, PKT3_WRITE_DATA_DST_SEL(sctx->b.chip_class == SI ?
-						PKT3_WRITE_DATA_DST_SEL_MEM_SYNC :
-						PKT3_WRITE_DATA_DST_SEL_TC_L2) |
-					     PKT3_WRITE_DATA_WR_CONFIRM |
-					     PKT3_WRITE_DATA_ENGINE_SEL(PKT3_WRITE_DATA_ENGINE_SEL_ME));
-			radeon_emit(cs, va & 0xFFFFFFFFUL);
-			radeon_emit(cs, (va >> 32UL) & 0xFFFFFFFFUL);
-		}
+	u_upload_alloc(sctx->b.uploader, 0, list_size,
+		       &desc->buffer_offset,
+		       (struct pipe_resource**)&desc->buffer, &ptr);
+	if (!desc->buffer)
+		return false; /* skip the draw call */
 
-		radeon_emit_array(cs, descriptors[i], desc->element_dw_size);
+	util_memcpy_cpu_to_le32(ptr, desc->list, list_size);
 
-		last_index = i;
-	}
+	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, desc->buffer,
+			      RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
 
-	desc->dirty_mask = 0;
-	desc->current_context_id = new_context_id;
+	desc->list_dirty = false;
+	desc->pointer_dirty = true;
+	sctx->shader_userdata.atom.dirty = true;
+	return true;
 }
 
 /* SAMPLER VIEWS */
 
-static void si_emit_sampler_views(struct si_context *sctx, struct r600_atom *atom)
-{
-	struct si_sampler_views *views = (struct si_sampler_views*)atom;
-
-	si_emit_descriptors(sctx, &views->desc, views->desc_data);
-}
-
-static void si_init_sampler_views(struct si_context *sctx,
-				  struct si_sampler_views *views)
-{
-	int i;
-
-	si_init_descriptors(sctx, &views->desc, SI_SGPR_RESOURCE,
-			    8, SI_NUM_SAMPLER_VIEWS, si_emit_sampler_views);
-
-	for (i = 0; i < views->desc.num_elements; i++) {
-		views->desc_data[i] = null_descriptor;
-		views->desc.dirty_mask |= 1llu << i;
-	}
-	si_update_descriptors(sctx, &views->desc);
-}
-
 static void si_release_sampler_views(struct si_sampler_views *views)
 {
 	int i;
@@ -332,6 +243,8 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx,
 				      si_get_resource_ro_priority(rview->resource));
 	}
 
+	if (!views->desc.buffer)
+		return;
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, views->desc.buffer,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
 }
@@ -354,17 +267,16 @@ static void si_set_sampler_view(struct si_context *sctx, unsigned shader,
 				rview->resource, RADEON_USAGE_READ,
 				si_get_resource_ro_priority(rview->resource));
 
-
 		pipe_sampler_view_reference(&views->views[slot], view);
-		views->desc_data[slot] = view_desc;
+		memcpy(views->desc.list + slot*8, view_desc, 8*4);
 		views->desc.enabled_mask |= 1llu << slot;
 	} else {
 		pipe_sampler_view_reference(&views->views[slot], NULL);
-		views->desc_data[slot] = null_descriptor;
+		memcpy(views->desc.list + slot*8, null_descriptor, 8*4);
 		views->desc.enabled_mask &= ~(1llu << slot);
 	}
 
-	views->desc.dirty_mask |= 1llu << slot;
+	views->desc.list_dirty = true;
 }
 
 static void si_set_sampler_views(struct pipe_context *ctx,
@@ -423,22 +335,15 @@ static void si_set_sampler_views(struct pipe_context *ctx,
 					    NULL, NULL);
 		}
 	}
-
-	si_update_descriptors(sctx, &samplers->views.desc);
 }
 
 /* SAMPLER STATES */
 
-static void si_emit_sampler_states(struct si_context *sctx, struct r600_atom *atom)
-{
-	struct si_sampler_states *states = (struct si_sampler_states*)atom;
-
-	si_emit_descriptors(sctx, &states->desc, states->desc_data);
-}
-
 static void si_sampler_states_begin_new_cs(struct si_context *sctx,
 					   struct si_sampler_states *states)
 {
+	if (!states->desc.buffer)
+		return;
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx, states->desc.buffer,
 			      RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_DATA);
 }
@@ -460,64 +365,39 @@ void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
 	for (i = 0; i < count; i++) {
 		unsigned slot = start + i;
 
-		if (!sstates[i]) {
-			samplers->desc.dirty_mask &= ~(1llu << slot);
+		if (!sstates[i])
 			continue;
-		}
 
-		samplers->desc_data[slot] = sstates[i]->val;
-		samplers->desc.dirty_mask |= 1llu << slot;
+		memcpy(samplers->desc.list + slot*4, sstates[i]->val, 4*4);
+		samplers->desc.list_dirty = true;
 	}
-
-	si_update_descriptors(sctx, &samplers->desc);
 }
 
 /* BUFFER RESOURCES */
 
-static void si_emit_buffer_resources(struct si_context *sctx, struct r600_atom *atom)
-{
-	struct si_buffer_resources *buffers = (struct si_buffer_resources*)atom;
-
-	si_emit_descriptors(sctx, &buffers->desc, buffers->desc_data);
-}
-
-static void si_init_buffer_resources(struct si_context *sctx,
-				     struct si_buffer_resources *buffers,
+static void si_init_buffer_resources(struct si_buffer_resources *buffers,
 				     unsigned num_buffers,
 				     unsigned shader_userdata_index,
 				     enum radeon_bo_usage shader_usage,
 				     enum radeon_bo_priority priority)
 {
-	int i;
-
-	buffers->num_buffers = num_buffers;
 	buffers->shader_usage = shader_usage;
 	buffers->priority = priority;
 	buffers->buffers = CALLOC(num_buffers, sizeof(struct pipe_resource*));
-	buffers->desc_storage = CALLOC(num_buffers, sizeof(uint32_t) * 4);
 
-	/* si_emit_descriptors only accepts an array of arrays.
-	 * This adds such an array. */
-	buffers->desc_data = CALLOC(num_buffers, sizeof(uint32_t*));
-	for (i = 0; i < num_buffers; i++) {
-		buffers->desc_data[i] = &buffers->desc_storage[i*4];
-	}
-
-	si_init_descriptors(sctx, &buffers->desc, shader_userdata_index, 4,
-			    num_buffers, si_emit_buffer_resources);
+	si_init_descriptors(&buffers->desc, shader_userdata_index, 4,
+			    num_buffers);
 }
 
 static void si_release_buffer_resources(struct si_buffer_resources *buffers)
 {
 	int i;
 
-	for (i = 0; i < buffers->num_buffers; i++) {
+	for (i = 0; i < buffers->desc.num_elements; i++) {
 		pipe_resource_reference(&buffers->buffers[i], NULL);
 	}
 
 	FREE(buffers->buffers);
-	FREE(buffers->desc_storage);
-	FREE(buffers->desc_data);
 	si_release_descriptors(&buffers->desc);
 }
 
@@ -535,6 +415,8 @@ static void si_buffer_resources_begin_new_cs(struct si_context *sctx,
 				      buffers->shader_usage, buffers->priority);
 	}
 
+	if (!buffers->desc.buffer)
+		return;
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 			      buffers->desc.buffer, RADEON_USAGE_READWRITE,
 			      RADEON_PRIO_SHADER_DATA);
@@ -560,12 +442,15 @@ static void si_vertex_buffers_begin_new_cs(struct si_context *sctx)
 				      (struct r600_resource*)sctx->vertex_buffer[vb].buffer,
 				      RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
 	}
+
+	if (!desc->buffer)
+		return;
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 			      desc->buffer, RADEON_USAGE_READ,
 			      RADEON_PRIO_SHADER_DATA);
 }
 
-void si_update_vertex_buffers(struct si_context *sctx)
+static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 {
 	struct si_descriptors *desc = &sctx->vertex_buffers;
 	bool bound[SI_NUM_VERTEX_BUFFERS] = {};
@@ -573,8 +458,10 @@ void si_update_vertex_buffers(struct si_context *sctx)
 	uint64_t va;
 	uint32_t *ptr;
 
+	if (!sctx->vertex_buffers_dirty)
+		return true;
 	if (!count || !sctx->vertex_elements)
-		return;
+		return true;
 
 	/* Vertex buffer descriptors are the only ones which are uploaded
 	 * directly through a staging buffer and don't go through
@@ -582,13 +469,14 @@ void si_update_vertex_buffers(struct si_context *sctx)
 	 */
 	u_upload_alloc(sctx->b.uploader, 0, count * 16, &desc->buffer_offset,
 		       (struct pipe_resource**)&desc->buffer, (void**)&ptr);
+	if (!desc->buffer)
+		return false;
 
 	r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 			      desc->buffer, RADEON_USAGE_READ,
 			      RADEON_PRIO_SHADER_DATA);
 
 	assert(count <= SI_NUM_VERTEX_BUFFERS);
-	assert(desc->current_context_id == 0);
 
 	for (i = 0; i < count; i++) {
 		struct pipe_vertex_element *ve = &sctx->vertex_elements->elements[i];
@@ -640,6 +528,8 @@ void si_update_vertex_buffers(struct si_context *sctx)
 	 * cache is needed. */
 	desc->pointer_dirty = true;
 	sctx->shader_userdata.atom.dirty = true;
+	sctx->vertex_buffers_dirty = false;
+	return true;
 }
 
 
@@ -664,7 +554,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
 	if (shader >= SI_NUM_SHADERS)
 		return;
 
-	assert(slot < buffers->num_buffers);
+	assert(slot < buffers->desc.num_elements);
 	pipe_resource_reference(&buffers->buffers[slot], NULL);
 
 	/* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
@@ -691,7 +581,7 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
 		}
 
 		/* Set the descriptor. */
-		uint32_t *desc = buffers->desc_data[slot];
+		uint32_t *desc = buffers->desc.list + slot*4;
 		desc[0] = va;
 		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
 			  S_008F04_STRIDE(0);
@@ -710,12 +600,11 @@ static void si_set_constant_buffer(struct pipe_context *ctx, uint shader, uint s
 		buffers->desc.enabled_mask |= 1llu << slot;
 	} else {
 		/* Clear the descriptor. */
-		memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
+		memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
 		buffers->desc.enabled_mask &= ~(1llu << slot);
 	}
 
-	buffers->desc.dirty_mask |= 1llu << slot;
-	si_update_descriptors(sctx, &buffers->desc);
+	buffers->desc.list_dirty = true;
 }
 
 /* RING BUFFERS */
@@ -735,7 +624,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 	/* The stride field in the resource descriptor has 14 bits */
 	assert(stride < (1 << 14));
 
-	assert(slot < buffers->num_buffers);
+	assert(slot < buffers->desc.num_elements);
 	pipe_resource_reference(&buffers->buffers[slot], NULL);
 
 	if (buffer) {
@@ -780,7 +669,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 		}
 
 		/* Set the descriptor. */
-		uint32_t *desc = buffers->desc_data[slot];
+		uint32_t *desc = buffers->desc.list + slot*4;
 		desc[0] = va;
 		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
 			  S_008F04_STRIDE(stride) |
@@ -803,12 +692,11 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 		buffers->desc.enabled_mask |= 1llu << slot;
 	} else {
 		/* Clear the descriptor. */
-		memset(buffers->desc_data[slot], 0, sizeof(uint32_t) * 4);
+		memset(buffers->desc.list + slot*4, 0, sizeof(uint32_t) * 4);
 		buffers->desc.enabled_mask &= ~(1llu << slot);
 	}
 
-	buffers->desc.dirty_mask |= 1llu << slot;
-	si_update_descriptors(sctx, &buffers->desc);
+	buffers->desc.list_dirty = true;
 }
 
 /* STREAMOUT BUFFERS */
@@ -870,7 +758,7 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 			uint64_t va = r600_resource(buffer)->gpu_address;
 
 			/* Set the descriptor. */
-			uint32_t *desc = buffers->desc_data[bufidx];
+			uint32_t *desc = buffers->desc.list + bufidx*4;
 			desc[0] = va;
 			desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32);
 			desc[2] = 0xffffffff;
@@ -888,24 +776,22 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 			buffers->desc.enabled_mask |= 1llu << bufidx;
 		} else {
 			/* Clear the descriptor and unset the resource. */
-			memset(buffers->desc_data[bufidx], 0,
+			memset(buffers->desc.list + bufidx*4, 0,
 			       sizeof(uint32_t) * 4);
 			pipe_resource_reference(&buffers->buffers[bufidx],
 						NULL);
 			buffers->desc.enabled_mask &= ~(1llu << bufidx);
 		}
-		buffers->desc.dirty_mask |= 1llu << bufidx;
 	}
 	for (; i < old_num_targets; i++) {
 		bufidx = SI_SO_BUF_OFFSET + i;
 		/* Clear the descriptor and unset the resource. */
-		memset(buffers->desc_data[bufidx], 0, sizeof(uint32_t) * 4);
+		memset(buffers->desc.list + bufidx*4, 0, sizeof(uint32_t) * 4);
 		pipe_resource_reference(&buffers->buffers[bufidx], NULL);
 		buffers->desc.enabled_mask &= ~(1llu << bufidx);
-		buffers->desc.dirty_mask |= 1llu << bufidx;
 	}
 
-	si_update_descriptors(sctx, &buffers->desc);
+	buffers->desc.list_dirty = true;
 }
 
 static void si_desc_reset_buffer_offset(struct pipe_context *ctx,
@@ -974,22 +860,19 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 	/* Read/Write buffers. */
 	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
 		struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
-		bool found = false;
 		uint64_t mask = buffers->desc.enabled_mask;
 
 		while (mask) {
 			i = u_bit_scan64(&mask);
 			if (buffers->buffers[i] == buf) {
-				si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
+				si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4,
 							    old_va, buf);
+				buffers->desc.list_dirty = true;
 
 				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 						      rbuffer, buffers->shader_usage,
 						      buffers->priority);
 
-				buffers->desc.dirty_mask |= 1llu << i;
-				found = true;
-
 				if (i >= SI_SO_BUF_OFFSET && shader == PIPE_SHADER_VERTEX) {
 					/* Update the streamout state. */
 					if (sctx->b.streamout.begin_emitted) {
@@ -1001,34 +884,25 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 				}
 			}
 		}
-		if (found) {
-			si_update_descriptors(sctx, &buffers->desc);
-		}
 	}
 
 	/* Constant buffers. */
 	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
 		struct si_buffer_resources *buffers = &sctx->const_buffers[shader];
-		bool found = false;
 		uint64_t mask = buffers->desc.enabled_mask;
 
 		while (mask) {
 			unsigned i = u_bit_scan64(&mask);
 			if (buffers->buffers[i] == buf) {
-				si_desc_reset_buffer_offset(ctx, buffers->desc_data[i],
+				si_desc_reset_buffer_offset(ctx, buffers->desc.list + i*4,
 							    old_va, buf);
+				buffers->desc.list_dirty = true;
 
 				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 						      rbuffer, buffers->shader_usage,
 						      buffers->priority);
-
-				buffers->desc.dirty_mask |= 1llu << i;
-				found = true;
 			}
 		}
-		if (found) {
-			si_update_descriptors(sctx, &buffers->desc);
-		}
 	}
 
 	/* Texture buffers - update virtual addresses in sampler view descriptors. */
@@ -1040,23 +914,20 @@ static void si_invalidate_buffer(struct pipe_context *ctx, struct pipe_resource
 	/* Texture buffers - update bindings. */
 	for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
 		struct si_sampler_views *views = &sctx->samplers[shader].views;
-		bool found = false;
 		uint64_t mask = views->desc.enabled_mask;
 
 		while (mask) {
 			unsigned i = u_bit_scan64(&mask);
 			if (views->views[i]->texture == buf) {
+				si_desc_reset_buffer_offset(ctx, views->desc.list + i*8+4,
+							    old_va, buf);
+				views->desc.list_dirty = true;
+
 				r600_context_bo_reloc(&sctx->b, &sctx->b.rings.gfx,
 						      rbuffer, RADEON_USAGE_READ,
 						      RADEON_PRIO_SHADER_BUFFER_RO);
-
-				views->desc.dirty_mask |= 1llu << i;
-				found = true;
 			}
 		}
-		if (found) {
-			si_update_descriptors(sctx, &views->desc);
-		}
 	}
 }
 
@@ -1282,11 +1153,10 @@ static void si_emit_shader_pointer(struct si_context *sctx,
 	struct radeon_winsys_cs *cs = sctx->b.rings.gfx.cs;
 	uint64_t va;
 
-	if (!desc->pointer_dirty)
+	if (!desc->pointer_dirty || !desc->buffer)
 		return;
 
 	va = desc->buffer->gpu_address +
-	     desc->current_context_id * desc->context_size +
 	     desc->buffer_offset;
 
 	radeon_emit(cs, PKT3(PKT3_SET_SH_REG, 2, 0));
@@ -1323,34 +1193,28 @@ static void si_emit_shader_userdata(struct si_context *sctx,
 	si_emit_shader_pointer(sctx, &sctx->vertex_buffers, sh_base[PIPE_SHADER_VERTEX], false);
 }
 
-/* INIT/DEINIT */
+/* INIT/DEINIT/UPLOAD */
 
 void si_init_all_descriptors(struct si_context *sctx)
 {
 	int i;
 
 	for (i = 0; i < SI_NUM_SHADERS; i++) {
-		si_init_buffer_resources(sctx, &sctx->const_buffers[i],
+		si_init_buffer_resources(&sctx->const_buffers[i],
 					 SI_NUM_CONST_BUFFERS, SI_SGPR_CONST,
 					 RADEON_USAGE_READ, RADEON_PRIO_SHADER_BUFFER_RO);
-		si_init_buffer_resources(sctx, &sctx->rw_buffers[i],
+		si_init_buffer_resources(&sctx->rw_buffers[i],
 					 SI_NUM_RW_BUFFERS, SI_SGPR_RW_BUFFERS,
 					 RADEON_USAGE_READWRITE, RADEON_PRIO_SHADER_RESOURCE_RW);
 
-		si_init_sampler_views(sctx, &sctx->samplers[i].views);
-
-		si_init_descriptors(sctx, &sctx->samplers[i].states.desc,
-				    SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES,
-				    si_emit_sampler_states);
-
-		sctx->atoms.s.const_buffers[i] = &sctx->const_buffers[i].desc.atom;
-		sctx->atoms.s.rw_buffers[i] = &sctx->rw_buffers[i].desc.atom;
-		sctx->atoms.s.sampler_views[i] = &sctx->samplers[i].views.desc.atom;
-		sctx->atoms.s.sampler_states[i] = &sctx->samplers[i].states.desc.atom;
+		si_init_descriptors(&sctx->samplers[i].views.desc,
+				    SI_SGPR_RESOURCE, 8, SI_NUM_SAMPLER_VIEWS);
+		si_init_descriptors(&sctx->samplers[i].states.desc,
+				    SI_SGPR_SAMPLER, 4, SI_NUM_SAMPLER_STATES);
 	}
 
-	si_init_descriptors(sctx, &sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER,
-			    4, SI_NUM_VERTEX_BUFFERS, NULL);
+	si_init_descriptors(&sctx->vertex_buffers, SI_SGPR_VERTEX_BUFFER,
+			    4, SI_NUM_VERTEX_BUFFERS);
 
 	/* Set pipe_context functions. */
 	sctx->b.b.set_constant_buffer = si_set_constant_buffer;
@@ -1372,6 +1236,20 @@ void si_init_all_descriptors(struct si_context *sctx)
 	si_set_user_data_base(sctx, PIPE_SHADER_FRAGMENT, R_00B030_SPI_SHADER_USER_DATA_PS_0);
 }
 
+bool si_upload_shader_descriptors(struct si_context *sctx)
+{
+	int i;
+
+	for (i = 0; i < SI_NUM_SHADERS; i++) {
+		if (!si_upload_descriptors(sctx, &sctx->const_buffers[i].desc) ||
+		    !si_upload_descriptors(sctx, &sctx->rw_buffers[i].desc) ||
+		    !si_upload_descriptors(sctx, &sctx->samplers[i].views.desc) ||
+		    !si_upload_descriptors(sctx, &sctx->samplers[i].states.desc))
+			return false;
+	}
+	return si_upload_vertex_buffer_descriptors(sctx);
+}
+
 void si_release_all_descriptors(struct si_context *sctx)
 {
 	int i;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 3f00c96..4a9f9b7 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -139,12 +139,6 @@ struct si_context {
 	union {
 		struct {
 			/* The order matters. */
-			struct r600_atom *const_buffers[SI_NUM_SHADERS];
-			struct r600_atom *rw_buffers[SI_NUM_SHADERS];
-			struct r600_atom *sampler_views[SI_NUM_SHADERS];
-			struct r600_atom *sampler_states[SI_NUM_SHADERS];
-			/* Caches must be flushed after resource descriptors are
-			 * updated in memory. */
 			struct r600_atom *cache_flush;
 			struct r600_atom *streamout_begin;
 			struct r600_atom *streamout_enable; /* must be after streamout_begin */
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index a2a007e..b549913 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -151,60 +151,48 @@ struct si_shader_data {
 #define SI_NUM_VERTEX_BUFFERS	16
 
 
-/* This represents resource descriptors in memory, such as buffer resources,
+/* This represents descriptors in memory, such as buffer resources,
  * image resources, and sampler states.
  */
 struct si_descriptors {
-	struct r600_atom atom;
-
-	/* The size of one resource descriptor. */
+	/* The list of descriptors in malloc'd memory. */
+	uint32_t *list;
+	/* The size of one descriptor. */
 	unsigned element_dw_size;
-	/* The maximum number of resource descriptors. */
+	/* The maximum number of descriptors. */
 	unsigned num_elements;
+	/* Whether the list has been changed and should be re-uploaded. */
+	bool list_dirty;
 
-	/* The buffer where resource descriptors are stored. */
+	/* The buffer where the descriptors have been uploaded. */
 	struct r600_resource *buffer;
 	unsigned buffer_offset;
 
-	/* The i-th bit is set if that element is dirty (changed but not emitted). */
-	uint64_t dirty_mask;
 	/* The i-th bit is set if that element is enabled (non-NULL resource). */
 	uint64_t enabled_mask;
 
-	/* We can't update descriptors directly because the GPU might be
-	 * reading them at the same time, so we have to update them
-	 * in a copy-on-write manner. Each such copy is called a context,
-	 * which is just another array descriptors in the same buffer. */
-	unsigned current_context_id;
-	/* The size of a context, should be equal to 4*element_dw_size*num_elements. */
-	unsigned context_size;
-
 	/* The shader userdata offset within a shader where the 64-bit pointer to the descriptor
 	 * array will be stored. */
 	unsigned shader_userdata_offset;
+	/* Whether the pointer should be re-emitted. */
 	bool pointer_dirty;
 };
 
 struct si_sampler_views {
 	struct si_descriptors		desc;
 	struct pipe_sampler_view	*views[SI_NUM_SAMPLER_VIEWS];
-	uint32_t			*desc_data[SI_NUM_SAMPLER_VIEWS];
 };
 
 struct si_sampler_states {
 	struct si_descriptors		desc;
-	uint32_t			*desc_data[SI_NUM_SAMPLER_STATES];
 	void				*saved_states[2]; /* saved for u_blitter */
 };
 
 struct si_buffer_resources {
 	struct si_descriptors		desc;
-	unsigned			num_buffers;
 	enum radeon_bo_usage		shader_usage; /* READ, WRITE, or READWRITE */
 	enum radeon_bo_priority		priority;
 	struct pipe_resource		**buffers; /* this has num_buffers elements */
-	uint32_t			*desc_storage; /* this has num_buffers*4 elements */
-	uint32_t			**desc_data; /* an array of pointers pointing to desc_storage */
 };
 
 #define si_pm4_block_idx(member) \
@@ -240,13 +228,13 @@ struct si_buffer_resources {
 /* si_descriptors.c */
 void si_set_sampler_descriptors(struct si_context *sctx, unsigned shader,
 				unsigned start, unsigned count, void **states);
-void si_update_vertex_buffers(struct si_context *sctx);
 void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
 			struct pipe_resource *buffer,
 			unsigned stride, unsigned num_records,
 			bool add_tid, bool swizzle,
 			unsigned element_size, unsigned index_stride);
 void si_init_all_descriptors(struct si_context *sctx);
+bool si_upload_shader_descriptors(struct si_context *sctx);
 void si_release_all_descriptors(struct si_context *sctx);
 void si_all_descriptors_begin_new_cs(struct si_context *sctx);
 void si_copy_buffer(struct si_context *sctx,
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index abfae1c..b22b266 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -539,11 +539,8 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 		sctx->current_rast_prim = info->mode;
 
 	si_update_shaders(sctx);
-
-	if (sctx->vertex_buffers_dirty) {
-		si_update_vertex_buffers(sctx);
-		sctx->vertex_buffers_dirty = false;
-	}
+	if (!si_upload_shader_descriptors(sctx))
+		return;
 
 	if (info->indexed) {
 		/* Initialize the index buffer struct. */
-- 
2.1.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to