The _rev2_ version of this series (Patchwork Mesa) is

Tested-by: Dieter Nützel <die...@nuetzel-hh.de>

on Polaris 20

UH+UV are working flawlessly now.
No 'measurable' speed decrease - GREAT!
Blender, FreeCAD, glmark2 all fine.

But I had to rebase part 4 (see attachment).

Dieter

On 07.02.2019 02:22, Marek Olšák wrote:
From: Marek Olšák <marek.ol...@amd.com>

---
 src/gallium/drivers/radeonsi/si_buffer.c | 56 ++++++++++++++++++++++--
 src/gallium/drivers/radeonsi/si_dma_cs.c | 19 ++++----
 src/gallium/drivers/radeonsi/si_gfx_cs.c | 42 +++++++++++++++---
 src/gallium/drivers/radeonsi/si_pipe.c   | 23 ++++++----
 src/gallium/drivers/radeonsi/si_pipe.h   | 17 +++++++
 5 files changed, 131 insertions(+), 26 deletions(-)
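
A summary for other testers, since the patch itself has no commit message:
const_uploader allocations now live in GTT, writes are recorded as
flush-explicit ranges, and on the next GFX flush the accumulated ranges are
copied into VRAM by the SDMA engine. Below is a rough standalone sketch of
that record-then-replay bookkeeping - every name in it is mine, not the
driver's:

#include <stdlib.h>

/* One pending staging->VRAM copy, executed later at context flush. */
struct upload {
        unsigned src_offset;    /* offset in the GTT staging buffer */
        unsigned dst_offset;    /* offset in the VRAM destination */
        unsigned size;          /* bytes to copy at flush time */
};

struct upload_list {
        struct upload *items;
        unsigned num, max;
};

/* Record a range now; the actual SDMA copy happens at flush. */
static void record_upload(struct upload_list *l, unsigned src,
                          unsigned dst, unsigned size)
{
        if (l->num == l->max) {
                l->max += 4;    /* same growth step the patch uses */
                l->items = realloc(l->items, l->max * sizeof(l->items[0]));
        }
        l->items[l->num++] = (struct upload){src, dst, size};
}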

diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c
index c01118ce96a..3f8db7cf4f0 100644
--- a/src/gallium/drivers/radeonsi/si_buffer.c
+++ b/src/gallium/drivers/radeonsi/si_buffer.c
@@ -433,21 +433,29 @@ static void *si_buffer_transfer_map(struct pipe_context *ctx,

                if (si_invalidate_buffer(sctx, buf)) {
                        /* At this point, the buffer is always idle. */
                        usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
                } else {
                        /* Fall back to a temporary buffer. */
                        usage |= PIPE_TRANSFER_DISCARD_RANGE;
                }
        }

-       if ((usage & PIPE_TRANSFER_DISCARD_RANGE) &&
+       if (usage & PIPE_TRANSFER_FLUSH_EXPLICIT &&
+           buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
+               usage &= ~(PIPE_TRANSFER_UNSYNCHRONIZED |
+                          PIPE_TRANSFER_PERSISTENT);
+               usage |= PIPE_TRANSFER_DISCARD_RANGE;
+               force_discard_range = true;
+       }
+
+       if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
            ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
                         PIPE_TRANSFER_PERSISTENT))) ||
             (buf->flags & RADEON_FLAG_SPARSE))) {
                assert(usage & PIPE_TRANSFER_WRITE);

                /* Check if mapping this buffer would cause waiting for the GPU.
                 */
                if (buf->flags & RADEON_FLAG_SPARSE ||
                    force_discard_range ||
                    si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) ||
@@ -514,32 +522,72 @@ static void *si_buffer_transfer_map(struct pipe_context *ctx,
        data += box->x;

        return si_buffer_get_transfer(ctx, resource, usage, box,
                                        ptransfer, data, NULL, 0);
 }

 static void si_buffer_do_flush_region(struct pipe_context *ctx,
                                      struct pipe_transfer *transfer,
                                      const struct pipe_box *box)
 {
+       struct si_context *sctx = (struct si_context*)ctx;
        struct si_transfer *stransfer = (struct si_transfer*)transfer;
        struct si_resource *buf = si_resource(transfer->resource);

        if (stransfer->staging) {
                unsigned src_offset = stransfer->offset +
                                      transfer->box.x % SI_MAP_BUFFER_ALIGNMENT +
                                      (box->x - transfer->box.x);

+               if (buf->b.b.flags & SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA) {
+                       /* This should be true for all uploaders. */
+                       assert(transfer->box.x == 0);
+
+                       /* Find a previous upload and extend its range. The last
+                        * upload is likely to be at the end of the list.
+                        */
+                       for (int i = sctx->num_sdma_uploads - 1; i >= 0; i--) {
+                               struct si_sdma_upload *up = &sctx->sdma_uploads[i];
+
+                               if (up->dst != buf)
+                                       continue;
+
+                               assert(up->src == stransfer->staging);
+                               assert(box->x > up->dst_offset);
+                               up->size = box->x + box->width - up->dst_offset;
+                               return;
+                       }
+
+                       /* Enlarge the array if it's full. */
+                       if (sctx->num_sdma_uploads == sctx->max_sdma_uploads) {
+                               unsigned size;
+
+                               sctx->max_sdma_uploads += 4;
+                               size = sctx->max_sdma_uploads * sizeof(sctx->sdma_uploads[0]);
+                               sctx->sdma_uploads = realloc(sctx->sdma_uploads, size);
+                       }
+
+                       /* Add a new upload. */
+                       struct si_sdma_upload *up =
+                               &sctx->sdma_uploads[sctx->num_sdma_uploads++];
+                       up->dst = up->src = NULL;
+                       si_resource_reference(&up->dst, buf);
+                       si_resource_reference(&up->src, stransfer->staging);
+                       up->dst_offset = box->x;
+                       up->src_offset = src_offset;
+                       up->size = box->width;
+                       return;
+               }
+
                /* Copy the staging buffer into the original one. */
-               si_copy_buffer((struct si_context*)ctx, transfer->resource,
-                              &stransfer->staging->b.b, box->x, src_offset,
-                              box->width);
+               si_copy_buffer(sctx, transfer->resource, &stransfer->staging->b.b,
+                              box->x, src_offset, box->width);
        }

        util_range_add(&buf->valid_buffer_range, box->x,
                       box->x + box->width);
 }

 static void si_buffer_flush_region(struct pipe_context *ctx,
                                   struct pipe_transfer *transfer,
                                   const struct pipe_box *rel_box)
 {
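
A detail worth calling out in si_buffer_do_flush_region above: repeated
flush_region calls on the same destination don't each append a record; the
loop walks the list from the end and just widens the previous range. A
minimal sketch of that widening step, with invented names:

struct upload { void *dst; unsigned dst_offset, size; };

/* Widen the most recent record for 'dst' to cover the newly flushed box;
 * returns 0 if none exists and a fresh record must be appended instead.
 * (Searches from the end: the last upload is the most likely match.) */
static int extend_last_upload(struct upload *ups, unsigned num, void *dst,
                              unsigned box_x, unsigned box_width)
{
        for (int i = (int)num - 1; i >= 0; i--) {
                if (ups[i].dst != dst)
                        continue;
                ups[i].size = box_x + box_width - ups[i].dst_offset;
                return 1;
        }
        return 0;
}
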
diff --git a/src/gallium/drivers/radeonsi/si_dma_cs.c b/src/gallium/drivers/radeonsi/si_dma_cs.c
index 2aafc1f09a0..bba1bd95826 100644
--- a/src/gallium/drivers/radeonsi/si_dma_cs.c
+++ b/src/gallium/drivers/radeonsi/si_dma_cs.c
@@ -133,21 +133,22 @@ void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
        if (dst) {
                vram += dst->vram_usage;
                gtt += dst->gart_usage;
        }
        if (src) {
                vram += src->vram_usage;
                gtt += src->gart_usage;
        }

        /* Flush the GFX IB if DMA depends on it. */
-       if (radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
+       if (!ctx->sdma_uploads_in_progress &&
+           radeon_emitted(ctx->gfx_cs, ctx->initial_gfx_cs_size) &&
            ((dst &&
              ws->cs_is_buffer_referenced(ctx->gfx_cs, dst->buf,
                                          RADEON_USAGE_READWRITE)) ||
             (src &&
              ws->cs_is_buffer_referenced(ctx->gfx_cs, src->buf,
                                          RADEON_USAGE_WRITE))))
                si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);

        /* Flush if there's not enough space, or if the memory usage per IB
         * is too large.
@@ -155,45 +156,47 @@ void si_need_dma_space(struct si_context *ctx, unsigned num_dw,
         * IBs using too little memory are limited by the IB submission overhead.
         * IBs using too much memory are limited by the kernel/TTM overhead.
         * Too long IBs create CPU-GPU pipeline bubbles and add latency.
         *
         * This heuristic makes sure that DMA requests are executed
         * very soon after the call is made and lowers memory usage.
         * It improves texture upload performance by keeping the DMA
         * engine busy while uploads are being submitted.
         */
        num_dw++; /* for emit_wait_idle below */
-       if (!ws->cs_check_space(ctx->dma_cs, num_dw) ||
-           ctx->dma_cs->used_vram + ctx->dma_cs->used_gart > 64 * 1024 * 1024 ||
-           !radeon_cs_memory_below_limit(ctx->screen, ctx->dma_cs, vram, gtt)) {
+       if (!ctx->sdma_uploads_in_progress &&
+           (!ws->cs_check_space(ctx->dma_cs, num_dw) ||
+            ctx->dma_cs->used_vram + ctx->dma_cs->used_gart > 64 * 1024 * 1024 ||
+            !radeon_cs_memory_below_limit(ctx->screen, ctx->dma_cs, vram, gtt))) {
                si_flush_dma_cs(ctx, PIPE_FLUSH_ASYNC, NULL);
                assert((num_dw + ctx->dma_cs->current.cdw) <= ctx->dma_cs->current.max_dw);
        }

        /* Wait for idle if either buffer has been used in the IB before to
         * prevent read-after-write hazards.
         */
        if ((dst &&
             ws->cs_is_buffer_referenced(ctx->dma_cs, dst->buf,
                                         RADEON_USAGE_READWRITE)) ||
            (src &&
             ws->cs_is_buffer_referenced(ctx->dma_cs, src->buf,
                                         RADEON_USAGE_WRITE)))
                si_dma_emit_wait_idle(ctx);

+       unsigned sync = ctx->sdma_uploads_in_progress ? 0 : RADEON_USAGE_SYNCHRONIZED;
        if (dst) {
-               radeon_add_to_buffer_list(ctx, ctx->dma_cs, dst,
-                                         RADEON_USAGE_WRITE, 0);
+               ws->cs_add_buffer(ctx->dma_cs, dst->buf, RADEON_USAGE_WRITE | sync,
+                                 dst->domains, 0);
        }
        if (src) {
-               radeon_add_to_buffer_list(ctx, ctx->dma_cs, src,
-                                         RADEON_USAGE_READ, 0);
+               ws->cs_add_buffer(ctx->dma_cs, src->buf, RADEON_USAGE_READ | sync,
+                                 src->domains, 0);
        }

        /* this function is called before all DMA calls, so increment this. */
        ctx->num_dma_calls++;
 }

 void si_flush_dma_cs(struct si_context *ctx, unsigned flags,
                     struct pipe_fence_handle **fence)
 {
        struct radeon_cmdbuf *cs = ctx->dma_cs;
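
On the sync flag above: while the queued uploads are being replayed
(sdma_uploads_in_progress), buffers are added to the SDMA IB without
RADEON_USAGE_SYNCHRONIZED, so the kernel won't make the copies wait on the
GFX ring. Schematically - the flag values here are made up:

enum {
        USAGE_READ         = 1 << 0,
        USAGE_WRITE        = 1 << 1,
        USAGE_SYNCHRONIZED = 1 << 2,
};

/* During upload replay, implicit sync is skipped; correctness comes
 * from submitting the SDMA IB before the GFX IB that consumes it. */
static unsigned buffer_usage(int uploads_in_progress, unsigned rw)
{
        unsigned sync = uploads_in_progress ? 0 : USAGE_SYNCHRONIZED;
        return rw | sync;
}
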
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index 3d64587fa2b..13d5b5a959a 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -19,20 +19,21 @@
  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */

 #include "si_pipe.h"

 #include "util/os_time.h"
+#include "util/u_upload_mgr.h"

 /* initialize */
 void si_need_gfx_cs_space(struct si_context *ctx)
 {
        struct radeon_cmdbuf *cs = ctx->gfx_cs;

        /* There is no need to flush the DMA IB here, because
         * si_need_dma_space always flushes the GFX IB if there is
         * a conflict, which means any unflushed DMA commands automatically
         * precede the GFX IB (= they had no dependency on the GFX IB when
@@ -57,20 +58,29 @@ void si_need_gfx_cs_space(struct si_context *ctx)
         * and just flush if there is not enough space left.
         *
         * Also reserve space for stopping queries at the end of IB, because
         * the number of active queries is mostly unlimited.
         */
        unsigned need_dwords = 2048 + ctx->num_cs_dw_queries_suspend;
        if (!ctx->ws->cs_check_space(cs, need_dwords))
                si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
 }

+void si_unref_sdma_uploads(struct si_context *sctx)
+{
+       for (unsigned i = 0; i < sctx->num_sdma_uploads; i++) {
+               si_resource_reference(&sctx->sdma_uploads[i].dst, NULL);
+               si_resource_reference(&sctx->sdma_uploads[i].src, NULL);
+       }
+       sctx->num_sdma_uploads = 0;
+}
+
 void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
                     struct pipe_fence_handle **fence)
 {
        struct radeon_cmdbuf *cs = ctx->gfx_cs;
        struct radeon_winsys *ws = ctx->ws;
        unsigned wait_flags = 0;

        if (ctx->gfx_flush_in_progress)
                return;

@@ -91,31 +101,51 @@ void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
        if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) &&
            (!wait_flags || !ctx->gfx_last_ib_is_busy))
                return;

        if (si_check_device_reset(ctx))
                return;

        if (ctx->screen->debug_flags & DBG(CHECK_VM))
                flags &= ~PIPE_FLUSH_ASYNC;

+       ctx->gfx_flush_in_progress = true;
+
        /* If the state tracker is flushing the GFX IB, si_flush_from_st is
         * responsible for flushing the DMA IB and merging the fences from both.
-        * This code is only needed when the driver flushes the GFX IB
-        * internally, and it never asks for a fence handle.
+        * If the driver flushes the GFX IB internally, it should never ask
+        * for a fence handle.
         */
-       if (radeon_emitted(ctx->dma_cs, 0)) {
-               assert(fence == NULL); /* internal flushes only */
-               si_flush_dma_cs(ctx, flags, NULL);
+       assert(!radeon_emitted(ctx->dma_cs, 0) || fence == NULL);
+
+       /* Update the sdma_uploads list by flushing the uploader. */
+       u_upload_unmap(ctx->b.const_uploader);
+
+       /* Execute SDMA uploads. */
+       ctx->sdma_uploads_in_progress = true;
+       for (unsigned i = 0; i < ctx->num_sdma_uploads; i++) {
+               struct si_sdma_upload *up = &ctx->sdma_uploads[i];
+               struct pipe_box box;
+
+               assert(up->src_offset % 4 == 0 && up->dst_offset % 4 == 0 &&
+                      up->size % 4 == 0);
+
+               u_box_1d(up->src_offset, up->size, &box);
+               ctx->dma_copy(&ctx->b, &up->dst->b.b, 0, up->dst_offset, 0, 0,
+                             &up->src->b.b, 0, &box);
        }
+       ctx->sdma_uploads_in_progress = false;
+       si_unref_sdma_uploads(ctx);

-       ctx->gfx_flush_in_progress = true;
+       /* Flush SDMA (preamble IB). */
+       if (radeon_emitted(ctx->dma_cs, 0))
+               si_flush_dma_cs(ctx, flags, NULL);

        if (!LIST_IS_EMPTY(&ctx->active_queries))
                si_suspend_queries(ctx);

        ctx->streamout.suspended = false;
        if (ctx->streamout.begin_emitted) {
                si_emit_streamout_end(ctx);
                ctx->streamout.suspended = true;
        }

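The ordering in si_flush_gfx_cs is the crux: unmap the uploader first so
u_upload_mgr reports the final flush-explicit ranges, then replay the
recorded copies, then flush SDMA before GFX so the SDMA IB acts as a
preamble. A compilable toy version of that sequence - the helpers are
stand-ins, not real driver entry points:

#include <stdio.h>

struct upload { unsigned src_offset, dst_offset, size; };

/* Stubs standing in for u_upload_unmap, sctx->dma_copy and the flushes. */
static void finalize_staging(void)  { puts("u_upload_unmap"); }
static void dma_copy(unsigned d, unsigned s, unsigned n)
{ printf("sdma copy %u <- %u (%u bytes)\n", d, s, n); }
static void submit_sdma_ib(void)    { puts("flush SDMA (preamble IB)"); }
static void submit_gfx_ib(void)     { puts("flush GFX"); }

static void flush_with_uploads(const struct upload *ups, unsigned num)
{
        finalize_staging();     /* fix the flush-explicit ranges */
        /* the patch asserts dst/src/size are all 4-byte aligned */
        for (unsigned i = 0; i < num; i++)
                dma_copy(ups[i].dst_offset, ups[i].src_offset, ups[i].size);
        submit_sdma_ib();       /* SDMA runs before the GFX IB */
        submit_gfx_ib();        /* consumes the uploaded data */
}
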
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index c6f93e7b15e..c0ee2b1a6dc 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -257,20 +257,21 @@ static void si_destroy_context(struct pipe_context *context)
        si_saved_cs_reference(&sctx->current_saved_cs, NULL);

        _mesa_hash_table_destroy(sctx->tex_handles, NULL);
        _mesa_hash_table_destroy(sctx->img_handles, NULL);

        util_dynarray_fini(&sctx->resident_tex_handles);
        util_dynarray_fini(&sctx->resident_img_handles);
        util_dynarray_fini(&sctx->resident_tex_needs_color_decompress);
        util_dynarray_fini(&sctx->resident_img_needs_color_decompress);
        util_dynarray_fini(&sctx->resident_tex_needs_depth_decompress);
+       si_unref_sdma_uploads(sctx);
        FREE(sctx);
 }

 static enum pipe_reset_status si_get_reset_status(struct pipe_context *ctx)
 {
        struct si_context *sctx = (struct si_context *)ctx;

        if (sctx->screen->info.has_gpu_reset_status_query)
                return sctx->ws->ctx_query_reset_status(sctx->ctx);

@@ -436,43 +437,49 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
                                      SI_RESOURCE_FLAG_CLEAR, false);
        if (!sctx->allocator_zeroed_memory)
                goto fail;

        sctx->b.stream_uploader = u_upload_create(&sctx->b, 1024 * 1024,
                                                    0, PIPE_USAGE_STREAM,
                                                    SI_RESOURCE_FLAG_READ_ONLY);
        if (!sctx->b.stream_uploader)
                goto fail;

-       sctx->b.const_uploader = u_upload_create(&sctx->b, 128 * 1024,
-                                                  0, PIPE_USAGE_DEFAULT,
-                                                  SI_RESOURCE_FLAG_32BIT |
-                                                  (sscreen->cpdma_prefetch_writes_memory ?
-                                                           0 : SI_RESOURCE_FLAG_READ_ONLY));
-       if (!sctx->b.const_uploader)
-               goto fail;
-
        sctx->cached_gtt_allocator = u_upload_create(&sctx->b, 16 * 1024,
                                                       0, PIPE_USAGE_STAGING, 0);
        if (!sctx->cached_gtt_allocator)
                goto fail;

        sctx->ctx = sctx->ws->ctx_create(sctx->ws);
        if (!sctx->ctx)
                goto fail;

        if (sscreen->info.num_sdma_rings && !(sscreen->debug_flags & DBG(NO_ASYNC_DMA))) {
                sctx->dma_cs = sctx->ws->cs_create(sctx->ctx, RING_DMA,
                                                   (void*)si_flush_dma_cs,
                                                   sctx, stop_exec_on_failure);
        }

+       bool use_sdma_upload = sscreen->info.has_dedicated_vram && sctx->dma_cs &&
+                              debug_get_bool_option("SDMA", true);
+       sctx->b.const_uploader = u_upload_create(&sctx->b, 256 * 1024,
+                                                0, PIPE_USAGE_DEFAULT,
+                                                SI_RESOURCE_FLAG_32BIT |
+                                                (use_sdma_upload ?
+                                                         SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA :
+                                                         (sscreen->cpdma_prefetch_writes_memory ?
+                                                                  0 : SI_RESOURCE_FLAG_READ_ONLY)));
+       if (!sctx->b.const_uploader)
+               goto fail;
+
+       if (use_sdma_upload)
+               u_upload_enable_flush_explicit(sctx->b.const_uploader);
+
        si_init_buffer_functions(sctx);
        si_init_clear_functions(sctx);
        si_init_blit_functions(sctx);
        si_init_compute_functions(sctx);
        si_init_compute_blit_functions(sctx);
        si_init_debug_functions(sctx);
        si_init_msaa_functions(sctx);
        si_init_streamout_functions(sctx);

        if (sscreen->info.has_hw_decode) {
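
Handy for A/B testing: debug_get_bool_option("SDMA", true) reads the SDMA
environment variable, so the new path can be switched off at run time,
e.g. "SDMA=false glmark2". Roughly what that option parsing does in plain
C - the exact set of accepted strings is from memory and may differ from
Mesa's util code:

#include <stdbool.h>
#include <stdlib.h>
#include <strings.h>

static bool sdma_upload_enabled(void)
{
        const char *v = getenv("SDMA");
        if (!v)
                return true;    /* default: enabled */
        return !(strcasecmp(v, "0") == 0 || strcasecmp(v, "n") == 0 ||
                 strcasecmp(v, "no") == 0 || strcasecmp(v, "f") == 0 ||
                 strcasecmp(v, "false") == 0);
}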
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index b01d5744752..b208bdeb848 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -103,20 +103,22 @@
 #define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024

 #define SI_RESOURCE_FLAG_TRANSFER      (PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
 #define SI_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1)
 #define SI_RESOURCE_FLAG_FORCE_MSAA_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2)
 #define SI_RESOURCE_FLAG_DISABLE_DCC   (PIPE_RESOURCE_FLAG_DRV_PRIV << 3)
 #define SI_RESOURCE_FLAG_UNMAPPABLE    (PIPE_RESOURCE_FLAG_DRV_PRIV << 4)
 #define SI_RESOURCE_FLAG_READ_ONLY     (PIPE_RESOURCE_FLAG_DRV_PRIV << 5)
 #define SI_RESOURCE_FLAG_32BIT         (PIPE_RESOURCE_FLAG_DRV_PRIV << 6)
 #define SI_RESOURCE_FLAG_CLEAR         (PIPE_RESOURCE_FLAG_DRV_PRIV << 7)
+/* For const_uploader, upload data via GTT and copy to VRAM on context flush via SDMA. */
+#define SI_RESOURCE_FLAG_UPLOAD_FLUSH_EXPLICIT_VIA_SDMA (PIPE_RESOURCE_FLAG_DRV_PRIV << 8)

 enum si_clear_code
 {
        DCC_CLEAR_COLOR_0000   = 0x00000000,
        DCC_CLEAR_COLOR_0001   = 0x40404040,
        DCC_CLEAR_COLOR_1110   = 0x80808080,
        DCC_CLEAR_COLOR_1111   = 0xC0C0C0C0,
        DCC_CLEAR_COLOR_REG    = 0x20202020,
        DCC_UNCOMPRESSED       = 0xFFFFFFFF,
 };
@@ -769,20 +771,28 @@ struct si_saved_cs {
        struct si_context       *ctx;
        struct radeon_saved_cs  gfx;
        struct si_resource      *trace_buf;
        unsigned                trace_id;

        unsigned                gfx_last_dw;
        bool                    flushed;
        int64_t                 time_flush;
 };

+struct si_sdma_upload {
+       struct si_resource      *dst;
+       struct si_resource      *src;
+       unsigned                src_offset;
+       unsigned                dst_offset;
+       unsigned                size;
+};
+
 struct si_context {
        struct pipe_context             b; /* base class */

        enum radeon_family              family;
        enum chip_class                 chip_class;

        struct radeon_winsys            *ws;
        struct radeon_winsys_ctx        *ctx;
        struct radeon_cmdbuf            *gfx_cs;
        struct radeon_cmdbuf            *dma_cs;
@@ -1074,20 +1084,26 @@ struct si_context {
        int                             num_perfect_occlusion_queries;
        struct list_head                active_queries;
        unsigned                        num_cs_dw_queries_suspend;

        /* Render condition. */
        struct pipe_query               *render_cond;
        unsigned                        render_cond_mode;
        bool                            render_cond_invert;
        bool                            render_cond_force_off; /* for u_blitter */

+       /* For uploading data via GTT and copying to VRAM on context flush via SDMA. */
+       bool                            sdma_uploads_in_progress;
+       struct si_sdma_upload           *sdma_uploads;
+       unsigned                        num_sdma_uploads;
+       unsigned                        max_sdma_uploads;
+
        /* Statistics gathering for the DCC enablement heuristic. It can't be
         * in si_texture because si_texture can be shared by multiple
         * contexts. This is for back buffers only. We shouldn't get too many
         * of those.
         *
         * X11 DRI3 rotates among a finite set of back buffers. They should
         * all fit in this array. If they don't, separate DCC might never be
         * enabled by DCC stat gathering.
         */
        struct {
@@ -1273,20 +1289,21 @@ struct pipe_fence_handle *si_create_fence(struct pipe_context *ctx,
                                          struct tc_unflushed_batch_token *tc_token);

 /* si_get.c */
 void si_init_screen_get_functions(struct si_screen *sscreen);

 /* si_gfx_cs.c */
 void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
                     struct pipe_fence_handle **fence);
 void si_begin_new_gfx_cs(struct si_context *ctx);
 void si_need_gfx_cs_space(struct si_context *ctx);
+void si_unref_sdma_uploads(struct si_context *sctx);

 /* si_gpu_load.c */
 void si_gpu_load_kill_thread(struct si_screen *sscreen);
 uint64_t si_begin_counter(struct si_screen *sscreen, unsigned type);
 unsigned si_end_counter(struct si_screen *sscreen, unsigned type,
                        uint64_t begin);

 /* si_compute.c */
 void si_init_compute_functions(struct si_context *sctx);
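
My rebased part 4 follows. As I understand it, the point is that with
u_threaded_context, transfer_map can run on the tc worker thread, and a
u_upload_mgr is not thread-safe, so the unsynchronized path must pick the
uploader that is local to the calling thread. Schematically - the struct
and names are invented for illustration:

struct u_upload_mgr;    /* opaque here */

struct ctx {
        struct u_upload_mgr *driver_uploader;   /* driver thread only */
        struct u_upload_mgr *tc_uploader;       /* tc worker thread's own */
};

/* TC_TRANSFER_MAP_THREADED_UNSYNC set => we are off the driver thread. */
static struct u_upload_mgr *pick_uploader(struct ctx *c, int threaded_unsync)
{
        return threaded_unsync ? c->tc_uploader : c->driver_uploader;
}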
diff --git a/src/gallium/drivers/radeonsi/si_buffer.c b/src/gallium/drivers/radeonsi/si_buffer.c
index c01118ce96a..1bd86669398 100644
--- a/src/gallium/drivers/radeonsi/si_buffer.c
+++ b/src/gallium/drivers/radeonsi/si_buffer.c
@@ -453,9 +453,17 @@ static void *si_buffer_transfer_map(struct pipe_context *ctx,
 		    si_rings_is_buffer_referenced(sctx, buf->buf, RADEON_USAGE_READWRITE) ||
 		    !sctx->ws->buffer_wait(buf->buf, 0, RADEON_USAGE_READWRITE)) {
 			/* Do a wait-free write-only transfer using a temporary buffer. */
-			unsigned offset;
+			struct u_upload_mgr *uploader;
 			struct si_resource *staging = NULL;
+			unsigned offset;
 
+			/* If we are not called from the driver thread, we have
+			 * to use the uploader from u_threaded_context, which is
+			 * local to the calling thread.
+			 */
+			if (usage & TC_TRANSFER_MAP_THREADED_UNSYNC)
+				uploader = sctx->tc->base.stream_uploader;
+			else
 			u_upload_alloc(ctx->stream_uploader, 0,
                                        box->width + (box->x % SI_MAP_BUFFER_ALIGNMENT),
 				       sctx->screen->info.tcc_cache_line_size,