[Mesa-dev] [PATCH 3/3] radeonsi: don't map big VRAM buffers for the first upload directly

Marek Olšák Sat, 04 Nov 2017 06:03:50 -0700

From: Marek Olšák <marek.ol...@amd.com>

---
 src/gallium/drivers/radeon/r600_buffer_common.c | 20 ++++++++++++++++++++
 src/gallium/drivers/radeon/r600_pipe_common.h   |  1 +
 2 files changed, 21 insertions(+)


diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c 
b/src/gallium/drivers/radeon/r600_buffer_common.c
index 92521f4..519e52e 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -187,20 +187,21 @@ void si_init_resource_fields(struct r600_common_screen 
*rscreen,
        if (rscreen->debug_flags & DBG(NO_WC))
                res->flags &= ~RADEON_FLAG_GTT_WC;
 
        /* Set expected VRAM and GART usage for the buffer. */
        res->vram_usage = 0;
        res->gart_usage = 0;
 
        if (res->domains & RADEON_DOMAIN_VRAM) {
                res->vram_usage = size;
 
+               res->max_forced_staging_uploads =
                res->b.max_forced_staging_uploads =
                        rscreen->info.has_dedicated_vram &&
                        size >= rscreen->info.vram_vis_size / 4 ? 1 : 0;
        } else if (res->domains & RADEON_DOMAIN_GTT) {
                res->gart_usage = size;
        }
 }
 
 bool si_alloc_resource(struct r600_common_screen *rscreen,
                       struct r600_resource *res)
@@ -288,20 +289,21 @@ void si_replace_buffer_storage(struct pipe_context *ctx,
 {
        struct r600_common_context *rctx = (struct r600_common_context *)ctx;
        struct r600_resource *rdst = r600_resource(dst);
        struct r600_resource *rsrc = r600_resource(src);
        uint64_t old_gpu_address = rdst->gpu_address;
 
        pb_reference(&rdst->buf, rsrc->buf);
        rdst->gpu_address = rsrc->gpu_address;
        rdst->b.b.bind = rsrc->b.b.bind;
        rdst->b.max_forced_staging_uploads = rsrc->b.max_forced_staging_uploads;
+       rdst->max_forced_staging_uploads = rsrc->max_forced_staging_uploads;
        rdst->flags = rsrc->flags;
 
        assert(rdst->vram_usage == rsrc->vram_usage);
        assert(rdst->gart_usage == rsrc->gart_usage);
        assert(rdst->bo_size == rsrc->bo_size);
        assert(rdst->bo_alignment == rsrc->bo_alignment);
        assert(rdst->domains == rsrc->domains);
 
        rctx->rebind_buffer(ctx, dst, old_gpu_address);
 }
@@ -395,20 +397,37 @@ static void *r600_buffer_transfer_map(struct pipe_context 
*ctx,
            !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x 
+ box->width)) {
                usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
        }
 
        /* If discarding the entire range, discard the whole resource instead. 
*/
        if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
            box->x == 0 && box->width == resource->width0) {
                usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
        }
 
+       /* If a buffer in VRAM is too large and the range is discarded, don't
+        * map it directly. This makes sure that the buffer stays in VRAM.
+        */
+       bool force_discard_range = false;
+       if (usage & (PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE |
+                    PIPE_TRANSFER_DISCARD_RANGE) &&
+           !(usage & PIPE_TRANSFER_PERSISTENT) &&
+           /* Try not to decrement the counter if it's not positive. Still 
racy,
+            * but it makes it harder to wrap the counter from INT_MIN to 
INT_MAX. */
+           rbuffer->max_forced_staging_uploads > 0 &&
+           p_atomic_dec_return(&rbuffer->max_forced_staging_uploads) >= 0) {
+               usage &= ~(PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE |
+                          PIPE_TRANSFER_UNSYNCHRONIZED);
+               usage |= PIPE_TRANSFER_DISCARD_RANGE;
+               force_discard_range = true;
+       }
+
        if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
            !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
                       TC_TRANSFER_MAP_NO_INVALIDATE))) {
                assert(usage & PIPE_TRANSFER_WRITE);
 
                if (r600_invalidate_buffer(rctx, rbuffer)) {
                        /* At this point, the buffer is always idle. */
                        usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
                } else {
                        /* Fall back to a temporary buffer. */
@@ -420,20 +439,21 @@ static void *r600_buffer_transfer_map(struct pipe_context 
*ctx,
            !(rscreen->debug_flags & DBG(NO_DISCARD_RANGE)) &&
            ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
                         PIPE_TRANSFER_PERSISTENT)) &&
              r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) ||
             (rbuffer->flags & RADEON_FLAG_SPARSE))) {
                assert(usage & PIPE_TRANSFER_WRITE);
 
                /* Check if mapping this buffer would cause waiting for the GPU.
                 */
                if (rbuffer->flags & RADEON_FLAG_SPARSE ||
+                   force_discard_range ||
                    si_rings_is_buffer_referenced(rctx, rbuffer->buf, 
RADEON_USAGE_READWRITE) ||
                    !rctx->ws->buffer_wait(rbuffer->buf, 0, 
RADEON_USAGE_READWRITE)) {
                        /* Do a wait-free write-only transfer using a temporary 
buffer. */
                        unsigned offset;
                        struct r600_resource *staging = NULL;
 
                        u_upload_alloc(ctx->stream_uploader, 0,
                                        box->width + (box->x % 
R600_MAP_BUFFER_ALIGNMENT),
                                       rctx->screen->info.tcc_cache_line_size,
                                       &offset, (struct 
pipe_resource**)&staging,
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h 
b/src/gallium/drivers/radeon/r600_pipe_common.h
index a45921e..a7fec37 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -167,20 +167,21 @@ struct r600_resource {
        /* Memory usage if the buffer placement is optimal. */
        uint64_t                        vram_usage;
        uint64_t                        gart_usage;
 
        /* Resource properties. */
        uint64_t                        bo_size;
        unsigned                        bo_alignment;
        enum radeon_bo_domain           domains;
        enum radeon_bo_flag             flags;
        unsigned                        bind_history;
+       int                             max_forced_staging_uploads;
 
        /* The buffer range which is initialized (with a write transfer,
         * streamout, DMA, or as a random access target). The rest of
         * the buffer is considered invalid and can be mapped unsynchronized.
         *
         * This allows unsychronized mapping of a buffer range which hasn't
         * been used yet. It's for applications which forget to use
         * the unsynchronized map flag and expect the driver to figure it out.
          */
        struct util_range               valid_buffer_range;
-- 
2.7.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

[Mesa-dev] [PATCH 3/3] radeonsi: don't map big VRAM buffers for the first upload directly

Reply via email to