From: Marek Olšák <marek.ol...@amd.com>

---
 src/gallium/drivers/radeon/r600_buffer_common.c | 20 ++++++++++++++++++++
 src/gallium/drivers/radeon/r600_pipe_common.h   |  1 +
 2 files changed, 21 insertions(+)
diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c
index 92521f4..519e52e 100644
--- a/src/gallium/drivers/radeon/r600_buffer_common.c
+++ b/src/gallium/drivers/radeon/r600_buffer_common.c
@@ -187,20 +187,21 @@ void si_init_resource_fields(struct r600_common_screen *rscreen,
 	if (rscreen->debug_flags & DBG(NO_WC))
 		res->flags &= ~RADEON_FLAG_GTT_WC;
 
 	/* Set expected VRAM and GART usage for the buffer. */
 	res->vram_usage = 0;
 	res->gart_usage = 0;
 
 	if (res->domains & RADEON_DOMAIN_VRAM) {
 		res->vram_usage = size;
 
+		res->max_forced_staging_uploads =
 		res->b.max_forced_staging_uploads =
 			rscreen->info.has_dedicated_vram &&
 			size >= rscreen->info.vram_vis_size / 4 ? 1 : 0;
 	} else if (res->domains & RADEON_DOMAIN_GTT) {
 		res->gart_usage = size;
 	}
 }
 
 bool si_alloc_resource(struct r600_common_screen *rscreen,
 		       struct r600_resource *res)
@@ -288,20 +289,21 @@ void si_replace_buffer_storage(struct pipe_context *ctx,
 {
 	struct r600_common_context *rctx = (struct r600_common_context *)ctx;
 	struct r600_resource *rdst = r600_resource(dst);
 	struct r600_resource *rsrc = r600_resource(src);
 	uint64_t old_gpu_address = rdst->gpu_address;
 
 	pb_reference(&rdst->buf, rsrc->buf);
 	rdst->gpu_address = rsrc->gpu_address;
 	rdst->b.b.bind = rsrc->b.b.bind;
 	rdst->b.max_forced_staging_uploads = rsrc->b.max_forced_staging_uploads;
+	rdst->max_forced_staging_uploads = rsrc->max_forced_staging_uploads;
 	rdst->flags = rsrc->flags;
 
 	assert(rdst->vram_usage == rsrc->vram_usage);
 	assert(rdst->gart_usage == rsrc->gart_usage);
 	assert(rdst->bo_size == rsrc->bo_size);
 	assert(rdst->bo_alignment == rsrc->bo_alignment);
 	assert(rdst->domains == rsrc->domains);
 
 	rctx->rebind_buffer(ctx, dst, old_gpu_address);
 }
@@ -395,20 +397,37 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
 	    !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) {
 		usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
 	}
 
 	/* If discarding the entire range, discard the whole resource instead. */
 	if (usage & PIPE_TRANSFER_DISCARD_RANGE &&
 	    box->x == 0 && box->width == resource->width0) {
 		usage |= PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE;
 	}
 
+	/* If a buffer in VRAM is too large and the range is discarded, don't
+	 * map it directly. This makes sure that the buffer stays in VRAM.
+	 */
+	bool force_discard_range = false;
+	if (usage & (PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE |
+		     PIPE_TRANSFER_DISCARD_RANGE) &&
+	    !(usage & PIPE_TRANSFER_PERSISTENT) &&
+	    /* Try not to decrement the counter if it's not positive. Still racy,
+	     * but it makes it harder to wrap the counter from INT_MIN to INT_MAX. */
+	    rbuffer->max_forced_staging_uploads > 0 &&
+	    p_atomic_dec_return(&rbuffer->max_forced_staging_uploads) >= 0) {
+		usage &= ~(PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE |
+			   PIPE_TRANSFER_UNSYNCHRONIZED);
+		usage |= PIPE_TRANSFER_DISCARD_RANGE;
+		force_discard_range = true;
+	}
+
 	if (usage & PIPE_TRANSFER_DISCARD_WHOLE_RESOURCE &&
 	    !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
 		       TC_TRANSFER_MAP_NO_INVALIDATE))) {
 		assert(usage & PIPE_TRANSFER_WRITE);
 
 		if (r600_invalidate_buffer(rctx, rbuffer)) {
 			/* At this point, the buffer is always idle. */
 			usage |= PIPE_TRANSFER_UNSYNCHRONIZED;
 		} else {
 			/* Fall back to a temporary buffer. */
@@ -420,20 +439,21 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx,
 	    !(rscreen->debug_flags & DBG(NO_DISCARD_RANGE)) &&
 	    ((!(usage & (PIPE_TRANSFER_UNSYNCHRONIZED |
 			 PIPE_TRANSFER_PERSISTENT)) &&
 	      r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) ||
 	     (rbuffer->flags & RADEON_FLAG_SPARSE))) {
 		assert(usage & PIPE_TRANSFER_WRITE);
 
 		/* Check if mapping this buffer would cause waiting for the GPU.
 		 */
 		if (rbuffer->flags & RADEON_FLAG_SPARSE ||
+		    force_discard_range ||
 		    si_rings_is_buffer_referenced(rctx, rbuffer->buf, RADEON_USAGE_READWRITE) ||
 		    !rctx->ws->buffer_wait(rbuffer->buf, 0, RADEON_USAGE_READWRITE)) {
 			/* Do a wait-free write-only transfer using a temporary buffer. */
 			unsigned offset;
 			struct r600_resource *staging = NULL;
 
 			u_upload_alloc(ctx->stream_uploader, 0,
 				       box->width + (box->x % R600_MAP_BUFFER_ALIGNMENT),
 				       rctx->screen->info.tcc_cache_line_size,
 				       &offset, (struct pipe_resource**)&staging,
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h
index a45921e..a7fec37 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.h
+++ b/src/gallium/drivers/radeon/r600_pipe_common.h
@@ -167,20 +167,21 @@ struct r600_resource {
 	/* Memory usage if the buffer placement is optimal. */
 	uint64_t vram_usage;
 	uint64_t gart_usage;
 
 	/* Resource properties. */
 	uint64_t bo_size;
 	unsigned bo_alignment;
 	enum radeon_bo_domain domains;
 	enum radeon_bo_flag flags;
 	unsigned bind_history;
+	int max_forced_staging_uploads;
 
 	/* The buffer range which is initialized (with a write transfer,
 	 * streamout, DMA, or as a random access target). The rest of
 	 * the buffer is considered invalid and can be mapped unsynchronized.
 	 *
 	 * This allows unsychronized mapping of a buffer range which hasn't
 	 * been used yet. It's for applications which forget to use
 	 * the unsynchronized map flag and expect the driver to figure it out.
 	 */
 	struct util_range valid_buffer_range;
-- 
2.7.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev