From: Marek Olšák <marek.ol...@amd.com> --- src/gallium/drivers/radeonsi/si_clear.c | 6 +- src/gallium/drivers/radeonsi/si_cp_dma.c | 88 ++++++++++--------- src/gallium/drivers/radeonsi/si_pipe.c | 2 +- src/gallium/drivers/radeonsi/si_pipe.h | 11 ++- .../drivers/radeonsi/si_test_clearbuffer.c | 4 +- src/gallium/drivers/radeonsi/si_test_dma.c | 2 +- 6 files changed, 60 insertions(+), 53 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c index 654ff0ace78..4e07de81bac 100644 --- a/src/gallium/drivers/radeonsi/si_clear.c +++ b/src/gallium/drivers/radeonsi/si_clear.c @@ -249,21 +249,21 @@ void vi_dcc_clear_level(struct si_context *sctx, * would be more efficient than separate per-layer clear operations. */ assert(tex->buffer.b.b.nr_storage_samples <= 2 || num_layers == 1); dcc_offset += tex->surface.u.legacy.level[level].dcc_offset; clear_size = tex->surface.u.legacy.level[level].dcc_fast_clear_size * num_layers; } si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size, - clear_value, SI_COHERENCY_CB_META, SI_METHOD_BEST); + clear_value, SI_COHERENCY_CB_META); } /* Set the same micro tile mode as the destination of the last MSAA resolve. * This allows hitting the MSAA resolve fast path, which requires that both * src and dst micro tile modes match. */ static void si_set_optimal_micro_tile_mode(struct si_screen *sscreen, struct si_texture *tex) { if (tex->buffer.b.is_shared || @@ -482,21 +482,21 @@ static void si_do_fast_color_clear(struct si_context *sctx, continue; /* DCC fast clear with MSAA should clear CMASK to 0xC. */ if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) { /* TODO: This doesn't work with MSAA. */ if (eliminate_needed) continue; si_clear_buffer(sctx, &tex->cmask_buffer->b.b, tex->cmask_offset, tex->surface.cmask_size, - 0xCCCCCCCC, SI_COHERENCY_CB_META, SI_METHOD_BEST); + 0xCCCCCCCC, SI_COHERENCY_CB_META); need_decompress_pass = true; } vi_dcc_clear_level(sctx, tex, 0, reset_value); if (eliminate_needed) need_decompress_pass = true; tex->separate_dcc_dirty = true; } else { @@ -513,21 +513,21 @@ static void si_do_fast_color_clear(struct si_context *sctx, continue; /* ensure CMASK is enabled */ si_alloc_separate_cmask(sctx->screen, tex); if (!tex->cmask_buffer) continue; /* Do the fast clear. */ si_clear_buffer(sctx, &tex->cmask_buffer->b.b, tex->cmask_offset, tex->surface.cmask_size, 0, - SI_COHERENCY_CB_META, SI_METHOD_BEST); + SI_COHERENCY_CB_META); need_decompress_pass = true; } if (need_decompress_pass && !(tex->dirty_level_mask & (1 << level))) { tex->dirty_level_mask |= 1 << level; p_atomic_inc(&sctx->screen->compressed_colortex_counter); } /* We can change the micro tile mode before a full clear. */ diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 86eb3529d9b..bae592a4f7d 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -217,95 +217,103 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst */ if (!(user_flags & SI_CPDMA_SKIP_SYNC_AFTER) && byte_count == remaining_size) { *packet_flags |= CP_DMA_SYNC; if (coher == SI_COHERENCY_SHADER) *packet_flags |= CP_DMA_PFP_SYNC_ME; } } +void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, + uint64_t offset, uint64_t size, unsigned value, + enum si_coherency coher, + enum si_cache_policy cache_policy) +{ + struct r600_resource *rdst = r600_resource(dst); + uint64_t va = rdst->gpu_address + offset; + bool is_first = true; + + assert(size && size % 4 == 0); + + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. */ + util_range_add(&rdst->valid_buffer_range, offset, offset + size); + + /* Flush the caches. */ + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | + SI_CONTEXT_CS_PARTIAL_FLUSH | + get_flush_flags(sctx, coher, cache_policy); + + while (size) { + unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx)); + unsigned dma_flags = CP_DMA_CLEAR; + + si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, 0, coher, + &is_first, &dma_flags); + + /* Emit the clear packet. */ + si_emit_cp_dma(sctx, va, value, byte_count, dma_flags, cache_policy); + + size -= byte_count; + va += byte_count; + } + + if (cache_policy != L2_BYPASS) + rdst->TC_L2_dirty = true; + + /* If it's not a framebuffer fast clear... */ + if (coher == SI_COHERENCY_SHADER) + sctx->num_cp_dma_calls++; +} + void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, uint64_t size, unsigned value, - enum si_coherency coher, enum si_method xfer) + enum si_coherency coher) { struct radeon_winsys *ws = sctx->ws; struct r600_resource *rdst = r600_resource(dst); enum si_cache_policy cache_policy = get_cache_policy(sctx, coher); - unsigned flush_flags = get_flush_flags(sctx, coher, cache_policy); uint64_t dma_clear_size; - bool is_first = true; if (!size) return; dma_clear_size = size & ~3ull; - /* Mark the buffer range of destination as valid (initialized), - * so that transfer_map knows it should wait for the GPU when mapping - * that range. */ - util_range_add(&rdst->valid_buffer_range, offset, - offset + dma_clear_size); - /* dma_clear_buffer can use clear_buffer on failure. Make sure that * doesn't happen. We don't want an infinite recursion: */ if (sctx->dma_cs && !(dst->flags & PIPE_RESOURCE_FLAG_SPARSE) && (offset % 4 == 0) && /* CP DMA is very slow. Always use SDMA for big clears. This * alone improves DeusEx:MD performance by 70%. */ (size > CP_DMA_CLEAR_PERF_THRESHOLD || /* Buffers not used by the GFX IB yet will be cleared by SDMA. * This happens to move most buffer clears to SDMA, including * DCC and CMASK clears, because pipe->clear clears them before * si_emit_framebuffer_state (in a draw call) adds them. * For example, DeusEx:MD has 21 buffer clears per frame and all * of them are moved to SDMA thanks to this. */ !ws->cs_is_buffer_referenced(sctx->gfx_cs, rdst->buf, - RADEON_USAGE_READWRITE)) && - /* bypass sdma transfer with param xfer */ - (xfer != SI_METHOD_CP_DMA)) { + RADEON_USAGE_READWRITE))) { sctx->dma_clear_buffer(sctx, dst, offset, dma_clear_size, value); offset += dma_clear_size; size -= dma_clear_size; } else if (dma_clear_size >= 4) { - uint64_t va = rdst->gpu_address + offset; + si_cp_dma_clear_buffer(sctx, dst, offset, dma_clear_size, value, + coher, cache_policy); offset += dma_clear_size; size -= dma_clear_size; - - /* Flush the caches. */ - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | - SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags; - - while (dma_clear_size) { - unsigned byte_count = MIN2(dma_clear_size, cp_dma_max_byte_count(sctx)); - unsigned dma_flags = CP_DMA_CLEAR; - - si_cp_dma_prepare(sctx, dst, NULL, byte_count, dma_clear_size, 0, - coher, &is_first, &dma_flags); - - /* Emit the clear packet. */ - si_emit_cp_dma(sctx, va, value, byte_count, dma_flags, - cache_policy); - - dma_clear_size -= byte_count; - va += byte_count; - } - - if (cache_policy != L2_BYPASS) - rdst->TC_L2_dirty = true; - - /* If it's not a framebuffer fast clear... */ - if (coher == SI_COHERENCY_SHADER) - sctx->num_cp_dma_calls++; } if (size) { /* Handle non-dword alignment. * * This function is called for embedded texture metadata clears, * but those should always be properly aligned. */ assert(dst->target == PIPE_BUFFER); assert(size < 4); @@ -363,21 +371,21 @@ static void si_pipe_clear_buffer(struct pipe_context *ctx, break; case 2: dword_value = *(uint16_t*)clear_value_ptr; dword_value |= dword_value << 16; break; default: dword_value = *(uint32_t*)clear_value_ptr; } si_clear_buffer(sctx, dst, offset, size, dword_value, - SI_COHERENCY_SHADER, SI_METHOD_BEST); + SI_COHERENCY_SHADER); } /** * Realign the CP DMA engine. This must be done after a copy with an unaligned * size. * * \param size Remaining size to the CP DMA alignment. */ static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size, unsigned user_flags, enum si_coherency coher, diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index c1983b86661..3ca53dfed7c 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -539,21 +539,21 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &sctx->null_const_buf); si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, &sctx->null_const_buf); si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &sctx->null_const_buf); /* Clear the NULL constant buffer, because loads should return zeros. */ si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0, sctx->null_const_buf.buffer->width0, 0, - SI_COHERENCY_SHADER, SI_METHOD_BEST); + SI_COHERENCY_SHADER); } uint64_t max_threads_per_block; screen->get_compute_param(screen, PIPE_SHADER_IR_TGSI, PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK, &max_threads_per_block); /* The maximum number of scratch waves. Scratch space isn't divided * evenly between CUs. The number is only a function of the number of CUs. * We can decrease the constant to decrease the scratch buffer size. diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 0b398018c4a..acdc0d11f2c 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1109,29 +1109,28 @@ enum si_cache_policy { L2_BYPASS, L2_LRU, /* same as SLC=0 */ }; enum si_coherency { SI_COHERENCY_NONE, /* no cache flushes needed */ SI_COHERENCY_SHADER, SI_COHERENCY_CB_META, }; -enum si_method { - SI_METHOD_CP_DMA, - SI_METHOD_BEST, -}; - void si_cp_dma_wait_for_idle(struct si_context *sctx); +void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, + uint64_t offset, uint64_t size, unsigned value, + enum si_coherency coher, + enum si_cache_policy cache_policy); void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst, uint64_t offset, uint64_t size, unsigned value, - enum si_coherency coher, enum si_method xfer); + enum si_coherency coher); void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned user_flags); void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf, uint64_t offset, unsigned size); void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only); void si_init_cp_dma_functions(struct si_context *sctx); /* si_debug.c */ diff --git a/src/gallium/drivers/radeonsi/si_test_clearbuffer.c b/src/gallium/drivers/radeonsi/si_test_clearbuffer.c index c0696da26db..e863381fd15 100644 --- a/src/gallium/drivers/radeonsi/si_test_clearbuffer.c +++ b/src/gallium/drivers/radeonsi/si_test_clearbuffer.c @@ -41,22 +41,22 @@ measure_clearbuf_time(struct pipe_context *ctx, struct si_context *sctx = (struct si_context*)ctx; struct pipe_screen *screen = ctx->screen; buf = pipe_buffer_create(screen, 0, PIPE_USAGE_DEFAULT, memory_size); query_te = ctx->create_query(ctx, PIPE_QUERY_TIME_ELAPSED, 0); ctx->begin_query(ctx, query_te); /* operation */ - si_clear_buffer(sctx, buf, 0, memory_size, 0x00, - SI_COHERENCY_SHADER, SI_METHOD_CP_DMA); + si_cp_dma_clear_buffer(sctx, buf, 0, memory_size, 0x00, + SI_COHERENCY_SHADER, L2_LRU); ctx->end_query(ctx, query_te); ctx->get_query_result(ctx, query_te, true, &qresult); /* Cleanup. */ ctx->destroy_query(ctx, query_te); pipe_resource_reference(&buf, NULL); /* Report Results */ return qresult.u64; } diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c b/src/gallium/drivers/radeonsi/si_test_dma.c index f125769d1cf..c81ec75dde2 100644 --- a/src/gallium/drivers/radeonsi/si_test_dma.c +++ b/src/gallium/drivers/radeonsi/si_test_dma.c @@ -301,21 +301,21 @@ void si_test_dma(struct si_screen *sscreen) array_mode_to_string(sscreen, &sdst->surface), tsrc.width0, tsrc.height0, tsrc.array_size, array_mode_to_string(sscreen, &ssrc->surface), bpp); fflush(stdout); /* set src pixels */ set_random_pixels(ctx, src, &src_cpu); /* clear dst pixels */ si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, 0, - SI_COHERENCY_SHADER, SI_METHOD_BEST); + SI_COHERENCY_SHADER); memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size); /* preparation */ max_width = MIN2(tsrc.width0, tdst.width0); max_height = MIN2(tsrc.height0, tdst.height0); max_depth = MIN2(tsrc.array_size, tdst.array_size); num = do_partial_copies ? num_partial_copies : 1; for (j = 0; j < num; j++) { int width, height, depth; -- 2.17.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev