From: Marek Olšák <marek.ol...@amd.com> --- src/gallium/drivers/radeonsi/si_cp_dma.c | 2 +- src/gallium/drivers/radeonsi/si_dma.c | 40 ++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-)
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index aed8bb8..f06b8dd 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -196,21 +196,21 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, map += offset; for (uint64_t i = 0; i < size; i++) { unsigned byte_within_dword = (offset + i) % 4; *map++ = (value >> (byte_within_dword * 8)) & 0xff; } return; } /* dma_clear_buffer can use clear_buffer on failure. Make sure that * doesn't happen. We don't want an infinite recursion: */ - if (sctx->b.chip_class >= CIK && sctx->b.dma.cs && + if (sctx->b.dma.cs && /* CP DMA is very slow. Always use SDMA for big clears. This * alone improves DeusEx:MD performance by 70%. */ (size > 128 * 1024 || /* Buffers not used by the GFX IB yet will be cleared by SDMA. * This happens to move most buffer clears to SDMA, including * DCC and CMASK clears, because pipe->clear clears them before * si_emit_framebuffer_state (in a draw call) adds them. * For example, DeusEx:MD has 21 buffer clears per frame and all * of them are moved to SDMA thanks to this. */ !ws->cs_is_buffer_referenced(sctx->b.gfx.cs, rdst->buf, diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c index b6aab00..9dbee3a 100644 --- a/src/gallium/drivers/radeonsi/si_dma.c +++ b/src/gallium/drivers/radeonsi/si_dma.c @@ -71,20 +71,59 @@ static void si_dma_copy_buffer(struct si_context *ctx, radeon_emit(cs, dst_offset); radeon_emit(cs, src_offset); radeon_emit(cs, (dst_offset >> 32UL) & 0xff); radeon_emit(cs, (src_offset >> 32UL) & 0xff); dst_offset += count; src_offset += count; size -= count; } } +static void si_dma_clear_buffer(struct pipe_context *ctx, + struct pipe_resource *dst, + uint64_t offset, + uint64_t size, + unsigned clear_value) +{ + struct si_context *sctx = (struct si_context *)ctx; + struct radeon_winsys_cs *cs = sctx->b.dma.cs; + unsigned i, ncopy, csize; + struct r600_resource *rdst = r600_resource(dst); + + if (!cs || offset % 4 != 0 || size % 4 != 0) { + ctx->clear_buffer(ctx, dst, offset, size, &clear_value, 4); + return; + } + + /* Mark the buffer range of destination as valid (initialized), + * so that transfer_map knows it should wait for the GPU when mapping + * that range. */ + util_range_add(&rdst->valid_buffer_range, offset, offset + size); + + offset += rdst->gpu_address; + + /* the same maximum size as for copying */ + ncopy = DIV_ROUND_UP(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE); + r600_need_dma_space(&sctx->b, ncopy * 4, rdst, NULL); + + for (i = 0; i < ncopy; i++) { + csize = MIN2(size, SI_DMA_COPY_MAX_DWORD_ALIGNED_SIZE); + radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_CONSTANT_FILL, 0, + csize / 4)); + radeon_emit(cs, offset); + radeon_emit(cs, clear_value); + radeon_emit(cs, (offset >> 32) << 16); + offset += csize; + size -= csize; + } +} + static void si_dma_copy_tile(struct si_context *ctx, struct pipe_resource *dst, unsigned dst_level, unsigned dst_x, unsigned dst_y, unsigned dst_z, struct pipe_resource *src, unsigned src_level, unsigned src_x, unsigned src_y, @@ -278,11 +317,12 @@ static void si_dma_copy(struct pipe_context *ctx, return; fallback: si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box); } void si_init_dma_functions(struct si_context *sctx) { sctx->b.dma_copy = si_dma_copy; + sctx->b.dma_clear_buffer = si_dma_clear_buffer; } -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev