From: Marek Olšák <marek.ol...@amd.com> --- src/gallium/drivers/radeonsi/si_cp_dma.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+)
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c index 13b901b..fb6ed26 100644 --- a/src/gallium/drivers/radeonsi/si_cp_dma.c +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c @@ -167,20 +167,21 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst if (!(user_flags & SI_CPDMA_SKIP_SYNC_AFTER) && byte_count == remaining_size) *packet_flags |= CP_DMA_SYNC; } static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, uint64_t offset, uint64_t size, unsigned value, enum r600_coherency coher) { struct si_context *sctx = (struct si_context*)ctx; + struct radeon_winsys *ws = sctx->b.ws; unsigned tc_l2_flag = get_tc_l2_flag(sctx, coher); unsigned flush_flags = get_flush_flags(sctx, coher); bool is_first = true; if (!size) return; /* Mark the buffer range of destination as valid (initialized), * so that transfer_map knows it should wait for the GPU when mapping * that range. */ @@ -193,20 +194,39 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, sctx->b.gfx.cs, PIPE_TRANSFER_WRITE); map += offset; for (uint64_t i = 0; i < size; i++) { unsigned byte_within_dword = (offset + i) % 4; *map++ = (value >> (byte_within_dword * 8)) & 0xff; } return; } + /* dma_clear_buffer can use clear_buffer on failure. Make sure that + * doesn't happen. We don't want an infinite recursion: */ + if (sctx->b.chip_class >= CIK && sctx->b.dma.cs && + /* CP DMA is very slow. Always use SDMA for big clears. This + * alone improves DeusEx:MD performance by 70%. */ + (size > 128 * 1024 || + /* Buffers not used by the GFX IB yet will be cleared by SDMA. + * This happens to move most buffer clears to SDMA, including + * DCC and CMASK clears, because pipe->clear clears them before + * si_emit_framebuffer_state (in a draw call) adds them. + * For example, DeusEx:MD has 21 buffer clears per frame and all + * of them are moved to SDMA thanks to this. */ + !ws->cs_is_buffer_referenced(sctx->b.gfx.cs, + r600_resource(dst)->buf, + RADEON_USAGE_READWRITE))) { + sctx->b.dma_clear_buffer(ctx, dst, offset, size, value); + return; + } + uint64_t va = r600_resource(dst)->gpu_address + offset; /* Flush the caches. */ sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags; while (size) { unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT); unsigned dma_flags = tc_l2_flag | CP_DMA_CLEAR; -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev