From: Marek Olšák <marek.ol...@amd.com>

---
 src/gallium/drivers/radeonsi/si_cp_dma.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index 13b901b..fb6ed26 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -167,20 +167,21 @@ static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst
        if (!(user_flags & SI_CPDMA_SKIP_SYNC_AFTER) &&
            byte_count == remaining_size)
                *packet_flags |= CP_DMA_SYNC;
 }
 
 static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
                            uint64_t offset, uint64_t size, unsigned value,
                            enum r600_coherency coher)
 {
        struct si_context *sctx = (struct si_context*)ctx;
+       struct radeon_winsys *ws = sctx->b.ws;
        unsigned tc_l2_flag = get_tc_l2_flag(sctx, coher);
        unsigned flush_flags = get_flush_flags(sctx, coher);
        bool is_first = true;
 
        if (!size)
                return;
 
        /* Mark the buffer range of destination as valid (initialized),
         * so that transfer_map knows it should wait for the GPU when mapping
         * that range. */
@@ -193,20 +194,39 @@ static void si_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst,
                                                      sctx->b.gfx.cs,
                                                      PIPE_TRANSFER_WRITE);
                map += offset;
                for (uint64_t i = 0; i < size; i++) {
                        unsigned byte_within_dword = (offset + i) % 4;
                        *map++ = (value >> (byte_within_dword * 8)) & 0xff;
                }
                return;
        }
 
+       /* dma_clear_buffer can use clear_buffer on failure. Make sure that
+        * doesn't happen. We don't want an infinite recursion: */
+       if (sctx->b.chip_class >= CIK && sctx->b.dma.cs &&
+           /* CP DMA is very slow. Always use SDMA for big clears. This
+            * alone improves DeusEx:MD performance by 70%. */
+           (size > 128 * 1024 ||
+            /* Buffers not used by the GFX IB yet will be cleared by SDMA.
+             * This happens to move most buffer clears to SDMA, including
+             * DCC and CMASK clears, because pipe->clear clears them before
+             * si_emit_framebuffer_state (in a draw call) adds them.
+             * For example, DeusEx:MD has 21 buffer clears per frame and all
+             * of them are moved to SDMA thanks to this. */
+            !ws->cs_is_buffer_referenced(sctx->b.gfx.cs,
+                                         r600_resource(dst)->buf,
+                                         RADEON_USAGE_READWRITE))) {
+               sctx->b.dma_clear_buffer(ctx, dst, offset, size, value);
+               return;
+       }
+
        uint64_t va = r600_resource(dst)->gpu_address + offset;
 
        /* Flush the caches. */
        sctx->b.flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
                         SI_CONTEXT_CS_PARTIAL_FLUSH | flush_flags;
 
        while (size) {
                unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
                unsigned dma_flags = tc_l2_flag  | CP_DMA_CLEAR;
 
-- 
2.7.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to