From: Marek Olšák <marek.ol...@amd.com>

Fast color clears should be much faster. Also, fast color clears on
evicted buffers should be 200x faster on GFX8 and older.
---
 src/gallium/drivers/radeonsi/Makefile.sources |   1 +
 src/gallium/drivers/radeonsi/meson.build      |   1 +
 src/gallium/drivers/radeonsi/si_clear.c       |  10 +-
 .../drivers/radeonsi/si_compute_blit.c        | 285 ++++++++++++++++++
 src/gallium/drivers/radeonsi/si_cp_dma.c      | 180 +----------
 src/gallium/drivers/radeonsi/si_pipe.c        |  22 +-
 src/gallium/drivers/radeonsi/si_pipe.h        |  51 ++--
 src/gallium/drivers/radeonsi/si_test_dma.c    |   3 +-
 8 files changed, 350 insertions(+), 203 deletions(-)
 create mode 100644 src/gallium/drivers/radeonsi/si_compute_blit.c
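A note on how the new compute path sizes its dispatches (a
back-of-the-envelope sketch derived from the constants added in
si_pipe.h below; the local variables here are illustrative only, not
part of the patch):

	/* SI_COMPUTE_CLEAR_DW_PER_THREAD is 4, so one 64-thread wave clears
	 * 4 * 64 = 256 dwords (1 KiB). A 1 MiB clear thus launches 1024
	 * workgroups of 64 threads each.
	 */
	unsigned dwords_per_thread = 4;            /* SI_COMPUTE_CLEAR_DW_PER_THREAD */
	unsigned dwords_per_wave = dwords_per_thread * 64;              /* 256 */
	unsigned num_dwords = (1024 * 1024) / 4;                 /* 1 MiB clear */
	unsigned num_waves = DIV_ROUND_UP(num_dwords, dwords_per_wave); /* 1024 */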
diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources
index abdc4e07f1e..aeb9b7982c4 100644
--- a/src/gallium/drivers/radeonsi/Makefile.sources
+++ b/src/gallium/drivers/radeonsi/Makefile.sources
@@ -4,20 +4,21 @@ GENERATED_SOURCES := \
 C_SOURCES := \
 	$(GENERATED_SOURCES) \
 	cik_sdma.c \
 	driinfo_radeonsi.h \
 	si_blit.c \
 	si_buffer.c \
 	si_build_pm4.h \
 	si_clear.c \
 	si_compute.c \
 	si_compute.h \
+	si_compute_blit.c \
 	si_cp_dma.c \
 	si_debug.c \
 	si_descriptors.c \
 	si_dma.c \
 	si_dma_cs.c \
 	si_fence.c \
 	si_get.c \
 	si_gfx_cs.c \
 	si_gpu_load.c \
 	si_pipe.c \
diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build
index 4d6044f724b..2542f136d11 100644
--- a/src/gallium/drivers/radeonsi/meson.build
+++ b/src/gallium/drivers/radeonsi/meson.build
@@ -20,20 +20,21 @@ files_libradeonsi = files(
   'cik_sdma.c',
   'driinfo_radeonsi.h',
   'si_blit.c',
   'si_buffer.c',
   'si_build_pm4.h',
   'si_clear.c',
   'si_compute.c',
   'si_compute.h',
+  'si_compute_blit.c',
   'si_cp_dma.c',
   'si_debug.c',
   'si_descriptors.c',
   'si_dma.c',
   'si_dma_cs.c',
   'si_fence.c',
   'si_get.c',
   'si_gfx_cs.c',
   'si_gpu_load.c',
   'si_perfcounter.c',
diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c
index 4e07de81bac..520e5b94f4a 100644
--- a/src/gallium/drivers/radeonsi/si_clear.c
+++ b/src/gallium/drivers/radeonsi/si_clear.c
@@ -249,21 +249,21 @@ void vi_dcc_clear_level(struct si_context *sctx,
 		 * would be more efficient than separate per-layer clear operations.
 		 */
 		assert(tex->buffer.b.b.nr_storage_samples <= 2 || num_layers == 1);
 
 		dcc_offset += tex->surface.u.legacy.level[level].dcc_offset;
 		clear_size = tex->surface.u.legacy.level[level].dcc_fast_clear_size *
 			     num_layers;
 	}
 
 	si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size,
-			clear_value, SI_COHERENCY_CB_META);
+			&clear_value, 4, SI_COHERENCY_CB_META);
 }
 
 /* Set the same micro tile mode as the destination of the last MSAA resolve.
  * This allows hitting the MSAA resolve fast path, which requires that both
  * src and dst micro tile modes match.
  */
 static void si_set_optimal_micro_tile_mode(struct si_screen *sscreen,
 					   struct si_texture *tex)
 {
 	if (tex->buffer.b.is_shared ||
@@ -480,23 +480,24 @@ static void si_do_fast_color_clear(struct si_context *sctx,
 			if (eliminate_needed && too_small)
 				continue;
 
 			/* DCC fast clear with MSAA should clear CMASK to 0xC. */
 			if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) {
 				/* TODO: This doesn't work with MSAA. */
 				if (eliminate_needed)
 					continue;
 
+				uint32_t clear_value = 0xCCCCCCCC;
 				si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
 						tex->cmask_offset, tex->surface.cmask_size,
-						0xCCCCCCCC, SI_COHERENCY_CB_META);
+						&clear_value, 4, SI_COHERENCY_CB_META);
 				need_decompress_pass = true;
 			}
 
 			vi_dcc_clear_level(sctx, tex, 0, reset_value);
 
 			if (eliminate_needed)
 				need_decompress_pass = true;
 
 			tex->separate_dcc_dirty = true;
 		} else {
@@ -511,23 +512,24 @@ static void si_do_fast_color_clear(struct si_context *sctx,
 
 			/* RB+ doesn't work with CMASK fast clear on Stoney. */
 			if (sctx->family == CHIP_STONEY)
 				continue;
 
 			/* ensure CMASK is enabled */
 			si_alloc_separate_cmask(sctx->screen, tex);
 			if (!tex->cmask_buffer)
 				continue;
 
 			/* Do the fast clear. */
+			uint32_t clear_value = 0;
 			si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
-					tex->cmask_offset, tex->surface.cmask_size, 0,
-					SI_COHERENCY_CB_META);
+					tex->cmask_offset, tex->surface.cmask_size,
+					&clear_value, 4, SI_COHERENCY_CB_META);
 			need_decompress_pass = true;
 		}
 
 		if (need_decompress_pass &&
 		    !(tex->dirty_level_mask & (1 << level))) {
 			tex->dirty_level_mask |= 1 << level;
 			p_atomic_inc(&sctx->screen->compressed_colortex_counter);
 		}
 
 		/* We can change the micro tile mode before a full clear. */
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
new file mode 100644
index 00000000000..20e4f591fbb
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "si_pipe.h"
+
+/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
+ * and L2_STREAM for src.
+ */
+static enum si_cache_policy get_cache_policy(struct si_context *sctx,
+					     enum si_coherency coher,
+					     uint64_t size)
+{
+	if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META ||
+					  coher == SI_COHERENCY_CP)) ||
+	    (sctx->chip_class >= CIK && coher == SI_COHERENCY_SHADER))
+		return size <= 256 * 1024 ? L2_LRU : L2_STREAM;
+
+	return L2_BYPASS;
+}
+
+unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
+			    enum si_cache_policy cache_policy)
+{
+	switch (coher) {
+	default:
+	case SI_COHERENCY_NONE:
+	case SI_COHERENCY_CP:
+		return 0;
+	case SI_COHERENCY_SHADER:
+		return SI_CONTEXT_INV_SMEM_L1 |
+		       SI_CONTEXT_INV_VMEM_L1 |
+		       (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
+	case SI_COHERENCY_CB_META:
+		return SI_CONTEXT_FLUSH_AND_INV_CB;
+	}
+}
+
+static void si_compute_do_clear_or_copy(struct si_context *sctx,
+					struct pipe_resource *dst,
+					unsigned dst_offset,
+					struct pipe_resource *src,
+					unsigned src_offset,
+					unsigned size,
+					const uint32_t *clear_value,
+					unsigned clear_value_size,
+					enum si_coherency coher)
+{
+	struct pipe_context *ctx = &sctx->b;
+
+	assert(src_offset % 4 == 0);
+	assert(dst_offset % 4 == 0);
+	assert(size % 4 == 0);
+
+	assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
+	assert(!src || src_offset + size <= src->width0);
+
+	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+		       SI_CONTEXT_CS_PARTIAL_FLUSH |
+		       si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+	si_emit_cache_flush(sctx);
+
+	/* Save states. */
+	void *saved_cs = sctx->cs_shader_state.program;
+	struct pipe_shader_buffer saved_sb[2] = {};
+	si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
+
+	/* The memory accesses are coalesced, meaning that the 1st instruction writes
+	 * the 1st contiguous block of data for the whole wave, the 2nd instruction
+	 * writes the 2nd contiguous block of data, etc.
+	 */
+	unsigned dwords_per_thread = src ? SI_COMPUTE_COPY_DW_PER_THREAD :
+					   SI_COMPUTE_CLEAR_DW_PER_THREAD;
+	unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
+	unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
+	unsigned dwords_per_wave = dwords_per_thread * 64;
+
+	unsigned num_dwords = size / 4;
+	unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
+
+	struct pipe_grid_info info = {};
+	info.block[0] = MIN2(64, num_instructions);
+	info.block[1] = 1;
+	info.block[2] = 1;
+	info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
+	info.grid[1] = 1;
+	info.grid[2] = 1;
+
+	struct pipe_shader_buffer sb[2] = {};
+	sb[0].buffer = dst;
+	sb[0].buffer_offset = dst_offset;
+	sb[0].buffer_size = size;
+
+	if (src) {
+		sb[1].buffer = src;
+		sb[1].buffer_offset = src_offset;
+		sb[1].buffer_size = size;
+
+		ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb);
+		ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
+	} else {
+		assert(clear_value_size >= 4 &&
+		       clear_value_size <= 16 &&
+		       util_is_power_of_two_or_zero(clear_value_size));
+
+		for (unsigned i = 0; i < 4; i++)
+			sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];
+
+		ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb);
+		ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
+	}
+
+	ctx->launch_grid(ctx, &info);
+
+	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
+	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
+		       (cache_policy == L2_BYPASS ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0);
+
+	if (cache_policy != L2_BYPASS)
+		r600_resource(dst)->TC_L2_dirty = true;
+
+	/* Restore states. */
+	ctx->bind_compute_state(ctx, saved_cs);
+	ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
+}
+
+void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
+		     uint64_t offset, uint64_t size, uint32_t *clear_value,
+		     uint32_t clear_value_size, enum si_coherency coher)
+{
+	if (!size)
+		return;
+
+	unsigned clear_alignment = MIN2(clear_value_size, 4);
+
+	assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
+	assert(offset % clear_alignment == 0);
+	assert(size % clear_alignment == 0);
+	assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */
+
+	/* Reduce a large clear value size if possible. */
+	if (clear_value_size > 4) {
+		bool clear_dword_duplicated = true;
+
+		/* See if we can lower large fills to dword fills. */
+		for (unsigned i = 1; i < clear_value_size / 4; i++) {
+			if (clear_value[0] != clear_value[i]) {
+				clear_dword_duplicated = false;
+				break;
+			}
+		}
+		if (clear_dword_duplicated)
+			clear_value_size = 4;
+	}
+
+	/* Expand a small clear value size. */
+	uint32_t tmp_clear_value;
+	if (clear_value_size <= 2) {
+		if (clear_value_size == 1) {
+			tmp_clear_value = *(uint8_t*)clear_value;
+			tmp_clear_value |= (tmp_clear_value << 8) |
+					   (tmp_clear_value << 16) |
+					   (tmp_clear_value << 24);
+		} else {
+			tmp_clear_value = *(uint16_t*)clear_value;
+			tmp_clear_value |= tmp_clear_value << 16;
+		}
+		clear_value = &tmp_clear_value;
+		clear_value_size = 4;
+	}
+
+	/* Use transform feedback for 12-byte clears. */
+	/* TODO: Use compute. */
+	if (clear_value_size == 12) {
+		union pipe_color_union streamout_clear_value;
+
+		memcpy(&streamout_clear_value, clear_value, clear_value_size);
+		si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
+		util_blitter_clear_buffer(sctx->blitter, dst, offset,
+					  size, clear_value_size / 4,
+					  &streamout_clear_value);
+		si_blitter_end(sctx);
+		return;
+	}
+
+	uint64_t aligned_size = size & ~3ull;
+	if (aligned_size >= 4) {
+		/* Before GFX9, CP DMA was very slow when clearing GTT, so never
+		 * use CP DMA clears on those chips, because we can't be certain
+		 * about buffer placements.
+		 */
+		if (clear_value_size > 4 ||
+		    (clear_value_size == 4 &&
+		     offset % 4 == 0 &&
+		     (size > 32*1024 || sctx->chip_class <= VI))) {
+			si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
+						    aligned_size, clear_value,
+						    clear_value_size, coher);
+		} else {
+			assert(clear_value_size == 4);
+			si_cp_dma_clear_buffer(sctx, dst, offset,
+					       aligned_size, *clear_value, coher,
+					       get_cache_policy(sctx, coher, size));
+		}
+
+		offset += aligned_size;
+		size -= aligned_size;
+	}
+
+	/* Handle non-dword alignment. */
+	if (size) {
+		assert(dst);
+		assert(dst->target == PIPE_BUFFER);
+		assert(size < 4);
+
+		pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
+	}
+}
+
+static void si_pipe_clear_buffer(struct pipe_context *ctx,
+				 struct pipe_resource *dst,
+				 unsigned offset, unsigned size,
+				 const void *clear_value,
+				 int clear_value_size)
+{
+	enum si_coherency coher;
+
+	if (dst->flags & SI_RESOURCE_FLAG_SO_FILLED_SIZE)
+		coher = SI_COHERENCY_CP;
+	else
+		coher = SI_COHERENCY_SHADER;
+
+	si_clear_buffer((struct si_context*)ctx, dst, offset, size,
+			(uint32_t*)clear_value, clear_value_size, coher);
+}
+
+void si_copy_buffer(struct si_context *sctx,
+		    struct pipe_resource *dst, struct pipe_resource *src,
+		    uint64_t dst_offset, uint64_t src_offset, unsigned size)
+{
+	if (!size)
+		return;
+
+	enum si_coherency coher = SI_COHERENCY_SHADER;
+	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
+
+	/* Only use compute for VRAM copies on dGPUs. */
+	if (sctx->screen->info.has_dedicated_vram &&
+	    r600_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
+	    r600_resource(src)->domains & RADEON_DOMAIN_VRAM &&
+	    size > 32 * 1024 &&
+	    dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
+		si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset,
+					    size, NULL, 0, coher);
+	} else {
+		si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
+				      0, coher, cache_policy);
+	}
+}
+
+void si_init_compute_blit_functions(struct si_context *sctx)
+{
+	sctx->b.clear_buffer = si_pipe_clear_buffer;
+}
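A note on the clear-value handling above: callers now pass the clear
value by pointer with an explicit byte size, and si_clear_buffer first
tries to reduce or widen it to a single dword before picking a path.
For example (illustrative values only):

	uint32_t v = 0xAB;                      /* 1-byte clear value */
	v |= (v << 8) | (v << 16) | (v << 24);  /* widened to 0xABABABAB */
	/* ...then the dword-sized CP DMA or compute paths apply as usual. */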
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index c1ecd5fb3e8..839b31b7fdf 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -18,26 +18,20 @@
  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
 #include "si_pipe.h"
 #include "sid.h"
 
-/* Recommended maximum sizes for optimal performance.
- * Fall back to compute or SDMA if the size is greater.
- */
-#define CP_DMA_COPY_PERF_THRESHOLD	(64 * 1024) /* copied from Vulkan */
-#define CP_DMA_CLEAR_PERF_THRESHOLD	(32 * 1024) /* guess (clear is much slower) */
-
 /* Set this if you want the ME to wait until CP DMA is done.
  * It should be set on the last CP DMA packet. */
 #define CP_DMA_SYNC		(1 << 0)
 
 /* Set this if the source data was used as a destination in a previous CP DMA
  * packet. It's for preventing a read-after-write (RAW) hazard between two
  * CP DMA packets. */
 #define CP_DMA_RAW_WAIT		(1 << 1)
 #define CP_DMA_DST_IS_GDS	(1 << 2)
 #define CP_DMA_CLEAR		(1 << 3)
 
@@ -148,49 +142,20 @@ void si_cp_dma_wait_for_idle(struct si_context *sctx)
 {
 	/* Issue a dummy DMA that copies zero bytes.
 	 *
 	 * The DMA engine will see that there's no work to do and skip this
 	 * DMA request, however, the CP will see the sync flag and still wait
 	 * for all DMAs to complete.
 	 */
 	si_emit_cp_dma(sctx, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS);
 }
 
-static unsigned get_flush_flags(struct si_context *sctx, enum si_coherency coher,
-				enum si_cache_policy cache_policy)
-{
-	switch (coher) {
-	default:
-	case SI_COHERENCY_NONE:
-		return 0;
-	case SI_COHERENCY_SHADER:
-		assert(sctx->chip_class != SI || cache_policy == L2_BYPASS);
-		return SI_CONTEXT_INV_SMEM_L1 |
-		       SI_CONTEXT_INV_VMEM_L1 |
-		       (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
-	case SI_COHERENCY_CB_META:
-		assert(sctx->chip_class >= GFX9 ? cache_policy != L2_BYPASS :
-						  cache_policy == L2_BYPASS);
-		return SI_CONTEXT_FLUSH_AND_INV_CB;
-	}
-}
-
-static enum si_cache_policy get_cache_policy(struct si_context *sctx,
-					     enum si_coherency coher)
-{
-	if ((sctx->chip_class >= GFX9 && coher == SI_COHERENCY_CB_META) ||
-	    (sctx->chip_class >= CIK && coher == SI_COHERENCY_SHADER))
-		return L2_LRU;
-
-	return L2_BYPASS;
-}
-
 static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst,
 			      struct pipe_resource *src, unsigned byte_count,
 			      uint64_t remaining_size, unsigned user_flags,
 			      enum si_coherency coher, bool *is_first,
 			      unsigned *packet_flags)
 {
 	/* Fast exit for a CPDMA prefetch. */
 	if ((user_flags & SI_CPDMA_SKIP_ALL) == SI_CPDMA_SKIP_ALL) {
 		*is_first = false;
 		return;
@@ -255,21 +220,21 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 	/* Mark the buffer range of destination as valid (initialized),
 	 * so that transfer_map knows it should wait for the GPU when mapping
 	 * that range. */
 	if (rdst)
 		util_range_add(&rdst->valid_buffer_range, offset, offset + size);
 
 	/* Flush the caches. */
 	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 		       SI_CONTEXT_CS_PARTIAL_FLUSH |
-		       get_flush_flags(sctx, coher, cache_policy);
+		       si_get_flush_flags(sctx, coher, cache_policy);
 
 	while (size) {
 		unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
 		unsigned dma_flags = CP_DMA_CLEAR | (rdst ? 0 : CP_DMA_DST_IS_GDS);
 
 		si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, 0, coher,
 				  &is_first, &dma_flags);
 
 		/* Emit the clear packet. */
 		si_emit_cp_dma(sctx, va, value, byte_count, dma_flags, cache_policy);
 
@@ -279,136 +244,20 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 	}
 
 	if (rdst && cache_policy != L2_BYPASS)
 		rdst->TC_L2_dirty = true;
 
 	/* If it's not a framebuffer fast clear... */
 	if (coher == SI_COHERENCY_SHADER)
 		sctx->num_cp_dma_calls++;
 }
 
-/* dst == NULL means GDS. */
-void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
-		     uint64_t offset, uint64_t size, unsigned value,
-		     enum si_coherency coher)
-{
-	struct radeon_winsys *ws = sctx->ws;
-	struct r600_resource *rdst = r600_resource(dst);
-	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher);
-	uint64_t dma_clear_size;
-
-	if (!size)
-		return;
-
-	dma_clear_size = size & ~3ull;
-
-	/* dma_clear_buffer can use clear_buffer on failure. Make sure that
-	 * doesn't happen. We don't want an infinite recursion: */
-	if (sctx->dma_cs &&
-	    !(dst->flags & PIPE_RESOURCE_FLAG_SPARSE) &&
-	    (offset % 4 == 0) &&
-	    /* CP DMA is very slow. Always use SDMA for big clears. This
-	     * alone improves DeusEx:MD performance by 70%. */
-	    (size > CP_DMA_CLEAR_PERF_THRESHOLD ||
-	     /* Buffers not used by the GFX IB yet will be cleared by SDMA.
-	      * This happens to move most buffer clears to SDMA, including
-	      * DCC and CMASK clears, because pipe->clear clears them before
-	      * si_emit_framebuffer_state (in a draw call) adds them.
-	      * For example, DeusEx:MD has 21 buffer clears per frame and all
-	      * of them are moved to SDMA thanks to this. */
-	     !ws->cs_is_buffer_referenced(sctx->gfx_cs, rdst->buf,
-					  RADEON_USAGE_READWRITE))) {
-		si_sdma_clear_buffer(sctx, dst, offset, dma_clear_size, value);
-
-		offset += dma_clear_size;
-		size -= dma_clear_size;
-	} else if (dma_clear_size >= 4) {
-		si_cp_dma_clear_buffer(sctx, dst, offset, dma_clear_size, value,
-				       coher, cache_policy);
-
-		offset += dma_clear_size;
-		size -= dma_clear_size;
-	}
-
-	if (size) {
-		/* Handle non-dword alignment.
-		 *
-		 * This function is called for embedded texture metadata clears,
-		 * but those should always be properly aligned. */
-		assert(dst);
-		assert(dst->target == PIPE_BUFFER);
-		assert(size < 4);
-
-		pipe_buffer_write(&sctx->b, dst, offset, size, &value);
-	}
-}
-
-static void si_pipe_clear_buffer(struct pipe_context *ctx,
-				 struct pipe_resource *dst,
-				 unsigned offset, unsigned size,
-				 const void *clear_value_ptr,
-				 int clear_value_size)
-{
-	struct si_context *sctx = (struct si_context*)ctx;
-	uint32_t dword_value;
-	unsigned i;
-
-	assert(offset % clear_value_size == 0);
-	assert(size % clear_value_size == 0);
-
-	if (clear_value_size > 4) {
-		const uint32_t *u32 = clear_value_ptr;
-		bool clear_dword_duplicated = true;
-
-		/* See if we can lower large fills to dword fills. */
-		for (i = 1; i < clear_value_size / 4; i++)
-			if (u32[0] != u32[i]) {
-				clear_dword_duplicated = false;
-				break;
-			}
-
-		if (!clear_dword_duplicated) {
-			/* Use transform feedback for 64-bit, 96-bit, and
-			 * 128-bit fills.
-			 */
-			union pipe_color_union clear_value;
-
-			memcpy(&clear_value, clear_value_ptr, clear_value_size);
-			si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
-			util_blitter_clear_buffer(sctx->blitter, dst, offset,
-						  size, clear_value_size / 4,
-						  &clear_value);
-			si_blitter_end(sctx);
-			return;
-		}
-	}
-
-	/* Expand the clear value to a dword. */
-	switch (clear_value_size) {
-	case 1:
-		dword_value = *(uint8_t*)clear_value_ptr;
-		dword_value |= (dword_value << 8) |
-			       (dword_value << 16) |
-			       (dword_value << 24);
-		break;
-	case 2:
-		dword_value = *(uint16_t*)clear_value_ptr;
-		dword_value |= dword_value << 16;
-		break;
-	default:
-		dword_value = *(uint32_t*)clear_value_ptr;
-	}
-
-	si_clear_buffer(sctx, dst, offset, size, dword_value,
-			SI_COHERENCY_SHADER);
-}
-
 /**
  * Realign the CP DMA engine. This must be done after a copy with an unaligned
  * size.
  *
  * \param size	Remaining size to the CP DMA alignment.
  */
 static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size,
 				     unsigned user_flags, enum si_coherency coher,
 				     enum si_cache_policy cache_policy,
 				     bool *is_first)
@@ -502,21 +351,21 @@ void si_cp_dma_copy_buffer(struct si_context *sctx,
 			/* The main part will be skipped if the size is too small. */
 			skipped_size = MIN2(skipped_size, size);
 			size -= skipped_size;
 		}
 	}
 
 	/* Flush the caches. */
 	if ((dst || src) && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) {
 		sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 			       SI_CONTEXT_CS_PARTIAL_FLUSH |
-			       get_flush_flags(sctx, coher, cache_policy);
+			       si_get_flush_flags(sctx, coher, cache_policy);
 	}
 
 	/* This is the main part doing the copying. Src is always aligned. */
 	main_dst_offset = dst_offset + skipped_size;
 	main_src_offset = src_offset + skipped_size;
 
 	while (size) {
 		unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
 		unsigned dma_flags = gds_flags;
 
@@ -542,40 +391,26 @@ void si_cp_dma_copy_buffer(struct si_context *sctx,
 		si_emit_cp_dma(sctx, dst_offset, src_offset, skipped_size,
 			       dma_flags, cache_policy);
 	}
 
 	/* Finally, realign the engine if the size wasn't aligned. */
 	if (realign_size) {
 		si_cp_dma_realign_engine(sctx, realign_size, user_flags, coher,
 					 cache_policy, &is_first);
 	}
-}
-
-void si_copy_buffer(struct si_context *sctx,
-		    struct pipe_resource *dst, struct pipe_resource *src,
-		    uint64_t dst_offset, uint64_t src_offset, unsigned size)
-{
-	enum si_coherency coher = SI_COHERENCY_SHADER;
-	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher);
-
-	if (!size)
-		return;
-
-	si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
-			      0, coher, cache_policy);
 
-	if (cache_policy != L2_BYPASS)
+	if (dst && cache_policy != L2_BYPASS)
 		r600_resource(dst)->TC_L2_dirty = true;
 
-	/* If it's not a prefetch... */
-	if (dst_offset != src_offset)
+	/* If it's not a prefetch or GDS copy... */
+	if (dst && src && (dst != src || dst_offset != src_offset))
 		sctx->num_cp_dma_calls++;
 }
 
 void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
 			      uint64_t offset, unsigned size)
 {
 	assert(sctx->chip_class >= CIK);
 
 	si_cp_dma_copy_buffer(sctx, buf, buf, offset, offset, size,
 			      SI_CPDMA_SKIP_ALL, SI_COHERENCY_SHADER, L2_LRU);
@@ -737,15 +572,10 @@ void si_test_gds(struct si_context *sctx)
 	pipe_buffer_read(ctx, dst, 0, sizeof(r), r);
 	printf("GDS clear    = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3],
 	       r[0] == 0xc1ea4146 && r[1] == 0xc1ea4146 &&
 	       r[2] == 0xc1ea4146 && r[3] == 0xc1ea4146 ? "pass" : "fail");
 
 	pipe_resource_reference(&src, NULL);
 	pipe_resource_reference(&dst, NULL);
 	exit(0);
 }
-
-void si_init_cp_dma_functions(struct si_context *sctx)
-{
-	sctx->b.clear_buffer = si_pipe_clear_buffer;
-}
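For context, copies are now routed in si_copy_buffer (moved into
si_compute_blit.c above); the condition below restates the heuristic
from the patch with hypothetical shorthand variables:

	/* Compute path only for dword-aligned VRAM-to-VRAM copies larger
	 * than 32 KiB on dGPUs; everything else stays on CP DMA.
	 */
	bool use_compute = has_dedicated_vram &&
			   dst_in_vram && src_in_vram &&
			   size > 32 * 1024 &&
			   dst_offset % 4 == 0 &&
			   src_offset % 4 == 0 &&
			   size % 4 == 0;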
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 4b481b47af3..9d25748df40 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -188,20 +188,24 @@ static void si_destroy_context(struct pipe_context *context)
 	if (sctx->vs_blit_pos)
 		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos);
 	if (sctx->vs_blit_pos_layered)
 		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos_layered);
 	if (sctx->vs_blit_color)
 		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color);
 	if (sctx->vs_blit_color_layered)
 		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color_layered);
 	if (sctx->vs_blit_texcoord)
 		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_texcoord);
+	if (sctx->cs_clear_buffer)
+		sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer);
+	if (sctx->cs_copy_buffer)
+		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
 
 	if (sctx->blitter)
 		util_blitter_destroy(sctx->blitter);
 
 	/* Release DCC stats. */
 	for (int i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
 		assert(!sctx->dcc_stats[i].query_active);
 
 		for (int j = 0; j < ARRAY_SIZE(sctx->dcc_stats[i].ps_stats); j++)
 			if (sctx->dcc_stats[i].ps_stats[j])
@@ -409,21 +413,22 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 	    sctx->chip_class == GFX9) {
 		sctx->eop_bug_scratch = r600_resource(
 			pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
 					   16 * sscreen->info.num_render_backends));
 		if (!sctx->eop_bug_scratch)
 			goto fail;
 	}
 
 	sctx->allocator_zeroed_memory =
 		u_suballocator_create(&sctx->b, sscreen->info.gart_page_size,
-				      0, PIPE_USAGE_DEFAULT, 0, true);
+				      0, PIPE_USAGE_DEFAULT,
+				      SI_RESOURCE_FLAG_SO_FILLED_SIZE, true);
 	if (!sctx->allocator_zeroed_memory)
 		goto fail;
 
 	sctx->b.stream_uploader = u_upload_create(&sctx->b, 1024 * 1024,
 						  0, PIPE_USAGE_STREAM,
 						  SI_RESOURCE_FLAG_READ_ONLY);
 	if (!sctx->b.stream_uploader)
 		goto fail;
 
 	sctx->b.const_uploader = u_upload_create(&sctx->b, 128 * 1024,
@@ -446,21 +451,21 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 	if (sscreen->info.num_sdma_rings &&
 	    !(sscreen->debug_flags & DBG(NO_ASYNC_DMA))) {
 		sctx->dma_cs = sctx->ws->cs_create(sctx->ctx, RING_DMA,
 						   (void*)si_flush_dma_cs,
 						   sctx);
 	}
 
 	si_init_buffer_functions(sctx);
 	si_init_clear_functions(sctx);
 	si_init_blit_functions(sctx);
 	si_init_compute_functions(sctx);
-	si_init_cp_dma_functions(sctx);
+	si_init_compute_blit_functions(sctx);
 	si_init_debug_functions(sctx);
 	si_init_msaa_functions(sctx);
 	si_init_streamout_functions(sctx);
 
 	if (sscreen->info.has_hw_decode) {
 		sctx->b.create_video_codec = si_uvd_create_decoder;
 		sctx->b.create_video_buffer = si_video_buffer_create;
 	} else {
 		sctx->b.create_video_codec = vl_create_decoder;
 		sctx->b.create_video_buffer = vl_video_buffer_create;
@@ -496,20 +501,28 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 	si_init_ia_multi_vgt_param_table(sctx);
 
 	if (sctx->chip_class >= CIK)
 		cik_init_sdma_functions(sctx);
 	else
 		si_init_dma_functions(sctx);
 
 	if (sscreen->debug_flags & DBG(FORCE_DMA))
 		sctx->b.resource_copy_region = sctx->dma_copy;
 
+	bool dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;
+	sctx->cs_clear_buffer = si_create_dma_compute_shader(&sctx->b,
+					SI_COMPUTE_CLEAR_DW_PER_THREAD,
+					dst_stream_policy, false);
+	sctx->cs_copy_buffer = si_create_dma_compute_shader(&sctx->b,
+					SI_COMPUTE_COPY_DW_PER_THREAD,
+					dst_stream_policy, true);
+
 	sctx->blitter = util_blitter_create(&sctx->b);
 	if (sctx->blitter == NULL)
 		goto fail;
 	sctx->blitter->draw_rectangle = si_draw_rectangle;
 	sctx->blitter->skip_viewport_restore = true;
 
 	sctx->sample_mask = 0xffff;
 
 	if (sctx->chip_class >= GFX9) {
 		sctx->wait_mem_scratch = r600_resource(
@@ -554,23 +567,24 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 		si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS,
 				 &sctx->null_const_buf);
 		si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES,
 				 &sctx->null_const_buf);
 		si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE,
 				 &sctx->null_const_buf);
 		si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS,
 				 &sctx->null_const_buf);
 
 		/* Clear the NULL constant buffer, because loads should return zeros. */
+		uint32_t clear_value = 0;
 		si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0,
-				sctx->null_const_buf.buffer->width0, 0,
-				SI_COHERENCY_SHADER);
+				sctx->null_const_buf.buffer->width0,
+				&clear_value, 4, SI_COHERENCY_SHADER);
 	}
 
 	uint64_t max_threads_per_block;
 	screen->get_compute_param(screen, PIPE_SHADER_IR_TGSI,
 				  PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK,
 				  &max_threads_per_block);
 
 	/* The maximum number of scratch waves. Scratch space isn't divided
 	 * evenly between CUs. The number is only a function of the number of CUs.
 	 * We can decrease the constant to decrease the scratch buffer size.
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 7e15412ef87..7ae17435ab6 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -45,20 +45,25 @@
 /* The base vertex and primitive restart can be any number, but we must pick
  * one which will mean "unknown" for the purpose of state tracking and
  * the number shouldn't be a commonly-used one. */
 #define SI_BASE_VERTEX_UNKNOWN		INT_MIN
 #define SI_RESTART_INDEX_UNKNOWN	INT_MIN
 #define SI_NUM_SMOOTH_AA_SAMPLES	8
 #define SI_GS_PER_ES			128
 /* Alignment for optimal CP DMA performance. */
 #define SI_CPDMA_ALIGNMENT		32
 
+/* Tunables for compute-based clear_buffer and copy_buffer: */
+#define SI_COMPUTE_CLEAR_DW_PER_THREAD	4
+#define SI_COMPUTE_COPY_DW_PER_THREAD	4
+#define SI_COMPUTE_DST_CACHE_POLICY	L2_STREAM
+
 /* Pipeline & streamout query controls. */
 #define SI_CONTEXT_START_PIPELINE_STATS	(1 << 0)
 #define SI_CONTEXT_STOP_PIPELINE_STATS	(1 << 1)
 #define SI_CONTEXT_FLUSH_FOR_RENDER_COND (1 << 2)
 /* Instruction cache. */
 #define SI_CONTEXT_INV_ICACHE		(1 << 3)
 /* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */
 #define SI_CONTEXT_INV_SMEM_L1		(1 << 4)
 /* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */
 #define SI_CONTEXT_INV_VMEM_L1		(1 << 5)
@@ -95,20 +100,21 @@
 #define SI_MAP_BUFFER_ALIGNMENT		64
 #define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024
 
 #define SI_RESOURCE_FLAG_TRANSFER	(PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
 #define SI_RESOURCE_FLAG_FLUSHED_DEPTH	(PIPE_RESOURCE_FLAG_DRV_PRIV << 1)
 #define SI_RESOURCE_FLAG_FORCE_TILING	(PIPE_RESOURCE_FLAG_DRV_PRIV << 2)
 #define SI_RESOURCE_FLAG_DISABLE_DCC	(PIPE_RESOURCE_FLAG_DRV_PRIV << 3)
 #define SI_RESOURCE_FLAG_UNMAPPABLE	(PIPE_RESOURCE_FLAG_DRV_PRIV << 4)
 #define SI_RESOURCE_FLAG_READ_ONLY	(PIPE_RESOURCE_FLAG_DRV_PRIV << 5)
 #define SI_RESOURCE_FLAG_32BIT		(PIPE_RESOURCE_FLAG_DRV_PRIV << 6)
+#define SI_RESOURCE_FLAG_SO_FILLED_SIZE	(PIPE_RESOURCE_FLAG_DRV_PRIV << 7)
 
 /* Debug flags. */
 enum {
 	/* Shader logging options: */
 	DBG_VS = PIPE_SHADER_VERTEX,
 	DBG_PS = PIPE_SHADER_FRAGMENT,
 	DBG_GS = PIPE_SHADER_GEOMETRY,
 	DBG_TCS = PIPE_SHADER_TESS_CTRL,
 	DBG_TES = PIPE_SHADER_TESS_EVAL,
 	DBG_CS = PIPE_SHADER_COMPUTE,
@@ -165,20 +171,33 @@ enum {
 	DBG_TEST_VMFAULT_CP,
 	DBG_TEST_VMFAULT_SDMA,
 	DBG_TEST_VMFAULT_SHADER,
 	DBG_TEST_DMA_PERF,
 	DBG_TEST_GDS,
 };
 
 #define DBG_ALL_SHADERS		(((1 << (DBG_CS + 1)) - 1))
 #define DBG(name)		(1ull << DBG_##name)
 
+enum si_cache_policy {
+	L2_BYPASS,
+	L2_STREAM, /* same as SLC=1 */
+	L2_LRU,    /* same as SLC=0 */
+};
+
+enum si_coherency {
+	SI_COHERENCY_NONE, /* no cache flushes needed */
+	SI_COHERENCY_SHADER,
+	SI_COHERENCY_CB_META,
+	SI_COHERENCY_CP,
+};
+
 struct si_compute;
 struct hash_table;
 struct u_suballocator;
 
 /* Only 32-bit buffer allocations are supported, gallium doesn't support more
  * at the moment. */
 struct r600_resource {
 	struct threaded_resource	b;
 
@@ -766,20 +785,22 @@ struct si_context {
 	void				*custom_dsa_flush;
 	void				*custom_blend_resolve;
 	void				*custom_blend_fmask_decompress;
 	void				*custom_blend_eliminate_fastclear;
 	void				*custom_blend_dcc_decompress;
 	void				*vs_blit_pos;
 	void				*vs_blit_pos_layered;
 	void				*vs_blit_color;
 	void				*vs_blit_color_layered;
 	void				*vs_blit_texcoord;
+	void				*cs_clear_buffer;
+	void				*cs_copy_buffer;
 	struct si_screen		*screen;
 	struct pipe_debug_callback	debug;
 	struct ac_llvm_compiler		compiler; /* only non-threaded compilation */
 	struct si_shader_ctx_state	fixed_func_tcs_shader;
 	struct r600_resource		*wait_mem_scratch;
 	unsigned			wait_mem_number;
 	uint16_t			prefetch_L2_mask;
 
 	bool				gfx_flush_in_progress:1;
 	bool				gfx_last_ib_is_busy:1;
@@ -1103,65 +1124,57 @@ void si_init_screen_buffer_functions(struct si_screen *sscreen);
 void si_init_buffer_functions(struct si_context *sctx);
 
 /* si_clear.c */
 enum pipe_format si_simplify_cb_format(enum pipe_format format);
 bool vi_alpha_is_on_msb(enum pipe_format format);
 void vi_dcc_clear_level(struct si_context *sctx,
 			struct si_texture *tex,
 			unsigned level, unsigned clear_value);
 void si_init_clear_functions(struct si_context *sctx);
 
+/* si_compute_blit.c */
+unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
+			    enum si_cache_policy cache_policy);
+void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
+		     uint64_t offset, uint64_t size, uint32_t *clear_value,
+		     uint32_t clear_value_size, enum si_coherency coher);
+void si_copy_buffer(struct si_context *sctx,
+		    struct pipe_resource *dst, struct pipe_resource *src,
+		    uint64_t dst_offset, uint64_t src_offset, unsigned size);
+void si_init_compute_blit_functions(struct si_context *sctx);
+
 /* si_cp_dma.c */
 #define SI_CPDMA_SKIP_CHECK_CS_SPACE	(1 << 0) /* don't call need_cs_space */
 #define SI_CPDMA_SKIP_SYNC_AFTER	(1 << 1) /* don't wait for DMA after the copy */
 #define SI_CPDMA_SKIP_SYNC_BEFORE	(1 << 2) /* don't wait for DMA before the copy (RAW hazards) */
 #define SI_CPDMA_SKIP_GFX_SYNC		(1 << 3) /* don't flush caches and don't wait for PS/CS */
 #define SI_CPDMA_SKIP_BO_LIST_UPDATE	(1 << 4) /* don't update the BO list */
 #define SI_CPDMA_SKIP_ALL (SI_CPDMA_SKIP_CHECK_CS_SPACE | \
 			   SI_CPDMA_SKIP_SYNC_AFTER | \
 			   SI_CPDMA_SKIP_SYNC_BEFORE | \
 			   SI_CPDMA_SKIP_GFX_SYNC | \
 			   SI_CPDMA_SKIP_BO_LIST_UPDATE)
 
-enum si_cache_policy {
-	L2_BYPASS,
-	L2_STREAM, /* same as SLC=1 */
-	L2_LRU,    /* same as SLC=0 */
-};
-
-enum si_coherency {
-	SI_COHERENCY_NONE, /* no cache flushes needed */
-	SI_COHERENCY_SHADER,
-	SI_COHERENCY_CB_META,
-};
-
 void si_cp_dma_wait_for_idle(struct si_context *sctx);
 void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 			    uint64_t offset, uint64_t size, unsigned value,
 			    enum si_coherency coher,
 			    enum si_cache_policy cache_policy);
-void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
-		     uint64_t offset, uint64_t size, unsigned value,
-		     enum si_coherency coher);
 void si_cp_dma_copy_buffer(struct si_context *sctx,
 			   struct pipe_resource *dst, struct pipe_resource *src,
 			   uint64_t dst_offset, uint64_t src_offset, unsigned size,
 			   unsigned user_flags, enum si_coherency coher,
 			   enum si_cache_policy cache_policy);
-void si_copy_buffer(struct si_context *sctx,
-		    struct pipe_resource *dst, struct pipe_resource *src,
-		    uint64_t dst_offset, uint64_t src_offset, unsigned size);
 void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
 			      uint64_t offset, unsigned size);
 void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only);
 void si_test_gds(struct si_context *sctx);
-void si_init_cp_dma_functions(struct si_context *sctx);
 
 /* si_debug.c */
 void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs,
 		struct radeon_saved_cs *saved, bool get_buffer_list);
 void si_clear_saved_cs(struct radeon_saved_cs *saved);
 void si_destroy_saved_cs(struct si_saved_cs *scs);
 void si_auto_log_cs(void *data, struct u_log_context *log);
 void si_log_hw_flush(struct si_context *sctx);
 void si_log_draw_state(struct si_context *sctx, struct u_log_context *log);
 void si_log_compute_state(struct si_context *sctx, struct u_log_context *log);
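A note on the new SI_RESOURCE_FLAG_SO_FILLED_SIZE flag: si_create_context
sets it on the zeroed-memory suballocator (which backs streamout
filled-size buffers), so si_pipe_clear_buffer can pick the coherency
mode directly; restating the selection from the code above:

	enum si_coherency coher =
		(dst->flags & SI_RESOURCE_FLAG_SO_FILLED_SIZE) ?
			SI_COHERENCY_CP : SI_COHERENCY_SHADER;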
diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c b/src/gallium/drivers/radeonsi/si_test_dma.c
index c81ec75dde2..90a2032cd80 100644
--- a/src/gallium/drivers/radeonsi/si_test_dma.c
+++ b/src/gallium/drivers/radeonsi/si_test_dma.c
@@ -300,21 +300,22 @@ void si_test_dma(struct si_screen *sscreen)
 		       i, tdst.width0, tdst.height0, tdst.array_size,
 		       array_mode_to_string(sscreen, &sdst->surface),
 		       tsrc.width0, tsrc.height0, tsrc.array_size,
 		       array_mode_to_string(sscreen, &ssrc->surface), bpp);
 		fflush(stdout);
 
 		/* set src pixels */
 		set_random_pixels(ctx, src, &src_cpu);
 
 		/* clear dst pixels */
-		si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, 0,
+		uint32_t zero = 0;
+		si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4,
 				SI_COHERENCY_SHADER);
 		memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size);
 
 		/* preparation */
 		max_width = MIN2(tsrc.width0, tdst.width0);
 		max_height = MIN2(tsrc.height0, tdst.height0);
 		max_depth = MIN2(tsrc.array_size, tdst.array_size);
 
 		num = do_partial_copies ? num_partial_copies : 1;
 		for (j = 0; j < num; j++) {
-- 
2.17.1