From: Marek Olšák <marek.ol...@amd.com>

Fast color clears should be much faster. Also, fast color clears on
evicted buffers should be 200x faster on GFX8 and older.
---
 src/gallium/drivers/radeonsi/Makefile.sources |   1 +
 src/gallium/drivers/radeonsi/meson.build      |   1 +
 src/gallium/drivers/radeonsi/si_clear.c       |  10 +-
 .../drivers/radeonsi/si_compute_blit.c        | 285 ++++++++++++++++++
 src/gallium/drivers/radeonsi/si_cp_dma.c      | 180 +----------
 src/gallium/drivers/radeonsi/si_pipe.c        |  22 +-
 src/gallium/drivers/radeonsi/si_pipe.h        |  51 ++--
 src/gallium/drivers/radeonsi/si_test_dma.c    |   3 +-
 8 files changed, 350 insertions(+), 203 deletions(-)
 create mode 100644 src/gallium/drivers/radeonsi/si_compute_blit.c
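A note on how the new compute path sizes its dispatches (a
back-of-the-envelope sketch derived from the constants added in
si_pipe.h below; the local variables here are illustrative only, not
part of the patch):

	/* SI_COMPUTE_CLEAR_DW_PER_THREAD is 4, so one 64-thread wave clears
	 * 4 * 64 = 256 dwords (1 KiB). A 1 MiB clear thus launches 1024
	 * workgroups of 64 threads each.
	 */
	unsigned dwords_per_thread = 4;            /* SI_COMPUTE_CLEAR_DW_PER_THREAD */
	unsigned dwords_per_wave = dwords_per_thread * 64;              /* 256 */
	unsigned num_dwords = (1024 * 1024) / 4;                 /* 1 MiB clear */
	unsigned num_waves = DIV_ROUND_UP(num_dwords, dwords_per_wave); /* 1024 */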
diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources
index abdc4e07f1e..aeb9b7982c4 100644
--- a/src/gallium/drivers/radeonsi/Makefile.sources
+++ b/src/gallium/drivers/radeonsi/Makefile.sources
@@ -4,20 +4,21 @@ GENERATED_SOURCES := \
 C_SOURCES := \
 	$(GENERATED_SOURCES) \
 	cik_sdma.c \
 	driinfo_radeonsi.h \
 	si_blit.c \
 	si_buffer.c \
 	si_build_pm4.h \
 	si_clear.c \
 	si_compute.c \
 	si_compute.h \
+	si_compute_blit.c \
 	si_cp_dma.c \
 	si_debug.c \
 	si_descriptors.c \
 	si_dma.c \
 	si_dma_cs.c \
 	si_fence.c \
 	si_get.c \
 	si_gfx_cs.c \
 	si_gpu_load.c \
 	si_pipe.c \
diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build
index 4d6044f724b..2542f136d11 100644
--- a/src/gallium/drivers/radeonsi/meson.build
+++ b/src/gallium/drivers/radeonsi/meson.build
@@ -20,20 +20,21 @@ files_libradeonsi = files(
   'cik_sdma.c',
   'driinfo_radeonsi.h',
   'si_blit.c',
   'si_buffer.c',
   'si_build_pm4.h',
   'si_clear.c',
   'si_compute.c',
   'si_compute.h',
+  'si_compute_blit.c',
   'si_cp_dma.c',
   'si_debug.c',
   'si_descriptors.c',
   'si_dma.c',
   'si_dma_cs.c',
   'si_fence.c',
   'si_get.c',
   'si_gfx_cs.c',
   'si_gpu_load.c',
   'si_perfcounter.c',
diff --git a/src/gallium/drivers/radeonsi/si_clear.c b/src/gallium/drivers/radeonsi/si_clear.c
index 4e07de81bac..520e5b94f4a 100644
--- a/src/gallium/drivers/radeonsi/si_clear.c
+++ b/src/gallium/drivers/radeonsi/si_clear.c
@@ -249,21 +249,21 @@ void vi_dcc_clear_level(struct si_context *sctx,
 		 * would be more efficient than separate per-layer clear operations.
 		 */
 		assert(tex->buffer.b.b.nr_storage_samples <= 2 || num_layers == 1);
 
 		dcc_offset += tex->surface.u.legacy.level[level].dcc_offset;
 		clear_size = tex->surface.u.legacy.level[level].dcc_fast_clear_size *
 			     num_layers;
 	}
 
 	si_clear_buffer(sctx, dcc_buffer, dcc_offset, clear_size,
-			clear_value, SI_COHERENCY_CB_META);
+			&clear_value, 4, SI_COHERENCY_CB_META);
 }
 
 /* Set the same micro tile mode as the destination of the last MSAA resolve.
  * This allows hitting the MSAA resolve fast path, which requires that both
  * src and dst micro tile modes match.
  */
 static void si_set_optimal_micro_tile_mode(struct si_screen *sscreen,
 					   struct si_texture *tex)
 {
 	if (tex->buffer.b.is_shared ||
@@ -480,23 +480,24 @@ static void si_do_fast_color_clear(struct si_context *sctx,
 			if (eliminate_needed && too_small)
 				continue;
 
 			/* DCC fast clear with MSAA should clear CMASK to 0xC. */
 			if (tex->buffer.b.b.nr_samples >= 2 && tex->cmask_buffer) {
 				/* TODO: This doesn't work with MSAA. */
 				if (eliminate_needed)
 					continue;
 
+				uint32_t clear_value = 0xCCCCCCCC;
 				si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
 						tex->cmask_offset, tex->surface.cmask_size,
-						0xCCCCCCCC, SI_COHERENCY_CB_META);
+						&clear_value, 4, SI_COHERENCY_CB_META);
 				need_decompress_pass = true;
 			}
 
 			vi_dcc_clear_level(sctx, tex, 0, reset_value);
 
 			if (eliminate_needed)
 				need_decompress_pass = true;
 
 			tex->separate_dcc_dirty = true;
 		} else {
@@ -511,23 +512,24 @@ static void si_do_fast_color_clear(struct si_context *sctx,
 
 			/* RB+ doesn't work with CMASK fast clear on Stoney. */
 			if (sctx->family == CHIP_STONEY)
 				continue;
 
 			/* ensure CMASK is enabled */
 			si_alloc_separate_cmask(sctx->screen, tex);
 			if (!tex->cmask_buffer)
 				continue;
 
 			/* Do the fast clear. */
+			uint32_t clear_value = 0;
 			si_clear_buffer(sctx, &tex->cmask_buffer->b.b,
-					tex->cmask_offset, tex->surface.cmask_size, 0,
-					SI_COHERENCY_CB_META);
+					tex->cmask_offset, tex->surface.cmask_size,
+					&clear_value, 4, SI_COHERENCY_CB_META);
 			need_decompress_pass = true;
 		}
 
 		if (need_decompress_pass &&
 		    !(tex->dirty_level_mask & (1 << level))) {
 			tex->dirty_level_mask |= 1 << level;
 			p_atomic_inc(&sctx->screen->compressed_colortex_counter);
 		}
 
 		/* We can change the micro tile mode before a full clear. */
diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c
new file mode 100644
index 00000000000..20e4f591fbb
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/si_compute_blit.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright 2018 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "si_pipe.h"
+
+/* Note: Compute shaders always use SI_COMPUTE_DST_CACHE_POLICY for dst
+ * and L2_STREAM for src.
+ */
+static enum si_cache_policy get_cache_policy(struct si_context *sctx,
+					     enum si_coherency coher,
+					     uint64_t size)
+{
+	if ((sctx->chip_class >= GFX9 && (coher == SI_COHERENCY_CB_META ||
+					  coher == SI_COHERENCY_CP)) ||
+	    (sctx->chip_class >= CIK && coher == SI_COHERENCY_SHADER))
+		return size <= 256 * 1024 ? L2_LRU : L2_STREAM;
+
+	return L2_BYPASS;
+}
+
+unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
+			    enum si_cache_policy cache_policy)
+{
+	switch (coher) {
+	default:
+	case SI_COHERENCY_NONE:
+	case SI_COHERENCY_CP:
+		return 0;
+	case SI_COHERENCY_SHADER:
+		return SI_CONTEXT_INV_SMEM_L1 |
+		       SI_CONTEXT_INV_VMEM_L1 |
+		       (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
+	case SI_COHERENCY_CB_META:
+		return SI_CONTEXT_FLUSH_AND_INV_CB;
+	}
+}
+
+static void si_compute_do_clear_or_copy(struct si_context *sctx,
+					struct pipe_resource *dst,
+					unsigned dst_offset,
+					struct pipe_resource *src,
+					unsigned src_offset,
+					unsigned size,
+					const uint32_t *clear_value,
+					unsigned clear_value_size,
+					enum si_coherency coher)
+{
+	struct pipe_context *ctx = &sctx->b;
+
+	assert(src_offset % 4 == 0);
+	assert(dst_offset % 4 == 0);
+	assert(size % 4 == 0);
+
+	assert(dst->target != PIPE_BUFFER || dst_offset + size <= dst->width0);
+	assert(!src || src_offset + size <= src->width0);
+
+	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
+		       SI_CONTEXT_CS_PARTIAL_FLUSH |
+		       si_get_flush_flags(sctx, coher, SI_COMPUTE_DST_CACHE_POLICY);
+	si_emit_cache_flush(sctx);
+
+	/* Save states. */
+	void *saved_cs = sctx->cs_shader_state.program;
+	struct pipe_shader_buffer saved_sb[2] = {};
+	si_get_shader_buffers(sctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
+
+	/* The memory accesses are coalesced, meaning that the 1st instruction writes
+	 * the 1st contiguous block of data for the whole wave, the 2nd instruction
+	 * writes the 2nd contiguous block of data, etc.
+	 */
+	unsigned dwords_per_thread = src ? SI_COMPUTE_COPY_DW_PER_THREAD :
+					   SI_COMPUTE_CLEAR_DW_PER_THREAD;
+	unsigned instructions_per_thread = MAX2(1, dwords_per_thread / 4);
+	unsigned dwords_per_instruction = dwords_per_thread / instructions_per_thread;
+	unsigned dwords_per_wave = dwords_per_thread * 64;
+
+	unsigned num_dwords = size / 4;
+	unsigned num_instructions = DIV_ROUND_UP(num_dwords, dwords_per_instruction);
+
+	struct pipe_grid_info info = {};
+	info.block[0] = MIN2(64, num_instructions);
+	info.block[1] = 1;
+	info.block[2] = 1;
+	info.grid[0] = DIV_ROUND_UP(num_dwords, dwords_per_wave);
+	info.grid[1] = 1;
+	info.grid[2] = 1;
+
+	struct pipe_shader_buffer sb[2] = {};
+	sb[0].buffer = dst;
+	sb[0].buffer_offset = dst_offset;
+	sb[0].buffer_size = size;
+
+	if (src) {
+		sb[1].buffer = src;
+		sb[1].buffer_offset = src_offset;
+		sb[1].buffer_size = size;
+
+		ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 2, sb);
+		ctx->bind_compute_state(ctx, sctx->cs_copy_buffer);
+	} else {
+		assert(clear_value_size >= 4 &&
+		       clear_value_size <= 16 &&
+		       util_is_power_of_two_or_zero(clear_value_size));
+
+		for (unsigned i = 0; i < 4; i++)
+			sctx->cs_user_data[i] = clear_value[i % (clear_value_size / 4)];
+
+		ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, 1, sb);
+		ctx->bind_compute_state(ctx, sctx->cs_clear_buffer);
+	}
+
+	ctx->launch_grid(ctx, &info);
+
+	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
+	sctx->flags |= SI_CONTEXT_CS_PARTIAL_FLUSH |
+		       (cache_policy == L2_BYPASS ? SI_CONTEXT_WRITEBACK_GLOBAL_L2 : 0);
+
+	if (cache_policy != L2_BYPASS)
+		r600_resource(dst)->TC_L2_dirty = true;
+
+	/* Restore states. */
+	ctx->bind_compute_state(ctx, saved_cs);
+	ctx->set_shader_buffers(ctx, PIPE_SHADER_COMPUTE, 0, src ? 2 : 1, saved_sb);
+}
+
+void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
+		     uint64_t offset, uint64_t size, uint32_t *clear_value,
+		     uint32_t clear_value_size, enum si_coherency coher)
+{
+	if (!size)
+		return;
+
+	unsigned clear_alignment = MIN2(clear_value_size, 4);
+
+	assert(clear_value_size != 3 && clear_value_size != 6); /* 12 is allowed. */
+	assert(offset % clear_alignment == 0);
+	assert(size % clear_alignment == 0);
+	assert(size < (UINT_MAX & ~0xf)); /* TODO: test 64-bit sizes in all codepaths */
+
+	/* Reduce a large clear value size if possible. */
+	if (clear_value_size > 4) {
+		bool clear_dword_duplicated = true;
+
+		/* See if we can lower large fills to dword fills. */
+		for (unsigned i = 1; i < clear_value_size / 4; i++) {
+			if (clear_value[0] != clear_value[i]) {
+				clear_dword_duplicated = false;
+				break;
+			}
+		}
+		if (clear_dword_duplicated)
+			clear_value_size = 4;
+	}
+
+	/* Expand a small clear value size. */
+	uint32_t tmp_clear_value;
+	if (clear_value_size <= 2) {
+		if (clear_value_size == 1) {
+			tmp_clear_value = *(uint8_t*)clear_value;
+			tmp_clear_value |= (tmp_clear_value << 8) |
+					   (tmp_clear_value << 16) |
+					   (tmp_clear_value << 24);
+		} else {
+			tmp_clear_value = *(uint16_t*)clear_value;
+			tmp_clear_value |= tmp_clear_value << 16;
+		}
+		clear_value = &tmp_clear_value;
+		clear_value_size = 4;
+	}
+
+	/* Use transform feedback for 12-byte clears. */
+	/* TODO: Use compute. */
+	if (clear_value_size == 12) {
+		union pipe_color_union streamout_clear_value;
+
+		memcpy(&streamout_clear_value, clear_value, clear_value_size);
+		si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
+		util_blitter_clear_buffer(sctx->blitter, dst, offset,
+					  size, clear_value_size / 4,
+					  &streamout_clear_value);
+		si_blitter_end(sctx);
+		return;
+	}
+
+	uint64_t aligned_size = size & ~3ull;
+	if (aligned_size >= 4) {
+		/* Before GFX9, CP DMA was very slow when clearing GTT, so never
+		 * use CP DMA clears on those chips, because we can't be certain
+		 * about buffer placements.
+		 */
+		if (clear_value_size > 4 ||
+		    (clear_value_size == 4 &&
+		     offset % 4 == 0 &&
+		     (size > 32*1024 || sctx->chip_class <= VI))) {
+			si_compute_do_clear_or_copy(sctx, dst, offset, NULL, 0,
+						    aligned_size, clear_value,
+						    clear_value_size, coher);
+		} else {
+			assert(clear_value_size == 4);
+			si_cp_dma_clear_buffer(sctx, dst, offset,
+					       aligned_size, *clear_value, coher,
+					       get_cache_policy(sctx, coher, size));
+		}
+
+		offset += aligned_size;
+		size -= aligned_size;
+	}
+
+	/* Handle non-dword alignment. */
+	if (size) {
+		assert(dst);
+		assert(dst->target == PIPE_BUFFER);
+		assert(size < 4);
+
+		pipe_buffer_write(&sctx->b, dst, offset, size, clear_value);
+	}
+}
+
+static void si_pipe_clear_buffer(struct pipe_context *ctx,
+				 struct pipe_resource *dst,
+				 unsigned offset, unsigned size,
+				 const void *clear_value,
+				 int clear_value_size)
+{
+	enum si_coherency coher;
+
+	if (dst->flags & SI_RESOURCE_FLAG_SO_FILLED_SIZE)
+		coher = SI_COHERENCY_CP;
+	else
+		coher = SI_COHERENCY_SHADER;
+
+	si_clear_buffer((struct si_context*)ctx, dst, offset, size,
+			(uint32_t*)clear_value, clear_value_size, coher);
+}
+
+void si_copy_buffer(struct si_context *sctx,
+		    struct pipe_resource *dst, struct pipe_resource *src,
+		    uint64_t dst_offset, uint64_t src_offset, unsigned size)
+{
+	if (!size)
+		return;
+
+	enum si_coherency coher = SI_COHERENCY_SHADER;
+	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher, size);
+
+	/* Only use compute for VRAM copies on dGPUs. */
+	if (sctx->screen->info.has_dedicated_vram &&
+	    r600_resource(dst)->domains & RADEON_DOMAIN_VRAM &&
+	    r600_resource(src)->domains & RADEON_DOMAIN_VRAM &&
+	    size > 32 * 1024 &&
+	    dst_offset % 4 == 0 && src_offset % 4 == 0 && size % 4 == 0) {
+		si_compute_do_clear_or_copy(sctx, dst, dst_offset, src, src_offset,
+					    size, NULL, 0, coher);
+	} else {
+		si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
+				      0, coher, cache_policy);
+	}
+}
+
+void si_init_compute_blit_functions(struct si_context *sctx)
+{
+	sctx->b.clear_buffer = si_pipe_clear_buffer;
+}
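A note on the clear-value handling above: callers now pass the clear
value by pointer with an explicit byte size, and si_clear_buffer first
tries to reduce or widen it to a single dword before picking a path.
For example (illustrative values only):

	uint32_t v = 0xAB;                      /* 1-byte clear value */
	v |= (v << 8) | (v << 16) | (v << 24);  /* widened to 0xABABABAB */
	/* ...then the dword-sized CP DMA or compute paths apply as usual. */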
diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c b/src/gallium/drivers/radeonsi/si_cp_dma.c
index c1ecd5fb3e8..839b31b7fdf 100644
--- a/src/gallium/drivers/radeonsi/si_cp_dma.c
+++ b/src/gallium/drivers/radeonsi/si_cp_dma.c
@@ -18,26 +18,20 @@
  * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
  * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
  * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
  * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
  * USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
 #include "si_pipe.h"
 #include "sid.h"
 
-/* Recommended maximum sizes for optimal performance.
- * Fall back to compute or SDMA if the size is greater.
- */
-#define CP_DMA_COPY_PERF_THRESHOLD	(64 * 1024) /* copied from Vulkan */
-#define CP_DMA_CLEAR_PERF_THRESHOLD	(32 * 1024) /* guess (clear is much slower) */
-
 /* Set this if you want the ME to wait until CP DMA is done.
  * It should be set on the last CP DMA packet. */
 #define CP_DMA_SYNC		(1 << 0)
 
 /* Set this if the source data was used as a destination in a previous CP DMA
  * packet. It's for preventing a read-after-write (RAW) hazard between two
  * CP DMA packets. */
 #define CP_DMA_RAW_WAIT		(1 << 1)
 #define CP_DMA_DST_IS_GDS	(1 << 2)
 #define CP_DMA_CLEAR		(1 << 3)
 
@@ -148,49 +142,20 @@ void si_cp_dma_wait_for_idle(struct si_context *sctx)
 {
 	/* Issue a dummy DMA that copies zero bytes.
 	 *
 	 * The DMA engine will see that there's no work to do and skip this
 	 * DMA request, however, the CP will see the sync flag and still wait
 	 * for all DMAs to complete.
 	 */
 	si_emit_cp_dma(sctx, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS);
 }
 
-static unsigned get_flush_flags(struct si_context *sctx, enum si_coherency coher,
-				enum si_cache_policy cache_policy)
-{
-	switch (coher) {
-	default:
-	case SI_COHERENCY_NONE:
-		return 0;
-	case SI_COHERENCY_SHADER:
-		assert(sctx->chip_class != SI || cache_policy == L2_BYPASS);
-		return SI_CONTEXT_INV_SMEM_L1 |
-		       SI_CONTEXT_INV_VMEM_L1 |
-		       (cache_policy == L2_BYPASS ? SI_CONTEXT_INV_GLOBAL_L2 : 0);
-	case SI_COHERENCY_CB_META:
-		assert(sctx->chip_class >= GFX9 ? cache_policy != L2_BYPASS :
-						  cache_policy == L2_BYPASS);
-		return SI_CONTEXT_FLUSH_AND_INV_CB;
-	}
-}
-
-static enum si_cache_policy get_cache_policy(struct si_context *sctx,
-					     enum si_coherency coher)
-{
-	if ((sctx->chip_class >= GFX9 && coher == SI_COHERENCY_CB_META) ||
-	    (sctx->chip_class >= CIK && coher == SI_COHERENCY_SHADER))
-		return L2_LRU;
-
-	return L2_BYPASS;
-}
-
 static void si_cp_dma_prepare(struct si_context *sctx, struct pipe_resource *dst,
 			      struct pipe_resource *src, unsigned byte_count,
 			      uint64_t remaining_size, unsigned user_flags,
 			      enum si_coherency coher, bool *is_first,
 			      unsigned *packet_flags)
 {
 	/* Fast exit for a CPDMA prefetch. */
 	if ((user_flags & SI_CPDMA_SKIP_ALL) == SI_CPDMA_SKIP_ALL) {
 		*is_first = false;
 		return;
@@ -255,21 +220,21 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 	/* Mark the buffer range of destination as valid (initialized),
 	 * so that transfer_map knows it should wait for the GPU when mapping
 	 * that range. */
 	if (rdst)
 		util_range_add(&rdst->valid_buffer_range, offset, offset + size);
 
 	/* Flush the caches. */
 	sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 		       SI_CONTEXT_CS_PARTIAL_FLUSH |
-		       get_flush_flags(sctx, coher, cache_policy);
+		       si_get_flush_flags(sctx, coher, cache_policy);
 
 	while (size) {
 		unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
 		unsigned dma_flags = CP_DMA_CLEAR | (rdst ? 0 : CP_DMA_DST_IS_GDS);
 
 		si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, 0, coher,
 				  &is_first, &dma_flags);
 
 		/* Emit the clear packet. */
 		si_emit_cp_dma(sctx, va, value, byte_count, dma_flags, cache_policy);
 
@@ -279,136 +244,20 @@ void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 	}
 
 	if (rdst && cache_policy != L2_BYPASS)
 		rdst->TC_L2_dirty = true;
 
 	/* If it's not a framebuffer fast clear... */
 	if (coher == SI_COHERENCY_SHADER)
 		sctx->num_cp_dma_calls++;
 }
 
-/* dst == NULL means GDS. */
-void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
-		     uint64_t offset, uint64_t size, unsigned value,
-		     enum si_coherency coher)
-{
-	struct radeon_winsys *ws = sctx->ws;
-	struct r600_resource *rdst = r600_resource(dst);
-	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher);
-	uint64_t dma_clear_size;
-
-	if (!size)
-		return;
-
-	dma_clear_size = size & ~3ull;
-
-	/* dma_clear_buffer can use clear_buffer on failure. Make sure that
-	 * doesn't happen. We don't want an infinite recursion: */
-	if (sctx->dma_cs &&
-	    !(dst->flags & PIPE_RESOURCE_FLAG_SPARSE) &&
-	    (offset % 4 == 0) &&
-	    /* CP DMA is very slow. Always use SDMA for big clears. This
-	     * alone improves DeusEx:MD performance by 70%. */
-	    (size > CP_DMA_CLEAR_PERF_THRESHOLD ||
-	     /* Buffers not used by the GFX IB yet will be cleared by SDMA.
-	      * This happens to move most buffer clears to SDMA, including
-	      * DCC and CMASK clears, because pipe->clear clears them before
-	      * si_emit_framebuffer_state (in a draw call) adds them.
-	      * For example, DeusEx:MD has 21 buffer clears per frame and all
-	      * of them are moved to SDMA thanks to this. */
-	     !ws->cs_is_buffer_referenced(sctx->gfx_cs, rdst->buf,
-					  RADEON_USAGE_READWRITE))) {
-		si_sdma_clear_buffer(sctx, dst, offset, dma_clear_size, value);
-
-		offset += dma_clear_size;
-		size -= dma_clear_size;
-	} else if (dma_clear_size >= 4) {
-		si_cp_dma_clear_buffer(sctx, dst, offset, dma_clear_size, value,
-				       coher, cache_policy);
-
-		offset += dma_clear_size;
-		size -= dma_clear_size;
-	}
-
-	if (size) {
-		/* Handle non-dword alignment.
-		 *
-		 * This function is called for embedded texture metadata clears,
-		 * but those should always be properly aligned. */
-		assert(dst);
-		assert(dst->target == PIPE_BUFFER);
-		assert(size < 4);
-
-		pipe_buffer_write(&sctx->b, dst, offset, size, &value);
-	}
-}
-
-static void si_pipe_clear_buffer(struct pipe_context *ctx,
-				 struct pipe_resource *dst,
-				 unsigned offset, unsigned size,
-				 const void *clear_value_ptr,
-				 int clear_value_size)
-{
-	struct si_context *sctx = (struct si_context*)ctx;
-	uint32_t dword_value;
-	unsigned i;
-
-	assert(offset % clear_value_size == 0);
-	assert(size % clear_value_size == 0);
-
-	if (clear_value_size > 4) {
-		const uint32_t *u32 = clear_value_ptr;
-		bool clear_dword_duplicated = true;
-
-		/* See if we can lower large fills to dword fills. */
-		for (i = 1; i < clear_value_size / 4; i++)
-			if (u32[0] != u32[i]) {
-				clear_dword_duplicated = false;
-				break;
-			}
-
-		if (!clear_dword_duplicated) {
-			/* Use transform feedback for 64-bit, 96-bit, and
-			 * 128-bit fills.
-			 */
-			union pipe_color_union clear_value;
-
-			memcpy(&clear_value, clear_value_ptr, clear_value_size);
-			si_blitter_begin(sctx, SI_DISABLE_RENDER_COND);
-			util_blitter_clear_buffer(sctx->blitter, dst, offset,
-						  size, clear_value_size / 4,
-						  &clear_value);
-			si_blitter_end(sctx);
-			return;
-		}
-	}
-
-	/* Expand the clear value to a dword. */
-	switch (clear_value_size) {
-	case 1:
-		dword_value = *(uint8_t*)clear_value_ptr;
-		dword_value |= (dword_value << 8) |
-			       (dword_value << 16) |
-			       (dword_value << 24);
-		break;
-	case 2:
-		dword_value = *(uint16_t*)clear_value_ptr;
-		dword_value |= dword_value << 16;
-		break;
-	default:
-		dword_value = *(uint32_t*)clear_value_ptr;
-	}
-
-	si_clear_buffer(sctx, dst, offset, size, dword_value,
-			SI_COHERENCY_SHADER);
-}
-
 /**
  * Realign the CP DMA engine. This must be done after a copy with an unaligned
  * size.
  *
  * \param size	Remaining size to the CP DMA alignment.
  */
 static void si_cp_dma_realign_engine(struct si_context *sctx, unsigned size,
 				     unsigned user_flags, enum si_coherency coher,
 				     enum si_cache_policy cache_policy,
 				     bool *is_first)
@@ -502,21 +351,21 @@ void si_cp_dma_copy_buffer(struct si_context *sctx,
 			/* The main part will be skipped if the size is too small. */
 			skipped_size = MIN2(skipped_size, size);
 			size -= skipped_size;
 		}
 	}
 
 	/* Flush the caches. */
 	if ((dst || src) && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) {
 		sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
 			       SI_CONTEXT_CS_PARTIAL_FLUSH |
-			       get_flush_flags(sctx, coher, cache_policy);
+			       si_get_flush_flags(sctx, coher, cache_policy);
 	}
 
 	/* This is the main part doing the copying. Src is always aligned. */
 	main_dst_offset = dst_offset + skipped_size;
 	main_src_offset = src_offset + skipped_size;
 
 	while (size) {
 		unsigned byte_count = MIN2(size, cp_dma_max_byte_count(sctx));
 		unsigned dma_flags = gds_flags;
 
@@ -542,40 +391,26 @@ void si_cp_dma_copy_buffer(struct si_context *sctx,
 		si_emit_cp_dma(sctx, dst_offset, src_offset, skipped_size,
 			       dma_flags, cache_policy);
 	}
 
 	/* Finally, realign the engine if the size wasn't aligned. */
 	if (realign_size) {
 		si_cp_dma_realign_engine(sctx, realign_size, user_flags, coher,
 					 cache_policy, &is_first);
 	}
-}
-
-void si_copy_buffer(struct si_context *sctx,
-		    struct pipe_resource *dst, struct pipe_resource *src,
-		    uint64_t dst_offset, uint64_t src_offset, unsigned size)
-{
-	enum si_coherency coher = SI_COHERENCY_SHADER;
-	enum si_cache_policy cache_policy = get_cache_policy(sctx, coher);
-
-	if (!size)
-		return;
-
-	si_cp_dma_copy_buffer(sctx, dst, src, dst_offset, src_offset, size,
-			      0, coher, cache_policy);
 
-	if (cache_policy != L2_BYPASS)
+	if (dst && cache_policy != L2_BYPASS)
 		r600_resource(dst)->TC_L2_dirty = true;
 
-	/* If it's not a prefetch... */
-	if (dst_offset != src_offset)
+	/* If it's not a prefetch or GDS copy... */
+	if (dst && src && (dst != src || dst_offset != src_offset))
 		sctx->num_cp_dma_calls++;
 }
 
 void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
 			      uint64_t offset, unsigned size)
 {
 	assert(sctx->chip_class >= CIK);
 
 	si_cp_dma_copy_buffer(sctx, buf, buf, offset, offset, size,
 			      SI_CPDMA_SKIP_ALL, SI_COHERENCY_SHADER, L2_LRU);
@@ -737,15 +572,10 @@ void si_test_gds(struct si_context *sctx)
 	pipe_buffer_read(ctx, dst, 0, sizeof(r), r);
 	printf("GDS clear    = %08x %08x %08x %08x -> %s\n", r[0], r[1], r[2], r[3],
 	       r[0] == 0xc1ea4146 && r[1] == 0xc1ea4146 &&
 	       r[2] == 0xc1ea4146 && r[3] == 0xc1ea4146 ? "pass" : "fail");
 
 	pipe_resource_reference(&src, NULL);
 	pipe_resource_reference(&dst, NULL);
 	exit(0);
 }
-
-void si_init_cp_dma_functions(struct si_context *sctx)
-{
-	sctx->b.clear_buffer = si_pipe_clear_buffer;
-}
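For context, copies are now routed in si_copy_buffer (moved into
si_compute_blit.c above); the condition below restates the heuristic
from the patch with hypothetical shorthand variables:

	/* Compute path only for dword-aligned VRAM-to-VRAM copies larger
	 * than 32 KiB on dGPUs; everything else stays on CP DMA.
	 */
	bool use_compute = has_dedicated_vram &&
			   dst_in_vram && src_in_vram &&
			   size > 32 * 1024 &&
			   dst_offset % 4 == 0 &&
			   src_offset % 4 == 0 &&
			   size % 4 == 0;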
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 4b481b47af3..9d25748df40 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -188,20 +188,24 @@ static void si_destroy_context(struct pipe_context *context)
 	if (sctx->vs_blit_pos)
 		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos);
 	if (sctx->vs_blit_pos_layered)
 		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_pos_layered);
 	if (sctx->vs_blit_color)
 		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color);
 	if (sctx->vs_blit_color_layered)
 		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_color_layered);
 	if (sctx->vs_blit_texcoord)
 		sctx->b.delete_vs_state(&sctx->b, sctx->vs_blit_texcoord);
+	if (sctx->cs_clear_buffer)
+		sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer);
+	if (sctx->cs_copy_buffer)
+		sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer);
 
 	if (sctx->blitter)
 		util_blitter_destroy(sctx->blitter);
 
 	/* Release DCC stats. */
 	for (int i = 0; i < ARRAY_SIZE(sctx->dcc_stats); i++) {
 		assert(!sctx->dcc_stats[i].query_active);
 
 		for (int j = 0; j < ARRAY_SIZE(sctx->dcc_stats[i].ps_stats); j++)
 			if (sctx->dcc_stats[i].ps_stats[j])
@@ -409,21 +413,22 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 	    sctx->chip_class == GFX9) {
 		sctx->eop_bug_scratch = r600_resource(
 			pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
 					   16 * sscreen->info.num_render_backends));
 		if (!sctx->eop_bug_scratch)
 			goto fail;
 	}
 
 	sctx->allocator_zeroed_memory =
 		u_suballocator_create(&sctx->b, sscreen->info.gart_page_size,
-				      0, PIPE_USAGE_DEFAULT, 0, true);
+				      0, PIPE_USAGE_DEFAULT,
+				      SI_RESOURCE_FLAG_SO_FILLED_SIZE, true);
 	if (!sctx->allocator_zeroed_memory)
 		goto fail;
 
 	sctx->b.stream_uploader = u_upload_create(&sctx->b, 1024 * 1024,
 						  0, PIPE_USAGE_STREAM,
 						  SI_RESOURCE_FLAG_READ_ONLY);
 	if (!sctx->b.stream_uploader)
 		goto fail;
 
 	sctx->b.const_uploader = u_upload_create(&sctx->b, 128 * 1024,
@@ -446,21 +451,21 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 	if (sscreen->info.num_sdma_rings &&
 	    !(sscreen->debug_flags & DBG(NO_ASYNC_DMA))) {
 		sctx->dma_cs = sctx->ws->cs_create(sctx->ctx, RING_DMA,
 						   (void*)si_flush_dma_cs,
 						   sctx);
 	}
 
 	si_init_buffer_functions(sctx);
 	si_init_clear_functions(sctx);
 	si_init_blit_functions(sctx);
 	si_init_compute_functions(sctx);
-	si_init_cp_dma_functions(sctx);
+	si_init_compute_blit_functions(sctx);
 	si_init_debug_functions(sctx);
 	si_init_msaa_functions(sctx);
 	si_init_streamout_functions(sctx);
 
 	if (sscreen->info.has_hw_decode) {
 		sctx->b.create_video_codec = si_uvd_create_decoder;
 		sctx->b.create_video_buffer = si_video_buffer_create;
 	} else {
 		sctx->b.create_video_codec = vl_create_decoder;
 		sctx->b.create_video_buffer = vl_video_buffer_create;
@@ -496,20 +501,28 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 	si_init_ia_multi_vgt_param_table(sctx);
 
 	if (sctx->chip_class >= CIK)
 		cik_init_sdma_functions(sctx);
 	else
 		si_init_dma_functions(sctx);
 
 	if (sscreen->debug_flags & DBG(FORCE_DMA))
 		sctx->b.resource_copy_region = sctx->dma_copy;
 
+	bool dst_stream_policy = SI_COMPUTE_DST_CACHE_POLICY != L2_LRU;
+	sctx->cs_clear_buffer = si_create_dma_compute_shader(&sctx->b,
+					SI_COMPUTE_CLEAR_DW_PER_THREAD,
+					dst_stream_policy, false);
+	sctx->cs_copy_buffer = si_create_dma_compute_shader(&sctx->b,
+					SI_COMPUTE_COPY_DW_PER_THREAD,
+					dst_stream_policy, true);
+
 	sctx->blitter = util_blitter_create(&sctx->b);
 	if (sctx->blitter == NULL)
 		goto fail;
 	sctx->blitter->draw_rectangle = si_draw_rectangle;
 	sctx->blitter->skip_viewport_restore = true;
 
 	sctx->sample_mask = 0xffff;
 
 	if (sctx->chip_class >= GFX9) {
 		sctx->wait_mem_scratch = r600_resource(
@@ -554,23 +567,24 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 		si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS,
 				 &sctx->null_const_buf);
 		si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES,
 				 &sctx->null_const_buf);
 		si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE,
 				 &sctx->null_const_buf);
 		si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS,
 				 &sctx->null_const_buf);
 
 		/* Clear the NULL constant buffer, because loads should return zeros. */
+		uint32_t clear_value = 0;
 		si_clear_buffer(sctx, sctx->null_const_buf.buffer, 0,
-				sctx->null_const_buf.buffer->width0, 0,
-				SI_COHERENCY_SHADER);
+				sctx->null_const_buf.buffer->width0,
+				&clear_value, 4, SI_COHERENCY_SHADER);
 	}
 
 	uint64_t max_threads_per_block;
 	screen->get_compute_param(screen, PIPE_SHADER_IR_TGSI,
 				  PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK,
 				  &max_threads_per_block);
 
 	/* The maximum number of scratch waves. Scratch space isn't divided
 	 * evenly between CUs. The number is only a function of the number of CUs.
 	 * We can decrease the constant to decrease the scratch buffer size.
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 7e15412ef87..7ae17435ab6 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -45,20 +45,25 @@
 /* The base vertex and primitive restart can be any number, but we must pick
  * one which will mean "unknown" for the purpose of state tracking and
  * the number shouldn't be a commonly-used one. */
 #define SI_BASE_VERTEX_UNKNOWN		INT_MIN
 #define SI_RESTART_INDEX_UNKNOWN	INT_MIN
 #define SI_NUM_SMOOTH_AA_SAMPLES	8
 #define SI_GS_PER_ES			128
 /* Alignment for optimal CP DMA performance. */
 #define SI_CPDMA_ALIGNMENT		32
 
+/* Tunables for compute-based clear_buffer and copy_buffer: */
+#define SI_COMPUTE_CLEAR_DW_PER_THREAD	4
+#define SI_COMPUTE_COPY_DW_PER_THREAD	4
+#define SI_COMPUTE_DST_CACHE_POLICY	L2_STREAM
+
 /* Pipeline & streamout query controls. */
 #define SI_CONTEXT_START_PIPELINE_STATS	(1 << 0)
 #define SI_CONTEXT_STOP_PIPELINE_STATS	(1 << 1)
 #define SI_CONTEXT_FLUSH_FOR_RENDER_COND (1 << 2)
 /* Instruction cache. */
 #define SI_CONTEXT_INV_ICACHE		(1 << 3)
 /* SMEM L1, other names: KCACHE, constant cache, DCACHE, data cache */
 #define SI_CONTEXT_INV_SMEM_L1		(1 << 4)
 /* VMEM L1 can optionally be bypassed (GLC=1). Other names: TC L1 */
 #define SI_CONTEXT_INV_VMEM_L1		(1 << 5)
@@ -95,20 +100,21 @@
 #define SI_MAP_BUFFER_ALIGNMENT		64
 #define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024
 
 #define SI_RESOURCE_FLAG_TRANSFER	(PIPE_RESOURCE_FLAG_DRV_PRIV << 0)
 #define SI_RESOURCE_FLAG_FLUSHED_DEPTH	(PIPE_RESOURCE_FLAG_DRV_PRIV << 1)
 #define SI_RESOURCE_FLAG_FORCE_TILING	(PIPE_RESOURCE_FLAG_DRV_PRIV << 2)
 #define SI_RESOURCE_FLAG_DISABLE_DCC	(PIPE_RESOURCE_FLAG_DRV_PRIV << 3)
 #define SI_RESOURCE_FLAG_UNMAPPABLE	(PIPE_RESOURCE_FLAG_DRV_PRIV << 4)
 #define SI_RESOURCE_FLAG_READ_ONLY	(PIPE_RESOURCE_FLAG_DRV_PRIV << 5)
 #define SI_RESOURCE_FLAG_32BIT		(PIPE_RESOURCE_FLAG_DRV_PRIV << 6)
+#define SI_RESOURCE_FLAG_SO_FILLED_SIZE	(PIPE_RESOURCE_FLAG_DRV_PRIV << 7)
 
 /* Debug flags. */
 enum {
 	/* Shader logging options: */
 	DBG_VS = PIPE_SHADER_VERTEX,
 	DBG_PS = PIPE_SHADER_FRAGMENT,
 	DBG_GS = PIPE_SHADER_GEOMETRY,
 	DBG_TCS = PIPE_SHADER_TESS_CTRL,
 	DBG_TES = PIPE_SHADER_TESS_EVAL,
 	DBG_CS = PIPE_SHADER_COMPUTE,
@@ -165,20 +171,33 @@ enum {
 	DBG_TEST_VMFAULT_CP,
 	DBG_TEST_VMFAULT_SDMA,
 	DBG_TEST_VMFAULT_SHADER,
 	DBG_TEST_DMA_PERF,
 	DBG_TEST_GDS,
 };
 
 #define DBG_ALL_SHADERS		(((1 << (DBG_CS + 1)) - 1))
 #define DBG(name)		(1ull << DBG_##name)
 
+enum si_cache_policy {
+	L2_BYPASS,
+	L2_STREAM, /* same as SLC=1 */
+	L2_LRU,    /* same as SLC=0 */
+};
+
+enum si_coherency {
+	SI_COHERENCY_NONE, /* no cache flushes needed */
+	SI_COHERENCY_SHADER,
+	SI_COHERENCY_CB_META,
+	SI_COHERENCY_CP,
+};
+
 struct si_compute;
 struct hash_table;
 struct u_suballocator;
 
 /* Only 32-bit buffer allocations are supported, gallium doesn't support more
  * at the moment. */
 struct r600_resource {
 	struct threaded_resource	b;
 
@@ -766,20 +785,22 @@ struct si_context {
 	void				*custom_dsa_flush;
 	void				*custom_blend_resolve;
 	void				*custom_blend_fmask_decompress;
 	void				*custom_blend_eliminate_fastclear;
 	void				*custom_blend_dcc_decompress;
 	void				*vs_blit_pos;
 	void				*vs_blit_pos_layered;
 	void				*vs_blit_color;
 	void				*vs_blit_color_layered;
 	void				*vs_blit_texcoord;
+	void				*cs_clear_buffer;
+	void				*cs_copy_buffer;
 	struct si_screen		*screen;
 	struct pipe_debug_callback	debug;
 	struct ac_llvm_compiler		compiler; /* only non-threaded compilation */
 	struct si_shader_ctx_state	fixed_func_tcs_shader;
 	struct r600_resource		*wait_mem_scratch;
 	unsigned			wait_mem_number;
 	uint16_t			prefetch_L2_mask;
 
 	bool				gfx_flush_in_progress:1;
 	bool				gfx_last_ib_is_busy:1;
@@ -1103,65 +1124,57 @@ void si_init_screen_buffer_functions(struct si_screen *sscreen);
 void si_init_buffer_functions(struct si_context *sctx);
 
 /* si_clear.c */
 enum pipe_format si_simplify_cb_format(enum pipe_format format);
 bool vi_alpha_is_on_msb(enum pipe_format format);
 void vi_dcc_clear_level(struct si_context *sctx,
 			struct si_texture *tex,
 			unsigned level, unsigned clear_value);
 void si_init_clear_functions(struct si_context *sctx);
 
+/* si_compute_blit.c */
+unsigned si_get_flush_flags(struct si_context *sctx, enum si_coherency coher,
+			    enum si_cache_policy cache_policy);
+void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
+		     uint64_t offset, uint64_t size, uint32_t *clear_value,
+		     uint32_t clear_value_size, enum si_coherency coher);
+void si_copy_buffer(struct si_context *sctx,
+		    struct pipe_resource *dst, struct pipe_resource *src,
+		    uint64_t dst_offset, uint64_t src_offset, unsigned size);
+void si_init_compute_blit_functions(struct si_context *sctx);
+
 /* si_cp_dma.c */
 #define SI_CPDMA_SKIP_CHECK_CS_SPACE	(1 << 0) /* don't call need_cs_space */
 #define SI_CPDMA_SKIP_SYNC_AFTER	(1 << 1) /* don't wait for DMA after the copy */
 #define SI_CPDMA_SKIP_SYNC_BEFORE	(1 << 2) /* don't wait for DMA before the copy (RAW hazards) */
 #define SI_CPDMA_SKIP_GFX_SYNC		(1 << 3) /* don't flush caches and don't wait for PS/CS */
 #define SI_CPDMA_SKIP_BO_LIST_UPDATE	(1 << 4) /* don't update the BO list */
 #define SI_CPDMA_SKIP_ALL (SI_CPDMA_SKIP_CHECK_CS_SPACE | \
 			   SI_CPDMA_SKIP_SYNC_AFTER | \
 			   SI_CPDMA_SKIP_SYNC_BEFORE | \
 			   SI_CPDMA_SKIP_GFX_SYNC | \
 			   SI_CPDMA_SKIP_BO_LIST_UPDATE)
 
-enum si_cache_policy {
-	L2_BYPASS,
-	L2_STREAM, /* same as SLC=1 */
-	L2_LRU,    /* same as SLC=0 */
-};
-
-enum si_coherency {
-	SI_COHERENCY_NONE, /* no cache flushes needed */
-	SI_COHERENCY_SHADER,
-	SI_COHERENCY_CB_META,
-};
-
 void si_cp_dma_wait_for_idle(struct si_context *sctx);
 void si_cp_dma_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
 			    uint64_t offset, uint64_t size, unsigned value,
 			    enum si_coherency coher,
 			    enum si_cache_policy cache_policy);
-void si_clear_buffer(struct si_context *sctx, struct pipe_resource *dst,
-		     uint64_t offset, uint64_t size, unsigned value,
-		     enum si_coherency coher);
 void si_cp_dma_copy_buffer(struct si_context *sctx,
 			   struct pipe_resource *dst, struct pipe_resource *src,
 			   uint64_t dst_offset, uint64_t src_offset, unsigned size,
 			   unsigned user_flags, enum si_coherency coher,
 			   enum si_cache_policy cache_policy);
-void si_copy_buffer(struct si_context *sctx,
-		    struct pipe_resource *dst, struct pipe_resource *src,
-		    uint64_t dst_offset, uint64_t src_offset, unsigned size);
 void cik_prefetch_TC_L2_async(struct si_context *sctx, struct pipe_resource *buf,
 			      uint64_t offset, unsigned size);
 void cik_emit_prefetch_L2(struct si_context *sctx, bool vertex_stage_only);
 void si_test_gds(struct si_context *sctx);
-void si_init_cp_dma_functions(struct si_context *sctx);
 
 /* si_debug.c */
 void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs,
 		struct radeon_saved_cs *saved, bool get_buffer_list);
 void si_clear_saved_cs(struct radeon_saved_cs *saved);
 void si_destroy_saved_cs(struct si_saved_cs *scs);
 void si_auto_log_cs(void *data, struct u_log_context *log);
 void si_log_hw_flush(struct si_context *sctx);
 void si_log_draw_state(struct si_context *sctx, struct u_log_context *log);
 void si_log_compute_state(struct si_context *sctx, struct u_log_context *log);
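A note on the new SI_RESOURCE_FLAG_SO_FILLED_SIZE flag: si_create_context
sets it on the zeroed-memory suballocator (which backs streamout
filled-size buffers), so si_pipe_clear_buffer can pick the coherency
mode directly; restating the selection from the code above:

	enum si_coherency coher =
		(dst->flags & SI_RESOURCE_FLAG_SO_FILLED_SIZE) ?
			SI_COHERENCY_CP : SI_COHERENCY_SHADER;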
diff --git a/src/gallium/drivers/radeonsi/si_test_dma.c b/src/gallium/drivers/radeonsi/si_test_dma.c
index c81ec75dde2..90a2032cd80 100644
--- a/src/gallium/drivers/radeonsi/si_test_dma.c
+++ b/src/gallium/drivers/radeonsi/si_test_dma.c
@@ -300,21 +300,22 @@ void si_test_dma(struct si_screen *sscreen)
 		       i, tdst.width0, tdst.height0, tdst.array_size,
 		       array_mode_to_string(sscreen, &sdst->surface),
 		       tsrc.width0, tsrc.height0, tsrc.array_size,
 		       array_mode_to_string(sscreen, &ssrc->surface), bpp);
 		fflush(stdout);
 
 		/* set src pixels */
 		set_random_pixels(ctx, src, &src_cpu);
 
 		/* clear dst pixels */
-		si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, 0,
+		uint32_t zero = 0;
+		si_clear_buffer(sctx, dst, 0, sdst->surface.surf_size, &zero, 4,
 				SI_COHERENCY_SHADER);
 		memset(dst_cpu.ptr, 0, dst_cpu.layer_stride * tdst.array_size);
 
 		/* preparation */
 		max_width = MIN2(tsrc.width0, tdst.width0);
 		max_height = MIN2(tsrc.height0, tdst.height0);
 		max_depth = MIN2(tsrc.array_size, tdst.array_size);
 
 		num = do_partial_copies ? num_partial_copies : 1;
 		for (j = 0; j < num; j++) {
-- 
2.17.1