From: Marek Olšák <marek.ol...@amd.com> Call r600_dma_emit_wait_idle only when there is a possibility of a read-after-write hazard. Buffers not yet used by the SDMA IB don't have to wait. The wait is now emitted from r600_need_dma_space, before the DMA packets of an operation are written, and only if the destination buffer is already referenced by the current IB (for reading or writing) or the source buffer has already been written by it; the unconditional wait after every copy/clear is removed. --- src/gallium/drivers/r600/evergreen_hw_context.c | 1 - src/gallium/drivers/r600/evergreen_state.c | 1 - src/gallium/drivers/r600/r600_hw_context.c | 1 - src/gallium/drivers/r600/r600_state.c | 1 - src/gallium/drivers/radeon/r600_pipe_common.c | 48 ++++++++++++++----------- src/gallium/drivers/radeon/r600_pipe_common.h | 1 - src/gallium/drivers/radeonsi/cik_sdma.c | 8 ----- src/gallium/drivers/radeonsi/si_dma.c | 2 -- 8 files changed, 27 insertions(+), 36 deletions(-)
diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c b/src/gallium/drivers/r600/evergreen_hw_context.c index 06f0348..5352dc0 100644 --- a/src/gallium/drivers/r600/evergreen_hw_context.c +++ b/src/gallium/drivers/r600/evergreen_hw_context.c @@ -70,21 +70,20 @@ void evergreen_dma_copy_buffer(struct r600_context *rctx, RADEON_PRIO_SDMA_BUFFER); radeon_emit(cs, DMA_PACKET(DMA_PACKET_COPY, sub_cmd, csize)); radeon_emit(cs, dst_offset & 0xffffffff); radeon_emit(cs, src_offset & 0xffffffff); radeon_emit(cs, (dst_offset >> 32UL) & 0xff); radeon_emit(cs, (src_offset >> 32UL) & 0xff); dst_offset += csize << shift; src_offset += csize << shift; size -= csize; } - r600_dma_emit_wait_idle(&rctx->b); } /* The max number of bytes to copy per packet. */ #define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8) void evergreen_cp_dma_clear_buffer(struct r600_context *rctx, struct pipe_resource *dst, uint64_t offset, unsigned size, uint32_t clear_value, enum r600_coherency coher) { diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index 015ff02..c5dd9f7 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -3446,21 +3446,20 @@ static void evergreen_dma_copy_tile(struct r600_context *rctx, radeon_emit(cs, (pitch_tile_max << 0) | ((height - 1) << 16)); radeon_emit(cs, (slice_tile_max << 0)); radeon_emit(cs, (x << 0) | (z << 18)); radeon_emit(cs, (y << 0) | (tile_split << 21) | (nbanks << 25) | (non_disp_tiling << 28)); radeon_emit(cs, addr & 0xfffffffc); radeon_emit(cs, (addr >> 32UL) & 0xff); copy_height -= cheight; addr += cheight * pitch; y += cheight; } - r600_dma_emit_wait_idle(&rctx->b); } static void evergreen_dma_copy(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz, struct pipe_resource *src, unsigned src_level, const struct pipe_box *src_box) { diff --git a/src/gallium/drivers/r600/r600_hw_context.c 
b/src/gallium/drivers/r600/r600_hw_context.c index bc6217a..4663d99 100644 --- a/src/gallium/drivers/r600/r600_hw_context.c +++ b/src/gallium/drivers/r600/r600_hw_context.c @@ -548,12 +548,11 @@ void r600_dma_copy_buffer(struct r600_context *rctx, RADEON_PRIO_SDMA_BUFFER); radeon_emit(cs, DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize)); radeon_emit(cs, dst_offset & 0xfffffffc); radeon_emit(cs, src_offset & 0xfffffffc); radeon_emit(cs, (dst_offset >> 32UL) & 0xff); radeon_emit(cs, (src_offset >> 32UL) & 0xff); dst_offset += csize << 2; src_offset += csize << 2; size -= csize; } - r600_dma_emit_wait_idle(&rctx->b); } diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c index ba97490..006bb62 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -2897,21 +2897,20 @@ static boolean r600_dma_copy_tile(struct r600_context *rctx, (lbpp << 24) | ((height - 1) << 10) | pitch_tile_max); radeon_emit(cs, (slice_tile_max << 12) | (z << 0)); radeon_emit(cs, (x << 3) | (y << 17)); radeon_emit(cs, addr & 0xfffffffc); radeon_emit(cs, (addr >> 32UL) & 0xff); copy_height -= cheight; addr += cheight * pitch; y += cheight; } - r600_dma_emit_wait_idle(&rctx->b); return TRUE; } static void r600_dma_copy(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz, struct pipe_resource *src, unsigned src_level, const struct pipe_box *src_box) diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c index 6b7bbaf..4d8bb74 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.c +++ b/src/gallium/drivers/radeon/r600_pipe_common.c @@ -217,20 +217,35 @@ void r600_draw_rectangle(struct blitter_context *blitter, memcpy(vb+12, attrib->f, sizeof(float)*4); memcpy(vb+20, attrib->f, sizeof(float)*4); } /* draw */ util_draw_vertex_buffer(&rctx->b, NULL, buf, blitter->vb_slot, offset, R600_PRIM_RECTANGLE_LIST, 3, 2); 
pipe_resource_reference(&buf, NULL); } +static void r600_dma_emit_wait_idle(struct r600_common_context *rctx) +{ + struct radeon_winsys_cs *cs = rctx->dma.cs; + + /* NOP waits for idle on Evergreen and later. */ + if (rctx->chip_class >= CIK) + radeon_emit(cs, 0x00000000); /* NOP */ + else if (rctx->chip_class >= EVERGREEN) + radeon_emit(cs, 0xf0000000); /* NOP */ + else { + /* TODO: R600-R700 should use the FENCE packet. + * CS checker support is required. */ + } +} + void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw, struct r600_resource *dst, struct r600_resource *src) { uint64_t vram = ctx->dma.cs->used_vram; uint64_t gtt = ctx->dma.cs->used_gart; if (dst) { vram += dst->vram_usage; gtt += dst->gart_usage; } @@ -254,66 +269,57 @@ void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw, * * IBs using too little memory are limited by the IB submission overhead. * IBs using too much memory are limited by the kernel/TTM overhead. * Too long IBs create CPU-GPU pipeline bubbles and add latency. * * This heuristic makes sure that DMA requests are executed * very soon after the call is made and lowers memory usage. * It improves texture upload performance by keeping the DMA * engine busy while uploads are being submitted. */ + num_dw++; /* for emit_wait_idle below */ if (!ctx->ws->cs_check_space(ctx->dma.cs, num_dw) || ctx->dma.cs->used_vram + ctx->dma.cs->used_gart > 64 * 1024 * 1024 || !radeon_cs_memory_below_limit(ctx->screen, ctx->dma.cs, vram, gtt)) { ctx->dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL); assert((num_dw + ctx->dma.cs->current.cdw) <= ctx->dma.cs->current.max_dw); } + /* Wait for idle if either buffer has been used in the IB before to + * prevent read-after-write hazards. 
+ */ + if ((dst && + ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, dst->buf, + RADEON_USAGE_READWRITE)) || + (src && + ctx->ws->cs_is_buffer_referenced(ctx->dma.cs, src->buf, + RADEON_USAGE_WRITE))) + r600_dma_emit_wait_idle(ctx); + /* If GPUVM is not supported, the CS checker needs 2 entries * in the buffer list per packet, which has to be done manually. */ if (ctx->screen->info.has_virtual_memory) { if (dst) radeon_add_to_buffer_list(ctx, &ctx->dma, dst, RADEON_USAGE_WRITE, RADEON_PRIO_SDMA_BUFFER); if (src) radeon_add_to_buffer_list(ctx, &ctx->dma, src, RADEON_USAGE_READ, RADEON_PRIO_SDMA_BUFFER); } /* this function is called before all DMA calls, so increment this. */ ctx->num_dma_calls++; } -/* This is required to prevent read-after-write hazards. */ -void r600_dma_emit_wait_idle(struct r600_common_context *rctx) -{ - struct radeon_winsys_cs *cs = rctx->dma.cs; - - r600_need_dma_space(rctx, 1, NULL, NULL); - - if (!radeon_emitted(cs, 0)) /* empty queue */ - return; - - /* NOP waits for idle on Evergreen and later. */ - if (rctx->chip_class >= CIK) - radeon_emit(cs, 0x00000000); /* NOP */ - else if (rctx->chip_class >= EVERGREEN) - radeon_emit(cs, 0xf0000000); /* NOP */ - else { - /* TODO: R600-R700 should use the FENCE packet. - * CS checker support is required. 
*/ - } -} - static void r600_memory_barrier(struct pipe_context *ctx, unsigned flags) { } void r600_preflush_suspend_features(struct r600_common_context *ctx) { /* suspend queries */ if (!LIST_IS_EMPTY(&ctx->active_queries)) r600_suspend_queries(ctx); diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h index 917059c..74f86dc 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/src/gallium/drivers/radeon/r600_pipe_common.h @@ -725,21 +725,20 @@ bool r600_can_dump_shader(struct r600_common_screen *rscreen, unsigned processor); bool r600_extra_shader_checks(struct r600_common_screen *rscreen, unsigned processor); void r600_screen_clear_buffer(struct r600_common_screen *rscreen, struct pipe_resource *dst, uint64_t offset, uint64_t size, unsigned value); struct pipe_resource *r600_resource_create_common(struct pipe_screen *screen, const struct pipe_resource *templ); const char *r600_get_llvm_processor_name(enum radeon_family family); void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw, struct r600_resource *dst, struct r600_resource *src); -void r600_dma_emit_wait_idle(struct r600_common_context *rctx); void radeon_save_cs(struct radeon_winsys *ws, struct radeon_winsys_cs *cs, struct radeon_saved_cs *saved); void radeon_clear_saved_cs(struct radeon_saved_cs *saved); bool r600_check_device_reset(struct r600_common_context *rctx); /* r600_gpu_load.c */ void r600_gpu_load_kill_thread(struct r600_common_screen *rscreen); uint64_t r600_gpu_load_begin(struct r600_common_screen *rscreen); unsigned r600_gpu_load_end(struct r600_common_screen *rscreen, uint64_t begin); diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c index 648b1ca..bee35cd 100644 --- a/src/gallium/drivers/radeonsi/cik_sdma.c +++ b/src/gallium/drivers/radeonsi/cik_sdma.c @@ -60,21 +60,20 @@ static void cik_sdma_copy_buffer(struct si_context *ctx, radeon_emit(cs, csize); 
radeon_emit(cs, 0); /* src/dst endian swap */ radeon_emit(cs, src_offset); radeon_emit(cs, src_offset >> 32); radeon_emit(cs, dst_offset); radeon_emit(cs, dst_offset >> 32); dst_offset += csize; src_offset += csize; size -= csize; } - r600_dma_emit_wait_idle(&ctx->b); } static void cik_sdma_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, uint64_t offset, uint64_t size, unsigned clear_value) { struct si_context *sctx = (struct si_context *)ctx; struct radeon_winsys_cs *cs = sctx->b.dma.cs; @@ -101,21 +100,20 @@ static void cik_sdma_clear_buffer(struct pipe_context *ctx, csize = MIN2(size, CIK_SDMA_COPY_MAX_SIZE); radeon_emit(cs, CIK_SDMA_PACKET(CIK_SDMA_PACKET_CONSTANT_FILL, 0, 0x8000 /* dword copy */)); radeon_emit(cs, offset); radeon_emit(cs, offset >> 32); radeon_emit(cs, clear_value); radeon_emit(cs, csize); offset += csize; size -= csize; } - r600_dma_emit_wait_idle(&sctx->b); } static unsigned minify_as_blocks(unsigned width, unsigned level, unsigned blk_w) { width = u_minify(width, level); return DIV_ROUND_UP(width, blk_w); } static unsigned encode_tile_info(struct si_context *sctx, struct r600_texture *tex, unsigned level, @@ -244,22 +242,20 @@ static bool cik_sdma_copy_texture(struct si_context *sctx, radeon_emit(cs, dstx | (dsty << 16)); radeon_emit(cs, dstz | ((dst_pitch - 1) << 16)); radeon_emit(cs, dst_slice_pitch - 1); if (sctx->b.chip_class == CIK) { radeon_emit(cs, copy_width | (copy_height << 16)); radeon_emit(cs, copy_depth); } else { radeon_emit(cs, (copy_width - 1) | ((copy_height - 1) << 16)); radeon_emit(cs, (copy_depth - 1)); } - - r600_dma_emit_wait_idle(&sctx->b); return true; } /* Tiled <-> linear sub-window copy. */ if ((src_mode >= RADEON_SURF_MODE_1D) != (dst_mode >= RADEON_SURF_MODE_1D)) { struct r600_texture *tiled = src_mode >= RADEON_SURF_MODE_1D ? rsrc : rdst; struct r600_texture *linear = tiled == rsrc ? rdst : rsrc; unsigned tiled_level = tiled == rsrc ? 
src_level : dst_level; unsigned linear_level = linear == rsrc ? src_level : dst_level; unsigned tiled_x = tiled == rsrc ? srcx : dstx; @@ -410,22 +406,20 @@ static bool cik_sdma_copy_texture(struct si_context *sctx, radeon_emit(cs, linear_x | (linear_y << 16)); radeon_emit(cs, linear_z | ((linear_pitch - 1) << 16)); radeon_emit(cs, linear_slice_pitch - 1); if (sctx->b.chip_class == CIK) { radeon_emit(cs, copy_width_aligned | (copy_height << 16)); radeon_emit(cs, copy_depth); } else { radeon_emit(cs, (copy_width_aligned - 1) | ((copy_height - 1) << 16)); radeon_emit(cs, (copy_depth - 1)); } - - r600_dma_emit_wait_idle(&sctx->b); return true; } } /* Tiled -> Tiled sub-window copy. */ if (dst_mode >= RADEON_SURF_MODE_1D && src_mode >= RADEON_SURF_MODE_1D && /* check if these fit into the bitfields */ src_address % 256 == 0 && dst_address % 256 == 0 && @@ -508,22 +502,20 @@ static bool cik_sdma_copy_texture(struct si_context *sctx, radeon_emit(cs, encode_tile_info(sctx, rdst, dst_level, false)); if (sctx->b.chip_class == CIK) { radeon_emit(cs, copy_width_aligned | (copy_height_aligned << 16)); radeon_emit(cs, copy_depth); } else { radeon_emit(cs, (copy_width_aligned - 8) | ((copy_height_aligned - 8) << 16)); radeon_emit(cs, (copy_depth - 1)); } - - r600_dma_emit_wait_idle(&sctx->b); return true; } } return false; } static void cik_sdma_copy(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level, diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c index 8d186c3..1009bb2 100644 --- a/src/gallium/drivers/radeonsi/si_dma.c +++ b/src/gallium/drivers/radeonsi/si_dma.c @@ -69,21 +69,20 @@ static void si_dma_copy_buffer(struct si_context *ctx, csize = size < max_csize ? 
size : max_csize; radeon_emit(cs, SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, csize)); radeon_emit(cs, dst_offset); radeon_emit(cs, src_offset); radeon_emit(cs, (dst_offset >> 32UL) & 0xff); radeon_emit(cs, (src_offset >> 32UL) & 0xff); dst_offset += csize << shift; src_offset += csize << shift; size -= csize; } - r600_dma_emit_wait_idle(&ctx->b); } static void si_dma_copy_tile(struct si_context *ctx, struct pipe_resource *dst, unsigned dst_level, unsigned dst_x, unsigned dst_y, unsigned dst_z, struct pipe_resource *src, unsigned src_level, @@ -170,21 +169,20 @@ static void si_dma_copy_tile(struct si_context *ctx, radeon_emit(cs, (pitch_tile_max << 0) | ((height - 1) << 16)); radeon_emit(cs, (slice_tile_max << 0) | (pipe_config << 26)); radeon_emit(cs, (tiled_x << 0) | (tiled_z << 18)); radeon_emit(cs, (tiled_y << 0) | (tile_split << 21) | (nbanks << 25) | (mt << 27)); radeon_emit(cs, addr & 0xfffffffc); radeon_emit(cs, (addr >> 32UL) & 0xff); copy_height -= cheight; addr += cheight * pitch; tiled_y += cheight; } - r600_dma_emit_wait_idle(&ctx->b); } static void si_dma_copy(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level, unsigned dstx, unsigned dsty, unsigned dstz, struct pipe_resource *src, unsigned src_level, const struct pipe_box *src_box) { -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev