GDS = Global Data Share (on-chip memory) IB = indirect buffer (command buffer)
Marek On Mon, Nov 26, 2018 at 9:07 PM Mike Lothian <m...@fireburn.co.uk> wrote: > Hi > > What's GDS and IB stand for? > > Thanks > > Mike > > On Tue, 27 Nov 2018, 01:57 Marek Olšák, <mar...@gmail.com> wrote: > >> From: Marek Olšák <marek.ol...@amd.com> >> >> --- >> .../drivers/radeonsi/si_compute_blit.c | 4 +- >> src/gallium/drivers/radeonsi/si_cp_dma.c | 49 ++++++++++--------- >> src/gallium/drivers/radeonsi/si_pipe.h | 8 +-- >> .../drivers/radeonsi/si_test_dma_perf.c | 3 +- >> 4 files changed, 33 insertions(+), 31 deletions(-) >> >> diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c >> b/src/gallium/drivers/radeonsi/si_compute_blit.c >> index 20e4f591fbb..086793637f0 100644 >> --- a/src/gallium/drivers/radeonsi/si_compute_blit.c >> +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c >> @@ -212,22 +212,22 @@ void si_clear_buffer(struct si_context *sctx, >> struct pipe_resource *dst, >> */ >> if (clear_value_size > 4 || >> (clear_value_size == 4 && >> offset % 4 == 0 && >> (size > 32*1024 || sctx->chip_class <= VI))) { >> si_compute_do_clear_or_copy(sctx, dst, offset, >> NULL, 0, >> aligned_size, >> clear_value, >> clear_value_size, >> coher); >> } else { >> assert(clear_value_size == 4); >> - si_cp_dma_clear_buffer(sctx, dst, offset, >> - aligned_size, >> *clear_value, coher, >> + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, >> offset, >> + aligned_size, >> *clear_value, 0, coher, >> get_cache_policy(sctx, >> coher, size)); >> } >> >> offset += aligned_size; >> size -= aligned_size; >> } >> >> /* Handle non-dword alignment. */ >> if (size) { >> assert(dst); >> diff --git a/src/gallium/drivers/radeonsi/si_cp_dma.c >> b/src/gallium/drivers/radeonsi/si_cp_dma.c >> index 839b31b7fdf..33220d9f0fa 100644 >> --- a/src/gallium/drivers/radeonsi/si_cp_dma.c >> +++ b/src/gallium/drivers/radeonsi/si_cp_dma.c >> @@ -47,25 +47,24 @@ static inline unsigned cp_dma_max_byte_count(struct >> si_context *sctx) >> >> /* make it aligned for optimal performance */ >> return max & ~(SI_CPDMA_ALIGNMENT - 1); >> } >> >> >> /* Emit a CP DMA packet to do a copy from one buffer to another, or to >> clear >> * a buffer. The size must fit in bits [20:0]. If CP_DMA_CLEAR is set, >> src_va is a 32-bit >> * clear value. >> */ >> -static void si_emit_cp_dma(struct si_context *sctx, uint64_t dst_va, >> - uint64_t src_va, unsigned size, unsigned flags, >> - enum si_cache_policy cache_policy) >> +static void si_emit_cp_dma(struct si_context *sctx, struct radeon_cmdbuf >> *cs, >> + uint64_t dst_va, uint64_t src_va, unsigned >> size, >> + unsigned flags, enum si_cache_policy >> cache_policy) >> { >> - struct radeon_cmdbuf *cs = sctx->gfx_cs; >> uint32_t header = 0, command = 0; >> >> assert(size <= cp_dma_max_byte_count(sctx)); >> assert(sctx->chip_class != SI || cache_policy == L2_BYPASS); >> >> if (sctx->chip_class >= GFX9) >> command |= S_414_BYTE_COUNT_GFX9(size); >> else >> command |= S_414_BYTE_COUNT_GFX6(size); >> >> @@ -139,21 +138,21 @@ static void si_emit_cp_dma(struct si_context *sctx, >> uint64_t dst_va, >> } >> >> void si_cp_dma_wait_for_idle(struct si_context *sctx) >> { >> /* Issue a dummy DMA that copies zero bytes. >> * >> * The DMA engine will see that there's no work to do and skip >> this >> * DMA request, however, the CP will see the sync flag and still >> wait >> * for all DMAs to complete. >> */ >> - si_emit_cp_dma(sctx, 0, 0, 0, CP_DMA_SYNC, L2_BYPASS); >> + si_emit_cp_dma(sctx, sctx->gfx_cs, 0, 0, 0, CP_DMA_SYNC, >> L2_BYPASS); >> } >> >> static void si_cp_dma_prepare(struct si_context *sctx, struct >> pipe_resource *dst, >> struct pipe_resource *src, unsigned >> byte_count, >> uint64_t remaining_size, unsigned >> user_flags, >> enum si_coherency coher, bool *is_first, >> unsigned *packet_flags) >> { >> /* Fast exit for a CPDMA prefetch. */ >> if ((user_flags & SI_CPDMA_SKIP_ALL) == SI_CPDMA_SKIP_ALL) { >> @@ -200,51 +199,53 @@ static void si_cp_dma_prepare(struct si_context >> *sctx, struct pipe_resource *dst >> */ >> if (!(user_flags & SI_CPDMA_SKIP_SYNC_AFTER) && >> byte_count == remaining_size) { >> *packet_flags |= CP_DMA_SYNC; >> >> if (coher == SI_COHERENCY_SHADER) >> *packet_flags |= CP_DMA_PFP_SYNC_ME; >> } >> } >> >> -void si_cp_dma_clear_buffer(struct si_context *sctx, struct >> pipe_resource *dst, >> - uint64_t offset, uint64_t size, unsigned >> value, >> - enum si_coherency coher, >> - enum si_cache_policy cache_policy) >> +void si_cp_dma_clear_buffer(struct si_context *sctx, struct >> radeon_cmdbuf *cs, >> + struct pipe_resource *dst, uint64_t offset, >> + uint64_t size, unsigned value, unsigned >> user_flags, >> + enum si_coherency coher, enum si_cache_policy >> cache_policy) >> { >> struct r600_resource *rdst = r600_resource(dst); >> uint64_t va = (rdst ? rdst->gpu_address : 0) + offset; >> bool is_first = true; >> >> assert(size && size % 4 == 0); >> >> /* Mark the buffer range of destination as valid (initialized), >> * so that transfer_map knows it should wait for the GPU when >> mapping >> * that range. */ >> if (rdst) >> util_range_add(&rdst->valid_buffer_range, offset, offset >> + size); >> >> /* Flush the caches. */ >> - sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | >> - SI_CONTEXT_CS_PARTIAL_FLUSH | >> - si_get_flush_flags(sctx, coher, cache_policy); >> + if (rdst && !(user_flags & SI_CPDMA_SKIP_GFX_SYNC)) { >> + sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | >> + SI_CONTEXT_CS_PARTIAL_FLUSH | >> + si_get_flush_flags(sctx, coher, >> cache_policy); >> + } >> >> while (size) { >> unsigned byte_count = MIN2(size, >> cp_dma_max_byte_count(sctx)); >> unsigned dma_flags = CP_DMA_CLEAR | (rdst ? 0 : >> CP_DMA_DST_IS_GDS); >> >> - si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, 0, >> coher, >> - &is_first, &dma_flags); >> + si_cp_dma_prepare(sctx, dst, NULL, byte_count, size, >> user_flags, >> + coher, &is_first, &dma_flags); >> >> /* Emit the clear packet. */ >> - si_emit_cp_dma(sctx, va, value, byte_count, dma_flags, >> cache_policy); >> + si_emit_cp_dma(sctx, cs, va, value, byte_count, >> dma_flags, cache_policy); >> >> size -= byte_count; >> va += byte_count; >> } >> >> if (rdst && cache_policy != L2_BYPASS) >> rdst->TC_L2_dirty = true; >> >> /* If it's not a framebuffer fast clear... */ >> if (coher == SI_COHERENCY_SHADER) >> @@ -283,21 +284,21 @@ static void si_cp_dma_realign_engine(struct >> si_context *sctx, unsigned size, >> return; >> >> si_mark_atom_dirty(sctx, &sctx->atoms.s.scratch_state); >> } >> >> si_cp_dma_prepare(sctx, &sctx->scratch_buffer->b.b, >> &sctx->scratch_buffer->b.b, size, size, >> user_flags, >> coher, is_first, &dma_flags); >> >> va = sctx->scratch_buffer->gpu_address; >> - si_emit_cp_dma(sctx, va, va + SI_CPDMA_ALIGNMENT, size, dma_flags, >> + si_emit_cp_dma(sctx, sctx->gfx_cs, va, va + SI_CPDMA_ALIGNMENT, >> size, dma_flags, >> cache_policy); >> } >> >> /** >> * Do memcpy between buffers using CP DMA. >> * If src or dst is NULL, it means read or write GDS, respectively. >> * >> * \param user_flags bitmask of SI_CPDMA_* >> */ >> void si_cp_dma_copy_buffer(struct si_context *sctx, >> @@ -366,37 +367,37 @@ void si_cp_dma_copy_buffer(struct si_context *sctx, >> main_src_offset = src_offset + skipped_size; >> >> while (size) { >> unsigned byte_count = MIN2(size, >> cp_dma_max_byte_count(sctx)); >> unsigned dma_flags = gds_flags; >> >> si_cp_dma_prepare(sctx, dst, src, byte_count, >> size + skipped_size + realign_size, >> user_flags, coher, &is_first, >> &dma_flags); >> >> - si_emit_cp_dma(sctx, main_dst_offset, main_src_offset, >> + si_emit_cp_dma(sctx, sctx->gfx_cs, main_dst_offset, >> main_src_offset, >> byte_count, dma_flags, cache_policy); >> >> size -= byte_count; >> main_src_offset += byte_count; >> main_dst_offset += byte_count; >> } >> >> /* Copy the part we skipped because src wasn't aligned. */ >> if (skipped_size) { >> unsigned dma_flags = gds_flags; >> >> si_cp_dma_prepare(sctx, dst, src, skipped_size, >> skipped_size + realign_size, user_flags, >> coher, &is_first, &dma_flags); >> >> - si_emit_cp_dma(sctx, dst_offset, src_offset, skipped_size, >> + si_emit_cp_dma(sctx, sctx->gfx_cs, dst_offset, >> src_offset, skipped_size, >> dma_flags, cache_policy); >> } >> >> /* Finally, realign the engine if the size wasn't aligned. */ >> if (realign_size) { >> si_cp_dma_realign_engine(sctx, realign_size, user_flags, >> coher, >> cache_policy, &is_first); >> } >> >> if (dst && cache_policy != L2_BYPASS) >> @@ -546,35 +547,35 @@ void cik_emit_prefetch_L2(struct si_context *sctx, >> bool vertex_stage_only) >> >> void si_test_gds(struct si_context *sctx) >> { >> struct pipe_context *ctx = &sctx->b; >> struct pipe_resource *src, *dst; >> unsigned r[4] = {}; >> unsigned offset = debug_get_num_option("OFFSET", 16); >> >> src = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16); >> dst = pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_DEFAULT, 16); >> - si_cp_dma_clear_buffer(sctx, src, 0, 4, 0xabcdef01, >> SI_COHERENCY_SHADER, L2_BYPASS); >> - si_cp_dma_clear_buffer(sctx, src, 4, 4, 0x23456789, >> SI_COHERENCY_SHADER, L2_BYPASS); >> - si_cp_dma_clear_buffer(sctx, src, 8, 4, 0x87654321, >> SI_COHERENCY_SHADER, L2_BYPASS); >> - si_cp_dma_clear_buffer(sctx, src, 12, 4, 0xfedcba98, >> SI_COHERENCY_SHADER, L2_BYPASS); >> - si_cp_dma_clear_buffer(sctx, dst, 0, 16, 0xdeadbeef, >> SI_COHERENCY_SHADER, L2_BYPASS); >> + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 0, 4, 0xabcdef01, >> 0, SI_COHERENCY_SHADER, L2_BYPASS); >> + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 4, 4, 0x23456789, >> 0, SI_COHERENCY_SHADER, L2_BYPASS); >> + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 8, 4, 0x87654321, >> 0, SI_COHERENCY_SHADER, L2_BYPASS); >> + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, src, 12, 4, >> 0xfedcba98, 0, SI_COHERENCY_SHADER, L2_BYPASS); >> + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, 16, >> 0xdeadbeef, 0, SI_COHERENCY_SHADER, L2_BYPASS); >> >> si_cp_dma_copy_buffer(sctx, NULL, src, offset, 0, 16, 0, >> SI_COHERENCY_NONE, L2_BYPASS); >> si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, 0, >> SI_COHERENCY_NONE, L2_BYPASS); >> >> pipe_buffer_read(ctx, dst, 0, sizeof(r), r); >> printf("GDS copy = %08x %08x %08x %08x -> %s\n", r[0], r[1], >> r[2], r[3], >> r[0] == 0xabcdef01 && r[1] == 0x23456789 && >> r[2] == 0x87654321 && r[3] == 0xfedcba98 ? "pass" >> : "fail"); >> >> - si_cp_dma_clear_buffer(sctx, NULL, offset, 16, 0xc1ea4146, >> SI_COHERENCY_NONE, L2_BYPASS); >> + si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, NULL, offset, 16, >> 0xc1ea4146, 0, SI_COHERENCY_NONE, L2_BYPASS); >> si_cp_dma_copy_buffer(sctx, dst, NULL, 0, offset, 16, 0, >> SI_COHERENCY_NONE, L2_BYPASS); >> >> pipe_buffer_read(ctx, dst, 0, sizeof(r), r); >> printf("GDS clear = %08x %08x %08x %08x -> %s\n", r[0], r[1], >> r[2], r[3], >> r[0] == 0xc1ea4146 && r[1] == 0xc1ea4146 && >> r[2] == 0xc1ea4146 && r[3] == 0xc1ea4146 ? "pass" >> : "fail"); >> >> pipe_resource_reference(&src, NULL); >> pipe_resource_reference(&dst, NULL); >> exit(0); >> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h >> b/src/gallium/drivers/radeonsi/si_pipe.h >> index 023e0f0a0f9..3ec645f9c71 100644 >> --- a/src/gallium/drivers/radeonsi/si_pipe.h >> +++ b/src/gallium/drivers/radeonsi/si_pipe.h >> @@ -1152,24 +1152,24 @@ void si_init_compute_blit_functions(struct >> si_context *sctx); >> #define SI_CPDMA_SKIP_SYNC_BEFORE (1 << 2) /* don't wait for DMA >> before the copy (RAW hazards) */ >> #define SI_CPDMA_SKIP_GFX_SYNC (1 << 3) /* don't flush caches >> and don't wait for PS/CS */ >> #define SI_CPDMA_SKIP_BO_LIST_UPDATE (1 << 4) /* don't update the BO >> list */ >> #define SI_CPDMA_SKIP_ALL (SI_CPDMA_SKIP_CHECK_CS_SPACE | \ >> SI_CPDMA_SKIP_SYNC_AFTER | \ >> SI_CPDMA_SKIP_SYNC_BEFORE | \ >> SI_CPDMA_SKIP_GFX_SYNC | \ >> SI_CPDMA_SKIP_BO_LIST_UPDATE) >> >> void si_cp_dma_wait_for_idle(struct si_context *sctx); >> -void si_cp_dma_clear_buffer(struct si_context *sctx, struct >> pipe_resource *dst, >> - uint64_t offset, uint64_t size, unsigned >> value, >> - enum si_coherency coher, >> - enum si_cache_policy cache_policy); >> +void si_cp_dma_clear_buffer(struct si_context *sctx, struct >> radeon_cmdbuf *cs, >> + struct pipe_resource *dst, uint64_t offset, >> + uint64_t size, unsigned value, unsigned >> user_flags, >> + enum si_coherency coher, enum si_cache_policy >> cache_policy); >> void si_cp_dma_copy_buffer(struct si_context *sctx, >> struct pipe_resource *dst, struct >> pipe_resource *src, >> uint64_t dst_offset, uint64_t src_offset, >> unsigned size, >> unsigned user_flags, enum si_coherency coher, >> enum si_cache_policy cache_policy); >> void cik_prefetch_TC_L2_async(struct si_context *sctx, struct >> pipe_resource *buf, >> uint64_t offset, unsigned size); >> void cik_emit_prefetch_L2(struct si_context *sctx, bool >> vertex_stage_only); >> void si_test_gds(struct si_context *sctx); >> >> diff --git a/src/gallium/drivers/radeonsi/si_test_dma_perf.c >> b/src/gallium/drivers/radeonsi/si_test_dma_perf.c >> index 6c04720e963..657c4ebeff8 100644 >> --- a/src/gallium/drivers/radeonsi/si_test_dma_perf.c >> +++ b/src/gallium/drivers/radeonsi/si_test_dma_perf.c >> @@ -174,21 +174,22 @@ void si_test_dma_perf(struct si_screen *sscreen) >> for (unsigned iter = 0; iter < NUM_RUNS; >> iter++) { >> q[iter] = ctx->create_query(ctx, >> query_type, 0); >> ctx->begin_query(ctx, q[iter]); >> >> if (test_cp) { >> /* CP DMA */ >> if (is_copy) { >> >> si_cp_dma_copy_buffer(sctx, dst, src, 0, 0, size, 0, >> >> SI_COHERENCY_NONE, cache_policy); >> } else { >> - >> si_cp_dma_clear_buffer(sctx, dst, 0, size, clear_value, >> + >> si_cp_dma_clear_buffer(sctx, sctx->gfx_cs, dst, 0, size, >> + >> clear_value, 0, >> >> SI_COHERENCY_NONE, cache_policy); >> } >> } else if (test_sdma) { >> /* SDMA */ >> if (is_copy) { >> struct pipe_box >> box; >> u_box_1d(0, size, >> &box); >> >> sctx->dma_copy(ctx, dst, 0, 0, 0, 0, src, 0, &box); >> } else { >> >> si_sdma_clear_buffer(sctx, dst, 0, size, clear_value); >> -- >> 2.17.1 >> >> _______________________________________________ >> mesa-dev mailing list >> mesa-dev@lists.freedesktop.org >> https://lists.freedesktop.org/mailman/listinfo/mesa-dev >> >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev