On Fri, Jan 4, 2013 at 6:33 PM, Alex Deucher <alexdeuc...@gmail.com> wrote: > On Fri, Jan 4, 2013 at 5:19 PM, <j.gli...@gmail.com> wrote: >> From: Jerome Glisse <jgli...@redhat.com> >> >> The design is to take advantage of the fact that kernel will emit >> semaphore when buffer is referenced by different ring. So the only >> thing we need to enforce synchronization btw dma and gfx/compute >> ring is to make sure that we never reference same bo at the same >> time on the dma and gfx ring. >> >> This is achieved by tracking relocation, when we add a relocation >> to the dma ring for a bo we check first if the bo has an active >> relocation on the gfx ring. If it's the case we flush the gfx ring. >> We do the same when adding a bo to the gfx ring we check it does >> not have a relocation on the dma ring if it has one we flush the >> dma ring. >> >> This patch also simplify the helper query function to know if a bo >> has pending write/read command. > > Looks good. A couple of minor comments below. BTW, any performance gains? >
No, there isn't much benchmark that will trigger a lot of buffer copy AFAICT. Here is a WIP patch for texture copy : http://people.freedesktop.org/~glisse/0001-r600g-r7xx-use-async-dma-for-resource-copy.patch Kernel mostly reject the command stream so far i need to check what's going on. Cheers, Jerome > Alex > >> >> Signed-off-by: Jerome Glisse <jgli...@redhat.com> >> --- >> src/gallium/drivers/r300/r300_emit.c | 21 +- >> src/gallium/drivers/r300/r300_flush.c | 7 +- >> src/gallium/drivers/r600/evergreen_hw_context.c | 39 +++ >> src/gallium/drivers/r600/evergreend.h | 16 ++ >> src/gallium/drivers/r600/r600.h | 13 + >> src/gallium/drivers/r600/r600_blit.c | 94 +++++-- >> src/gallium/drivers/r600/r600_hw_context.c | 44 +++- >> src/gallium/drivers/r600/r600_pipe.c | 13 +- >> src/gallium/drivers/r600/r600_pipe.h | 2 +- >> src/gallium/drivers/r600/r600_texture.c | 2 +- >> src/gallium/drivers/r600/r600d.h | 16 ++ >> src/gallium/drivers/radeonsi/r600_hw_context.c | 2 +- >> .../drivers/radeonsi/r600_hw_context_priv.h | 2 +- >> src/gallium/drivers/radeonsi/r600_texture.c | 2 +- >> src/gallium/drivers/radeonsi/radeonsi_pipe.c | 13 +- >> src/gallium/winsys/radeon/drm/radeon_drm_bo.c | 10 +- >> src/gallium/winsys/radeon/drm/radeon_drm_bo.h | 2 + >> src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 270 >> +++++++++++++++++---- >> src/gallium/winsys/radeon/drm/radeon_drm_cs.h | 40 ++- >> src/gallium/winsys/radeon/drm/radeon_drm_winsys.c | 6 + >> src/gallium/winsys/radeon/drm/radeon_winsys.h | 28 ++- >> 21 files changed, 509 insertions(+), 133 deletions(-) >> >> diff --git a/src/gallium/drivers/r300/r300_emit.c >> b/src/gallium/drivers/r300/r300_emit.c >> index d1ed4b3..c824821 100644 >> --- a/src/gallium/drivers/r300/r300_emit.c >> +++ b/src/gallium/drivers/r300/r300_emit.c >> @@ -1184,7 +1184,8 @@ validate: >> assert(tex && tex->buf && "cbuf is marked, but NULL!"); >> r300->rws->cs_add_reloc(r300->cs, tex->cs_buf, >> RADEON_USAGE_READWRITE, >> - r300_surface(fb->cbufs[i])->domain); >> + r300_surface(fb->cbufs[i])->domain, >> + RADEON_RING_DMA); >> } >> /* ...depth buffer... */ >> if (fb->zsbuf) { >> @@ -1192,7 +1193,8 @@ validate: >> assert(tex && tex->buf && "zsbuf is marked, but NULL!"); >> r300->rws->cs_add_reloc(r300->cs, tex->cs_buf, >> RADEON_USAGE_READWRITE, >> - r300_surface(fb->zsbuf)->domain); >> + r300_surface(fb->zsbuf)->domain, >> + RADEON_RING_DMA); >> } >> } >> if (r300->textures_state.dirty) { >> @@ -1204,18 +1206,21 @@ validate: >> >> tex = r300_resource(texstate->sampler_views[i]->base.texture); >> r300->rws->cs_add_reloc(r300->cs, tex->cs_buf, >> RADEON_USAGE_READ, >> - tex->domain); >> + tex->domain, >> + RADEON_RING_DMA); >> } >> } >> /* ...occlusion query buffer... */ >> if (r300->query_current) >> r300->rws->cs_add_reloc(r300->cs, r300->query_current->cs_buf, >> - RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT); >> + RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT, >> + RADEON_RING_DMA); >> /* ...vertex buffer for SWTCL path... */ >> if (r300->vbo) >> r300->rws->cs_add_reloc(r300->cs, r300_resource(r300->vbo)->cs_buf, >> RADEON_USAGE_READ, >> - r300_resource(r300->vbo)->domain); >> + r300_resource(r300->vbo)->domain, >> + RADEON_RING_DMA); >> /* ...vertex buffers for HWTCL path... */ >> if (do_validate_vertex_buffers && r300->vertex_arrays_dirty) { >> struct pipe_vertex_buffer *vbuf = r300->vertex_buffer; >> @@ -1230,14 +1235,16 @@ validate: >> >> r300->rws->cs_add_reloc(r300->cs, r300_resource(buf)->cs_buf, >> RADEON_USAGE_READ, >> - r300_resource(buf)->domain); >> + r300_resource(buf)->domain, >> + RADEON_RING_DMA); >> } >> } >> /* ...and index buffer for HWTCL path. */ >> if (index_buffer) >> r300->rws->cs_add_reloc(r300->cs, >> r300_resource(index_buffer)->cs_buf, >> RADEON_USAGE_READ, >> - r300_resource(index_buffer)->domain); >> + r300_resource(index_buffer)->domain, >> + RADEON_RING_DMA); >> >> /* Now do the validation (flush is called inside cs_validate on >> failure). */ >> if (!r300->rws->cs_validate(r300->cs)) { >> diff --git a/src/gallium/drivers/r300/r300_flush.c >> b/src/gallium/drivers/r300/r300_flush.c >> index 2170c59..ff3dbe9 100644 >> --- a/src/gallium/drivers/r300/r300_flush.c >> +++ b/src/gallium/drivers/r300/r300_flush.c >> @@ -70,8 +70,10 @@ void r300_flush(struct pipe_context *pipe, >> struct r300_context *r300 = r300_context(pipe); >> struct pb_buffer **rfence = (struct pb_buffer**)fence; >> >> + /* r3xx-r5xx only have gfx ring */ >> + flags |= RADEON_FLUSH_GFX; >> if (r300->draw && !r300->draw_vbo_locked) >> - r300_draw_flush_vbuf(r300); >> + r300_draw_flush_vbuf(r300); >> >> if (r300->screen->info.drm_minor >= 12) { >> flags |= RADEON_FLUSH_KEEP_TILING_FLAGS; >> @@ -84,7 +86,8 @@ void r300_flush(struct pipe_context *pipe, >> /* Add the fence as a dummy relocation. */ >> r300->rws->cs_add_reloc(r300->cs, >> r300->rws->buffer_get_cs_handle(*rfence), >> - RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT); >> + RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT, >> + RADEON_RING_DMA); >> } >> >> if (r300->dirty_hw) { >> diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c >> b/src/gallium/drivers/r600/evergreen_hw_context.c >> index 0ca7f9e..302f652 100644 >> --- a/src/gallium/drivers/r600/evergreen_hw_context.c >> +++ b/src/gallium/drivers/r600/evergreen_hw_context.c >> @@ -238,3 +238,42 @@ void evergreen_set_streamout_enable(struct r600_context >> *ctx, unsigned buffer_en >> r600_write_context_reg(cs, R_028B94_VGT_STRMOUT_CONFIG, >> S_028B94_STREAMOUT_0_EN(0)); >> } >> } >> + >> +void evergreen_dma_copy(struct r600_context *rctx, >> + struct pipe_resource *dst, >> + struct pipe_resource *src, >> + unsigned long dst_offset, >> + unsigned long src_offset, >> + unsigned long size) >> +{ >> + struct radeon_winsys_cs *cs = rctx->cs; >> + unsigned i, ncopy, csize, command, shift; >> + struct r600_resource *rdst = (struct r600_resource*)dst; >> + struct r600_resource *rsrc = (struct r600_resource*)src; >> + >> + /* see if we use dword or byte copy */ >> + if (!(dst_offset & 0x3) && !(src_offset & 0x3) && !(size & 0x3)) { >> + size >>= 2; >> + command = 0x00 << 20; >> + shift = 2; >> + } else { >> + command = 0x40 << 20; >> + shift = 0; >> + } >> + ncopy = (size / 0xfffff) + !!(size % 0xfffff); >> + >> + r600_need_dma_space(rctx, ncopy * 5); >> + for (i = 0; i < ncopy; i++) { >> + csize = size < 0xfffff ? size : 0xfffff; >> + cs->dma_buf[cs->dma_cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, >> 0, csize) | command; > > > I'd perfer to clean up the DMA_PACKET() macro on evergreen since the t > and s bits don't really make sense on evergreen. Just make the macro > take 3 args and treat bits 27:20 as a sub-command field. something > like: Ok with that >> +#define DMA_PACKET(cmd, sub_cmd, n) ((((cmd) & 0xF) << 28) | \ >> + (((sub_cmd) & 0xFF) << 20) | \ >> + (((n) & 0xFFFFF) << 0)) > > Looks a little cleaner in my opinion. Yes >> + cs->dma_buf[cs->dma_cdw++] = dst_offset & 0xffffffff; >> + cs->dma_buf[cs->dma_cdw++] = src_offset & 0xffffffff; >> + cs->dma_buf[cs->dma_cdw++] = (dst_offset >> 32UL) & 0xff; >> + cs->dma_buf[cs->dma_cdw++] = (src_offset >> 32UL) & 0xff; >> + rctx->ws->cs_add_reloc(rctx->cs, rsrc->cs_buf, >> RADEON_USAGE_READ, rsrc->domains, RADEON_RING_DMA); >> + rctx->ws->cs_add_reloc(rctx->cs, rdst->cs_buf, >> RADEON_USAGE_WRITE, rdst->domains, RADEON_RING_DMA); >> + dst_offset += csize << shift; >> + src_offset += csize << shift; >> + size -= csize; >> + } >> +} >> diff --git a/src/gallium/drivers/r600/evergreend.h >> b/src/gallium/drivers/r600/evergreend.h >> index d9dba95..d8c50b6 100644 >> --- a/src/gallium/drivers/r600/evergreend.h >> +++ b/src/gallium/drivers/r600/evergreend.h >> @@ -2317,4 +2317,20 @@ >> #define G_028AA8_SWITCH_ON_EOP(x) (((x) >> 17) & 0x1) >> #define C_028AA8_SWITCH_ON_EOP 0xFFFDFFFF >> >> +/* async DMA packets */ >> +#define DMA_PACKET(cmd, t, s, n) ((((cmd) & 0xF) << 28) | \ >> + (((t) & 0x1) << 23) | \ >> + (((s) & 0x1) << 22) | \ >> + (((n) & 0xFFFFF) << 0)) >> +/* async DMA Packet types */ >> +#define DMA_PACKET_WRITE 0x2 >> +#define DMA_PACKET_COPY 0x3 >> +#define DMA_PACKET_INDIRECT_BUFFER 0x4 >> +#define DMA_PACKET_SEMAPHORE 0x5 >> +#define DMA_PACKET_FENCE 0x6 >> +#define DMA_PACKET_TRAP 0x7 >> +#define DMA_PACKET_SRBM_WRITE 0x9 >> +#define DMA_PACKET_CONSTANT_FILL 0xd >> +#define DMA_PACKET_NOP 0xf >> + >> #endif > > <snip> > >> + >> +void r600_dma_copy(struct r600_context *rctx, >> + struct pipe_resource *dst, >> + struct pipe_resource *src, >> + unsigned long dst_offset, >> + unsigned long src_offset, >> + unsigned long size) >> +{ > > Maybe rename this r700_dma_copy() since r600 is flakey and seems to > actually take a slightly different copy packet format? > Will do >> + struct radeon_winsys_cs *cs = rctx->cs; >> + unsigned i, ncopy, csize, shift; >> + struct r600_resource *rdst = (struct r600_resource*)dst; >> + struct r600_resource *rsrc = (struct r600_resource*)src; >> + >> + size >>= 2; >> + shift = 2; >> + ncopy = (size / 0xffff) + !!(size % 0xffff); >> + >> + r600_need_dma_space(rctx, ncopy * 5); >> + for (i = 0; i < ncopy; i++) { >> + csize = size < 0xffff ? size : 0xffff; >> + cs->dma_buf[cs->dma_cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, >> 0, csize); >> + cs->dma_buf[cs->dma_cdw++] = dst_offset & 0xfffffffc; >> + cs->dma_buf[cs->dma_cdw++] = src_offset & 0xfffffffc; >> + cs->dma_buf[cs->dma_cdw++] = (dst_offset >> 32UL) & 0xff; >> + cs->dma_buf[cs->dma_cdw++] = (src_offset >> 32UL) & 0xff; >> + rctx->ws->cs_add_reloc(rctx->cs, rsrc->cs_buf, >> RADEON_USAGE_READ, rsrc->domains, RADEON_RING_DMA); >> + rctx->ws->cs_add_reloc(rctx->cs, rdst->cs_buf, >> RADEON_USAGE_WRITE, rdst->domains, RADEON_RING_DMA); >> + dst_offset += csize << shift; >> + src_offset += csize << shift; >> + size -= csize; >> + } >> +} _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev