On Fri, Jan 4, 2013 at 5:19 PM, <j.gli...@gmail.com> wrote: > From: Jerome Glisse <jgli...@redhat.com> > > The design is to take advantage of the fact that kernel will emit > semaphore when buffer is referenced by different ring. So the only > thing we need to enforce synchronization btw dma and gfx/compute > ring is to make sure that we never reference same bo at the same > time on the dma and gfx ring. > > This is achieved by tracking relocation, when we add a relocation > to the dma ring for a bo we check first if the bo has an active > relocation on the gfx ring. If it's the case we flush the gfx ring. > We do the same when adding a bo to the gfx ring we check it does > not have a relocation on the dma ring if it has one we flush the > dma ring. > > This patch also simplify the helper query function to know if a bo > has pending write/read command.
Looks good. A couple of minor comments below. BTW, any performance gains? Alex > > Signed-off-by: Jerome Glisse <jgli...@redhat.com> > --- > src/gallium/drivers/r300/r300_emit.c | 21 +- > src/gallium/drivers/r300/r300_flush.c | 7 +- > src/gallium/drivers/r600/evergreen_hw_context.c | 39 +++ > src/gallium/drivers/r600/evergreend.h | 16 ++ > src/gallium/drivers/r600/r600.h | 13 + > src/gallium/drivers/r600/r600_blit.c | 94 +++++-- > src/gallium/drivers/r600/r600_hw_context.c | 44 +++- > src/gallium/drivers/r600/r600_pipe.c | 13 +- > src/gallium/drivers/r600/r600_pipe.h | 2 +- > src/gallium/drivers/r600/r600_texture.c | 2 +- > src/gallium/drivers/r600/r600d.h | 16 ++ > src/gallium/drivers/radeonsi/r600_hw_context.c | 2 +- > .../drivers/radeonsi/r600_hw_context_priv.h | 2 +- > src/gallium/drivers/radeonsi/r600_texture.c | 2 +- > src/gallium/drivers/radeonsi/radeonsi_pipe.c | 13 +- > src/gallium/winsys/radeon/drm/radeon_drm_bo.c | 10 +- > src/gallium/winsys/radeon/drm/radeon_drm_bo.h | 2 + > src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 270 > +++++++++++++++++---- > src/gallium/winsys/radeon/drm/radeon_drm_cs.h | 40 ++- > src/gallium/winsys/radeon/drm/radeon_drm_winsys.c | 6 + > src/gallium/winsys/radeon/drm/radeon_winsys.h | 28 ++- > 21 files changed, 509 insertions(+), 133 deletions(-) > > diff --git a/src/gallium/drivers/r300/r300_emit.c > b/src/gallium/drivers/r300/r300_emit.c > index d1ed4b3..c824821 100644 > --- a/src/gallium/drivers/r300/r300_emit.c > +++ b/src/gallium/drivers/r300/r300_emit.c > @@ -1184,7 +1184,8 @@ validate: > assert(tex && tex->buf && "cbuf is marked, but NULL!"); > r300->rws->cs_add_reloc(r300->cs, tex->cs_buf, > RADEON_USAGE_READWRITE, > - r300_surface(fb->cbufs[i])->domain); > + r300_surface(fb->cbufs[i])->domain, > + RADEON_RING_DMA); > } > /* ...depth buffer... */ > if (fb->zsbuf) { > @@ -1192,7 +1193,8 @@ validate: > assert(tex && tex->buf && "zsbuf is marked, but NULL!"); > r300->rws->cs_add_reloc(r300->cs, tex->cs_buf, > RADEON_USAGE_READWRITE, > - r300_surface(fb->zsbuf)->domain); > + r300_surface(fb->zsbuf)->domain, > + RADEON_RING_DMA); > } > } > if (r300->textures_state.dirty) { > @@ -1204,18 +1206,21 @@ validate: > > tex = r300_resource(texstate->sampler_views[i]->base.texture); > r300->rws->cs_add_reloc(r300->cs, tex->cs_buf, RADEON_USAGE_READ, > - tex->domain); > + tex->domain, > + RADEON_RING_DMA); > } > } > /* ...occlusion query buffer... */ > if (r300->query_current) > r300->rws->cs_add_reloc(r300->cs, r300->query_current->cs_buf, > - RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT); > + RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT, > + RADEON_RING_DMA); > /* ...vertex buffer for SWTCL path... */ > if (r300->vbo) > r300->rws->cs_add_reloc(r300->cs, r300_resource(r300->vbo)->cs_buf, > RADEON_USAGE_READ, > - r300_resource(r300->vbo)->domain); > + r300_resource(r300->vbo)->domain, > + RADEON_RING_DMA); > /* ...vertex buffers for HWTCL path... */ > if (do_validate_vertex_buffers && r300->vertex_arrays_dirty) { > struct pipe_vertex_buffer *vbuf = r300->vertex_buffer; > @@ -1230,14 +1235,16 @@ validate: > > r300->rws->cs_add_reloc(r300->cs, r300_resource(buf)->cs_buf, > RADEON_USAGE_READ, > - r300_resource(buf)->domain); > + r300_resource(buf)->domain, > + RADEON_RING_DMA); > } > } > /* ...and index buffer for HWTCL path. */ > if (index_buffer) > r300->rws->cs_add_reloc(r300->cs, > r300_resource(index_buffer)->cs_buf, > RADEON_USAGE_READ, > - r300_resource(index_buffer)->domain); > + r300_resource(index_buffer)->domain, > + RADEON_RING_DMA); > > /* Now do the validation (flush is called inside cs_validate on > failure). */ > if (!r300->rws->cs_validate(r300->cs)) { > diff --git a/src/gallium/drivers/r300/r300_flush.c > b/src/gallium/drivers/r300/r300_flush.c > index 2170c59..ff3dbe9 100644 > --- a/src/gallium/drivers/r300/r300_flush.c > +++ b/src/gallium/drivers/r300/r300_flush.c > @@ -70,8 +70,10 @@ void r300_flush(struct pipe_context *pipe, > struct r300_context *r300 = r300_context(pipe); > struct pb_buffer **rfence = (struct pb_buffer**)fence; > > + /* r3xx-r5xx only have gfx ring */ > + flags |= RADEON_FLUSH_GFX; > if (r300->draw && !r300->draw_vbo_locked) > - r300_draw_flush_vbuf(r300); > + r300_draw_flush_vbuf(r300); > > if (r300->screen->info.drm_minor >= 12) { > flags |= RADEON_FLUSH_KEEP_TILING_FLAGS; > @@ -84,7 +86,8 @@ void r300_flush(struct pipe_context *pipe, > /* Add the fence as a dummy relocation. */ > r300->rws->cs_add_reloc(r300->cs, > r300->rws->buffer_get_cs_handle(*rfence), > - RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT); > + RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT, > + RADEON_RING_DMA); > } > > if (r300->dirty_hw) { > diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c > b/src/gallium/drivers/r600/evergreen_hw_context.c > index 0ca7f9e..302f652 100644 > --- a/src/gallium/drivers/r600/evergreen_hw_context.c > +++ b/src/gallium/drivers/r600/evergreen_hw_context.c > @@ -238,3 +238,42 @@ void evergreen_set_streamout_enable(struct r600_context > *ctx, unsigned buffer_en > r600_write_context_reg(cs, R_028B94_VGT_STRMOUT_CONFIG, > S_028B94_STREAMOUT_0_EN(0)); > } > } > + > +void evergreen_dma_copy(struct r600_context *rctx, > + struct pipe_resource *dst, > + struct pipe_resource *src, > + unsigned long dst_offset, > + unsigned long src_offset, > + unsigned long size) > +{ > + struct radeon_winsys_cs *cs = rctx->cs; > + unsigned i, ncopy, csize, command, shift; > + struct r600_resource *rdst = (struct r600_resource*)dst; > + struct r600_resource *rsrc = (struct r600_resource*)src; > + > + /* see if we use dword or byte copy */ > + if (!(dst_offset & 0x3) && !(src_offset & 0x3) && !(size & 0x3)) { > + size >>= 2; > + command = 0x00 << 20; > + shift = 2; > + } else { > + command = 0x40 << 20; > + shift = 0; > + } > + ncopy = (size / 0xfffff) + !!(size % 0xfffff); > + > + r600_need_dma_space(rctx, ncopy * 5); > + for (i = 0; i < ncopy; i++) { > + csize = size < 0xfffff ? size : 0xfffff; > + cs->dma_buf[cs->dma_cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, > 0, csize) | command; I'd perfer to clean up the DMA_PACKET() macro on evergreen since the t and s bits don't really make sense on evergreen. Just make the macro take 3 args and treat bits 27:20 as a sub-command field. something like: > +#define DMA_PACKET(cmd, sub_cmd, n) ((((cmd) & 0xF) << 28) | \ > + (((sub_cmd) & 0xFF) << 20) | \ > + (((n) & 0xFFFFF) << 0)) Looks a little cleaner in my opinion. > + cs->dma_buf[cs->dma_cdw++] = dst_offset & 0xffffffff; > + cs->dma_buf[cs->dma_cdw++] = src_offset & 0xffffffff; > + cs->dma_buf[cs->dma_cdw++] = (dst_offset >> 32UL) & 0xff; > + cs->dma_buf[cs->dma_cdw++] = (src_offset >> 32UL) & 0xff; > + rctx->ws->cs_add_reloc(rctx->cs, rsrc->cs_buf, > RADEON_USAGE_READ, rsrc->domains, RADEON_RING_DMA); > + rctx->ws->cs_add_reloc(rctx->cs, rdst->cs_buf, > RADEON_USAGE_WRITE, rdst->domains, RADEON_RING_DMA); > + dst_offset += csize << shift; > + src_offset += csize << shift; > + size -= csize; > + } > +} > diff --git a/src/gallium/drivers/r600/evergreend.h > b/src/gallium/drivers/r600/evergreend.h > index d9dba95..d8c50b6 100644 > --- a/src/gallium/drivers/r600/evergreend.h > +++ b/src/gallium/drivers/r600/evergreend.h > @@ -2317,4 +2317,20 @@ > #define G_028AA8_SWITCH_ON_EOP(x) (((x) >> 17) & 0x1) > #define C_028AA8_SWITCH_ON_EOP 0xFFFDFFFF > > +/* async DMA packets */ > +#define DMA_PACKET(cmd, t, s, n) ((((cmd) & 0xF) << 28) | \ > + (((t) & 0x1) << 23) | \ > + (((s) & 0x1) << 22) | \ > + (((n) & 0xFFFFF) << 0)) > +/* async DMA Packet types */ > +#define DMA_PACKET_WRITE 0x2 > +#define DMA_PACKET_COPY 0x3 > +#define DMA_PACKET_INDIRECT_BUFFER 0x4 > +#define DMA_PACKET_SEMAPHORE 0x5 > +#define DMA_PACKET_FENCE 0x6 > +#define DMA_PACKET_TRAP 0x7 > +#define DMA_PACKET_SRBM_WRITE 0x9 > +#define DMA_PACKET_CONSTANT_FILL 0xd > +#define DMA_PACKET_NOP 0xf > + > #endif <snip> > + > +void r600_dma_copy(struct r600_context *rctx, > + struct pipe_resource *dst, > + struct pipe_resource *src, > + unsigned long dst_offset, > + unsigned long src_offset, > + unsigned long size) > +{ Maybe rename this r700_dma_copy() since r600 is flakey and seems to actually take a slightly different copy packet format? > + struct radeon_winsys_cs *cs = rctx->cs; > + unsigned i, ncopy, csize, shift; > + struct r600_resource *rdst = (struct r600_resource*)dst; > + struct r600_resource *rsrc = (struct r600_resource*)src; > + > + size >>= 2; > + shift = 2; > + ncopy = (size / 0xffff) + !!(size % 0xffff); > + > + r600_need_dma_space(rctx, ncopy * 5); > + for (i = 0; i < ncopy; i++) { > + csize = size < 0xffff ? size : 0xffff; > + cs->dma_buf[cs->dma_cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, > 0, csize); > + cs->dma_buf[cs->dma_cdw++] = dst_offset & 0xfffffffc; > + cs->dma_buf[cs->dma_cdw++] = src_offset & 0xfffffffc; > + cs->dma_buf[cs->dma_cdw++] = (dst_offset >> 32UL) & 0xff; > + cs->dma_buf[cs->dma_cdw++] = (src_offset >> 32UL) & 0xff; > + rctx->ws->cs_add_reloc(rctx->cs, rsrc->cs_buf, > RADEON_USAGE_READ, rsrc->domains, RADEON_RING_DMA); > + rctx->ws->cs_add_reloc(rctx->cs, rdst->cs_buf, > RADEON_USAGE_WRITE, rdst->domains, RADEON_RING_DMA); > + dst_offset += csize << shift; > + src_offset += csize << shift; > + size -= csize; > + } > +} _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev