winsys: add async dma infrastructure

Alex Deucher Fri, 04 Jan 2013 15:33:35 -0800

On Fri, Jan 4, 2013 at 5:19 PM,  <j.gli...@gmail.com> wrote:
> From: Jerome Glisse <jgli...@redhat.com>
>
> The design is to take advantage of the fact that kernel will emit
> semaphore when buffer is referenced by different ring. So the only
> thing we need to enforce synchronization btw dma and gfx/compute
> ring is to make sure that we never reference same bo at the same
> time on the dma and gfx ring.
>
> This is achieved by tracking relocation, when we add a relocation
> to the dma ring for a bo we check first if the bo has an active
> relocation on the gfx ring. If it's the case we flush the gfx ring.
> We do the same when adding a bo to the gfx ring we check it does
> not have a relocation on the dma ring if it has one we flush the
> dma ring.
>
> This patch also simplify the helper query function to know if a bo
> has pending write/read command.


Looks good.  A couple of minor comments below.  BTW, any performance gains?

Alex

>
> Signed-off-by: Jerome Glisse <jgli...@redhat.com>
> ---
>  src/gallium/drivers/r300/r300_emit.c               |  21 +-
>  src/gallium/drivers/r300/r300_flush.c              |   7 +-
>  src/gallium/drivers/r600/evergreen_hw_context.c    |  39 +++
>  src/gallium/drivers/r600/evergreend.h              |  16 ++
>  src/gallium/drivers/r600/r600.h                    |  13 +
>  src/gallium/drivers/r600/r600_blit.c               |  94 +++++--
>  src/gallium/drivers/r600/r600_hw_context.c         |  44 +++-
>  src/gallium/drivers/r600/r600_pipe.c               |  13 +-
>  src/gallium/drivers/r600/r600_pipe.h               |   2 +-
>  src/gallium/drivers/r600/r600_texture.c            |   2 +-
>  src/gallium/drivers/r600/r600d.h                   |  16 ++
>  src/gallium/drivers/radeonsi/r600_hw_context.c     |   2 +-
>  .../drivers/radeonsi/r600_hw_context_priv.h        |   2 +-
>  src/gallium/drivers/radeonsi/r600_texture.c        |   2 +-
>  src/gallium/drivers/radeonsi/radeonsi_pipe.c       |  13 +-
>  src/gallium/winsys/radeon/drm/radeon_drm_bo.c      |  10 +-
>  src/gallium/winsys/radeon/drm/radeon_drm_bo.h      |   2 +
>  src/gallium/winsys/radeon/drm/radeon_drm_cs.c      | 270 
> +++++++++++++++++----
>  src/gallium/winsys/radeon/drm/radeon_drm_cs.h      |  40 ++-
>  src/gallium/winsys/radeon/drm/radeon_drm_winsys.c  |   6 +
>  src/gallium/winsys/radeon/drm/radeon_winsys.h      |  28 ++-
>  21 files changed, 509 insertions(+), 133 deletions(-)
>
> diff --git a/src/gallium/drivers/r300/r300_emit.c 
> b/src/gallium/drivers/r300/r300_emit.c
> index d1ed4b3..c824821 100644
> --- a/src/gallium/drivers/r300/r300_emit.c
> +++ b/src/gallium/drivers/r300/r300_emit.c
> @@ -1184,7 +1184,8 @@ validate:
>              assert(tex && tex->buf && "cbuf is marked, but NULL!");
>              r300->rws->cs_add_reloc(r300->cs, tex->cs_buf,
>                                      RADEON_USAGE_READWRITE,
> -                                    r300_surface(fb->cbufs[i])->domain);
> +                                    r300_surface(fb->cbufs[i])->domain,
> +                                    RADEON_RING_DMA);
>          }
>          /* ...depth buffer... */
>          if (fb->zsbuf) {
> @@ -1192,7 +1193,8 @@ validate:
>              assert(tex && tex->buf && "zsbuf is marked, but NULL!");
>              r300->rws->cs_add_reloc(r300->cs, tex->cs_buf,
>                                      RADEON_USAGE_READWRITE,
> -                                    r300_surface(fb->zsbuf)->domain);
> +                                    r300_surface(fb->zsbuf)->domain,
> +                                    RADEON_RING_DMA);
>          }
>      }
>      if (r300->textures_state.dirty) {
> @@ -1204,18 +1206,21 @@ validate:
>
>              tex = r300_resource(texstate->sampler_views[i]->base.texture);
>              r300->rws->cs_add_reloc(r300->cs, tex->cs_buf, RADEON_USAGE_READ,
> -                                    tex->domain);
> +                                    tex->domain,
> +                                    RADEON_RING_DMA);
>          }
>      }
>      /* ...occlusion query buffer... */
>      if (r300->query_current)
>          r300->rws->cs_add_reloc(r300->cs, r300->query_current->cs_buf,
> -                                RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT);
> +                                RADEON_USAGE_WRITE, RADEON_DOMAIN_GTT,
> +                                RADEON_RING_DMA);
>      /* ...vertex buffer for SWTCL path... */
>      if (r300->vbo)
>          r300->rws->cs_add_reloc(r300->cs, r300_resource(r300->vbo)->cs_buf,
>                                  RADEON_USAGE_READ,
> -                                r300_resource(r300->vbo)->domain);
> +                                r300_resource(r300->vbo)->domain,
> +                                RADEON_RING_DMA);
>      /* ...vertex buffers for HWTCL path... */
>      if (do_validate_vertex_buffers && r300->vertex_arrays_dirty) {
>          struct pipe_vertex_buffer *vbuf = r300->vertex_buffer;
> @@ -1230,14 +1235,16 @@ validate:
>
>              r300->rws->cs_add_reloc(r300->cs, r300_resource(buf)->cs_buf,
>                                      RADEON_USAGE_READ,
> -                                    r300_resource(buf)->domain);
> +                                    r300_resource(buf)->domain,
> +                                    RADEON_RING_DMA);
>          }
>      }
>      /* ...and index buffer for HWTCL path. */
>      if (index_buffer)
>          r300->rws->cs_add_reloc(r300->cs, 
> r300_resource(index_buffer)->cs_buf,
>                                  RADEON_USAGE_READ,
> -                                r300_resource(index_buffer)->domain);
> +                                r300_resource(index_buffer)->domain,
> +                                RADEON_RING_DMA);
>
>      /* Now do the validation (flush is called inside cs_validate on 
> failure). */
>      if (!r300->rws->cs_validate(r300->cs)) {
> diff --git a/src/gallium/drivers/r300/r300_flush.c 
> b/src/gallium/drivers/r300/r300_flush.c
> index 2170c59..ff3dbe9 100644
> --- a/src/gallium/drivers/r300/r300_flush.c
> +++ b/src/gallium/drivers/r300/r300_flush.c
> @@ -70,8 +70,10 @@ void r300_flush(struct pipe_context *pipe,
>      struct r300_context *r300 = r300_context(pipe);
>      struct pb_buffer **rfence = (struct pb_buffer**)fence;
>
> +    /* r3xx-r5xx only have gfx ring */
> +    flags |= RADEON_FLUSH_GFX;
>      if (r300->draw && !r300->draw_vbo_locked)
> -       r300_draw_flush_vbuf(r300);
> +        r300_draw_flush_vbuf(r300);
>
>      if (r300->screen->info.drm_minor >= 12) {
>          flags |= RADEON_FLUSH_KEEP_TILING_FLAGS;
> @@ -84,7 +86,8 @@ void r300_flush(struct pipe_context *pipe,
>          /* Add the fence as a dummy relocation. */
>          r300->rws->cs_add_reloc(r300->cs,
>                                  r300->rws->buffer_get_cs_handle(*rfence),
> -                                RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT);
> +                                RADEON_USAGE_READWRITE, RADEON_DOMAIN_GTT,
> +                                RADEON_RING_DMA);
>      }
>
>      if (r300->dirty_hw) {
> diff --git a/src/gallium/drivers/r600/evergreen_hw_context.c 
> b/src/gallium/drivers/r600/evergreen_hw_context.c
> index 0ca7f9e..302f652 100644
> --- a/src/gallium/drivers/r600/evergreen_hw_context.c
> +++ b/src/gallium/drivers/r600/evergreen_hw_context.c
> @@ -238,3 +238,42 @@ void evergreen_set_streamout_enable(struct r600_context 
> *ctx, unsigned buffer_en
>                 r600_write_context_reg(cs, R_028B94_VGT_STRMOUT_CONFIG, 
> S_028B94_STREAMOUT_0_EN(0));
>         }
>  }
> +
> +void evergreen_dma_copy(struct r600_context *rctx,
> +               struct pipe_resource *dst,
> +               struct pipe_resource *src,
> +               unsigned long dst_offset,
> +               unsigned long src_offset,
> +               unsigned long size)
> +{
> +       struct radeon_winsys_cs *cs = rctx->cs;
> +       unsigned i, ncopy, csize, command, shift;
> +       struct r600_resource *rdst = (struct r600_resource*)dst;
> +       struct r600_resource *rsrc = (struct r600_resource*)src;
> +
> +       /* see if we use dword or byte copy */
> +       if (!(dst_offset & 0x3) && !(src_offset & 0x3) && !(size & 0x3)) {
> +               size >>= 2;
> +               command = 0x00 << 20;
> +               shift = 2;
> +       } else {
> +               command = 0x40 << 20;
> +               shift = 0;
> +       }
> +       ncopy = (size / 0xfffff) + !!(size % 0xfffff);
> +
> +       r600_need_dma_space(rctx, ncopy * 5);
> +       for (i = 0; i < ncopy; i++) {
> +               csize = size < 0xfffff ? size : 0xfffff;
> +               cs->dma_buf[cs->dma_cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, 
> 0, csize) | command;


I'd perfer to clean up the DMA_PACKET() macro on evergreen since the t
and s bits don't really make sense on evergreen.  Just make the macro
take 3 args and treat bits 27:20 as a sub-command field.  something
like:

> +#define DMA_PACKET(cmd, sub_cmd, n)    ((((cmd) & 0xF) << 28) |    \
> +                                    (((sub_cmd) & 0xFF) << 20) |       \
> +                                    (((n) & 0xFFFFF) << 0))

Looks a little cleaner in my opinion.

> +               cs->dma_buf[cs->dma_cdw++] = dst_offset & 0xffffffff;
> +               cs->dma_buf[cs->dma_cdw++] = src_offset & 0xffffffff;
> +               cs->dma_buf[cs->dma_cdw++] = (dst_offset >> 32UL) & 0xff;
> +               cs->dma_buf[cs->dma_cdw++] = (src_offset >> 32UL) & 0xff;
> +               rctx->ws->cs_add_reloc(rctx->cs, rsrc->cs_buf, 
> RADEON_USAGE_READ, rsrc->domains, RADEON_RING_DMA);
> +               rctx->ws->cs_add_reloc(rctx->cs, rdst->cs_buf, 
> RADEON_USAGE_WRITE, rdst->domains, RADEON_RING_DMA);
> +               dst_offset += csize << shift;
> +               src_offset += csize << shift;
> +               size -= csize;
> +       }
> +}
> diff --git a/src/gallium/drivers/r600/evergreend.h 
> b/src/gallium/drivers/r600/evergreend.h
> index d9dba95..d8c50b6 100644
> --- a/src/gallium/drivers/r600/evergreend.h
> +++ b/src/gallium/drivers/r600/evergreend.h
> @@ -2317,4 +2317,20 @@
>  #define   G_028AA8_SWITCH_ON_EOP(x)                    (((x) >> 17) & 0x1)
>  #define   C_028AA8_SWITCH_ON_EOP                       0xFFFDFFFF
>
> +/* async DMA packets */
> +#define DMA_PACKET(cmd, t, s, n)    ((((cmd) & 0xF) << 28) |    \
> +                                    (((t) & 0x1) << 23) |       \
> +                                    (((s) & 0x1) << 22) |       \
> +                                    (((n) & 0xFFFFF) << 0))
> +/* async DMA Packet types */
> +#define    DMA_PACKET_WRITE                     0x2
> +#define    DMA_PACKET_COPY                      0x3
> +#define    DMA_PACKET_INDIRECT_BUFFER           0x4
> +#define    DMA_PACKET_SEMAPHORE                 0x5
> +#define    DMA_PACKET_FENCE                     0x6
> +#define    DMA_PACKET_TRAP                      0x7
> +#define    DMA_PACKET_SRBM_WRITE                0x9
> +#define    DMA_PACKET_CONSTANT_FILL             0xd
> +#define    DMA_PACKET_NOP                       0xf
> +
>  #endif

<snip>

> +
> +void r600_dma_copy(struct r600_context *rctx,
> +               struct pipe_resource *dst,
> +               struct pipe_resource *src,
> +               unsigned long dst_offset,
> +               unsigned long src_offset,
> +               unsigned long size)
> +{

Maybe rename this r700_dma_copy() since r600 is flakey and seems to
actually take a slightly different copy packet format?


> +       struct radeon_winsys_cs *cs = rctx->cs;
> +       unsigned i, ncopy, csize, shift;
> +       struct r600_resource *rdst = (struct r600_resource*)dst;
> +       struct r600_resource *rsrc = (struct r600_resource*)src;
> +
> +       size >>= 2;
> +       shift = 2;
> +       ncopy = (size / 0xffff) + !!(size % 0xffff);
> +
> +       r600_need_dma_space(rctx, ncopy * 5);
> +       for (i = 0; i < ncopy; i++) {
> +               csize = size < 0xffff ? size : 0xffff;
> +               cs->dma_buf[cs->dma_cdw++] = DMA_PACKET(DMA_PACKET_COPY, 0, 
> 0, csize);
> +               cs->dma_buf[cs->dma_cdw++] = dst_offset & 0xfffffffc;
> +               cs->dma_buf[cs->dma_cdw++] = src_offset & 0xfffffffc;
> +               cs->dma_buf[cs->dma_cdw++] = (dst_offset >> 32UL) & 0xff;
> +               cs->dma_buf[cs->dma_cdw++] = (src_offset >> 32UL) & 0xff;
> +               rctx->ws->cs_add_reloc(rctx->cs, rsrc->cs_buf, 
> RADEON_USAGE_READ, rsrc->domains, RADEON_RING_DMA);
> +               rctx->ws->cs_add_reloc(rctx->cs, rdst->cs_buf, 
> RADEON_USAGE_WRITE, rdst->domains, RADEON_RING_DMA);
> +               dst_offset += csize << shift;
> +               src_offset += csize << shift;
> +               size -= csize;
> +       }
> +}
_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Re: [Mesa-dev] [PATCH 3/3] radeon/winsys: add async dma infrastructure

Reply via email to