Requires Evergreen/Cayman hardware and radeon kernel module (DRM)
version 2.41.0 or newer.

Signed-off-by: Glenn Kennard <glenn.kenn...@gmail.com>
---
Changes since v2:
* Fix failing arb_draw_indirect-vertexid piglit test cases.
* Ensure start_instance, base_vertex, index_offset are reset when
  switching back to direct draws.
* Juggled some header defines to avoid use of magic numbers.

 docs/GL3.txt                                 |   4 +-
 docs/relnotes/10.5.0.html                    |   1 +
 src/gallium/drivers/r600/evergreend.h        |   1 -
 src/gallium/drivers/r600/r600_pipe.c         |   4 +-
 src/gallium/drivers/r600/r600_pipe.h         |   1 +
 src/gallium/drivers/r600/r600_shader.c       |  14 ++-
 src/gallium/drivers/r600/r600_state_common.c | 128 ++++++++++++++++++++++-----
 src/gallium/drivers/r600/r600d.h             |   8 +-
 8 files changed, 130 insertions(+), 31 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 23f5561..ef4f0ae 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -95,7 +95,7 @@ GL 3.3, GLSL 3.30 --- all DONE: i965, nv50, nvc0, r600, radeonsi, llvmpipe, soft
 GL 4.0, GLSL 4.00:
 
   GL_ARB_draw_buffers_blend                            DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_draw_indirect                                 DONE (i965, nvc0, radeonsi, llvmpipe, softpipe)
+  GL_ARB_draw_indirect                                 DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_gpu_shader5                                   DONE (i965, nvc0)
   - 'precise' qualifier                                DONE
   - Dynamically uniform sampler array indices          DONE (r600)
@@ -159,7 +159,7 @@ GL 4.3, GLSL 4.30:
   GL_ARB_framebuffer_no_attachments                    not started
   GL_ARB_internalformat_query2                         not started
   GL_ARB_invalidate_subdata                            DONE (all drivers)
-  GL_ARB_multi_draw_indirect                           DONE (i965, nvc0, radeonsi, llvmpipe, softpipe)
+  GL_ARB_multi_draw_indirect                           DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_program_interface_query                       not started
   GL_ARB_robust_buffer_access_behavior                 not started
   GL_ARB_shader_image_size                             not started
diff --git a/docs/relnotes/10.5.0.html b/docs/relnotes/10.5.0.html
index 4f921ea..47686c0 100644
--- a/docs/relnotes/10.5.0.html
+++ b/docs/relnotes/10.5.0.html
@@ -49,6 +49,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_EXT_packed_float on freedreno</li>
 <li>GL_EXT_texture_shared_exponent on freedreno</li>
 <li>GL_EXT_texture_snorm on freedreno</li>
+<li>GL_ARB_draw_indirect, GL_ARB_multi_draw_indirect on r600</li>
 </ul>
 
 
diff --git a/src/gallium/drivers/r600/evergreend.h b/src/gallium/drivers/r600/evergreend.h
index 4989996..cd4ff46 100644
--- a/src/gallium/drivers/r600/evergreend.h
+++ b/src/gallium/drivers/r600/evergreend.h
@@ -72,7 +72,6 @@
 #define PKT3_REG_RMW                           0x21
 #define PKT3_COND_EXEC                         0x22
 #define PKT3_PRED_EXEC                         0x23
-#define PKT3_START_3D_CMDBUF                   0x24
 #define PKT3_DRAW_INDEX_2                      0x27
 #define PKT3_CONTEXT_CONTROL                   0x28
 #define PKT3_DRAW_INDEX_IMMD_BE                0x29
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index b6f7859..3127e23 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -313,6 +313,9 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
                return family >= CHIP_CEDAR ? 1 : 0;
        case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
                return family >= CHIP_CEDAR ? 4 : 0;
+       case PIPE_CAP_DRAW_INDIRECT:
+               /* kernel command checker support is also required */
+               return family >= CHIP_CEDAR && rscreen->b.info.drm_minor >= 41;
 
        /* Unsupported features. */
        case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
@@ -322,7 +325,6 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
        case PIPE_CAP_VERTEX_COLOR_CLAMPED:
        case PIPE_CAP_USER_VERTEX_BUFFERS:
        case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
-       case PIPE_CAP_DRAW_INDIRECT:
        case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
        case PIPE_CAP_SAMPLER_VIEW_TARGET:
        case PIPE_CAP_VERTEXID_NOBASE:
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index e110efe..1db43c4 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -145,6 +145,7 @@ struct r600_vgt_state {
        uint32_t vgt_multi_prim_ib_reset_en;
        uint32_t vgt_multi_prim_ib_reset_indx;
        uint32_t vgt_indx_offset;
+       bool last_draw_was_indirect;
 };
 
 struct r600_blend_color {
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 16e820e..19c84bb 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -291,6 +291,7 @@ struct r600_shader_ctx {
        uint32_t                                nliterals;
        uint32_t                                max_driver_temp_used;
        boolean use_llvm;
+       boolean                 has_vertexid;
        /* needed for evergreen interpolation */
        struct eg_interp                eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
        /* evergreen/cayman also store sample mask in face register */
@@ -749,8 +750,10 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
                                        return r;
                        }
                        break;
-               } else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
+               } else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID) {
+                       ctx->has_vertexid = true;
                        break;
+               }
                else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
                        break;
        default:
@@ -1060,6 +1063,11 @@ static void tgsi_src(struct r600_shader_ctx *ctx,
        }
 }
 
+static int get_vfetch_type(struct r600_shader_ctx *ctx) {
+       // TODO: Only set VERTEX if src depends on VERTEXID
+       return ctx->has_vertexid ? 0 : 2;               /* SQ_VTX_FETCH_VERTEX_DATA / VTX_FETCH_NO_INDEX_OFFSET */;
+}
+
 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
                                 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
                                 unsigned int dst_reg)
@@ -1095,7 +1103,7 @@ static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
 
        memset(&vtx, 0, sizeof(vtx));
        vtx.buffer_id = cb_idx;
-       vtx.fetch_type = 2;             /* VTX_FETCH_NO_INDEX_OFFSET */
+       vtx.fetch_type = get_vfetch_type(ctx);
        vtx.src_gpr = ar_reg;
        vtx.src_sel_x = ar_chan;
        vtx.mega_fetch_count = 16;
@@ -4990,7 +4998,7 @@ static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_l
        memset(&vtx, 0, sizeof(vtx));
        vtx.op = FETCH_OP_VFETCH;
        vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
-       vtx.fetch_type = 2;             /* VTX_FETCH_NO_INDEX_OFFSET */
+       vtx.fetch_type = get_vfetch_type(ctx);
        vtx.src_gpr = src_gpr;
        vtx.mega_fetch_count = 16;
        vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index b498d00..a08124b 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -196,6 +196,10 @@ void r600_emit_vgt_state(struct r600_context *rctx, struct r600_atom *atom)
        r600_write_context_reg_seq(cs, R_028408_VGT_INDX_OFFSET, 2);
        radeon_emit(cs, a->vgt_indx_offset); /* R_028408_VGT_INDX_OFFSET */
        radeon_emit(cs, a->vgt_multi_prim_ib_reset_indx); /* R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX */
+       if (a->last_draw_was_indirect) {
+               a->last_draw_was_indirect = false;
+               r600_write_ctl_const(cs, R_03CFF0_SQ_VTX_BASE_VTX_LOC, 0);
+       }
 }
 
 static void r600_set_clip_state(struct pipe_context *ctx,
@@ -1353,7 +1357,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
        unsigned i;
        struct radeon_winsys_cs *cs = rctx->b.rings.gfx.cs;
 
-       if (!info.count && (info.indexed || !info.count_from_stream_output)) {
+       if (!info.indirect && !info.count && (info.indexed || !info.count_from_stream_output)) {
                return;
        }
 
@@ -1379,19 +1383,44 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
                pipe_resource_reference(&ib.buffer, rctx->index_buffer.buffer);
                ib.user_buffer = rctx->index_buffer.user_buffer;
                ib.index_size = rctx->index_buffer.index_size;
-               ib.offset = rctx->index_buffer.offset + info.start * ib.index_size;
+               ib.offset = rctx->index_buffer.offset;
+               if (!info.indirect) {
+                       ib.offset += info.start * ib.index_size;
+               }
 
                /* Translate 8-bit indices to 16-bit. */
-               if (ib.index_size == 1) {
+               if (unlikely(ib.index_size == 1)) {
                        struct pipe_resource *out_buffer = NULL;
                        unsigned out_offset;
                        void *ptr;
+                       unsigned start, count;
+
+                       if (likely(!info.indirect)) {
+                               start = 0;
+                               count = info.count;
+                       }
+                       else {
+                               /* Have to get start/count from indirect buffer, slow path ahead... */
+                               struct r600_resource *indirect_resource = (struct r600_resource *)info.indirect;
+                               unsigned *data = r600_buffer_map_sync_with_rings(&rctx->b, indirect_resource,
+                                       PIPE_TRANSFER_READ);
+                               if (data) {
+                                       data += info.indirect_offset / sizeof(unsigned);
+                                       start = data[2] * ib.index_size;
+                                       count = data[0];
+                                       rctx->b.ws->buffer_unmap(indirect_resource->cs_buf);
+                               }
+                               else {
+                                       start = 0;
+                                       count = 0;
+                               }
+                       }
 
-                       u_upload_alloc(rctx->b.uploader, 0, info.count * 2,
+                       u_upload_alloc(rctx->b.uploader, start, count * 2,
                                       &out_offset, &out_buffer, &ptr);
 
                        util_shorten_ubyte_elts_to_userptr(
-                                               &rctx->b.b, &ib, 0, ib.offset, info.count, ptr);
+                                               &rctx->b.b, &ib, 0, ib.offset + start, count, ptr);
 
                        pipe_resource_reference(&ib.buffer, NULL);
                        ib.user_buffer = NULL;
@@ -1403,9 +1432,11 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
                /* Upload the index buffer.
                 * The upload is skipped for small index counts on little-endian machines
                 * and the indices are emitted via PKT3_DRAW_INDEX_IMMD.
+                * Indirect draws never use immediate indices.
                 * Note: Instanced rendering in combination with immediate indices hangs. */
-               if (ib.user_buffer && (R600_BIG_ENDIAN || info.instance_count > 1 ||
-                                      info.count*ib.index_size > 20)) {
+               if (ib.user_buffer && (R600_BIG_ENDIAN || info.indirect ||
+                                                info.instance_count > 1 ||
+                                                info.count*ib.index_size > 20)) {
                        u_upload_data(rctx->b.uploader, 0, info.count * ib.index_size,
                                      ib.user_buffer, &ib.offset, &ib.buffer);
                        ib.user_buffer = NULL;
@@ -1417,7 +1448,8 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
        /* Set the index offset and primitive restart. */
        if (rctx->vgt_state.vgt_multi_prim_ib_reset_en != info.primitive_restart ||
            rctx->vgt_state.vgt_multi_prim_ib_reset_indx != info.restart_index ||
-           rctx->vgt_state.vgt_indx_offset != info.index_bias) {
+           rctx->vgt_state.vgt_indx_offset != info.index_bias ||
+           (rctx->vgt_state.last_draw_was_indirect && !info.indirect)) {
                rctx->vgt_state.vgt_multi_prim_ib_reset_en = info.primitive_restart;
                rctx->vgt_state.vgt_multi_prim_ib_reset_indx = info.restart_index;
                rctx->vgt_state.vgt_indx_offset = info.index_bias;
@@ -1485,7 +1517,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
        }
 
        /* Update start instance. */
-       if (rctx->last_start_instance != info.start_instance) {
+       if (!info.indirect && rctx->last_start_instance != info.start_instance) {
                r600_write_ctl_const(cs, R_03CFF4_SQ_VTX_START_INST_LOC, info.start_instance);
                rctx->last_start_instance = info.start_instance;
        }
@@ -1510,8 +1542,30 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
        }
 
        /* Draw packets. */
-       cs->buf[cs->cdw++] = PKT3(PKT3_NUM_INSTANCES, 0, rctx->b.predicate_drawing);
-       cs->buf[cs->cdw++] = info.instance_count;
+       if (!info.indirect) {
+               cs->buf[cs->cdw++] = PKT3(PKT3_NUM_INSTANCES, 0, rctx->b.predicate_drawing);
+               cs->buf[cs->cdw++] = info.instance_count;
+       }
+
+       if (unlikely(info.indirect)) {
+               uint64_t va = r600_resource(info.indirect)->gpu_address;
+               assert(rctx->b.chip_class >= EVERGREEN);
+
+               // Invalidate so non-indirect draw calls reset this state
+               rctx->vgt_state.last_draw_was_indirect = true;
+               rctx->last_start_instance = -1;
+
+               cs->buf[cs->cdw++] = PKT3(EG_PKT3_SET_BASE, 2, rctx->b.predicate_drawing);
+               cs->buf[cs->cdw++] = EG_DRAW_INDEX_INDIRECT_PATCH_TABLE_BASE;
+               cs->buf[cs->cdw++] = va;
+               cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
+
+               cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->b.predicate_drawing);
+               cs->buf[cs->cdw++] = r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
+                                                          (struct r600_resource*)info.indirect,
+                                                          RADEON_USAGE_READ, RADEON_PRIO_MIN);
+       }
+
        if (info.indexed) {
                cs->buf[cs->cdw++] = PKT3(PKT3_INDEX_TYPE, 0, rctx->b.predicate_drawing);
                cs->buf[cs->cdw++] = ib.index_size == 4 ?
@@ -1528,18 +1582,40 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
                        cs->cdw += size_dw;
                } else {
                        uint64_t va = r600_resource(ib.buffer)->gpu_address + ib.offset;
-                       cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX, 3, rctx->b.predicate_drawing);
-                       cs->buf[cs->cdw++] = va;
-                       cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
-                       cs->buf[cs->cdw++] = info.count;
-                       cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_DMA;
-                       cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->b.predicate_drawing);
-                       cs->buf[cs->cdw++] = r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
-                                                                  (struct r600_resource*)ib.buffer,
-                                                                  RADEON_USAGE_READ, RADEON_PRIO_MIN);
+
+                       if (likely(!info.indirect)) {
+                               cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX, 3, rctx->b.predicate_drawing);
+                               cs->buf[cs->cdw++] = va;
+                               cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
+                               cs->buf[cs->cdw++] = info.count;
+                               cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_DMA;
+                               cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->b.predicate_drawing);
+                               cs->buf[cs->cdw++] = r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
+                                                                          (struct r600_resource*)ib.buffer,
+                                                                          RADEON_USAGE_READ, RADEON_PRIO_MIN);
+                       }
+                       else {
+                               uint32_t max_size = (ib.buffer->width0 - ib.offset) / ib.index_size;
+
+                               cs->buf[cs->cdw++] = PKT3(EG_PKT3_INDEX_BASE, 1, rctx->b.predicate_drawing);
+                               cs->buf[cs->cdw++] = va;
+                               cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
+
+                               cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->b.predicate_drawing);
+                               cs->buf[cs->cdw++] = r600_context_bo_reloc(&rctx->b, &rctx->b.rings.gfx,
+                                                                          (struct r600_resource*)ib.buffer,
+                                                                          RADEON_USAGE_READ, RADEON_PRIO_MIN);
+
+                               cs->buf[cs->cdw++] = PKT3(EG_PKT3_INDEX_BUFFER_SIZE, 0, rctx->b.predicate_drawing);
+                               cs->buf[cs->cdw++] = max_size;
+
+                               cs->buf[cs->cdw++] = PKT3(EG_PKT3_DRAW_INDEX_INDIRECT, 1, rctx->b.predicate_drawing);
+                               cs->buf[cs->cdw++] = info.indirect_offset;
+                               cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_DMA;
+                       }
                }
        } else {
-               if (info.count_from_stream_output) {
+               if (unlikely(info.count_from_stream_output)) {
                        struct r600_so_target *t = (struct r600_so_target*)info.count_from_stream_output;
                        uint64_t va = t->buf_filled_size->gpu_address + t->buf_filled_size_offset;
 
@@ -1558,8 +1634,14 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
                                                                   RADEON_PRIO_MIN);
                }
 
-               cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_AUTO, 1, rctx->b.predicate_drawing);
-               cs->buf[cs->cdw++] = info.count;
+               if (likely(!info.indirect)) {
+                       cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_AUTO, 1, rctx->b.predicate_drawing);
+                       cs->buf[cs->cdw++] = info.count;
+               }
+               else {
+                       cs->buf[cs->cdw++] = PKT3(EG_PKT3_DRAW_INDIRECT, 1, rctx->b.predicate_drawing);
+                       cs->buf[cs->cdw++] = info.indirect_offset;
+               }
                cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_AUTO_INDEX |
                                        (info.count_from_stream_output ? S_0287F0_USE_OPAQUE(1) : 0);
        }
diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h
index 6a5b964..bce8b4e 100644
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -52,12 +52,18 @@
 
 
 #define PKT3_NOP                               0x10
+#define EG_PKT3_SET_BASE                       0x11 /* >= evergreen */
+#define     EG_DRAW_INDEX_INDIRECT_PATCH_TABLE_BASE 1 /* DX11 Draw_Index_Indirect Patch Table Base */
+#define EG_PKT3_INDEX_BUFFER_SIZE              0x13
 #define PKT3_INDIRECT_BUFFER_END               0x17
 #define PKT3_SET_PREDICATION                   0x20
 #define PKT3_REG_RMW                           0x21
 #define PKT3_COND_EXEC                         0x22
 #define PKT3_PRED_EXEC                         0x23
-#define PKT3_START_3D_CMDBUF                   0x24
+#define PKT3_START_3D_CMDBUF                   0x24 /* removed on evergreen */
+#define EG_PKT3_DRAW_INDIRECT                  0x24 /* >= evergreen */
+#define EG_PKT3_DRAW_INDEX_INDIRECT            0x25
+#define EG_PKT3_INDEX_BASE                     0x26
 #define PKT3_DRAW_INDEX_2                      0x27
 #define PKT3_CONTEXT_CONTROL                   0x28
 #define PKT3_DRAW_INDEX_IMMD_BE                0x29
-- 
1.9.1

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to