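Emit small user index buffers (at most 20 bytes) directly in the command
stream via PKT3_DRAW_INDEX_IMMD instead of uploading them. This slightly
improves performance when there are many small indexed draw commands.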
---
 src/gallium/drivers/r600/r600_state_common.c |   41 +++++++++++++++++---------
 src/gallium/drivers/r600/r600d.h             |    1 +
 2 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_state_common.c 
b/src/gallium/drivers/r600/r600_state_common.c
index df8a3f5..8b8e34d 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -1125,7 +1125,6 @@ static void r600_draw_vbo(struct pipe_context *ctx, const 
struct pipe_draw_info
        unsigned i;
        struct r600_block *dirty_block = NULL, *next_block = NULL;
        struct radeon_winsys_cs *cs = rctx->cs;
-       uint64_t va;
 
        if (!info.count && (info.indexed || !info.count_from_stream_output)) {
                assert(0);
@@ -1165,10 +1164,15 @@ static void r600_draw_vbo(struct pipe_context *ctx, 
const struct pipe_draw_info
                        ib.index_size = 2;
                }
 
-               /* Upload the index buffer. */
-               if (ib.user_buffer) {
+               /* Upload the index buffer.
+                * The upload is skipped for small index counts on 
little-endian machines
+                * and the indices are emitted via PKT3_DRAW_INDEX_IMMD.
+                * Note: Instanced rendering in combination with immediate 
indices hangs. */
+               if (ib.user_buffer && (R600_BIG_ENDIAN || info.instance_count > 
1 ||
+                                      info.count*ib.index_size > 20)) {
                        u_upload_data(rctx->uploader, 0, info.count * 
ib.index_size,
                                      ib.user_buffer, &ib.offset, &ib.buffer);
+                       ib.user_buffer = NULL;
                }
        } else {
                info.index_bias = info.start;
@@ -1192,8 +1196,8 @@ static void r600_draw_vbo(struct pipe_context *ctx, const 
struct pipe_draw_info
                rctx->vgt_state.atom.dirty = true;
        }
 
-       /* Emit states (the function expects that we emit at most 17 dwords 
here). */
-       r600_need_cs_space(rctx, 0, TRUE);
+       /* Emit states. */
+       r600_need_cs_space(rctx, ib.user_buffer ? 5 : 0, TRUE);
        r600_flush_emit(rctx);
 
        for (i = 0; i < R600_NUM_ATOMS; i++) {
@@ -1243,15 +1247,24 @@ static void r600_draw_vbo(struct pipe_context *ctx, 
const struct pipe_draw_info
                                        (VGT_INDEX_32 | (R600_BIG_ENDIAN ? 
VGT_DMA_SWAP_32_BIT : 0)) :
                                        (VGT_INDEX_16 | (R600_BIG_ENDIAN ? 
VGT_DMA_SWAP_16_BIT : 0));
 
-               va = r600_resource_va(ctx->screen, ib.buffer);
-               va += ib.offset;
-               cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX, 3, 
rctx->predicate_drawing);
-               cs->buf[cs->cdw++] = va;
-               cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
-               cs->buf[cs->cdw++] = info.count;
-               cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_DMA;
-               cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, rctx->predicate_drawing);
-               cs->buf[cs->cdw++] = r600_context_bo_reloc(rctx, (struct 
r600_resource*)ib.buffer, RADEON_USAGE_READ);
+               if (ib.user_buffer) {
+                       unsigned size_bytes = info.count*ib.index_size;
+                       unsigned size_dw = align(size_bytes, 4) / 4;
+                       cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX_IMMD, 1 + 
size_dw, rctx->predicate_drawing);
+                       cs->buf[cs->cdw++] = info.count;
+                       cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_IMMEDIATE;
+                       memcpy(cs->buf+cs->cdw, ib.user_buffer, size_bytes);
+                       cs->cdw += size_dw;
+               } else {
+                       uint64_t va = r600_resource_va(ctx->screen, ib.buffer) 
+ ib.offset;
+                       cs->buf[cs->cdw++] = PKT3(PKT3_DRAW_INDEX, 3, 
rctx->predicate_drawing);
+                       cs->buf[cs->cdw++] = va;
+                       cs->buf[cs->cdw++] = (va >> 32UL) & 0xFF;
+                       cs->buf[cs->cdw++] = info.count;
+                       cs->buf[cs->cdw++] = V_0287F0_DI_SRC_SEL_DMA;
+                       cs->buf[cs->cdw++] = PKT3(PKT3_NOP, 0, 
rctx->predicate_drawing);
+                       cs->buf[cs->cdw++] = r600_context_bo_reloc(rctx, 
(struct r600_resource*)ib.buffer, RADEON_USAGE_READ);
+               }
        } else {
                if (info.count_from_stream_output) {
                        struct r600_so_target *t = (struct 
r600_so_target*)info.count_from_stream_output;
diff --git a/src/gallium/drivers/r600/r600d.h b/src/gallium/drivers/r600/r600d.h
index 4b46bd7..8cb5fea 100644
--- a/src/gallium/drivers/r600/r600d.h
+++ b/src/gallium/drivers/r600/r600d.h
@@ -988,6 +988,7 @@
 #define   G_0287F0_SOURCE_SELECT(x)                    (((x) >> 0) & 0x3)
 #define   C_0287F0_SOURCE_SELECT                       0xFFFFFFFC
 #define     V_0287F0_DI_SRC_SEL_DMA                    0
+#define     V_0287F0_DI_SRC_SEL_IMMEDIATE              1
 #define     V_0287F0_DI_SRC_SEL_AUTO_INDEX             2
 #define   S_0287F0_MAJOR_MODE(x)                       (((x) & 0x3) << 2)
 #define   G_0287F0_MAJOR_MODE(x)                       (((x) >> 2) & 0x3)
-- 
1.7.9.5

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to