From: Marek Olšák <marek.olsak@amd.com> --- src/gallium/auxiliary/util/u_vbuf.c | 189 ++++++++++++++++++++++++---- 1 file changed, 165 insertions(+), 24 deletions(-)
diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c index 87b159ec1bf..b0b92f7e966 100644 --- a/src/gallium/auxiliary/util/u_vbuf.c +++ b/src/gallium/auxiliary/util/u_vbuf.c @@ -1132,20 +1132,45 @@ static void u_vbuf_set_driver_vertex_buffers(struct u_vbuf *mgr) unsigned start_slot, count; start_slot = ffs(mgr->dirty_real_vb_mask) - 1; count = util_last_bit(mgr->dirty_real_vb_mask >> start_slot); pipe->set_vertex_buffers(pipe, start_slot, count, mgr->real_vertex_buffer + start_slot); mgr->dirty_real_vb_mask = 0; } +static void +u_vbuf_split_indexed_multidraw(struct u_vbuf *mgr, struct pipe_draw_info *info, + unsigned *indirect_data, unsigned stride, + unsigned draw_count) +{ + assert(info->index_size); + info->indirect = NULL; + + for (unsigned i = 0; i < draw_count; i++) { + unsigned offset = i * stride / 4; + + info->count = indirect_data[offset + 0]; + info->instance_count = indirect_data[offset + 1]; + + if (!info->count || !info->instance_count) + continue; + + info->start = indirect_data[offset + 2]; + info->index_bias = indirect_data[offset + 3]; + info->start_instance = indirect_data[offset + 4]; + + u_vbuf_draw_vbo(mgr, info); + } +} + void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info) { struct pipe_context *pipe = mgr->pipe; int start_vertex, min_index; unsigned num_vertices; boolean unroll_indices = FALSE; const uint32_t used_vb_mask = mgr->ve->used_vb_mask; uint32_t user_vb_mask = mgr->user_vb_mask & used_vb_mask; const uint32_t incompatible_vb_mask = mgr->incompatible_vb_mask & used_vb_mask; @@ -1160,47 +1185,162 @@ void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info) if (mgr->dirty_real_vb_mask & used_vb_mask) { u_vbuf_set_driver_vertex_buffers(mgr); } pipe->draw_vbo(pipe, info); return; } new_info = *info; - /* Fallback. We need to know all the parameters. */ + /* Handle indirect (multi)draws. 
*/ if (new_info.indirect) { - struct pipe_transfer *transfer = NULL; - int *data; - - if (new_info.index_size) { - data = pipe_buffer_map_range(pipe, new_info.indirect->buffer, - new_info.indirect->offset, 20, - PIPE_TRANSFER_READ, &transfer); - new_info.index_bias = data[3]; - new_info.start_instance = data[4]; - } - else { - data = pipe_buffer_map_range(pipe, new_info.indirect->buffer, - new_info.indirect->offset, 16, - PIPE_TRANSFER_READ, &transfer); - new_info.start_instance = data[3]; + const struct pipe_draw_indirect_info *indirect = new_info.indirect; + unsigned draw_count = 0; + + /* Get the number of draws. */ + if (indirect->indirect_draw_count) { + pipe_buffer_read(pipe, indirect->indirect_draw_count, + indirect->indirect_draw_count_offset, + 4, &draw_count); + } else { + draw_count = indirect->draw_count; } - new_info.count = data[0]; - new_info.instance_count = data[1]; - new_info.start = data[2]; - pipe_buffer_unmap(pipe, transfer); - new_info.indirect = NULL; - - if (!new_info.count) + if (!draw_count) return; + + unsigned data_size = (draw_count - 1) * indirect->stride + + (new_info.index_size ? 20 : 16); + unsigned *data = alloca(data_size); + + /* Read the used buffer range only once, because the read can be + * uncached. + */ + pipe_buffer_read(pipe, indirect->buffer, indirect->offset, data_size, + data); + + if (info->index_size) { + /* Indexed multidraw. */ + unsigned index_bias0 = data[3]; + bool index_bias_same = true; + + /* If we invoke the translate path, we have to split the multidraw. */ + if (incompatible_vb_mask || + mgr->ve->incompatible_elem_mask) { + u_vbuf_split_indexed_multidraw(mgr, &new_info, data, + indirect->stride, draw_count); + return; + } + + /* See if index_bias is the same for all draws. */ + for (unsigned i = 1; i < draw_count; i++) { + if (data[i * indirect->stride / 4 + 3] != index_bias0) { + index_bias_same = false; + break; + } + } + + /* Split the multidraw if index_bias is different. 
*/ + if (!index_bias_same) { + u_vbuf_split_indexed_multidraw(mgr, &new_info, data, + indirect->stride, draw_count); + return; + } + + /* If we don't need to use the translate path and index_bias is + * the same, we can process the multidraw with the time complexity + * equal to 1 draw call (except for the index range computation). + * We only need to compute the index range covering all draw calls + * of the multidraw. + * + * The driver will not look at these values because indirect != NULL. + * These values determine the user buffer bounds to upload. + */ + new_info.index_bias = index_bias0; + new_info.min_index = ~0u; + new_info.max_index = 0; + new_info.start_instance = ~0u; + new_info.instance_count = 0; + + struct pipe_transfer *transfer = NULL; + const uint8_t *indices; + + if (info->has_user_indices) { + indices = (uint8_t*)info->index.user; + } else { + indices = (uint8_t*)pipe_buffer_map(pipe, info->index.resource, + PIPE_TRANSFER_READ, &transfer); + } + + for (unsigned i = 0; i < draw_count; i++) { + unsigned offset = i * indirect->stride / 4; + unsigned start = data[offset + 2]; + unsigned count = data[offset + 0]; + unsigned instance_count = data[offset + 1]; + + if (!count || !instance_count) + continue; + + /* Update the ranges of instances. */ + new_info.start_instance = MIN2(new_info.start_instance, + data[offset + 4]); + new_info.instance_count = MAX2(new_info.instance_count, + instance_count); + + /* Update the index range. */ + unsigned min, max; + new_info.count = count; /* only used by get_minmax_index */ + u_vbuf_get_minmax_index_mapped(&new_info, + indices + + new_info.index_size * start, + (int*)&min, (int*)&max); + + new_info.min_index = MIN2(new_info.min_index, min); + new_info.max_index = MAX2(new_info.max_index, max); + } + + if (transfer) + pipe_buffer_unmap(pipe, transfer); + + if (!new_info.instance_count) + return; + } else { + /* Non-indexed multidraw. 
+ * + * Keep the draw call indirect and compute minimums & maximums, + * which will determine the user buffer bounds to upload, but + * the driver will not look at these values because indirect != NULL. + * + * This efficiently processes the multidraw with the time complexity + * equal to 1 draw call. + */ + new_info.start = ~0u; + new_info.count = 0; + new_info.start_instance = ~0u; + new_info.instance_count = 0; + + for (unsigned i = 0; i < draw_count; i++) { + unsigned offset = i * indirect->stride / 4; + + new_info.start = MIN2(new_info.start, data[offset + 2]); + new_info.count = MAX2(new_info.count, data[offset + 0]); + + new_info.start_instance = MIN2(new_info.start_instance, + data[offset + 3]); + new_info.instance_count = MAX2(new_info.instance_count, + data[offset + 1]); + } + + if (!new_info.count || !new_info.instance_count) + return; + } } if (new_info.index_size) { /* See if anything needs to be done for per-vertex attribs. */ if (u_vbuf_need_minmax_index(mgr)) { int max_index; if (new_info.max_index != ~0u) { min_index = new_info.min_index; max_index = new_info.max_index; @@ -1211,21 +1351,22 @@ void u_vbuf_draw_vbo(struct u_vbuf *mgr, const struct pipe_draw_info *info) assert(min_index <= max_index); start_vertex = min_index + new_info.index_bias; num_vertices = max_index + 1 - min_index; /* Primitive restart doesn't work when unrolling indices. * We would have to break this drawing operation into several ones. */ /* Use some heuristic to see if unrolling indices improves * performance. */ - if (!new_info.primitive_restart && + if (!info->indirect && + !new_info.primitive_restart && num_vertices > new_info.count*2 && num_vertices - new_info.count > 32 && !u_vbuf_mapping_vertex_buffer_blocks(mgr)) { unroll_indices = TRUE; user_vb_mask &= ~(mgr->nonzero_stride_vb_mask & mgr->ve->noninstance_vb_mask_any); } } else { /* Nothing to do for per-vertex attribs. 
*/ start_vertex = 0; -- 2.17.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev