From: Marek Olšák <marek.ol...@amd.com> We know the divisors when we upload them, so instead we can precompute and upload division factors derived from each divisor.
This fast division consists of add, mul_hi, and two shifts, and we have to load 4 dwords instead of 1. This probably won't affect any apps. --- src/gallium/drivers/radeonsi/si_shader.c | 46 +++++++++++++++----------------- src/gallium/drivers/radeonsi/si_state.c | 42 ++++++++++++++++++++++++----- src/gallium/drivers/radeonsi/si_state.h | 2 +- 3 files changed, 57 insertions(+), 33 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 36f58e2..90cb059 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -421,34 +421,20 @@ static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx) return LLVMConstInt(ctx->i32, stride, 0); } return si_unpack_param(ctx, ctx->param_vs_state_bits, 24, 8); default: assert(0); return NULL; } } -static LLVMValueRef get_instance_index_for_fetch( - struct si_shader_context *ctx, - unsigned param_start_instance, LLVMValueRef divisor) -{ - LLVMValueRef result = ctx->abi.instance_id; - - /* The division must be done before START_INSTANCE is added. */ - if (divisor != ctx->i32_1) - result = LLVMBuildUDiv(ctx->ac.builder, result, divisor, ""); - - return LLVMBuildAdd(ctx->ac.builder, result, - LLVMGetParam(ctx->main_fn, param_start_instance), ""); -} - /* Bitcast <4 x float> to <2 x double>, extract the component, and convert * to float. 
*/ static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx, LLVMValueRef vec4, unsigned double_index) { LLVMBuilderRef builder = ctx->ac.builder; LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->ac.context); LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4, LLVMVectorType(f64, 2), ""); @@ -7294,34 +7280,44 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx, ac_build_load_to_sgpr(&ctx->ac, list, buf_index); } for (i = 0; i <= key->vs_prolog.last_input; i++) { bool divisor_is_one = key->vs_prolog.states.instance_divisor_is_one & (1u << i); bool divisor_is_fetched = key->vs_prolog.states.instance_divisor_is_fetched & (1u << i); LLVMValueRef index; - if (divisor_is_one || divisor_is_fetched) { - LLVMValueRef divisor = ctx->i32_1; + if (divisor_is_one) { + index = ctx->abi.instance_id; + } else if (divisor_is_fetched) { + LLVMValueRef udiv_factors[4]; - if (divisor_is_fetched) { - divisor = buffer_load_const(ctx, instance_divisor_constbuf, - LLVMConstInt(ctx->i32, i * 4, 0)); - divisor = ac_to_integer(&ctx->ac, divisor); + for (unsigned j = 0; j < 4; j++) { + udiv_factors[j] = + buffer_load_const(ctx, instance_divisor_constbuf, + LLVMConstInt(ctx->i32, i*16 + j*4, 0)); + udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]); } + /* The faster NUW version doesn't work when InstanceID == UINT_MAX. + * Such InstanceID might not be achievable in a reasonable time though. + */ + index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id, + udiv_factors[0], udiv_factors[1], + udiv_factors[2], udiv_factors[3]); + } - /* InstanceID / Divisor + StartInstance */ - index = get_instance_index_for_fetch(ctx, - user_sgpr_base + - SI_SGPR_START_INSTANCE, - divisor); + if (divisor_is_one || divisor_is_fetched) { + /* Add StartInstance. 
*/ + index = LLVMBuildAdd(ctx->ac.builder, index, + LLVMGetParam(ctx->main_fn, user_sgpr_base + + SI_SGPR_START_INSTANCE), ""); } else { /* VertexID + BaseVertex */ index = LLVMBuildAdd(ctx->ac.builder, ctx->abi.vertex_id, LLVMGetParam(func, user_sgpr_base + SI_SGPR_BASE_VERTEX), ""); } index = ac_to_float(&ctx->ac, index); ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index, diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index bc1417a..aa57b3f 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -25,20 +25,21 @@ #include "si_build_pm4.h" #include "gfx9d.h" #include "si_query.h" #include "util/u_dual_blend.h" #include "util/u_format.h" #include "util/u_format_s3tc.h" #include "util/u_memory.h" #include "util/u_resource.h" #include "util/u_upload_mgr.h" +#include "util/fast_idiv_by_const.h" static unsigned si_map_swizzle(unsigned swizzle) { switch (swizzle) { case PIPE_SWIZZLE_Y: return V_008F0C_SQ_SEL_Y; case PIPE_SWIZZLE_Z: return V_008F0C_SQ_SEL_Z; case PIPE_SWIZZLE_W: return V_008F0C_SQ_SEL_W; @@ -4348,20 +4349,26 @@ static void si_delete_sampler_state(struct pipe_context *ctx, void *state) * Vertex elements & buffers */ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned count, const struct pipe_vertex_element *elements) { struct si_screen *sscreen = (struct si_screen*)ctx->screen; struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements); bool used[SI_NUM_VERTEX_BUFFERS] = {}; + struct util_fast_udiv_info divisor_factors[SI_MAX_ATTRIBS] = {}; + STATIC_ASSERT(sizeof(struct util_fast_udiv_info) == 16); + STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4); + STATIC_ASSERT(sizeof(divisor_factors[0].pre_shift) == 4); + STATIC_ASSERT(sizeof(divisor_factors[0].post_shift) == 4); + STATIC_ASSERT(sizeof(divisor_factors[0].increment) == 4); int i; assert(count <= SI_MAX_ATTRIBS); if (!v) return NULL; v->count = count; 
v->desc_list_byte_size = align(count * 16, SI_CPDMA_ALIGNMENT); for (i = 0; i < count; ++i) { @@ -4370,28 +4377,31 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, unsigned data_format, num_format; int first_non_void; unsigned vbo_index = elements[i].vertex_buffer_index; unsigned char swizzle[4]; if (vbo_index >= SI_NUM_VERTEX_BUFFERS) { FREE(v); return NULL; } - if (elements[i].instance_divisor) { + unsigned instance_divisor = elements[i].instance_divisor; + if (instance_divisor) { v->uses_instance_divisors = true; - v->instance_divisors[i] = elements[i].instance_divisor; - if (v->instance_divisors[i] == 1) + if (instance_divisor == 1) { v->instance_divisor_is_one |= 1u << i; - else + } else { v->instance_divisor_is_fetched |= 1u << i; + divisor_factors[i] = + util_compute_fast_udiv_info(instance_divisor, 32); + } } if (!used[vbo_index]) { v->first_vb_use_mask |= 1 << i; used[vbo_index] = true; } desc = util_format_description(elements[i].src_format); first_non_void = util_format_get_first_non_void_channel(elements[i].src_format); data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void); @@ -4487,20 +4497,36 @@ static void *si_create_vertex_elements(struct pipe_context *ctx, } } v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(swizzle[0])) | S_008F0C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | S_008F0C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | S_008F0C_DST_SEL_W(si_map_swizzle(swizzle[3])) | S_008F0C_NUM_FORMAT(num_format) | S_008F0C_DATA_FORMAT(data_format); } + + if (v->instance_divisor_is_fetched) { + unsigned num_divisors = util_last_bit(v->instance_divisor_is_fetched); + + v->instance_divisor_factor_buffer = + (struct r600_resource*) + pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT, + num_divisors * sizeof(divisor_factors[0])); + if (!v->instance_divisor_factor_buffer) { + FREE(v); + return NULL; + } + void *map = sscreen->ws->buffer_map(v->instance_divisor_factor_buffer->buf, + NULL, PIPE_TRANSFER_WRITE); + 
memcpy(map , divisor_factors, num_divisors * sizeof(divisor_factors[0])); + } return v; } static void si_bind_vertex_elements(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; struct si_vertex_elements *old = sctx->vertex_elements; struct si_vertex_elements *v = (struct si_vertex_elements*)state; sctx->vertex_elements = v; @@ -4510,34 +4536,36 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state) (!old || old->count != v->count || old->uses_instance_divisors != v->uses_instance_divisors || v->uses_instance_divisors || /* we don't check which divisors changed */ memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count))) sctx->do_update_shaders = true; if (v && v->instance_divisor_is_fetched) { struct pipe_constant_buffer cb; - cb.buffer = NULL; - cb.user_buffer = v->instance_divisors; + cb.buffer = &v->instance_divisor_factor_buffer->b.b; + cb.user_buffer = NULL; cb.buffer_offset = 0; - cb.buffer_size = sizeof(uint32_t) * v->count; + cb.buffer_size = 0xffffffff; si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb); } } static void si_delete_vertex_element(struct pipe_context *ctx, void *state) { struct si_context *sctx = (struct si_context *)ctx; + struct si_vertex_elements *v = (struct si_vertex_elements*)state; if (sctx->vertex_elements == state) sctx->vertex_elements = NULL; + r600_resource_reference(&v->instance_divisor_factor_buffer, NULL); FREE(state); } static void si_set_vertex_buffers(struct pipe_context *ctx, unsigned start_slot, unsigned count, const struct pipe_vertex_buffer *buffers) { struct si_context *sctx = (struct si_context *)ctx; struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot; int i; diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 89bb5b6..d9c3e70 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -131,21 +131,21 @@ struct si_state_dsa { 
}; struct si_stencil_ref { struct pipe_stencil_ref state; struct si_dsa_stencil_ref_part dsa_part; }; struct si_vertex_elements { - uint32_t instance_divisors[SI_MAX_ATTRIBS]; + struct r600_resource *instance_divisor_factor_buffer; uint32_t rsrc_word3[SI_MAX_ATTRIBS]; uint16_t src_offset[SI_MAX_ATTRIBS]; uint8_t fix_fetch[SI_MAX_ATTRIBS]; uint8_t format_size[SI_MAX_ATTRIBS]; uint8_t vertex_buffer_index[SI_MAX_ATTRIBS]; uint8_t count; bool uses_instance_divisors; uint16_t first_vb_use_mask; -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev