On 07/05/16 09:22, Jordan Justen wrote: > On 2016-05-05 23:56:08, Samuel Iglesias Gonsálvez wrote: >> When there is a mix of definitions of uniforms with 32-bit or 64-bit >> data type sizes, the driver ends up doing misaligned access to double >> based variables in the push constant buffer. >> >> To fix this, this patch pushes first all the 64-bit variables and >> then the rest. Then, all the variables would be aligned to >> its data type size. >> >> Signed-off-by: Samuel Iglesias Gonsálvez <sigles...@igalia.com> >> --- >> src/mesa/drivers/dri/i965/brw_fs.cpp | 113 >> +++++++++++++++++++++++++---------- >> 1 file changed, 83 insertions(+), 30 deletions(-) >> >> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp >> b/src/mesa/drivers/dri/i965/brw_fs.cpp >> index 65aa3c7..7eaf1b4 100644 >> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp >> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp >> @@ -39,6 +39,7 @@ >> #include "brw_program.h" >> #include "brw_dead_control_flow.h" >> #include "compiler/glsl_types.h" >> +#include "program/prog_parameter.h" >> >> using namespace brw; >> >> @@ -2004,6 +2005,45 @@ fs_visitor::compact_virtual_grfs() >> return progress; >> } >> >> +static void >> +set_push_pull_constant_loc(unsigned uniform, int &chunk_start, bool >> contiguous, >> + int *push_constant_loc, int *pull_constant_loc, >> + unsigned &num_push_constants, >> + unsigned &num_pull_constants, >> + const unsigned max_push_components, >> + const unsigned max_chunk_size, >> + struct brw_stage_prog_data *stage_prog_data) >> +{ >> + /* This is the first live uniform in the chunk */ >> + if (chunk_start < 0) >> + chunk_start = uniform; >> + >> + /* If this element does not need to be contiguous with the next, we >> + * split at this point and everthing between chunk_start and u forms a >> + * single chunk. >> + */ >> + if (!contiguous) { >> + unsigned chunk_size = uniform - chunk_start + 1; >> + >> + /* Decide whether we should push or pull this parameter. In the >> + * Vulkan driver, push constants are explicitly exposed via the API >> + * so we push everything. In GL, we only push small arrays. >> + */ >> + if (stage_prog_data->pull_param == NULL || >> + (num_push_constants + chunk_size <= max_push_components && >> + chunk_size <= max_chunk_size)) { >> + assert(num_push_constants + chunk_size <= max_push_components); >> + for (unsigned j = chunk_start; j <= uniform; j++) >> + push_constant_loc[j] = num_push_constants++; >> + } else { >> + for (unsigned j = chunk_start; j <= uniform; j++) >> + pull_constant_loc[j] = num_pull_constants++; >> + } >> + >> + chunk_start = -1; >> + } >> +} >> + >> /** >> * Assign UNIFORM file registers to either push constants or pull constants. >> * >> @@ -2022,6 +2062,8 @@ fs_visitor::assign_constant_locations() >> >> bool is_live[uniforms]; >> memset(is_live, 0, sizeof(is_live)); >> + bool is_live_64bit[uniforms]; >> + memset(is_live_64bit, 0, sizeof(is_live_64bit)); >> >> /* For each uniform slot, a value of true indicates that the given slot >> and >> * the next slot must remain contiguous. This is used to keep us from >> @@ -2054,14 +2096,21 @@ fs_visitor::assign_constant_locations() >> for (unsigned j = constant_nr; j < last; j++) { >> is_live[j] = true; >> contiguous[j] = true; >> + if (type_sz(inst->src[i].type) == 8) { >> + is_live_64bit[j] = true; >> + } >> } >> is_live[last] = true; >> } else { >> if (constant_nr >= 0 && constant_nr < (int) uniforms) { >> int regs_read = inst->components_read(i) * >> type_sz(inst->src[i].type) / 4; >> - for (int j = 0; j < regs_read; j++) >> + for (int j = 0; j < regs_read; j++) { >> is_live[constant_nr + j] = true; >> + if (type_sz(inst->src[i].type) == 8) { >> + is_live_64bit[constant_nr + j] = true; >> + } >> + } >> } >> } >> } >> @@ -2090,43 +2139,46 @@ fs_visitor::assign_constant_locations() >> pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); >> >> int chunk_start = -1; >> + >> + >> + /* First push 64-bit uniforms */ > > Maybe add "to ensure they are properly aligned"? >
Yeah, I will add it. >> for (unsigned u = 0; u < uniforms; u++) { >> - push_constant_loc[u] = -1; >> + if (!is_live[u] || !is_live_64bit[u]) >> + continue; >> + >> pull_constant_loc[u] = -1; >> + push_constant_loc[u] = -1; >> >> - if (!is_live[u]) >> - continue; >> + set_push_pull_constant_loc(u, chunk_start, contiguous[u], >> + push_constant_loc, pull_constant_loc, >> + num_push_constants, num_pull_constants, >> + max_push_components, max_chunk_size, >> + stage_prog_data); >> >> - /* This is the first live uniform in the chunk */ >> - if (chunk_start < 0) >> - chunk_start = u; >> + } >> >> - /* If this element does not need to be contiguous with the next, we >> - * split at this point and everthing between chunk_start and u forms a >> - * single chunk. >> - */ >> - if (!contiguous[u]) { >> - unsigned chunk_size = u - chunk_start + 1; >> + /* Then push the rest of uniforms */ >> + for (unsigned u = 0; u < uniforms; u++) { >> + if (!is_live[u] || is_live_64bit[u]) >> + continue; >> >> - /* Decide whether we should push or pull this parameter. In the >> - * Vulkan driver, push constants are explicitly exposed via the API >> - * so we push everything. In GL, we only push small arrays. >> - */ >> - if (stage_prog_data->pull_param == NULL || >> - (num_push_constants + chunk_size <= max_push_components && >> - chunk_size <= max_chunk_size)) { >> - assert(num_push_constants + chunk_size <= max_push_components); >> - for (unsigned j = chunk_start; j <= u; j++) >> - push_constant_loc[j] = num_push_constants++; >> - } else { >> - for (unsigned j = chunk_start; j <= u; j++) >> - pull_constant_loc[j] = num_pull_constants++; >> - } >> + pull_constant_loc[u] = -1; >> + push_constant_loc[u] = -1; >> >> - chunk_start = -1; >> - } >> + set_push_pull_constant_loc(u, chunk_start, contiguous[u], >> + push_constant_loc, pull_constant_loc, >> + num_push_constants, num_pull_constants, >> + max_push_components, max_chunk_size, >> + stage_prog_data); >> } >> >> + /* As the uniforms are going to be reordered, take the data from a >> temporal > > temporary > OK >> + * copy of the original param[]. >> + */ >> + gl_constant_value **param = rzalloc_array(mem_ctx, gl_constant_value*, >> + stage_prog_data->nr_params); > > Since we always freeing it below, should we use NULL instead of > mem_ctx? > Yes, at the end using mem_ctx doesn't give any advantage as we are freeing it below. I'll change it. > I think don't need the 'z' version since we use memcpy next. Right! > > Reviewed-by: Jordan Justen <jordan.l.jus...@intel.com> > OK, thanks a lot for the review! Sam >> + memcpy(param, stage_prog_data->param, >> + sizeof(gl_constant_value*) * stage_prog_data->nr_params); >> stage_prog_data->nr_params = num_push_constants; >> stage_prog_data->nr_pull_params = num_pull_constants; >> >> @@ -2139,7 +2191,7 @@ fs_visitor::assign_constant_locations() >> * having to make a copy. >> */ >> for (unsigned int i = 0; i < uniforms; i++) { >> - const gl_constant_value *value = stage_prog_data->param[i]; >> + const gl_constant_value *value = param[i]; >> >> if (pull_constant_loc[i] != -1) { >> stage_prog_data->pull_param[pull_constant_loc[i]] = value; >> @@ -2147,6 +2199,7 @@ fs_visitor::assign_constant_locations() >> stage_prog_data->param[push_constant_loc[i]] = value; >> } >> } >> + ralloc_free(param); >> } >> >> /** >> -- >> 2.5.0 >> >> _______________________________________________ >> mesa-dev mailing list >> mesa-dev@lists.freedesktop.org >> https://lists.freedesktop.org/mailman/listinfo/mesa-dev > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev