On 03/05/16 01:10, Kenneth Graunke wrote: > On Friday, April 29, 2016 1:29:53 PM PDT Samuel Iglesias Gonsálvez wrote: >> When there is a mix of definitions of uniforms with 32-bit or 64-bit >> data type sizes, the driver ends up doing misaligned access to double >> based variables in the push constant buffer. >> >> To fix this, the driver adds padding when needed in the push constant >> buffer and takes it into account to avoid exceeding its limits and to >> update the GRF registers to access uniform variables. >> >> v2 (Iago): >> - Renamed some variables for clarity. >> - Replace the boolean array of paddings that tracked if each param was >> padded by an integer array that keeps count of accumnulated padding. >> This allows us to remove a couple of loops that had to compute that. >> - Reworked things a bit so we can get rid of the nr_paddings field in >> brw_stage_prog_data. >> - Use rzalloc_array instead or ralloc_array and memset. >> - Fixed wrong indentation. >> >> Signed-off-by: Samuel Iglesias Gonsálvez <sigles...@igalia.com> >> Signed-off-by: Iago Toral Quiroga <ito...@igalia.com> >> --- >> src/mesa/drivers/dri/i965/brw_compiler.h | 8 ++++ >> src/mesa/drivers/dri/i965/brw_fs.cpp | 50 +++++++++++++++++++++ > +--- >> src/mesa/drivers/dri/i965/brw_program.c | 1 + >> src/mesa/drivers/dri/i965/brw_wm.c | 2 + >> src/mesa/drivers/dri/i965/gen6_constant_state.c | 12 ++++-- >> 5 files changed, 64 insertions(+), 9 deletions(-) >> >> diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/ > dri/i965/brw_compiler.h >> index 5807305..bc15bf3 100644 >> --- a/src/mesa/drivers/dri/i965/brw_compiler.h >> +++ b/src/mesa/drivers/dri/i965/brw_compiler.h >> @@ -333,6 +333,14 @@ struct brw_stage_prog_data { >> } binding_table; >> >> GLuint nr_params; /**< number of float params/constants */ >> + /** >> + * Accumulated padding (in 32-bit units) in the push constant buffer for >> + * each param. If param_padding[n] = P, it means that params 0..n > required >> + * a total accumulated padding of P 32-bit slots. This is useful to > compute >> + * the actual location, including padding, for each param. >> + */ >> + uint32_t *param_padding; >> + >> GLuint nr_pull_params; >> unsigned nr_image_params; >> >> diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/ > i965/brw_fs.cpp >> index 53f709b..5aa6ded 100644 >> --- a/src/mesa/drivers/dri/i965/brw_fs.cpp >> +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp >> @@ -1571,7 +1571,8 @@ fs_visitor::assign_curb_setup() >> int uniform_nr = inst->src[i].nr + inst->src[i].reg_offset; >> int constant_nr; >> if (uniform_nr >= 0 && uniform_nr < (int) uniforms) { >> - constant_nr = push_constant_loc[uniform_nr]; >> + constant_nr = push_constant_loc[uniform_nr] + >> + stage_prog_data->param_padding[uniform_nr]; >> } else { >> /* Section 5.11 of the OpenGL 4.1 spec says: >> * "Out-of-bounds reads return undefined values, which > include >> @@ -2010,7 +2011,10 @@ fs_visitor::assign_constant_locations() >> return; >> >> bool is_live[uniforms]; >> + bool is_64bit[uniforms]; >> + >> memset(is_live, 0, sizeof(is_live)); >> + memset(is_64bit, 0, sizeof(is_64bit)); >> >> /* For each uniform slot, a value of true indicates that the given slot > and >> * the next slot must remain contiguous. This is used to keep us from >> @@ -2047,8 +2051,11 @@ fs_visitor::assign_constant_locations() >> is_live[last] = true; >> } else { >> if (constant_nr >= 0 && constant_nr < (int) uniforms) { >> - for (int j = 0; j < inst->regs_read(i); j++) >> + for (int j = 0; j < inst->regs_read(i); j++) { >> is_live[constant_nr + j] = true; >> + if (type_sz(inst->src[i].type) == 8) >> + is_64bit[constant_nr + j] = true; >> + } >> } >> } >> } >> @@ -2072,14 +2079,18 @@ fs_visitor::assign_constant_locations() >> >> unsigned int num_push_constants = 0; >> unsigned int num_pull_constants = 0; >> + unsigned num_paddings = 0; >> + bool is_64bit_aligned = true; >> >> push_constant_loc = ralloc_array(mem_ctx, int, uniforms); >> pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); >> + stage_prog_data->param_padding = rzalloc_array(NULL, uint32_t, > uniforms); >> >> int chunk_start = -1; >> for (unsigned u = 0; u < uniforms; u++) { >> push_constant_loc[u] = -1; >> pull_constant_loc[u] = -1; >> + stage_prog_data->param_padding[u] = num_paddings; >> >> if (!is_live[u]) >> continue; >> @@ -2094,17 +2105,44 @@ fs_visitor::assign_constant_locations() >> */ >> if (!contiguous[u]) { >> unsigned chunk_size = u - chunk_start + 1; >> + unsigned total_push_components = num_push_constants + > num_paddings; >> + bool push_32_bits_constant = !is_64bit[u] && >> + total_push_components + chunk_size <= max_push_components; >> + /* Verify if the 64 bits constant can be saved into the push > constant >> + * buffer. Take into account that it might need padding. >> + */ >> + bool push_64_bits_constant = is_64bit[u] && >> + (total_push_components + chunk_size + 1 + !is_64bit_aligned) <= >> + max_push_components; >> >> /* Decide whether we should push or pull this parameter. In the >> * Vulkan driver, push constants are explicitly exposed via the > API >> * so we push everything. In GL, we only push small arrays. >> */ >> if (stage_prog_data->pull_param == NULL || >> - (num_push_constants + chunk_size <= max_push_components && >> + ((push_32_bits_constant || push_64_bits_constant) && >> chunk_size <= max_chunk_size)) { >> - assert(num_push_constants + chunk_size <= max_push_components); >> - for (unsigned j = chunk_start; j <= u; j++) >> + assert(total_push_components + chunk_size <= > max_push_components); >> + >> + for (unsigned j = chunk_start; j <= u; j++) { >> + /* If it is a double and it is not aligned to 64 bits, add > padding. >> + * It is already aligned, don't add padding. >> + */ >> + if (push_64_bits_constant) { >> + if (!is_64bit_aligned) { >> + num_paddings++; >> + stage_prog_data->param_padding[j] = num_paddings; >> + is_64bit_aligned = true; >> + } >> + } else { >> + /* We are pushing a 32 bits constant. Is the next push > constant >> + * aligned 64 bits? >> + */ >> + is_64bit_aligned = !is_64bit_aligned; >> + } >> + >> push_constant_loc[j] = num_push_constants++; >> + } >> } else { >> for (unsigned j = chunk_start; j <= u; j++) >> pull_constant_loc[j] = num_pull_constants++; >> @@ -2114,7 +2152,7 @@ fs_visitor::assign_constant_locations() >> } >> } >> >> - stage_prog_data->nr_params = num_push_constants; >> + stage_prog_data->nr_params = num_push_constants + num_paddings; >> stage_prog_data->nr_pull_params = num_pull_constants; >> >> /* Up until now, the param[] array has been indexed by reg + reg_offset >> diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/ > i965/brw_program.c >> index 3112c0c..2d145cd 100644 >> --- a/src/mesa/drivers/dri/i965/brw_program.c >> +++ b/src/mesa/drivers/dri/i965/brw_program.c >> @@ -555,6 +555,7 @@ brw_stage_prog_data_free(const void *p) >> struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p; >> >> ralloc_free(prog_data->param); >> + ralloc_free(prog_data->param_padding); >> ralloc_free(prog_data->pull_param); >> ralloc_free(prog_data->image_param); >> } >> diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/ > brw_wm.c >> index dbc626c..025d484 100644 >> --- a/src/mesa/drivers/dri/i965/brw_wm.c >> +++ b/src/mesa/drivers/dri/i965/brw_wm.c >> @@ -103,6 +103,8 @@ brw_codegen_wm_prog(struct brw_context *brw, >> param_count += 2 * ctx- >> Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits; >> prog_data.base.param = >> rzalloc_array(NULL, const gl_constant_value *, param_count); >> + prog_data.base.param_padding = >> + rzalloc_array(NULL, uint32_t, param_count); >> prog_data.base.pull_param = >> rzalloc_array(NULL, const gl_constant_value *, param_count); >> prog_data.base.image_param = >> diff --git a/src/mesa/drivers/dri/i965/gen6_constant_state.c b/src/mesa/ > drivers/dri/i965/gen6_constant_state.c >> index 6c0c32b..3d0caf2 100644 >> --- a/src/mesa/drivers/dri/i965/gen6_constant_state.c >> +++ b/src/mesa/drivers/dri/i965/gen6_constant_state.c >> @@ -135,7 +135,9 @@ gen6_upload_push_constants(struct brw_context *brw, >> _mesa_load_state_parameters(ctx, prog->Parameters); >> >> gl_constant_value *param; >> - int i; >> + int i, p; >> + gl_constant_value zero; >> + zero.u = 0; >> >> param = brw_state_batch(brw, type, >> prog_data->nr_params * > sizeof(gl_constant_value), >> @@ -149,8 +151,12 @@ gen6_upload_push_constants(struct brw_context *brw, >> * side effect of dereferencing uniforms, so _NEW_PROGRAM_CONSTANTS >> * wouldn't be set for them. >> */ >> - for (i = 0; i < prog_data->nr_params; i++) { >> - param[i] = *prog_data->param[i]; >> + for (i = 0, p = 0; i < prog_data->nr_params; i++) { >> + if (p > 0 && prog_data->param_padding && >> + prog_data->param_padding[p] > prog_data->param_padding[p - 1]) > { >> + param[i++] = zero; >> + } >> + param[i] = *prog_data->param[p++]; >> } >> >> if (0) { > > This patch seems much too complicated. A couple things I don't like: > - having to add a param_padding[] array to prog_data > - making padding decisions in upload_push_constants(). > > It seems like we should be able to contain all of this code within > fs_visitor::assign_constant_locations(). It should just make sure > that 64-bit values have an aligned location, account for padding in > nr_params, and fill in any holes in stage_prog_data->param[] by > setting them to &zero, like we do in brw_vec4.cpp:1665: > > static gl_constant_value zero = { 0.0 }; > stage_prog_data->param[slot] = &zero; > > Then, the upload code can just iterate and dereference params[] as > it always has. >
OK. I am going to try this idea first. Thanks to Topi, Jordan and you for the suggestions for this patch! Sam > The idea of "upload all the 64-bit things first, add 0 or 1 padding > slots, then upload all the 32-bit things" also seems like it could > simplify this code a lot. > _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev