On Fri, Apr 29, 2016 at 01:29:53PM +0200, Samuel Iglesias Gons?lvez wrote: > When there is a mix of definitions of uniforms with 32-bit or 64-bit > data type sizes, the driver ends up doing misaligned access to double > based variables in the push constant buffer. > > To fix this, the driver adds padding when needed in the push constant > buffer and takes it into account to avoid exceeding its limits and to > update the GRF registers to access uniform variables. > > v2 (Iago): > - Renamed some variables for clarity. > - Replace the boolean array of paddings that tracked if each param was > padded by an integer array that keeps count of accumnulated padding. > This allows us to remove a couple of loops that had to compute that. > - Reworked things a bit so we can get rid of the nr_paddings field in > brw_stage_prog_data. > - Use rzalloc_array instead or ralloc_array and memset. > - Fixed wrong indentation. > > Signed-off-by: Samuel Iglesias Gonsálvez <sigles...@igalia.com> > Signed-off-by: Iago Toral Quiroga <ito...@igalia.com> > --- > src/mesa/drivers/dri/i965/brw_compiler.h | 8 ++++ > src/mesa/drivers/dri/i965/brw_fs.cpp | 50 > ++++++++++++++++++++++--- > src/mesa/drivers/dri/i965/brw_program.c | 1 + > src/mesa/drivers/dri/i965/brw_wm.c | 2 + > src/mesa/drivers/dri/i965/gen6_constant_state.c | 12 ++++-- > 5 files changed, 64 insertions(+), 9 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h > b/src/mesa/drivers/dri/i965/brw_compiler.h > index 5807305..bc15bf3 100644 > --- a/src/mesa/drivers/dri/i965/brw_compiler.h > +++ b/src/mesa/drivers/dri/i965/brw_compiler.h > @@ -333,6 +333,14 @@ struct brw_stage_prog_data { > } binding_table; > > GLuint nr_params; /**< number of float params/constants */ > + /** > + * Accumulated padding (in 32-bit units) in the push constant buffer for > + * each param. If param_padding[n] = P, it means that params 0..n required > + * a total accumulated padding of P 32-bit slots. This is useful to > compute > + * the actual location, including padding, for each param. > + */ > + uint32_t *param_padding; > + > GLuint nr_pull_params; > unsigned nr_image_params; > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp > b/src/mesa/drivers/dri/i965/brw_fs.cpp > index 53f709b..5aa6ded 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp > @@ -1571,7 +1571,8 @@ fs_visitor::assign_curb_setup() > int uniform_nr = inst->src[i].nr + inst->src[i].reg_offset; > int constant_nr; > if (uniform_nr >= 0 && uniform_nr < (int) uniforms) { > - constant_nr = push_constant_loc[uniform_nr]; > + constant_nr = push_constant_loc[uniform_nr] + > + stage_prog_data->param_padding[uniform_nr]; > } else { > /* Section 5.11 of the OpenGL 4.1 spec says: > * "Out-of-bounds reads return undefined values, which include > @@ -2010,7 +2011,10 @@ fs_visitor::assign_constant_locations() > return; > > bool is_live[uniforms]; > + bool is_64bit[uniforms]; > + > memset(is_live, 0, sizeof(is_live)); > + memset(is_64bit, 0, sizeof(is_64bit)); > > /* For each uniform slot, a value of true indicates that the given slot > and > * the next slot must remain contiguous. This is used to keep us from > @@ -2047,8 +2051,11 @@ fs_visitor::assign_constant_locations() > is_live[last] = true; > } else { > if (constant_nr >= 0 && constant_nr < (int) uniforms) { > - for (int j = 0; j < inst->regs_read(i); j++) > + for (int j = 0; j < inst->regs_read(i); j++) { > is_live[constant_nr + j] = true; > + if (type_sz(inst->src[i].type) == 8) > + is_64bit[constant_nr + j] = true; > + } > } > } > } > @@ -2072,14 +2079,18 @@ fs_visitor::assign_constant_locations() > > unsigned int num_push_constants = 0; > unsigned int num_pull_constants = 0; > + unsigned num_paddings = 0; > + bool is_64bit_aligned = true; > > push_constant_loc = ralloc_array(mem_ctx, int, uniforms); > pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); > + stage_prog_data->param_padding = rzalloc_array(NULL, uint32_t, uniforms); > > int chunk_start = -1; > for (unsigned u = 0; u < uniforms; u++) { > push_constant_loc[u] = -1; > pull_constant_loc[u] = -1; > + stage_prog_data->param_padding[u] = num_paddings; > > if (!is_live[u]) > continue; > @@ -2094,17 +2105,44 @@ fs_visitor::assign_constant_locations() > */ > if (!contiguous[u]) { > unsigned chunk_size = u - chunk_start + 1; > + unsigned total_push_components = num_push_constants + num_paddings; > + bool push_32_bits_constant = !is_64bit[u] && > + total_push_components + chunk_size <= max_push_components; > + /* Verify if the 64 bits constant can be saved into the push > constant > + * buffer. Take into account that it might need padding. > + */ > + bool push_64_bits_constant = is_64bit[u] && > + (total_push_components + chunk_size + 1 + !is_64bit_aligned) <= > + max_push_components; > > /* Decide whether we should push or pull this parameter. In the > * Vulkan driver, push constants are explicitly exposed via the API > * so we push everything. In GL, we only push small arrays. > */ > if (stage_prog_data->pull_param == NULL || > - (num_push_constants + chunk_size <= max_push_components && > + ((push_32_bits_constant || push_64_bits_constant) && > chunk_size <= max_chunk_size)) { > - assert(num_push_constants + chunk_size <= max_push_components); > - for (unsigned j = chunk_start; j <= u; j++) > + assert(total_push_components + chunk_size <= > max_push_components); > + > + for (unsigned j = chunk_start; j <= u; j++) { > + /* If it is a double and it is not aligned to 64 bits, add > padding. > + * It is already aligned, don't add padding. > + */ > + if (push_64_bits_constant) { > + if (!is_64bit_aligned) { > + num_paddings++; > + stage_prog_data->param_padding[j] = num_paddings; > + is_64bit_aligned = true; > + } > + } else { > + /* We are pushing a 32 bits constant. Is the next push > constant > + * aligned 64 bits? > + */ > + is_64bit_aligned = !is_64bit_aligned; > + } > + > push_constant_loc[j] = num_push_constants++; > + }
I am probably greatly over simplifying but I have to ask. Could we simply add a hole in the push file by increasing the counter? After all it is push_constant_loc[] that controls in the end the offset in the instructions (after running assign_curb_setup()) and tells the loader where to copy the data from the uniform storage. Something of this sort: if (!contiguous[u]) { unsigned chunk_size = u - chunk_start + 1; + if (is_64bit[u] && num_push_constants % 2) + num_push_constants++; /* Decide whether we should push or pull this parameter. In the * Vulkan driver, push constants are explicitly exposed via the API * so we push everything. In GL, we only push small arrays. */ if (stage_prog_data->pull_param == NULL || (num_push_constants + chunk_size <= max_push_components && chunk_size <= max_chunk_size)) { assert(num_push_constants + chunk_size <= max_push_components); for (unsigned j = chunk_start; j <= u; j++) push_constant_loc[j] = num_push_constants++; > } else { > for (unsigned j = chunk_start; j <= u; j++) > pull_constant_loc[j] = num_pull_constants++; > @@ -2114,7 +2152,7 @@ fs_visitor::assign_constant_locations() > } > } > > - stage_prog_data->nr_params = num_push_constants; > + stage_prog_data->nr_params = num_push_constants + num_paddings; > stage_prog_data->nr_pull_params = num_pull_constants; > > /* Up until now, the param[] array has been indexed by reg + reg_offset > diff --git a/src/mesa/drivers/dri/i965/brw_program.c > b/src/mesa/drivers/dri/i965/brw_program.c > index 3112c0c..2d145cd 100644 > --- a/src/mesa/drivers/dri/i965/brw_program.c > +++ b/src/mesa/drivers/dri/i965/brw_program.c > @@ -555,6 +555,7 @@ brw_stage_prog_data_free(const void *p) > struct brw_stage_prog_data *prog_data = (struct brw_stage_prog_data *)p; > > ralloc_free(prog_data->param); > + ralloc_free(prog_data->param_padding); > ralloc_free(prog_data->pull_param); > ralloc_free(prog_data->image_param); > } > diff --git a/src/mesa/drivers/dri/i965/brw_wm.c > b/src/mesa/drivers/dri/i965/brw_wm.c > index dbc626c..025d484 100644 > --- a/src/mesa/drivers/dri/i965/brw_wm.c > +++ b/src/mesa/drivers/dri/i965/brw_wm.c > @@ -103,6 +103,8 @@ brw_codegen_wm_prog(struct brw_context *brw, > param_count += 2 * > ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits; > prog_data.base.param = > rzalloc_array(NULL, const gl_constant_value *, param_count); > + prog_data.base.param_padding = > + rzalloc_array(NULL, uint32_t, param_count); > prog_data.base.pull_param = > rzalloc_array(NULL, const gl_constant_value *, param_count); > prog_data.base.image_param = > diff --git a/src/mesa/drivers/dri/i965/gen6_constant_state.c > b/src/mesa/drivers/dri/i965/gen6_constant_state.c > index 6c0c32b..3d0caf2 100644 > --- a/src/mesa/drivers/dri/i965/gen6_constant_state.c > +++ b/src/mesa/drivers/dri/i965/gen6_constant_state.c > @@ -135,7 +135,9 @@ gen6_upload_push_constants(struct brw_context *brw, > _mesa_load_state_parameters(ctx, prog->Parameters); > > gl_constant_value *param; > - int i; > + int i, p; > + gl_constant_value zero; > + zero.u = 0; > > param = brw_state_batch(brw, type, > prog_data->nr_params * > sizeof(gl_constant_value), > @@ -149,8 +151,12 @@ gen6_upload_push_constants(struct brw_context *brw, > * side effect of dereferencing uniforms, so _NEW_PROGRAM_CONSTANTS > * wouldn't be set for them. > */ > - for (i = 0; i < prog_data->nr_params; i++) { > - param[i] = *prog_data->param[i]; > + for (i = 0, p = 0; i < prog_data->nr_params; i++) { > + if (p > 0 && prog_data->param_padding && > + prog_data->param_padding[p] > prog_data->param_padding[p - 1]) { > + param[i++] = zero; > + } > + param[i] = *prog_data->param[p++]; > } > > if (0) { > -- > 2.5.0 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev