On Fri, May 27, 2016 at 11:24 AM, Jordan Justen <jordan.l.jus...@intel.com> wrote:
> This thread ID uniform will be used to compute the > gl_LocalInvocationIndex and gl_LocalInvocationID values. > > It is important for this uniform to be added in the last push constant > register. fs_visitor::assign_constant_locations is updated to make > sure this happens. > > The reason this is important is that the cross-thread push constant > registers are loaded first, and the per-thread push constant registers > are loaded after that. (Broadwell adds another push constant upload > mechanism which reverses this order, but we are ignoring this for > now.) > > v2: > * Add variable in intrinsics lowering pass > * Make sure the ID is pushed last in assign_constant_locations, and > that we save a spot for the ID in the push constants > > Signed-off-by: Jordan Justen <jordan.l.jus...@intel.com> > --- > src/mesa/drivers/dri/i965/brw_compiler.h | 1 + > src/mesa/drivers/dri/i965/brw_fs.cpp | 49 > +++++++++++++++++++++++++++++--- > 2 files changed, 46 insertions(+), 4 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h > b/src/mesa/drivers/dri/i965/brw_compiler.h > index a8fb486..f8379bc 100644 > --- a/src/mesa/drivers/dri/i965/brw_compiler.h > +++ b/src/mesa/drivers/dri/i965/brw_compiler.h > @@ -433,6 +433,7 @@ struct brw_cs_prog_data { > bool uses_barrier; > bool uses_num_work_groups; > unsigned local_invocation_id_regs; > + int thread_local_id_index; > > struct { > /** @{ > diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp > b/src/mesa/drivers/dri/i965/brw_fs.cpp > index bb2caa5..82b6781 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp > @@ -2086,6 +2086,10 @@ fs_visitor::assign_constant_locations() > bool contiguous[uniforms]; > memset(contiguous, 0, sizeof(contiguous)); > > + int thread_local_id_index = > + (stage == MESA_SHADER_COMPUTE) ? > + ((brw_cs_prog_data*)stage_prog_data)->thread_local_id_index : -1; > + > /* First, we walk through the instructions and do two things: > * > * 1) Figure out which uniforms are live. > @@ -2130,6 +2134,9 @@ fs_visitor::assign_constant_locations() > } > } > > + if (thread_local_id_index >= 0 && !is_live[thread_local_id_index]) > + thread_local_id_index = -1; > + > /* Only allow 16 registers (128 uniform components) as push constants. > * > * Just demote the end of the list. We could probably do better > @@ -2158,6 +2165,9 @@ fs_visitor::assign_constant_locations() > > int chunk_start = -1; > > + /* We may need to save a slot for the thread ID */ > + unsigned int saved_slots = thread_local_id_index >= 0 ? 1 : 0; > + > /* First push 64-bit uniforms to ensure they are properly aligned */ > for (unsigned u = 0; u < uniforms; u++) { > if (!is_live[u] || !is_live_64bit[u]) > @@ -2166,8 +2176,8 @@ fs_visitor::assign_constant_locations() > set_push_pull_constant_loc(u, &chunk_start, contiguous[u], > push_constant_loc, pull_constant_loc, > &num_push_constants, &num_pull_constants, > - max_push_components, max_chunk_size, > - stage_prog_data); > + max_push_components - saved_slots, > + max_chunk_size, stage_prog_data); > This seems a bit heavy-handed. I don't think we need to subtract saved_slots from max_push_components. It's just a heuristic and if we end up burning one extra register, oh well. Some day, we could try and make it smarter but I think "max_push_components -= 1" is probably as good as anything for that. > > } > > @@ -2176,13 +2186,29 @@ fs_visitor::assign_constant_locations() > if (!is_live[u] || is_live_64bit[u]) > continue; > > + /* Skip thread_local_id_index to put it in the last push register. > */ > + if (thread_local_id_index == (int)u) > + continue; > + > + set_push_pull_constant_loc(u, &chunk_start, contiguous[u], > + push_constant_loc, pull_constant_loc, > + &num_push_constants, &num_pull_constants, > + max_push_components - saved_slots, > + max_chunk_size, stage_prog_data); > + } > + > + if (thread_local_id_index >= 0) { > + /* Add the CS thread ID uniform at the end */ > + unsigned u = thread_local_id_index; > set_push_pull_constant_loc(u, &chunk_start, contiguous[u], > push_constant_loc, pull_constant_loc, > &num_push_constants, &num_pull_constants, > - max_push_components, max_chunk_size, > - stage_prog_data); > + max_push_components, > + max_chunk_size, stage_prog_data); > + assert(push_constant_loc[u] >= 0); > Why not just if (thread_local_id_index >= 0) push_constant_loc[thread_local_id_index] = num_push_constants++ Seems a lot better than calling a really complicated helper whose one job is to decide whether or not to push/pull something when we *always* want push. > } > > + > Extra line > /* As the uniforms are going to be reordered, take the data from a > temporary > * copy of the original param[]. > */ > @@ -2201,6 +2227,7 @@ fs_visitor::assign_constant_locations() > * push_constant_loc[i] <= i and we can do it in one smooth loop > without > * having to make a copy. > */ > + int new_thread_local_id_index = -1; > for (unsigned int i = 0; i < uniforms; i++) { > const gl_constant_value *value = param[i]; > > @@ -2208,9 +2235,15 @@ fs_visitor::assign_constant_locations() > stage_prog_data->pull_param[pull_constant_loc[i]] = value; > } else if (push_constant_loc[i] != -1) { > stage_prog_data->param[push_constant_loc[i]] = value; > + if (thread_local_id_index == (int)i) > + new_thread_local_id_index = push_constant_loc[i]; > } > } > ralloc_free(param); > + > + if (stage == MESA_SHADER_COMPUTE) > + ((brw_cs_prog_data*)stage_prog_data)->thread_local_id_index = > + new_thread_local_id_index; > } > > /** > @@ -6185,6 +6218,14 @@ brw_compile_cs(const struct brw_compiler *compiler, > void *log_data, > shader->info.cs.local_size[0] * shader->info.cs.local_size[1] * > shader->info.cs.local_size[2]; > > + prog_data->thread_local_id_index = -1; > + nir_foreach_variable(var, &shader->uniforms) { > + if (strcmp(var->name, "cs_thread_local_id") == 0) { > Hrm... this seems a bit ugly... I'll think about it a bit. > + prog_data->thread_local_id_index = var->data.driver_location / 4; > + break; > + } > + } > + > unsigned max_cs_threads = compiler->devinfo->max_cs_threads; > unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, > max_cs_threads); > > -- > 2.8.1 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/mesa-dev >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev