Previously, we added an extra uniform and inserted a NIR load_uniform intrinsic so that we could let NIR optimize it while avoiding adding a new NIR intrinsic. However, this ends up being a dirtier solution than one would like. This commit changes adds a new NIR intrinsic for intel thread local ID and makes that turn into a MOV from the UNIFORM file with a particular uniform number. The advantage to this is that the adding of the thread local ID uniform and assigning it's index is entirely contained within fs_visitor and the thread_local_id_index field of brw_cs_prog_data stops being an inout parameter. This also makes setting up prog data prior to calling brw_compile_cs less error-prone. --- src/compiler/nir/nir_intrinsics.h | 3 ++ src/intel/compiler/brw_fs.cpp | 62 ++++++++++++++------------------- src/intel/compiler/brw_fs.h | 5 +++ src/intel/compiler/brw_fs_nir.cpp | 9 +++++ src/intel/compiler/brw_fs_visitor.cpp | 1 + src/intel/compiler/brw_nir.h | 3 +- src/intel/compiler/brw_nir_intrinsics.c | 51 ++++++--------------------- src/intel/vulkan/anv_cmd_buffer.c | 10 +++--- src/intel/vulkan/anv_pipeline.c | 4 --- src/mesa/drivers/dri/i965/brw_cs.c | 3 -- 10 files changed, 60 insertions(+), 91 deletions(-)
diff --git a/src/compiler/nir/nir_intrinsics.h b/src/compiler/nir/nir_intrinsics.h index 0de7080..9389b74 100644 --- a/src/compiler/nir/nir_intrinsics.h +++ b/src/compiler/nir/nir_intrinsics.h @@ -364,6 +364,9 @@ SYSTEM_VALUE(blend_const_color_a_float, 1, 0, xx, xx, xx) SYSTEM_VALUE(blend_const_color_rgba8888_unorm, 1, 0, xx, xx, xx) SYSTEM_VALUE(blend_const_color_aaaa8888_unorm, 1, 0, xx, xx, xx) +/* Intel specific system values */ +SYSTEM_VALUE(intel_thread_local_id, 1, 0, xx, xx, xx) + /** * Barycentric coordinate intrinsics. * diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 90d9b22..baa940e 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -1400,7 +1400,12 @@ fs_visitor::emit_gs_thread_end() void fs_visitor::assign_curb_setup() { - unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8); + unsigned num_push_constants = stage_prog_data->nr_params; + if (stage == MESA_SHADER_COMPUTE && + brw_cs_prog_data(stage_prog_data)->thread_local_id_index >= 0) + num_push_constants++; + + unsigned uniform_push_length = DIV_ROUND_UP(num_push_constants, 8); unsigned ubo_push_length = 0; unsigned ubo_push_start[4]; @@ -1965,10 +1970,6 @@ fs_visitor::assign_constant_locations() bool contiguous[uniforms]; memset(contiguous, 0, sizeof(contiguous)); - int thread_local_id_index = - (stage == MESA_SHADER_COMPUTE) ? - brw_cs_prog_data(stage_prog_data)->thread_local_id_index : -1; - /* First, we walk through the instructions and do two things: * * 1) Figure out which uniforms are live. @@ -2011,9 +2012,6 @@ fs_visitor::assign_constant_locations() } } - if (thread_local_id_index >= 0 && !is_live[thread_local_id_index]) - thread_local_id_index = -1; - /* Only allow 16 registers (128 uniform components) as push constants. * * Just demote the end of the list. We could probably do better @@ -2080,9 +2078,12 @@ fs_visitor::assign_constant_locations() stage_prog_data); } - /* Add the CS local thread ID uniform at the end of the push constants */ - if (thread_local_id_index >= 0) - push_constant_loc[thread_local_id_index] = num_push_constants++; + /* Add the CS local thread ID uniform at the end of the push constants. + * We don't increment num_push_constants because this never actually ends + * up in the params array. + */ + if (thread_local_id_index >= 0 && is_live[thread_local_id_index]) + push_constant_loc[thread_local_id_index] = num_push_constants; /* As the uniforms are going to be reordered, take the data from a temporary * copy of the original param[]. @@ -2116,23 +2117,23 @@ fs_visitor::assign_constant_locations() * push_constant_loc[i] <= i and we can do it in one smooth loop without * having to make a copy. */ - int new_thread_local_id_index = -1; for (unsigned int i = 0; i < uniforms; i++) { const gl_constant_value *value = param[i]; + if (thread_local_id_index == (int)i) + continue; + if (pull_constant_loc[i] != -1) { stage_prog_data->pull_param[pull_constant_loc[i]] = value; } else if (push_constant_loc[i] != -1) { stage_prog_data->param[push_constant_loc[i]] = value; - if (thread_local_id_index == (int)i) - new_thread_local_id_index = push_constant_loc[i]; } } ralloc_free(param); if (stage == MESA_SHADER_COMPUTE) brw_cs_prog_data(stage_prog_data)->thread_local_id_index = - new_thread_local_id_index; + push_constant_loc[thread_local_id_index]; } bool @@ -6698,29 +6699,27 @@ cs_fill_push_const_info(const struct gen_device_info *devinfo, struct brw_cs_prog_data *cs_prog_data) { const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; - bool fill_thread_id = - cs_prog_data->thread_local_id_index >= 0 && - cs_prog_data->thread_local_id_index < (int)prog_data->nr_params; bool cross_thread_supported = devinfo->gen > 7 || devinfo->is_haswell; + bool fill_thread_id = cs_prog_data->thread_local_id_index >= 0; /* The thread ID should be stored in the last param dword */ - assert(prog_data->nr_params > 0 || !fill_thread_id); - assert(!fill_thread_id || - cs_prog_data->thread_local_id_index == - (int)prog_data->nr_params - 1); + if (fill_thread_id) + assert(cs_prog_data->thread_local_id_index == (int)prog_data->nr_params); + + const unsigned dwords = prog_data->nr_params + fill_thread_id; unsigned cross_thread_dwords, per_thread_dwords; if (!cross_thread_supported) { cross_thread_dwords = 0u; - per_thread_dwords = prog_data->nr_params; + per_thread_dwords = dwords; } else if (fill_thread_id) { /* Fill all but the last register with cross-thread payload */ cross_thread_dwords = 8 * (cs_prog_data->thread_local_id_index / 8); - per_thread_dwords = prog_data->nr_params - cross_thread_dwords; + per_thread_dwords = dwords - cross_thread_dwords; assert(per_thread_dwords > 0 && per_thread_dwords <= 8); } else { /* Fill all data using cross-thread payload */ - cross_thread_dwords = prog_data->nr_params; + cross_thread_dwords = dwords; per_thread_dwords = 0u; } @@ -6736,7 +6735,7 @@ cs_fill_push_const_info(const struct gen_device_info *devinfo, cs_prog_data->push.per_thread.size == 0); assert(cs_prog_data->push.cross_thread.dwords + cs_prog_data->push.per_thread.dwords == - prog_data->nr_params); + prog_data->nr_params + fill_thread_id); } static void @@ -6760,16 +6759,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, { nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true); - - /* Now that we cloned the nir_shader, we can update num_uniforms based on - * the thread_local_id_index. - */ - assert(prog_data->thread_local_id_index >= 0); - shader->num_uniforms = - MAX2(shader->num_uniforms, - (unsigned)4 * (prog_data->thread_local_id_index + 1)); - - brw_nir_lower_intrinsics(shader, &prog_data->base); + brw_nir_lower_intrinsics(shader); shader = brw_postprocess_nir(shader, compiler, true); prog_data->local_size[0] = shader->info.cs.local_size[0]; diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h index 0b5126e..e0073d4 100644 --- a/src/intel/compiler/brw_fs.h +++ b/src/intel/compiler/brw_fs.h @@ -315,6 +315,11 @@ public: */ int *push_constant_loc; + /** + * Uniform index of the compute shader thread id + */ + int thread_local_id_index; + fs_reg frag_depth; fs_reg frag_stencil; fs_reg sample_mask; diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index c2148c3..ca82209 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -69,6 +69,9 @@ void fs_visitor::nir_setup_uniforms() { uniforms = nir->num_uniforms / 4; + + if (stage == MESA_SHADER_COMPUTE) + thread_local_id_index = uniforms++; } static bool @@ -3393,6 +3396,12 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, cs_prog_data->uses_barrier = true; break; + case nir_intrinsic_load_intel_thread_local_id: { + fs_reg uniform(UNIFORM, thread_local_id_index, BRW_REGISTER_TYPE_UD); + bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), uniform); + break; + } + case nir_intrinsic_load_local_invocation_id: case nir_intrinsic_load_work_group_id: { gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp index ad5124c..75ae463 100644 --- a/src/intel/compiler/brw_fs_visitor.cpp +++ b/src/intel/compiler/brw_fs_visitor.cpp @@ -887,6 +887,7 @@ fs_visitor::init() this->last_scratch = 0; this->pull_constant_loc = NULL; this->push_constant_loc = NULL; + this->thread_local_id_index = -1; this->promoted_constants = 0, diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h index 560027c..df73303 100644 --- a/src/intel/compiler/brw_nir.h +++ b/src/intel/compiler/brw_nir.h @@ -95,8 +95,7 @@ void brw_nir_analyze_boolean_resolves(nir_shader *nir); nir_shader *brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir); -bool brw_nir_lower_intrinsics(nir_shader *nir, - struct brw_stage_prog_data *prog_data); +bool brw_nir_lower_intrinsics(nir_shader *nir); void brw_nir_lower_vs_inputs(nir_shader *nir, bool use_legacy_snorm_formula, const uint8_t *vs_attrib_wa_flags); diff --git a/src/intel/compiler/brw_nir_intrinsics.c b/src/intel/compiler/brw_nir_intrinsics.c index abbbc6f..c4f6082 100644 --- a/src/intel/compiler/brw_nir_intrinsics.c +++ b/src/intel/compiler/brw_nir_intrinsics.c @@ -26,45 +26,12 @@ struct lower_intrinsics_state { nir_shader *nir; - union { - struct brw_stage_prog_data *prog_data; - struct brw_cs_prog_data *cs_prog_data; - }; nir_function_impl *impl; bool progress; nir_builder builder; - bool cs_thread_id_used; + unsigned local_workgroup_size; }; -static nir_ssa_def * -read_thread_local_id(struct lower_intrinsics_state *state) -{ - nir_builder *b = &state->builder; - nir_shader *nir = state->nir; - const unsigned *sizes = nir->info.cs.local_size; - const unsigned group_size = sizes[0] * sizes[1] * sizes[2]; - - /* Some programs have local_size dimensions so small that the thread local - * ID will always be 0. - */ - if (group_size <= 8) - return nir_imm_int(b, 0); - - assert(state->cs_prog_data->thread_local_id_index >= 0); - state->cs_thread_id_used = true; - const int id_index = state->cs_prog_data->thread_local_id_index; - - nir_intrinsic_instr *load = - nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform); - load->num_components = 1; - load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0)); - nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL); - nir_intrinsic_set_base(load, id_index * sizeof(uint32_t)); - nir_intrinsic_set_range(load, sizeof(uint32_t)); - nir_builder_instr_insert(b, &load->instr); - return &load->dest.ssa; -} - static bool lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state, nir_block *block) @@ -90,7 +57,12 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state, * gl_LocalInvocationIndex = * cs_thread_local_id + subgroup_invocation; */ - nir_ssa_def *thread_local_id = read_thread_local_id(state); + nir_ssa_def *thread_local_id; + if (state->local_workgroup_size <= 8) + thread_local_id = nir_imm_int(b, 0); + else + thread_local_id = nir_load_intel_thread_local_id(b); + nir_ssa_def *channel = nir_load_subgroup_invocation(b); sysval = nir_iadd(b, channel, thread_local_id); break; @@ -156,7 +128,7 @@ lower_cs_intrinsics_convert_impl(struct lower_intrinsics_state *state) } bool -brw_nir_lower_intrinsics(nir_shader *nir, struct brw_stage_prog_data *prog_data) +brw_nir_lower_intrinsics(nir_shader *nir) { /* Currently we only lower intrinsics for compute shaders */ if (nir->stage != MESA_SHADER_COMPUTE) @@ -166,7 +138,9 @@ brw_nir_lower_intrinsics(nir_shader *nir, struct brw_stage_prog_data *prog_data) struct lower_intrinsics_state state; memset(&state, 0, sizeof(state)); state.nir = nir; - state.prog_data = prog_data; + state.local_workgroup_size = nir->info.cs.local_size[0] * + nir->info.cs.local_size[1] * + nir->info.cs.local_size[2]; do { state.progress = false; @@ -179,8 +153,5 @@ brw_nir_lower_intrinsics(nir_shader *nir, struct brw_stage_prog_data *prog_data) progress |= state.progress; } while (state.progress); - if (nir->stage == MESA_SHADER_COMPUTE && !state.cs_thread_id_used) - state.cs_prog_data->thread_local_id_index = -1; - return progress; } diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c index 3b59af8..c0d949c 100644 --- a/src/intel/vulkan/anv_cmd_buffer.c +++ b/src/intel/vulkan/anv_cmd_buffer.c @@ -706,13 +706,11 @@ anv_cmd_buffer_cs_push_constants(struct anv_cmd_buffer *cmd_buffer) cs_prog_data->push.cross_thread.regs); unsigned src = cs_prog_data->push.cross_thread.dwords; for ( ; src < prog_data->nr_params; src++, dst++) { - if (src != cs_prog_data->thread_local_id_index) { - uint32_t offset = (uintptr_t)prog_data->param[src]; - u32_map[dst] = *(uint32_t *)((uint8_t *)data + offset); - } else { - u32_map[dst] = t * cs_prog_data->simd_size; - } + uint32_t offset = (uintptr_t)prog_data->param[src]; + u32_map[dst] = *(uint32_t *)((uint8_t *)data + offset); } + if (cs_prog_data->thread_local_id_index >= 0) + u32_map[dst] = t; } } diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c index 94e99d8..d593f59 100644 --- a/src/intel/vulkan/anv_pipeline.c +++ b/src/intel/vulkan/anv_pipeline.c @@ -404,10 +404,6 @@ anv_pipeline_compile(struct anv_pipeline *pipeline, pipeline->needs_data_cache = true; } - if (stage == MESA_SHADER_COMPUTE) - ((struct brw_cs_prog_data *)prog_data)->thread_local_id_index = - prog_data->nr_params++; /* The CS Thread ID uniform */ - if (nir->info.num_ssbos > 0) pipeline->needs_data_cache = true; diff --git a/src/mesa/drivers/dri/i965/brw_cs.c b/src/mesa/drivers/dri/i965/brw_cs.c index cf72889..a8d6f37 100644 --- a/src/mesa/drivers/dri/i965/brw_cs.c +++ b/src/mesa/drivers/dri/i965/brw_cs.c @@ -84,9 +84,6 @@ brw_codegen_cs_prog(struct brw_context *brw, */ int param_count = cp->program.nir->num_uniforms / 4; - /* The backend also sometimes add a param for the thread local id. */ - prog_data.thread_local_id_index = param_count++; - /* The backend also sometimes adds params for texture size. */ param_count += 2 * ctx->Const.Program[MESA_SHADER_COMPUTE].MaxTextureImageUnits; prog_data.base.param = -- 2.5.0.400.gff86faf _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev