Thank you for reviewing this Jordan! On Fri, 1 Jun 2018 at 23:45, Jordan Justen <jordan.l.jus...@intel.com> wrote:
> On 2018-06-01 15:21:34, Plamena Manolova wrote: > > This patch adds the implentation of ARB_compute_variable_group_size > > for i965. We do this by storing the group size in a buffer surface, > > similarly to the work group number. > > > > Signed-off-by: Plamena Manolova <plamena.n.manol...@gmail.com> > > --- > > docs/features.txt | 2 +- > > docs/relnotes/18.2.0.html | 1 + > > src/compiler/nir/nir_lower_system_values.c | 14 ++++ > > src/intel/compiler/brw_compiler.h | 2 + > > src/intel/compiler/brw_fs.cpp | 45 ++++++++---- > > src/intel/compiler/brw_fs_nir.cpp | 20 ++++++ > > src/intel/compiler/brw_nir_lower_cs_intrinsics.c | 87 > +++++++++++++++++------- > > src/mesa/drivers/dri/i965/brw_compute.c | 25 ++++++- > > src/mesa/drivers/dri/i965/brw_context.h | 1 + > > src/mesa/drivers/dri/i965/brw_cs.c | 4 ++ > > src/mesa/drivers/dri/i965/brw_wm_surface_state.c | 27 +++++++- > > src/mesa/drivers/dri/i965/intel_extensions.c | 1 + > > 12 files changed, 187 insertions(+), 42 deletions(-) > > > > diff --git a/docs/features.txt b/docs/features.txt > > index ed4050cf98..7c3c856d73 100644 > > --- a/docs/features.txt > > +++ b/docs/features.txt > > @@ -298,7 +298,7 @@ Khronos, ARB, and OES extensions that are not part > of any OpenGL or OpenGL ES ve > > > > GL_ARB_bindless_texture DONE (nvc0, > radeonsi) > > GL_ARB_cl_event not started > > - GL_ARB_compute_variable_group_size DONE (nvc0, > radeonsi) > > + GL_ARB_compute_variable_group_size DONE (nvc0, > radeonsi, i965) > > GL_ARB_ES3_2_compatibility DONE > (i965/gen8+) > > GL_ARB_fragment_shader_interlock DONE (i965) > > GL_ARB_gpu_shader_int64 DONE > (i965/gen8+, nvc0, radeonsi, softpipe, llvmpipe) > > diff --git a/docs/relnotes/18.2.0.html b/docs/relnotes/18.2.0.html > > index a3f44a29dc..4ceeb7471f 100644 > > --- a/docs/relnotes/18.2.0.html > > +++ b/docs/relnotes/18.2.0.html > > @@ -45,6 +45,7 @@ Note: some of the new features are only available with > certain drivers. > > > > <ul> > > <li>GL_ARB_fragment_shader_interlock on i965</li> > > +<li>GL_ARB_compute_variable_group_size on i965</li> > > </ul> > > > > <h2>Bug fixes</h2> > > diff --git a/src/compiler/nir/nir_lower_system_values.c > b/src/compiler/nir/nir_lower_system_values.c > > index 487da04262..0af6d69426 100644 > > --- a/src/compiler/nir/nir_lower_system_values.c > > +++ b/src/compiler/nir/nir_lower_system_values.c > > @@ -57,6 +57,15 @@ convert_block(nir_block *block, nir_builder *b) > > * gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID" > > */ > > > > + > > Extra line. > > > + /* > > + * If the local work group size is variable we can't lower > the global > > + * invocation id here. > > + */ > > + if (b->shader->info.cs.local_size_variable) { > > + break; > > + } > > + > > The indent looks off here. One extra space? > > > nir_const_value local_size; > > memset(&local_size, 0, sizeof(local_size)); > > local_size.u32[0] = b->shader->info.cs.local_size[0]; > > @@ -102,6 +111,11 @@ convert_block(nir_block *block, nir_builder *b) > > } > > > > case SYSTEM_VALUE_LOCAL_GROUP_SIZE: { > > + /* If the local work group size is variable we can't lower it > here */ > > + if (b->shader->info.cs.local_size_variable) { > > + break; > > + } > > + > > nir_const_value local_size; > > memset(&local_size, 0, sizeof(local_size)); > > local_size.u32[0] = b->shader->info.cs.local_size[0]; > > diff --git a/src/intel/compiler/brw_compiler.h > b/src/intel/compiler/brw_compiler.h > > index 8b4e6fe2e2..f54952c28f 100644 > > --- a/src/intel/compiler/brw_compiler.h > > +++ b/src/intel/compiler/brw_compiler.h > > @@ -759,6 +759,7 @@ struct brw_cs_prog_data { > > unsigned threads; > > bool uses_barrier; > > bool uses_num_work_groups; > > + bool uses_variable_group_size; > > > > struct { > > struct brw_push_const_block cross_thread; > > @@ -771,6 +772,7 @@ struct brw_cs_prog_data { > > * surface indices the CS-specific surfaces > > */ > > uint32_t work_groups_start; > > + uint32_t work_group_size_start; > > /** @} */ > > } binding_table; > > }; > > diff --git a/src/intel/compiler/brw_fs.cpp > b/src/intel/compiler/brw_fs.cpp > > index d67c0a4192..28730af47b 100644 > > --- a/src/intel/compiler/brw_fs.cpp > > +++ b/src/intel/compiler/brw_fs.cpp > > @@ -7228,18 +7228,32 @@ brw_compile_cs(const struct brw_compiler > *compiler, void *log_data, > > int shader_time_index, > > char **error_str) > > { > > - prog_data->local_size[0] = src_shader->info.cs.local_size[0]; > > - prog_data->local_size[1] = src_shader->info.cs.local_size[1]; > > - prog_data->local_size[2] = src_shader->info.cs.local_size[2]; > > - unsigned local_workgroup_size = > > - src_shader->info.cs.local_size[0] * > src_shader->info.cs.local_size[1] * > > - src_shader->info.cs.local_size[2]; > > - > > - unsigned min_dispatch_width = > > - DIV_ROUND_UP(local_workgroup_size, > compiler->devinfo->max_cs_threads); > > - min_dispatch_width = MAX2(8, min_dispatch_width); > > - min_dispatch_width = util_next_power_of_two(min_dispatch_width); > > - assert(min_dispatch_width <= 32); > > + unsigned min_dispatch_width; > > + > > + if (!src_shader->info.cs.local_size_variable) { > > + unsigned local_workgroup_size = > > + src_shader->info.cs.local_size[0] * > src_shader->info.cs.local_size[1] * > > + src_shader->info.cs.local_size[2]; > > + > > + min_dispatch_width = > > + DIV_ROUND_UP(local_workgroup_size, > compiler->devinfo->max_cs_threads); > > + min_dispatch_width = MAX2(8, min_dispatch_width); > > + min_dispatch_width = util_next_power_of_two(min_dispatch_width); > > + assert(min_dispatch_width <= 32); > > + > > + prog_data->local_size[0] = src_shader->info.cs.local_size[0]; > > + prog_data->local_size[1] = src_shader->info.cs.local_size[1]; > > + prog_data->local_size[2] = src_shader->info.cs.local_size[2]; > > + prog_data->uses_variable_group_size = false; > > + } else { > > + /* > > + * If the local work group size is variable we have to use a > dispatch > > + * width of 32 here, since at this point we don't know the actual > size of > > + * the workload. > > + */ > > + min_dispatch_width = 32; > > If we find cases where this leads to bad perf, we could look into > generating other sizes too. If they end up using a smaller size, then > we might want to run the simd8 or simd16 instead if there is spilling. > I can have a go at making a patch for that if you think it'll improve performance. > But, this seems like a good first step. > > > + prog_data->uses_variable_group_size = true; > > + } > > > > fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL; > > cfg_t *cfg = NULL; > > @@ -7324,7 +7338,12 @@ brw_compile_cs(const struct brw_compiler > *compiler, void *log_data, > > } > > } else { > > cfg = v32->cfg; > > - cs_set_simd_size(prog_data, 32); > > + if (!src_shader->info.cs.local_size_variable) { > > + cs_set_simd_size(prog_data, 32); > > + } else { > > + prog_data->simd_size = 32; > > + prog_data->threads = compiler->devinfo->max_cs_threads; > > + } > > cs_fill_push_const_info(compiler->devinfo, prog_data); > > promoted_constants = v32->promoted_constants; > > } > > diff --git a/src/intel/compiler/brw_fs_nir.cpp > b/src/intel/compiler/brw_fs_nir.cpp > > index 166da0aa6d..c4948c2347 100644 > > --- a/src/intel/compiler/brw_fs_nir.cpp > > +++ b/src/intel/compiler/brw_fs_nir.cpp > > @@ -3766,6 +3766,26 @@ fs_visitor::nir_emit_cs_intrinsic(const > fs_builder &bld, > > break; > > } > > > > + case nir_intrinsic_load_local_group_size: { > > + const unsigned surface = > > + cs_prog_data->binding_table.work_group_size_start; > > + > > + fs_reg surf_index = brw_imm_ud(surface); > > + brw_mark_surface_used(prog_data, surface); > > + > > + /* Read the 3 GLuint components of gl_NumWorkGroups */ > > + for (unsigned i = 0; i < 3; i++) { > > + fs_reg read_result = > > + emit_untyped_read(bld, surf_index, > > + brw_imm_ud(i << 2), > > + 1 /* dims */, 1 /* size */, > > + BRW_PREDICATE_NONE); > > + read_result.type = dest.type; > > + bld.MOV(dest, read_result); > > + dest = offset(dest, bld, 1); > > + } > > + break; > > + } > > default: > > nir_emit_intrinsic(bld, instr); > > break; > > diff --git a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c > b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c > > index bfbdea0e8f..e9c9d34502 100644 > > --- a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c > > +++ b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c > > @@ -58,10 +58,12 @@ lower_cs_intrinsics_convert_block(struct > lower_intrinsics_state *state, > > * cs_thread_local_id + subgroup_invocation; > > */ > > nir_ssa_def *subgroup_id; > > - if (state->local_workgroup_size <= state->dispatch_width) > > + if ((state->local_workgroup_size <= state->dispatch_width) && > > + !state->nir->info.cs.local_size_variable) { > > subgroup_id = nir_imm_int(b, 0); > > - else > > + } else { > > subgroup_id = nir_load_subgroup_id(b); > > + } > > > > nir_ssa_def *thread_local_id = > > nir_imul(b, subgroup_id, nir_imm_int(b, > state->dispatch_width)); > > @@ -84,43 +86,80 @@ lower_cs_intrinsics_convert_block(struct > lower_intrinsics_state *state, > > * (gl_WorkGroupSize.x * gl_WorkGroupSize.y)) % > > * gl_WorkGroupSize.z; > > */ > > - unsigned *size = nir->info.cs.local_size; > > - > > nir_ssa_def *local_index = nir_load_local_invocation_index(b); > > - > > - nir_const_value uvec3; > > - memset(&uvec3, 0, sizeof(uvec3)); > > - uvec3.u32[0] = 1; > > - uvec3.u32[1] = size[0]; > > - uvec3.u32[2] = size[0] * size[1]; > > - nir_ssa_def *div_val = nir_build_imm(b, 3, 32, uvec3); > > - uvec3.u32[0] = size[0]; > > - uvec3.u32[1] = size[1]; > > - uvec3.u32[2] = size[2]; > > - nir_ssa_def *mod_val = nir_build_imm(b, 3, 32, uvec3); > > - > > - sysval = nir_umod(b, nir_udiv(b, local_index, div_val), > mod_val); > > + if (!state->nir->info.cs.local_size_variable) { > > + unsigned *size = nir->info.cs.local_size; > > + > > + nir_const_value uvec3; > > + memset(&uvec3, 0, sizeof(uvec3)); > > + uvec3.u32[0] = 1; > > + uvec3.u32[1] = size[0]; > > + uvec3.u32[2] = size[0] * size[1]; > > + nir_ssa_def *div_val = nir_build_imm(b, 3, 32, uvec3); > > + uvec3.u32[0] = size[0]; > > + uvec3.u32[1] = size[1]; > > + uvec3.u32[2] = size[2]; > > + nir_ssa_def *mod_val = nir_build_imm(b, 3, 32, uvec3); > > + > > + sysval = nir_umod(b, nir_udiv(b, local_index, div_val), > mod_val); > > + } else { > > + nir_ssa_def *group_size_xyz = nir_load_local_group_size(b); > > + nir_ssa_def *group_size_x = nir_channel(b, group_size_xyz, > 0); > > + nir_ssa_def *group_size_y = nir_channel(b, group_size_xyz, > 1); > > + nir_ssa_def *group_size_z = nir_channel(b, group_size_xyz, > 2); > > + nir_ssa_def *result[3]; > > + result[0] = nir_umod(b, local_index, group_size_x); > > + result[1] = nir_umod(b, nir_udiv(b, local_index, > group_size_x), > > + group_size_y); > > + result[2] = nir_umod(b, nir_udiv(b, local_index, > > + nir_umul_high(b, group_size_x, group_size_y)), > group_size_z); > > + > > + sysval = nir_vec(b, result, 3); > > + } > > break; > > } > > > > case nir_intrinsic_load_subgroup_id: > > - if (state->local_workgroup_size > 8) > > + if (state->local_workgroup_size > 8 || > > + state->nir->info.cs.local_size_variable) { > > continue; > > + } > > > > /* For small workgroup sizes, we know subgroup_id will be zero > */ > > sysval = nir_imm_int(b, 0); > > break; > > > > case nir_intrinsic_load_num_subgroups: { > > - unsigned local_workgroup_size = > > - nir->info.cs.local_size[0] * nir->info.cs.local_size[1] * > > - nir->info.cs.local_size[2]; > > - unsigned num_subgroups = > > - DIV_ROUND_UP(local_workgroup_size, state->dispatch_width); > > - sysval = nir_imm_int(b, num_subgroups); > > + if (!state->nir->info.cs.local_size_variable) { > > + unsigned num_subgroups; > > + unsigned local_workgroup_size = > > + nir->info.cs.local_size[0] * nir->info.cs.local_size[1] * > > + nir->info.cs.local_size[2]; > > + num_subgroups = > > + DIV_ROUND_UP(local_workgroup_size, > state->dispatch_width); > > + sysval = nir_imm_int(b, num_subgroups); > > + } else { > > + nir_ssa_def *dispatch_width = nir_imm_int(b, > state->dispatch_width); > > + nir_ssa_def *group_size_xyz = nir_load_local_group_size(b); > > + nir_ssa_def *group_size_x = nir_channel(b, group_size_xyz, > 0); > > + nir_ssa_def *group_size_y = nir_channel(b, group_size_xyz, > 1); > > + nir_ssa_def *group_size_z = nir_channel(b, group_size_xyz, > 2); > > + nir_ssa_def *group_size = nir_imul(b, group_size_x, > nir_imul(b, > > + group_size_y, group_size_z)); > > + > > + sysval = nir_udiv(b, group_size, dispatch_width); > > I guess for DIV_ROUND_UP like above, you'd want to add > (dispatch_width - 1) before the udiv. > > -Jordan > > > + } > > break; > > } > > > > + case nir_intrinsic_load_global_invocation_id: { > > + nir_ssa_def *group_id = nir_load_work_group_id(b); > > + nir_ssa_def *local_id = nir_load_local_invocation_id(b); > > + nir_ssa_def *group_size = nir_load_local_group_size(b); > > + > > + sysval = nir_iadd(b, nir_imul(b, group_id, group_size), > local_id); > > + break; > > + } > > default: > > continue; > > } > > diff --git a/src/mesa/drivers/dri/i965/brw_compute.c > b/src/mesa/drivers/dri/i965/brw_compute.c > > index de08fc3ac1..7949e0ff51 100644 > > --- a/src/mesa/drivers/dri/i965/brw_compute.c > > +++ b/src/mesa/drivers/dri/i965/brw_compute.c > > @@ -121,8 +121,11 @@ brw_emit_gpgpu_walker(struct brw_context *brw) > > } > > > > const unsigned simd_size = prog_data->simd_size; > > - unsigned group_size = prog_data->local_size[0] * > > - prog_data->local_size[1] * prog_data->local_size[2]; > > + unsigned group_size = brw->compute.group_size != NULL ? > > + brw->compute.group_size[0] * brw->compute.group_size[1] * > > + brw->compute.group_size[2] : prog_data->local_size[0] * > > + prog_data->local_size[1] * prog_data->local_size[2]; > > + > > unsigned thread_width_max = > > (group_size + simd_size - 1) / simd_size; > > > > @@ -229,6 +232,7 @@ brw_dispatch_compute(struct gl_context *ctx, const > GLuint *num_groups) { > > > > brw->compute.num_work_groups_bo = NULL; > > brw->compute.num_work_groups = num_groups; > > + brw->compute.group_size = NULL; > > ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS; > > > > brw_dispatch_compute_common(ctx); > > @@ -248,6 +252,22 @@ brw_dispatch_compute_indirect(struct gl_context > *ctx, GLintptr indirect) > > brw->compute.num_work_groups_bo = bo; > > brw->compute.num_work_groups_offset = indirect; > > brw->compute.num_work_groups = indirect_group_counts; > > + brw->compute.group_size = NULL; > > + ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS; > > + > > + brw_dispatch_compute_common(ctx); > > +} > > + > > +static void > > +brw_dispatch_compute_group_size(struct gl_context *ctx, > > + const GLuint *num_groups, > > + const GLuint *group_size) > > +{ > > + struct brw_context *brw = brw_context(ctx); > > + > > + brw->compute.num_work_groups_bo = NULL; > > + brw->compute.num_work_groups = num_groups; > > + brw->compute.group_size = group_size; > > ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS; > > > > brw_dispatch_compute_common(ctx); > > @@ -258,4 +278,5 @@ brw_init_compute_functions(struct dd_function_table > *functions) > > { > > functions->DispatchCompute = brw_dispatch_compute; > > functions->DispatchComputeIndirect = brw_dispatch_compute_indirect; > > + functions->DispatchComputeGroupSize = > brw_dispatch_compute_group_size; > > } > > diff --git a/src/mesa/drivers/dri/i965/brw_context.h > b/src/mesa/drivers/dri/i965/brw_context.h > > index 2613b9fda2..0fb533c369 100644 > > --- a/src/mesa/drivers/dri/i965/brw_context.h > > +++ b/src/mesa/drivers/dri/i965/brw_context.h > > @@ -931,6 +931,7 @@ struct brw_context > > struct brw_bo *num_work_groups_bo; > > GLintptr num_work_groups_offset; > > const GLuint *num_work_groups; > > + const GLuint *group_size; > > } compute; > > > > struct { > > diff --git a/src/mesa/drivers/dri/i965/brw_cs.c > b/src/mesa/drivers/dri/i965/brw_cs.c > > index e3f8fc67a4..007273390b 100644 > > --- a/src/mesa/drivers/dri/i965/brw_cs.c > > +++ b/src/mesa/drivers/dri/i965/brw_cs.c > > @@ -43,6 +43,10 @@ assign_cs_binding_table_offsets(const struct > gen_device_info *devinfo, > > prog_data->binding_table.work_groups_start = > next_binding_table_offset; > > next_binding_table_offset++; > > > > + /* May not be used if the work group size is not variable. */ > > + prog_data->binding_table.work_group_size_start = > next_binding_table_offset; > > + next_binding_table_offset++; > > + > > brw_assign_common_binding_table_offsets(devinfo, prog, > &prog_data->base, > > next_binding_table_offset); > > } > > diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c > b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c > > index 73cae9ef7c..fa8851e2b4 100644 > > --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c > > +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c > > @@ -1634,7 +1634,7 @@ const struct brw_tracked_state > brw_wm_image_surfaces = { > > }; > > > > static void > > -brw_upload_cs_work_groups_surface(struct brw_context *brw) > > +brw_upload_cs_variable_surfaces(struct brw_context *brw) > > { > > struct gl_context *ctx = &brw->ctx; > > /* _NEW_PROGRAM */ > > @@ -1671,6 +1671,29 @@ brw_upload_cs_work_groups_surface(struct > brw_context *brw) > > RELOC_WRITE); > > brw->ctx.NewDriverState |= BRW_NEW_SURFACES; > > } > > + > > + if (prog && cs_prog_data->uses_variable_group_size) { > > + const unsigned surf_idx = > > + cs_prog_data->binding_table.work_group_size_start; > > + uint32_t *surf_offset = &brw->cs.base.surf_offset[surf_idx]; > > + struct brw_bo *bo; > > + uint32_t bo_offset; > > + > > + bo = NULL; > > + brw_upload_data(&brw->upload, > > + (void *)brw->compute.group_size, > > + 3 * sizeof(GLuint), > > + sizeof(GLuint), > > + &bo, > > + &bo_offset); > > + > > + brw_emit_buffer_surface_state(brw, surf_offset, > > + bo, bo_offset, > > + ISL_FORMAT_RAW, > > + 3 * sizeof(GLuint), 1, > > + RELOC_WRITE); > > + brw->ctx.NewDriverState |= BRW_NEW_SURFACES; > > + } > > } > > > > const struct brw_tracked_state brw_cs_work_groups_surface = { > > @@ -1678,5 +1701,5 @@ const struct brw_tracked_state > brw_cs_work_groups_surface = { > > .brw = BRW_NEW_CS_PROG_DATA | > > BRW_NEW_CS_WORK_GROUPS > > }, > > - .emit = brw_upload_cs_work_groups_surface, > > + .emit = brw_upload_cs_variable_surfaces, > > }; > > diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c > b/src/mesa/drivers/dri/i965/intel_extensions.c > > index 5a9369d7b4..f213360ed8 100644 > > --- a/src/mesa/drivers/dri/i965/intel_extensions.c > > +++ b/src/mesa/drivers/dri/i965/intel_extensions.c > > @@ -258,6 +258,7 @@ intelInitExtensions(struct gl_context *ctx) > > ctx->Extensions.ARB_compute_shader = true; > > ctx->Extensions.ARB_ES3_1_compatibility = > > devinfo->gen >= 8 || devinfo->is_haswell; > > + ctx->Extensions.ARB_compute_variable_group_size = true; > > } > > > > if (can_do_predicate_writes(brw->screen)) { > > -- > > 2.11.0 > > >
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev