This patch adds the implementation of ARB_compute_variable_group_size for i965. We do this by storing the group size in a buffer surface, similarly to the work group number.
v2: Fix some indentation inconsistencies (Jordan, Ilia) Do DIV_ROUND_UP correctly in brw_nir_lower_cs_intrinsics.c (Jordan) Use alphabetical order in features.txt (Matt) Set the extension constants properly in brw_context.c Signed-off-by: Plamena Manolova <plamena.n.manol...@gmail.com> --- docs/features.txt | 2 +- docs/relnotes/18.2.0.html | 1 + src/compiler/nir/nir_lower_system_values.c | 13 ++++ src/intel/compiler/brw_compiler.h | 2 + src/intel/compiler/brw_fs.cpp | 45 ++++++++---- src/intel/compiler/brw_fs_nir.cpp | 20 ++++++ src/intel/compiler/brw_nir_lower_cs_intrinsics.c | 88 +++++++++++++++++------- src/mesa/drivers/dri/i965/brw_compute.c | 25 ++++++- src/mesa/drivers/dri/i965/brw_context.c | 6 ++ src/mesa/drivers/dri/i965/brw_context.h | 1 + src/mesa/drivers/dri/i965/brw_cs.c | 4 ++ src/mesa/drivers/dri/i965/brw_wm_surface_state.c | 27 +++++++- src/mesa/drivers/dri/i965/intel_extensions.c | 1 + 13 files changed, 193 insertions(+), 42 deletions(-) diff --git a/docs/features.txt b/docs/features.txt index ed4050cf98..81b6663288 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -298,7 +298,7 @@ Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL ES ve GL_ARB_bindless_texture DONE (nvc0, radeonsi) GL_ARB_cl_event not started - GL_ARB_compute_variable_group_size DONE (nvc0, radeonsi) + GL_ARB_compute_variable_group_size DONE (i965, nvc0, radeonsi) GL_ARB_ES3_2_compatibility DONE (i965/gen8+) GL_ARB_fragment_shader_interlock DONE (i965) GL_ARB_gpu_shader_int64 DONE (i965/gen8+, nvc0, radeonsi, softpipe, llvmpipe) diff --git a/docs/relnotes/18.2.0.html b/docs/relnotes/18.2.0.html index 0db37b620d..7475a56633 100644 --- a/docs/relnotes/18.2.0.html +++ b/docs/relnotes/18.2.0.html @@ -52,6 +52,7 @@ Note: some of the new features are only available with certain drivers. <ul> <li>GL_ARB_fragment_shader_interlock on i965</li> +<li>GL_ARB_compute_variable_group_size on i965</li> </ul> <h2>Bug fixes</h2> diff --git a/src/compiler/nir/nir_lower_system_values.c b/src/compiler/nir/nir_lower_system_values.c index 487da04262..7ab005b000 100644 --- a/src/compiler/nir/nir_lower_system_values.c +++ b/src/compiler/nir/nir_lower_system_values.c @@ -57,6 +57,14 @@ convert_block(nir_block *block, nir_builder *b) * gl_WorkGroupID * gl_WorkGroupSize + gl_LocalInvocationID" */ + /* + * If the local work group size is variable we can't lower the global + * invocation id here. + */ + if (b->shader->info.cs.local_size_variable) { + break; + } + nir_const_value local_size; memset(&local_size, 0, sizeof(local_size)); local_size.u32[0] = b->shader->info.cs.local_size[0]; @@ -102,6 +110,11 @@ convert_block(nir_block *block, nir_builder *b) } case SYSTEM_VALUE_LOCAL_GROUP_SIZE: { + /* If the local work group size is variable we can't lower it here */ + if (b->shader->info.cs.local_size_variable) { + break; + } + nir_const_value local_size; memset(&local_size, 0, sizeof(local_size)); local_size.u32[0] = b->shader->info.cs.local_size[0]; diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index 8b4e6fe2e2..f54952c28f 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -759,6 +759,7 @@ struct brw_cs_prog_data { unsigned threads; bool uses_barrier; bool uses_num_work_groups; + bool uses_variable_group_size; struct { struct brw_push_const_block cross_thread; @@ -771,6 +772,7 @@ struct brw_cs_prog_data { * surface indices the CS-specific surfaces */ uint32_t work_groups_start; + uint32_t work_group_size_start; /** @} */ } binding_table; }; diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index d67c0a4192..28730af47b 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -7228,18 +7228,32 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, int shader_time_index, char **error_str) { - prog_data->local_size[0] = src_shader->info.cs.local_size[0]; - prog_data->local_size[1] = src_shader->info.cs.local_size[1]; - prog_data->local_size[2] = src_shader->info.cs.local_size[2]; - unsigned local_workgroup_size = - src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] * - src_shader->info.cs.local_size[2]; - - unsigned min_dispatch_width = - DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads); - min_dispatch_width = MAX2(8, min_dispatch_width); - min_dispatch_width = util_next_power_of_two(min_dispatch_width); - assert(min_dispatch_width <= 32); + unsigned min_dispatch_width; + + if (!src_shader->info.cs.local_size_variable) { + unsigned local_workgroup_size = + src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] * + src_shader->info.cs.local_size[2]; + + min_dispatch_width = + DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads); + min_dispatch_width = MAX2(8, min_dispatch_width); + min_dispatch_width = util_next_power_of_two(min_dispatch_width); + assert(min_dispatch_width <= 32); + + prog_data->local_size[0] = src_shader->info.cs.local_size[0]; + prog_data->local_size[1] = src_shader->info.cs.local_size[1]; + prog_data->local_size[2] = src_shader->info.cs.local_size[2]; + prog_data->uses_variable_group_size = false; + } else { + /* + * If the local work group size is variable we have to use a dispatch + * width of 32 here, since at this point we don't know the actual size of + * the workload. + */ + min_dispatch_width = 32; + prog_data->uses_variable_group_size = true; + } fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL; cfg_t *cfg = NULL; @@ -7324,7 +7338,12 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, } } else { cfg = v32->cfg; - cs_set_simd_size(prog_data, 32); + if (!src_shader->info.cs.local_size_variable) { + cs_set_simd_size(prog_data, 32); + } else { + prog_data->simd_size = 32; + prog_data->threads = compiler->devinfo->max_cs_threads; + } cs_fill_push_const_info(compiler->devinfo, prog_data); promoted_constants = v32->promoted_constants; } diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp index 166da0aa6d..c4948c2347 100644 --- a/src/intel/compiler/brw_fs_nir.cpp +++ b/src/intel/compiler/brw_fs_nir.cpp @@ -3766,6 +3766,26 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, break; } + case nir_intrinsic_load_local_group_size: { + const unsigned surface = + cs_prog_data->binding_table.work_group_size_start; + + fs_reg surf_index = brw_imm_ud(surface); + brw_mark_surface_used(prog_data, surface); + + /* Read the 3 GLuint components of gl_NumWorkGroups */ + for (unsigned i = 0; i < 3; i++) { + fs_reg read_result = + emit_untyped_read(bld, surf_index, + brw_imm_ud(i << 2), + 1 /* dims */, 1 /* size */, + BRW_PREDICATE_NONE); + read_result.type = dest.type; + bld.MOV(dest, read_result); + dest = offset(dest, bld, 1); + } + break; + } default: nir_emit_intrinsic(bld, instr); break; diff --git a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c index bfbdea0e8f..096e86db19 100644 --- a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c +++ b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c @@ -58,10 +58,12 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state, * cs_thread_local_id + subgroup_invocation; */ nir_ssa_def *subgroup_id; - if (state->local_workgroup_size <= state->dispatch_width) + if ((state->local_workgroup_size <= state->dispatch_width) && + !state->nir->info.cs.local_size_variable) { subgroup_id = nir_imm_int(b, 0); - else + } else { subgroup_id = nir_load_subgroup_id(b); + } nir_ssa_def *thread_local_id = nir_imul(b, subgroup_id, nir_imm_int(b, state->dispatch_width)); @@ -84,43 +86,81 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state, * (gl_WorkGroupSize.x * gl_WorkGroupSize.y)) % * gl_WorkGroupSize.z; */ - unsigned *size = nir->info.cs.local_size; - nir_ssa_def *local_index = nir_load_local_invocation_index(b); - - nir_const_value uvec3; - memset(&uvec3, 0, sizeof(uvec3)); - uvec3.u32[0] = 1; - uvec3.u32[1] = size[0]; - uvec3.u32[2] = size[0] * size[1]; - nir_ssa_def *div_val = nir_build_imm(b, 3, 32, uvec3); - uvec3.u32[0] = size[0]; - uvec3.u32[1] = size[1]; - uvec3.u32[2] = size[2]; - nir_ssa_def *mod_val = nir_build_imm(b, 3, 32, uvec3); - - sysval = nir_umod(b, nir_udiv(b, local_index, div_val), mod_val); + if (!state->nir->info.cs.local_size_variable) { + unsigned *size = nir->info.cs.local_size; + + nir_const_value uvec3; + memset(&uvec3, 0, sizeof(uvec3)); + uvec3.u32[0] = 1; + uvec3.u32[1] = size[0]; + uvec3.u32[2] = size[0] * size[1]; + nir_ssa_def *div_val = nir_build_imm(b, 3, 32, uvec3); + uvec3.u32[0] = size[0]; + uvec3.u32[1] = size[1]; + uvec3.u32[2] = size[2]; + nir_ssa_def *mod_val = nir_build_imm(b, 3, 32, uvec3); + + sysval = nir_umod(b, nir_udiv(b, local_index, div_val), mod_val); + } else { + nir_ssa_def *group_size_xyz = nir_load_local_group_size(b); + nir_ssa_def *group_size_x = nir_channel(b, group_size_xyz, 0); + nir_ssa_def *group_size_y = nir_channel(b, group_size_xyz, 1); + nir_ssa_def *group_size_z = nir_channel(b, group_size_xyz, 2); + nir_ssa_def *result[3]; + result[0] = nir_umod(b, local_index, group_size_x); + result[1] = nir_umod(b, nir_udiv(b, local_index, group_size_x), + group_size_y); + result[2] = nir_umod(b, nir_udiv(b, local_index, + nir_umul_high(b, group_size_x, group_size_y)), group_size_z); + + sysval = nir_vec(b, result, 3); + } break; } case nir_intrinsic_load_subgroup_id: - if (state->local_workgroup_size > 8) + if (state->local_workgroup_size > 8 || + state->nir->info.cs.local_size_variable) { continue; + } /* For small workgroup sizes, we know subgroup_id will be zero */ sysval = nir_imm_int(b, 0); break; case nir_intrinsic_load_num_subgroups: { - unsigned local_workgroup_size = - nir->info.cs.local_size[0] * nir->info.cs.local_size[1] * - nir->info.cs.local_size[2]; - unsigned num_subgroups = - DIV_ROUND_UP(local_workgroup_size, state->dispatch_width); - sysval = nir_imm_int(b, num_subgroups); + if (!state->nir->info.cs.local_size_variable) { + unsigned num_subgroups; + unsigned local_workgroup_size = + nir->info.cs.local_size[0] * nir->info.cs.local_size[1] * + nir->info.cs.local_size[2]; + num_subgroups = + DIV_ROUND_UP(local_workgroup_size, state->dispatch_width); + sysval = nir_imm_int(b, num_subgroups); + } else { + nir_ssa_def *group_size_xyz = nir_load_local_group_size(b); + nir_ssa_def *group_size_x = nir_channel(b, group_size_xyz, 0); + nir_ssa_def *group_size_y = nir_channel(b, group_size_xyz, 1); + nir_ssa_def *group_size_z = nir_channel(b, group_size_xyz, 2); + nir_ssa_def *group_size = nir_imul(b, group_size_x, nir_imul(b, + group_size_y, group_size_z)); + nir_ssa_def *dispatch_width = nir_imm_int(b, + state->dispatch_width - 1); + + sysval = nir_udiv(b, group_size, dispatch_width); + } break; } + case nir_intrinsic_load_global_invocation_id: { + nir_ssa_def *group_id = nir_load_work_group_id(b); + nir_ssa_def *local_id = nir_load_local_invocation_id(b); + nir_ssa_def *group_size = nir_load_local_group_size(b); + + sysval = nir_iadd(b, nir_imul(b, group_id, group_size), local_id); + break; + } default: continue; } diff --git a/src/mesa/drivers/dri/i965/brw_compute.c b/src/mesa/drivers/dri/i965/brw_compute.c index de08fc3ac1..7949e0ff51 100644 --- a/src/mesa/drivers/dri/i965/brw_compute.c +++ b/src/mesa/drivers/dri/i965/brw_compute.c @@ -121,8 +121,11 @@ brw_emit_gpgpu_walker(struct brw_context *brw) } const unsigned simd_size = prog_data->simd_size; - unsigned group_size = prog_data->local_size[0] * - prog_data->local_size[1] * prog_data->local_size[2]; + unsigned group_size = brw->compute.group_size != NULL ? + brw->compute.group_size[0] * brw->compute.group_size[1] * + brw->compute.group_size[2] : prog_data->local_size[0] * + prog_data->local_size[1] * prog_data->local_size[2]; + unsigned thread_width_max = (group_size + simd_size - 1) / simd_size; @@ -229,6 +232,7 @@ brw_dispatch_compute(struct gl_context *ctx, const GLuint *num_groups) { brw->compute.num_work_groups_bo = NULL; brw->compute.num_work_groups = num_groups; + brw->compute.group_size = NULL; ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS; brw_dispatch_compute_common(ctx); @@ -248,6 +252,22 @@ brw_dispatch_compute_indirect(struct gl_context *ctx, GLintptr indirect) brw->compute.num_work_groups_bo = bo; brw->compute.num_work_groups_offset = indirect; brw->compute.num_work_groups = indirect_group_counts; + brw->compute.group_size = NULL; + ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS; + + brw_dispatch_compute_common(ctx); +} + +static void +brw_dispatch_compute_group_size(struct gl_context *ctx, + const GLuint *num_groups, + const GLuint *group_size) +{ + struct brw_context *brw = brw_context(ctx); + + brw->compute.num_work_groups_bo = NULL; + brw->compute.num_work_groups = num_groups; + brw->compute.group_size = group_size; ctx->NewDriverState |= BRW_NEW_CS_WORK_GROUPS; brw_dispatch_compute_common(ctx); @@ -258,4 +278,5 @@ brw_init_compute_functions(struct dd_function_table *functions) { functions->DispatchCompute = brw_dispatch_compute; functions->DispatchComputeIndirect = brw_dispatch_compute_indirect; + functions->DispatchComputeGroupSize = brw_dispatch_compute_group_size; } diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 9ced230ec1..25d354e155 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -766,6 +766,12 @@ brw_initialize_cs_context_constants(struct brw_context *brw) ctx->Const.MaxComputeWorkGroupSize[2] = max_invocations; ctx->Const.MaxComputeWorkGroupInvocations = max_invocations; ctx->Const.MaxComputeSharedMemorySize = 64 * 1024; + + /* ARB_compute_variable_group_size constants */ + ctx->Const.MaxComputeVariableGroupSize[0] = max_invocations; + ctx->Const.MaxComputeVariableGroupSize[1] = max_invocations; + ctx->Const.MaxComputeVariableGroupSize[2] = max_invocations; + ctx->Const.MaxComputeVariableGroupInvocations = max_invocations; } /** diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 2613b9fda2..0fb533c369 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -931,6 +931,7 @@ struct brw_context struct brw_bo *num_work_groups_bo; GLintptr num_work_groups_offset; const GLuint *num_work_groups; + const GLuint *group_size; } compute; struct { diff --git a/src/mesa/drivers/dri/i965/brw_cs.c b/src/mesa/drivers/dri/i965/brw_cs.c index e3f8fc67a4..007273390b 100644 --- a/src/mesa/drivers/dri/i965/brw_cs.c +++ b/src/mesa/drivers/dri/i965/brw_cs.c @@ -43,6 +43,10 @@ assign_cs_binding_table_offsets(const struct gen_device_info *devinfo, prog_data->binding_table.work_groups_start = next_binding_table_offset; next_binding_table_offset++; + /* May not be used if the work group size is not variable. */ + prog_data->binding_table.work_group_size_start = next_binding_table_offset; + next_binding_table_offset++; + brw_assign_common_binding_table_offsets(devinfo, prog, &prog_data->base, next_binding_table_offset); } diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c index 73cae9ef7c..fa8851e2b4 100644 --- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c +++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c @@ -1634,7 +1634,7 @@ const struct brw_tracked_state brw_wm_image_surfaces = { }; static void -brw_upload_cs_work_groups_surface(struct brw_context *brw) +brw_upload_cs_variable_surfaces(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; /* _NEW_PROGRAM */ @@ -1671,6 +1671,29 @@ brw_upload_cs_work_groups_surface(struct brw_context *brw) RELOC_WRITE); brw->ctx.NewDriverState |= BRW_NEW_SURFACES; } + + if (prog && cs_prog_data->uses_variable_group_size) { + const unsigned surf_idx = + cs_prog_data->binding_table.work_group_size_start; + uint32_t *surf_offset = &brw->cs.base.surf_offset[surf_idx]; + struct brw_bo *bo; + uint32_t bo_offset; + + bo = NULL; + brw_upload_data(&brw->upload, + (void *)brw->compute.group_size, + 3 * sizeof(GLuint), + sizeof(GLuint), + &bo, + &bo_offset); + + brw_emit_buffer_surface_state(brw, surf_offset, + bo, bo_offset, + ISL_FORMAT_RAW, + 3 * sizeof(GLuint), 1, + RELOC_WRITE); + brw->ctx.NewDriverState |= BRW_NEW_SURFACES; + } } const struct brw_tracked_state brw_cs_work_groups_surface = { @@ -1678,5 +1701,5 @@ const struct brw_tracked_state brw_cs_work_groups_surface = { .brw = BRW_NEW_CS_PROG_DATA | BRW_NEW_CS_WORK_GROUPS }, - .emit = brw_upload_cs_work_groups_surface, + .emit = brw_upload_cs_variable_surfaces, }; diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c index 5a9369d7b4..f213360ed8 100644 --- a/src/mesa/drivers/dri/i965/intel_extensions.c +++ b/src/mesa/drivers/dri/i965/intel_extensions.c @@ -258,6 +258,7 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.ARB_compute_shader = true; ctx->Extensions.ARB_ES3_1_compatibility = devinfo->gen >= 8 || devinfo->is_haswell; + ctx->Extensions.ARB_compute_variable_group_size = true; } if (can_do_predicate_writes(brw->screen)) { -- 2.11.0 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev