Reviewed-by: Edward O'Callaghan <funfunc...@folklore1984.net> On 10/08/2016 04:05 AM, Nicolai Hähnle wrote: > From: Nicolai Hähnle <nicolai.haeh...@amd.com> > > Not sure if it's possible to avoid programming the block size twice (once for > the userdata and once for the dispatch). > --- > docs/features.txt | 2 +- > docs/relnotes/12.1.0.html | 2 +- > src/gallium/drivers/radeon/r600_pipe_common.c | 10 +++++- > src/gallium/drivers/radeon/r600_pipe_common.h | 2 ++ > src/gallium/drivers/radeonsi/si_compute.c | 10 +++++- > src/gallium/drivers/radeonsi/si_shader.c | 44 > ++++++++++++++++++--------- > src/gallium/drivers/radeonsi/si_shader.h | 4 ++- > 7 files changed, 55 insertions(+), 19 deletions(-) > > diff --git a/docs/features.txt b/docs/features.txt > index 08b5892..8917a2e 100644 > --- a/docs/features.txt > +++ b/docs/features.txt > @@ -272,21 +272,21 @@ GLES3.2, GLSL ES 3.2 -- all DONE: i965/gen9+ > GL_OES_texture_border_clamp DONE (all drivers) > GL_OES_texture_buffer DONE (i965, nvc0, > radeonsi) > GL_OES_texture_cube_map_array DONE (i965/gen8+, > nvc0, radeonsi) > GL_OES_texture_stencil8 DONE (all drivers > that support GL_ARB_texture_stencil8) > GL_OES_texture_storage_multisample_2d_array DONE (all drivers > that support GL_ARB_texture_multisample) > > Khronos, ARB, and OES extensions that are not part of any OpenGL or OpenGL > ES version: > > GL_ARB_bindless_texture started (airlied) > GL_ARB_cl_event not started > - GL_ARB_compute_variable_group_size DONE (nvc0) > + GL_ARB_compute_variable_group_size DONE (nvc0, radeonsi) > GL_ARB_ES3_2_compatibility DONE (i965/gen8+) > GL_ARB_fragment_shader_interlock not started > GL_ARB_gl_spirv not started > GL_ARB_gpu_shader_int64 started (airlied for > core and Gallium, idr for i965) > GL_ARB_indirect_parameters DONE (nvc0, radeonsi) > GL_ARB_parallel_shader_compile not started, but > Chia-I Wu did some related work in 2014 > GL_ARB_pipeline_statistics_query DONE (i965, nvc0, > radeonsi, softpipe, swr) > GL_ARB_post_depth_coverage not started > GL_ARB_robustness_isolation not started > GL_ARB_sample_locations not started > diff --git a/docs/relnotes/12.1.0.html b/docs/relnotes/12.1.0.html > index 43af1a5..20fd2cb 100644 > --- a/docs/relnotes/12.1.0.html > +++ b/docs/relnotes/12.1.0.html > @@ -42,21 +42,21 @@ TBD. > <p> > Note: some of the new features are only available with certain drivers. > </p> > > <ul> > <li>OpenGL ES 3.1 on i965/hsw</li> > <li>OpenGL ES 3.2 on i965/gen9+ (Skylake and later)</li> > <li>GL_ARB_ES3_1_compatibility on i965</li> > <li>GL_ARB_ES3_2_compatibility on i965/gen8+</li> > <li>GL_ARB_clear_texture on r600, radeonsi</li> > -<li>GL_ARB_compute_variable_group_size on nvc0</li> > +<li>GL_ARB_compute_variable_group_size on nvc0, radeonsi</li> > <li>GL_ARB_cull_distance on radeonsi</li> > <li>GL_ARB_enhanced_layouts on i965, radeonsi, llvmpipe, softpipe</li> > <li>GL_ARB_indirect_parameters on radeonsi</li> > <li>GL_ARB_query_buffer_object on radeonsi</li> > <li>GL_ARB_shader_draw_parameters on radeonsi</li> > <li>GL_ARB_shader_group_vote on nvc0</li> > <li>GL_ARB_shader_viewport_layer_array on i965/gen6+</li> > <li>GL_ARB_stencil_texturing on i965/hsw</li> > <li>GL_ARB_texture_stencil8 on i965/hsw</li> > <li>GL_EXT_window_rectangles on nv50, nvc0</li> > diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c > b/src/gallium/drivers/radeon/r600_pipe_common.c > index 44863ee..3dbcbc6 100644 > --- a/src/gallium/drivers/radeon/r600_pipe_common.c > +++ b/src/gallium/drivers/radeon/r600_pipe_common.c > @@ -1030,21 +1030,29 @@ static int r600_get_compute_param(struct pipe_screen > *screen, > return sizeof(uint32_t); > case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: > break; /* unused */ > case PIPE_COMPUTE_CAP_SUBGROUP_SIZE: > if (ret) { > uint32_t *subgroup_size = ret; > *subgroup_size = r600_wavefront_size(rscreen->family); > } > return sizeof(uint32_t); > case PIPE_COMPUTE_CAP_MAX_VARIABLE_THREADS_PER_BLOCK: > - return 0; > + if (ret) { > + uint64_t *max_variable_threads_per_block = ret; > + if (rscreen->chip_class >= SI && HAVE_LLVM >= 0x309 && > + ir_type == PIPE_SHADER_IR_TGSI) > + *max_variable_threads_per_block = > SI_MAX_VARIABLE_THREADS_PER_BLOCK; > + else > + *max_variable_threads_per_block = 0; > + } > + return sizeof(uint64_t); > } > > fprintf(stderr, "unknown PIPE_COMPUTE_CAP %d\n", param); > return 0; > } > > static uint64_t r600_get_timestamp(struct pipe_screen *screen) > { > struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; > > diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h > b/src/gallium/drivers/radeon/r600_pipe_common.h > index 54991e8..290b228 100644 > --- a/src/gallium/drivers/radeon/r600_pipe_common.h > +++ b/src/gallium/drivers/radeon/r600_pipe_common.h > @@ -99,20 +99,22 @@ > #define DBG_NO_RB_PLUS (1llu << 45) > #define DBG_SI_SCHED (1llu << 46) > #define DBG_MONOLITHIC_SHADERS (1llu << 47) > #define DBG_NO_CE (1llu << 48) > #define DBG_UNSAFE_MATH (1llu << 49) > #define DBG_NO_DCC_FB (1llu << 50) > > #define R600_MAP_BUFFER_ALIGNMENT 64 > #define R600_MAX_VIEWPORTS 16 > > +#define SI_MAX_VARIABLE_THREADS_PER_BLOCK 1024 > + > enum r600_coherency { > R600_COHERENCY_NONE, /* no cache flushes needed */ > R600_COHERENCY_SHADER, > R600_COHERENCY_CB_META, > }; > > #ifdef PIPE_ARCH_BIG_ENDIAN > #define R600_BIG_ENDIAN 1 > #else > #define R600_BIG_ENDIAN 0 > diff --git a/src/gallium/drivers/radeonsi/si_compute.c > b/src/gallium/drivers/radeonsi/si_compute.c > index 1d1df2f..e59bafe 100644 > --- a/src/gallium/drivers/radeonsi/si_compute.c > +++ b/src/gallium/drivers/radeonsi/si_compute.c > @@ -594,25 +594,33 @@ static void si_setup_tgsi_grid(struct si_context *sctx, > for (i = 0; i < 3; ++i) { > radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); > radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) | > COPY_DATA_DST_SEL(COPY_DATA_REG)); > radeon_emit(cs, (va + 4 * i)); > radeon_emit(cs, (va + 4 * i) >> 32); > radeon_emit(cs, (grid_size_reg >> 2) + i); > radeon_emit(cs, 0); > } > } else { > + struct si_compute *program = sctx->cs_shader_state.program; > + bool variable_group_size = > + > program->shader.selector->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] > == 0; > > - radeon_set_sh_reg_seq(cs, grid_size_reg, 3); > + radeon_set_sh_reg_seq(cs, grid_size_reg, variable_group_size ? > 6 : 3); > radeon_emit(cs, info->grid[0]); > radeon_emit(cs, info->grid[1]); > radeon_emit(cs, info->grid[2]); > + if (variable_group_size) { > + radeon_emit(cs, info->block[0]); > + radeon_emit(cs, info->block[1]); > + radeon_emit(cs, info->block[2]); > + } > } > } > > static void si_emit_dispatch_packets(struct si_context *sctx, > const struct pipe_grid_info *info) > { > struct radeon_winsys_cs *cs = sctx->b.gfx.cs; > bool render_cond_bit = sctx->b.render_cond && > !sctx->b.render_cond_force_off; > unsigned waves_per_threadgroup = > DIV_ROUND_UP(info->block[0] * info->block[1] * info->block[2], > 64); > diff --git a/src/gallium/drivers/radeonsi/si_shader.c > b/src/gallium/drivers/radeonsi/si_shader.c > index ff51c8b..49d4121 100644 > --- a/src/gallium/drivers/radeonsi/si_shader.c > +++ b/src/gallium/drivers/radeonsi/si_shader.c > @@ -1763,30 +1763,35 @@ static void declare_system_value( > > case TGSI_SEMANTIC_GRID_SIZE: > value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_GRID_SIZE); > break; > > case TGSI_SEMANTIC_BLOCK_SIZE: > { > LLVMValueRef values[3]; > unsigned i; > unsigned *properties = ctx->shader->selector->info.properties; > - unsigned sizes[3] = { > - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], > - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], > - properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] > - }; > > - for (i = 0; i < 3; ++i) > - values[i] = lp_build_const_int32(gallivm, sizes[i]); > + if (properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] != 0) { > + unsigned sizes[3] = { > + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH], > + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT], > + properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH] > + }; > + > + for (i = 0; i < 3; ++i) > + values[i] = lp_build_const_int32(gallivm, > sizes[i]); > > - value = lp_build_gather_values(gallivm, values, 3); > + value = lp_build_gather_values(gallivm, values, 3); > + } else { > + value = LLVMGetParam(radeon_bld->main_fn, > SI_PARAM_BLOCK_SIZE); > + } > break; > } > > case TGSI_SEMANTIC_BLOCK_ID: > value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_BLOCK_ID); > break; > > case TGSI_SEMANTIC_THREAD_ID: > value = LLVMGetParam(radeon_bld->main_fn, SI_PARAM_THREAD_ID); > break; > @@ -5673,20 +5678,21 @@ static void create_function(struct si_shader_context > *ctx) > > for (i = 0; i < num_return_sgprs; i++) > returns[i] = ctx->i32; > for (; i < num_returns; i++) > returns[i] = ctx->f32; > } > break; > > case PIPE_SHADER_COMPUTE: > params[SI_PARAM_GRID_SIZE] = v3i32; > + params[SI_PARAM_BLOCK_SIZE] = v3i32; > params[SI_PARAM_BLOCK_ID] = v3i32; > last_sgpr = SI_PARAM_BLOCK_ID; > > params[SI_PARAM_THREAD_ID] = v3i32; > num_params = SI_PARAM_THREAD_ID + 1; > break; > default: > assert(0 && "unimplemented shader"); > return; > } > @@ -5709,21 +5715,26 @@ static void create_function(struct si_shader_context > *ctx) > S_0286D0_LINEAR_CENTROID_ENA(1) | > S_0286D0_FRONT_FACE_ENA(1) | > S_0286D0_POS_FIXED_PT_ENA(1)); > } else if (ctx->type == PIPE_SHADER_COMPUTE) { > const unsigned *properties = shader->selector->info.properties; > unsigned max_work_group_size = > properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * > properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * > properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; > > - assert(max_work_group_size); > + if (!max_work_group_size) { > + /* This is a variable group size compute shader, > + * compile it for the maximum possible group size. > + */ > + max_work_group_size = SI_MAX_VARIABLE_THREADS_PER_BLOCK; > + } > > radeon_llvm_add_attribute(ctx->radeon_bld.main_fn, > "amdgpu-max-work-group-size", > max_work_group_size); > } > > shader->info.num_input_sgprs = 0; > shader->info.num_input_vgprs = 0; > > for (i = 0; i <= last_sgpr; ++i) > @@ -6646,25 +6657,30 @@ int si_compile_tgsi_shader(struct si_screen *sscreen, > > /* Validate SGPR and VGPR usage for compute to detect compiler bugs. > * LLVM 3.9svn has this bug. > */ > if (sel->type == PIPE_SHADER_COMPUTE) { > unsigned *props = sel->info.properties; > unsigned wave_size = 64; > unsigned max_vgprs = 256; > unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512; > unsigned max_sgprs_per_wave = 128; > - unsigned min_waves_per_cu = > - DIV_ROUND_UP(props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * > - props[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] > * > - props[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH], > - wave_size); > + unsigned max_block_threads; > + > + if (props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH]) > + max_block_threads = > props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] * > + > props[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] * > + > props[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH]; > + else > + max_block_threads = SI_MAX_VARIABLE_THREADS_PER_BLOCK; > + > + unsigned min_waves_per_cu = DIV_ROUND_UP(max_block_threads, > wave_size); > unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4); > > max_vgprs = max_vgprs / min_waves_per_simd; > max_sgprs = MIN2(max_sgprs / min_waves_per_simd, > max_sgprs_per_wave); > > if (shader->config.num_sgprs > max_sgprs || > shader->config.num_vgprs > max_vgprs) { > fprintf(stderr, "LLVM failed to compile a shader > correctly: " > "SGPR:VGPR usage is %u:%u, but the hw limit is > %u:%u\n", > shader->config.num_sgprs, > shader->config.num_vgprs, > diff --git a/src/gallium/drivers/radeonsi/si_shader.h > b/src/gallium/drivers/radeonsi/si_shader.h > index 67cb67d2..f2618ac 100644 > --- a/src/gallium/drivers/radeonsi/si_shader.h > +++ b/src/gallium/drivers/radeonsi/si_shader.h > @@ -122,21 +122,22 @@ enum { > /* GS limits */ > SI_GS_NUM_USER_SGPR = SI_NUM_RESOURCE_SGPRS, > SI_GSCOPY_NUM_USER_SGPR = SI_SGPR_RW_BUFFERS_HI + 1, > > /* PS only */ > SI_SGPR_ALPHA_REF = SI_NUM_RESOURCE_SGPRS, > SI_PS_NUM_USER_SGPR, > > /* CS only */ > SI_SGPR_GRID_SIZE = SI_NUM_RESOURCE_SGPRS, > - SI_CS_NUM_USER_SGPR = SI_SGPR_GRID_SIZE + 3 > + SI_SGPR_BLOCK_SIZE = SI_SGPR_GRID_SIZE + 3, > + SI_CS_NUM_USER_SGPR = SI_SGPR_BLOCK_SIZE + 3 > }; > > /* LLVM function parameter indices */ > enum { > SI_PARAM_RW_BUFFERS, > SI_PARAM_CONST_BUFFERS, > SI_PARAM_SAMPLERS, > SI_PARAM_IMAGES, > SI_PARAM_SHADER_BUFFERS, > SI_NUM_RESOURCE_PARAMS, > @@ -212,20 +213,21 @@ enum { > SI_PARAM_POS_Y_FLOAT, > SI_PARAM_POS_Z_FLOAT, > SI_PARAM_POS_W_FLOAT, > SI_PARAM_FRONT_FACE, > SI_PARAM_ANCILLARY, > SI_PARAM_SAMPLE_COVERAGE, > SI_PARAM_POS_FIXED_PT, > > /* CS only parameters */ > SI_PARAM_GRID_SIZE = SI_NUM_RESOURCE_PARAMS, > + SI_PARAM_BLOCK_SIZE, > SI_PARAM_BLOCK_ID, > SI_PARAM_THREAD_ID, > > SI_NUM_PARAMS = SI_PARAM_POS_FIXED_PT + 9, /* +8 for COLOR[0..1] */ > }; > > /* SI-specific system values. */ > enum { > TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI = TGSI_SEMANTIC_COUNT, > TGSI_SEMANTIC_DEFAULT_TESSINNER_SI, >
signature.asc
Description: OpenPGP digital signature
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev