On Tue, Apr 25, 2017 at 8:24 AM, Nicolai Hähnle <nhaeh...@gmail.com> wrote: > On 24.04.2017 18:22, Marek Olšák wrote: >> >> From: Marek Olšák <marek.ol...@amd.com> >> >> Basically, don't load GRID_SIZE or BLOCK_SIZE if they are unused, >> determine >> whether to load BLOCK_ID for each component separately, and set the number >> of THREAD_ID VGPRs to load. Now we should get the maximum CS launch wave >> rate in most cases. >> --- >> src/gallium/drivers/radeonsi/si_compute.c | 71 >> ++++++++++++++--------- >> src/gallium/drivers/radeonsi/si_shader.c | 37 ++++++++---- >> src/gallium/drivers/radeonsi/si_shader.h | 11 ---- >> src/gallium/drivers/radeonsi/si_shader_internal.h | 5 ++ >> 4 files changed, 76 insertions(+), 48 deletions(-) >> >> diff --git a/src/gallium/drivers/radeonsi/si_compute.c >> b/src/gallium/drivers/radeonsi/si_compute.c >> index 2b2efae..b3399d1 100644 >> --- a/src/gallium/drivers/radeonsi/si_compute.c >> +++ b/src/gallium/drivers/radeonsi/si_compute.c >> @@ -41,20 +41,22 @@ struct si_compute { >> >> unsigned ir_type; >> unsigned local_size; >> unsigned private_size; >> unsigned input_size; >> struct si_shader shader; >> >> struct pipe_resource *global_buffers[MAX_GLOBAL_BUFFERS]; >> unsigned use_code_object_v2 : 1; >> unsigned variable_group_size : 1; >> + unsigned uses_grid_size:1; >> + unsigned uses_block_size:1; >> }; >> >> struct dispatch_packet { >> uint16_t header; >> uint16_t setup; >> uint16_t workgroup_size_x; >> uint16_t workgroup_size_y; >> uint16_t workgroup_size_z; >> uint16_t reserved0; >> uint32_t grid_size_x; >> @@ -114,37 +116,45 @@ static void si_create_compute_state_async(void *job, >> int thread_index) >> memset(&sel, 0, sizeof(sel)); >> >> sel.screen = program->screen; >> tgsi_scan_shader(program->tokens, &sel.info); >> sel.tokens = program->tokens; >> sel.type = PIPE_SHADER_COMPUTE; >> sel.local_size = program->local_size; >> >> program->shader.selector = &sel; >> program->shader.is_monolithic = true; >> + program->uses_grid_size = sel.info.uses_grid_size; >> + program->uses_block_size = sel.info.uses_block_size; >> >> if (si_shader_create(program->screen, tm, &program->shader, >> debug)) { >> program->shader.compilation_failed = true; >> } else { >> bool scratch_enabled = >> shader->config.scratch_bytes_per_wave > 0; >> + unsigned user_sgprs = SI_NUM_RESOURCE_SGPRS + >> + (sel.info.uses_grid_size ? 3 : 0) + >> + (sel.info.uses_block_size ? 3 : 0); >> >> shader->config.rsrc1 = >> S_00B848_VGPRS((shader->config.num_vgprs - 1) / 4) >> | >> S_00B848_SGPRS((shader->config.num_sgprs - 1) / 8) >> | >> S_00B848_DX10_CLAMP(1) | >> S_00B848_FLOAT_MODE(shader->config.float_mode); >> >> shader->config.rsrc2 = >> - S_00B84C_USER_SGPR(SI_CS_NUM_USER_SGPR) | >> + S_00B84C_USER_SGPR(user_sgprs) | >> S_00B84C_SCRATCH_EN(scratch_enabled) | >> - S_00B84C_TGID_X_EN(1) | S_00B84C_TGID_Y_EN(1) | >> - S_00B84C_TGID_Z_EN(1) | S_00B84C_TIDIG_COMP_CNT(2) >> | >> + S_00B84C_TGID_X_EN(sel.info.uses_block_id[0]) | >> + S_00B84C_TGID_Y_EN(sel.info.uses_block_id[1]) | >> + S_00B84C_TGID_Z_EN(sel.info.uses_block_id[2]) | >> + S_00B84C_TIDIG_COMP_CNT(sel.info.uses_thread_id[2] >> ? 2 : >> + sel.info.uses_thread_id[1] >> ? 1 : 0) | >> S_00B84C_LDS_SIZE(shader->config.lds_size); >> >> program->variable_group_size = >> >> sel.info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] == 0; >> } >> >> FREE(program->tokens); >> program->shader.selector = NULL; >> } >> >> @@ -644,50 +654,57 @@ static bool si_upload_compute_input(struct >> si_context *sctx, >> } >> >> r600_resource_reference(&input_buffer, NULL); >> >> return true; >> } >> >> static void si_setup_tgsi_grid(struct si_context *sctx, >> const struct pipe_grid_info *info) >> { >> + struct si_compute *program = sctx->cs_shader_state.program; >> struct radeon_winsys_cs *cs = sctx->b.gfx.cs; >> unsigned grid_size_reg = R_00B900_COMPUTE_USER_DATA_0 + >> - 4 * SI_SGPR_GRID_SIZE; >> + 4 * SI_NUM_RESOURCE_SGPRS; >> + unsigned block_size_reg = grid_size_reg + >> + /* 12 bytes = 3 dwords. */ >> + 12 * program->uses_grid_size; >> >> if (info->indirect) { >> - uint64_t base_va = >> r600_resource(info->indirect)->gpu_address; >> - uint64_t va = base_va + info->indirect_offset; >> - int i; >> - >> - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, >> - (struct r600_resource *)info->indirect, >> - RADEON_USAGE_READ, >> RADEON_PRIO_DRAW_INDIRECT); >> - >> - for (i = 0; i < 3; ++i) { >> - radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); >> - radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_MEM) | >> - COPY_DATA_DST_SEL(COPY_DATA_REG)); >> - radeon_emit(cs, (va + 4 * i)); >> - radeon_emit(cs, (va + 4 * i) >> 32); >> - radeon_emit(cs, (grid_size_reg >> 2) + i); >> - radeon_emit(cs, 0); >> + if (program->uses_grid_size) { >> + uint64_t base_va = >> r600_resource(info->indirect)->gpu_address; >> + uint64_t va = base_va + info->indirect_offset; >> + int i; >> + >> + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, >> + (struct r600_resource >> *)info->indirect, >> + RADEON_USAGE_READ, >> RADEON_PRIO_DRAW_INDIRECT); >> + >> + for (i = 0; i < 3; ++i) { >> + radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, >> 0)); >> + radeon_emit(cs, >> COPY_DATA_SRC_SEL(COPY_DATA_MEM) | >> + >> COPY_DATA_DST_SEL(COPY_DATA_REG)); >> + radeon_emit(cs, (va + 4 * i)); > > > Extra space (was in the original as well, but may as well fix it now). > > >> + radeon_emit(cs, (va + 4 * i) >> 32); >> + radeon_emit(cs, (grid_size_reg >> 2) + i); >> + radeon_emit(cs, 0); >> + } >> } >> } else { >> - struct si_compute *program = >> sctx->cs_shader_state.program; >> - >> - radeon_set_sh_reg_seq(cs, grid_size_reg, >> program->variable_group_size ? 6 : 3); >> - radeon_emit(cs, info->grid[0]); >> - radeon_emit(cs, info->grid[1]); >> - radeon_emit(cs, info->grid[2]); >> - if (program->variable_group_size) { >> + if (program->uses_grid_size) { >> + radeon_set_sh_reg_seq(cs, grid_size_reg, 3); >> + radeon_emit(cs, info->grid[0]); >> + radeon_emit(cs, info->grid[1]); >> + radeon_emit(cs, info->grid[2]); >> + } >> + if (program->variable_group_size && >> program->uses_block_size) { >> + radeon_set_sh_reg_seq(cs, block_size_reg, 3); >> radeon_emit(cs, info->block[0]); >> radeon_emit(cs, info->block[1]); >> radeon_emit(cs, info->block[2]); > > > This is a slight regression if both are used, though I guess variable group > size is rare enough that it's not a big deal. > > Either way, > > Reviewed-by: Nicolai Hähnle <nicolai.haeh...@amd.com> > > By the way: Do we have a test case that exercises the corner case of a > compute shader that reads only the Y or only the Z component of BLOCK_ID? > That might be a good idea.
There are no piglits for those, but I have a few here. Marek _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev