On Wed, May 8, 2019 at 1:52 AM Marek Olšák <mar...@gmail.com> wrote:
> On Fri, May 3, 2019 at 7:19 AM Nicolai Hähnle <nhaeh...@gmail.com> wrote: > >> From: Nicolai Hähnle <nicolai.haeh...@amd.com> >> >> --- >> src/amd/common/ac_binary.c | 2 + >> src/gallium/drivers/radeonsi/si_compute.c | 14 +-- >> src/gallium/drivers/radeonsi/si_shader.c | 112 +++------------------- >> src/gallium/drivers/radeonsi/si_shader.h | 25 +---- >> 4 files changed, 27 insertions(+), 126 deletions(-) >> >> diff --git a/src/amd/common/ac_binary.c b/src/amd/common/ac_binary.c >> index 44251886b5f..d0ca55e0e0d 100644 >> --- a/src/amd/common/ac_binary.c >> +++ b/src/amd/common/ac_binary.c >> @@ -218,26 +218,28 @@ void ac_parse_shader_binary_config(const char >> *data, size_t nbytes, >> unsigned value = util_le32_to_cpu(*(uint32_t*)(data + i + >> 4)); >> switch (reg) { >> case R_00B028_SPI_SHADER_PGM_RSRC1_PS: >> case R_00B128_SPI_SHADER_PGM_RSRC1_VS: >> case R_00B228_SPI_SHADER_PGM_RSRC1_GS: >> case R_00B848_COMPUTE_PGM_RSRC1: >> case R_00B428_SPI_SHADER_PGM_RSRC1_HS: >> conf->num_sgprs = MAX2(conf->num_sgprs, >> (G_00B028_SGPRS(value) + 1) * 8); >> conf->num_vgprs = MAX2(conf->num_vgprs, >> (G_00B028_VGPRS(value) + 1) * 4); >> conf->float_mode = G_00B028_FLOAT_MODE(value); >> + conf->rsrc1 = value; >> break; >> case R_00B02C_SPI_SHADER_PGM_RSRC2_PS: >> conf->lds_size = MAX2(conf->lds_size, >> G_00B02C_EXTRA_LDS_SIZE(value)); >> break; >> case R_00B84C_COMPUTE_PGM_RSRC2: >> conf->lds_size = MAX2(conf->lds_size, >> G_00B84C_LDS_SIZE(value)); >> + conf->rsrc2 = value; >> break; >> case R_0286CC_SPI_PS_INPUT_ENA: >> conf->spi_ps_input_ena = value; >> break; >> case R_0286D0_SPI_PS_INPUT_ADDR: >> conf->spi_ps_input_addr = value; >> break; >> case R_0286E8_SPI_TMPRING_SIZE: >> case R_00B860_COMPUTE_TMPRING_SIZE: >> /* WAVESIZE is in units of 256 dwords. 
*/ >> diff --git a/src/gallium/drivers/radeonsi/si_compute.c >> b/src/gallium/drivers/radeonsi/si_compute.c >> index 541d7e6f118..02d7bac406a 100644 >> --- a/src/gallium/drivers/radeonsi/si_compute.c >> +++ b/src/gallium/drivers/radeonsi/si_compute.c >> @@ -59,21 +59,21 @@ static const amd_kernel_code_t >> *si_compute_get_code_object( >> uint64_t symbol_offset) >> { >> if (!program->use_code_object_v2) { >> return NULL; >> } >> return (const amd_kernel_code_t*) >> (program->shader.binary.code + symbol_offset); >> } >> >> static void code_object_to_config(const amd_kernel_code_t *code_object, >> - struct si_shader_config *out_config) { >> + struct ac_shader_config *out_config) { >> >> uint32_t rsrc1 = code_object->compute_pgm_resource_registers; >> uint32_t rsrc2 = code_object->compute_pgm_resource_registers >> >> 32; >> out_config->num_sgprs = code_object->wavefront_sgpr_count; >> out_config->num_vgprs = code_object->workitem_vgpr_count; >> out_config->float_mode = G_00B028_FLOAT_MODE(rsrc1); >> out_config->rsrc1 = rsrc1; >> out_config->lds_size = MAX2(out_config->lds_size, >> G_00B84C_LDS_SIZE(rsrc2)); >> out_config->rsrc2 = rsrc2; >> out_config->scratch_bytes_per_wave = >> @@ -241,22 +241,22 @@ static void *si_create_compute_state( >> const amd_kernel_code_t *code_object = >> si_compute_get_code_object(program, 0); >> code_object_to_config(code_object, >> &program->shader.config); >> if (program->shader.binary.reloc_count != 0) { >> fprintf(stderr, "Error: %d unsupported >> relocations\n", >> >> program->shader.binary.reloc_count); >> FREE(program); >> return NULL; >> } >> } else { >> - >> si_shader_binary_read_config(&program->shader.binary, >> - &program->shader.config, 0); >> + >> ac_shader_binary_read_config(&program->shader.binary, >> + &program->shader.config, 0, false); >> } >> si_shader_dump(sctx->screen, &program->shader, >> &sctx->debug, >> PIPE_SHADER_COMPUTE, stderr, true); >> if (si_shader_binary_upload(sctx->screen, >> &program->shader) < 0) { >> 
fprintf(stderr, "LLVM failed to upload shader\n"); >> FREE(program); >> return NULL; >> } >> } >> >> @@ -362,21 +362,21 @@ static void si_initialize_compute(struct si_context >> *sctx) >> bc_va >> 8); >> } >> } >> >> sctx->cs_shader_state.emitted_program = NULL; >> sctx->cs_shader_state.initialized = true; >> } >> >> static bool si_setup_compute_scratch_buffer(struct si_context *sctx, >> struct si_shader *shader, >> - struct si_shader_config >> *config) >> + struct ac_shader_config >> *config) >> { >> uint64_t scratch_bo_size, scratch_needed; >> scratch_bo_size = 0; >> scratch_needed = config->scratch_bytes_per_wave * >> sctx->scratch_waves; >> if (sctx->compute_scratch_buffer) >> scratch_bo_size = >> sctx->compute_scratch_buffer->b.b.width0; >> >> if (scratch_bo_size < scratch_needed) { >> si_resource_reference(&sctx->compute_scratch_buffer, >> NULL); >> >> @@ -405,38 +405,38 @@ static bool si_setup_compute_scratch_buffer(struct >> si_context *sctx, >> return true; >> } >> >> static bool si_switch_compute_shader(struct si_context *sctx, >> struct si_compute *program, >> struct si_shader *shader, >> const amd_kernel_code_t *code_object, >> unsigned offset) >> { >> struct radeon_cmdbuf *cs = sctx->gfx_cs; >> - struct si_shader_config inline_config = {0}; >> - struct si_shader_config *config; >> + struct ac_shader_config inline_config = {0}; >> + struct ac_shader_config *config; >> uint64_t shader_va; >> >> if (sctx->cs_shader_state.emitted_program == program && >> sctx->cs_shader_state.offset == offset) >> return true; >> >> if (program->ir_type != PIPE_SHADER_IR_NATIVE) { >> config = &shader->config; >> } else { >> unsigned lds_blocks; >> >> config = &inline_config; >> if (code_object) { >> code_object_to_config(code_object, config); >> } else { >> - si_shader_binary_read_config(&shader->binary, >> config, offset); >> + ac_shader_binary_read_config(&shader->binary, >> config, offset, false); >> } >> >> lds_blocks = config->lds_size; >> /* XXX: We are over 
allocating LDS. For SI, the shader >> reports >> * LDS in blocks of 256 bytes, so if there are 4 bytes lds >> * allocated in the shader and 4 bytes allocated by the >> state >> * tracker, then we will set LDS_SIZE to 512 bytes rather >> than 256. >> */ >> if (sctx->chip_class <= SI) { >> lds_blocks += align(program->local_size, 256) >> >> 8; >> diff --git a/src/gallium/drivers/radeonsi/si_shader.c >> b/src/gallium/drivers/radeonsi/si_shader.c >> index f6d882cf583..da43447013d 100644 >> --- a/src/gallium/drivers/radeonsi/si_shader.c >> +++ b/src/gallium/drivers/radeonsi/si_shader.c >> @@ -4962,104 +4962,20 @@ static void si_llvm_emit_polygon_stipple(struct >> si_shader_context *ctx, >> /* The stipple pattern is 32x32, each row has 32 bits. */ >> offset = LLVMBuildMul(builder, address[1], >> LLVMConstInt(ctx->i32, 4, 0), ""); >> row = buffer_load_const(ctx, desc, offset); >> row = ac_to_integer(&ctx->ac, row); >> bit = LLVMBuildLShr(builder, row, address[0], ""); >> bit = LLVMBuildTrunc(builder, bit, ctx->i1, ""); >> ac_build_kill_if_false(&ctx->ac, bit); >> } >> >> -void si_shader_binary_read_config(struct ac_shader_binary *binary, >> - struct si_shader_config *conf, >> - unsigned symbol_offset) >> -{ >> - unsigned i; >> - const unsigned char *config = >> - ac_shader_binary_config_start(binary, symbol_offset); >> - bool really_needs_scratch = false; >> - >> - /* LLVM adds SGPR spills to the scratch size. >> - * Find out if we really need the scratch buffer. >> - */ >> - for (i = 0; i < binary->reloc_count; i++) { >> - const struct ac_shader_reloc *reloc = &binary->relocs[i]; >> - >> - if (!strcmp(scratch_rsrc_dword0_symbol, reloc->name) || >> - !strcmp(scratch_rsrc_dword1_symbol, reloc->name)) { >> - really_needs_scratch = true; >> - break; >> - } >> - } >> - >> - /* XXX: We may be able to emit some of these values directly >> rather than >> - * extracting fields to be emitted later. 
>> - */ >> - >> - for (i = 0; i < binary->config_size_per_symbol; i+= 8) { >> - unsigned reg = util_le32_to_cpu(*(uint32_t*)(config + i)); >> - unsigned value = util_le32_to_cpu(*(uint32_t*)(config + i >> + 4)); >> - switch (reg) { >> - case R_00B028_SPI_SHADER_PGM_RSRC1_PS: >> - case R_00B128_SPI_SHADER_PGM_RSRC1_VS: >> - case R_00B228_SPI_SHADER_PGM_RSRC1_GS: >> - case R_00B428_SPI_SHADER_PGM_RSRC1_HS: >> - case R_00B848_COMPUTE_PGM_RSRC1: >> - conf->num_sgprs = MAX2(conf->num_sgprs, >> (G_00B028_SGPRS(value) + 1) * 8); >> - conf->num_vgprs = MAX2(conf->num_vgprs, >> (G_00B028_VGPRS(value) + 1) * 4); >> - conf->float_mode = G_00B028_FLOAT_MODE(value); >> - conf->rsrc1 = value; >> - break; >> - case R_00B02C_SPI_SHADER_PGM_RSRC2_PS: >> - conf->lds_size = MAX2(conf->lds_size, >> G_00B02C_EXTRA_LDS_SIZE(value)); >> - break; >> - case R_00B84C_COMPUTE_PGM_RSRC2: >> - conf->lds_size = MAX2(conf->lds_size, >> G_00B84C_LDS_SIZE(value)); >> - conf->rsrc2 = value; >> - break; >> - case R_0286CC_SPI_PS_INPUT_ENA: >> - conf->spi_ps_input_ena = value; >> - break; >> - case R_0286D0_SPI_PS_INPUT_ADDR: >> - conf->spi_ps_input_addr = value; >> - break; >> - case R_0286E8_SPI_TMPRING_SIZE: >> - case R_00B860_COMPUTE_TMPRING_SIZE: >> - /* WAVESIZE is in units of 256 dwords. 
*/ >> - if (really_needs_scratch) >> - conf->scratch_bytes_per_wave = >> - G_00B860_WAVESIZE(value) * 256 * >> 4; >> - break; >> - case 0x4: /* SPILLED_SGPRS */ >> - conf->spilled_sgprs = value; >> - break; >> - case 0x8: /* SPILLED_VGPRS */ >> - conf->spilled_vgprs = value; >> - break; >> - default: >> - { >> - static bool printed; >> - >> - if (!printed) { >> - fprintf(stderr, "Warning: LLVM >> emitted unknown " >> - "config register: >> 0x%x\n", reg); >> - printed = true; >> - } >> - } >> - break; >> - } >> - } >> - >> - if (!conf->spi_ps_input_addr) >> - conf->spi_ps_input_addr = conf->spi_ps_input_ena; >> -} >> - >> void si_shader_apply_scratch_relocs(struct si_shader *shader, >> uint64_t scratch_va) >> { >> unsigned i; >> uint32_t scratch_rsrc_dword0 = scratch_va; >> uint32_t scratch_rsrc_dword1 = >> S_008F04_BASE_ADDRESS_HI(scratch_va >> 32); >> >> /* Enable scratch coalescing. */ >> scratch_rsrc_dword1 |= S_008F04_SWIZZLE_ENABLE(1); >> @@ -5213,21 +5129,21 @@ static void si_shader_dump_disassembly(const >> struct ac_shader_binary *binary, >> fprintf(file, "@0x%x: %02x%02x%02x%02x\n", i, >> binary->code[i + 3], binary->code[i + 2], >> binary->code[i + 1], binary->code[i]); >> } >> } >> } >> >> static void si_calculate_max_simd_waves(struct si_shader *shader) >> { >> struct si_screen *sscreen = shader->selector->screen; >> - struct si_shader_config *conf = &shader->config; >> + struct ac_shader_config *conf = &shader->config; >> unsigned num_inputs = shader->selector->info.num_inputs; >> unsigned lds_increment = sscreen->info.chip_class >= CIK ? 512 : >> 256; >> unsigned lds_per_wave = 0; >> unsigned max_simd_waves; >> >> max_simd_waves = ac_get_max_simd_waves(sscreen->info.family); >> >> /* Compute LDS usage for PS. 
*/ >> switch (shader->selector->type) { >> case PIPE_SHADER_FRAGMENT: >> @@ -5262,46 +5178,46 @@ static void si_calculate_max_simd_waves(struct >> si_shader *shader) >> } >> >> if (conf->num_vgprs) >> max_simd_waves = MIN2(max_simd_waves, 256 / >> conf->num_vgprs); >> >> /* LDS is 64KB per CU (4 SIMDs), which is 16KB per SIMD (usage >> above >> * 16KB makes some SIMDs unoccupied). */ >> if (lds_per_wave) >> max_simd_waves = MIN2(max_simd_waves, 16384 / >> lds_per_wave); >> >> - conf->max_simd_waves = max_simd_waves; >> + shader->max_simd_waves = max_simd_waves; >> } >> >> void si_shader_dump_stats_for_shader_db(const struct si_shader *shader, >> struct pipe_debug_callback *debug) >> { >> - const struct si_shader_config *conf = &shader->config; >> + const struct ac_shader_config *conf = &shader->config; >> >> pipe_debug_message(debug, SHADER_INFO, >> "Shader Stats: SGPRS: %d VGPRS: %d Code Size: >> %d " >> "LDS: %d Scratch: %d Max Waves: %d Spilled >> SGPRs: %d " >> "Spilled VGPRs: %d PrivMem VGPRs: %d", >> conf->num_sgprs, conf->num_vgprs, >> si_get_shader_binary_size(shader), >> conf->lds_size, conf->scratch_bytes_per_wave, >> - conf->max_simd_waves, conf->spilled_sgprs, >> - conf->spilled_vgprs, conf->private_mem_vgprs); >> + shader->max_simd_waves, conf->spilled_sgprs, >> + conf->spilled_vgprs, >> shader->private_mem_vgprs); >> } >> >> static void si_shader_dump_stats(struct si_screen *sscreen, >> const struct si_shader *shader, >> unsigned processor, >> FILE *file, >> bool check_debug_option) >> { >> - const struct si_shader_config *conf = &shader->config; >> + const struct ac_shader_config *conf = &shader->config; >> >> if (!check_debug_option || >> si_can_dump_shader(sscreen, processor)) { >> if (processor == PIPE_SHADER_FRAGMENT) { >> fprintf(file, "*** SHADER CONFIG ***\n" >> "SPI_PS_INPUT_ADDR = 0x%04x\n" >> "SPI_PS_INPUT_ENA = 0x%04x\n", >> conf->spi_ps_input_addr, >> conf->spi_ps_input_ena); >> } >> >> @@ -5311,24 +5227,24 @@ static void 
si_shader_dump_stats(struct si_screen >> *sscreen, >> "Spilled SGPRs: %d\n" >> "Spilled VGPRs: %d\n" >> "Private memory VGPRs: %d\n" >> "Code Size: %d bytes\n" >> "LDS: %d blocks\n" >> "Scratch: %d bytes per wave\n" >> "Max Waves: %d\n" >> "********************\n\n\n", >> conf->num_sgprs, conf->num_vgprs, >> conf->spilled_sgprs, conf->spilled_vgprs, >> - conf->private_mem_vgprs, >> + shader->private_mem_vgprs, >> si_get_shader_binary_size(shader), >> conf->lds_size, conf->scratch_bytes_per_wave, >> - conf->max_simd_waves); >> + shader->max_simd_waves); >> } >> } >> >> const char *si_get_shader_name(const struct si_shader *shader, unsigned >> processor) >> { >> switch (processor) { >> case PIPE_SHADER_VERTEX: >> if (shader->key.as_es) >> return "Vertex Shader as ES"; >> else if (shader->key.as_ls) >> @@ -5399,21 +5315,21 @@ void si_shader_dump(struct si_screen *sscreen, >> const struct si_shader *shader, >> debug, "epilog", file); >> fprintf(file, "\n"); >> } >> >> si_shader_dump_stats(sscreen, shader, processor, file, >> check_debug_option); >> } >> >> static int si_compile_llvm(struct si_screen *sscreen, >> struct ac_shader_binary *binary, >> - struct si_shader_config *conf, >> + struct ac_shader_config *conf, >> struct ac_llvm_compiler *compiler, >> LLVMModuleRef mod, >> struct pipe_debug_callback *debug, >> unsigned processor, >> const char *name, >> bool less_optimized) >> { >> int r = 0; >> unsigned count = p_atomic_inc_return(&sscreen->num_compilations); >> >> @@ -5433,21 +5349,21 @@ static int si_compile_llvm(struct si_screen >> *sscreen, >> LLVMDisposeMessage(ir); >> } >> >> if (!si_replace_shader(count, binary)) { >> r = si_llvm_compile(mod, binary, compiler, debug, >> less_optimized); >> if (r) >> return r; >> } >> >> - si_shader_binary_read_config(binary, conf, 0); >> + ac_shader_binary_read_config(binary, conf, 0, false); >> >> /* Enable 64-bit and 16-bit denormals, because there is no >> performance >> * cost. 
>> * >> * If denormals are enabled, all floating-point output modifiers >> are >> * ignored. >> * >> * Don't enable denormals for 32-bit floats, because: >> * - Floating-point output modifiers would be ignored by the hw. >> * - Some opcodes don't support denormals, such as v_mad_f32. We >> would >> @@ -6799,21 +6715,21 @@ int si_compile_tgsi_shader(struct si_screen >> *sscreen, >> need_prolog ? 1 : 0, 0); >> } >> >> si_llvm_optimize_module(&ctx); >> >> /* Post-optimization transformations and analysis. */ >> si_optimize_vs_outputs(&ctx); >> >> if ((debug && debug->debug_message) || >> si_can_dump_shader(sscreen, ctx.type)) { >> - ctx.shader->config.private_mem_vgprs = >> + ctx.shader->private_mem_vgprs = >> ac_count_scratch_private_memory(ctx.main_fn); >> } >> >> /* Make sure the input is a pointer and not integer followed by >> inttoptr. */ >> assert(LLVMGetTypeKind(LLVMTypeOf(LLVMGetParam(ctx.main_fn, 0))) >> == >> LLVMPointerTypeKind); >> >> /* Compile to bytecode. */ >> r = si_compile_llvm(sscreen, &shader->binary, &shader->config, >> compiler, >> ctx.ac.module, debug, ctx.type, >> @@ -7954,23 +7870,23 @@ int si_shader_create(struct si_screen *sscreen, >> struct ac_llvm_compiler *compile >> shader->config.num_sgprs = >> MAX2(shader->config.num_sgprs, >> >> shader->previous_stage->config.num_sgprs); >> shader->config.num_vgprs = >> MAX2(shader->config.num_vgprs, >> >> shader->previous_stage->config.num_vgprs); >> shader->config.spilled_sgprs = >> MAX2(shader->config.spilled_sgprs, >> >> shader->previous_stage->config.spilled_sgprs); >> shader->config.spilled_vgprs = >> MAX2(shader->config.spilled_vgprs, >> >> shader->previous_stage->config.spilled_vgprs); >> - shader->config.private_mem_vgprs = >> - MAX2(shader->config.private_mem_vgprs, >> - >> shader->previous_stage->config.private_mem_vgprs); >> + shader->private_mem_vgprs = >> + MAX2(shader->private_mem_vgprs, >> + >> shader->previous_stage->private_mem_vgprs); >> shader->config.scratch_bytes_per_wave = >> 
>> MAX2(shader->config.scratch_bytes_per_wave, >> >> shader->previous_stage->config.scratch_bytes_per_wave); >> shader->info.uses_instanceid |= >> >> shader->previous_stage->info.uses_instanceid; >> } >> if (shader->prolog2) { >> shader->config.num_sgprs = >> MAX2(shader->config.num_sgprs, >> >> shader->prolog2->config.num_sgprs); >> shader->config.num_vgprs = >> MAX2(shader->config.num_vgprs, >> diff --git a/src/gallium/drivers/radeonsi/si_shader.h >> b/src/gallium/drivers/radeonsi/si_shader.h >> index ecf7f8bbd7a..6c8f70dc94b 100644 >> --- a/src/gallium/drivers/radeonsi/si_shader.h >> +++ b/src/gallium/drivers/radeonsi/si_shader.h >> @@ -552,36 +552,20 @@ struct si_shader_key { >> * but forces monolithic shaders to be used as soon as >> * possible, because it's in the "opt" group. >> */ >> unsigned prefer_mono:1; >> } opt; >> }; >> >> /* Restore the pack alignment to default. */ >> #pragma pack(pop) >> >> -struct si_shader_config { >> - unsigned num_sgprs; >> - unsigned num_vgprs; >> - unsigned spilled_sgprs; >> - unsigned spilled_vgprs; >> - unsigned private_mem_vgprs; >> - unsigned lds_size; >> - unsigned max_simd_waves; >> - unsigned spi_ps_input_ena; >> - unsigned spi_ps_input_addr; >> - unsigned float_mode; >> - unsigned scratch_bytes_per_wave; >> - unsigned rsrc1; >> - unsigned rsrc2; >> -}; >> - >> /* GCN-specific shader info. */ >> struct si_shader_info { >> ubyte vs_output_param_offset[SI_MAX_VS_OUTPUTS]; >> ubyte num_input_sgprs; >> ubyte num_input_vgprs; >> signed char face_vgpr_index; >> signed char ancillary_vgpr_index; >> bool uses_instanceid; >> ubyte nr_pos_exports; >> ubyte nr_param_exports; >> @@ -605,22 +589,24 @@ struct si_shader { >> struct si_shader_key key; >> struct util_queue_fence ready; >> bool compilation_failed; >> bool is_monolithic; >> bool is_optimized; >> bool is_binary_shared; >> bool is_gs_copy_shader; >> >> /* The following data is all that's needed for binary shaders. 
*/ >> struct ac_shader_binary binary; >> - struct si_shader_config config; >> + struct ac_shader_config config; >> struct si_shader_info info; >> + unsigned private_mem_vgprs; >> + unsigned max_simd_waves; >> > > The shader cache stores "config" but not these new members. > My updated version of the patch moves these two variables (private_mem_vgprs and max_simd_waves) to si_shader_info, which trivially resolves the concern. Marek
_______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev