From: Nicolai Hähnle <nicolai.haeh...@amd.com> This will make it easier to use LDS for other purposes in geometry shaders in the future.
The lifetime of the esgs_ring variable is as follows:
- it is declared as [0 x i32] while compiling shader parts or monolithic shaders
- just before uploading, gfx9_get_gs_info computes (among other things) the final ESGS ring size (this depends on both the ES and the GS shader)
- during upload, the "esgs_ring" symbol is given to ac_rtld as a shared LDS symbol, so that the LDS is laid out correctly, including any other LDS objects that may be defined in the future
- si_shader_gs uses shader->config.lds_size as the LDS size

This change depends on the LLVM changes for emitting LDS symbols into the ELF file.
--- src/gallium/drivers/radeonsi/si_shader.c | 82 +++++++++++++++---- src/gallium/drivers/radeonsi/si_shader.h | 19 +++++ .../drivers/radeonsi/si_state_shaders.c | 29 ++----- 3 files changed, 94 insertions(+), 36 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 6968038d4d0..f95a96f2458 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -1527,23 +1527,36 @@ LLVMValueRef si_llvm_load_input_gs(struct ac_shader_abi *abi, break; case 2: vtx_offset = si_unpack_param(ctx, ctx->param_gs_vtx45_offset, index % 2 ? 
16 : 0, 16); break; default: assert(0); return NULL; } + unsigned offset = param * 4 + swizzle; vtx_offset = LLVMBuildAdd(ctx->ac.builder, vtx_offset, - LLVMConstInt(ctx->i32, param * 4, 0), ""); - return lds_load(bld_base, type, swizzle, vtx_offset); + LLVMConstInt(ctx->i32, offset, false), ""); + + LLVMValueRef ptr = ac_build_gep0(&ctx->ac, ctx->esgs_ring, vtx_offset); + LLVMValueRef value = LLVMBuildLoad(ctx->ac.builder, ptr, ""); + if (llvm_type_is_64bit(ctx, type)) { + ptr = LLVMBuildGEP(ctx->ac.builder, ptr, + &ctx->ac.i32_1, 1, ""); + LLVMValueRef values[2] = { + value, + LLVMBuildLoad(ctx->ac.builder, ptr, "") + }; + value = ac_build_gather_values(&ctx->ac, values, 2); + } + return LLVMBuildBitCast(ctx->ac.builder, value, type, ""); } /* GFX6: input load from the ESGS ring in memory. */ if (swizzle == ~0) { LLVMValueRef values[TGSI_NUM_CHANNELS]; unsigned chan; for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) { values[chan] = si_llvm_load_input_gs(abi, input_index, vtx_offset_param, type, chan); } @@ -3424,21 +3437,23 @@ static void si_llvm_emit_es_epilogue(struct ac_shader_abi *abi, for (chan = 0; chan < 4; chan++) { if (!(info->output_usagemask[i] & (1 << chan))) continue; LLVMValueRef out_val = LLVMBuildLoad(ctx->ac.builder, addrs[4 * i + chan], ""); out_val = ac_to_integer(&ctx->ac, out_val); /* GFX9 has the ESGS ring in LDS. 
*/ if (ctx->screen->info.chip_class >= GFX9) { - lds_store(ctx, param * 4 + chan, lds_base, out_val); + LLVMValueRef idx = LLVMConstInt(ctx->i32, param * 4 + chan, false); + idx = LLVMBuildAdd(ctx->ac.builder, lds_base, idx, ""); + ac_build_indexed_store(&ctx->ac, ctx->esgs_ring, idx, out_val); continue; } ac_build_buffer_store_dword(&ctx->ac, ctx->esgs_ring, out_val, 1, NULL, soffset, (4 * param + chan) * 4, 1, 1, true, true); } } @@ -4828,47 +4843,62 @@ static void create_function(struct si_shader_context *ctx) for (i = 0; i < fninfo.num_sgpr_params; ++i) shader->info.num_input_sgprs += ac_get_type_size(fninfo.types[i]) / 4; for (; i < fninfo.num_params; ++i) shader->info.num_input_vgprs += ac_get_type_size(fninfo.types[i]) / 4; assert(shader->info.num_input_vgprs >= num_prolog_vgprs); shader->info.num_input_vgprs -= num_prolog_vgprs; - if (shader->key.as_ls || - ctx->type == PIPE_SHADER_TESS_CTRL || - /* GFX9 has the ESGS ring buffer in LDS. */ - type == SI_SHADER_MERGED_VERTEX_OR_TESSEVAL_GEOMETRY) + if (shader->key.as_ls || ctx->type == PIPE_SHADER_TESS_CTRL) ac_declare_lds_as_pointer(&ctx->ac); } /** * Load ESGS and GSVS ring buffer resource descriptors and save the variables * for later use. */ static void preload_ring_buffers(struct si_shader_context *ctx) { LLVMBuilderRef builder = ctx->ac.builder; LLVMValueRef buf_ptr = LLVMGetParam(ctx->main_fn, ctx->param_rw_buffers); - if (ctx->screen->info.chip_class <= VI && - (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY)) { - unsigned ring = - ctx->type == PIPE_SHADER_GEOMETRY ? SI_GS_RING_ESGS - : SI_ES_RING_ESGS; - LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0); + if (ctx->shader->key.as_es || ctx->type == PIPE_SHADER_GEOMETRY) { + if (ctx->screen->info.chip_class <= VI) { + unsigned ring = + ctx->type == PIPE_SHADER_GEOMETRY ? 
SI_GS_RING_ESGS + : SI_ES_RING_ESGS; + LLVMValueRef offset = LLVMConstInt(ctx->i32, ring, 0); - ctx->esgs_ring = - ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); + ctx->esgs_ring = + ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); + } else { + if (USE_LDS_SYMBOLS && HAVE_LLVM >= 0x0900) { + /* Declare the ESGS ring as an explicit LDS symbol. + * For monolithic shaders, we declare the ring only once. + * + * We declare it with 64KB alignment as a hint that the + * pointer value will always be 0. + */ + ctx->esgs_ring = LLVMAddGlobalInAddressSpace( + ctx->ac.module, LLVMArrayType(ctx->i32, 0), + "esgs_ring", + AC_ADDR_SPACE_LDS); + LLVMSetAlignment(ctx->esgs_ring, 64 * 1024); + } else { + ac_declare_lds_as_pointer(&ctx->ac); + ctx->esgs_ring = ctx->ac.lds; + } + } } if (ctx->shader->is_gs_copy_shader) { LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0); ctx->gsvs_ring[0] = ac_build_load_to_sgpr(&ctx->ac, buf_ptr, offset); } else if (ctx->type == PIPE_SHADER_GEOMETRY) { const struct si_shader_selector *sel = ctx->shader->selector; LLVMValueRef offset = LLVMConstInt(ctx->i32, SI_RING_GSVS, 0); @@ -4972,44 +5002,61 @@ static void si_llvm_emit_polygon_stipple(struct si_shader_context *ctx, } /* For the UMR disassembler. 
*/ #define DEBUGGER_END_OF_CODE_MARKER 0xbf9f0000 /* invalid instruction */ #define DEBUGGER_NUM_MARKERS 5 static bool si_shader_binary_open(struct si_screen *screen, struct si_shader *shader, struct ac_rtld_binary *rtld) { + const struct si_shader_selector *sel = shader->selector; const char *part_elfs[5]; uint64_t part_sizes[5]; unsigned num_parts = 0; #define add_part(shader_or_part) \ if (shader_or_part) { \ part_elfs[num_parts] = (shader_or_part)->binary.elf_buffer; \ part_sizes[num_parts] = (shader_or_part)->binary.elf_size; \ num_parts++; \ } add_part(shader->prolog); add_part(shader->previous_stage); add_part(shader->prolog2); add_part(shader); add_part(shader->epilog); #undef add_part + struct ac_rtld_symbol lds_symbols[1]; + unsigned num_lds_symbols = 0; + + if (sel && screen->info.chip_class >= GFX9 && + sel->type == PIPE_SHADER_GEOMETRY && !shader->is_gs_copy_shader) { + /* We add this symbol even on LLVM <= 8 to ensure that + * shader->config.lds_size is set correctly below. + */ + struct ac_rtld_symbol *sym = &lds_symbols[num_lds_symbols++]; + sym->name = "esgs_ring"; + sym->size = shader->gs_info.esgs_ring_size; + sym->align = 64 * 1024; + } + bool ok = ac_rtld_open(rtld, (struct ac_rtld_open_info){ .info = &screen->info, .num_parts = num_parts, .elf_ptrs = part_elfs, - .elf_sizes = part_sizes }); + .elf_sizes = part_sizes, + .num_shared_lds_symbols = num_lds_symbols, + .shared_lds_symbols = lds_symbols }); if (rtld->lds_size > 0) { unsigned alloc_granularity = screen->info.chip_class >= CIK ? 
512 : 256; shader->config.lds_size = align(rtld->lds_size, alloc_granularity) / alloc_granularity; } return ok; } @@ -7899,20 +7946,23 @@ bool si_shader_create(struct si_screen *sscreen, struct ac_llvm_compiler *compil } if (shader->epilog) { shader->config.num_sgprs = MAX2(shader->config.num_sgprs, shader->epilog->config.num_sgprs); shader->config.num_vgprs = MAX2(shader->config.num_vgprs, shader->epilog->config.num_vgprs); } si_calculate_max_simd_waves(shader); } + if (sscreen->info.chip_class >= GFX9 && sel->type == PIPE_SHADER_GEOMETRY) + gfx9_get_gs_info(shader->previous_stage_sel, sel, &shader->gs_info); + si_fix_resource_usage(sscreen, shader); si_shader_dump(sscreen, shader, debug, sel->info.processor, stderr, true); /* Upload. */ if (!si_shader_binary_upload(sscreen, shader, 0)) { fprintf(stderr, "LLVM failed to upload shader\n"); return false; } diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 0d018cef2d2..84966f92eb5 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -136,20 +136,24 @@ #include "tgsi/tgsi_scan.h" #include "util/u_inlines.h" #include "util/u_queue.h" #include "ac_binary.h" #include "ac_llvm_build.h" #include "ac_llvm_util.h" #include <stdio.h> +// Use LDS symbols when supported by LLVM. Can be disabled for testing the old +// path on newer LLVM for now. Should be removed in the long term. +#define USE_LDS_SYMBOLS (true) + struct nir_shader; struct si_shader; struct si_context; #define SI_MAX_ATTRIBS 16 #define SI_MAX_VS_OUTPUTS 40 /* Shader IO unique indices are supported for TGSI_SEMANTIC_GENERIC with an * index smaller than this. 
*/ @@ -571,20 +575,28 @@ struct si_shader_info { ubyte nr_param_exports; }; struct si_shader_binary { const char *elf_buffer; uint64_t elf_size; char *llvm_ir_string; }; +struct gfx9_gs_info { + unsigned es_verts_per_subgroup; + unsigned gs_prims_per_subgroup; + unsigned gs_inst_prims_in_subgroup; + unsigned max_prims_per_subgroup; + unsigned esgs_ring_size; /* in bytes */ +}; + struct si_shader { struct si_compiler_ctx_state compiler_ctx_state; struct si_shader_selector *selector; struct si_shader_selector *previous_stage_sel; /* for refcounting */ struct si_shader *next_variant; struct si_shader_part *prolog; struct si_shader *previous_stage; /* for GFX9 */ struct si_shader_part *prolog2; @@ -607,20 +619,22 @@ struct si_shader { struct si_shader_info info; unsigned private_mem_vgprs; unsigned max_simd_waves; /* Shader key + LLVM IR + disassembly + statistics. * Generated for debug contexts only. */ char *shader_log; size_t shader_log_size; + struct gfx9_gs_info gs_info; + /* For save precompute context registers values. */ union { struct { unsigned vgt_gsvs_ring_offset_1; unsigned vgt_gsvs_ring_offset_2; unsigned vgt_gsvs_ring_offset_3; unsigned vgt_gs_out_prim_type; unsigned vgt_gsvs_ring_itemsize; unsigned vgt_gs_max_vert_out; unsigned vgt_gs_vert_itemsize; @@ -695,20 +709,25 @@ void si_multiwave_lds_size_workaround(struct si_screen *sscreen, const char *si_get_shader_name(const struct si_shader *shader, unsigned processor); void si_shader_binary_clean(struct si_shader_binary *binary); /* si_shader_nir.c */ void si_nir_scan_shader(const struct nir_shader *nir, struct tgsi_shader_info *info); void si_nir_scan_tess_ctrl(const struct nir_shader *nir, struct tgsi_tessctrl_info *out); void si_lower_nir(struct si_shader_selector *sel); +/* si_state_shaders.c */ +void gfx9_get_gs_info(struct si_shader_selector *es, + struct si_shader_selector *gs, + struct gfx9_gs_info *out); + /* Inline helpers. */ /* Return the pointer to the main shader part's pointer. 
*/ static inline struct si_shader ** si_get_main_shader_part(struct si_shader_selector *sel, struct si_shader_key *key) { if (key->as_ls) return &sel->main_shader_part_ls; if (key->as_es) diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index aec818c23a7..4db6b069a38 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -639,31 +639,23 @@ static unsigned si_conv_prim_to_gs_out(unsigned mode) [PIPE_PRIM_LINE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_LINESTRIP, [PIPE_PRIM_TRIANGLES_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = V_028A6C_OUTPRIM_TYPE_TRISTRIP, [PIPE_PRIM_PATCHES] = V_028A6C_OUTPRIM_TYPE_POINTLIST, }; assert(mode < ARRAY_SIZE(prim_conv)); return prim_conv[mode]; } -struct gfx9_gs_info { - unsigned es_verts_per_subgroup; - unsigned gs_prims_per_subgroup; - unsigned gs_inst_prims_in_subgroup; - unsigned max_prims_per_subgroup; - unsigned lds_size; -}; - -static void gfx9_get_gs_info(struct si_shader_selector *es, - struct si_shader_selector *gs, - struct gfx9_gs_info *out) +void gfx9_get_gs_info(struct si_shader_selector *es, + struct si_shader_selector *gs, + struct gfx9_gs_info *out) { unsigned gs_num_invocations = MAX2(gs->gs_num_invocations, 1); unsigned input_prim = gs->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]; bool uses_adjacency = input_prim >= PIPE_PRIM_LINES_ADJACENCY && input_prim <= PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY; /* All these are in dwords: */ /* We can't allow using the whole LDS, because GS waves compete with * other shader stages for LDS space. */ const unsigned max_lds_size = 8 * 1024; @@ -740,21 +732,21 @@ static void gfx9_get_gs_info(struct si_shader_selector *es, * unique (e.g. not reused) we need to make sure there is enough LDS * space to account for those ES verts beyond ES_VERTS_PER_SUBGRP. 
*/ es_verts -= min_es_verts - 1; out->es_verts_per_subgroup = es_verts; out->gs_prims_per_subgroup = gs_prims; out->gs_inst_prims_in_subgroup = gs_prims * gs_num_invocations; out->max_prims_per_subgroup = out->gs_inst_prims_in_subgroup * gs->gs_max_out_vertices; - out->lds_size = align(esgs_lds_size, 128) / 128; + out->esgs_ring_size = 4 * esgs_lds_size; assert(out->max_prims_per_subgroup <= max_out_prims); } static void si_emit_shader_gs(struct si_context *sctx) { struct si_shader *shader = sctx->queued.named.gs->shader; unsigned initial_cdw = sctx->gfx_cs->current.cdw; if (!shader) @@ -869,21 +861,20 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) shader->ctx_reg.gs.vgt_gs_instance_cnt = S_028B90_CNT(MIN2(gs_num_invocations, 127)) | S_028B90_ENABLE(gs_num_invocations > 0); va = shader->bo->gpu_address; si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_BINARY); if (sscreen->info.chip_class >= GFX9) { unsigned input_prim = sel->info.properties[TGSI_PROPERTY_GS_INPUT_PRIM]; unsigned es_type = shader->key.part.gs.es->type; unsigned es_vgpr_comp_cnt, gs_vgpr_comp_cnt; - struct gfx9_gs_info gs_info; if (es_type == PIPE_SHADER_VERTEX) /* VGPR0-3: (VertexID, InstanceID / StepRate0, ...) */ es_vgpr_comp_cnt = shader->info.uses_instanceid ? 1 : 0; else if (es_type == PIPE_SHADER_TESS_EVAL) es_vgpr_comp_cnt = shader->key.part.gs.es->info.uses_primid ? 
3 : 2; else unreachable("invalid shader selector type"); /* If offsets 4, 5 are used, GS_VGPR_COMP_CNT is ignored and @@ -897,45 +888,43 @@ static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader) gs_vgpr_comp_cnt = 1; /* VGPR1 contains offsets 2, 3 */ else gs_vgpr_comp_cnt = 0; /* VGPR0 contains offsets 0, 1 */ unsigned num_user_sgprs; if (es_type == PIPE_SHADER_VERTEX) num_user_sgprs = si_get_num_vs_user_sgprs(GFX9_VSGS_NUM_USER_SGPR); else num_user_sgprs = GFX9_TESGS_NUM_USER_SGPR; - gfx9_get_gs_info(shader->key.part.gs.es, sel, &gs_info); - si_pm4_set_reg(pm4, R_00B210_SPI_SHADER_PGM_LO_ES, va >> 8); si_pm4_set_reg(pm4, R_00B214_SPI_SHADER_PGM_HI_ES, S_00B214_MEM_BASE(va >> 40)); si_pm4_set_reg(pm4, R_00B228_SPI_SHADER_PGM_RSRC1_GS, S_00B228_VGPRS((shader->config.num_vgprs - 1) / 4) | S_00B228_SGPRS((shader->config.num_sgprs - 1) / 8) | S_00B228_DX10_CLAMP(1) | S_00B228_FLOAT_MODE(shader->config.float_mode) | S_00B228_GS_VGPR_COMP_CNT(gs_vgpr_comp_cnt)); si_pm4_set_reg(pm4, R_00B22C_SPI_SHADER_PGM_RSRC2_GS, S_00B22C_USER_SGPR(num_user_sgprs) | S_00B22C_USER_SGPR_MSB(num_user_sgprs >> 5) | S_00B22C_ES_VGPR_COMP_CNT(es_vgpr_comp_cnt) | S_00B22C_OC_LDS_EN(es_type == PIPE_SHADER_TESS_EVAL) | - S_00B22C_LDS_SIZE(gs_info.lds_size) | + S_00B22C_LDS_SIZE(shader->config.lds_size) | S_00B22C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0)); shader->ctx_reg.gs.vgt_gs_onchip_cntl = - S_028A44_ES_VERTS_PER_SUBGRP(gs_info.es_verts_per_subgroup) | - S_028A44_GS_PRIMS_PER_SUBGRP(gs_info.gs_prims_per_subgroup) | - S_028A44_GS_INST_PRIMS_IN_SUBGRP(gs_info.gs_inst_prims_in_subgroup); + S_028A44_ES_VERTS_PER_SUBGRP(shader->gs_info.es_verts_per_subgroup) | + S_028A44_GS_PRIMS_PER_SUBGRP(shader->gs_info.gs_prims_per_subgroup) | + S_028A44_GS_INST_PRIMS_IN_SUBGRP(shader->gs_info.gs_inst_prims_in_subgroup); shader->ctx_reg.gs.vgt_gs_max_prims_per_subgroup = - S_028A94_MAX_PRIMS_PER_SUBGROUP(gs_info.max_prims_per_subgroup); + 
S_028A94_MAX_PRIMS_PER_SUBGROUP(shader->gs_info.max_prims_per_subgroup); shader->ctx_reg.gs.vgt_esgs_ring_itemsize = shader->key.part.gs.es->esgs_itemsize / 4; if (es_type == PIPE_SHADER_TESS_EVAL) si_set_tesseval_regs(sscreen, shader->key.part.gs.es, pm4); polaris_set_vgt_vertex_reuse(sscreen, shader->key.part.gs.es, NULL, pm4); } else { si_pm4_set_reg(pm4, R_00B220_SPI_SHADER_PGM_LO_GS, va >> 8); -- 2.20.1 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev