For the remaining patches (3-6), it would be better to precompute the register values in struct si_shader, so that the emit functions are as cheap as possible.
Marek On Mon, Sep 24, 2018 at 10:51 PM, Marek Olšák <mar...@gmail.com> wrote: > On Tue, Sep 18, 2018 at 4:21 PM, Sonny Jiang <sonny.ji...@amd.com> wrote: >> Signed-off-by: Sonny Jiang <sonny.ji...@amd.com> >> --- >> src/gallium/drivers/radeonsi/si_gfx_cs.c | 15 ++- >> src/gallium/drivers/radeonsi/si_state.h | 19 ++- >> .../drivers/radeonsi/si_state_shaders.c | 112 ++++++++++++------ >> 3 files changed, 109 insertions(+), 37 deletions(-) >> >> diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c >> b/src/gallium/drivers/radeonsi/si_gfx_cs.c >> index 99376c4847..39a97bf3da 100644 >> --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c >> +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c >> @@ -352,9 +352,22 @@ void si_begin_new_gfx_cs(struct si_context *ctx) >> >> ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000; >> ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] >> = 0xffff; >> >> ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0x00000000; >> + >> ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0x00000000; >> + >> ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0x00000000; >> + >> ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3] = 0x00000000; >> + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_OUT_PRIM_TYPE] >> = 0x00000000; >> + >> ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE] = 0x00000000; >> + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT] >> = 0x00000000; >> + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE] >> = 0x00000000; >> + >> ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1] = 0x00000000; >> + >> ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0x00000000; >> + >> ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0x00000000; >> + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT] >> = 0x00000000; >> + ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL] >> = 0x00000000; >> + >> ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP] = >> 0x00000000; >> >> /* Set all saved registers state to saved. */ >> - ctx->tracked_regs.reg_saved = 0xffffffff; >> + ctx->tracked_regs.reg_saved = 0xffffffffffffffff; >> } else { >> /* Set all saved registers state to unknown. */ >> ctx->tracked_regs.reg_saved = 0; >> diff --git a/src/gallium/drivers/radeonsi/si_state.h >> b/src/gallium/drivers/radeonsi/si_state.h >> index 7fb09ca953..9efb4be5a5 100644 >> --- a/src/gallium/drivers/radeonsi/si_state.h >> +++ b/src/gallium/drivers/radeonsi/si_state.h >> @@ -279,11 +279,28 @@ enum si_tracked_reg { >> >> SI_TRACKED_VGT_ESGS_RING_ITEMSIZE, >> >> + SI_TRACKED_VGT_GSVS_RING_OFFSET_1, /* 4 consecutive registers */ >> + SI_TRACKED_VGT_GSVS_RING_OFFSET_2, >> + SI_TRACKED_VGT_GSVS_RING_OFFSET_3, >> + SI_TRACKED_VGT_GS_OUT_PRIM_TYPE, >> + >> + SI_TRACKED_VGT_GSVS_RING_ITEMSIZE, >> + SI_TRACKED_VGT_GS_MAX_VERT_OUT, >> + >> + SI_TRACKED_VGT_GS_VERT_ITEMSIZE, /* 4 consecutive registers */ >> + SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1, >> + SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2, >> + SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3, >> + >> + SI_TRACKED_VGT_GS_INSTANCE_CNT, >> + SI_TRACKED_VGT_GS_ONCHIP_CNTL, >> + SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP, >> + >> SI_NUM_TRACKED_REGS, >> }; >> >> struct si_tracked_regs { >> - uint32_t reg_saved; >> + uint64_t reg_saved; > > This is good, but the functions in si_build_pm4.h also need to be > updated. They compute the masks with "1 << ...", which results in a > 32-bit number. You need to use "1ull << ..." instead. > > >> uint32_t reg_value[SI_NUM_TRACKED_REGS]; >> uint32_t spi_ps_input_cntl[32]; >> }; >> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c >> b/src/gallium/drivers/radeonsi/si_state_shaders.c >> index 0716021d85..87b58cd6e5 100644 >> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c >> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c >> @@ -747,48 +747,99 @@ static void gfx9_get_gs_info(struct si_shader_selector >> *es, >> assert(out->max_prims_per_subgroup <= max_out_prims); >> } >> >> -static void si_shader_gs(struct si_screen *sscreen, struct si_shader >> *shader) >> +static void si_emit_shader_gs(struct si_context *sctx) >> { >> + struct si_shader *shader = sctx->queued.named.gs->shader; >> + if (!shader) >> + return; >> + >> struct si_shader_selector *sel = shader->selector; >> const ubyte *num_components = sel->info.num_stream_output_components; >> unsigned gs_num_invocations = sel->gs_num_invocations; >> - struct si_pm4_state *pm4; >> uint64_t va; >> unsigned max_stream = sel->max_gs_stream; >> - unsigned offset; >> + unsigned offset_1, offset_2, offset_3, vgt_gsvs_ring_itemsize; >> >> - pm4 = si_get_shader_pm4_state(shader); >> - if (!pm4) >> - return; >> - >> - offset = num_components[0] * sel->gs_max_out_vertices; >> - si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, offset); >> + offset_3 = offset_2 = offset_1 = num_components[0] * >> sel->gs_max_out_vertices; >> if (max_stream >= 1) >> - offset += num_components[1] * sel->gs_max_out_vertices; >> - si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, offset); >> + offset_3 = offset_2 += num_components[1] * >> sel->gs_max_out_vertices; >> if (max_stream >= 2) >> - offset += num_components[2] * sel->gs_max_out_vertices; >> - si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, offset); >> - si_pm4_set_reg(pm4, R_028A6C_VGT_GS_OUT_PRIM_TYPE, >> - si_conv_prim_to_gs_out(sel->gs_output_prim)); >> + offset_3 += num_components[2] * sel->gs_max_out_vertices; >> + >> + >> + /* R_028A60_VGT_GSVS_RING_OFFSET_1, R_028A64_VGT_GSVS_RING_OFFSET_2 >> + * R_028A68_VGT_GSVS_RING_OFFSET_3, R_028A6C_VGT_GS_OUT_PRIM_TYPE */ >> + radeon_opt_set_context_reg4(sctx, R_028A60_VGT_GSVS_RING_OFFSET_1, >> + SI_TRACKED_VGT_GSVS_RING_OFFSET_1, >> + offset_1, offset_2, offset_3, >> + >> si_conv_prim_to_gs_out(sel->gs_output_prim)); >> + >> + vgt_gsvs_ring_itemsize = offset_3; >> if (max_stream >= 3) >> - offset += num_components[3] * sel->gs_max_out_vertices; >> - si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset); >> + vgt_gsvs_ring_itemsize += num_components[3] * >> sel->gs_max_out_vertices; >> >> /* The GSVS_RING_ITEMSIZE register takes 15 bits */ >> - assert(offset < (1 << 15)); >> + assert(vgt_gsvs_ring_itemsize < (1 << 15)); >> + >> + /* R_028AB0_VGT_GSVS_RING_ITEMSIZE */ >> + radeon_opt_set_context_reg(sctx, R_028AB0_VGT_GSVS_RING_ITEMSIZE, >> + SI_TRACKED_VGT_GSVS_RING_ITEMSIZE, >> + vgt_gsvs_ring_itemsize); >> + >> + /* R_028B38_VGT_GS_MAX_VERT_OUT */ >> + radeon_opt_set_context_reg(sctx, R_028B38_VGT_GS_MAX_VERT_OUT, >> + SI_TRACKED_VGT_GS_MAX_VERT_OUT, >> + sel->gs_max_out_vertices); >> + >> + /* R_028B5C_VGT_GS_VERT_ITEMSIZE, R_028B60_VGT_GS_VERT_ITEMSIZE_1 >> + * R_028B64_VGT_GS_VERT_ITEMSIZE_2, R_028B68_VGT_GS_VERT_ITEMSIZE_3 >> */ >> + radeon_opt_set_context_reg4(sctx, R_028B5C_VGT_GS_VERT_ITEMSIZE, >> + SI_TRACKED_VGT_GS_VERT_ITEMSIZE, >> + num_components[0], >> + (max_stream >= 1) ? num_components[1] : >> 0, >> + (max_stream >= 2) ? num_components[2] : >> 0, >> + (max_stream >= 3) ? num_components[3] : >> 0); >> + >> + /* R_028B90_VGT_GS_INSTANCE_CNT */ >> + radeon_opt_set_context_reg(sctx, R_028B90_VGT_GS_INSTANCE_CNT, >> + SI_TRACKED_VGT_GS_INSTANCE_CNT, >> + S_028B90_CNT(MIN2(gs_num_invocations, >> 127)) | >> + S_028B90_ENABLE(gs_num_invocations > 0)); >> >> - si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, >> sel->gs_max_out_vertices); >> + va = shader->bo->gpu_address; >> >> - si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, >> num_components[0]); >> - si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= >> 1) ? num_components[1] : 0); >> - si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= >> 2) ? num_components[2] : 0); >> - si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= >> 3) ? num_components[3] : 0); >> + if (sctx->chip_class >= GFX9) { >> + struct gfx9_gs_info gs_info; >> + gfx9_get_gs_info(shader->key.part.gs.es, sel, &gs_info); > > This call is expensive. The register values should ideally be > precomputed in si_shader_gs and saved in struct si_shader, so that > emitting the values here is cheap. > > Marek _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev