Reviewed-by: Marek Olšák <mar...@gmail.com> Marek
On Wed, Jun 27, 2012 at 5:37 PM, Tom Stellard <tstel...@gmail.com> wrote: > The start_compute_cs atom initializes some config and context registers > to the values needed for running compute shaders. When a compute shader > is dispatched, this atom is emitted after the start_cs_cmd atom, which > initializes registers that are common to both 3D and compute. > --- > src/gallium/drivers/r600/evergreen_compute.c | 184 > +++++++++++++------------- > src/gallium/drivers/r600/evergreen_compute.h | 2 +- > src/gallium/drivers/r600/r600_pipe.c | 1 + > src/gallium/drivers/r600/r600_pipe.h | 3 + > 4 files changed, 96 insertions(+), 94 deletions(-) > > diff --git a/src/gallium/drivers/r600/evergreen_compute.c > b/src/gallium/drivers/r600/evergreen_compute.c > index 1f2ba40..eaffb75 100644 > --- a/src/gallium/drivers/r600/evergreen_compute.c > +++ b/src/gallium/drivers/r600/evergreen_compute.c > @@ -163,8 +163,6 @@ static void evergreen_bind_compute_state(struct > pipe_context *ctx_, void *state) > > } > > - evergreen_compute_init_config(ctx); > - > struct evergreen_compute_resource* res = get_empty_res(ctx->cs_shader, > COMPUTE_RESOURCE_SHADER, 0); > > @@ -331,9 +329,24 @@ static void compute_emit_cs(struct r600_context *ctx) > struct radeon_winsys_cs *cs = ctx->cs; > int i; > > + struct r600_resource *onebo = NULL; > + > + /* Initialize all the registers common to both 3D and compute. Some > + * 3D only register will be initialized by this atom as well, but > + * this is OK for now. > + * > + * See evergreen_init_atom_start_cs() or cayman_init_atom_start_cs() > in > + * evergreen_state.c for the list of registers that are intialized by > + * the start_cs_cmd atom. > + */ > r600_emit_atom(ctx, &ctx->start_cs_cmd.atom); > > - struct r600_resource *onebo = NULL; > + /* Initialize all the compute specific registers. > + * > + * See evergreen_init_atom_start_compute_cs() in this file for the > list > + * of registers initialized by the start_compuet_cs_cmd atom. > + */ > + r600_emit_atom(ctx, &ctx->start_compute_cs_cmd.atom); > > for (i = 0; i < get_compute_resource_num(); i++) { > if (ctx->cs_shader->resources[i].enabled) { > @@ -520,120 +533,128 @@ static void evergreen_set_global_binding( > evergreen_set_vtx_resource(ctx->cs_shader, pool->bo, 1, 0, 1); > } > > - > -void evergreen_compute_init_config(struct r600_context *ctx) > +/** > + * This function initializes all the compute specific registers that need to > + * be initialized for each compute command stream. Registers that are common > + * to both compute and 3D will be initialized at the beginning of each > compute > + * command stream by the start_cs_cmd atom. However, since the > SET_CONTEXT_REG > + * packet requires that the shader type bit be set, we must initialize all > + * context registers needed for compute in this function. The registers > + * intialized by the start_cs_cmd atom can be found in evereen_state.c in the > + * functions evergreen_init_atom_start_cs or cayman_init_atom_start_cs > depending > + * on the GPU family. > + */ > +void evergreen_init_atom_start_compute_cs(struct r600_context *ctx) > { > - struct evergreen_compute_resource* res = > - get_empty_res(ctx->cs_shader, COMPUTE_RESOURCE_CONFIG, 0); > - > + struct r600_command_buffer *cb = &ctx->start_compute_cs_cmd; > int num_threads; > int num_stack_entries; > - int num_temp_gprs; > > - enum radeon_family family; > - unsigned tmp; > - > - family = ctx->family; > + /* We aren't passing the EMIT_EARLY flag as the third argument > + * because we will be emitting this atom manually in order to > + * ensure it gets emitted after the start_cs_cmd atom. > + */ > + r600_init_command_buffer(cb, 256, 0); > + cb->pkt_flags = RADEON_CP_PACKET3_COMPUTE_MODE; > > - switch (family) { > + switch (ctx->family) { > case CHIP_CEDAR: > default: > - num_temp_gprs = 4; > num_threads = 128; > num_stack_entries = 256; > break; > case CHIP_REDWOOD: > - num_temp_gprs = 4; > num_threads = 128; > num_stack_entries = 256; > break; > case CHIP_JUNIPER: > - num_temp_gprs = 4; > num_threads = 128; > num_stack_entries = 512; > break; > case CHIP_CYPRESS: > case CHIP_HEMLOCK: > - num_temp_gprs = 4; > num_threads = 128; > num_stack_entries = 512; > break; > case CHIP_PALM: > - num_temp_gprs = 4; > num_threads = 128; > num_stack_entries = 256; > break; > case CHIP_SUMO: > - num_temp_gprs = 4; > num_threads = 128; > num_stack_entries = 256; > break; > case CHIP_SUMO2: > - num_temp_gprs = 4; > num_threads = 128; > num_stack_entries = 512; > break; > case CHIP_BARTS: > - num_temp_gprs = 4; > num_threads = 128; > num_stack_entries = 512; > break; > case CHIP_TURKS: > - num_temp_gprs = 4; > num_threads = 128; > num_stack_entries = 256; > break; > case CHIP_CAICOS: > - num_temp_gprs = 4; > num_threads = 128; > num_stack_entries = 256; > break; > } > > - tmp = 0x00000000; > - switch (family) { > - case CHIP_CEDAR: > - case CHIP_PALM: > - case CHIP_SUMO: > - case CHIP_SUMO2: > - case CHIP_CAICOS: > - break; > - default: > - tmp |= S_008C00_VC_ENABLE(1); > - break; > - } > - tmp |= S_008C00_EXPORT_SRC_C(1); > - tmp |= S_008C00_CS_PRIO(0); > - tmp |= S_008C00_LS_PRIO(0); > - tmp |= S_008C00_HS_PRIO(0); > - tmp |= S_008C00_PS_PRIO(0); > - tmp |= S_008C00_VS_PRIO(0); > - tmp |= S_008C00_GS_PRIO(0); > - tmp |= S_008C00_ES_PRIO(0); > - > - evergreen_reg_set(res, R_008C00_SQ_CONFIG, tmp); > - > - evergreen_reg_set(res, R_008C04_SQ_GPR_RESOURCE_MGMT_1, > - S_008C04_NUM_CLAUSE_TEMP_GPRS(num_temp_gprs)); > + /* Config Registers */ > if (ctx->chip_class < CAYMAN) { > - evergreen_reg_set(res, R_008C08_SQ_GPR_RESOURCE_MGMT_2, 0); > + > + /* These registers control which simds can be used by each > stage. > + * The default for these registers is 0xffffffff, which means > + * all simds are available for each stage. It's possible we > may > + * want to play around with these in the future, but for now > + * the default value is fine. > + * > + * R_008E20_SQ_STATIC_THREAD_MGMT1 > + * R_008E24_SQ_STATIC_THREAD_MGMT2 > + * R_008E28_SQ_STATIC_THREAD_MGMT3 > + */ > + > + /* XXX: We may need to adjust the thread and stack resouce > + * values for 3D/compute interop */ > + > + r600_store_config_reg_seq(cb, > R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 5); > + > + /* R_008C18_SQ_THREAD_RESOURCE_MGMT_1 > + * Set the number of threads used by the PS/VS/GS/ES stage to > + * 0. > + */ > + r600_store_value(cb, 0); > + > + /* R_008C1C_SQ_THREAD_RESOURCE_MGMT_2 > + * Set the number of threads used by the CS (aka LS) stage to > + * the maximum number of threads and set the number of threads > + * for the HS stage to 0. */ > + r600_store_value(cb, S_008C1C_NUM_LS_THREADS(num_threads)); > + > + /* R_008C20_SQ_STACK_RESOURCE_MGMT_1 > + * Set the Control Flow stack entries to 0 for PS/VS stages */ > + r600_store_value(cb, 0); > + > + /* R_008C24_SQ_STACK_RESOURCE_MGMT_2 > + * Set the Control Flow stack entries to 0 for GS/ES stages */ > + r600_store_value(cb, 0); > + > + /* R_008C28_SQ_STACK_RESOURCE_MGMT_3 > + * Set the Contol Flow stack entries to 0 for the HS stage, > and > + * set it to the maximum value for the CS (aka LS) stage. */ > + r600_store_value(cb, > + S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries)); > } > - evergreen_reg_set(res, R_008C10_SQ_GLOBAL_GPR_RESOURCE_MGMT_1, 0); > - evergreen_reg_set(res, R_008C14_SQ_GLOBAL_GPR_RESOURCE_MGMT_2, 0); > - evergreen_reg_set(res, R_008D8C_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ, (1 << > 8)); > > - /* workaround for hw issues with dyn gpr - must set all limits to 240 > - * instead of 0, 0x1e == 240/8 */ > + /* Context Registers */ > + > if (ctx->chip_class < CAYMAN) { > - evergreen_reg_set(res, R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1, > - S_028838_PS_GPRS(0x1e) | > - S_028838_VS_GPRS(0x1e) | > - S_028838_GS_GPRS(0x1e) | > - S_028838_ES_GPRS(0x1e) | > - S_028838_HS_GPRS(0x1e) | > - S_028838_LS_GPRS(0x1e)); > - } else { > - evergreen_reg_set(res, 0x286f8, > + /* workaround for hw issues with dyn gpr - must set all limits > + * to 240 instead of 0, 0x1e == 240 / 8 > + */ > + r600_store_context_reg(cb, > R_028838_SQ_DYN_GPR_RESOURCE_LIMIT_1, > S_028838_PS_GPRS(0x1e) | > S_028838_VS_GPRS(0x1e) | > S_028838_GS_GPRS(0x1e) | > @@ -642,36 +663,13 @@ void evergreen_compute_init_config(struct r600_context > *ctx) > S_028838_LS_GPRS(0x1e)); > } > > - if (ctx->chip_class < CAYMAN) { > + /* XXX: Investigate setting bit 15, which is FAST_COMPUTE_MODE */ > + r600_store_context_reg(cb, R_028A40_VGT_GS_MODE, > + S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1)); > > - evergreen_reg_set(res, R_008E20_SQ_STATIC_THREAD_MGMT1, > 0xFFFFFFFF); > - evergreen_reg_set(res, R_008E24_SQ_STATIC_THREAD_MGMT2, > 0xFFFFFFFF); > - evergreen_reg_set(res, R_008E20_SQ_STATIC_THREAD_MGMT1, > 0xFFFFFFFF); > - evergreen_reg_set(res, R_008E24_SQ_STATIC_THREAD_MGMT2, > 0xFFFFFFFF); > - evergreen_reg_set(res, R_008E28_SQ_STATIC_THREAD_MGMT3, > 0xFFFFFFFF); > - evergreen_reg_set(res, R_008C18_SQ_THREAD_RESOURCE_MGMT_1, 0); > - tmp = S_008C1C_NUM_LS_THREADS(num_threads); > - evergreen_reg_set(res, R_008C1C_SQ_THREAD_RESOURCE_MGMT_2, > tmp); > - evergreen_reg_set(res, R_008C20_SQ_STACK_RESOURCE_MGMT_1, 0); > - evergreen_reg_set(res, R_008C24_SQ_STACK_RESOURCE_MGMT_2, 0); > - tmp = S_008C28_NUM_LS_STACK_ENTRIES(num_stack_entries); > - evergreen_reg_set(res, R_008C28_SQ_STACK_RESOURCE_MGMT_3, > tmp); > - } > - evergreen_reg_set(res, R_0286CC_SPI_PS_IN_CONTROL_0, > S_0286CC_LINEAR_GRADIENT_ENA(1)); > - evergreen_reg_set(res, R_0286D0_SPI_PS_IN_CONTROL_1, 0); > - evergreen_reg_set(res, R_0286E4_SPI_PS_IN_CONTROL_2, 0); > - evergreen_reg_set(res, R_0286D8_SPI_INPUT_Z, 0); > - evergreen_reg_set(res, R_0286E0_SPI_BARYC_CNTL, 1 << 20); > - tmp = S_0286E8_TID_IN_GROUP_ENA | S_0286E8_TGID_ENA | > S_0286E8_DISABLE_INDEX_PACK; > - evergreen_reg_set(res, R_0286E8_SPI_COMPUTE_INPUT_CNTL, tmp); > - tmp = S_028A40_COMPUTE_MODE(1) | S_028A40_PARTIAL_THD_AT_EOI(1); > - evergreen_reg_set(res, R_028A40_VGT_GS_MODE, tmp); > - evergreen_reg_set(res, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/); > - evergreen_reg_set(res, R_028800_DB_DEPTH_CONTROL, 0); > - evergreen_reg_set(res, R_02880C_DB_SHADER_CONTROL, 0); > - evergreen_reg_set(res, R_028000_DB_RENDER_CONTROL, > S_028000_COLOR_DISABLE(1)); > - evergreen_reg_set(res, R_02800C_DB_RENDER_OVERRIDE, 0); > - evergreen_reg_set(res, R_0286E8_SPI_COMPUTE_INPUT_CNTL, > + r600_store_context_reg(cb, R_028B54_VGT_SHADER_STAGES_EN, 2/*CS_ON*/); > + > + r600_store_context_reg(cb, R_0286E8_SPI_COMPUTE_INPUT_CNTL, > S_0286E8_TID_IN_GROUP_ENA > | S_0286E8_TGID_ENA > | S_0286E8_DISABLE_INDEX_PACK) > diff --git a/src/gallium/drivers/r600/evergreen_compute.h > b/src/gallium/drivers/r600/evergreen_compute.h > index a0881cd..809114d 100644 > --- a/src/gallium/drivers/r600/evergreen_compute.h > +++ b/src/gallium/drivers/r600/evergreen_compute.h > @@ -35,7 +35,7 @@ void *evergreen_create_compute_state(struct pipe_context > *ctx, const const struc > void evergreen_delete_compute_state(struct pipe_context *ctx, void *state); > void evergreen_direct_dispatch( struct pipe_context *context, const uint > *block_layout, const uint *grid_layout); > void evergreen_compute_upload_input(struct pipe_context *context, const uint > *block_layout, const uint *grid_layout, const void *input); > -void evergreen_compute_init_config(struct r600_context *rctx); > +void evergreen_init_atom_start_compute_cs(struct r600_context *rctx); > void evergreen_init_compute_state_functions(struct r600_context *rctx); > > struct pipe_resource *r600_compute_global_buffer_create(struct pipe_screen > *screen, const struct pipe_resource *templ); > diff --git a/src/gallium/drivers/r600/r600_pipe.c > b/src/gallium/drivers/r600/r600_pipe.c > index 435aa77..deddfb2 100644 > --- a/src/gallium/drivers/r600/r600_pipe.c > +++ b/src/gallium/drivers/r600/r600_pipe.c > @@ -251,6 +251,7 @@ static struct pipe_context *r600_create_context(struct > pipe_screen *screen, void > case CAYMAN: > evergreen_init_state_functions(rctx); > evergreen_init_atom_start_cs(rctx); > + evergreen_init_atom_start_compute_cs(rctx); > if (evergreen_context_init(rctx)) > goto fail; > rctx->custom_dsa_flush = evergreen_create_db_flush_dsa(rctx); > diff --git a/src/gallium/drivers/r600/r600_pipe.h > b/src/gallium/drivers/r600/r600_pipe.h > index cd67ee1..63eeab3 100644 > --- a/src/gallium/drivers/r600/r600_pipe.h > +++ b/src/gallium/drivers/r600/r600_pipe.h > @@ -324,6 +324,9 @@ struct r600_context { > /* States based on r600_atom. */ > struct list_head dirty_states; > struct r600_command_buffer start_cs_cmd; /* invariant state > mostly */ > + /** Compute specific registers initializations. The start_cs_cmd atom > + * must be emitted before start_compute_cs_cmd. */ > + struct r600_command_buffer start_compute_cs_cmd; > struct r600_surface_sync_cmd surface_sync_cmd; > struct r600_atom r6xx_flush_and_inv_cmd; > struct r600_db_misc_state db_misc_state; > -- > 1.7.7.6 > > _______________________________________________ > mesa-dev mailing list > mesa-dev@lists.freedesktop.org > http://lists.freedesktop.org/mailman/listinfo/mesa-dev _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org http://lists.freedesktop.org/mailman/listinfo/mesa-dev