From: Marek Olšák <marek.ol...@amd.com>
---
 src/gallium/drivers/radeonsi/si_pipe.c       | 20 +++++++++++++
 src/gallium/drivers/radeonsi/si_pipe.h       |  1 +
 src/gallium/drivers/radeonsi/si_state_draw.c | 45 ++++++++++++++++++++--------
 3 files changed, 54 insertions(+), 12 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 1a83564..53a8201 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -29,20 +29,29 @@ #include "radeon/radeon_uvd.h" #include "util/u_memory.h" #include "util/u_suballoc.h" #include "util/u_tests.h" #include "vl/vl_decoder.h" #include "../ddebug/dd_util.h" #define SI_LLVM_DEFAULT_FEATURES \ "+DumpCode,+vgpr-spilling,-fp32-denormals,-xnack" +/* DX10/11 apply primitive restart to strip primitive types only. */ +static const char *apps_with_prim_restart_dx_behavior[] = { + "DeusExMD", + "DirtRally", + "HitmanPro", + "MadMax", + "TotalWarhammer", +}; + /* * pipe_context */ static void si_destroy_context(struct pipe_context *context) { struct si_context *sctx = (struct si_context *)context; int i; /* Unreference the framebuffer normally to disable related logic * properly. @@ -306,20 +315,31 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, * * The recommended value is 4 per CU at most. Higher numbers don't * bring much benefit, but they still occupy chip resources (think * async compute). I've seen ~2% performance difference between 4 and 32. */ sctx->scratch_waves = MAX2(32 * sscreen->b.info.num_good_compute_units, max_threads_per_block / 64); sctx->tm = si_create_llvm_target_machine(sscreen); + /* Process the app list. 
*/ + char process_name[128]; + if (os_get_process_name(process_name, sizeof(process_name))) { + for (i = 0; i < ARRAY_SIZE(apps_with_prim_restart_dx_behavior); i++) { + if (strcmp(process_name, apps_with_prim_restart_dx_behavior[i]) == 0) { + sctx->use_prim_restart_dx_behavior = true; + break; + } + } + } + return &sctx->b.b; fail: fprintf(stderr, "radeonsi: Failed to create a context.\n"); si_destroy_context(&sctx->b.b); return NULL; } /* * pipe_screen */ diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index ea61e1e..1edcfbc 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -236,20 +236,21 @@ struct si_context { struct radeon_winsys_cs *ce_ib; struct radeon_winsys_cs *ce_preamble_ib; bool ce_need_synchronization; struct u_suballocator *ce_suballocator; struct si_shader_ctx_state fixed_func_tcs_shader; LLVMTargetMachineRef tm; /* only non-threaded compilation */ bool gfx_flush_in_progress; bool compute_is_busy; + bool use_prim_restart_dx_behavior; /* Atoms (direct states). */ union si_state_atoms atoms; unsigned dirty_atoms; /* mask */ /* PM4 states (precomputed immutable states) */ unsigned dirty_states; union si_state queued; union si_state emitted; /* Atom declarations. 
*/ diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index e6a9ee0..319160e 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -445,42 +445,43 @@ void si_init_ia_multi_vgt_param_table(struct si_context *sctx) key.u.tcs_tes_uses_prim_id = tess_uses_primid; key.u.uses_gs = uses_gs; sctx->ia_multi_vgt_param[key.index] = si_get_init_multi_vgt_param(sctx->screen, &key); } } static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx, const struct pipe_draw_info *info, - unsigned num_patches) + unsigned num_patches, + bool primitive_restart) { union si_vgt_param_key key = sctx->ia_multi_vgt_param_key; unsigned primgroup_size; unsigned ia_multi_vgt_param; if (sctx->tes_shader.cso) { primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */ } else if (sctx->gs_shader.cso) { primgroup_size = 64; /* recommended with a GS */ } else { primgroup_size = 128; /* recommended without a GS and tess */ } key.u.prim = info->mode; key.u.uses_instancing = info->indirect || info->instance_count > 1; key.u.multi_instances_smaller_than_primgroup = info->indirect || (info->instance_count > 1 && (info->count_from_stream_output || si_num_prims_for_vertices(info) < primgroup_size)); - key.u.primitive_restart = info->primitive_restart; + key.u.primitive_restart = primitive_restart; key.u.count_from_stream_output = info->count_from_stream_output != NULL; ia_multi_vgt_param = sctx->ia_multi_vgt_param[key.index] | S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1); if (sctx->gs_shader.cso) { /* GS requirement. 
*/ if (sctx->b.chip_class <= VI && SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3) ia_multi_vgt_param |= S_028AA8_PARTIAL_ES_WAVE_ON(1); @@ -544,28 +545,30 @@ static void si_emit_vs_state(struct si_context *sctx, sctx->shader_userdata.sh_base[PIPE_SHADER_VERTEX] + SI_SGPR_VS_STATE_BITS * 4, sctx->current_vs_state); sctx->last_vs_state = sctx->current_vs_state; } } static void si_emit_draw_registers(struct si_context *sctx, const struct pipe_draw_info *info, - unsigned num_patches) + unsigned num_patches, + bool primitive_restart) { struct radeon_winsys_cs *cs = sctx->b.gfx.cs; unsigned prim = si_conv_pipe_prim(info->mode); unsigned gs_out_prim = si_conv_prim_to_gs_out(sctx->current_rast_prim); unsigned ia_multi_vgt_param; - ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, num_patches); + ia_multi_vgt_param = si_get_ia_multi_vgt_param(sctx, info, num_patches, + primitive_restart); /* Draw state. */ if (ia_multi_vgt_param != sctx->last_multi_vgt_param) { if (sctx->b.chip_class >= GFX9) radeon_set_uconfig_reg_idx(cs, R_030960_IA_MULTI_VGT_PARAM, 4, ia_multi_vgt_param); else if (sctx->b.chip_class >= CIK) radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param); else radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param); @@ -579,32 +582,32 @@ static void si_emit_draw_registers(struct si_context *sctx, sctx->last_prim = prim; } if (gs_out_prim != sctx->last_gs_out_prim) { radeon_set_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim); sctx->last_gs_out_prim = gs_out_prim; } /* Primitive restart. 
*/ - if (info->primitive_restart != sctx->last_primitive_restart_en) { + if (primitive_restart != sctx->last_primitive_restart_en) { if (sctx->b.chip_class >= GFX9) radeon_set_uconfig_reg(cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, - info->primitive_restart); + primitive_restart); else radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, - info->primitive_restart); + primitive_restart); - sctx->last_primitive_restart_en = info->primitive_restart; + sctx->last_primitive_restart_en = primitive_restart; } - if (info->primitive_restart && + if (primitive_restart && (info->restart_index != sctx->last_restart_index || sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN)) { radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX, info->restart_index); sctx->last_restart_index = info->restart_index; } } static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info, @@ -1129,29 +1132,47 @@ void si_ce_pre_draw_synchronization(struct si_context *sctx) void si_ce_post_draw_synchronization(struct si_context *sctx) { if (sctx->ce_need_synchronization) { radeon_emit(sctx->b.gfx.cs, PKT3(PKT3_INCREMENT_DE_COUNTER, 0, 0)); radeon_emit(sctx->b.gfx.cs, 0); sctx->ce_need_synchronization = false; } } +static bool is_strip_primitive_mode(unsigned prim) +{ + return ((1 << prim) & + ((1 << PIPE_PRIM_LINE_STRIP) | + (1 << PIPE_PRIM_LINE_STRIP_ADJACENCY) | + (1 << PIPE_PRIM_QUAD_STRIP) | + (1 << PIPE_PRIM_TRIANGLE_STRIP) | + (1 << PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY))) != 0; +} + void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) { struct si_context *sctx = (struct si_context *)ctx; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; const struct pipe_index_buffer *ib = &sctx->index_buffer; struct pipe_index_buffer ib_tmp; /* for index buffer uploads only */ unsigned mask, dirty_tex_counter; enum pipe_prim_type rast_prim; unsigned num_patches = 0; + bool primitive_restart = info->indexed && 
info->primitive_restart; + + /* This is better for performance, but the difference might not be + * measurable. + */ + if (sctx->use_prim_restart_dx_behavior && + !is_strip_primitive_mode(info->mode)) + primitive_restart = false; if (likely(!info->indirect)) { /* SI-CI treat instance_count==0 as instance_count==1. There is * no workaround for indirect draws, but we can at least skip * direct draws. */ if (unlikely(!info->instance_count)) return; /* Handle count == 0. */ @@ -1207,21 +1228,21 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) if (sctx->gs_shader.cso) { /* Determine whether the GS triangle strip adjacency fix should * be applied. Rotate every other triangle if * - triangle strips with adjacency are fed to the GS and * - primitive restart is disabled (the rotation doesn't help * when the restart occurs after an odd number of triangles). */ bool gs_tri_strip_adj_fix = !sctx->tes_shader.cso && info->mode == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY && - !info->primitive_restart; + !primitive_restart; if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) { sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix; sctx->do_update_shaders = true; } } if (sctx->do_update_shaders && !si_update_shaders(sctx)) return; @@ -1338,21 +1359,21 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) si_pm4_emit(sctx, state); sctx->emitted.array[i] = state; } sctx->dirty_states = 0; si_emit_rasterizer_prim_state(sctx); if (sctx->tes_shader.cso) si_emit_derived_tess_state(sctx, info, &num_patches); si_emit_vs_state(sctx, info); - si_emit_draw_registers(sctx, info, num_patches); + si_emit_draw_registers(sctx, info, num_patches, primitive_restart); si_ce_pre_draw_synchronization(sctx); si_emit_draw_packets(sctx, info, ib); si_ce_post_draw_synchronization(sctx); if (sctx->trace_buf) si_trace_emit(sctx); /* Workaround for a VGT hang when streamout is enabled. * It must be done after drawing. 
*/ @@ -1389,21 +1410,21 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) rtex->dirty_level_mask |= 1 << surf->u.tex.level; if (rtex->dcc_gather_statistics) rtex->separate_dcc_dirty = true; } while (mask); } sctx->framebuffer.do_update_surf_dirtiness = false; } pipe_resource_reference(&ib_tmp.buffer, NULL); sctx->b.num_draw_calls++; - if (info->primitive_restart) + if (primitive_restart) sctx->b.num_prim_restart_calls++; if (G_0286E8_WAVESIZE(sctx->spi_tmpring_size)) sctx->b.num_spill_draw_calls++; } void si_trace_emit(struct si_context *sctx) { struct radeon_winsys_cs *cs = sctx->b.gfx.cs; sctx->trace_id++; -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev