From: Marek Olšák <marek.ol...@amd.com> We were unconditionally storing the tessellation factor outputs (TESSINNER/TESSOUTER) to the offchip buffer, sometimes even one component at a time, but apps never read them in TES.
Move the TESSINNER/OUTER buffer stores into the TCS epilog where we can easily disable them on demand. --- src/gallium/drivers/radeonsi/si_shader.c | 89 ++++++++++++++++++++----- src/gallium/drivers/radeonsi/si_shader.h | 1 + src/gallium/drivers/radeonsi/si_state_shaders.c | 2 + 3 files changed, 77 insertions(+), 15 deletions(-) diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c index 65e3faf..cd537be 100644 --- a/src/gallium/drivers/radeonsi/si_shader.c +++ b/src/gallium/drivers/radeonsi/si_shader.c @@ -708,43 +708,44 @@ static LLVMValueRef get_dw_address(struct si_shader_context *ctx, * - attribute 1 of patch 0 vertex 0 * - attribute 1 of patch 0 vertex 1 * ... * - per patch attribute 0 of patch 0 * - per patch attribute 0 of patch 1 * ... * * Note that every attribute has 4 components. */ static LLVMValueRef get_tcs_tes_buffer_address(struct si_shader_context *ctx, + LLVMValueRef rel_patch_id, LLVMValueRef vertex_index, LLVMValueRef param_index) { struct gallivm_state *gallivm = ctx->bld_base.base.gallivm; LLVMValueRef base_addr, vertices_per_patch, num_patches, total_vertices; LLVMValueRef param_stride, constant16; vertices_per_patch = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 9, 6); num_patches = unpack_param(ctx, SI_PARAM_TCS_OFFCHIP_LAYOUT, 0, 9); total_vertices = LLVMBuildMul(gallivm->builder, vertices_per_patch, num_patches, ""); constant16 = lp_build_const_int32(gallivm, 16); if (vertex_index) { - base_addr = LLVMBuildMul(gallivm->builder, get_rel_patch_id(ctx), + base_addr = LLVMBuildMul(gallivm->builder, rel_patch_id, vertices_per_patch, ""); base_addr = LLVMBuildAdd(gallivm->builder, base_addr, vertex_index, ""); param_stride = total_vertices; } else { - base_addr = get_rel_patch_id(ctx); + base_addr = rel_patch_id; param_stride = num_patches; } base_addr = LLVMBuildAdd(gallivm->builder, base_addr, LLVMBuildMul(gallivm->builder, param_index, param_stride, ""), ""); base_addr = 
LLVMBuildMul(gallivm->builder, base_addr, constant16, ""); if (!vertex_index) { @@ -810,21 +811,22 @@ static LLVMValueRef get_tcs_tes_buffer_address_from_reg( param_index = lp_build_const_int32(gallivm, 0); } param_index_base = si_shader_io_get_unique_index(name[param_base], index[param_base]); param_index = LLVMBuildAdd(gallivm->builder, param_index, lp_build_const_int32(gallivm, param_index_base), ""); - return get_tcs_tes_buffer_address(ctx, vertex_index, param_index); + return get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), + vertex_index, param_index); } static LLVMValueRef buffer_load(struct lp_build_tgsi_context *bld_base, enum tgsi_opcode_type type, unsigned swizzle, LLVMValueRef buffer, LLVMValueRef offset, LLVMValueRef base) { struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; LLVMValueRef value, value2; @@ -981,20 +983,21 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base, { struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; const struct tgsi_full_dst_register *reg = &inst->Dst[0]; const struct tgsi_shader_info *sh_info = &ctx->shader->selector->info; unsigned chan_index; LLVMValueRef dw_addr, stride; LLVMValueRef rw_buffers, buffer, base, buf_addr; LLVMValueRef values[4]; bool skip_lds_store; + bool is_tess_factor = false; /* Only handle per-patch and per-vertex outputs here. * Vectors will be lowered to scalars and this function will be called again. 
*/ if (reg->Register.File != TGSI_FILE_OUTPUT || (dst[0] && LLVMGetTypeKind(LLVMTypeOf(dst[0])) == LLVMVectorTypeKind)) { si_llvm_emit_store(bld_base, inst, info, dst); return; } @@ -1006,22 +1009,24 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base, } else { dw_addr = get_tcs_out_current_patch_data_offset(ctx); dw_addr = get_dw_address(ctx, reg, NULL, NULL, dw_addr); skip_lds_store = !sh_info->reads_perpatch_outputs; if (!reg->Register.Indirect) { int name = sh_info->output_semantic_name[reg->Register.Index]; /* Always write tess factors into LDS for the TCS epilog. */ if (name == TGSI_SEMANTIC_TESSINNER || - name == TGSI_SEMANTIC_TESSOUTER) + name == TGSI_SEMANTIC_TESSOUTER) { skip_lds_store = false; + is_tess_factor = true; + } } } rw_buffers = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS); buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers, lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP)); base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds); buf_addr = get_tcs_tes_buffer_address_from_reg(ctx, reg, NULL); @@ -1033,28 +1038,28 @@ static void store_output_tcs(struct lp_build_tgsi_context *bld_base, if (inst->Instruction.Saturate) value = ac_emit_clamp(&ctx->ac, value); /* Skip LDS stores if there is no LDS read of this output. 
*/ if (!skip_lds_store) lds_store(bld_base, chan_index, dw_addr, value); value = LLVMBuildBitCast(gallivm->builder, value, ctx->i32, ""); values[chan_index] = value; - if (inst->Dst[0].Register.WriteMask != 0xF) { + if (inst->Dst[0].Register.WriteMask != 0xF && !is_tess_factor) { ac_build_tbuffer_store_dwords(&ctx->ac, buffer, value, 1, buf_addr, base, 4 * chan_index); } } - if (inst->Dst[0].Register.WriteMask == 0xF) { + if (inst->Dst[0].Register.WriteMask == 0xF && !is_tess_factor) { LLVMValueRef value = lp_build_gather_values(bld_base->base.gallivm, values, 4); ac_build_tbuffer_store_dwords(&ctx->ac, buffer, value, 4, buf_addr, base, 0); } } static LLVMValueRef fetch_input_gs( struct lp_build_tgsi_context *bld_base, const struct tgsi_full_src_register *reg, @@ -1523,21 +1528,21 @@ static void declare_system_value( { LLVMValueRef rw_buffers, buffer, base, addr; int param = si_shader_io_get_unique_index(decl->Semantic.Name, 0); rw_buffers = LLVMGetParam(ctx->main_fn, SI_PARAM_RW_BUFFERS); buffer = ac_build_indexed_load_const(&ctx->ac, rw_buffers, lp_build_const_int32(gallivm, SI_HS_RING_TESS_OFFCHIP)); base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds); - addr = get_tcs_tes_buffer_address(ctx, NULL, + addr = get_tcs_tes_buffer_address(ctx, get_rel_patch_id(ctx), NULL, lp_build_const_int32(gallivm, param)); value = buffer_load(&radeon_bld->bld_base, TGSI_TYPE_FLOAT, ~0, buffer, base, addr); break; } case TGSI_SEMANTIC_DEFAULT_TESSOUTER_SI: case TGSI_SEMANTIC_DEFAULT_TESSINNER_SI: @@ -2405,20 +2410,21 @@ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base) inputs = ctx->shader->key.mono.tcs.inputs_to_copy; while (inputs) { unsigned i = u_bit_scan64(&inputs); LLVMValueRef lds_ptr = LLVMBuildAdd(gallivm->builder, lds_base, lp_build_const_int32(gallivm, 4 * i), ""); LLVMValueRef buffer_addr = get_tcs_tes_buffer_address(ctx, + get_rel_patch_id(ctx), invocation_id, lp_build_const_int32(gallivm, i)); LLVMValueRef value = lds_load(bld_base, 
TGSI_TYPE_SIGNED, ~0, lds_ptr); ac_build_tbuffer_store_dwords(&ctx->ac, buffer, value, 4, buffer_addr, buffer_offset, 0); } } @@ -2426,21 +2432,21 @@ static void si_copy_tcs_inputs(struct lp_build_tgsi_context *bld_base) static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, LLVMValueRef rel_patch_id, LLVMValueRef invocation_id, LLVMValueRef tcs_out_current_patch_data_offset) { struct si_shader_context *ctx = si_shader_context(bld_base); struct gallivm_state *gallivm = bld_base->base.gallivm; struct si_shader *shader = ctx->shader; unsigned tess_inner_index, tess_outer_index; LLVMValueRef lds_base, lds_inner, lds_outer, byteoffset, buffer; - LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base; + LLVMValueRef out[6], vec0, vec1, rw_buffers, tf_base, inner[4], outer[4]; unsigned stride, outer_comps, inner_comps, i; struct lp_build_if_state if_ctx, inner_if_ctx; si_llvm_emit_barrier(NULL, bld_base, NULL); /* Do this only for invocation 0, because the tess levels are per-patch, * not per-vertex. * * This can't jump, because invocation 0 executes this. It should * at least mask out the loads and stores for other invocations. @@ -2478,31 +2484,40 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, tess_outer_index = si_shader_io_get_unique_index(TGSI_SEMANTIC_TESSOUTER, 0); lds_base = tcs_out_current_patch_data_offset; lds_inner = LLVMBuildAdd(gallivm->builder, lds_base, lp_build_const_int32(gallivm, tess_inner_index * 4), ""); lds_outer = LLVMBuildAdd(gallivm->builder, lds_base, lp_build_const_int32(gallivm, tess_outer_index * 4), ""); + for (i = 0; i < 4; i++) { + inner[i] = LLVMGetUndef(ctx->i32); + outer[i] = LLVMGetUndef(ctx->i32); + } + if (shader->key.part.tcs.epilog.prim_mode == PIPE_PRIM_LINES) { /* For isolines, the hardware expects tess factors in the * reverse order from what GLSL / TGSI specify. 
*/ - out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer); - out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer); + outer[0] = out[1] = lds_load(bld_base, TGSI_TYPE_SIGNED, 0, lds_outer); + outer[1] = out[0] = lds_load(bld_base, TGSI_TYPE_SIGNED, 1, lds_outer); } else { - for (i = 0; i < outer_comps; i++) - out[i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer); - for (i = 0; i < inner_comps; i++) - out[outer_comps+i] = lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner); + for (i = 0; i < outer_comps; i++) { + outer[i] = out[i] = + lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_outer); + } + for (i = 0; i < inner_comps; i++) { + inner[i] = out[outer_comps+i] = + lds_load(bld_base, TGSI_TYPE_SIGNED, i, lds_inner); + } } /* Convert the outputs to vectors for stores. */ vec0 = lp_build_gather_values(gallivm, out, MIN2(stride, 4)); vec1 = NULL; if (stride > 4) vec1 = lp_build_gather_values(gallivm, out+4, stride - 4); /* Get the buffer. */ @@ -2527,28 +2542,65 @@ static void si_write_tess_factors(struct lp_build_tgsi_context *bld_base, 1, lp_build_const_int32(gallivm, 0), tf_base, 0); lp_build_endif(&inner_if_ctx); /* Store the tessellation factors. */ ac_build_tbuffer_store_dwords(&ctx->ac, buffer, vec0, MIN2(stride, 4), byteoffset, tf_base, 4); if (vec1) ac_build_tbuffer_store_dwords(&ctx->ac, buffer, vec1, stride - 4, byteoffset, tf_base, 20); + + /* Store the tess factors into the offchip buffer if TES reads them. 
*/ + if (shader->key.part.tcs.epilog.tes_reads_tess_factors) { + LLVMValueRef buf, base, inner_vec, outer_vec, tf_outer_offset; + LLVMValueRef tf_inner_offset; + unsigned param_outer, param_inner; + + buf = ac_build_indexed_load_const(&ctx->ac, rw_buffers, + LLVMConstInt(ctx->i32, SI_HS_RING_TESS_OFFCHIP, 0)); + base = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds); + + param_outer = si_shader_io_get_unique_index( + TGSI_SEMANTIC_TESSOUTER, 0); + tf_outer_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, + LLVMConstInt(ctx->i32, param_outer, 0)); + + outer_vec = lp_build_gather_values(gallivm, outer, + util_next_power_of_two(outer_comps)); + + ac_build_tbuffer_store_dwords(&ctx->ac, buf, outer_vec, + outer_comps, tf_outer_offset, + base, 0); + if (inner_comps) { + param_inner = si_shader_io_get_unique_index( + TGSI_SEMANTIC_TESSINNER, 0); + tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL, + LLVMConstInt(ctx->i32, param_inner, 0)); + + inner_vec = inner_comps == 1 ? inner[0] : + lp_build_gather_values(gallivm, inner, inner_comps); + ac_build_tbuffer_store_dwords(&ctx->ac, buf, inner_vec, + inner_comps, tf_inner_offset, + base, 0); + } + } + lp_build_endif(&if_ctx); } /* This only writes the tessellation factor levels. */ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base) { struct si_shader_context *ctx = si_shader_context(bld_base); LLVMValueRef rel_patch_id, invocation_id, tf_lds_offset; + LLVMValueRef offchip_soffset, offchip_layout; si_copy_tcs_inputs(bld_base); rel_patch_id = get_rel_patch_id(ctx); invocation_id = unpack_param(ctx, SI_PARAM_REL_IDS, 8, 5); tf_lds_offset = get_tcs_out_current_patch_data_offset(ctx); /* Return epilog parameters from this function. 
*/ LLVMBuilderRef builder = bld_base->base.gallivm->builder; LLVMValueRef ret = ctx->return_value; @@ -2560,23 +2612,30 @@ static void si_llvm_emit_tcs_epilogue(struct lp_build_tgsi_context *bld_base) SI_PARAM_RW_BUFFERS); rw_buffers = LLVMBuildPtrToInt(builder, rw_buffers, ctx->i64, ""); rw_buffers = LLVMBuildBitCast(builder, rw_buffers, ctx->v2i32, ""); rw0 = LLVMBuildExtractElement(builder, rw_buffers, bld_base->uint_bld.zero, ""); rw1 = LLVMBuildExtractElement(builder, rw_buffers, bld_base->uint_bld.one, ""); ret = LLVMBuildInsertValue(builder, ret, rw0, 0, ""); ret = LLVMBuildInsertValue(builder, ret, rw1, 1, ""); - /* Tess factor buffer soffset is after user SGPRs. */ + /* Tess offchip and factor buffer soffset are after user SGPRs. */ + offchip_layout = LLVMGetParam(ctx->main_fn, + SI_PARAM_TCS_OFFCHIP_LAYOUT); + offchip_soffset = LLVMGetParam(ctx->main_fn, ctx->param_oc_lds); tf_soffset = LLVMGetParam(ctx->main_fn, SI_PARAM_TESS_FACTOR_OFFSET); + ret = LLVMBuildInsertValue(builder, ret, offchip_layout, + SI_SGPR_TCS_OFFCHIP_LAYOUT, ""); + ret = LLVMBuildInsertValue(builder, ret, offchip_soffset, + SI_TCS_NUM_USER_SGPR, ""); ret = LLVMBuildInsertValue(builder, ret, tf_soffset, SI_TCS_NUM_USER_SGPR + 1, ""); /* VGPRs */ rel_patch_id = bitcast(bld_base, TGSI_TYPE_FLOAT, rel_patch_id); invocation_id = bitcast(bld_base, TGSI_TYPE_FLOAT, invocation_id); tf_lds_offset = bitcast(bld_base, TGSI_TYPE_FLOAT, tf_lds_offset); vgpr = SI_TCS_NUM_USER_SGPR + 2; ret = LLVMBuildInsertValue(builder, ret, rel_patch_id, vgpr++, ""); diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h index 0bb0f18..71cd95b 100644 --- a/src/gallium/drivers/radeonsi/si_shader.h +++ b/src/gallium/drivers/radeonsi/si_shader.h @@ -345,20 +345,21 @@ struct si_vs_prolog_bits { }; /* Common VS bits between the shader key and the epilog key. 
*/ struct si_vs_epilog_bits { unsigned export_prim_id:1; /* when PS needs it and GS is disabled */ }; /* Common TCS bits between the shader key and the epilog key. */ struct si_tcs_epilog_bits { unsigned prim_mode:3; + unsigned tes_reads_tess_factors:1; }; struct si_gs_prolog_bits { unsigned tri_strip_adj_fix:1; }; /* Common PS bits between the shader key and the prolog key. */ struct si_ps_prolog_bits { unsigned color_two_side:1; unsigned flatshade_colors:1; diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 4a81b56..727ff33 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -985,20 +985,22 @@ static inline void si_shader_selector_key(struct pipe_context *ctx, else { si_shader_selector_key_hw_vs(sctx, sel, key); if (sctx->ps_shader.cso && sctx->ps_shader.cso->info.uses_primid) key->part.vs.epilog.export_prim_id = 1; } break; case PIPE_SHADER_TESS_CTRL: key->part.tcs.epilog.prim_mode = sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE]; + key->part.tcs.epilog.tes_reads_tess_factors = + sctx->tes_shader.cso->info.reads_tess_factors; if (sel == sctx->fixed_func_tcs_shader.cso) key->mono.tcs.inputs_to_copy = sctx->vs_shader.cso->outputs_written; break; case PIPE_SHADER_TESS_EVAL: if (sctx->gs_shader.cso) key->as_es = 1; else { si_shader_selector_key_hw_vs(sctx, sel, key); -- 2.7.4 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev