Some offsets used for LDS access are recalculated quite regularly. Since tessellation shaders are not optimized by the SB, manually pre-evaluate some of these offsets to speed up this type of shader.
Signed-off-by: Gert Wollny <gw.foss...@gmail.com> --- src/gallium/drivers/r600/r600_shader.c | 253 ++++++++++++++++++++++----------- 1 file changed, 172 insertions(+), 81 deletions(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 873b525449..163ae75eb5 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -183,6 +183,7 @@ int r600_pipe_shader_create(struct pipe_context *ctx, R600_ERR("translation from TGSI failed !\n"); goto error; } + if (shader->shader.processor_type == PIPE_SHADER_VERTEX) { /* only disable for vertex shaders in tess paths */ if (key.vs.as_ls) @@ -329,6 +330,7 @@ struct r600_tess_input_cache_entry { struct r600_tess_input_cache { struct r600_tess_input_cache_entry data[32]; int fill; + int uses_lds_io; }; struct r600_shader_ctx { @@ -367,7 +369,8 @@ struct r600_shader_ctx { unsigned enabled_stream_buffers_mask; unsigned tess_input_info; /* temp with tess input offsets */ unsigned tess_output_info; /* temp with tess input offsets */ - struct r600_tess_input_cache tess_input_cache; + unsigned tess_io_info_precalc; /* temp with precalcuated offsets */ + struct r600_tess_input_cache tess_input_cache; }; struct r600_shader_tgsi_instruction { @@ -392,7 +395,8 @@ static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src, const struct r600_shader_src *shader_src, unsigned chan); static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg, - unsigned dst_reg, unsigned mask, int param); + unsigned temp_chan, unsigned dst_reg, + unsigned mask, int param); static int tgsi_last_instruction(unsigned writemask) { @@ -1027,13 +1031,8 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx) d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) { int param = r600_get_lds_unique_index(d->Semantic.Name, 0); int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 
3 : 2; - unsigned temp_reg = r600_get_temp(ctx); - - r = get_lds_offset0(ctx, 2, temp_reg, true); - if (r) - return r; - do_lds_fetch_values(ctx, temp_reg, dreg, 0xF, param); + do_lds_fetch_values(ctx, ctx->tess_io_info_precalc, 3, dreg, 0xF, param); } else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) { /* MOV r1.x, r0.x; @@ -1648,7 +1647,9 @@ static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx) * All three shaders VS(LS), TCS, TES share the same LDS space. */ /* this will return with the dw address in temp_reg.x */ -static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg, +static int r600_get_byte_address(struct r600_shader_ctx *ctx, + unsigned *result_reg, unsigned *result_chan, + int base_offset_reg, int base_offset_chan, const struct tgsi_full_dst_register *dst, const struct tgsi_full_src_register *src, int stride_bytes_reg, int stride_bytes_chan, int *param) @@ -1656,7 +1657,11 @@ static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg, struct tgsi_full_dst_register reg; ubyte *name, *index, *array_first; int r; + int temp_reg = -1; struct tgsi_shader_info *info = &ctx->info; + *result_reg = base_offset_reg; + *result_chan = base_offset_chan; + /* Set the register description. The address computation is the same * for sources and destinations. 
*/ if (src) { @@ -1686,14 +1691,18 @@ static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg, sel = V_SQ_ALU_SRC_LITERAL; chan = reg.Dimension.Index; } - + temp_reg = r600_get_temp(ctx); + *result_reg = temp_reg; + *result_chan = 0; r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, temp_reg, 0, stride_bytes_reg, stride_bytes_chan, sel, chan, - temp_reg, 0); + base_offset_reg, base_offset_chan); if (r) return r; + } else { + } if (reg.Register.File == TGSI_FILE_INPUT) { @@ -1719,15 +1728,20 @@ static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg, addr_reg = get_address_file_reg(ctx, reg.Indirect.Index); - /* pull the value from index_reg */ + if (temp_reg < 0) + temp_reg = r600_get_temp(ctx); + r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, temp_reg, 0, V_SQ_ALU_SRC_LITERAL, 16, addr_reg, 0, - temp_reg, 0); + *result_reg, *result_chan); if (r) return r; + *result_reg = temp_reg; + *result_chan = 0; + *param = r600_get_lds_unique_index(name[first], index[first]); @@ -1739,14 +1753,17 @@ static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg, return 0; } -static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg, +static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned offs_reg, + unsigned offs_chan, unsigned dst_reg, unsigned mask, int param) + { struct r600_bytecode_alu alu; int r, i; int lasti = tgsi_last_instruction(mask); int firsti = param > 0 ? 
0 : 1; + if ((ctx->bc->cf_last->ndw>>1) >= 0x60) ctx->bc->force_add_cf = 1; @@ -1756,12 +1773,12 @@ static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg, continue; memset(&alu, 0, sizeof(struct r600_bytecode_alu)); - alu.dst.sel = temp_reg; + alu.dst.sel = ctx->temp_reg; alu.dst.chan = i; alu.dst.write = 1; alu.op = ALU_OP2_ADD_INT; - alu.src[0].sel = temp_reg; - alu.src[0].chan = 0; + alu.src[0].sel = offs_reg; + alu.src[0].chan = offs_chan; alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; alu.src[1].value = 4 * i + 16 * param; @@ -1779,8 +1796,13 @@ static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg, /* emit an LDS_READ_RET */ memset(&alu, 0, sizeof(alu)); alu.op = LDS_OP1_LDS_READ_RET; - alu.src[0].sel = temp_reg; - alu.src[0].chan = i; + if (i > 0 || firsti == 0) { + alu.src[0].sel = ctx->temp_reg; + alu.src[0].chan = i; + } else { + alu.src[0].sel = offs_reg; + alu.src[0].chan = offs_chan; + } alu.src[1].sel = V_SQ_ALU_SRC_0; alu.src[2].sel = V_SQ_ALU_SRC_0; alu.dst.chan = 0; @@ -1824,20 +1846,18 @@ static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_reg unsigned int dst_reg, unsigned mask) { int r, param; - unsigned temp_reg = r600_get_temp(ctx); - - r = get_lds_offset0(ctx, 2, temp_reg, - src->Register.Dimension ? false : true); - if (r) - return r; + unsigned temp_reg; + unsigned temp_chan; /* the base address is now in temp.x */ - r = r600_get_byte_address(ctx, temp_reg, + r = r600_get_byte_address(ctx, &temp_reg, &temp_chan, + ctx->tess_io_info_precalc, + src->Register.Dimension ? 
2:3, NULL, src, ctx->tess_output_info, 1, &param); if (r) return r; - r = do_lds_fetch_values(ctx, temp_reg, dst_reg, mask, param); + r = do_lds_fetch_values(ctx, temp_reg, temp_chan, dst_reg, mask, param); if (r) return r; return 0; @@ -1848,23 +1868,16 @@ static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_reg { int r,param; unsigned temp_reg = r600_get_temp(ctx); - - /* t.x = ips * r0.y */ - r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24, - temp_reg, 0, - ctx->tess_input_info, 0, - 0, 1); - - if (r) - return r; + unsigned temp_chan = 0; /* the base address is now in temp.x */ - r = r600_get_byte_address(ctx, temp_reg, + r = r600_get_byte_address(ctx, &temp_reg, &temp_chan, + ctx->tess_io_info_precalc, 3, NULL, src, ctx->tess_input_info, 1, &param); if (r) return r; - r = do_lds_fetch_values(ctx, temp_reg, dst_reg, mask, param); + r = do_lds_fetch_values(ctx, temp_reg, temp_chan, dst_reg, mask, param); if (r) return r; return 0; @@ -1874,20 +1887,18 @@ static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_re unsigned int dst_reg, unsigned mask) { int r, param; - unsigned temp_reg = r600_get_temp(ctx); + unsigned temp_reg; + unsigned temp_chan; - r = get_lds_offset0(ctx, 1, temp_reg, - src->Register.Dimension ? false : true); - if (r) - return r; - /* the base address is now in temp.x */ - r = r600_get_byte_address(ctx, temp_reg, + r = r600_get_byte_address(ctx, &temp_reg, &temp_chan, + ctx->tess_io_info_precalc, + src->Register.Dimension ? 
0:1, NULL, src, ctx->tess_output_info, 1, &param); if (r) return r; - r = do_lds_fetch_values(ctx, temp_reg, dst_reg, mask, param); + r = do_lds_fetch_values(ctx, temp_reg, temp_chan, dst_reg, mask, param); if (r) return r; return 0; @@ -1896,11 +1907,12 @@ static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_re static int tgsi_full_src_register_equal_for_cache(struct tgsi_full_src_register *lhs, struct tgsi_full_src_register *rhs) { + if (lhs->Register.File != rhs->Register.File) + return 0; + if (lhs->Register.Index != rhs->Register.Index) return 0; - if (lhs->Register.File != rhs->Register.File) - if (lhs->Register.Indirect || rhs->Register.Indirect) return 0; @@ -2028,9 +2040,10 @@ static void count_tess_inputs(struct r600_shader_ctx *ctx) for (i = 0; i < inst->Instruction.NumSrcRegs; i++) { struct tgsi_full_src_register *src = &inst->Src[i]; if (((src->Register.File == TGSI_FILE_INPUT) && (ctx->type == PIPE_SHADER_TESS_EVAL)) || - (ctx->type == PIPE_SHADER_TESS_CTRL && - (src->Register.File == TGSI_FILE_INPUT || src->Register.File == TGSI_FILE_OUTPUT))) + (ctx->type == PIPE_SHADER_TESS_CTRL)) { tess_input_cache_check(&ctx->tess_input_cache, src); + ctx->tess_input_cache.uses_lds_io = 1; + } } } @@ -2729,7 +2742,7 @@ static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx) 0, 0); if (r) return r; - + /* used by VS/TCS */ if (ctx->tess_input_info) { /* fetch tcs input values into resv space */ @@ -2752,12 +2765,13 @@ static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx) vtx.dst_sel_w = 3; vtx.src_gpr = temp_val; vtx.src_sel_x = 0; - + r = r600_bytecode_add_vtx(ctx->bc, &vtx); if (r) return r; + } - + /* used by TCS/TES */ if (ctx->tess_output_info) { /* fetch tcs output values into resv space */ @@ -2784,6 +2798,64 @@ static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx) r = r600_bytecode_add_vtx(ctx->bc, &vtx); if (r) return r; + + if (ctx->tess_input_cache.uses_lds_io) { + + /* Precalc some offsets, after this we 
have + + */ + + /* tess_io_info_precalc.x = tess_output_info.x * R0.y + tess_output_info.z */ + r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, + ctx->tess_io_info_precalc, 0, + ctx->tess_output_info, 0, + 0, 1, + ctx->tess_output_info, 2); + if (r) + return r; + + /* tess_io_info_precalc.y = tess_output_info.x * R0.y + tess_output_info.w */ + + r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, + ctx->tess_io_info_precalc, 1, + ctx->tess_output_info, 0, + 0, 1, + ctx->tess_output_info, 3); + if (r) + return r; + + + /* tess_io_info_precalc.z = tess_output_info.x * R0.z + tess_output_info.z */ + r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, + ctx->tess_io_info_precalc, 2, + ctx->tess_output_info, 0, + 0, 2, + ctx->tess_output_info, 2); + if (r) + return r; + + /* This is a TCS shader */ + if (ctx->tess_input_info) { + + /* t.x = ips * r0.y */ + r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24, + ctx->tess_io_info_precalc, 3, + ctx->tess_input_info, 0, + 0, 1); + if (r) + return r; + } else { + + /* tess_io_info_precalc.w = tess_output_info.x * R0.z + tess_output_info.w */ + r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24, + ctx->tess_io_info_precalc, 3, + ctx->tess_output_info, 0, + 0, 2, + ctx->tess_output_info, 3); + if (r) + return r; + } + } } return 0; } @@ -2858,8 +2930,10 @@ static int r600_store_tcs_output(struct r600_shader_ctx *ctx) { struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction; const struct tgsi_full_dst_register *dst = &inst->Dst[0]; - int i, r, lasti; + int i, r, lasti, firsti; int temp_reg = r600_get_temp(ctx); + unsigned offs_reg; + unsigned offs_chan; struct r600_bytecode_alu alu; unsigned write_mask = dst->Register.WriteMask; int param; @@ -2867,19 +2941,18 @@ static int r600_store_tcs_output(struct r600_shader_ctx *ctx) if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT) return 0; - r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? 
false : true); - if (r) - return r; - /* the base address is now in temp.x */ - r = r600_get_byte_address(ctx, temp_reg, + r = r600_get_byte_address(ctx, &offs_reg, &offs_chan, + ctx->tess_io_info_precalc, + dst->Register.Dimension ? 0:1, &inst->Dst[0], NULL, ctx->tess_output_info, 1, &param); if (r) return r; + firsti = param > 0 ? 0 : 1; /* LDS write */ lasti = tgsi_last_instruction(write_mask); - for (i = (param > 0 ? 0: 1); i <= lasti; i++) { + for (i = firsti; i <= lasti; i++) { if (!(write_mask & (1 << i))) continue; @@ -2888,8 +2961,8 @@ static int r600_store_tcs_output(struct r600_shader_ctx *ctx) alu.dst.chan = i; alu.dst.write = 1; alu.op = ALU_OP2_ADD_INT; - alu.src[0].sel = temp_reg; - alu.src[0].chan = 0; + alu.src[0].sel = offs_reg; + alu.src[0].chan = offs_chan; alu.src[1].sel = V_SQ_ALU_SRC_LITERAL; alu.src[1].value = 4 * i + 16 * param; @@ -2909,8 +2982,14 @@ static int r600_store_tcs_output(struct r600_shader_ctx *ctx) (i == 2 && ((write_mask & 0xc) == 0xc))) { memset(&alu, 0, sizeof(struct r600_bytecode_alu)); alu.op = LDS_OP3_LDS_WRITE_REL; - alu.src[0].sel = temp_reg; - alu.src[0].chan = i; + + if (firsti == 0 || i > 0) { + alu.src[0].sel = temp_reg; + alu.src[0].chan = i; + } else { + alu.src[0].sel = offs_reg; + alu.src[0].chan = offs_chan; + } alu.src[1].sel = dst->Register.Index; alu.src[1].sel += ctx->file_offset[dst->Register.File]; @@ -2931,8 +3010,14 @@ static int r600_store_tcs_output(struct r600_shader_ctx *ctx) } memset(&alu, 0, sizeof(struct r600_bytecode_alu)); alu.op = LDS_OP2_LDS_WRITE; - alu.src[0].sel = temp_reg; - alu.src[0].chan = i; + + if (firsti == 0 || i > 0) { + alu.src[0].sel = temp_reg; + alu.src[0].chan = i; + } else { + alu.src[0].sel = offs_reg; + alu.src[0].chan = offs_chan; + } alu.src[1].sel = dst->Register.Index; alu.src[1].sel += ctx->file_offset[dst->Register.File]; @@ -2953,17 +3038,12 @@ static int r600_tess_factor_read(struct r600_shader_ctx *ctx, int output_idx) { int param; - unsigned temp_reg = 
r600_get_temp(ctx); unsigned name = ctx->shader->output[output_idx].name; int dreg = ctx->shader->output[output_idx].gpr; - int r; param = r600_get_lds_unique_index(name, 0); - r = get_lds_offset0(ctx, 1, temp_reg, true); - if (r) - return r; - - do_lds_fetch_values(ctx, temp_reg, dreg, 0xf, param); + + do_lds_fetch_values(ctx, ctx->tess_io_info_precalc, 1, dreg, 0xf, param); return 0; } @@ -3293,11 +3373,13 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, if (ctx.type == PIPE_SHADER_TESS_CTRL) { ctx.tess_input_info = ctx.bc->ar_reg + 3; ctx.tess_output_info = ctx.bc->ar_reg + 4; - ctx.temp_reg = ctx.bc->ar_reg + 5; + ctx.tess_io_info_precalc = ctx.bc->ar_reg + 5; + ctx.temp_reg = ctx.bc->ar_reg + 6; } else if (ctx.type == PIPE_SHADER_TESS_EVAL) { ctx.tess_input_info = 0; ctx.tess_output_info = ctx.bc->ar_reg + 3; - ctx.temp_reg = ctx.bc->ar_reg + 4; + ctx.tess_io_info_precalc = ctx.bc->ar_reg + 4; + ctx.temp_reg = ctx.bc->ar_reg + 5; } else if (ctx.type == PIPE_SHADER_GEOMETRY) { ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3; ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4; @@ -3316,18 +3398,27 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.temp_reg = ctx.bc->ar_reg + 3; } - if (lds_inputs) { + ctx.tess_input_cache.uses_lds_io = 0; + if (lds_inputs || lds_outputs) { tgsi_parse_init(&ctx.parse, tokens); + while (!tgsi_parse_end_of_tokens(&ctx.parse)) { tgsi_parse_token(&ctx.parse); - - if (ctx.parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION) - continue; - - count_tess_inputs(&ctx); + if (ctx.parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) + count_tess_inputs(&ctx); + else if (ctx.parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) { + struct tgsi_full_declaration *d = &ctx.parse.FullToken.FullDeclaration; + if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE && + (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER || + d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER)) + ctx.tess_input_cache.uses_lds_io = 1; + + } } 
ctx.temp_reg += tess_input_cache_count_multiused(&ctx.tess_input_cache, ctx.temp_reg); tgsi_parse_init(&ctx.parse, tokens); + } else { + } shader->max_arrays = 0; -- 2.13.6 _______________________________________________ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev