Some offsets used for the LDS access are recalculated quite regularly.
Since tesselation shaders are not optimized by the SB manually pre-evaluate
some offsets to speed up this type of shader.

Signed-off-by: Gert Wollny <gw.foss...@gmail.com>
---
 src/gallium/drivers/r600/r600_shader.c | 253 ++++++++++++++++++++++-----------
 1 file changed, 172 insertions(+), 81 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c 
b/src/gallium/drivers/r600/r600_shader.c
index 873b525449..163ae75eb5 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -183,6 +183,7 @@ int r600_pipe_shader_create(struct pipe_context *ctx,
                R600_ERR("translation from TGSI failed !\n");
                goto error;
        }
+
        if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
                /* only disable for vertex shaders in tess paths */
                if (key.vs.as_ls)
@@ -329,6 +330,7 @@ struct  r600_tess_input_cache_entry {
 struct r600_tess_input_cache {
        struct r600_tess_input_cache_entry data[32];
        int fill;
+       int uses_lds_io;
 };
 
 struct r600_shader_ctx {
@@ -367,7 +369,8 @@ struct r600_shader_ctx {
        unsigned                                enabled_stream_buffers_mask;
        unsigned                                tess_input_info; /* temp with 
tess input offsets */
        unsigned                                tess_output_info; /* temp with 
tess input offsets */
-       struct r600_tess_input_cache            tess_input_cache;
+       unsigned          tess_io_info_precalc; /* temp with precalcuated 
offsets */
+       struct r600_tess_input_cache tess_input_cache;
 };
 
 struct r600_shader_tgsi_instruction {
@@ -392,7 +395,8 @@ static void r600_bytecode_src(struct r600_bytecode_alu_src 
*bc_src,
                        const struct r600_shader_src *shader_src,
                        unsigned chan);
 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
-                              unsigned dst_reg, unsigned mask, int param);
+                              unsigned temp_chan, unsigned dst_reg,
+                              unsigned mask, int param);
 
 static int tgsi_last_instruction(unsigned writemask)
 {
@@ -1027,13 +1031,8 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
                         d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
                        int param = r600_get_lds_unique_index(d->Semantic.Name, 
0);
                        int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER 
? 3 : 2;
-                       unsigned temp_reg = r600_get_temp(ctx);
-
-                       r = get_lds_offset0(ctx, 2, temp_reg, true);
-                       if (r)
-                               return r;
 
-                       do_lds_fetch_values(ctx, temp_reg, dreg, 0xF, param);
+                       do_lds_fetch_values(ctx, ctx->tess_io_info_precalc, 3, 
dreg, 0xF, param);
                }
                else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
                        /* MOV r1.x, r0.x;
@@ -1648,7 +1647,9 @@ static int tgsi_split_gs_inputs(struct r600_shader_ctx 
*ctx)
  * All three shaders VS(LS), TCS, TES share the same LDS space.
  */
 /* this will return with the dw address in temp_reg.x */
-static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
+static int r600_get_byte_address(struct r600_shader_ctx *ctx,
+                                unsigned *result_reg, unsigned *result_chan,
+                                int base_offset_reg, int base_offset_chan,
                                 const struct tgsi_full_dst_register *dst,
                                 const struct tgsi_full_src_register *src,
                                 int stride_bytes_reg, int stride_bytes_chan, 
int *param)
@@ -1656,7 +1657,11 @@ static int r600_get_byte_address(struct r600_shader_ctx 
*ctx, int temp_reg,
        struct tgsi_full_dst_register reg;
        ubyte *name, *index, *array_first;
        int r;
+       int temp_reg = -1;
        struct tgsi_shader_info *info = &ctx->info;
+       *result_reg = base_offset_reg;
+       *result_chan = base_offset_chan;
+
        /* Set the register description. The address computation is the same
         * for sources and destinations. */
        if (src) {
@@ -1686,14 +1691,18 @@ static int r600_get_byte_address(struct r600_shader_ctx 
*ctx, int temp_reg,
                        sel = V_SQ_ALU_SRC_LITERAL;
                        chan = reg.Dimension.Index;
                }
-
+               temp_reg = r600_get_temp(ctx);
+               *result_reg = temp_reg;
+               *result_chan = 0;
                r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
                                   temp_reg, 0,
                                   stride_bytes_reg, stride_bytes_chan,
                                   sel, chan,
-                                  temp_reg, 0);
+                                  base_offset_reg, base_offset_chan);
                if (r)
                        return r;
+       } else {
+
        }
 
        if (reg.Register.File == TGSI_FILE_INPUT) {
@@ -1719,15 +1728,20 @@ static int r600_get_byte_address(struct r600_shader_ctx 
*ctx, int temp_reg,
 
                addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);
 
-               /* pull the value from index_reg */
+               if (temp_reg < 0)
+                               temp_reg = r600_get_temp(ctx);
+
                r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
                                   temp_reg, 0,
                                   V_SQ_ALU_SRC_LITERAL, 16,
                                   addr_reg, 0,
-                                  temp_reg, 0);
+                                  *result_reg, *result_chan);
                if (r)
                        return r;
 
+               *result_reg = temp_reg;
+               *result_chan = 0;
+
                *param = r600_get_lds_unique_index(name[first],
                                                  index[first]);
 
@@ -1739,14 +1753,17 @@ static int r600_get_byte_address(struct r600_shader_ctx 
*ctx, int temp_reg,
        return 0;
 }
 
-static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
+static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned offs_reg,
+                              unsigned offs_chan,
                               unsigned dst_reg, unsigned mask, int param)
+       
 {
        struct r600_bytecode_alu alu;
        int r, i;
        int lasti = tgsi_last_instruction(mask);
        int firsti = param > 0 ? 0 : 1;
 
+
        if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
                ctx->bc->force_add_cf = 1;
 
@@ -1756,12 +1773,12 @@ static int do_lds_fetch_values(struct r600_shader_ctx 
*ctx, unsigned temp_reg,
                        continue;
 
                memset(&alu, 0, sizeof(struct r600_bytecode_alu));
-               alu.dst.sel = temp_reg;
+               alu.dst.sel = ctx->temp_reg;
                alu.dst.chan = i;
                alu.dst.write = 1;
                alu.op = ALU_OP2_ADD_INT;
-               alu.src[0].sel = temp_reg;
-               alu.src[0].chan = 0;
+               alu.src[0].sel = offs_reg;
+               alu.src[0].chan = offs_chan;
                alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
                alu.src[1].value = 4 * i + 16 * param;
 
@@ -1779,8 +1796,13 @@ static int do_lds_fetch_values(struct r600_shader_ctx 
*ctx, unsigned temp_reg,
                /* emit an LDS_READ_RET */
                memset(&alu, 0, sizeof(alu));
                alu.op = LDS_OP1_LDS_READ_RET;
-               alu.src[0].sel = temp_reg;
-               alu.src[0].chan = i;
+               if (i > 0 || firsti == 0) {
+                       alu.src[0].sel = ctx->temp_reg;
+                       alu.src[0].chan = i;
+               } else {
+                       alu.src[0].sel = offs_reg;
+                       alu.src[0].chan = offs_chan;
+               }
                alu.src[1].sel = V_SQ_ALU_SRC_0;
                alu.src[2].sel = V_SQ_ALU_SRC_0;
                alu.dst.chan = 0;
@@ -1824,20 +1846,18 @@ static int fetch_tes_input(struct r600_shader_ctx *ctx, 
struct tgsi_full_src_reg
                           unsigned int dst_reg, unsigned mask)
 {
        int r, param;
-       unsigned temp_reg = r600_get_temp(ctx);
-
-       r = get_lds_offset0(ctx, 2, temp_reg,
-                           src->Register.Dimension ? false : true);
-       if (r)
-               return r;
+       unsigned temp_reg;
+       unsigned temp_chan;
 
        /* the base address is now in temp.x */
-       r = r600_get_byte_address(ctx, temp_reg,
+       r = r600_get_byte_address(ctx, &temp_reg, &temp_chan,
+                                 ctx->tess_io_info_precalc,
+                                 src->Register.Dimension ? 2:3,
                                  NULL, src, ctx->tess_output_info, 1, &param);
        if (r)
                return r;
 
-       r = do_lds_fetch_values(ctx, temp_reg, dst_reg, mask, param);
+       r = do_lds_fetch_values(ctx, temp_reg, temp_chan, dst_reg, mask, param);
        if (r)
                return r;
        return 0;
@@ -1848,23 +1868,16 @@ static int fetch_tcs_input(struct r600_shader_ctx *ctx, 
struct tgsi_full_src_reg
 {
        int r,param;
        unsigned temp_reg = r600_get_temp(ctx);
-
-       /* t.x = ips * r0.y */
-       r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
-                          temp_reg, 0,
-                          ctx->tess_input_info, 0,
-                          0, 1);
-
-       if (r)
-               return r;
+       unsigned temp_chan = 0;
 
        /* the base address is now in temp.x */
-       r = r600_get_byte_address(ctx, temp_reg,
+       r = r600_get_byte_address(ctx, &temp_reg, &temp_chan,
+                                 ctx->tess_io_info_precalc, 3,
                                  NULL, src, ctx->tess_input_info, 1, &param);
        if (r)
                return r;
 
-       r = do_lds_fetch_values(ctx, temp_reg, dst_reg, mask, param);
+       r = do_lds_fetch_values(ctx, temp_reg, temp_chan, dst_reg, mask, param);
        if (r)
                return r;
        return 0;
@@ -1874,20 +1887,18 @@ static int fetch_tcs_output(struct r600_shader_ctx 
*ctx, struct tgsi_full_src_re
                            unsigned int dst_reg, unsigned mask)
 {
        int r, param;
-       unsigned temp_reg = r600_get_temp(ctx);
+       unsigned temp_reg;
+       unsigned temp_chan;
 
-       r = get_lds_offset0(ctx, 1, temp_reg,
-                           src->Register.Dimension ? false : true);
-       if (r)
-               return r;
-       /* the base address is now in temp.x */
-       r = r600_get_byte_address(ctx, temp_reg,
+       r = r600_get_byte_address(ctx, &temp_reg, &temp_chan,
+                                 ctx->tess_io_info_precalc,
+                                 src->Register.Dimension ? 0:1,
                                  NULL, src,
                                  ctx->tess_output_info, 1, &param);
        if (r)
                return r;
 
-       r = do_lds_fetch_values(ctx, temp_reg, dst_reg, mask, param);
+       r = do_lds_fetch_values(ctx, temp_reg, temp_chan, dst_reg, mask, param);
        if (r)
                return r;
        return 0;
@@ -1896,11 +1907,12 @@ static int fetch_tcs_output(struct r600_shader_ctx 
*ctx, struct tgsi_full_src_re
 static int tgsi_full_src_register_equal_for_cache(struct 
tgsi_full_src_register *lhs,
                                        struct tgsi_full_src_register *rhs)
 {
+       if (lhs->Register.File != rhs->Register.File)
+               return 0;
+       
        if (lhs->Register.Index != rhs->Register.Index)
                return 0;
 
-       if (lhs->Register.File != rhs->Register.File)
-
        if (lhs->Register.Indirect || rhs->Register.Indirect)
                return 0;
 
@@ -2028,9 +2040,10 @@ static void count_tess_inputs(struct r600_shader_ctx 
*ctx)
        for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
                struct tgsi_full_src_register *src = &inst->Src[i];
                if (((src->Register.File == TGSI_FILE_INPUT) && (ctx->type == 
PIPE_SHADER_TESS_EVAL)) ||
-                   (ctx->type == PIPE_SHADER_TESS_CTRL &&
-                    (src->Register.File == TGSI_FILE_INPUT || 
src->Register.File == TGSI_FILE_OUTPUT)))
+                   (ctx->type == PIPE_SHADER_TESS_CTRL)) {
                        tess_input_cache_check(&ctx->tess_input_cache, src);
+                       ctx->tess_input_cache.uses_lds_io = 1;
+               }
        }
 }
 
@@ -2729,7 +2742,7 @@ static int r600_fetch_tess_io_info(struct r600_shader_ctx 
*ctx)
                           0, 0);
        if (r)
                return r;
-
+       
        /* used by VS/TCS */
        if (ctx->tess_input_info) {
                /* fetch tcs input values into resv space */
@@ -2752,12 +2765,13 @@ static int r600_fetch_tess_io_info(struct 
r600_shader_ctx *ctx)
                vtx.dst_sel_w = 3;
                vtx.src_gpr = temp_val;
                vtx.src_sel_x = 0;
-
+               
                r = r600_bytecode_add_vtx(ctx->bc, &vtx);
                if (r)
                        return r;
+               
        }
-
+       
        /* used by TCS/TES */
        if (ctx->tess_output_info) {
                /* fetch tcs output values into resv space */
@@ -2784,6 +2798,64 @@ static int r600_fetch_tess_io_info(struct 
r600_shader_ctx *ctx)
                r = r600_bytecode_add_vtx(ctx->bc, &vtx);
                if (r)
                        return r;
+               
+               if (ctx->tess_input_cache.uses_lds_io) {
+                       
+                       /* Precalc some offsets, after this we have
+                        
+                        */
+                       
+                       /* tess_io_info_precalc.x = tess_output_info.x * R0.y + 
tess_output_info.z */
+                       r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
+                                          ctx->tess_io_info_precalc, 0,
+                                          ctx->tess_output_info, 0,
+                                          0, 1,
+                                          ctx->tess_output_info, 2);
+                       if (r)
+                               return r;
+                       
+                       /* tess_io_info_precalc.y = tess_output_info.x * R0.y + 
tess_output_info.w */
+                       
+                       r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
+                                          ctx->tess_io_info_precalc, 1,
+                                          ctx->tess_output_info, 0,
+                                          0, 1,
+                                          ctx->tess_output_info, 3);
+                       if (r)
+                               return r;
+
+
+                       /* tess_io_info_precalc.z = tess_output_info.x * R0.z + 
tess_output_info.z */
+                       r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
+                                          ctx->tess_io_info_precalc, 2,
+                                          ctx->tess_output_info, 0,
+                                          0, 2,
+                                          ctx->tess_output_info, 2);
+                       if (r)
+                               return r;
+
+                       /* This is a TCS shader */  
+                       if (ctx->tess_input_info) {
+                               
+                               /* t.x = ips * r0.y */
+                               r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
+                                                  ctx->tess_io_info_precalc, 3,
+                                                  ctx->tess_input_info, 0,
+                                                  0, 1);
+                               if (r)
+                                       return r;
+                       } else {
+                               
+                               /* tess_io_info_precalc.w = tess_output_info.x 
* R0.z + tess_output_info.w */
+                               r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
+                                                  ctx->tess_io_info_precalc, 3,
+                                                  ctx->tess_output_info, 0,
+                                                  0, 2,
+                                                  ctx->tess_output_info, 3);
+                               if (r)
+                                       return r;
+                       }
+               }
        }
        return 0;
 }
@@ -2858,8 +2930,10 @@ static int r600_store_tcs_output(struct r600_shader_ctx 
*ctx)
 {
        struct tgsi_full_instruction *inst = 
&ctx->parse.FullToken.FullInstruction;
        const struct tgsi_full_dst_register *dst = &inst->Dst[0];
-       int i, r, lasti;
+       int i, r, lasti, firsti;
        int temp_reg = r600_get_temp(ctx);
+       unsigned offs_reg;
+       unsigned offs_chan;
        struct r600_bytecode_alu alu;
        unsigned write_mask = dst->Register.WriteMask;
        int param;
@@ -2867,19 +2941,18 @@ static int r600_store_tcs_output(struct r600_shader_ctx 
*ctx)
        if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
                return 0;
 
-       r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : 
true);
-       if (r)
-               return r;
-
        /* the base address is now in temp.x */
-       r = r600_get_byte_address(ctx, temp_reg,
+       r = r600_get_byte_address(ctx, &offs_reg, &offs_chan,
+                                 ctx->tess_io_info_precalc,
+                                 dst->Register.Dimension ? 0:1,
                                  &inst->Dst[0], NULL, ctx->tess_output_info, 
1, &param);
        if (r)
                return r;
 
+       firsti = param > 0 ? 0 : 1;
        /* LDS write */
        lasti = tgsi_last_instruction(write_mask);
-       for (i = (param > 0 ? 0: 1); i <= lasti; i++) {
+       for (i = firsti; i <= lasti; i++) {
 
                if (!(write_mask & (1 << i)))
                        continue;
@@ -2888,8 +2961,8 @@ static int r600_store_tcs_output(struct r600_shader_ctx 
*ctx)
                alu.dst.chan = i;
                alu.dst.write = 1;
                alu.op = ALU_OP2_ADD_INT;
-               alu.src[0].sel = temp_reg;
-               alu.src[0].chan = 0;
+               alu.src[0].sel = offs_reg;
+               alu.src[0].chan = offs_chan;
                alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
                alu.src[1].value = 4 * i + 16 * param;
 
@@ -2909,8 +2982,14 @@ static int r600_store_tcs_output(struct r600_shader_ctx 
*ctx)
                    (i == 2 && ((write_mask & 0xc) == 0xc))) {
                        memset(&alu, 0, sizeof(struct r600_bytecode_alu));
                        alu.op = LDS_OP3_LDS_WRITE_REL;
-                       alu.src[0].sel = temp_reg;
-                       alu.src[0].chan = i;
+
+                       if (firsti == 0 || i > 0) {
+                               alu.src[0].sel = temp_reg;
+                               alu.src[0].chan = i;
+                       } else {
+                               alu.src[0].sel = offs_reg;
+                               alu.src[0].chan = offs_chan;
+                       }
 
                        alu.src[1].sel = dst->Register.Index;
                        alu.src[1].sel += ctx->file_offset[dst->Register.File];
@@ -2931,8 +3010,14 @@ static int r600_store_tcs_output(struct r600_shader_ctx 
*ctx)
                }
                memset(&alu, 0, sizeof(struct r600_bytecode_alu));
                alu.op = LDS_OP2_LDS_WRITE;
-               alu.src[0].sel = temp_reg;
-               alu.src[0].chan = i;
+
+               if (firsti == 0 || i > 0) {
+                       alu.src[0].sel = temp_reg;
+                       alu.src[0].chan = i;
+               } else {
+                       alu.src[0].sel = offs_reg;
+                       alu.src[0].chan = offs_chan;
+               }
 
                alu.src[1].sel = dst->Register.Index;
                alu.src[1].sel += ctx->file_offset[dst->Register.File];
@@ -2953,17 +3038,12 @@ static int r600_tess_factor_read(struct r600_shader_ctx 
*ctx,
                                 int output_idx)
 {
        int param;
-       unsigned temp_reg = r600_get_temp(ctx);
        unsigned name = ctx->shader->output[output_idx].name;
        int dreg = ctx->shader->output[output_idx].gpr;
-       int r;
 
        param = r600_get_lds_unique_index(name, 0);
-       r = get_lds_offset0(ctx, 1, temp_reg, true);
-       if (r)
-               return r;
-
-       do_lds_fetch_values(ctx, temp_reg, dreg, 0xf, param);
+       
+       do_lds_fetch_values(ctx, ctx->tess_io_info_precalc, 1, dreg, 0xf, 
param);
        return 0;
 }
 
@@ -3293,11 +3373,13 @@ static int r600_shader_from_tgsi(struct r600_context 
*rctx,
        if (ctx.type == PIPE_SHADER_TESS_CTRL) {
                ctx.tess_input_info = ctx.bc->ar_reg + 3;
                ctx.tess_output_info = ctx.bc->ar_reg + 4;
-               ctx.temp_reg = ctx.bc->ar_reg + 5;
+               ctx.tess_io_info_precalc = ctx.bc->ar_reg + 5;
+               ctx.temp_reg = ctx.bc->ar_reg + 6;
        } else if (ctx.type == PIPE_SHADER_TESS_EVAL) {
                ctx.tess_input_info = 0;
                ctx.tess_output_info = ctx.bc->ar_reg + 3;
-               ctx.temp_reg = ctx.bc->ar_reg + 4;
+               ctx.tess_io_info_precalc = ctx.bc->ar_reg + 4;
+               ctx.temp_reg = ctx.bc->ar_reg + 5;
        } else if (ctx.type == PIPE_SHADER_GEOMETRY) {
                ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3;
                ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4;
@@ -3316,18 +3398,27 @@ static int r600_shader_from_tgsi(struct r600_context 
*rctx,
                ctx.temp_reg = ctx.bc->ar_reg + 3;
        }
 
-       if (lds_inputs) {
+       ctx.tess_input_cache.uses_lds_io = 0;
+       if (lds_inputs || lds_outputs) {
                tgsi_parse_init(&ctx.parse, tokens);
+
                while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
                        tgsi_parse_token(&ctx.parse);
-
-                       if (ctx.parse.FullToken.Token.Type != 
TGSI_TOKEN_TYPE_INSTRUCTION)
-                               continue;
-
-                       count_tess_inputs(&ctx);
+                       if (ctx.parse.FullToken.Token.Type == 
TGSI_TOKEN_TYPE_INSTRUCTION)
+                               count_tess_inputs(&ctx);
+                       else if (ctx.parse.FullToken.Token.Type == 
TGSI_TOKEN_TYPE_DECLARATION) {
+                               struct tgsi_full_declaration *d = 
&ctx.parse.FullToken.FullDeclaration;
+                               if (d->Declaration.File == 
TGSI_FILE_SYSTEM_VALUE &&
+                                   (d->Semantic.Name == 
TGSI_SEMANTIC_TESSINNER ||
+                                    d->Semantic.Name == 
TGSI_SEMANTIC_TESSOUTER))
+                                       ctx.tess_input_cache.uses_lds_io = 1;
+                               
+                       }
                }
                ctx.temp_reg += 
tess_input_cache_count_multiused(&ctx.tess_input_cache, ctx.temp_reg);
                tgsi_parse_init(&ctx.parse, tokens);
+       } else {
+
        }
 
        shader->max_arrays = 0;
-- 
2.13.6

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to