Am 02.02.2018 um 05:56 schrieb Dave Airlie:
> From: Dave Airlie <airl...@redhat.com>
> 
> Doing a straight uint/int->fp32->fp64 conversion causes
> some precision issues, Roland suggested splitting the
> integer into two portions and doing two separate
> int->fp32->fp64 conversions then adding the results.
> 
> This passes the tests in CTS and piglit.
> 
> Signed-off-by: Dave Airlie <airl...@redhat.com>
> ---
>  src/gallium/drivers/r600/r600_shader.c | 118 
> +++++++++++++++++++++++++--------
>  1 file changed, 90 insertions(+), 28 deletions(-)
> 
> diff --git a/src/gallium/drivers/r600/r600_shader.c 
> b/src/gallium/drivers/r600/r600_shader.c
> index 13aa681049..22f2736b03 100644
> --- a/src/gallium/drivers/r600/r600_shader.c
> +++ b/src/gallium/drivers/r600/r600_shader.c
> @@ -4490,44 +4490,106 @@ static int egcm_int_to_double(struct r600_shader_ctx 
> *ctx)
>  {
>       struct tgsi_full_instruction *inst = 
> &ctx->parse.FullToken.FullInstruction;
>       struct r600_bytecode_alu alu;
> -     int i, r;
> -     int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
> +     int i, c, r;
> +     int write_mask = inst->Dst[0].Register.WriteMask;
> +     int temp_reg = r600_get_temp(ctx);
>  
>       assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
>               inst->Instruction.Opcode == TGSI_OPCODE_U2D);
>  
> -     for (i = 0; i <= (lasti+1)/2; i++) {
> -             memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> -             alu.op = ctx->inst_info->op;
> -
> -             r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
> -             alu.dst.sel = ctx->temp_reg;
> -             alu.dst.chan = i;
> -             alu.dst.write = 1;
> -             alu.last = 1;
> +     for (c = 0; c < 2; c++) {
> +             int dchan = c * 2;
> +             if (write_mask & (0x3 << dchan)) {
> +     /* split into 24-bit int and 8-bit int */
> +                     memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +                     alu.op = ALU_OP2_AND_INT;
> +                     alu.dst.sel = temp_reg;
> +                     alu.dst.chan = dchan;
> +                     r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
> +                     alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
> +                     alu.src[1].value = 0xffffff00;
> +                     alu.dst.write = 1;
> +                     r = r600_bytecode_add_alu(ctx->bc, &alu);
> +                     if (r)
> +                             return r;
>  
> -             r = r600_bytecode_add_alu(ctx->bc, &alu);
> -             if (r)
> -                     return r;
> +                     memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> +                     alu.op = ALU_OP2_AND_INT;
> +                     alu.dst.sel = temp_reg;
> +                     alu.dst.chan = dchan + 1;
> +                     r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
> +                     alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
> +                     alu.src[1].value = 0xff;
> +                     alu.dst.write = 1;
> +                     alu.last = 1;
> +                     r = r600_bytecode_add_alu(ctx->bc, &alu);
> +                     if (r)
> +                             return r;
> +             }
>       }
>  
> -     for (i = 0; i <= lasti; i++) {
> -             memset(&alu, 0, sizeof(struct r600_bytecode_alu));
> -             alu.op = ALU_OP1_FLT32_TO_FLT64;
> +     for (c = 0; c < 2; c++) {
> +             int dchan = c * 2;
> +             if (write_mask & (0x3 << dchan)) {
> +                     for (i = dchan; i <= dchan + 1; i++) {
> +                             memset(&alu, 0, sizeof(struct 
> r600_bytecode_alu));
> +                             alu.op = i == dchan ? ctx->inst_info->op : 
> ALU_OP1_UINT_TO_FLT;
>  
> -             alu.src[0].chan = i/2;
> -             if (i%2 == 0)
> -                     alu.src[0].sel = ctx->temp_reg;
> -             else {
> -                     alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
> -                     alu.src[0].value = 0x0;
> +                             alu.src[0].sel = temp_reg;
> +                             alu.src[0].chan = i;
> +                             alu.dst.sel = temp_reg;
> +                             alu.dst.chan = i;
> +                             alu.dst.write = 1;
> +                             alu.last = i == dchan + 1;
> +
> +                             r = r600_bytecode_add_alu(ctx->bc, &alu);
> +                             if (r)
> +                                     return r;
> +                     }
>               }
That'll still work on Evergreen (Cypress), where UINT_TO_FLT is scalar, right?
I just realized that for the low 8 bits you could actually skip the
masking and use UBYTE0_FLT instead, if that instruction does what the
docs say :-). I guess on Cayman that won't be much of an
improvement, but it might shave off another instruction or two on Cypress
(as UBYTE0_FLT is a vector instruction)...
In any case,

Reviewed-by: Roland Scheidegger <srol...@vmware.com>

> -             tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
> -             alu.last = i == lasti;
> +     }
>  
> -             r = r600_bytecode_add_alu(ctx->bc, &alu);
> -             if (r)
> -                     return r;
> +     for (c = 0; c < 2; c++) {
> +             int dchan = c * 2;
> +             if (write_mask & (0x3 << dchan)) {
> +                     for (i = 0; i < 4; i++) {
> +                             memset(&alu, 0, sizeof(struct 
> r600_bytecode_alu));
> +                             alu.op = ALU_OP1_FLT32_TO_FLT64;
> +
> +                             alu.src[0].chan = dchan + (i / 2);
> +                             if (i == 0 || i == 2)
> +                                     alu.src[0].sel = temp_reg;
> +                             else {
> +                                     alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
> +                                     alu.src[0].value = 0x0;
> +                             }
> +                             alu.dst.sel = ctx->temp_reg;
> +                             alu.dst.chan = i;
> +                             alu.last = i == 3;
> +                             alu.dst.write = 1;
> +
> +                             r = r600_bytecode_add_alu(ctx->bc, &alu);
> +                             if (r)
> +                                     return r;
> +                     }
> +
> +                     for (i = 0; i <= 1; i++) {
> +                             memset(&alu, 0, sizeof(struct 
> r600_bytecode_alu));
> +                             alu.op = ALU_OP2_ADD_64;
> +
> +                             alu.src[0].chan = fp64_switch(i);
> +                             alu.src[0].sel = ctx->temp_reg;
> +
> +                             alu.src[1].chan = fp64_switch(i + 2);
> +                             alu.src[1].sel = ctx->temp_reg;
> +                             tgsi_dst(ctx, &inst->Dst[0], dchan + i, 
> &alu.dst);
> +                             alu.last = i == 1;
> +
> +                             r = r600_bytecode_add_alu(ctx->bc, &alu);
> +                             if (r)
> +                                     return r;
> +                     }
> +             }
>       }
>  
>       return 0;
> 

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev

Reply via email to