Re: [Qemu-devel] [PATCH 09/10 v11] target-tilegx: Generate tcg instructions to finish "Hello world"

Richard Henderson Mon, 01 Jun 2015 11:42:07 -0700

First, what happened to the decoding skeleton patch?  You seem to have merged
it with patch 9 here.  That said, see the bottom of this message.


On 05/30/2015 02:18 PM, Chen Gang wrote:
> +/* mfspr can be only in X1 pipe, so it doesn't need to be bufferd */
> +static void gen_mfspr(struct DisasContext *dc, uint8_t rdst, uint16_t imm14)

I'm not keen on this as a comment.  Clearly it could be buffered, with what is
implemented here now.  But there are plenty of SPRs for which produce side
effects, and *cannot* be buffered.

Adjust the comment to

/* Many SPR reads have side effects and cannot be buffered.  However, they are
   all in the X1 pipe, which we are executing last, therefore we need not do
   additional buffering.  */

> +/* mtspr can be only in X1 pipe, so it doesn't need to be bufferd */

Same, but s/reads/writes/.

> +#if 1

Do not include this.

> +/*
> + * uint64_t output = 0;
> + * uint32_t counter;
> + * for (counter = 0; counter < (WORD_SIZE / BYTE_SIZE); counter++)
> + * {
> + *     int8_t srca = getByte (rf[SrcA], counter);
> + *     int8_t srcb = signExtend8 (Imm8);
> + *     output = setByte (output, counter, ((srca == srcb) ? 1 : 0));
> + * }
> + * rf[Dest] = output;
> + */
> +static void gen_v1cmpeqi(struct DisasContext *dc,
> +                         uint8_t rdst, uint8_t rsrc, int8_t imm8)

Pass in the condition to use, since you'll eventually need to implement
v1cmpltsi, v1cmpltui.

> +static void gen_v1cmpeq(struct DisasContext *dc,
> +                        uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)

Likewise for v1cmples, v1cmpleu, v1cmplts, v1cmpltu, v1cmpne.

> +    tcg_gen_movi_i64(vdst, 0); /* or Assertion `ts->val_type == 
> TEMP_VAL_REG' */

These comments are unnecessary.  Of course it's illegal to use an uninitialized
temporary.

> +static void gen_v4int_l(struct DisasContext *dc,
> +                        uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
> +{
> +    qemu_log_mask(CPU_LOG_TB_IN_ASM, "v4int_l r%d, r%d, r%d\n",
> +                  rdst, rsrc, rsrcb);
> +    tcg_gen_deposit_i64(dest_gr(dc, rdst), load_gr(dc, rsrc),
> +                        load_gr(dc, rsrcb), 0, 32);

This is incorrect.  This produces { A1, B0 }, not { A0, B0 }.

As I said, you did want "32, 32" as the field insert, but you have the source
operands in the wrong order.

> +static void gen_addx(struct DisasContext *dc,
> +                     uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
> +{
> +    TCGv vdst = dest_gr(dc, rdst);
> +
> +    /* High bits have no effect with low bits, so addx and addxsc are 
> merged. */
> +    qemu_log_mask(CPU_LOG_TB_IN_ASM, "addx(sc) r%d, r%d, r%d\n",
> +                  rdst, rsrc, rsrcb);

Um, no, addxsc does signed saturation before sign extension.

> +static void gen_mul_u_u(struct DisasContext *dc,
> +                        uint8_t rdst, uint8_t rsrc, uint8_t rsrcb,
> +                        int8 high, int8 highb, int8 add, const char *code)

A better name for this function is warranted, since it does much more than
mul_u_u.  The add parameter should be type bool.

Given the existence of mul_hs_hs, mul_hu_ls, etc, you're probably better off
passing in extraction functions.  E.g.

static void ext32s_high(TCGv d, TCGv s)
{
    tcg_gen_sari_i64(d, s, 32);
}

static void ext32u_high(TCGv d, TCGv s)
{
    tcg_gen_shri_i64(d, s, 32);
}

  gen_mul(dc, rdst, rsrc, rsrcb, ext32s_high, ext32s_high,
          false, "mul_hs_hs");
  gen_mul(dc, rdst, rsrc, rsrcb, ext32s_high, ext32u_high,
          false, "mul_hs_hu");
  gen_mul(dc, rdst, rsrc, rsrcb, ext32s_high, tcg_gen_ext32s_i64,
          false, "mul_hs_ls");
  gen_mul(dc, rdst, rsrc, rsrcb, ext32s_high, tcg_gen_ext32u_i64,
          false, "mul_hs_lu");

  gen_mul(dc, rdst, rsrc, rsrcb, ext32u_high, ext32u_high,
          false, "mul_hu_hu");
  gen_mul(dc, rdst, rsrc, rsrcb, ext32u_high, tcg_gen_ext32s_i64,
          false, "mul_hu_ls");
  gen_mul(dc, rdst, rsrc, rsrcb, ext32u_high, tcg_gen_ext32u_i64,
          false, "mul_hu_lu");

  gen_mul(dc, rdst, rsrc, rsrcb, tcg_gen_ext32s_i64, tcg_gen_ext32s_i64,
          false, "mul_ls_ls");
  gen_mul(dc, rdst, rsrc, rsrcb, tcg_gen_ext32s_i64, tcg_gen_ext32u_i64,
          false, "mul_ls_lu");

  gen_mul(dc, rdst, rsrc, rsrcb, tcg_gen_ext32u_i64, tcg_gen_ext32u_i64,
          false, "mul_lu_lu");


and of course the same for the mula insns with true instead of false for the
"add" parameter.

> +static void gen_shladd(struct DisasContext *dc,
> +                       uint8_t rdst, uint8_t rsrc, uint8_t rsrcb,
> +                       uint8_t shift, uint8_t cast)

cast should be bool.

> +static void gen_dblalign(struct DisasContext *dc,
> +                         uint8_t rdst, uint8_t rsrc, uint8_t rsrcb)
> +{
> +    TCGv vdst = dest_gr(dc, rdst);
> +    TCGv mask = tcg_temp_new_i64();
> +    TCGv tmp = tcg_temp_new_i64();
> +
> +    qemu_log_mask(CPU_LOG_TB_IN_ASM, "dblalign r%d, r%d, r%d\n",
> +                  rdst, rsrc, rsrcb);
> +
> +    tcg_gen_andi_i64(mask, load_gr(dc, rsrcb), 7);
> +    tcg_gen_muli_i64(mask, mask, 8);

tcg_gen_shli_i64(mask, mask, 3);

> +    tcg_gen_shr_i64(vdst, load_gr(dc, rdst), mask);
> +
> +    tcg_gen_movi_i64(tmp, 64);
> +    tcg_gen_sub_i64(mask, tmp, mask);
> +    tcg_gen_shl_i64(mask, load_gr(dc, rsrc), mask);
> +
> +    tcg_gen_or_i64(vdst, vdst, mask);

Does not produce the correct results for mask == 0.

What you want is when mask == 0, you shift A by 64 bits, i.e. produce a zero.
But you can't do that in TCG (or C for that matter).  Best is to do two shifts:

  tcg_gen_xori_i64(mask, mask, 63); /* compute 1's compliment of the shift */
  tcg_gen_shl_i64(mask, load_gr(dc, rsrc), mask);
  tcg_gen_shli_i64(mask, mask, 1); /* one more to produce 2's compliment */

> +static void gen_ld_add(struct DisasContext *dc,
> +                       uint8_t rdst, uint8_t rsrc, int8_t imm8,
> +                       TCGMemOp ops, const char *code)
> +{
> +    qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d, r%d, %d\n",
> +                  code, rdst, rsrc, imm8);
> +
> +    tcg_gen_qemu_ld_i64(dest_gr(dc, rdst), load_gr(dc, rsrc),
> +                        MMU_USER_IDX, ops);
> +    /*
> +     * Each pipe only have one temp val which is already used, and it is only
> +     * for pipe X1, so can use real register
> +     */
> +    if (rsrc < TILEGX_R_COUNT) {
> +        tcg_gen_addi_i64(cpu_regs[rsrc], load_gr(dc, rsrc), imm8);
> +    }
> +}

This is a poor comment.  Clearly each pipe can have two outputs, so this
limitation is simply of your own design.

Further, the < TILEGX_R_COUNT restriction is also incorrect.  True, you don't
actually implement the top 7 special registers, but that doesn't matter, you
should still be incrementing them.

> +
> +    return;

Do not add bare return statments at the ends of functions.

> +static int gen_blb(struct DisasContext *dc, uint8_t rsrc, int32_t off,
> +                   TCGCond cond, const char *code)

Unused return value.  What were you intending?

> +static void decode_rrr_1_opcode_y0(struct DisasContext *dc,
> +                                   tilegx_bundle_bits bundle)
> +{
> +    uint8_t rsrc = get_SrcA_Y0(bundle);
> +    uint8_t rsrcb = get_SrcB_Y0(bundle);
> +    uint8_t rdst = get_Dest_Y0(bundle);
> +
> +    switch (get_RRROpcodeExtension_Y0(bundle)) {
> +    case UNARY_RRR_1_OPCODE_Y0:
> +        switch (get_UnaryOpcodeExtension_Y0(bundle)) {
> +        case CNTLZ_UNARY_OPCODE_Y0:
> +            gen_cntlz(dc, rdst, rsrc);
> +            return;
> +        case CNTTZ_UNARY_OPCODE_Y0:
> +            gen_cnttz(dc, rdst, rsrc);
> +            return;
> +        case NOP_UNARY_OPCODE_Y0:
> +        case  FNOP_UNARY_OPCODE_Y0:
> +            if (!rsrc && !rdst) {
> +                qemu_log_mask(CPU_LOG_TB_IN_ASM, "(f)nop\n");
> +                return;
> +            }
> +            break;
> +        case FSINGLE_PACK1_UNARY_OPCODE_Y0:
> +        case PCNT_UNARY_OPCODE_Y0:
> +        case REVBITS_UNARY_OPCODE_Y0:
> +        case REVBYTES_UNARY_OPCODE_Y0:
> +        case TBLIDXB0_UNARY_OPCODE_Y0:
> +        case TBLIDXB1_UNARY_OPCODE_Y0:
> +        case TBLIDXB2_UNARY_OPCODE_Y0:
> +        case TBLIDXB3_UNARY_OPCODE_Y0:
> +        default:
> +            break;
> +        }
> +        break;
> +    case SHL1ADD_RRR_1_OPCODE_Y0:
> +        gen_shladd(dc, rdst, rsrc, rsrcb, 1, 0);
> +        return;
> +    case SHL2ADD_RRR_1_OPCODE_Y0:
> +        gen_shladd(dc, rdst, rsrc, rsrcb, 2, 0);
> +        return;
> +    case SHL3ADD_RRR_1_OPCODE_Y0:
> +        gen_shladd(dc, rdst, rsrc, rsrcb, 3, 0);
> +        return;
> +    default:
> +        break;
> +    }
> +
> +    qemu_log_mask(LOG_UNIMP, "UNIMP rrr_1_opcode_y0, [" FMT64X "]\n", 
> bundle);
> +}
> +

I can't help thinking, as I read all of these decode functions, that it would
be better if the output disassembly, i.e. qemu_log_mask(CPU_LOG_TB_IN_ASM, *),
were to happen here, instead of being spread across 99 other functions.

This has a side effect of reducing many of your functions to a single
statement, invoking another tcg generator, at which point it's worth inlining 
them.

For example:

static void decode_rrr_1_unary_y0(struct DisasContext *dc,
                                  tilegx_bundle_bits bundle,
                                  uint8_t rdst, uint8_t rsrc)
{
    unsigned ext = get_UnaryOpcodeExtension_Y0(bundle);
    const char *mnemonic;
    TCGv vdst, vsrc;

    if (ext == NOP_UNARY_OPCODE_Y0 || ext == FNOP_UNARY_OPCODE_Y0) {
        if (rsrc != 0 || rdst != 0) {
            goto unimplemented;
        }
        qemu_log_mask(CPU_LOG_TB_IN_ASM, "(f)nop\n");
        return;
    }

    vdst = dest_gr(dc, rdst);
    vsrc = load_gr(dc, rsrc);

    switch (ext) {
    case CNTLZ_UNARY_OPCODE_Y0:
        gen_helper_cntlz(vdst, vsrc);
        mnemonic = "cntlz";
        break;
    case CNTTZ_UNARY_OPCODE_Y0:
        gen_helper_cnttz(vdst, vsrc);
        mnemonic = "cnttz";
        break;
    case FSINGLE_PACK1_UNARY_OPCODE_Y0:
    case PCNT_UNARY_OPCODE_Y0:
    case REVBITS_UNARY_OPCODE_Y0:
    case REVBYTES_UNARY_OPCODE_Y0:
    case TBLIDXB0_UNARY_OPCODE_Y0:
    case TBLIDXB1_UNARY_OPCODE_Y0:
    case TBLIDXB2_UNARY_OPCODE_Y0:
    case TBLIDXB3_UNARY_OPCODE_Y0:
    default:
    unimplemented:
        qemu_log_mask(LOG_UNIMP, "UNIMP rrr_1_unary_y0, [" FMT64X "]\n",
                      bundle);
        dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
        return;
    }

    qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d,r%d\n",
                  mnemonic, rdst, rsrc);
}

static void decode_rrr_1_opcode_y0(struct DisasContext *dc,
                                   tilegx_bundle_bits bundle)
{
    unsigned ext = get_RRROpcodeExtension_Y0(bundle);
    uint8_t rsrca = get_SrcA_Y0(bundle);
    uint8_t rsrcb = get_SrcB_Y0(bundle);
    uint8_t rdst = get_Dest_Y0(bundle);
    const char *mnemonic;
    TCGv vdst, vsrca, vsrcb;

    if (ext == UNARY_RRR_1_OPCODE_Y0) {
        decode_rrr_1_unary_y0(dc, bundle, rdst, rsrc);
        return;
    }

    vdst = dest_gr(dc, rdst);
    vsrca = load_gr(dc, rsrca);
    vsrcb = load_gr(dc, rsrcb);

    switch (ext) {
    case SHL1ADD_RRR_1_OPCODE_Y0:
        gen_shladd(vdst, vsrca, vsrcb, 1, 0);
        mnemonic = "shl1add";
        break;
    case SHL2ADD_RRR_1_OPCODE_Y0:
        gen_shladd(vdst, vsrca, vsrcb, 2, 0);
        mnemonic = "shl2add";
        break;
    case SHL3ADD_RRR_1_OPCODE_Y0:
        gen_shladd(vdst, vsrca, vsrcb, 3, 0);
        mnemonic = "shl3add";
        break;
    default:
        qemu_log_mask(LOG_UNIMP, "UNIMP rrr_1_opcode_y0, [" FMT64X "]\n",
                      bundle);
        dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED;
        return;
    }
    qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d,r%d,r%d\n",
                  mnemonic, rdst, rsrca, rsrcb);
}


r~

Re: [Qemu-devel] [PATCH 09/10 v11] target-tilegx: Generate tcg instructions to finish "Hello world"

Reply via email to