First, what happened to the decoding skeleton patch? You seem to have merged it with patch 9 here. That said, see the bottom of this message.
On 05/30/2015 02:18 PM, Chen Gang wrote: > +/* mfspr can be only in X1 pipe, so it doesn't need to be bufferd */ > +static void gen_mfspr(struct DisasContext *dc, uint8_t rdst, uint16_t imm14) I'm not keen on this as a comment. Clearly it could be buffered, with what is implemented here now. But there are plenty of SPRs for which produce side effects, and *cannot* be buffered. Adjust the comment to /* Many SPR reads have side effects and cannot be buffered. However, they are all in the X1 pipe, which we are executing last, therefore we need not do additional buffering. */ > +/* mtspr can be only in X1 pipe, so it doesn't need to be bufferd */ Same, but s/reads/writes/. > +#if 1 Do not include this. > +/* > + * uint64_t output = 0; > + * uint32_t counter; > + * for (counter = 0; counter < (WORD_SIZE / BYTE_SIZE); counter++) > + * { > + * int8_t srca = getByte (rf[SrcA], counter); > + * int8_t srcb = signExtend8 (Imm8); > + * output = setByte (output, counter, ((srca == srcb) ? 1 : 0)); > + * } > + * rf[Dest] = output; > + */ > +static void gen_v1cmpeqi(struct DisasContext *dc, > + uint8_t rdst, uint8_t rsrc, int8_t imm8) Pass in the condition to use, since you'll eventually need to implement v1cmpltsi, v1cmpltui. > +static void gen_v1cmpeq(struct DisasContext *dc, > + uint8_t rdst, uint8_t rsrc, uint8_t rsrcb) Likewise for v1cmples, v1cmpleu, v1cmplts, v1cmpltu, v1cmpne. > + tcg_gen_movi_i64(vdst, 0); /* or Assertion `ts->val_type == > TEMP_VAL_REG' */ These comments are unnecessary. Of course it's illegal to use an uninitialized temporary. > +static void gen_v4int_l(struct DisasContext *dc, > + uint8_t rdst, uint8_t rsrc, uint8_t rsrcb) > +{ > + qemu_log_mask(CPU_LOG_TB_IN_ASM, "v4int_l r%d, r%d, r%d\n", > + rdst, rsrc, rsrcb); > + tcg_gen_deposit_i64(dest_gr(dc, rdst), load_gr(dc, rsrc), > + load_gr(dc, rsrcb), 0, 32); This is incorrect. This produces { A1, B0 }, not { A0, B0 }. As I said, you did want "32, 32" as the field insert, but you have the source operands in the wrong order. > +static void gen_addx(struct DisasContext *dc, > + uint8_t rdst, uint8_t rsrc, uint8_t rsrcb) > +{ > + TCGv vdst = dest_gr(dc, rdst); > + > + /* High bits have no effect with low bits, so addx and addxsc are > merged. */ > + qemu_log_mask(CPU_LOG_TB_IN_ASM, "addx(sc) r%d, r%d, r%d\n", > + rdst, rsrc, rsrcb); Um, no, addxsc does signed saturation before sign extension. > +static void gen_mul_u_u(struct DisasContext *dc, > + uint8_t rdst, uint8_t rsrc, uint8_t rsrcb, > + int8 high, int8 highb, int8 add, const char *code) A better name for this function is warranted, since it does much more than mul_u_u. The add parameter should be type bool. Given the existence of mul_hs_hs, mul_hu_ls, etc, you're probably better off passing in extraction functions. E.g. static void ext32s_high(TCGv d, TCGv s) { tcg_gen_sari_i64(d, s, 32); } static void ext32u_high(TCGv d, TCGv s) { tcg_gen_shri_i64(d, s, 32); } gen_mul(dc, rdst, rsrc, rsrcb, ext32s_high, ext32s_high, false, "mul_hs_hs"); gen_mul(dc, rdst, rsrc, rsrcb, ext32s_high, ext32u_high, false, "mul_hs_hu"); gen_mul(dc, rdst, rsrc, rsrcb, ext32s_high, tcg_gen_ext32s_i64, false, "mul_hs_ls"); gen_mul(dc, rdst, rsrc, rsrcb, ext32s_high, tcg_gen_ext32u_i64, false, "mul_hs_lu"); gen_mul(dc, rdst, rsrc, rsrcb, ext32u_high, ext32u_high, false, "mul_hu_hu"); gen_mul(dc, rdst, rsrc, rsrcb, ext32u_high, tcg_gen_ext32s_i64, false, "mul_hu_ls"); gen_mul(dc, rdst, rsrc, rsrcb, ext32u_high, tcg_gen_ext32u_i64, false, "mul_hu_lu"); gen_mul(dc, rdst, rsrc, rsrcb, tcg_gen_ext32s_i64, tcg_gen_ext32s_i64, false, "mul_ls_ls"); gen_mul(dc, rdst, rsrc, rsrcb, tcg_gen_ext32s_i64, tcg_gen_ext32u_i64, false, "mul_ls_lu"); gen_mul(dc, rdst, rsrc, rsrcb, tcg_gen_ext32u_i64, tcg_gen_ext32u_i64, false, "mul_lu_lu"); and of course the same for the mula insns with true instead of false for the "add" parameter. > +static void gen_shladd(struct DisasContext *dc, > + uint8_t rdst, uint8_t rsrc, uint8_t rsrcb, > + uint8_t shift, uint8_t cast) cast should be bool. > +static void gen_dblalign(struct DisasContext *dc, > + uint8_t rdst, uint8_t rsrc, uint8_t rsrcb) > +{ > + TCGv vdst = dest_gr(dc, rdst); > + TCGv mask = tcg_temp_new_i64(); > + TCGv tmp = tcg_temp_new_i64(); > + > + qemu_log_mask(CPU_LOG_TB_IN_ASM, "dblalign r%d, r%d, r%d\n", > + rdst, rsrc, rsrcb); > + > + tcg_gen_andi_i64(mask, load_gr(dc, rsrcb), 7); > + tcg_gen_muli_i64(mask, mask, 8); tcg_gen_shli_i64(mask, mask, 3); > + tcg_gen_shr_i64(vdst, load_gr(dc, rdst), mask); > + > + tcg_gen_movi_i64(tmp, 64); > + tcg_gen_sub_i64(mask, tmp, mask); > + tcg_gen_shl_i64(mask, load_gr(dc, rsrc), mask); > + > + tcg_gen_or_i64(vdst, vdst, mask); Does not produce the correct results for mask == 0. What you want is when mask == 0, you shift A by 64 bits, i.e. produce a zero. But you can't do that in TCG (or C for that matter). Best is to do two shifts: tcg_gen_xori_i64(mask, mask, 63); /* compute 1's compliment of the shift */ tcg_gen_shl_i64(mask, load_gr(dc, rsrc), mask); tcg_gen_shli_i64(mask, mask, 1); /* one more to produce 2's compliment */ > +static void gen_ld_add(struct DisasContext *dc, > + uint8_t rdst, uint8_t rsrc, int8_t imm8, > + TCGMemOp ops, const char *code) > +{ > + qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d, r%d, %d\n", > + code, rdst, rsrc, imm8); > + > + tcg_gen_qemu_ld_i64(dest_gr(dc, rdst), load_gr(dc, rsrc), > + MMU_USER_IDX, ops); > + /* > + * Each pipe only have one temp val which is already used, and it is only > + * for pipe X1, so can use real register > + */ > + if (rsrc < TILEGX_R_COUNT) { > + tcg_gen_addi_i64(cpu_regs[rsrc], load_gr(dc, rsrc), imm8); > + } > +} This is a poor comment. Clearly each pipe can have two outputs, so this limitation is simply of your own design. Further, the < TILEGX_R_COUNT restriction is also incorrect. True, you don't actually implement the top 7 special registers, but that doesn't matter, you should still be incrementing them. > + > + return; Do not add bare return statments at the ends of functions. > +static int gen_blb(struct DisasContext *dc, uint8_t rsrc, int32_t off, > + TCGCond cond, const char *code) Unused return value. What were you intending? > +static void decode_rrr_1_opcode_y0(struct DisasContext *dc, > + tilegx_bundle_bits bundle) > +{ > + uint8_t rsrc = get_SrcA_Y0(bundle); > + uint8_t rsrcb = get_SrcB_Y0(bundle); > + uint8_t rdst = get_Dest_Y0(bundle); > + > + switch (get_RRROpcodeExtension_Y0(bundle)) { > + case UNARY_RRR_1_OPCODE_Y0: > + switch (get_UnaryOpcodeExtension_Y0(bundle)) { > + case CNTLZ_UNARY_OPCODE_Y0: > + gen_cntlz(dc, rdst, rsrc); > + return; > + case CNTTZ_UNARY_OPCODE_Y0: > + gen_cnttz(dc, rdst, rsrc); > + return; > + case NOP_UNARY_OPCODE_Y0: > + case FNOP_UNARY_OPCODE_Y0: > + if (!rsrc && !rdst) { > + qemu_log_mask(CPU_LOG_TB_IN_ASM, "(f)nop\n"); > + return; > + } > + break; > + case FSINGLE_PACK1_UNARY_OPCODE_Y0: > + case PCNT_UNARY_OPCODE_Y0: > + case REVBITS_UNARY_OPCODE_Y0: > + case REVBYTES_UNARY_OPCODE_Y0: > + case TBLIDXB0_UNARY_OPCODE_Y0: > + case TBLIDXB1_UNARY_OPCODE_Y0: > + case TBLIDXB2_UNARY_OPCODE_Y0: > + case TBLIDXB3_UNARY_OPCODE_Y0: > + default: > + break; > + } > + break; > + case SHL1ADD_RRR_1_OPCODE_Y0: > + gen_shladd(dc, rdst, rsrc, rsrcb, 1, 0); > + return; > + case SHL2ADD_RRR_1_OPCODE_Y0: > + gen_shladd(dc, rdst, rsrc, rsrcb, 2, 0); > + return; > + case SHL3ADD_RRR_1_OPCODE_Y0: > + gen_shladd(dc, rdst, rsrc, rsrcb, 3, 0); > + return; > + default: > + break; > + } > + > + qemu_log_mask(LOG_UNIMP, "UNIMP rrr_1_opcode_y0, [" FMT64X "]\n", > bundle); > +} > + I can't help thinking, as I read all of these decode functions, that it would be better if the output disassembly, i.e. qemu_log_mask(CPU_LOG_TB_IN_ASM, *), were to happen here, instead of being spread across 99 other functions. This has a side effect of reducing many of your functions to a single statement, invoking another tcg generator, at which point it's worth inlining them. For example: static void decode_rrr_1_unary_y0(struct DisasContext *dc, tilegx_bundle_bits bundle, uint8_t rdst, uint8_t rsrc) { unsigned ext = get_UnaryOpcodeExtension_Y0(bundle); const char *mnemonic; TCGv vdst, vsrc; if (ext == NOP_UNARY_OPCODE_Y0 || ext == FNOP_UNARY_OPCODE_Y0) { if (rsrc != 0 || rdst != 0) { goto unimplemented; } qemu_log_mask(CPU_LOG_TB_IN_ASM, "(f)nop\n"); return; } vdst = dest_gr(dc, rdst); vsrc = load_gr(dc, rsrc); switch (ext) { case CNTLZ_UNARY_OPCODE_Y0: gen_helper_cntlz(vdst, vsrc); mnemonic = "cntlz"; break; case CNTTZ_UNARY_OPCODE_Y0: gen_helper_cnttz(vdst, vsrc); mnemonic = "cnttz"; break; case FSINGLE_PACK1_UNARY_OPCODE_Y0: case PCNT_UNARY_OPCODE_Y0: case REVBITS_UNARY_OPCODE_Y0: case REVBYTES_UNARY_OPCODE_Y0: case TBLIDXB0_UNARY_OPCODE_Y0: case TBLIDXB1_UNARY_OPCODE_Y0: case TBLIDXB2_UNARY_OPCODE_Y0: case TBLIDXB3_UNARY_OPCODE_Y0: default: unimplemented: qemu_log_mask(LOG_UNIMP, "UNIMP rrr_1_unary_y0, [" FMT64X "]\n", bundle); dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED; return; } qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d,r%d\n", mnemonic, rdst, rsrc); } static void decode_rrr_1_opcode_y0(struct DisasContext *dc, tilegx_bundle_bits bundle) { unsigned ext = get_RRROpcodeExtension_Y0(bundle); uint8_t rsrca = get_SrcA_Y0(bundle); uint8_t rsrcb = get_SrcB_Y0(bundle); uint8_t rdst = get_Dest_Y0(bundle); const char *mnemonic; TCGv vdst, vsrca, vsrcb; if (ext == UNARY_RRR_1_OPCODE_Y0) { decode_rrr_1_unary_y0(dc, bundle, rdst, rsrc); return; } vdst = dest_gr(dc, rdst); vsrca = load_gr(dc, rsrca); vsrcb = load_gr(dc, rsrcb); switch (ext) { case SHL1ADD_RRR_1_OPCODE_Y0: gen_shladd(vdst, vsrca, vsrcb, 1, 0); mnemonic = "shl1add"; break; case SHL2ADD_RRR_1_OPCODE_Y0: gen_shladd(vdst, vsrca, vsrcb, 2, 0); mnemonic = "shl2add"; break; case SHL3ADD_RRR_1_OPCODE_Y0: gen_shladd(vdst, vsrca, vsrcb, 3, 0); mnemonic = "shl3add"; break; default: qemu_log_mask(LOG_UNIMP, "UNIMP rrr_1_opcode_y0, [" FMT64X "]\n", bundle); dc->exception = TILEGX_EXCP_OPCODE_UNIMPLEMENTED; return; } qemu_log_mask(CPU_LOG_TB_IN_ASM, "%s r%d,r%d,r%d\n", mnemonic, rdst, rsrca, rsrcb); } r~