> diff --git a/target-bfin/bfin-sim.c b/target-bfin/bfin-sim.c Why this separate file from translate.c?
> +#include <stdbool.h> > +#include <stdint.h> > +#include <stdio.h> > +#include <stdlib.h> > +#include <string.h> > +#include <inttypes.h> Certainly you shouldn't need these, since this isn't a separately compiled object -- you're included from translate.c. > +static void > +unhandled_instruction(DisasContext *dc, const char *insn) > +{ > + fprintf(stderr, "unhandled insn: %s\n", insn); Use LOG_UNIMP. > +#define HOST_LONG_WORD_SIZE (sizeof(long) * 8) You mean TCG_TARGET_REG_BITS? > +static TCGv > +get_allreg(DisasContext *dc, int grp, int reg) > +{ > + TCGv *ret = cpu_regs[(grp << 3) | reg]; > + if (ret) { > + return *ret; > + } > + abort(); > + illegal_instruction(dc); > +} Well, which is it? abort or illegal_instruction. And come to that, how is abort any better than SEGV from dereferencing the null? Certainly the latter will generate a faster translator... > +decode_multfunc_tl(DisasContext *dc, int h0, int h1, int src0, int src1, > + int mmod, int MM, TCGv psat) > +{ > + TCGv s0, s1, val; > + > + s0 = tcg_temp_local_new(); You'll really want to avoid local temps and branches, if at all possible. For some of the more complex stuff that you're open-coding, you may be better off with helper functions instead. > + l = gen_new_label(); > + endl = gen_new_label(); > + > + tcg_gen_brcondi_tl(TCG_COND_NE, val, 0x40000000, l); > + if (mmod == M_W32) { > + tcg_gen_movi_tl(val, 0x7fffffff); > + } else { > + tcg_gen_movi_tl(val, 0x80000000); > + } > + tcg_gen_movi_tl(psat, 1); > + tcg_gen_br(endl); > + > + gen_set_label(l); > + tcg_gen_shli_tl(val, val, 1); > + > + gen_set_label(endl); Certainly possible here with 2 movcond, or 1 movcond, 1 setcond + 1 or. > + l = gen_new_label(); > + tcg_gen_brcondi_tl(TCG_COND_EQ, psat, 0, l); > + tcg_gen_ext32u_i64(val1, val1); > + gen_set_label(l); movcond again. > +static void > +saturate_s32(TCGv_i64 val, TCGv overflow) I shall now stop mentioning movcond. I sense there are many locations to come. 
> + } else if (prgfunc == 11 && poprnd < 6) { > + /* TESTSET (Preg{poprnd}); */ > + TCGv tmp = tcg_temp_new(); > + tcg_gen_qemu_ld8u(tmp, cpu_preg[poprnd], dc->mem_idx); > + tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_cc, tmp, 0); > + tcg_gen_ori_tl(tmp, tmp, 0x80); > + tcg_gen_qemu_st8(tmp, cpu_preg[poprnd], dc->mem_idx); > + tcg_temp_free(tmp); I'll note that this is fine for system code, but for user code ought to be atomic. There are a bunch of really bad examples in the tree, and no real good solutions atm. > + /* Can't push/pop reserved registers */ > + /*if (reg_is_reserved(grp, reg)) > + illegal_instruction(dc);*/ No commented out code like this. > + /* Everything here needs to be aligned, so check once */ > + gen_align_check(dc, cpu_spreg, 4, false); You ought not need to generate explicit alignment checks. Yes, we don't do that correctly for user-mode, but we do for system mode. My hope is that user mode eventually has the option of using the system mode page tables too -- there are just too many things that don't work correctly when host and target page sizes don't match, or the host and target don't have the same unaligned access characteristics. 
> + } else if (grp == 4 && (reg == 0 || reg == 2)) { > + /* Pop A#.X */ > + tmp = tcg_temp_new(); > + tcg_gen_qemu_ld32u(tmp, cpu_spreg, dc->mem_idx); > + tcg_gen_andi_tl(tmp, tmp, 0xff); > + tmp64 = tcg_temp_new_i64(); > + tcg_gen_extu_i32_i64(tmp64, tmp); > + tcg_temp_free(tmp); > + > + tcg_gen_andi_i64(cpu_areg[reg >> 1], cpu_areg[reg >> 1], > 0xffffffff); > + tcg_gen_shli_i64(tmp64, tmp64, 32); > + tcg_gen_or_i64(cpu_areg[reg >> 1], cpu_areg[reg >> 1], tmp64); > + tcg_temp_free_i64(tmp64); Drop the andi with 0xff and use tcg_gen_deposit_i64(cpu_areg[reg >> 1], cpu_areg[reg >> 1], tmp64, 32, 8) > + } else if (grp == 4 && (reg == 1 || reg == 3)) { > + /* Pop A#.W */ > + tcg_gen_andi_i64(cpu_areg[reg >> 1], cpu_areg[reg >> 1], > 0xff00000000); > + tmp = tcg_temp_new(); > + tcg_gen_qemu_ld32u(tmp, cpu_spreg, dc->mem_idx); > + tmp64 = tcg_temp_new_i64(); > + tcg_gen_extu_i32_i64(tmp64, tmp); > + tcg_temp_free(tmp); > + tcg_gen_or_i64(cpu_areg[reg >> 1], cpu_areg[reg >> 1], tmp64); > + tcg_temp_free_i64(tmp64); And then this one becomes deposit(areg, areg, tmp64, 0, 32). > + } else if (grp == 4 && (reg == 0 || reg == 2)) { > + /* Push A#.X */ > + tmp64 = tcg_temp_new_i64(); > + tcg_gen_shri_i64(tmp64, cpu_areg[reg >> 1], 32); > + tmp = tcg_temp_new(); > + tcg_gen_trunc_i64_i32(tmp, tmp64); > + tcg_temp_free_i64(tmp64); > + tcg_gen_andi_tl(tmp, tmp, 0xff); Do we ever allow the high 24 bits to be non-zero? Is this andi actually redundant? > + if (W == 1) { > + /* [--SP] = ({d}R7:imm{dr}, {p}P5:imm{pr}); */ > + if (d) { > + for (i = dr; i < 8; i++) { > + tcg_gen_subi_tl(cpu_spreg, cpu_spreg, 4); > + tcg_gen_qemu_st32(cpu_dreg[i], cpu_spreg, dc->mem_idx); > + } > + } What's the cpu exception effect of the second store causing a page fault? Normally one needs to do the address increment in a temporary and only update the real SP register at the end, so that the instruction can be restarted. > + /* CC = CC; is invalid. 
*/ > + if (cbit == 5) > + illegal_instruction(dc); Please handle all checkpatch.pl style errors. > + if (opc == 0) { > + /* CC = ! BITTST (Dreg{dst}, imm{uimm}); */ > + tmp = tcg_temp_new(); > + tcg_gen_movi_tl(tmp, 1 << uimm); > + tcg_gen_and_tl(tmp, tmp, cpu_dreg[dst]); > + tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_cc, tmp, 0); > + tcg_temp_free(tmp); > + } else if (opc == 1) { > + /* CC = BITTST (Dreg{dst}, imm{uimm}); */ > + tmp = tcg_temp_new(); > + tcg_gen_movi_tl(tmp, 1 << uimm); > + tcg_gen_and_tl(tmp, tmp, cpu_dreg[dst]); > + tcg_gen_setcondi_tl(TCG_COND_NE, cpu_cc, tmp, 0); > + tcg_temp_free(tmp); You're writing (x & (1 << I)) != 0 whereas the alternative (x >> I) & 1 does not require the setcond, and will be faster on most hosts. > + if (aop == 1 && W == 0 && idx == ptr) { > + /* Dreg_lo{reg} = W[Preg{ptr}]; */ > + tmp = tcg_temp_local_new(); > + tcg_gen_andi_tl(cpu_dreg[reg], cpu_dreg[reg], 0xffff0000); > + gen_aligned_qemu_ld16u(dc, tmp, cpu_preg[ptr]); > + tcg_gen_or_tl(cpu_dreg[reg], cpu_dreg[reg], tmp); > + tcg_temp_free(tmp); Deposit again. Lots of instances in this function. > + /* LINK imm{framesize}; */ > + int size = uimm16s4(framesize); > + tcg_gen_subi_tl(cpu_spreg, cpu_spreg, 4); > + tcg_gen_qemu_st32(cpu_rets, cpu_spreg, dc->mem_idx); > + tcg_gen_subi_tl(cpu_spreg, cpu_spreg, 4); > + tcg_gen_qemu_st32(cpu_fpreg, cpu_spreg, dc->mem_idx); > + tcg_gen_mov_tl(cpu_fpreg, cpu_spreg); > + tcg_gen_subi_tl(cpu_spreg, cpu_spreg, size); > + } else if (framesize == 0) { > + /* UNLINK; */ > + /* Restore SP from FP. */ > + tcg_gen_mov_tl(cpu_spreg, cpu_fpreg); > + tcg_gen_qemu_ld32u(cpu_fpreg, cpu_spreg, dc->mem_idx); > + tcg_gen_addi_tl(cpu_spreg, cpu_spreg, 4); > + tcg_gen_qemu_ld32u(cpu_rets, cpu_spreg, dc->mem_idx); > + tcg_gen_addi_tl(cpu_spreg, cpu_spreg, 4); Similarly to push/pop multiple wrt intermediate SP. 
> + if ((aop == 0 || aop == 2) && aopcde == 9 && HL == 0 && s == 0) { > + int a = aop >> 1; > + /* Areg_lo{a} = Dreg_lo{src0}; */ > + tcg_gen_andi_i64(cpu_areg[a], cpu_areg[a], ~0xffff); > + tmp64 = tcg_temp_new_i64(); > + tcg_gen_extu_i32_i64(tmp64, cpu_dreg[src0]); > + tcg_gen_andi_i64(tmp64, tmp64, 0xffff); > + tcg_gen_or_i64(cpu_areg[a], cpu_areg[a], tmp64); > + tcg_temp_free_i64(tmp64); More deposits in this function. I'll stop mentioning them, but pretty much every place you touch aregs can use this. > +#include "linux-fixed-code.h" > + > +static uint32_t bfin_lduw_code(DisasContext *dc, target_ulong pc) > +{ > +#ifdef CONFIG_USER_ONLY > + /* Intercept jump to the magic kernel page */ > + if (((dc->env->personality & 0xff/*PER_MASK*/) == 0/*PER_LINUX*/) && > + (pc & 0xFFFFFF00) == 0x400) { > + uint32_t off = pc - 0x400; > + if (off < sizeof(bfin_linux_fixed_code)) { > + return ((uint16_t)bfin_linux_fixed_code[off + 1] << 8) | > + bfin_linux_fixed_code[off]; > + } > + } > +#endif Surely this memory setup belongs in linux-user/. > +/* Interpret a single Blackfin insn; breaks up parallel insns */ > +static void > +interp_insn_bfin(DisasContext *dc) > +{ > + _interp_insn_bfin(dc, dc->pc); I'd prefer a suffix like "1" rather than a prefix of "_". > +typedef struct CPUBfinState { > + CPU_COMMON COMMON should come last, or just about. Certainly the cpu registers should come first, for most efficient translation access on the host. 
> +static inline void bfin_astat_write(CPUArchState *env, uint32_t astat) > +{ > + unsigned int i; > + for (i = 0; i < 32; ++i) > + env->astat[i] = !!(astat & (1 << i)); = (astat >> i) & 1 > +typedef void (*hwloop_callback)(struct DisasContext *dc, int loop); > + > +typedef struct DisasContext { > + CPUArchState *env; > + struct TranslationBlock *tb; > + /* The current PC we're decoding (could be middle of parallel insn) */ > + target_ulong pc; > + /* Length of current insn (2/4/8) */ > + target_ulong insn_len; > + > + /* For delayed ASTAT handling */ > + enum astat_ops astat_op; > + > + /* For hardware lop processing */ > + hwloop_callback hwloop_callback; > + void *hwloop_data; > + > + /* Was a DISALGNEXCPT used in this parallel insn ? */ > + int disalgnexcpt; > + > + int is_jmp; > + int mem_idx; > +} DisasContext; Really, this type should be private to translate.c. > +static inline void cpu_get_tb_cpu_state(CPUArchState *env, target_ulong *pc, > + target_ulong *cs_base, int *flags) > +{ > + *pc = cpu_get_pc(env); > + *cs_base = 0; > + *flags = env->astat[ASTAT_RND_MOD]; > +} You'll probably be better off with a bit that notes whether the loop registers are active, or something, so that you don't have to always generate code that handles them. > +DEF_HELPER_3(raise_exception, void, env, i32, i32) Lots of these can use better settings for flags. Here, the only side effect is to raise an exception, which leads to reading the globals. So TCG_CALL_NO_WG. > +DEF_HELPER_5(memalign, void, env, i32, i32, i32, i32) > + > +DEF_HELPER_4(dbga_l, void, env, i32, i32, i32) > +DEF_HELPER_4(dbga_h, void, env, i32, i32, i32) Likewise. > +/* Count the number of bits set to 1 in the 32bit value */ > +uint32_t HELPER(ones)(uint32_t val) > +{ > + uint32_t i; > + uint32_t ret; > + > + ret = 0; > + for (i = 0; i < 32; ++i) > + ret += !!(val & (1 << i)); ctpop32. > +/* Count number of leading bits that match the sign bit */ > +uint32_t HELPER(signbits)(uint32_t val, uint32_t size) ... 
> +/* Count number of leading bits that match the sign bit */ > +uint32_t HELPER(signbits_64)(uint64_t val, uint32_t size) Surely we can make some use of clz here. But I guess for now this is ok. > +static void gen_goto_tb(DisasContext *dc, int tb_num, TCGv dest) > +{ > +/* > + TranslationBlock *tb; > + tb = dc->tb; > + > + if ((tb->pc & TARGET_PAGE_MASK) == (dest & TARGET_PAGE_MASK)) { > + tcg_gen_goto_tb(tb_num); > + tcg_gen_mov_tl(cpu_pc, dest); > + tcg_gen_exit_tb((long)tb + tb_num); > + } else */{ > + gen_astat_update(dc, false); > + tcg_gen_mov_tl(cpu_pc, dest); > + tcg_gen_exit_tb(0); > + } Why the astat update here, when you have it on almost no other exits from the tb? > + if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP))) > + tcg_gen_debug_insn_start(dc->pc); CPU_LOG_TB_OP | CPU_LOG_TB_OP_OPT r~