> diff --git a/target-bfin/bfin-sim.c b/target-bfin/bfin-sim.c Why this separate file from translate.c?
> +#include <stdbool.h> > +#include <stdint.h> > +#include <stdio.h> > +#include <stdlib.h> > +#include <string.h> > +#include <inttypes.h> Certainly you shouldn't need these, since this isn't a separately compiled object -- you're included from translate.c. > +static void > +unhandled_instruction(DisasContext *dc, const char *insn) > +{ > + fprintf(stderr, "unhandled insn: %s\n", insn); Use LOG_UNIMP. > +#define HOST_LONG_WORD_SIZE (sizeof(long) * 8) You mean TCG_TARGET_REG_BITS? > +static TCGv > +get_allreg(DisasContext *dc, int grp, int reg) > +{ > + TCGv *ret = cpu_regs[(grp << 3) | reg]; > + if (ret) { > + return *ret; > + } > + abort(); > + illegal_instruction(dc); > +} Well, which is it? abort or illegal_instruction. And come to that, how is abort any better than SEGV from dereferencing the null? Certainly the latter will generate a faster translator... > +decode_multfunc_tl(DisasContext *dc, int h0, int h1, int src0, int src1, > + int mmod, int MM, TCGv psat) > +{ > + TCGv s0, s1, val; > + > + s0 = tcg_temp_local_new(); You'll really want to avoid local temps and branches, if at all possible. For some of the more complex stuff that you're open-coding, you may be better off with helper functions instead. > + l = gen_new_label(); > + endl = gen_new_label(); > + > + tcg_gen_brcondi_tl(TCG_COND_NE, val, 0x40000000, l); > + if (mmod == M_W32) { > + tcg_gen_movi_tl(val, 0x7fffffff); > + } else { > + tcg_gen_movi_tl(val, 0x80000000); > + } > + tcg_gen_movi_tl(psat, 1); > + tcg_gen_br(endl); > + > + gen_set_label(l); > + tcg_gen_shli_tl(val, val, 1); > + > + gen_set_label(endl); Certainly possible here with 2 movcond, or 1 movcond, 1 setcond + 1 or. > + l = gen_new_label(); > + tcg_gen_brcondi_tl(TCG_COND_EQ, psat, 0, l); > + tcg_gen_ext32u_i64(val1, val1); > + gen_set_label(l); movcond again. > +static void > +saturate_s32(TCGv_i64 val, TCGv overflow) I shall now stop mentioning movcond. I sense there are many locations to come. 
> + } else if (prgfunc == 11 && poprnd < 6) { > + /* TESTSET (Preg{poprnd}); */ > + TCGv tmp = tcg_temp_new(); > + tcg_gen_qemu_ld8u(tmp, cpu_preg[poprnd], dc->mem_idx); > + tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_cc, tmp, 0); > + tcg_gen_ori_tl(tmp, tmp, 0x80); > + tcg_gen_qemu_st8(tmp, cpu_preg[poprnd], dc->mem_idx); > + tcg_temp_free(tmp); I'll note that this is fine for system code, but for user code ought to be atomic. There are a bunch of really bad examples in the tree, and no real good solutions atm. > + /* Can't push/pop reserved registers */ > + /*if (reg_is_reserved(grp, reg)) > + illegal_instruction(dc);*/ No commented out code like this. > + /* Everything here needs to be aligned, so check once */ > + gen_align_check(dc, cpu_spreg, 4, false); You ought not need to generate explicit alignment checks. Yes, we don't do that correctly for user-mode, but we do for system mode. My hope is that user mode eventually has the option of using the system mode page tables too -- there are just too many things that don't work correctly when host and target page sizes don't match, or the host and target don't have the same unaligned access characteristics. 
> + } else if (grp == 4 && (reg == 0 || reg == 2)) { > + /* Pop A#.X */ > + tmp = tcg_temp_new(); > + tcg_gen_qemu_ld32u(tmp, cpu_spreg, dc->mem_idx); > + tcg_gen_andi_tl(tmp, tmp, 0xff); > + tmp64 = tcg_temp_new_i64(); > + tcg_gen_extu_i32_i64(tmp64, tmp); > + tcg_temp_free(tmp); > + > + tcg_gen_andi_i64(cpu_areg[reg >> 1], cpu_areg[reg >> 1], > 0xffffffff); > + tcg_gen_shli_i64(tmp64, tmp64, 32); > + tcg_gen_or_i64(cpu_areg[reg >> 1], cpu_areg[reg >> 1], tmp64); > + tcg_temp_free_i64(tmp64); Drop the andi with 0xff and use tcg_gen_deposit_i64(cpu_areg[reg >> 1], cpu_areg[reg >> 1], tmp64, 32, 8) > + } else if (grp == 4 && (reg == 1 || reg == 3)) { > + /* Pop A#.W */ > + tcg_gen_andi_i64(cpu_areg[reg >> 1], cpu_areg[reg >> 1], > 0xff00000000); > + tmp = tcg_temp_new(); > + tcg_gen_qemu_ld32u(tmp, cpu_spreg, dc->mem_idx); > + tmp64 = tcg_temp_new_i64(); > + tcg_gen_extu_i32_i64(tmp64, tmp); > + tcg_temp_free(tmp); > + tcg_gen_or_i64(cpu_areg[reg >> 1], cpu_areg[reg >> 1], tmp64); > + tcg_temp_free_i64(tmp64); And then this one becomes deposit(areg, areg, tmp64, 0, 32). > + } else if (grp == 4 && (reg == 0 || reg == 2)) { > + /* Push A#.X */ > + tmp64 = tcg_temp_new_i64(); > + tcg_gen_shri_i64(tmp64, cpu_areg[reg >> 1], 32); > + tmp = tcg_temp_new(); > + tcg_gen_trunc_i64_i32(tmp, tmp64); > + tcg_temp_free_i64(tmp64); > + tcg_gen_andi_tl(tmp, tmp, 0xff); Do we ever allow the high 24 bits to be non-zero? Is this andi actually redundant? > + if (W == 1) { > + /* [--SP] = ({d}R7:imm{dr}, {p}P5:imm{pr}); */ > + if (d) { > + for (i = dr; i < 8; i++) { > + tcg_gen_subi_tl(cpu_spreg, cpu_spreg, 4); > + tcg_gen_qemu_st32(cpu_dreg[i], cpu_spreg, dc->mem_idx); > + } > + } What's the cpu exception effect of the second store causing a page fault? Normally one needs to do the address increment in a temporary and only update the real SP register at the end, so that the instruction can be restarted. > + /* CC = CC; is invalid. 
*/ > + if (cbit == 5) > + illegal_instruction(dc); Please handle all checkpatch.pl style errors. > + if (opc == 0) { > + /* CC = ! BITTST (Dreg{dst}, imm{uimm}); */ > + tmp = tcg_temp_new(); > + tcg_gen_movi_tl(tmp, 1 << uimm); > + tcg_gen_and_tl(tmp, tmp, cpu_dreg[dst]); > + tcg_gen_setcondi_tl(TCG_COND_EQ, cpu_cc, tmp, 0); > + tcg_temp_free(tmp); > + } else if (opc == 1) { > + /* CC = BITTST (Dreg{dst}, imm{uimm}); */ > + tmp = tcg_temp_new(); > + tcg_gen_movi_tl(tmp, 1 << uimm); > + tcg_gen_and_tl(tmp, tmp, cpu_dreg[dst]); > + tcg_gen_setcondi_tl(TCG_COND_NE, cpu_cc, tmp, 0); > + tcg_temp_free(tmp); You're writing (x & (1 << I)) != 0 whereas the alternative (x >> I) & 1 does not require the setcond, and will be faster on most hosts. > + if (aop == 1 && W == 0 && idx == ptr) { > + /* Dreg_lo{reg} = W[Preg{ptr}]; */ > + tmp = tcg_temp_local_new(); > + tcg_gen_andi_tl(cpu_dreg[reg], cpu_dreg[reg], 0xffff0000); > + gen_aligned_qemu_ld16u(dc, tmp, cpu_preg[ptr]); > + tcg_gen_or_tl(cpu_dreg[reg], cpu_dreg[reg], tmp); > + tcg_temp_free(tmp); Deposit again. Lots of instances in this function. > + /* LINK imm{framesize}; */ > + int size = uimm16s4(framesize); > + tcg_gen_subi_tl(cpu_spreg, cpu_spreg, 4); > + tcg_gen_qemu_st32(cpu_rets, cpu_spreg, dc->mem_idx); > + tcg_gen_subi_tl(cpu_spreg, cpu_spreg, 4); > + tcg_gen_qemu_st32(cpu_fpreg, cpu_spreg, dc->mem_idx); > + tcg_gen_mov_tl(cpu_fpreg, cpu_spreg); > + tcg_gen_subi_tl(cpu_spreg, cpu_spreg, size); > + } else if (framesize == 0) { > + /* UNLINK; */ > + /* Restore SP from FP. */ > + tcg_gen_mov_tl(cpu_spreg, cpu_fpreg); > + tcg_gen_qemu_ld32u(cpu_fpreg, cpu_spreg, dc->mem_idx); > + tcg_gen_addi_tl(cpu_spreg, cpu_spreg, 4); > + tcg_gen_qemu_ld32u(cpu_rets, cpu_spreg, dc->mem_idx); > + tcg_gen_addi_tl(cpu_spreg, cpu_spreg, 4); Similarly to push/pop multiple wrt intermediate SP. 
> + if ((aop == 0 || aop == 2) && aopcde == 9 && HL == 0 && s == 0) { > + int a = aop >> 1; > + /* Areg_lo{a} = Dreg_lo{src0}; */ > + tcg_gen_andi_i64(cpu_areg[a], cpu_areg[a], ~0xffff); > + tmp64 = tcg_temp_new_i64(); > + tcg_gen_extu_i32_i64(tmp64, cpu_dreg[src0]); > + tcg_gen_andi_i64(tmp64, tmp64, 0xffff); > + tcg_gen_or_i64(cpu_areg[a], cpu_areg[a], tmp64); > + tcg_temp_free_i64(tmp64); More deposits in this function. I'll stop mentioning them, but pretty much every place you touch aregs can use this. > +#include "linux-fixed-code.h" > + > +static uint32_t bfin_lduw_code(DisasContext *dc, target_ulong pc) > +{ > +#ifdef CONFIG_USER_ONLY > + /* Intercept jump to the magic kernel page */ > + if (((dc->env->personality & 0xff/*PER_MASK*/) == 0/*PER_LINUX*/) && > + (pc & 0xFFFFFF00) == 0x400) { > + uint32_t off = pc - 0x400; > + if (off < sizeof(bfin_linux_fixed_code)) { > + return ((uint16_t)bfin_linux_fixed_code[off + 1] << 8) | > + bfin_linux_fixed_code[off]; > + } > + } > +#endif Surely this memory setup belongs in linux-user/. > +/* Interpret a single Blackfin insn; breaks up parallel insns */ > +static void > +interp_insn_bfin(DisasContext *dc) > +{ > + _interp_insn_bfin(dc, dc->pc); I'd prefer a suffix like "1" rather than a prefix of "_". > +typedef struct CPUBfinState { > + CPU_COMMON COMMON should come last, or just about. Certainly the cpu registers should come first, for most efficient translation access on the host. 
> +static inline void bfin_astat_write(CPUArchState *env, uint32_t astat) > +{ > + unsigned int i; > + for (i = 0; i < 32; ++i) > + env->astat[i] = !!(astat & (1 << i)); = (astat >> i) & 1 > +typedef void (*hwloop_callback)(struct DisasContext *dc, int loop); > + > +typedef struct DisasContext { > + CPUArchState *env; > + struct TranslationBlock *tb; > + /* The current PC we're decoding (could be middle of parallel insn) */ > + target_ulong pc; > + /* Length of current insn (2/4/8) */ > + target_ulong insn_len; > + > + /* For delayed ASTAT handling */ > + enum astat_ops astat_op; > + > + /* For hardware lop processing */ > + hwloop_callback hwloop_callback; > + void *hwloop_data; > + > + /* Was a DISALGNEXCPT used in this parallel insn ? */ > + int disalgnexcpt; > + > + int is_jmp; > + int mem_idx; > +} DisasContext; Really, this type should be private to translate.c. > +static inline void cpu_get_tb_cpu_state(CPUArchState *env, target_ulong *pc, > + target_ulong *cs_base, int *flags) > +{ > + *pc = cpu_get_pc(env); > + *cs_base = 0; > + *flags = env->astat[ASTAT_RND_MOD]; > +} You'll probably be better off with a bit that notes whether the loop registers are active, or something, so that you don't have to always generate code that handles them. > +DEF_HELPER_3(raise_exception, void, env, i32, i32) Lots of these can use better settings for flags. Here, the only side effect is to raise an exception, which leads to reading the globals. So TCG_CALL_NO_WG. > +DEF_HELPER_5(memalign, void, env, i32, i32, i32, i32) > + > +DEF_HELPER_4(dbga_l, void, env, i32, i32, i32) > +DEF_HELPER_4(dbga_h, void, env, i32, i32, i32) Likewise. > +/* Count the number of bits set to 1 in the 32bit value */ > +uint32_t HELPER(ones)(uint32_t val) > +{ > + uint32_t i; > + uint32_t ret; > + > + ret = 0; > + for (i = 0; i < 32; ++i) > + ret += !!(val & (1 << i)); ctpop32. > +/* Count number of leading bits that match the sign bit */ > +uint32_t HELPER(signbits)(uint32_t val, uint32_t size) ... 
> +/* Count number of leading bits that match the sign bit */ > +uint32_t HELPER(signbits_64)(uint64_t val, uint32_t size) Surely we can make some use of clz here. But I guess for now this is ok. > +static void gen_goto_tb(DisasContext *dc, int tb_num, TCGv dest) > +{ > +/* > + TranslationBlock *tb; > + tb = dc->tb; > + > + if ((tb->pc & TARGET_PAGE_MASK) == (dest & TARGET_PAGE_MASK)) { > + tcg_gen_goto_tb(tb_num); > + tcg_gen_mov_tl(cpu_pc, dest); > + tcg_gen_exit_tb((long)tb + tb_num); > + } else */{ > + gen_astat_update(dc, false); > + tcg_gen_mov_tl(cpu_pc, dest); > + tcg_gen_exit_tb(0); > + } Why the astat update here, when you have it on almost no other exits from the tb? > + if (unlikely(qemu_loglevel_mask(CPU_LOG_TB_OP))) > + tcg_gen_debug_insn_start(dc->pc); CPU_LOG_TB_OP | CPU_LOG_TB_OP_OPT r~