On Feb 13 13:03, Alex Bennée wrote:
> When icount is enabled and we recompile an MMIO access we end up
> double counting the instruction execution. To avoid this we introduce
> the CF_MEMI cflag which only allows memory instrumentation for the
> next TB (which won't yet have been counted). As this is part of the
> hashed compile flags we will only execute the generated TB while
> coming out of a cpu_io_recompile.
>
> While we are at it delete the old TODO. We might as well keep the
> translation handy as it's likely you will repeatedly hit it on each
> MMIO access.
>
> Reported-by: Aaron Lindsay <aa...@os.amperecomputing.com>
> Signed-off-by: Alex Bennée <alex.ben...@linaro.org>
> Reviewed-by: Richard Henderson <richard.hender...@linaro.org>
> Message-Id: <20210210221053.18050-21-alex.ben...@linaro.org>
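For reference, the instrumentation that gets double counted here is just the
standard per-instruction exec callback plus the per-instruction memory
callback. A minimal counting plugin against the public plugin API looks
roughly like the sketch below (illustrative only - the plugin, its names and
counters are not part of the patch, and the counters are not atomic, which is
fine for a single-vCPU icount run):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <qemu-plugin.h>

QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION;

/* Plain counters: one per exec callback, one per memory callback. */
static uint64_t insn_count;
static uint64_t mem_count;

static void insn_exec(unsigned int cpu_index, void *udata)
{
    insn_count++;               /* once per executed instruction */
}

static void mem_access(unsigned int cpu_index, qemu_plugin_meminfo_t info,
                       uint64_t vaddr, void *udata)
{
    mem_count++;                /* once per memory access, including MMIO */
}

static void tb_trans(qemu_plugin_id_t id, struct qemu_plugin_tb *tb)
{
    size_t n = qemu_plugin_tb_n_insns(tb);

    for (size_t i = 0; i < n; i++) {
        struct qemu_plugin_insn *insn = qemu_plugin_tb_get_insn(tb, i);

        qemu_plugin_register_vcpu_insn_exec_cb(insn, insn_exec,
                                               QEMU_PLUGIN_CB_NO_REGS, NULL);
        qemu_plugin_register_vcpu_mem_cb(insn, mem_access,
                                         QEMU_PLUGIN_CB_NO_REGS,
                                         QEMU_PLUGIN_MEM_RW, NULL);
    }
}

static void plugin_exit(qemu_plugin_id_t id, void *p)
{
    fprintf(stderr, "insns: %" PRIu64 " mem accesses: %" PRIu64 "\n",
            insn_count, mem_count);
}

QEMU_PLUGIN_EXPORT int qemu_plugin_install(qemu_plugin_id_t id,
                                           const qemu_info_t *info,
                                           int argc, char **argv)
{
    qemu_plugin_register_vcpu_tb_trans_cb(id, tb_trans);
    qemu_plugin_register_atexit_cb(id, plugin_exit, NULL);
    return 0;
}

With icount enabled, a plugin like this previously saw the exec callback fire
twice for an MMIO instruction (once for the aborted TB and once for the
recompiled one); with CF_MEMI_ONLY only the memory instrumentation is planted
in the recompiled TB.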
This resolves the issue for me - I'm now seeing one instruction callback
and one memory callback for both MMIO load and store instructions, as
expected.

Tested-by: Aaron Lindsay <aa...@os.amperecomputing.com>

Thanks!

-Aaron

>
> ---
> v3
>  - s/CF_NOINSTR/CF_MEMI_ONY/
>  - Limit instrumentation at API call sites instead of skipping altogether
>  - clean-up commit log message
> ---
>  include/exec/exec-all.h   |  6 +++---
>  include/exec/plugin-gen.h |  4 ++--
>  include/qemu/plugin.h     |  4 ++++
>  accel/tcg/plugin-gen.c    |  3 ++-
>  accel/tcg/translate-all.c | 18 +++++++++---------
>  accel/tcg/translator.c    |  5 ++++-
>  plugins/api.c             | 36 +++++++++++++++++++++++++-----------
>  7 files changed, 49 insertions(+), 27 deletions(-)
>
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index e08179de34..77a2dc044d 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -454,14 +454,14 @@ struct TranslationBlock {
>      uint32_t cflags;    /* compile flags */
>  #define CF_COUNT_MASK  0x00007fff
>  #define CF_LAST_IO     0x00008000 /* Last insn may be an IO access. */
> +#define CF_MEMI_ONLY   0x00010000 /* Only instrument memory ops */
>  #define CF_USE_ICOUNT  0x00020000
>  #define CF_INVALID     0x00040000 /* TB is stale. Set with @jmp_lock held */
>  #define CF_PARALLEL    0x00080000 /* Generate code for a parallel context */
>  #define CF_CLUSTER_MASK 0xff000000 /* Top 8 bits are cluster ID */
>  #define CF_CLUSTER_SHIFT 24
> -/* cflags' mask for hashing/comparison */
> -#define CF_HASH_MASK   \
> -    (CF_COUNT_MASK | CF_LAST_IO | CF_USE_ICOUNT | CF_PARALLEL | CF_CLUSTER_MASK)
> +/* cflags' mask for hashing/comparison, basically ignore CF_INVALID */
> +#define CF_HASH_MASK (~CF_INVALID)
>
>      /* Per-vCPU dynamic tracing state used to generate this TB */
>      uint32_t trace_vcpu_dstate;
> diff --git a/include/exec/plugin-gen.h b/include/exec/plugin-gen.h
> index 4834a9e2f4..b1b72b5d90 100644
> --- a/include/exec/plugin-gen.h
> +++ b/include/exec/plugin-gen.h
> @@ -19,7 +19,7 @@ struct DisasContextBase;
>
>  #ifdef CONFIG_PLUGIN
>
> -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb);
> +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool supress);
>  void plugin_gen_tb_end(CPUState *cpu);
>  void plugin_gen_insn_start(CPUState *cpu, const struct DisasContextBase *db);
>  void plugin_gen_insn_end(void);
> @@ -41,7 +41,7 @@ static inline void plugin_insn_append(const void *from, size_t size)
>  #else /* !CONFIG_PLUGIN */
>
>  static inline
> -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)
> +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool supress)
>  {
>      return false;
>  }
> diff --git a/include/qemu/plugin.h b/include/qemu/plugin.h
> index 841deed79c..c5a79a89f0 100644
> --- a/include/qemu/plugin.h
> +++ b/include/qemu/plugin.h
> @@ -92,6 +92,7 @@ struct qemu_plugin_dyn_cb {
>      };
>  };
>
> +/* Internal context for instrumenting an instruction */
>  struct qemu_plugin_insn {
>      GByteArray *data;
>      uint64_t vaddr;
> @@ -99,6 +100,7 @@ struct qemu_plugin_insn {
>      GArray *cbs[PLUGIN_N_CB_TYPES][PLUGIN_N_CB_SUBTYPES];
>      bool calls_helpers;
>      bool mem_helper;
> +    bool mem_only;
>  };
>
>  /*
> @@ -128,6 +130,7 @@ static inline struct qemu_plugin_insn *qemu_plugin_insn_alloc(void)
>      return insn;
>  }
>
> +/* Internal context for this TranslationBlock */
>  struct qemu_plugin_tb {
>      GPtrArray *insns;
>      size_t n;
> @@ -135,6 +138,7 @@ struct qemu_plugin_tb {
>      uint64_t vaddr2;
>      void *haddr1;
>      void *haddr2;
> +    bool mem_only;
>      GArray *cbs[PLUGIN_N_CB_SUBTYPES];
>  };
>
> diff --git a/accel/tcg/plugin-gen.c b/accel/tcg/plugin-gen.c
> index 8a1bb801e0..c3dc3effe7 100644
> --- a/accel/tcg/plugin-gen.c
> +++ b/accel/tcg/plugin-gen.c
> @@ -842,7 +842,7 @@ static void plugin_gen_inject(const struct qemu_plugin_tb *plugin_tb)
>      pr_ops();
>  }
>
> -bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)
> +bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb, bool mem_only)
>  {
>      struct qemu_plugin_tb *ptb = tcg_ctx->plugin_tb;
>      bool ret = false;
> @@ -855,6 +855,7 @@ bool plugin_gen_tb_start(CPUState *cpu, const TranslationBlock *tb)
>          ptb->vaddr2 = -1;
>          get_page_addr_code_hostp(cpu->env_ptr, tb->pc, &ptb->haddr1);
>          ptb->haddr2 = NULL;
> +        ptb->mem_only = mem_only;
>
>          plugin_gen_empty_callback(PLUGIN_GEN_FROM_TB);
>      }
> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> index 0666f9ef14..fdf88dc1c3 100644
> --- a/accel/tcg/translate-all.c
> +++ b/accel/tcg/translate-all.c
> @@ -2399,7 +2399,8 @@ void tb_check_watchpoint(CPUState *cpu, uintptr_t retaddr)
>  }
>
>  #ifndef CONFIG_USER_ONLY
> -/* in deterministic execution mode, instructions doing device I/Os
> +/*
> + * In deterministic execution mode, instructions doing device I/Os
>   * must be at the end of the TB.
>   *
>   * Called by softmmu_template.h, with iothread mutex not held.
> @@ -2430,19 +2431,18 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
>          n = 2;
>      }
>
> -    /* Generate a new TB executing the I/O insn. */
> -    cpu->cflags_next_tb = curr_cflags() | CF_LAST_IO | n;
> +    /*
> +     * Exit the loop and potentially generate a new TB executing the
> +     * just the I/O insns. We also limit instrumentation to memory
> +     * operations only (which execute after completion) so we don't
> +     * double instrument the instruction.
> +     */
> +    cpu->cflags_next_tb = curr_cflags() | CF_MEMI_ONLY | CF_LAST_IO | n;
>
>      qemu_log_mask_and_addr(CPU_LOG_EXEC, tb->pc,
>                             "cpu_io_recompile: rewound execution of TB to "
>                             TARGET_FMT_lx "\n", tb->pc);
>
> -    /* TODO: If env->pc != tb->pc (i.e. the faulting instruction was not
> -     * the first in the TB) then we end up generating a whole new TB and
> -     * repeating the fault, which is horribly inefficient.
> -     * Better would be to execute just this insn uncached, or generate a
> -     * second new TB.
> -     */
>      cpu_loop_exit_noexc(cpu);
>  }
>
> diff --git a/accel/tcg/translator.c b/accel/tcg/translator.c
> index a49a794065..2dfc27102f 100644
> --- a/accel/tcg/translator.c
> +++ b/accel/tcg/translator.c
> @@ -58,7 +58,8 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,
>      ops->tb_start(db, cpu);
>      tcg_debug_assert(db->is_jmp == DISAS_NEXT); /* no early exit */
>
> -    plugin_enabled = plugin_gen_tb_start(cpu, tb);
> +    plugin_enabled = plugin_gen_tb_start(cpu, tb,
> +                                         tb_cflags(db->tb) & CF_MEMI_ONLY);
>
>      while (true) {
>          db->num_insns++;
> @@ -100,6 +101,8 @@ void translator_loop(const TranslatorOps *ops, DisasContextBase *db,
>              gen_io_start();
>              ops->translate_insn(db, cpu);
>          } else {
> +            /* we should only see CF_MEMI_ONLY for io_recompile */
> +            tcg_debug_assert(!(tb_cflags(db->tb) & CF_MEMI_ONLY));
>              ops->translate_insn(db, cpu);
>          }
>
> diff --git a/plugins/api.c b/plugins/api.c
> index 5dc8e6f934..0b04380d57 100644
> --- a/plugins/api.c
> +++ b/plugins/api.c
> @@ -84,15 +84,19 @@ void qemu_plugin_register_vcpu_tb_exec_cb(struct qemu_plugin_tb *tb,
>                                            enum qemu_plugin_cb_flags flags,
>                                            void *udata)
>  {
> -    plugin_register_dyn_cb__udata(&tb->cbs[PLUGIN_CB_REGULAR],
> -                                  cb, flags, udata);
> +    if (!tb->mem_only) {
> +        plugin_register_dyn_cb__udata(&tb->cbs[PLUGIN_CB_REGULAR],
> +                                      cb, flags, udata);
> +    }
>  }
>
>  void qemu_plugin_register_vcpu_tb_exec_inline(struct qemu_plugin_tb *tb,
>                                                enum qemu_plugin_op op,
>                                                void *ptr, uint64_t imm)
>  {
> -    plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE], 0, op, ptr, imm);
> +    if (!tb->mem_only) {
> +        plugin_register_inline_op(&tb->cbs[PLUGIN_CB_INLINE], 0, op, ptr, imm);
> +    }
>  }
>
>  void qemu_plugin_register_vcpu_insn_exec_cb(struct qemu_plugin_insn *insn,
> @@ -100,20 +104,27 @@ void qemu_plugin_register_vcpu_insn_exec_cb(struct qemu_plugin_insn *insn,
>                                              enum qemu_plugin_cb_flags flags,
>                                              void *udata)
>  {
> -    plugin_register_dyn_cb__udata(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_REGULAR],
> -                                  cb, flags, udata);
> +    if (!insn->mem_only) {
> +        plugin_register_dyn_cb__udata(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_REGULAR],
> +                                      cb, flags, udata);
> +    }
>  }
>
>  void qemu_plugin_register_vcpu_insn_exec_inline(struct qemu_plugin_insn *insn,
>                                                  enum qemu_plugin_op op,
>                                                  void *ptr, uint64_t imm)
>  {
> -    plugin_register_inline_op(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_INLINE],
> -                              0, op, ptr, imm);
> +    if (!insn->mem_only) {
> +        plugin_register_inline_op(&insn->cbs[PLUGIN_CB_INSN][PLUGIN_CB_INLINE],
> +                                  0, op, ptr, imm);
> +    }
>  }
>
>
> -
> +/*
> + * We always plant memory instrumentation because they don't finalise until
> + * after the operation has complete.
> + */
>  void qemu_plugin_register_vcpu_mem_cb(struct qemu_plugin_insn *insn,
>                                        qemu_plugin_vcpu_mem_cb_t cb,
>                                        enum qemu_plugin_cb_flags flags,
> @@ -121,7 +132,7 @@ void qemu_plugin_register_vcpu_mem_cb(struct qemu_plugin_insn *insn,
>                                        void *udata)
>  {
>      plugin_register_vcpu_mem_cb(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_REGULAR],
> -                                cb, flags, rw, udata);
> +                                 cb, flags, rw, udata);
>  }
>
>  void qemu_plugin_register_vcpu_mem_inline(struct qemu_plugin_insn *insn,
> @@ -130,7 +141,7 @@ void qemu_plugin_register_vcpu_mem_inline(struct qemu_plugin_insn *insn,
>                                            uint64_t imm)
>  {
>      plugin_register_inline_op(&insn->cbs[PLUGIN_CB_MEM][PLUGIN_CB_INLINE],
> -                              rw, op, ptr, imm);
> +                               rw, op, ptr, imm);
>  }
>
>  void qemu_plugin_register_vcpu_tb_trans_cb(qemu_plugin_id_t id,
> @@ -181,10 +192,13 @@ uint64_t qemu_plugin_tb_vaddr(const struct qemu_plugin_tb *tb)
>  struct qemu_plugin_insn *
>  qemu_plugin_tb_get_insn(const struct qemu_plugin_tb *tb, size_t idx)
>  {
> +    struct qemu_plugin_insn *insn;
>      if (unlikely(idx >= tb->n)) {
>          return NULL;
>      }
> -    return g_ptr_array_index(tb->insns, idx);
> +    insn = g_ptr_array_index(tb->insns, idx);
> +    insn->mem_only = tb->mem_only;
> +    return insn;
>  }
>
>  /*
> --
> 2.20.1
>
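One detail worth spelling out: because CF_MEMI_ONLY sits inside the new
CF_HASH_MASK, the memory-only TB and the normal TB hash and compare
differently, so keeping the recompiled translation cached cannot leak it into
ordinary execution. A rough illustration of the comparison, using only the
constants from the patch (my own sketch, not QEMU's actual tb_lookup code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Constants from the patch; everything else here is illustrative. */
#define CF_MEMI_ONLY 0x00010000
#define CF_INVALID   0x00040000
#define CF_HASH_MASK (~CF_INVALID)

/* A cached TB is only reused when every hashed cflag bit agrees. */
static bool tb_cflags_match(uint32_t tb_cflags, uint32_t desired_cflags)
{
    return (tb_cflags & CF_HASH_MASK) == (desired_cflags & CF_HASH_MASK);
}

int main(void)
{
    uint32_t normal = 0;                  /* stand-in for plain curr_cflags() */
    uint32_t io_recompile = CF_MEMI_ONLY; /* what cpu_io_recompile asks for */

    /* The memory-only TB is invisible to normal lookups, and vice versa. */
    printf("normal lookup finds MEMI_ONLY TB: %d\n",
           tb_cflags_match(CF_MEMI_ONLY, normal));        /* prints 0 */
    printf("io_recompile lookup finds it:     %d\n",
           tb_cflags_match(CF_MEMI_ONLY, io_recompile));  /* prints 1 */
    return 0;
}

That is also why dropping the old TODO looks safe: the CF_MEMI_ONLY TB stays
cached and gets re-used on each subsequent MMIO hit, while normal lookups keep
returning the fully instrumented translation.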