Hi Richard,
Thank you very much for adding atomic support to translator_ld(). It
has been a big help.

Hi Alistair,
I can rebase the Ziccif patch when Richard's patch has been merged.


Jim Shu

On Fri, Apr 4, 2025 at 12:41 PM Alistair Francis <alistai...@gmail.com> wrote:
>
> On Fri, Jun 7, 2024 at 8:15 PM Jim Shu <jim....@sifive.com> wrote:
> >
> > Support 4-byte atomic instruction fetch when instruction is natural
> > aligned.
> >
> > Current implementation is not atomic because it loads instruction twice
> > for first and last 2 bytes. We load 4 bytes at once to keep the
> > atomicity. This instruction preload method only applys when instruction
> > is 4-byte aligned. If instruction is unaligned, it could be across pages
> > so that preload will trigger additional page fault.
> >
> > We encounter this issue when doing pressure test of enabling & disabling
> > Linux kernel ftrace. Ftrace with kernel preemption requires concurrent
> > modification and execution of instruction, so non-atomic instruction
> > fetch will cause the race condition. We may fetch the wrong instruction
> > which is the mixing of 2 instructions.
> >
> > Also, RISC-V Profile wants to provide this feature by HW. RVA20U64
> > Ziccif protects the atomicity of instruction fetch when it is
> > natural aligned.
> >
> > Signed-off-by: Jim Shu <jim....@sifive.com>
> > Reviewed-by: Frank Chang <frank.ch...@sifive.com>
>
> Once https://patchwork.kernel.org/project/qemu-devel/list/?series=945333
> (specifically 
> https://patchwork.kernel.org/project/qemu-devel/patch/20250318213209.2579218-12-richard.hender...@linaro.org/)
> is merged this should be good to go in as well.
>
> Alistair
>
> > ---
> >  target/riscv/translate.c | 45 ++++++++++++++++++++++++++++++----------
> >  1 file changed, 34 insertions(+), 11 deletions(-)
> >
> > diff --git a/target/riscv/translate.c b/target/riscv/translate.c
> > index 0569224e53..2be8ef63e6 100644
> > --- a/target/riscv/translate.c
> > +++ b/target/riscv/translate.c
> > @@ -1133,13 +1133,37 @@ const RISCVDecoder decoder_table[] = {
> >
> >  const size_t decoder_table_size = ARRAY_SIZE(decoder_table);
> >
> > -static void decode_opc(CPURISCVState *env, DisasContext *ctx, uint16_t 
> > opcode)
> > +static void decode_opc(CPURISCVState *env, DisasContext *ctx)
> >  {
> >      ctx->virt_inst_excp = false;
> > +
> > +    uint32_t opcode;
> > +    bool is_4byte_align = false;
> > +
> > +    if ((ctx->base.pc_next % 4) == 0) {
> > +        /*
> > +         * Load 4 bytes at once to make instruction fetch atomically.
> > +         *
> > +         * Note: When pc is 4-byte aligned, 4-byte instruction wouldn't be
> > +         * across pages. We could preload 4 bytes instruction no matter
> > +         * real one is 2 or 4 bytes. Instruction preload wouldn't trigger
> > +         * additional page fault.
> > +         */
> > +        opcode = translator_ldl(env, &ctx->base, ctx->base.pc_next);
> > +        is_4byte_align = true;
> > +    } else {
> > +        /*
> > +         * For unaligned pc, instruction preload may trigger additional
> > +         * page fault so we only load 2 bytes here.
> > +         */
> > +        opcode = (uint32_t) translator_lduw(env, &ctx->base, 
> > ctx->base.pc_next);
> > +    }
> > +    ctx->ol = ctx->xl;
> > +
> >      ctx->cur_insn_len = insn_len(opcode);
> >      /* Check for compressed insn */
> >      if (ctx->cur_insn_len == 2) {
> > -        ctx->opcode = opcode;
> > +        ctx->opcode = (uint16_t)opcode;
> >          /*
> >           * The Zca extension is added as way to refer to instructions in 
> > the C
> >           * extension that do not include the floating-point loads and 
> > stores
> > @@ -1149,15 +1173,16 @@ static void decode_opc(CPURISCVState *env, 
> > DisasContext *ctx, uint16_t opcode)
> >              return;
> >          }
> >      } else {
> > -        uint32_t opcode32 = opcode;
> > -        opcode32 = deposit32(opcode32, 16, 16,
> > -                             translator_lduw(env, &ctx->base,
> > -                                             ctx->base.pc_next + 2));
> > -        ctx->opcode = opcode32;
> > +        if (!is_4byte_align) {
> > +            /* Load last 2 bytes of instruction here */
> > +            opcode = deposit32(opcode, 16, 16,
> > +                               translator_lduw(env, &ctx->base,
> > +                                               ctx->base.pc_next + 2));
> > +        }
> >
> >          for (guint i = 0; i < ctx->decoders->len; ++i) {
> >              riscv_cpu_decode_fn func = g_ptr_array_index(ctx->decoders, i);
> > -            if (func(ctx, opcode32)) {
> > +            if (func(ctx, opcode)) {
> >                  return;
> >              }
> >          }
> > @@ -1226,10 +1251,8 @@ static void riscv_tr_translate_insn(DisasContextBase 
> > *dcbase, CPUState *cpu)
> >  {
> >      DisasContext *ctx = container_of(dcbase, DisasContext, base);
> >      CPURISCVState *env = cpu_env(cpu);
> > -    uint16_t opcode16 = translator_lduw(env, &ctx->base, 
> > ctx->base.pc_next);
> >
> > -    ctx->ol = ctx->xl;
> > -    decode_opc(env, ctx, opcode16);
> > +    decode_opc(env, ctx);
> >      ctx->base.pc_next += ctx->cur_insn_len;
> >
> >      /* Only the first insn within a TB is allowed to cross a page 
> > boundary. */
> > --
> > 2.17.1
> >
> >

Reply via email to