On Fri, Jun 7, 2024 at 8:15 PM Jim Shu <jim....@sifive.com> wrote:
>
> Support 4-byte atomic instruction fetch when instruction is natural
> aligned.
>
> Current implementation is not atomic because it loads instruction twice
> for first and last 2 bytes. We load 4 bytes at once to keep the
> atomicity. This instruction preload method only applys when instruction
> is 4-byte aligned. If instruction is unaligned, it could be across pages
> so that preload will trigger additional page fault.
>
> We encounter this issue when doing pressure test of enabling & disabling
> Linux kernel ftrace. Ftrace with kernel preemption requires concurrent
> modification and execution of instruction, so non-atomic instruction
> fetch will cause the race condition. We may fetch the wrong instruction
> which is the mixing of 2 instructions.
>
> Also, RISC-V Profile wants to provide this feature by HW. RVA20U64
> Ziccif protects the atomicity of instruction fetch when it is
> natural aligned.
>
> Signed-off-by: Jim Shu <jim....@sifive.com>
> Reviewed-by: Frank Chang <frank.ch...@sifive.com>

Once https://patchwork.kernel.org/project/qemu-devel/list/?series=945333
(specifically 
https://patchwork.kernel.org/project/qemu-devel/patch/20250318213209.2579218-12-richard.hender...@linaro.org/)
is merged this should be good to go in as well.

Alistair

> ---
>  target/riscv/translate.c | 45 ++++++++++++++++++++++++++++++----------
>  1 file changed, 34 insertions(+), 11 deletions(-)
>
> diff --git a/target/riscv/translate.c b/target/riscv/translate.c
> index 0569224e53..2be8ef63e6 100644
> --- a/target/riscv/translate.c
> +++ b/target/riscv/translate.c
> @@ -1133,13 +1133,37 @@ const RISCVDecoder decoder_table[] = {
>
>  const size_t decoder_table_size = ARRAY_SIZE(decoder_table);
>
> -static void decode_opc(CPURISCVState *env, DisasContext *ctx, uint16_t 
> opcode)
> +static void decode_opc(CPURISCVState *env, DisasContext *ctx)
>  {
>      ctx->virt_inst_excp = false;
> +
> +    uint32_t opcode;
> +    bool is_4byte_align = false;
> +
> +    if ((ctx->base.pc_next % 4) == 0) {
> +        /*
> +         * Load 4 bytes at once to make instruction fetch atomically.
> +         *
> +         * Note: When pc is 4-byte aligned, 4-byte instruction wouldn't be
> +         * across pages. We could preload 4 bytes instruction no matter
> +         * real one is 2 or 4 bytes. Instruction preload wouldn't trigger
> +         * additional page fault.
> +         */
> +        opcode = translator_ldl(env, &ctx->base, ctx->base.pc_next);
> +        is_4byte_align = true;
> +    } else {
> +        /*
> +         * For unaligned pc, instruction preload may trigger additional
> +         * page fault so we only load 2 bytes here.
> +         */
> +        opcode = (uint32_t) translator_lduw(env, &ctx->base, 
> ctx->base.pc_next);
> +    }
> +    ctx->ol = ctx->xl;
> +
>      ctx->cur_insn_len = insn_len(opcode);
>      /* Check for compressed insn */
>      if (ctx->cur_insn_len == 2) {
> -        ctx->opcode = opcode;
> +        ctx->opcode = (uint16_t)opcode;
>          /*
>           * The Zca extension is added as way to refer to instructions in the 
> C
>           * extension that do not include the floating-point loads and stores
> @@ -1149,15 +1173,16 @@ static void decode_opc(CPURISCVState *env, 
> DisasContext *ctx, uint16_t opcode)
>              return;
>          }
>      } else {
> -        uint32_t opcode32 = opcode;
> -        opcode32 = deposit32(opcode32, 16, 16,
> -                             translator_lduw(env, &ctx->base,
> -                                             ctx->base.pc_next + 2));
> -        ctx->opcode = opcode32;
> +        if (!is_4byte_align) {
> +            /* Load last 2 bytes of instruction here */
> +            opcode = deposit32(opcode, 16, 16,
> +                               translator_lduw(env, &ctx->base,
> +                                               ctx->base.pc_next + 2));
> +        }
>
>          for (guint i = 0; i < ctx->decoders->len; ++i) {
>              riscv_cpu_decode_fn func = g_ptr_array_index(ctx->decoders, i);
> -            if (func(ctx, opcode32)) {
> +            if (func(ctx, opcode)) {
>                  return;
>              }
>          }
> @@ -1226,10 +1251,8 @@ static void riscv_tr_translate_insn(DisasContextBase 
> *dcbase, CPUState *cpu)
>  {
>      DisasContext *ctx = container_of(dcbase, DisasContext, base);
>      CPURISCVState *env = cpu_env(cpu);
> -    uint16_t opcode16 = translator_lduw(env, &ctx->base, ctx->base.pc_next);
>
> -    ctx->ol = ctx->xl;
> -    decode_opc(env, ctx, opcode16);
> +    decode_opc(env, ctx);
>      ctx->base.pc_next += ctx->cur_insn_len;
>
>      /* Only the first insn within a TB is allowed to cross a page boundary. 
> */
> --
> 2.17.1
>
>

Reply via email to