Paolo Savini <paolo.sav...@embecosm.com> writes:

> This patch replaces the use of a helper function with direct tcg ops
> generation in order to emulate whole register loads and stores. This
> is done in order to improve the performance of QEMU.
Generally having the frontend second-guess what the backend will do is
not recommended.

> We still use the helper function when vstart is not 0 at the beginning
> of the emulation of the whole register load or store or when we would
> end up generating partial loads or stores of vector elements (e.g.
> emulating 64 bits element loads with pairs of 32 bits loads on hosts
> with 32 bits registers).
> The latter condition ensures that we are not surprised by a trap in
> mid-element and consecutively that we can update vstart correctly.

This is what probe functions are for, so you can verify you won't fault
and then fully unroll the loop (rough sketch further down).

> We also use the helper function when it performs better than tcg for
> specific combinations of vector length, number of fields and element
> size.
>
> Signed-off-by: Paolo Savini <paolo.sav...@embecosm.com>
> ---
>  target/riscv/insn_trans/trans_rvv.c.inc | 164 +++++++++++++++++-------
>  1 file changed, 119 insertions(+), 45 deletions(-)
>
> diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc
> index b9883a5d32..85935276de 100644
> --- a/target/riscv/insn_trans/trans_rvv.c.inc
> +++ b/target/riscv/insn_trans/trans_rvv.c.inc
> @@ -1100,25 +1100,99 @@ GEN_VEXT_TRANS(vle64ff_v, MO_64, r2nfvm, ldff_op, ld_us_check)
>  typedef void gen_helper_ldst_whole(TCGv_ptr, TCGv, TCGv_env, TCGv_i32);
>
>  static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
> -                             gen_helper_ldst_whole *fn,
> -                             DisasContext *s)
> +                             uint32_t log2_esz, gen_helper_ldst_whole *fn,
> +                             DisasContext *s, bool is_load)
>  {
> -    TCGv_ptr dest;
> -    TCGv base;
> -    TCGv_i32 desc;
> +    mark_vs_dirty(s);
>
> -    uint32_t data = FIELD_DP32(0, VDATA, NF, nf);
> -    data = FIELD_DP32(data, VDATA, VM, 1);
> -    dest = tcg_temp_new_ptr();
> -    desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
> -                                      s->cfg_ptr->vlenb, data));
> +    uint32_t vlen = s->cfg_ptr->vlenb << 3;
>
> -    base = get_gpr(s, rs1, EXT_NONE);
> -    tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
> +    /*
> +     * Load/store multiple bytes per iteration.
> +     * When possible do this atomically.
> +     * Update vstart with the number of processed elements.
> +     * Use the helper function if either:
> +     * - vstart is not 0.
> +     * - the target has 32 bit registers and we are loading/storing 64 bit long
> +     *   elements. This is to ensure that we process every element with a single
> +     *   memory instruction.
> +     * - whether the helper function performs better:
> +     *   on x86 the helper function performs better with few combinations of NF,
> +     *   ESZ and VLEN.
> +     *   Other architectures may have other combinations or conditions and they
> +     *   can be added here if necessary.
> +     */
>
> -    mark_vs_dirty(s);
> +    bool use_helper_fn = !s->vstart_eq_zero || (TCG_TARGET_REG_BITS == 32 && log2_esz == 3);
> +
> +#if defined(HOST_X86_64)
> +    use_helper_fn |= ((nf == 4) && (log2_esz == 0) && (vlen == 1024)) ||
> +                     ((nf == 8) && (log2_esz == 0) && (vlen == 512)) ||
> +                     ((nf == 8) && (log2_esz == 0) && (vlen == 1024)) ||
> +                     ((nf == 8) && (log2_esz == 3) && (vlen == 1024));
> +#endif

Using host architecture ifdefs is generally discouraged except in a few
places.
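Re the probe comment above, this is roughly what I mean. An untested
sketch only, not code from this patch: probe_whole_reg() and the
mmu_idx parameter are invented here for illustration, though
target/riscv/vector_helper.c already has a probe_pages() helper that
walks pages in much the same way.

/*
 * Hypothetical sketch: touch every guest page covered by the transfer
 * before copying anything, so any fault is taken up front, before a
 * single element has been transferred.  probe_access() only checks
 * within one page, hence the page-by-page walk.  Assumes the usual
 * vector_helper.c environment (CPURISCVState, probe_access(),
 * TARGET_PAGE_MASK, MIN).
 */
static void probe_whole_reg(CPURISCVState *env, target_ulong addr,
                            target_ulong len, int mmu_idx,
                            MMUAccessType access_type, uintptr_t ra)
{
    while (len) {
        /* Number of bytes left on the current guest page. */
        target_ulong pagelen = MIN(len, -(addr | TARGET_PAGE_MASK));

        probe_access(env, addr, pagelen, access_type, mmu_idx, ra);
        addr += pagelen;
        len -= pagelen;
    }
}

Once something like that has succeeded for the whole vlenb * nf region
the copy loop can no longer trap part way through, so it can be fully
unrolled and vstart only needs clearing once at the end instead of
being tracked per iteration.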
>
> -    fn(dest, base, tcg_env, desc);
> +    if (!use_helper_fn) {
> +        TCGv addr = tcg_temp_new();
> +        uint32_t size = s->cfg_ptr->vlenb * nf;
> +        TCGv_i64 t8 = tcg_temp_new_i64();
> +        TCGv_i32 t4 = tcg_temp_new_i32();
> +        MemOp atomicity = MO_ATOM_NONE;
> +        if (log2_esz == 0) {
> +            atomicity = MO_ATOM_NONE;
> +        } else {
> +            atomicity = MO_ATOM_IFALIGN_PAIR;
> +        }
> +        if (TCG_TARGET_REG_BITS == 64) {
> +            for (int i = 0; i < size; i += 8) {
> +                addr = get_address(s, rs1, i);
> +                if (is_load) {
> +                    tcg_gen_qemu_ld_i64(t8, addr, s->mem_idx,
> +                                        MO_LE | MO_64 | atomicity);
> +                    tcg_gen_st_i64(t8, tcg_env, vreg_ofs(s, vd) + i);
> +                } else {
> +                    tcg_gen_ld_i64(t8, tcg_env, vreg_ofs(s, vd) + i);
> +                    tcg_gen_qemu_st_i64(t8, addr, s->mem_idx,
> +                                        MO_LE | MO_64 | atomicity);
> +                }
> +                if (i == size - 8) {
> +                    tcg_gen_movi_tl(cpu_vstart, 0);
> +                } else {
> +                    tcg_gen_addi_tl(cpu_vstart, cpu_vstart, 8 >> log2_esz);
> +                }
> +            }
> +        } else {
> +            for (int i = 0; i < size; i += 4) {
> +                addr = get_address(s, rs1, i);
> +                if (is_load) {
> +                    tcg_gen_qemu_ld_i32(t4, addr, s->mem_idx,
> +                                        MO_LE | MO_32 | atomicity);
> +                    tcg_gen_st_i32(t4, tcg_env, vreg_ofs(s, vd) + i);
> +                } else {
> +                    tcg_gen_ld_i32(t4, tcg_env, vreg_ofs(s, vd) + i);
> +                    tcg_gen_qemu_st_i32(t4, addr, s->mem_idx,
> +                                        MO_LE | MO_32 | atomicity);
> +                }
> +                if (i == size - 4) {
> +                    tcg_gen_movi_tl(cpu_vstart, 0);
> +                } else {
> +                    tcg_gen_addi_tl(cpu_vstart, cpu_vstart, 4 >> log2_esz);
> +                }
> +            }
> +        }
> +    } else {
> +        TCGv_ptr dest;
> +        TCGv base;
> +        TCGv_i32 desc;
> +        uint32_t data = FIELD_DP32(0, VDATA, NF, nf);
> +        data = FIELD_DP32(data, VDATA, VM, 1);
> +        dest = tcg_temp_new_ptr();
> +        desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
> +                                          s->cfg_ptr->vlenb, data));
> +        base = get_gpr(s, rs1, EXT_NONE);
> +        tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
> +        fn(dest, base, tcg_env, desc);
> +    }
>
>      finalize_rvv_inst(s);
>      return true;
> @@ -1128,42 +1202,42 @@ static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
>   * load and store whole register instructions ignore vtype and vl setting.
>   * Thus, we don't need to check vill bit.
(Section 7.9 of the vector spec.)

>   */
> -#define GEN_LDST_WHOLE_TRANS(NAME, ARG_NF) \
> -static bool trans_##NAME(DisasContext *s, arg_##NAME * a) \
> -{ \
> -    if (require_rvv(s) && \
> -        QEMU_IS_ALIGNED(a->rd, ARG_NF)) { \
> -        return ldst_whole_trans(a->rd, a->rs1, ARG_NF, \
> -                                gen_helper_##NAME, s); \
> -    } \
> -    return false; \
> -}
> -
> -GEN_LDST_WHOLE_TRANS(vl1re8_v, 1)
> -GEN_LDST_WHOLE_TRANS(vl1re16_v, 1)
> -GEN_LDST_WHOLE_TRANS(vl1re32_v, 1)
> -GEN_LDST_WHOLE_TRANS(vl1re64_v, 1)
> -GEN_LDST_WHOLE_TRANS(vl2re8_v, 2)
> -GEN_LDST_WHOLE_TRANS(vl2re16_v, 2)
> -GEN_LDST_WHOLE_TRANS(vl2re32_v, 2)
> -GEN_LDST_WHOLE_TRANS(vl2re64_v, 2)
> -GEN_LDST_WHOLE_TRANS(vl4re8_v, 4)
> -GEN_LDST_WHOLE_TRANS(vl4re16_v, 4)
> -GEN_LDST_WHOLE_TRANS(vl4re32_v, 4)
> -GEN_LDST_WHOLE_TRANS(vl4re64_v, 4)
> -GEN_LDST_WHOLE_TRANS(vl8re8_v, 8)
> -GEN_LDST_WHOLE_TRANS(vl8re16_v, 8)
> -GEN_LDST_WHOLE_TRANS(vl8re32_v, 8)
> -GEN_LDST_WHOLE_TRANS(vl8re64_v, 8)
> +#define GEN_LDST_WHOLE_TRANS(NAME, ETYPE, ARG_NF, IS_LOAD) \
> +static bool trans_##NAME(DisasContext *s, arg_##NAME * a) \
> +{ \
> +    if (require_rvv(s) && \
> +        QEMU_IS_ALIGNED(a->rd, ARG_NF)) { \
> +        return ldst_whole_trans(a->rd, a->rs1, ARG_NF, ctzl(sizeof(ETYPE)), \
> +                                gen_helper_##NAME, s, IS_LOAD); \
> +    } \
> +    return false; \
> +}
> +
> +GEN_LDST_WHOLE_TRANS(vl1re8_v, int8_t, 1, true)
> +GEN_LDST_WHOLE_TRANS(vl1re16_v, int16_t, 1, true)
> +GEN_LDST_WHOLE_TRANS(vl1re32_v, int32_t, 1, true)
> +GEN_LDST_WHOLE_TRANS(vl1re64_v, int64_t, 1, true)
> +GEN_LDST_WHOLE_TRANS(vl2re8_v, int8_t, 2, true)
> +GEN_LDST_WHOLE_TRANS(vl2re16_v, int16_t, 2, true)
> +GEN_LDST_WHOLE_TRANS(vl2re32_v, int32_t, 2, true)
> +GEN_LDST_WHOLE_TRANS(vl2re64_v, int64_t, 2, true)
> +GEN_LDST_WHOLE_TRANS(vl4re8_v, int8_t, 4, true)
> +GEN_LDST_WHOLE_TRANS(vl4re16_v, int16_t, 4, true)
> +GEN_LDST_WHOLE_TRANS(vl4re32_v, int32_t, 4, true)
> +GEN_LDST_WHOLE_TRANS(vl4re64_v, int64_t, 4, true)
> +GEN_LDST_WHOLE_TRANS(vl8re8_v, int8_t, 8, true)
> +GEN_LDST_WHOLE_TRANS(vl8re16_v, int16_t, 8, true)
> +GEN_LDST_WHOLE_TRANS(vl8re32_v, int32_t, 8, true)
> +GEN_LDST_WHOLE_TRANS(vl8re64_v, int64_t, 8, true)
>
>  /*
>   * The vector whole register store instructions are encoded similar to
>   * unmasked unit-stride store of elements with EEW=8.
>   */
> -GEN_LDST_WHOLE_TRANS(vs1r_v, 1)
> -GEN_LDST_WHOLE_TRANS(vs2r_v, 2)
> -GEN_LDST_WHOLE_TRANS(vs4r_v, 4)
> -GEN_LDST_WHOLE_TRANS(vs8r_v, 8)
> +GEN_LDST_WHOLE_TRANS(vs1r_v, int8_t, 1, false)
> +GEN_LDST_WHOLE_TRANS(vs2r_v, int8_t, 2, false)
> +GEN_LDST_WHOLE_TRANS(vs4r_v, int8_t, 4, false)
> +GEN_LDST_WHOLE_TRANS(vs8r_v, int8_t, 8, false)
>
>  /*
>  *** Vector Integer Arithmetic Instructions

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro