Paolo Savini <paolo.sav...@embecosm.com> writes:

> This patch replaces the use of a helper function with direct tcg ops
> generation in order to emulate whole register loads and stores. This
> is done in order to improve the performance of QEMU.
Generally having the frontend second-guess what the backend will do is
not recommended.

> We still use the helper function when vstart is not 0 at the beginning
> of the emulation of the whole register load or store or when we would
> end up generating partial loads or stores of vector elements (e.g.
> emulating 64 bits element loads with pairs of 32 bits loads on hosts
> with 32 bits registers).
> The latter condition ensures that we are not surprised by a trap in
> mid-element and consecutively that we can update vstart correctly.

This is what probe functions are for, so you can verify you won't fault
and then fully unroll the loop (rough sketch further down).

> We also use the helper function when it performs better than tcg for
> specific combinations of vector length, number of fields and element
> size.
>
> Signed-off-by: Paolo Savini <paolo.sav...@embecosm.com>
> ---
>  target/riscv/insn_trans/trans_rvv.c.inc | 164 +++++++++++++++++-------
>  1 file changed, 119 insertions(+), 45 deletions(-)
>
> diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc
> index b9883a5d32..85935276de 100644
> --- a/target/riscv/insn_trans/trans_rvv.c.inc
> +++ b/target/riscv/insn_trans/trans_rvv.c.inc
> @@ -1100,25 +1100,99 @@ GEN_VEXT_TRANS(vle64ff_v, MO_64, r2nfvm, ldff_op, ld_us_check)
>  typedef void gen_helper_ldst_whole(TCGv_ptr, TCGv, TCGv_env, TCGv_i32);
>
>  static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
> -                             gen_helper_ldst_whole *fn,
> -                             DisasContext *s)
> +                             uint32_t log2_esz, gen_helper_ldst_whole *fn,
> +                             DisasContext *s, bool is_load)
>  {
> -    TCGv_ptr dest;
> -    TCGv base;
> -    TCGv_i32 desc;
> +    mark_vs_dirty(s);
>
> -    uint32_t data = FIELD_DP32(0, VDATA, NF, nf);
> -    data = FIELD_DP32(data, VDATA, VM, 1);
> -    dest = tcg_temp_new_ptr();
> -    desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
> -                                      s->cfg_ptr->vlenb, data));
> +    uint32_t vlen = s->cfg_ptr->vlenb << 3;
>
> -    base = get_gpr(s, rs1, EXT_NONE);
> -    tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
> +    /*
> +     * Load/store multiple bytes per iteration.
> +     * When possible do this atomically.
> +     * Update vstart with the number of processed elements.
> +     * Use the helper function if either:
> +     * - vstart is not 0.
> +     * - the target has 32 bit registers and we are loading/storing 64 bit long
> +     *   elements. This is to ensure that we process every element with a single
> +     *   memory instruction.
> +     * - whether the helper function performs better:
> +     *   on x86 the helper function performs better with few combinations of NF,
> +     *   ESZ and VLEN.
> +     *   Other architectures may have other combinations or conditions and they
> +     *   can be added here if necessary.
> +     */
>
> -    mark_vs_dirty(s);
> +    bool use_helper_fn = !s->vstart_eq_zero || (TCG_TARGET_REG_BITS == 32 && log2_esz == 3);
> +
> +#if defined(HOST_X86_64)
> +    use_helper_fn |= ((nf == 4) && (log2_esz == 0) && (vlen == 1024)) ||
> +                     ((nf == 8) && (log2_esz == 0) && (vlen == 512)) ||
> +                     ((nf == 8) && (log2_esz == 0) && (vlen == 1024)) ||
> +                     ((nf == 8) && (log2_esz == 3) && (vlen == 1024));
> +#endif

Using host architecture ifdefs is generally discouraged except in a few
places.
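Re the probe comment above, this is roughly what I mean. An untested
sketch only, not code from this patch: probe_whole_reg() and the
mmu_idx parameter are invented here for illustration, though
target/riscv/vector_helper.c already has a probe_pages() helper that
walks pages in much the same way.

/*
 * Hypothetical sketch: touch every guest page covered by the transfer
 * before copying anything, so any fault is taken up front, before a
 * single element has been transferred.  probe_access() only checks
 * within one page, hence the page-by-page walk.  Assumes the usual
 * vector_helper.c environment (CPURISCVState, probe_access(),
 * TARGET_PAGE_MASK, MIN).
 */
static void probe_whole_reg(CPURISCVState *env, target_ulong addr,
                            target_ulong len, int mmu_idx,
                            MMUAccessType access_type, uintptr_t ra)
{
    while (len) {
        /* Number of bytes left on the current guest page. */
        target_ulong pagelen = MIN(len, -(addr | TARGET_PAGE_MASK));

        probe_access(env, addr, pagelen, access_type, mmu_idx, ra);
        addr += pagelen;
        len -= pagelen;
    }
}

Once something like that has succeeded for the whole vlenb * nf region
the copy loop can no longer trap part way through, so it can be fully
unrolled and vstart only needs clearing once at the end instead of
being tracked per iteration.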
>
> -    fn(dest, base, tcg_env, desc);
> +    if (!use_helper_fn) {
> +        TCGv addr = tcg_temp_new();
> +        uint32_t size = s->cfg_ptr->vlenb * nf;
> +        TCGv_i64 t8 = tcg_temp_new_i64();
> +        TCGv_i32 t4 = tcg_temp_new_i32();
> +        MemOp atomicity = MO_ATOM_NONE;
> +        if (log2_esz == 0) {
> +            atomicity = MO_ATOM_NONE;
> +        } else {
> +            atomicity = MO_ATOM_IFALIGN_PAIR;
> +        }
> +        if (TCG_TARGET_REG_BITS == 64) {
> +            for (int i = 0; i < size; i += 8) {
> +                addr = get_address(s, rs1, i);
> +                if (is_load) {
> +                    tcg_gen_qemu_ld_i64(t8, addr, s->mem_idx,
> +                                        MO_LE | MO_64 | atomicity);
> +                    tcg_gen_st_i64(t8, tcg_env, vreg_ofs(s, vd) + i);
> +                } else {
> +                    tcg_gen_ld_i64(t8, tcg_env, vreg_ofs(s, vd) + i);
> +                    tcg_gen_qemu_st_i64(t8, addr, s->mem_idx,
> +                                        MO_LE | MO_64 | atomicity);
> +                }
> +                if (i == size - 8) {
> +                    tcg_gen_movi_tl(cpu_vstart, 0);
> +                } else {
> +                    tcg_gen_addi_tl(cpu_vstart, cpu_vstart, 8 >> log2_esz);
> +                }
> +            }
> +        } else {
> +            for (int i = 0; i < size; i += 4) {
> +                addr = get_address(s, rs1, i);
> +                if (is_load) {
> +                    tcg_gen_qemu_ld_i32(t4, addr, s->mem_idx,
> +                                        MO_LE | MO_32 | atomicity);
> +                    tcg_gen_st_i32(t4, tcg_env, vreg_ofs(s, vd) + i);
> +                } else {
> +                    tcg_gen_ld_i32(t4, tcg_env, vreg_ofs(s, vd) + i);
> +                    tcg_gen_qemu_st_i32(t4, addr, s->mem_idx,
> +                                        MO_LE | MO_32 | atomicity);
> +                }
> +                if (i == size - 4) {
> +                    tcg_gen_movi_tl(cpu_vstart, 0);
> +                } else {
> +                    tcg_gen_addi_tl(cpu_vstart, cpu_vstart, 4 >> log2_esz);
> +                }
> +            }
> +        }
> +    } else {
> +        TCGv_ptr dest;
> +        TCGv base;
> +        TCGv_i32 desc;
> +        uint32_t data = FIELD_DP32(0, VDATA, NF, nf);
> +        data = FIELD_DP32(data, VDATA, VM, 1);
> +        dest = tcg_temp_new_ptr();
> +        desc = tcg_constant_i32(simd_desc(s->cfg_ptr->vlenb,
> +                                          s->cfg_ptr->vlenb, data));
> +        base = get_gpr(s, rs1, EXT_NONE);
> +        tcg_gen_addi_ptr(dest, tcg_env, vreg_ofs(s, vd));
> +        fn(dest, base, tcg_env, desc);
> +    }
>
>      finalize_rvv_inst(s);
>      return true;
> @@ -1128,42 +1202,42 @@ static bool ldst_whole_trans(uint32_t vd, uint32_t rs1, uint32_t nf,
>   * load and store whole register instructions ignore vtype and vl setting.
>   * Thus, we don't need to check vill bit.
(Section 7.9 of the vector spec.)

>   */
> -#define GEN_LDST_WHOLE_TRANS(NAME, ARG_NF) \
> -static bool trans_##NAME(DisasContext *s, arg_##NAME * a) \
> -{ \
> -    if (require_rvv(s) && \
> -        QEMU_IS_ALIGNED(a->rd, ARG_NF)) { \
> -        return ldst_whole_trans(a->rd, a->rs1, ARG_NF, \
> -                                gen_helper_##NAME, s); \
> -    } \
> -    return false; \
> -}
> -
> -GEN_LDST_WHOLE_TRANS(vl1re8_v, 1)
> -GEN_LDST_WHOLE_TRANS(vl1re16_v, 1)
> -GEN_LDST_WHOLE_TRANS(vl1re32_v, 1)
> -GEN_LDST_WHOLE_TRANS(vl1re64_v, 1)
> -GEN_LDST_WHOLE_TRANS(vl2re8_v, 2)
> -GEN_LDST_WHOLE_TRANS(vl2re16_v, 2)
> -GEN_LDST_WHOLE_TRANS(vl2re32_v, 2)
> -GEN_LDST_WHOLE_TRANS(vl2re64_v, 2)
> -GEN_LDST_WHOLE_TRANS(vl4re8_v, 4)
> -GEN_LDST_WHOLE_TRANS(vl4re16_v, 4)
> -GEN_LDST_WHOLE_TRANS(vl4re32_v, 4)
> -GEN_LDST_WHOLE_TRANS(vl4re64_v, 4)
> -GEN_LDST_WHOLE_TRANS(vl8re8_v, 8)
> -GEN_LDST_WHOLE_TRANS(vl8re16_v, 8)
> -GEN_LDST_WHOLE_TRANS(vl8re32_v, 8)
> -GEN_LDST_WHOLE_TRANS(vl8re64_v, 8)
> +#define GEN_LDST_WHOLE_TRANS(NAME, ETYPE, ARG_NF, IS_LOAD) \
> +static bool trans_##NAME(DisasContext *s, arg_##NAME * a) \
> +{ \
> +    if (require_rvv(s) && \
> +        QEMU_IS_ALIGNED(a->rd, ARG_NF)) { \
> +        return ldst_whole_trans(a->rd, a->rs1, ARG_NF, ctzl(sizeof(ETYPE)), \
> +                                gen_helper_##NAME, s, IS_LOAD); \
> +    } \
> +    return false; \
> +}
> +
> +GEN_LDST_WHOLE_TRANS(vl1re8_v, int8_t, 1, true)
> +GEN_LDST_WHOLE_TRANS(vl1re16_v, int16_t, 1, true)
> +GEN_LDST_WHOLE_TRANS(vl1re32_v, int32_t, 1, true)
> +GEN_LDST_WHOLE_TRANS(vl1re64_v, int64_t, 1, true)
> +GEN_LDST_WHOLE_TRANS(vl2re8_v, int8_t, 2, true)
> +GEN_LDST_WHOLE_TRANS(vl2re16_v, int16_t, 2, true)
> +GEN_LDST_WHOLE_TRANS(vl2re32_v, int32_t, 2, true)
> +GEN_LDST_WHOLE_TRANS(vl2re64_v, int64_t, 2, true)
> +GEN_LDST_WHOLE_TRANS(vl4re8_v, int8_t, 4, true)
> +GEN_LDST_WHOLE_TRANS(vl4re16_v, int16_t, 4, true)
> +GEN_LDST_WHOLE_TRANS(vl4re32_v, int32_t, 4, true)
> +GEN_LDST_WHOLE_TRANS(vl4re64_v, int64_t, 4, true)
> +GEN_LDST_WHOLE_TRANS(vl8re8_v, int8_t, 8, true)
> +GEN_LDST_WHOLE_TRANS(vl8re16_v, int16_t, 8, true)
> +GEN_LDST_WHOLE_TRANS(vl8re32_v, int32_t, 8, true)
> +GEN_LDST_WHOLE_TRANS(vl8re64_v, int64_t, 8, true)
>
>  /*
>   * The vector whole register store instructions are encoded similar to
>   * unmasked unit-stride store of elements with EEW=8.
>   */
> -GEN_LDST_WHOLE_TRANS(vs1r_v, 1)
> -GEN_LDST_WHOLE_TRANS(vs2r_v, 2)
> -GEN_LDST_WHOLE_TRANS(vs4r_v, 4)
> -GEN_LDST_WHOLE_TRANS(vs8r_v, 8)
> +GEN_LDST_WHOLE_TRANS(vs1r_v, int8_t, 1, false)
> +GEN_LDST_WHOLE_TRANS(vs2r_v, int8_t, 2, false)
> +GEN_LDST_WHOLE_TRANS(vs4r_v, int8_t, 4, false)
> +GEN_LDST_WHOLE_TRANS(vs8r_v, int8_t, 8, false)
>
>  /*
>  *** Vector Integer Arithmetic Instructions

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro