On 2024/6/20 12:38 PM, Richard Henderson wrote:

On 6/13/24 10:51, Max Chou wrote:
The vector unmasked unit-stride and whole register load/store
instructions load/store contiguous memory. If the host and guest
architectures have the same endianness, then we can group the element
loads/stores to access more data at a time.

Signed-off-by: Max Chou <max.c...@sifive.com>
---
  target/riscv/vector_helper.c | 160 +++++++++++++++++++++++++----------
  1 file changed, 117 insertions(+), 43 deletions(-)

diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index 793337a6f96..cba46ef16a5 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -457,6 +457,69 @@ GEN_VEXT_ST_ELEM(ste_h, uint16_t, H2, stw)
  GEN_VEXT_ST_ELEM(ste_w, uint32_t, H4, stl)
  GEN_VEXT_ST_ELEM(ste_d, uint64_t, H8, stq)

+static inline uint32_t
+vext_group_ldst_host(CPURISCVState *env, void *vd, uint32_t byte_end,
+                     uint32_t byte_offset, void *host, uint32_t esz,
+                     bool is_load)
+{
+    uint32_t group_size;
+    static vext_ldst_elem_fn_host * const fns[2][4] = {
+        /* Store */
+        { ste_b_host, ste_h_host, ste_w_host, ste_d_host },
+        /* Load */
+        { lde_b_host, lde_h_host, lde_w_host, lde_d_host }
+    };
+    vext_ldst_elem_fn_host *fn;
+
+    if (byte_offset + 8 < byte_end) {
+        group_size = MO_64;
+    } else if (byte_offset + 4 < byte_end) {
+        group_size = MO_32;
+    } else if (byte_offset + 2 < byte_end) {
+        group_size = MO_16;
+    } else {
+        group_size = MO_8;
+    }
+
+    fn = fns[is_load][group_size];
+    fn(vd, byte_offset, host + byte_offset);

This is a really bad idea.  The table and indirect call mean that none of these will be properly inlined.  Anyway...
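
For illustration only (this is not in the patch): a direct dispatch on group_size, e.g.

    /* Illustration: direct calls the compiler can inline into the caller. */
    if (is_load) {
        switch (group_size) {
        case MO_64: lde_d_host(vd, byte_offset, host + byte_offset); break;
        case MO_32: lde_w_host(vd, byte_offset, host + byte_offset); break;
        case MO_16: lde_h_host(vd, byte_offset, host + byte_offset); break;
        default:    lde_b_host(vd, byte_offset, host + byte_offset); break;
        }
    } else {
        switch (group_size) {
        case MO_64: ste_d_host(vd, byte_offset, host + byte_offset); break;
        case MO_32: ste_w_host(vd, byte_offset, host + byte_offset); break;
        case MO_16: ste_h_host(vd, byte_offset, host + byte_offset); break;
        default:    ste_b_host(vd, byte_offset, host + byte_offset); break;
        }
    }

is something the compiler can fold into the tiny per-element accessors, whereas the indirect call through fns[is_load][group_size] is opaque to it.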

+
+    return 1 << group_size;
+}
+
+static inline void
+vext_continus_ldst_tlb(CPURISCVState *env, vext_ldst_elem_fn_tlb *ldst_tlb,
+                       void *vd, uint32_t evl, target_ulong addr,
+                       uint32_t reg_start, uintptr_t ra, uint32_t esz,
+                       bool is_load)
+{
+    for (; reg_start < evl; reg_start++, addr += esz) {
+        ldst_tlb(env, adjust_addr(env, addr), reg_start * esz, vd, ra);
+    }
+}
+
+static inline void
+vext_continus_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
+                        void *vd, uint32_t evl, uint32_t reg_start, void *host,
+                        uint32_t esz, bool is_load)
+{
+#if TARGET_BIG_ENDIAN != HOST_BIG_ENDIAN
+    for (; reg_start < evl; reg_start++) {
+        uint32_t byte_off = reg_start * esz;
+        ldst_host(vd, byte_off, host + byte_off);
+    }
+#else
+    uint32_t group_byte;
+    uint32_t byte_start = reg_start * esz;
+    uint32_t byte_end = evl * esz;
+    while (byte_start < byte_end) {
+        group_byte = vext_group_ldst_host(env, vd, byte_end, byte_start, host,
+                                          esz, is_load);
+        byte_start += group_byte;
+    }

... this is much better handled with memcpy, given that you know endianness matches.
Thanks for the suggestion.
I'll replace the table-and-indirect-call implementation with a memcpy-based version in the next version; a rough sketch of what I have in mind is below.
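
Untested sketch, and the signature just follows the current helper, so details may still change; the mismatched-endianness path keeps the per-element accessors:

static inline void
vext_continus_ldst_host(CPURISCVState *env, vext_ldst_elem_fn_host *ldst_host,
                        void *vd, uint32_t evl, uint32_t reg_start, void *host,
                        uint32_t esz, bool is_load)
{
#if TARGET_BIG_ENDIAN != HOST_BIG_ENDIAN
    /* Endianness differs: fall back to per-element accesses. */
    for (; reg_start < evl; reg_start++) {
        uint32_t byte_off = reg_start * esz;
        ldst_host(vd, byte_off, host + byte_off);
    }
#else
    /* Endianness matches: one bulk copy replaces the grouping loop. */
    uint32_t byte_offset = reg_start * esz;
    uint32_t byte_len = evl * esz - byte_offset;

    if (is_load) {
        memcpy(vd + byte_offset, host + byte_offset, byte_len);
    } else {
        memcpy(host + byte_offset, vd + byte_offset, byte_len);
    }
#endif
}

This would also drop the fns[][] table and vext_group_ldst_host entirely.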

Max.


r~
