On 2/15/24 16:28, Max Chou wrote:
In the vector unit-stride load/store helper functions, the vext_ldst_us
function accounts for most of the execution time. Inlining the functions
avoids the function call overhead and improves the helper function
performance.
Signed-off-by: Max Chou <max.c...@sifive.com>
---
The inline is a good idea, but I think we can do better. In a thread last
year [1] I mentioned the time we're spending on single-byte loads/stores,
even for strided instructions.
E.g. in vext_ldst_stride():
    for (i = env->vstart; i < env->vl; i++, env->vstart++) {
        k = 0;
        while (k < nf) {
            if (!vm && !vext_elem_mask(v0, i)) {
                /* set masked-off elements to 1s */
                vext_set_elems_1s(vd, vma, (i + k * max_elems) * esz,
                                  (i + k * max_elems + 1) * esz);
                k++;
                continue;
            }
            target_ulong addr = base + stride * i + (k << log2_esz);
            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
            k++;
        }
    }
We're doing single-byte loads/stores in ldst_elem() when, in this case, we
could do the whole block in a single access. ARM does something similar in SVE.
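Something along these lines is what I have in mind for the unit-stride case: a
rough, untested sketch, load side only. vext_ldst_us_blocks() is just a
stand-in name, riscv_env_mmu_index() is assumed to be whatever mmu-index
lookup the tree provides, and the plain memcpy() ignores the H() byte-order
adjustments a big-endian host would need:

/*
 * Untested sketch, not a finished patch: resolve the host address for a
 * page-sized chunk once via probe_access() and copy the contiguous elements
 * with one memcpy(), falling back to the per-element ldst_elem() path when
 * the chunk is not directly host-addressable (MMIO, watchpoints, page end).
 */
static void
vext_ldst_us_blocks(void *vd, target_ulong base, CPURISCVState *env,
                    vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz,
                    uint32_t evl, uintptr_t ra)
{
    uint32_t esz = 1 << log2_esz;
    uint32_t i = env->vstart;

    while (i < evl) {
        target_ulong addr = base + ((target_ulong)i << log2_esz);
        /* Bytes left on the current guest page. */
        target_ulong in_page = -(addr | TARGET_PAGE_MASK);
        /* Remaining elements that fit entirely within this page. */
        uint32_t n = MIN(evl - i, (uint32_t)(in_page >> log2_esz));

        if (n) {
            /* Assumed helper; use whatever mmu-index lookup the tree has. */
            void *host = probe_access(env, addr, n * esz, MMU_DATA_LOAD,
                                      riscv_env_mmu_index(env, false), ra);
            if (host) {
                /*
                 * Whole chunk is backed by host RAM: one copy instead of n
                 * helper calls. Ignores H() element swapping on BE hosts.
                 */
                memcpy((uint8_t *)vd + (size_t)i * esz, host,
                       (size_t)n * esz);
                i += n;
                env->vstart = i;
                continue;
            }
        }
        /* Slow path: keep the existing element-wise access. */
        ldst_elem(env, adjust_addr(env, addr), i, vd, ra);
        env->vstart = ++i;
    }
    env->vstart = 0;
}

The store side would mirror this with MMU_DATA_STORE and the copy direction
reversed, and the fallback keeps the current behaviour for anything probe_access()
can't hand us a host pointer for.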
I updated the gitlab bug https://gitlab.com/qemu-project/qemu/-/issues/2137
with this additional info too.
Thanks,
Daniel
[1]
https://lore.kernel.org/qemu-riscv/0e54c6c1-2903-7942-eff2-2b8c5e211...@ventanamicro.com/
target/riscv/vector_helper.c | 30 ++++++++++++++++--------------
1 file changed, 16 insertions(+), 14 deletions(-)
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index e8fbb921449..866f77d321d 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -149,25 +149,27 @@ static inline void vext_set_elem_mask(void *v0, int index,
typedef void vext_ldst_elem_fn(CPURISCVState *env, abi_ptr addr,
uint32_t idx, void *vd, uintptr_t retaddr);
-#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \
-static void NAME(CPURISCVState *env, abi_ptr addr, \
- uint32_t idx, void *vd, uintptr_t retaddr)\
-{ \
- ETYPE *cur = ((ETYPE *)vd + H(idx)); \
- *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \
-} \
+#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF) \
+static inline QEMU_ALWAYS_INLINE \
+void NAME(CPURISCVState *env, abi_ptr addr, \
+ uint32_t idx, void *vd, uintptr_t retaddr) \
+{ \
+ ETYPE *cur = ((ETYPE *)vd + H(idx)); \
+ *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr); \
+} \
GEN_VEXT_LD_ELEM(lde_b, int8_t, H1, ldsb)
GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
-#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \
-static void NAME(CPURISCVState *env, abi_ptr addr, \
- uint32_t idx, void *vd, uintptr_t retaddr)\
-{ \
- ETYPE data = *((ETYPE *)vd + H(idx)); \
- cpu_##STSUF##_data_ra(env, addr, data, retaddr); \
+#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF) \
+static inline QEMU_ALWAYS_INLINE \
+void NAME(CPURISCVState *env, abi_ptr addr, \
+ uint32_t idx, void *vd, uintptr_t retaddr) \
+{ \
+ ETYPE data = *((ETYPE *)vd + H(idx)); \
+ cpu_##STSUF##_data_ra(env, addr, data, retaddr); \
}
GEN_VEXT_ST_ELEM(ste_b, int8_t, H1, stb)
@@ -289,7 +291,7 @@ GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
*/
/* unmasked unit-stride load and store operation */
-static void
+static inline QEMU_ALWAYS_INLINE void
vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
uintptr_t ra)