Hi,
I found a performance problem with some SIMD intrinsics
(vld2_dup_*) on aarch64-none-elf. Currently the vld2_dup_* intrinsics
are defined as follows:

/* Defines the intrinsic vld2<Q>_dup_<funcsuffix>, taking a const ptrtype *
   and returning a rettype.  The asm issues LD2R into the hard-coded
   registers v16/v17 from the memory operand *ptr (viewed as a structtype),
   then ST1 writes both registers out to `result`, which is passed as a
   memory ("=Q") operand.  v16, v17 and "memory" are declared clobbered, so
   the result always travels through memory and the two registers are fixed
   rather than chosen by the register allocator.  */
#define __LD2R_FUNC(rettype, structtype, ptrtype, \
    regsuffix, funcsuffix, Q) \
  __extension__ static __inline rettype \
  __attribute__ ((__always_inline__))  \
  vld2 ## Q ## _dup_ ## funcsuffix (const ptrtype *ptr) \
  { \
    rettype result; \
    __asm__ ("ld2r {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t" \
     "st1 {v16." #regsuffix ", v17." #regsuffix "}, %0\n\t" \
     : "=Q"(result) \
     : "Q"(*(const structtype *)ptr) \
     : "memory", "v16", "v17"); \
    return result; \
  }

It loads from memory into registers, and then stores the values of
those registers back to memory as the result. Such code performs very
poorly because of the redundant memory access and because the
hard-coded registers restrict register allocation.

Some intrinsics, such as vld2_*, used to be implemented in a similar
way, but they are now implemented with builtin functions:

/* vld2_s16 implemented with compiler builtins instead of inline asm:
   __builtin_aarch64_ld2v4hi yields one opaque __builtin_aarch64_simd_oi
   value for the data at __a, and __builtin_aarch64_get_dregoiv4hi pulls
   elements 0 and 1 out of it to fill the two halves of the result.  */
__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
vld2_s16 (const int16_t * __a)
{
  const __builtin_aarch64_simd_hi *__src
    = (const __builtin_aarch64_simd_hi *) __a;
  __builtin_aarch64_simd_oi __regs = __builtin_aarch64_ld2v4hi (__src);
  int16x4x2_t __pair;

  __pair.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__regs, 0);
  __pair.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__regs, 1);
  return __pair;
}

Could vld2_dup_* also be written with builtins? If not, I think the
inline assembly can be optimized as follows:

/* Proposed register-only form of __LD2R_FUNC: defines
   vld2<Q>_dup_<funcsuffix>, taking a const ptrtype * and returning a
   rettype.  A single LD2R loads from the memory operand *ptr (viewed as a
   structtype) straight into result.val[0] and result.val[1]; there is no
   ST1 round trip through memory.

   NOTE: "=V16" and "=V17" are the constraints proposed in this message to
   pin the two outputs to v16 and v17; they are not existing constraints
   and must be added to the back end before this compiles.

   Every line of the macro body ends in a continuation so the whole asm
   statement stays inside the definition.  The arrangement specifier comes
   from regsuffix rather than a fixed ".4h", so each instantiation emits its
   own lane layout.  No clobbers are listed: v16/v17 are already described
   by the output operands (a clobber may not overlap an operand), and the
   only memory access is the read described by the "Q" input.  */
#define __LD2R_FUNC(rettype, structtype, ptrtype, \
    regsuffix, funcsuffix, Q) \
  __extension__ static __inline rettype \
  __attribute__ ((__always_inline__))  \
  vld2 ## Q ## _dup_ ## funcsuffix (const ptrtype *ptr) \
  { \
    rettype result; \
    __asm__ ("ld2r {%0." #regsuffix ", %1." #regsuffix "}, %2" \
     : "=V16"(result.val[0]), "=V17"(result.val[1]) \
     : "Q"(*(const structtype *)ptr)); \
    return result; \
  }

This requires adding a reg_class_name for v16 and v17 and adding the
constraints V16 and V17 for them. For this, aarch64.h, aarch64.c and
constraints.md would need to be modified.
-- 
Shanyao Chen

Reply via email to