Hi,

I found a performance problem with some SIMD intrinsics (vld2_dup_*) on aarch64-none-elf. The vld2_dup_* intrinsics are currently defined as follows:
#define __LD2R_FUNC(rettype, structtype, ptrtype,                    \
                    regsuffix, funcsuffix, Q)                        \
  __extension__ static __inline rettype                              \
  __attribute__ ((__always_inline__))                                \
  vld2 ## Q ## _dup_ ## funcsuffix (const ptrtype *ptr)              \
  {                                                                  \
    rettype result;                                                  \
    __asm__ ("ld2r {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t" \
             "st1 {v16." #regsuffix ", v17." #regsuffix "}, %0\n\t"  \
             : "=Q"(result)                                          \
             : "Q"(*(const structtype *)ptr)                         \
             : "memory", "v16", "v17");                              \
    return result;                                                   \
  }

This loads from memory into v16/v17 and then stores those registers back to memory to produce the result. The code performs poorly because of the redundant round trip through memory and because the hard-coded v16/v17 restrict register allocation.

Some intrinsics such as vld2_* used to look similar to vld2_dup_*, but they are now implemented with builtin functions:

__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
vld2_s16 (const int16_t * __a)
{
  int16x4x2_t ret;
  __builtin_aarch64_simd_oi __o;

  __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a);
  ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
  ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
  return ret;
}

Could vld2_dup_* also be implemented as builtins (a rough sketch is at the end of this mail)? If not, I think the inline assembly could at least be improved along these lines:

#define __LD2R_FUNC(rettype, structtype, ptrtype,                    \
                    regsuffix, funcsuffix, Q)                        \
  __extension__ static __inline rettype                              \
  __attribute__ ((__always_inline__))                                \
  vld2 ## Q ## _dup_ ## funcsuffix (const ptrtype *ptr)              \
  {                                                                  \
    rettype result;                                                  \
    __asm__ ("ld2r {%0." #regsuffix ", %1." #regsuffix "}, %2"       \
             : "=V16"(result.val[0]), "=V17"(result.val[1])          \
             : "Q"(*(const structtype *)ptr)                         \
             : "memory");                                            \
    return result;                                                   \
  }

This drops the st1 and lets the outputs live directly in v16/v17, but it requires adding single-register classes for v16 and v17 together with constraints V16 and V17 for them, so aarch64.h, aarch64.c and constraints.md would have to be modified.

--
Shanyao Chen
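P.S. For illustration, here is a rough sketch of what a builtin-based vld2_dup_s16 could look like, patterned after vld2_s16 above. The builtin name __builtin_aarch64_ld2rv4hi is only an assumption (an ld2r counterpart of the existing __builtin_aarch64_ld2v4hi); such a builtin does not exist yet and would need a matching ld2r pattern and builtin definition in the aarch64 backend.

/* Sketch only: assumes a hypothetical __builtin_aarch64_ld2rv4hi builtin
   (ld2r counterpart of __builtin_aarch64_ld2v4hi) is added to the backend.  */
__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
vld2_dup_s16 (const int16_t * __a)
{
  int16x4x2_t ret;
  __builtin_aarch64_simd_oi __o;

  /* Load two consecutive int16_t values and replicate each one across
     all lanes of its own D register.  */
  __o = __builtin_aarch64_ld2rv4hi ((const __builtin_aarch64_simd_hi *) __a);
  ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0);
  ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1);
  return ret;
}

Written this way, the register allocator can pick the registers itself and the st1/reload through memory disappears, the same way it already does for the vld2_* intrinsics.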