https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63173
Bug ID: 63173 Summary: performance problem with simd intrinsics vld2_dup_* on aarch64-none-elf Product: gcc Version: 4.9.2 Status: UNCONFIRMED Severity: normal Priority: P3 Component: target Assignee: unassigned at gcc dot gnu.org Reporter: chenshanyaoboy at gmail dot com Hi, I found a performance problem with some SIMD intrinsics (vld2_dup_*) on aarch64-none-elf. Currently the vld2_dup_* intrinsics are defined as follows: #define __LD2R_FUNC(rettype, structtype, ptrtype, \ regsuffix, funcsuffix, Q) \ __extension__ static __inline rettype \ __attribute__ ((__always_inline__)) \ vld2 ## Q ## _dup_ ## funcsuffix (const ptrtype *ptr) \ { \ rettype result; \ __asm__ ("ld2r {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t" \ "st1 {v16." #regsuffix ", v17." #regsuffix "}, %0\n\t" \ : "=Q"(result) \ : "Q"(*(const structtype *)ptr) \ : "memory", "v16", "v17"); \ return result; \ } The inline asm loads from memory into registers and then stores the register values back to memory as the result. Such code performs very poorly because of the redundant memory accesses and the constrained register allocation (v16 and v17 are hard-coded). cat test1.c #include <arm_neon.h> int16x4x2_t foo(int16_t *__restrict pDataA, int16_t *__restrict pDataB) { int16x4x2_t DataA, DataB, DataC; DataA = vld2_dup_s16(pDataA); DataB = vld2_dup_s16(pDataB); DataC.val[0] = vqadd_s16( DataA.val[0], DataB.val[0] ); DataC.val[1] = vqadd_s16( DataA.val[1], DataB.val[1] ); return DataC; } aarch64-none-elf-gcc -S -O2 test1.c cat test1.s foo: sub sp, sp, #16 //start of user assembly ld2r {v16.4h, v17.4h}, [x0] st1 {v16.4h, v17.4h}, [sp] //end of user assembly ldr d0, [sp] ldr d1, [sp,8] //start of user assembly ld2r {v16.4h, v17.4h}, [x1] st1 {v16.4h, v17.4h}, [sp] //end of user assembly ldr d2, [sp] sqadd v0.4h, v0.4h, v2.4h ldr d2, [sp,8] add sp, sp, 16 sqadd v1.4h, v1.4h, v2.4h ret Some intrinsics such as vld2_* used to be defined in a similar way to vld2_dup_*, but they are now implemented with builtin functions, for example: 
__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__)) vld2_s16 (const int16_t * __a) { int16x4x2_t ret; __builtin_aarch64_simd_oi __o; __o = __builtin_aarch64_ld2v4hi ((const __builtin_aarch64_simd_hi *) __a); ret.val[0] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 0); ret.val[1] = (int16x4_t) __builtin_aarch64_get_dregoiv4hi (__o, 1); return ret; } test2.c is the same as test1.c, except that vld2_s16 is used instead of vld2_dup_s16. cat test2.c #include <arm_neon.h> int16x4x2_t foo (int16_t *__restrict pDataA, int16_t *__restrict pDataB) { int16x4x2_t DataA, DataB, DataC; DataA = vld2_s16(pDataA); DataB = vld2_s16(pDataB); DataC.val[0] = vqadd_s16( DataA.val[0], DataB.val[0] ); DataC.val[1] = vqadd_s16( DataA.val[1], DataB.val[1] ); return DataC; } aarch64-none-elf-gcc -S -O2 test2.c cat test2.s foo: ld2 {v2.4h-v3.4h}, [x0] ld2 {v4.4h-v5.4h}, [x1] sqadd v1.4h, v5.4h, v3.4h sqadd v0.4h, v4.4h, v2.4h ret Could vld2_dup_* also be implemented with builtins, in the same way as vld2_*?